读书人

封装抓取网页信息的实例

发布时间: 2013-03-13 10:56:58 作者: rapoo

封装抓取网页信息的实例

  1. package cn.mypic;
  2. import java.io.BufferedInputStream;
  3. import java.io.BufferedReader;
  4. import java.io.File;
  5. import java.io.FileNotFoundException;
  6. import java.io.FileOutputStream;
  7. import java.io.IOException;
  8. import java.io.InputStreamReader;
  9. import java.net.MalformedURLException;
  10. import java.net.URL;
  11. import java.util.regex.Matcher;
  12. import java.util.regex.Pattern;
  13. public class GetContentPicture {
  14. //得到了图片地址并下载图片
  15. public void getHtmlPicture(String httpUrl) {
  16. URL url;
  17. BufferedInputStream in;
  18. FileOutputStream file;
  19. int count; //图片文件名序号
  20. FileNumber num=new FileNumber();//图片文件名序号类,num为对象
  21. count=num.NumberReadFromFile();//获取图片文件序号
  22. try {
  23. System.out.println("获取网络图片");
  24. String fileName = (String.valueOf(count)).concat(httpUrl.substring(httpUrl.lastIndexOf(".")));//图片文件序号加上图片的后缀名,后缀名用了String内的一个方法来获得
  25. //httpUrl.substring(httpUrl.lastIndexOf("/"));//这样获得的文件名即是图片链接里图片的名字
  26. String filePath = "d:/image/";//图片存储的位置
  27. url = new URL(httpUrl);
  28. in = new BufferedInputStream(url.openStream());
  29. file = new FileOutputStream(new File(filePath+fileName));
  30. int t;
  31. while ((t = in.read()) != -1) {
  32. file.write(t);
  33. }
  34. file.close();
  35. in.close();
  36. System.out.println("图片获取成功");
  37. count=count+1;//图片文件序号加1
  38. num.NumberWriteToFile(count);//将图片名序号保存
  39. } catch (MalformedURLException e) {
  40. e.printStackTrace();
  41. } catch (FileNotFoundException e) {
  42. e.printStackTrace();
  43. } catch (IOException e) {
  44. e.printStackTrace();
  45. }
  46. }
  47. //获取网页的代码保存在String格式的Content中
  48. public String getHtmlCode(String httpUrl) throws IOException {
  49. String content ="";
  50. URL uu = new URL(httpUrl); // 创建URL类对象
  51. BufferedReader ii = new BufferedReader(new InputStreamReader(uu
  52. .openStream())); // //使用openStream得到一输入流并由此构造一个BufferedReader对象
  53. String input;
  54. while ((input = ii.readLine()) != null) { // 建立读取循环,并判断是否有读取值
  55. content += input;
  56. }
  57. ii.close();
  58. return content;
  59. }
  60. //分析网页代码,找到匹配的网页图片地址
  61. public void get(String url) throws IOException {
  62. String searchImgReg = "(?x)(src|SRC|background|BACKGROUND)=('|\")/?(([\\w-]+/)*([\\w-]+\\.(jpg|JPG|png|PNG|gif|GIF)))('|\")";//用于在网页代码Content中查找匹配的图片链接。
  63. String searchImgReg2 = "(?x)(src|SRC|background|BACKGROUND)=('|\")(http://([\\w-]+\\.)+[\\w-]+(:[0-9]+)*(/[\\w-]+)*(/[\\w-]+\\.(jpg|JPG|png|PNG|gif|GIF)))('|\")";
  64. String content = this.getHtmlCode(url);//this指对象gcp,在此地调用获取网页代码,getHtmlCode方法
  65. //System.out.println(content); //输出的content将是一个连续的字符串。
  66. Pattern pattern = Pattern.compile(searchImgReg);//java.util.regex.Pattern
  67. Matcher matcher = pattern.matcher(content); //java.util.regex.Matcher
  68. while (matcher.find()) {
  69. System.out.println(matcher.group(3));//输出图片链接地址到屏幕
  70. // System.out.println(url);
  71. this.getHtmlPicture(matcher.group(3));//对象调用getHtmlPicture从网上下载并输出图片文件到指定目录
  72. }
  73. pattern = Pattern.compile(searchImgReg2);
  74. matcher = pattern.matcher(content);
  75. while (matcher.find()) {
  76. System.out.println(matcher.group(3));
  77. this.getHtmlPicture(matcher.group(3));
  78. }
  79. // searchImgReg =
  80. // "(?x)(src|SRC|background|BACKGROUND)=('|\")/?(([\\w-]+/)*([\\w-]+\\.(jpg|JPG|png|PNG|gif|GIF)))('|\")";
  81. }
  82. //主函数url网页的地址
  83. public static void main(String[] args) throws IOException {
  84. String url = "http://www.baidu.com";
  85. GetContentPicture gcp = new GetContentPicture();
  86. gcp.get(url);
  87. }
  88. }

Java代码 封装抓取网页信息的实例
  1. package cn.mypic;
  2. import java.io.*;
  /**
   * Persists a single integer counter in {@code d:/image/number.txt}.
   *
   * <p>The value is written with {@link DataOutputStream#writeInt(int)} and read
   * back with {@link DataInputStream#readInt()}, so the file is binary, not text.
   */
  public class FileNumber {

      /** Directory that holds the counter file. */
      private static final File STORAGE_DIR = new File("d:/image");

      /** Name of the counter file inside {@link #STORAGE_DIR}. */
      private static final String COUNTER_FILE = "number.txt";

      /**
       * Stores {@code x} as the new counter value, replacing any previous one.
       * The storage directory is created when missing. Failures are reported on
       * stderr and not rethrown.
       *
       * @param x value to persist
       */
      public void NumberWriteToFile(int x) {
          // If creation fails, opening the file below reports the problem.
          STORAGE_DIR.mkdirs();
          File target = new File(STORAGE_DIR, COUNTER_FILE);
          try (DataOutputStream out = new DataOutputStream(new FileOutputStream(target))) {
              out.writeInt(x);
          } catch (IOException e) {
              // FileNotFoundException is an IOException and is reported the same way.
              System.err.println(e);
          }
      }

      /**
       * Reads the stored counter value and echoes it to stdout.
       *
       * @return the stored value, or 0 when the file is missing or unreadable
       *         (the failure is reported on stderr)
       */
      public int NumberReadFromFile() {
          int value = 0;
          File source = new File(STORAGE_DIR, COUNTER_FILE);
          try (DataInputStream in = new DataInputStream(new FileInputStream(source))) {
              value = in.readInt();
              System.out.println(value);
          } catch (IOException e) {
              System.err.println(e);
          }
          return value;
      }

      /** Intentionally empty entry point, kept for compatibility. */
      public static void main(String args[]) {
      }
  }

读书人网 >移动开发

热点推荐