2011.06.29——— Jsoup HttpClient 抓取网络上的图片
2011.06.29——— Jsoup HttpClient 抓取网络上的图片
参考:http://www.iteye.com/topic/1106648
http://www.ibm.com/developerworks/cn/java/j-lo-jsouphtml/index.html?ca=drs-
jsoup 官方网站:http://jsoup.org
需要的主要jar包
httpclient-4.0.1.jar jsoup-1.5.2.jar
主要代码 如下
Exmaple3.java
package com.th.spider.test;

import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.params.CoreConnectionPNames;
import org.apache.http.util.EntityUtils;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Small image scraper: parses pages with jsoup, downloads the images they
 * reference with Apache HttpClient and stores them under {@link #PIC_DIR}.
 *
 * <p>Three entry points are provided, each matching the markup of a different
 * kind of page: {@link #go(String)}, {@link #go2(String)} and
 * {@link #go3(String)}.
 */
public class Exmaple3 {

    private static final Log log = LogFactory.getLog(Exmaple3.class);

    /** Directory the downloaded images are written to. */
    private static final String PIC_DIR = "/home/li/pic";

    /** Connect and read timeout for image downloads, in milliseconds. */
    private static final int TIME_OUT = 5000;

    /** Pause between two consecutive worker-thread starts, in milliseconds. */
    private static final long START_INTERVAL_MS = 500;

    /**
     * Downloads every image matched by {@code div.piclist img[src]} on the
     * given page, one worker thread per image.
     *
     * @param url address of the gallery page
     * @throws Exception if the page cannot be fetched or the calling thread is
     *         interrupted while pausing between worker starts
     */
    static void go3(String url) throws Exception {
        Connection conn = Jsoup.connect(url);
        Document doc = conn.get();
        Elements links = doc.select("div.piclist img[src]");
        for (int i = 0; i < links.size(); i++) {
            // absUrl resolves relative src values against the page address.
            final String imgUrl = links.get(i).absUrl("src");
            if (imgUrl.length() == 0) {
                log.warn("Skipping image with unresolvable src: " + links.get(i).attr("src"));
                continue;
            }
            log.info(imgUrl);
            // Throttle so that the remote site is not hit by a burst of requests.
            Thread.sleep(START_INTERVAL_MS);
            new Thread(new Runnable() {
                public void run() {
                    try {
                        save(imgUrl);
                    } catch (Exception e) {
                        log.error("Failed to save image " + imgUrl, e);
                    }
                }
            }).start();
        }
    }

    /**
     * Follows every link matched by {@code div.cc a[href]} on the given index
     * page and, on a worker thread per link, downloads the images matched by
     * {@code div.mb_jjnr img[src]} on the linked page.
     *
     * @param url address of the index page
     * @throws Exception if the index page cannot be fetched or the calling
     *         thread is interrupted while pausing between worker starts
     */
    static void go2(String url) throws Exception {
        Connection conn = Jsoup.connect(url);
        Document doc = conn.get();
        Elements links = doc.select("div.cc a[href]");
        for (int i = 0; i < links.size(); i++) {
            // Resolve against the page address instead of prepending a fixed
            // host, so that absolute and document-relative hrefs work as well.
            final String dirUrl = links.get(i).absUrl("href");
            if (dirUrl.length() == 0) {
                log.warn("Skipping link with unresolvable href: " + links.get(i).attr("href"));
                continue;
            }
            log.info(dirUrl);
            Thread.sleep(START_INTERVAL_MS);
            new Thread(new Runnable() {
                public void run() {
                    try {
                        saveImagesOfPage(dirUrl);
                    } catch (Exception e) {
                        log.error("Failed to process page " + dirUrl, e);
                    }
                }
            }).start();
        }
    }

    /** Downloads, sequentially, the images matched by {@code div.mb_jjnr img[src]} on one page. */
    private static void saveImagesOfPage(String pageUrl) throws Exception {
        Document doc = Jsoup.connect(pageUrl).get();
        Elements images = doc.select("div.mb_jjnr img[src]");
        for (int j = 0; j < images.size(); j++) {
            String imgUrl = images.get(j).absUrl("src");
            if (imgUrl.length() == 0) {
                log.warn("Skipping image with unresolvable src: " + images.get(j).attr("src"));
                continue;
            }
            log.info(imgUrl);
            save(imgUrl);
        }
    }

    /**
     * Processes a forum-thread page: downloads every {@code <img class="zoom">}
     * image, whose real address is carried in its {@code file} attribute.
     *
     * <p>Example of the markup this method targets:
     * {@code <img src="static/image/common/none.gif"
     * file="data/attachment/forum/201105/08/174412nz3jq4z90s33s2t0.jpg" ... />}
     *
     * @param url address of the thread page
     * @throws Exception if the page cannot be fetched or an image cannot be written
     */
    static void go(String url) throws Exception {
        // Open the jsoup connection.
        Connection conn = Jsoup.connect(url);
        // The page is requested with POST, as in the original code.
        // NOTE(review): a plain GET may be sufficient - confirm against the target site.
        Document doc = conn.post();
        // Select every img tag with class=zoom.
        Elements imgs = doc.select("img[class=zoom]");
        for (int i = 0; i < imgs.size(); i++) {
            Element img = imgs.get(i);
            // The "file" attribute is relative; appending it to the page URL
            // (doc.baseUri()) would not give a valid address, so let jsoup
            // resolve it against the base URI instead.
            String picURL = img.absUrl("file");
            if (picURL.length() == 0) {
                log.warn("Skipping image with unresolvable file attribute: " + img.attr("file"));
                continue;
            }
            log.info(picURL);
            save(picURL);
        }
    }

    /**
     * Downloads one image and writes it into {@link #PIC_DIR}, creating that
     * directory if necessary. The file is named after the last path segment of
     * the URL. Nothing is written when the download yields no bytes.
     *
     * @param url absolute address of the image
     * @throws Exception if the target directory cannot be created or the file
     *         cannot be written
     */
    static void save(String url) throws Exception {
        String fileName = fileNameOf(url);
        if (fileName.length() == 0) {
            log.warn("Cannot derive a file name from [" + url + "], skipping");
            return;
        }

        byte[] bit = getByte(url);
        if (bit.length == 0) {
            return;
        }

        File dir = new File(PIC_DIR);
        // The second isDirectory() check covers another worker thread creating
        // the directory between our first check and our mkdirs() call.
        if (!dir.isDirectory() && !dir.mkdirs() && !dir.isDirectory()) {
            throw new IOException("Cannot create directory [" + dir.getPath() + "]");
        }

        File target = new File(dir, fileName);
        OutputStream out = new BufferedOutputStream(new FileOutputStream(target));
        try {
            out.write(bit);
            out.flush();
            log.info("Create File success! [" + target.getPath() + "]");
        } finally {
            out.close();
        }
    }

    /**
     * Extracts the last path segment of a URL, ignoring any query string or
     * fragment. Returns an empty string when no usable name is present.
     */
    private static String fileNameOf(String url) {
        String path = url;
        int cut = path.indexOf('?');
        if (cut >= 0) {
            path = path.substring(0, cut);
        }
        cut = path.indexOf('#');
        if (cut >= 0) {
            path = path.substring(0, cut);
        }
        // lastIndexOf returns -1 when there is no slash, so "+ 1" yields 0 and
        // the whole string is used instead of throwing.
        String name = path.substring(path.lastIndexOf('/') + 1);
        if (".".equals(name) || "..".equals(name)) {
            return "";
        }
        return name;
    }

    /**
     * Fetches the body of the given URL with an HTTP GET.
     *
     * @param uri address to download
     * @return the response body, or an empty array when the request fails, the
     *         status is not 200 or the response carries no entity
     * @throws Exception declared for compatibility with existing callers;
     *         request failures are logged and reported as an empty array
     */
    static byte[] getByte(String uri) throws Exception {
        HttpClient client = new DefaultHttpClient();
        client.getParams().setParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, TIME_OUT);
        // Without a socket (read) timeout a stalled server blocks the thread forever.
        client.getParams().setParameter(CoreConnectionPNames.SO_TIMEOUT, TIME_OUT);
        try {
            HttpGet get = new HttpGet(uri);
            HttpResponse response = client.execute(get);
            int status = response.getStatusLine().getStatusCode();
            if (status != HttpStatus.SC_OK) {
                log.warn("Unexpected HTTP status " + status + " for [" + uri + "]");
                return new byte[0];
            }
            HttpEntity entity = response.getEntity();
            if (entity != null) {
                return EntityUtils.toByteArray(entity);
            }
        } catch (Exception e) {
            // Best effort: one failed download must not abort the whole crawl.
            log.error("Download failed for [" + uri + "]", e);
        } finally {
            client.getConnectionManager().shutdown();
        }
        return new byte[0];
    }

    public static void main(String[] args) throws Exception {
        // Start crawling images.
        go2("http://www.3lian.com/gif/more/03/0301.html");
        // Alternative entry point for a single gallery page:
        // go3("http://www.ivsky.com/tupian/nvxing_gouwu_qingjing_v6969/");
    }
}