Parse all links out of a page with a regular expression, then fetch the content of each link.
The main method of the Main1 class below collects all the links; the HttpClient it creates is backed by a thread-safe connection manager, so connection state is kept and the client can be shared by the worker threads.
package com.logistics;

import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.HttpVersion;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.conn.ClientConnectionManager;
import org.apache.http.conn.params.ConnManagerParams;
import org.apache.http.conn.scheme.PlainSocketFactory;
import org.apache.http.conn.scheme.Scheme;
import org.apache.http.conn.scheme.SchemeRegistry;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.conn.tsccm.ThreadSafeClientConnManager;
import org.apache.http.params.BasicHttpParams;
import org.apache.http.params.HttpParams;
import org.apache.http.params.HttpProtocolParams;

public class Main1 {

    public static void main(String[] args) throws Exception {
        // Create and initialize HTTP parameters
        HttpParams params = new BasicHttpParams();
        ConnManagerParams.setMaxTotalConnections(params, 10);
        HttpProtocolParams.setVersion(params, HttpVersion.HTTP_1_1);

        // Create and initialize the scheme registry
        SchemeRegistry schemeRegistry = new SchemeRegistry();
        schemeRegistry.register(
                new Scheme("http", PlainSocketFactory.getSocketFactory(), 80));

        // Thread-safe connection manager, so one client can serve all spider threads
        ClientConnectionManager cm = new ThreadSafeClientConnManager(params, schemeRegistry);
        HttpClient client = new DefaultHttpClient(cm, params);

        // Fetch the index page
        HttpGet get = new HttpGet("http://localhost:8080/docs/");
        HttpResponse response = client.execute(get);
        HttpEntity entity = response.getEntity();

        ByteArrayOutputStream stream = new ByteArrayOutputStream();
        if (entity != null) {
            InputStream is = entity.getContent();
            byte[] b = new byte[1024];
            int len;
            // Write only the bytes actually read; writing the whole buffer
            // every time would append stale data on the final partial read
            while ((len = is.read(b)) != -1) {
                stream.write(b, 0, len);
            }
            is.close();
        }

        // Extract every *.html link from the page body
        Pattern pattern = Pattern.compile("\\w+\\.html");
        Matcher matcher = pattern.matcher(stream.toString("utf-8"));
        ArrayList<String> list = new ArrayList<String>();
        while (matcher.find()) {
            list.add("http://localhost:8080/docs/" + matcher.group());
        }

        // start() runs each fetch on its own thread; calling run() here, as the
        // original code did, would execute everything sequentially on the main thread
        for (int i = 0; i < list.size(); i++) {
            new SpiderThread(client, new HttpGet(list.get(i)), i + 1).start();
        }
    }
}

Each link's content is then fetched by a thread:
package com.logistics;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.protocol.BasicHttpContext;
import org.apache.http.protocol.HttpContext;
import org.apache.http.util.EntityUtils;

public class SpiderThread extends Thread {

    private final HttpClient httpClient;
    private final HttpContext context;
    private final HttpGet httpGet;
    private final int id;

    public SpiderThread(HttpClient httpClient, HttpGet httpGet, int id) {
        this.httpClient = httpClient;
        this.context = new BasicHttpContext();
        this.httpGet = httpGet;
        this.id = id;
    }

    /**
     * Executes the GET request and prints some status information.
     */
    @Override
    public void run() {
        long start = System.currentTimeMillis();
        try {
            // Pass the per-thread context; the original code created it but never used it
            HttpResponse response = httpClient.execute(httpGet, context);
            HttpEntity entity = response.getEntity();
            if (entity != null) {
                // Consuming the entity releases the connection back to the pool
                byte[] bytes = EntityUtils.toByteArray(entity);
                // System.out.println(new String(bytes, "utf-8"));
                System.out.println(httpGet.getURI().getPath());
            }
        } catch (Exception e) {
            httpGet.abort();
            System.out.println(id + " - error: " + e);
        }
        long end = System.currentTimeMillis();
        System.out.println(id + " -- elapsed: " + (end - start) + " ms");
    }
}
You shouldn't put the question that way, because HtmlParser wasn't built for extracting page links; it is mainly for parsing and correcting HTML. If you're interested in web-page parsing, look into the HtmlParser documentation, since it is arguably the best HTML parsing component available for Java today.
HtmlParser
We're using this in our current project as well, to crawl pages for logging!
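That said, HtmlParser handles link extraction well. Below is a minimal sketch, assuming the HtmlParser jar (org.htmlparser, e.g. 1.6) is on the classpath; the class name is mine, and the URL reuses the local one from the example above. Unlike the regex approach, it only matches real anchor tags:

package com.logistics;

import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;

public class HtmlParserLinkDemo {

    public static void main(String[] args) throws Exception {
        // Same local URL as the HttpClient example above (an assumption)
        Parser parser = new Parser("http://localhost:8080/docs/");
        // Keep only <a> tags instead of regex-matching the raw markup
        NodeList links = parser.extractAllNodesThatMatch(
                new NodeClassFilter(LinkTag.class));
        for (int i = 0; i < links.size(); i++) {
            LinkTag link = (LinkTag) links.elementAt(i);
            System.out.println(link.getLink());
        }
    }
}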
Compared with HtmlParser, I'd recommend jsoup.
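For comparison, the same link extraction with jsoup, as a minimal sketch (assuming the jsoup jar is on the classpath; the class name and URL are placeholders matching the examples above):

package com.logistics;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class JsoupLinkDemo {

    public static void main(String[] args) throws Exception {
        // Same local URL as the examples above (an assumption)
        Document doc = Jsoup.connect("http://localhost:8080/docs/").get();
        // select("a[href]") finds anchors; "abs:href" resolves relative URLs
        for (Element link : doc.select("a[href]")) {
            System.out.println(link.attr("abs:href"));
        }
    }
}

Note that jsoup also fetches the page itself here, so the HttpClient setup boilerplate from the first example disappears entirely.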