读书人

网页爬虫抓取URL简略实现

发布时间: 2012-12-30 10:43:15 作者: rapoo

网页爬虫抓取URL简单实现

关键字:网页爬虫抓取URL简单实现

//开始......

package com.ogilvy.sayes.util;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Hashtable;
import java.util.List;
import java.util.Map;

/**
 * Minimal web crawler: downloads a page, extracts the quoted {@code href} values that contain
 * "http", prints them, and follows them down to a caller-supplied depth.
 *
 * Author: long.tang
 */
public class SearchCrawler {

    /** Attribute prefix that introduces a link in the page source. */
    private static final String HREF_MARKER = "href=";

    /** A link is only considered when its value contains this text. */
    private static final String LINK_TYPE = "http";

    /** Links containing any of these fragments are not pages and are ignored. */
    private static final String[] SKIPPED_FRAGMENTS = {".css", ".ico", ".exe"};

    /** Charset used to decode downloaded pages. */
    private static final String PAGE_CHARSET = "GBK";

    /** Placeholder returned when a page cannot be downloaded. */
    private static final String NO_CONTENT = "nothing";

    /**
     * Downloads the resource at {@code url} and decodes its bytes as GBK.
     *
     * <p>Failures are reported on standard output / standard error and never propagate to the
     * caller, so one unreachable link does not abort a crawl.
     *
     * @param url address to download
     * @return the decoded body, or the literal string {@code "nothing"} when the download fails
     */
    public String myGetHttpFile2(String url) {
        // try-with-resources closes the stream even when a read fails half-way.
        try (InputStream in = new URL(url).openStream()) {
            ByteArrayOutputStream body = new ByteArrayOutputStream();
            byte[] chunk = new byte[8192];
            int count;
            while ((count = in.read(chunk)) != -1) {
                body.write(chunk, 0, count);
            }
            // Decode the raw bytes directly instead of going through an ISO-8859-1 char detour.
            return body.toString(PAGE_CHARSET);
        } catch (IOException | RuntimeException e) {
            // Best effort: links found on arbitrary pages are frequently malformed or unreachable.
            System.out.println("error>>>>");
            e.printStackTrace();
            return NO_CONTENT;
        }
    }

    /**
     * Scans {@code content} for links, prints each newly discovered link, and crawls the linked
     * pages recursively.
     *
     * <p>{@code depth} is decremented before anything else happens and the method returns when the
     * result is below 1, so {@code content} itself is scanned only for {@code depth >= 2}. Pages
     * are downloaded only when the next level will actually scan them. Each link is printed once
     * per call and is not crawled again unless it is reached with more remaining depth than before.
     *
     * @param content page source to scan; {@code null} is treated as an empty page
     * @param depth number of levels left, including the current one
     * @throws Exception declared for compatibility with existing callers
     */
    public void doit(String content, int depth) throws Exception {
        crawl(content, depth, new HashMap<String, Integer>());
    }

    /** Recursive worker for {@link #doit}; {@code expandedAt} maps each link to the largest remaining depth it was handled with. */
    private void crawl(String content, int depth, Map<String, Integer> expandedAt) {
        int remaining = depth - 1;
        if (remaining < 1 || content == null) {
            return;
        }
        for (String link : extractLinks(content)) {
            Integer earlier = expandedAt.get(link);
            if (earlier != null && earlier >= remaining) {
                continue; // already handled at least this deep; avoids cycles and repeat downloads
            }
            if (earlier == null) {
                System.out.println(link);
            }
            expandedAt.put(link, remaining);
            // With remaining == 1 the next level would return immediately, so skip the download.
            if (remaining > 1) {
                crawl(myGetHttpFile2(link), remaining, expandedAt);
            }
        }
    }

    /** Returns, in document order, every quoted {@code href} value in {@code content} that passes {@link #isCrawlable}. */
    private static List<String> extractLinks(String content) {
        List<String> links = new ArrayList<String>();
        int from = 0;
        while (true) {
            int marker = content.indexOf(HREF_MARKER, from);
            if (marker < 0) {
                break;
            }
            from = marker + HREF_MARKER.length();
            if (from >= content.length()) {
                break; // page ends right after "href="
            }
            char quote = content.charAt(from);
            if (quote != '"' && quote != '\'') {
                continue; // unquoted attribute values are not supported
            }
            int close = content.indexOf(quote, from + 1);
            if (close < 0) {
                continue; // unterminated value: nothing usable here
            }
            String url = content.substring(from + 1, close);
            if (isCrawlable(url)) {
                links.add(url);
            }
        }
        return links;
    }

    /** Tells whether {@code url} contains "http" and none of the skipped fragments. */
    private static boolean isCrawlable(String url) {
        if (url.indexOf(LINK_TYPE) < 0) {
            return false;
        }
        for (String fragment : SKIPPED_FRAGMENTS) {
            if (url.indexOf(fragment) >= 0) {
                return false;
            }
        }
        return true;
    }

    /** Crawls http://www.2345.com/ with a depth of 3 and prints the links that are found. */
    public static void main(String arg[]) {
        SearchCrawler search = new SearchCrawler();
        try {
            search.doit(search.myGetHttpFile2("http://www.2345.com/"), 3);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

}



//结束.....



读书人网 >互联网

热点推荐