读书人

html抓取网页链接的例证

发布时间: 2012-10-29 10:03:53 作者: rapoo

html抓取网页链接的例子

package function.htmlparser;import org.htmlparser.Node;import org.htmlparser.NodeFilter;import org.htmlparser.Parser;import org.htmlparser.filters.AndFilter;import org.htmlparser.filters.HasAttributeFilter;import org.htmlparser.filters.HasParentFilter;import org.htmlparser.filters.TagNameFilter;import org.htmlparser.tags.LinkTag;import org.htmlparser.util.NodeIterator;import org.htmlparser.util.NodeList;import org.htmlparser.util.ParserException;public class Test {public void listAll(Parser parser){try {NodeIterator nodeIterator=parser.elements();while (nodeIterator.hasMoreNodes()){System.out.println("+++++++++++++++++++++");Node node=nodeIterator.nextNode();System.out.println("getText():"+node.getText());System.out.println("getHtml():"+node.toHtml());}} catch (ParserException e) {// TODO Auto-generated catch blocke.printStackTrace();}}public void filter(Parser parser){NodeList nodelist;//NodeFilter filterL = new TagNameFilter("a");NodeFilter filterS = new HasAttributeFilter("class","post-title");NodeFilter filterP= new HasParentFilter(filterS);try {nodelist=parser.parse(filterP);//Node node=nodelist.elementAft(0);//NodeFilter haf= new HasAttributeFilter("class","post-title");//获取相应的节点nodelist=nodelist.extractAllNodesThatMatch(filterP,true);for(int i=0;i<nodelist.size();i++){LinkTag link=(LinkTag)nodelist.elementAt(i).getFirstChild();System.out.println(link.getAttribute("href")+"/n");System.out.println(link.getStringText());//System.out.println(nodelist.elementAt(i).getFirstChild().getText()+"-----"+nodelist.elementAt(i).getFirstChild().toHtml());}} catch (ParserException e) {// TODO Auto-generated catch blocke.printStackTrace();}}public static void main(String[] args) {String urlStr="http://localhost:8080/tomfish88/error.jsp";Parser parser=new Parser();try {parser.setURL(urlStr);parser.setEncoding("gb2312");} catch (ParserException e) {// TODO Auto-generated catch blocke.printStackTrace();}Test test=new Test();test.filter(parser);}}?

html文件

?

?

?

<%--
  Sample page with a four-row table: two rows carry class="post-title" and
  each holds one link; a third, unclassed row also holds a link; the first
  row has only text.
  NOTE(review): there is no whitespace between the post-title <tr>, its <td>
  and the <a>; code that reads "first child" of these elements may depend on
  that - confirm before reformatting this markup.
--%>
<%@ page language="java" contentType="text/html; charset=GB18030"    pageEncoding="GB18030"%><!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"><html><head><meta http-equiv="Content-Type" content="text/html; charset=GB18030"><title>Insert title here</title></head><body>error!!!!!!<table><tr><td>td-c1</td></tr><tr class="post-title"><td><a href="http://www.fsd.com">连接1</a></td></tr><tr><td>td-cc1 <a href="http://www.fsd44444.com">连接3</a> </td></tr><tr class="post-title"><td><a href="http://www.fsd222222.com">连接2</a></td></tr></table></body></html>

?

?

?

java文件

?

?

?

?

1 楼 ningwuyu 2011-08-09 jar 咋没上传啊

读书人网 >CSS

热点推荐