一个通用html抽取类
首先放上正文。为了简化正则的写法,我把换行去掉了。下面以新浪微博为例。
然后就是我的代码了
这里是调用例子
这个是返回结果
看了,蛮不错的String beginRegex = "<div id=\"main\" style=\"background:none;\">";
String endRegex = "<iframe id=\"ifr\" width=\"100%\" height=\"0\" scrolling=\"no\" frameborder=\"0\" src=\"/n?cmd=2&page=http%3A%2F%2Fnews.yesky.com%2F346%2F30918346.shtml&pn=1&tn=newsrela&s_richtext=1\"></iframe>";
AnsjPaser ansjHtml = new AnsjPaser(beginRegex, endRegex, br,
AnsjPaser.TEXTTEGEX);
//SplitPaser paser = new SplitPaser();
//paser.splitString(beginRegex, endRegex, br);
// 正文抽取
beginRegex = "<p class=\"<.*?>\"><a href=\"<.*?>\" target=\"<.*?>\" mon=\"<.*?>\" >";
endRegex = "<a.*?>";
AnsjPaser ansjContent = new AnsjPaser(beginRegex, endRegex)
.addFilterRegex("<.*?>");
// 时间抽取
beginRegex = "<div class=\"abstract\">";
endRegex = "来源:<.*?></div>";
AnsjPaser ansjPubTime = new AnsjPaser(beginRegex, endRegex);
// 来源抽取
beginRegex = "<span class=\"resource\">(来源:";
endRegex = ")</span></p>";
AnsjPaser ansjFrom = new AnsjPaser(beginRegex, endRegex);
// 转发抽取
//beginRegex = "<strong lang=\"CD0023\" pop=\"true\">转发</strong><strong id=\"num_\\d*?\" rid=\"\\d*?\" type=\"rttCount\">\\(";
//endRegex = "\\)</strong>";
//AnsjPaser ansjRepeat = new AnsjPaser(beginRegex, endRegex);
//// 评论抽取
//beginRegex = "<strong lang=\"CL1004\">评论</strong><strong rid=\"\\d*?\" type=\"commtCount\">\\(";
//endRegex = "\\)</strong>";
//AnsjPaser ansjComment = new AnsjPaser(beginRegex, endRegex);
// 开始抽取
while (ansjHtml.hasNext()) {
String c1 = ansjHtml.getNext();
System.out
.println("=========================================================================");
System.out.println("时间:" + ansjPubTime.reset(c1).getText());
System.out.println("来源:" + ansjFrom.reset(c1).getText());
//String str = ansjRepeat.reset(c1).getText();
//System.out.println("转发:"
//+ ((str == null || "".equals(str)) ? "0" : str));
//str = ansjComment.reset(c1).getText();
//System.out.println("评论:"
//+ ((str == null || "".equals(str)) ? "0" : str));
//
//System.out.println("正文:" + ansjContent.reset(c1).getText());
//
}
}
}
/**
 * Small helper for downloading the text behind a URL as a single string.
 */
public class IOUtil {

    /**
     * Reads the whole resource at {@code urlString} and returns it as one string
     * with all line terminators removed (lines are concatenated on purpose so
     * that callers can run single-line regular expressions over the result).
     *
     * <p>Any URL scheme supported by {@link URL#openConnection()} is accepted,
     * not only {@code http}; e.g. a {@code file:} URL pointing at a saved page.
     * The text is decoded with the charset announced in the connection's
     * content type when there is one, otherwise with the platform default.
     *
     * @param urlString the URL to read
     * @return the content without line breaks; {@code null} if the URL is
     *         malformed or an I/O error occurs (the stack trace is printed);
     *         whatever was read so far if an unchecked exception interrupts
     *         the read
     * @throws Exception never thrown by the current implementation; kept in
     *         the signature so existing callers keep compiling
     * @throws IOException never thrown by the current implementation; see above
     */
    public static String getReader(String urlString) throws Exception,
            IOException {
        StringBuilder html = new StringBuilder();
        try {
            // Use the generic URLConnection: casting to HttpURLConnection would
            // fail with ClassCastException for file:, jar: and similar URLs.
            java.net.URLConnection conn = new URL(urlString).openConnection();
            // try-with-resources closes the reader (and the underlying stream)
            // even when reading fails half way through.
            try (java.io.InputStream in = conn.getInputStream();
                    BufferedReader br = new BufferedReader(
                            new InputStreamReader(in, charsetOf(conn)))) {
                String line;
                while ((line = br.readLine()) != null) {
                    html.append(line);
                }
            }
        } catch (IOException e) {
            // Best-effort contract: report the problem and signal failure with null.
            e.printStackTrace();
            return null;
        } catch (RuntimeException e) {
            // Unchecked failures are reported but the text read so far is returned.
            e.printStackTrace();
        }
        return html.toString();
    }

    /**
     * Picks the charset named by the {@code charset=} parameter of the
     * connection's content type, falling back to the platform default when the
     * parameter is missing or names an unknown/illegal charset.
     */
    private static java.nio.charset.Charset charsetOf(java.net.URLConnection conn) {
        final String key = "charset=";
        String contentType = conn.getContentType();
        if (contentType != null) {
            for (String part : contentType.split(";")) {
                String param = part.trim();
                if (!param.regionMatches(true, 0, key, 0, key.length())) {
                    continue;
                }
                String name = param.substring(key.length()).trim();
                // The value may be quoted: charset="utf-8"
                if (name.length() >= 2 && name.startsWith("\"") && name.endsWith("\"")) {
                    name = name.substring(1, name.length() - 1);
                }
                try {
                    return java.nio.charset.Charset.forName(name);
                } catch (IllegalArgumentException ignored) {
                    // Unknown or malformed charset name: use the default below.
                    break;
                }
            }
        }
        return java.nio.charset.Charset.defaultCharset();
    }

    /**
     * Demo entry point: downloads a news front page and prints it.
     */
    public static void main(String[] args) throws Exception, IOException {
        System.out.println(IOUtil.getReader("http://news.163.com"));
    }
}
我把你的html信息提取下来,重命名为王龙君.html,现在问题停在按路径读取时抛异常。 9 楼 chenpenghui 2012-01-19 好像不错的样子,先顶一个。但是我感觉用jsoup的话更方便点:网页是一棵树,直接取节点就好了,毕竟写正则比较麻烦。