通过htmlParser抓取百度的相关内容
最近这两天我做了个“最新电影网”视频网站,内容主要是从土豆抓取来的,所以内容页只是一个框架,不便于搜索引擎抓取。于是我就想加些相关内容,像这样的内容:
这些相关内容是我通过百度搜索关键字来填充的。下面就是我通过 htmlParser 抓取的代码。
public?class?BaiduResultAction?extends?BaseAction?
{
????public?static?final?Logger?logger?=?Logger
????????????.getLogger(BaiduResultAction.class);
????/**?*//**
?????*?组装新闻
?????*?
?????*?@param?url
?????*?@return
?????*/
????public?String?compNews(String?url)?
{
????????String?returnContent?=?null;
????????try?
{
????????????ParserModel?parserModel?=?new?ParserModel();
????????????//table?的抓取标签
????????????String?content?=?"border=\"0\"?cellpadding=\"0\"?cellspacing=\"0\"";
????????????parserModel.setContent(content);
????????????NodeClassNameFilter?contentNodeClassNameFilter?=?new?NodeClassNameFilter(
????????????????????TableTag.class,?parserModel);
????????????NodeList?contentList?=?getAllNodeList(url,
????????????????????contentNodeClassNameFilter);
????????????//?对table的处理?只取第一个table中的一项记录
????????????//如果全部抓取内容,则要去掉最后一个break;
????????????for?(int?i?=?1;?i?<?contentList.size();?i++)?
{
????????????????if?(contentList.elementAt(i)?instanceof?TableTag)?
{
????????????????????TableTag?tableContent?=?(TableTag)?contentList
????????????????????????????.elementAt(i);
????????????????????int?rowCount?=?tableContent.getRowCount();
????????????????????TableRow[]?arrRows?=?tableContent.getRows();
????????????????????for?(int?j?=?0;?j?<?arrRows.length;?j++)?
{
????????????????????????TableRow?tableRow?=?arrRows[j];
????????????????????????TableColumn[]?arrColumm?=?tableRow.getColumns();
????????????????????????for?(int?k?=?0;?k?<?arrColumm.length;?k++)?
{
????????????????????????????String?columContent?=?arrColumm[k].toHtml();
????????????????????????????if(columContent?!=?null)
{
????????????????????????????????String[]?split?=?columContent.split("<br>");
????????????????????????????????if(split.length>2)
????????????????????????????????????returnContent?=?split[1].substring(0,split[1].length()-4);
????????????????????????????}
????????????????????????????break;
????????????????????????}
????????????????????}
????????????????}
????????????????break;
????????????}????????
????????}?catch?(IllegalArgumentException?e)?
{
????????????//?TODO?Auto-generated?catch?block
????????????e.printStackTrace();
????????????return?null;
????????}?catch?(Exception?e)?
{
????????????//?TODO?Auto-generated?catch?block
????????????e.printStackTrace();
????????}
????????return?returnContent;
????}
????public?static?void?main(String[]?args)?throws?Exception?
{
????????BaiduResultAction?action?=?new?BaiduResultAction();
????????//抓取sohu的内容.通过百度
????????String?url?=?"http://www.baidu.com/s?wd=%BA%DA%BF%CD%B5%DB%B9%FAII+11%28112%29++site%3Asohu.com";
????????//?String?url?=?"http://bbs.hoopchina.com/htm_data/96/0712/274754.html";
????????//?List<String>?hrefList?=?sinaAction.parseLink(url,?getIndexFilter());
????????/**//*?logger.debug(sinaAction.compNews(url));?*/
????????action.compNews(url);
????}
}
效果如下:http://www.tondou.cn/c/2008-05-12/314146
public?class?BaiduResultAction?extends?BaseAction?
????/**?*//**