lucene如何抽取html网页
要解析html页面 就要对html中的标签做处理
先准备几个工具类
package com.cs.parser.util;import org.htmlparser.Node;public class PageContent { private StringBuffer textBuffer; private int number; private Node node; public Node getNode() {return node;}public void setNode(Node node) {this.node = node;}public int getNumber() { return number; } public void setNumber(int number) { this.number = number; } public StringBuffer getTextBuffer() { return textBuffer; } public void setTextBuffer(StringBuffer textBuffer) { this.textBuffer = textBuffer; }}package com.cs.parser.util;public class TableValid { private int trnum; private int tdnum; private int linknum; private int textnum; private int scriptnum; public int getScriptnum() { return scriptnum; } public void setScriptnum(int scriptnum) { this.scriptnum = scriptnum; } public int getLinknum() { return linknum; } public void setLinknum(int linknum) { this.linknum = linknum; } public int getTdnum() { return tdnum; } public void setTdnum(int tdnum) { this.tdnum = tdnum; } public int getTextnum() { return textnum; } public void setTextnum(int textnum) { this.textnum = textnum; } public int getTrnum() { return trnum; } public void setTrnum(int trnum) { this.trnum = trnum; }}package com.cs.parser.util;public class TableColumnValid { int tdNum; boolean valid;public int getTdNum() {return tdNum;}public void setTdNum(int tdNum) {this.tdNum = tdNum;}public boolean isValid() {return valid;}public void setValid(boolean valid) {this.valid = valid;} }接下来看看如何解析html页面
加入htmlparser.jar包
package com.cs;public interface Parsable {public String getTitle() ;public String getContent() ;public String getSummary() ;}package com.cs;import java.io.BufferedReader;import java.io.File;import java.io.FileInputStream;import java.io.FileNotFoundException;import java.io.IOException;import java.io.InputStreamReader;import java.util.ArrayList;import java.util.Iterator;import java.util.List;import org.htmlparser.Node;import org.htmlparser.NodeFilter;import org.htmlparser.Parser;import org.htmlparser.filters.NodeClassFilter;import org.htmlparser.nodes.TagNode;import org.htmlparser.nodes.TextNode;import org.htmlparser.tags.Div;import org.htmlparser.tags.ImageTag;import org.htmlparser.tags.LinkTag;import org.htmlparser.tags.ParagraphTag;import org.htmlparser.tags.ScriptTag;import org.htmlparser.tags.SelectTag;import org.htmlparser.tags.Span;import org.htmlparser.tags.StyleTag;import org.htmlparser.tags.TableColumn;import org.htmlparser.tags.TableHeader;import org.htmlparser.tags.TableRow;import org.htmlparser.tags.TableTag;import org.htmlparser.tags.TitleTag;import org.htmlparser.util.NodeIterator;import org.htmlparser.util.NodeList;import org.htmlparser.util.ParserException;import com.cs.parser.util.PageContent;import com.cs.parser.util.TableColumnValid;import com.cs.parser.util.TableValid; public class EasyHtmlParser implements Parsable { protected static final String lineSign = System.getProperty( "line.separator"); protected static final int lineSign_size = lineSign.length();private File file ;private String content ;private String summary ;private String title ;public static void main(String[] args) {EasyHtmlParser eParser = new EasyHtmlParser(new File("E:\\EclipseStudyWorkspace\\LuceneParse\\fileSource\\www.htm")) ;System.out.println("html content : "+eParser.getContent()) ;}public EasyHtmlParser(File file) {this.file = file ;}private String getString() {try {BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(file))) ;String html = "" ;String str = null ;while ((str = br.readLine())!= null ) {html += str ;}return html ;} catch (FileNotFoundException e) {// TODO Auto-generated catch blocke.printStackTrace();} catch (IOException e) {// TODO Auto-generated catch blocke.printStackTrace();}return null ;}public synchronized String getContent() {if (content != null ) {return content ;}String html = this.getString() ;Parser parser = new Parser() ; try { parser.setInputHTML(html) ; for (NodeIterator e = parser.elements(); e.hasMoreNodes();){ Node node = (Node) e.nextNode(); PageContent context = new PageContent(); context.setNumber(0); context.setTextBuffer(new StringBuffer()); //抓取出内容 extractHtml(node, context, ""); StringBuffer testContext = context.getTextBuffer(); //System.out.println(testContext); content = testContext.toString() ; } if (content == null ) { content = "" ; } if (content.length() < 200) { summary = content ; }else { summary = content.substring(0,200) ; } NodeFilter filter = new NodeClassFilter(TitleTag.class) ; parser.reset() ; NodeList titleNodes = parser.extractAllNodesThatMatch(filter) ; if (titleNodes != null && titleNodes.elementAt(0) != null){ title = titleNodes.elementAt(0).toPlainTextString() ; }else{ title = "" ; } /* System.out.println(file.getAbsolutePath()+" "+"title:"+title); System.out.println(file.getAbsolutePath()+" "+"content:"+content); System.out.println(file.getAbsolutePath()+" "+"summary:"+summary); */} catch (ParserException e1) {// TODO Auto-generated catch blocke1.printStackTrace();} catch (Exception e) {// TODO Auto-generated catch blocke.printStackTrace();}return content;}public String getSummary() {if (summary != null) {return summary ;}if (content == null ) {getContent() ; }return summary;}public String getTitle() {if (title != null) {return title ;}if (content == null ) {getContent() ; }return "";} protected List extractHtml(Node nodeP, PageContent pageContent, String siteUrl) throws Exception { NodeList nodeList = nodeP.getChildren(); boolean bl = false; if ((nodeList == null) || (nodeList.size() == 0)) { if (nodeP instanceof ParagraphTag) { ArrayList tableList = new ArrayList(); StringBuffer temp = new StringBuffer(); temp.append("<p style=\"TEXT-INDENT: 2em\">"); tableList.add(temp); temp = new StringBuffer(); temp.append("</p>").append(lineSign); tableList.add(temp); return tableList; } return null; } if ((nodeP instanceof TableTag) || (nodeP instanceof Div)) { bl = true; } if (nodeP instanceof ParagraphTag) { ArrayList tableList = new ArrayList(); StringBuffer temp = new StringBuffer(); temp.append("<p style=\"TEXT-INDENT: 2em\">"); tableList.add(temp); extractParagraph(nodeP, siteUrl, tableList); temp = new StringBuffer(); temp.append("</p>").append(lineSign); tableList.add(temp); return tableList; } ArrayList tableList = new ArrayList(); try { for (NodeIterator e = nodeList.elements(); e.hasMoreNodes();) { Node node = (Node) e.nextNode(); if (node instanceof LinkTag) { tableList.add(node); setLinkImg(node, siteUrl); } else if (node instanceof ImageTag) { ImageTag img = (ImageTag) node; if (img.getImageURL().toLowerCase().indexOf("http://") < 0) { img.setImageURL(siteUrl + img.getImageURL()); } else { img.setImageURL(img.getImageURL()); } tableList.add(node); } else if (node instanceof ScriptTag || node instanceof StyleTag || node instanceof SelectTag) { } else if (node instanceof TextNode) { if (node.getText().length() > 0) { StringBuffer temp = new StringBuffer(); String text = collapse(node.getText() .replaceAll(" ", "") .replaceAll(" ", "")); temp.append(text.trim()); tableList.add(temp); } } else { if (node instanceof TableTag || node instanceof Div) { TableValid tableValid = new TableValid(); isValidTable(node, tableValid); if (tableValid.getTrnum() > 2) { tableList.add(node); continue; } } List tempList = extractHtml(node, pageContent, siteUrl); if ((tempList != null) && (tempList.size() > 0)) { Iterator ti = tempList.iterator(); while (ti.hasNext()) { tableList.add(ti.next()); } } } } } catch (Exception e) { return null; } if ((tableList != null) && (tableList.size() > 0)) { if (bl) { StringBuffer temp = new StringBuffer(); Iterator ti = tableList.iterator(); int wordSize = 0; StringBuffer node; int status = 0; StringBuffer lineStart = new StringBuffer( "<p style=\"TEXT-INDENT: 2em\">"); StringBuffer lineEnd = new StringBuffer("</p>" + lineSign); while (ti.hasNext()) { Object k = ti.next(); if (k instanceof LinkTag) { if (status == 0) { temp.append(lineStart); status = 1; } node = new StringBuffer(((LinkTag) k).toHtml()); temp.append(node); } else if (k instanceof ImageTag) { if (status == 0) { temp.append(lineStart); status = 1; } node = new StringBuffer(((ImageTag) k).toHtml()); temp.append(node); } else if (k instanceof TableTag) { if (status == 0) { temp.append(lineStart); status = 1; } node = new StringBuffer(((TableTag) k).toHtml()); temp.append(node); } else if (k instanceof Div) { if (status == 0) { temp.append(lineStart); status = 1; } node = new StringBuffer(((Div) k).toHtml()); temp.append(node); } else { node = (StringBuffer) k; if (status == 0) { if (node.indexOf("<p") < 0) { temp.append(lineStart); temp.append(node); wordSize = wordSize + node.length(); status = 1; } else { temp.append(node); status = 1; } } else if (status == 1) { if (node.indexOf("</p") < 0) { if (node.indexOf("<p") < 0) { temp.append(node); wordSize = wordSize + node.length(); } else { temp.append(lineEnd); temp.append(node); status = 1; } } else { temp.append(node); status = 0; } } } } if (status == 1) { temp.append(lineEnd); } if (wordSize > pageContent.getNumber()) { pageContent.setNumber(wordSize); pageContent.setTextBuffer(temp); } return null; } else { return tableList; } } return null;}/** * 提取段落中的内容 * @param nodeP * @param siteUrl * @param tableList * @return */ private List extractParagraph(Node nodeP, String siteUrl, List tableList) { NodeList nodeList = nodeP.getChildren(); if ((nodeList == null) || (nodeList.size() == 0)) { if (nodeP instanceof ParagraphTag) { StringBuffer temp = new StringBuffer(); temp.append("<p style=\"TEXT-INDENT: 2em\">"); tableList.add(temp); temp = new StringBuffer(); temp.append("</p>").append(lineSign); tableList.add(temp); return tableList; } return null; } try { for (NodeIterator e = nodeList.elements(); e.hasMoreNodes();) { Node node = (Node) e.nextNode(); if (node instanceof ScriptTag || node instanceof StyleTag || node instanceof SelectTag) { } else if (node instanceof LinkTag) { tableList.add(node); setLinkImg(node, siteUrl); } else if (node instanceof ImageTag) { ImageTag img = (ImageTag) node; if (img.getImageURL().toLowerCase().indexOf("http://") < 0) { img.setImageURL(siteUrl + img.getImageURL()); } else { img.setImageURL(img.getImageURL()); } tableList.add(node); } else if (node instanceof TextNode) { if (node.getText().trim().length() > 0) { String text = collapse(node.getText() .replaceAll(" ", "") .replaceAll(" ", "")); StringBuffer temp = new StringBuffer(); temp.append(text); tableList.add(temp); } } else if (node instanceof Span) { StringBuffer spanWord = new StringBuffer(); getSpanWord(node, spanWord); if ((spanWord != null) && (spanWord.length() > 0)) { String text = collapse(spanWord.toString() .replaceAll(" ", "") .replaceAll(" ", "")); StringBuffer temp = new StringBuffer(); temp.append(text); tableList.add(temp); } } else if (node instanceof TagNode) { String tag = node.toHtml(); if (tag.length() <= 10) { tag = tag.toLowerCase(); if ((tag.indexOf("strong") >= 0) || (tag.indexOf("b") >= 0)) { StringBuffer temp = new StringBuffer(); temp.append(tag); tableList.add(temp); } } else { if (node instanceof TableTag || node instanceof Div) { TableValid tableValid = new TableValid(); isValidTable(node, tableValid); if (tableValid.getTrnum() > 2) { tableList.add(node); continue; } } extractParagraph(node, siteUrl, tableList); } } } } catch (Exception e) { return null; } return tableList; } protected void getSpanWord(Node nodeP, StringBuffer spanWord) { NodeList nodeList = nodeP.getChildren(); try { for (NodeIterator e = nodeList.elements(); e.hasMoreNodes();) { Node node = (Node) e.nextNode(); if (node instanceof ScriptTag || node instanceof StyleTag || node instanceof SelectTag) { } else if (node instanceof TextNode) { spanWord.append(node.getText()); } else if (node instanceof Span) { getSpanWord(node, spanWord); } else if (node instanceof ParagraphTag) { getSpanWord(node, spanWord); } else if (node instanceof TagNode) { String tag = node.toHtml().toLowerCase(); if (tag.length() <= 10) { if ((tag.indexOf("strong") >= 0) || (tag.indexOf("b") >= 0)) { spanWord.append(tag); } } } } } catch (Exception e) { } return; } /** * 判断TABLE是否是表单 * @param nodeP * @return */ private void isValidTable(Node nodeP, TableValid tableValid) { NodeList nodeList = nodeP.getChildren(); /**如果该表单没有子节点则返回**/ if ((nodeList == null) || (nodeList.size() == 0)) { return; } try { for (NodeIterator e = nodeList.elements(); e.hasMoreNodes();) { Node node = (Node) e.nextNode(); /**如果子节点本身也是表单则返回**/ if (node instanceof TableTag || node instanceof Div) { return; } else if (node instanceof ScriptTag || node instanceof StyleTag || node instanceof SelectTag) { return; } else if (node instanceof TableColumn) { return; } else if (node instanceof TableRow) { TableColumnValid tcValid = new TableColumnValid(); tcValid.setValid(true); findTD(node, tcValid); if (tcValid.isValid()) { if (tcValid.getTdNum() < 2) { if (tableValid.getTdnum() > 0) { return; } else { continue; } } else { if (tableValid.getTdnum() == 0) { tableValid.setTdnum(tcValid.getTdNum()); tableValid.setTrnum(tableValid.getTrnum() + 1); } else { if (tableValid.getTdnum() == tcValid.getTdNum()) { tableValid.setTrnum(tableValid.getTrnum() + 1); } else { return; } } } } } else { isValidTable(node, tableValid); } } } catch (Exception e) { return; } return; } /** * 判断是否有效TR * @param nodeP * @param TcValid * @return */ private void findTD(Node nodeP, TableColumnValid tcValid) { NodeList nodeList = nodeP.getChildren(); /**如果该表单没有子节点则返回**/ if ((nodeList == null) || (nodeList.size() == 0)) { return; } try { for (NodeIterator e = nodeList.elements(); e.hasMoreNodes();) { Node node = (Node) e.nextNode(); /**如果有嵌套表单**/ if (node instanceof TableTag || node instanceof Div || node instanceof TableRow || node instanceof TableHeader) { tcValid.setValid(false); return; } else if (node instanceof ScriptTag || node instanceof StyleTag || node instanceof SelectTag) { tcValid.setValid(false); return; } else if (node instanceof TableColumn) { tcValid.setTdNum(tcValid.getTdNum() + 1); } else { findTD(node, tcValid); } } } catch (Exception e) { tcValid.setValid(false); return; } return; } protected String collapse(String string) { int chars; int length; int state; char character; StringBuffer buffer = new StringBuffer(); chars = string.length(); if (0 != chars) { length = buffer.length(); state = ((0 == length) || (buffer.charAt(length - 1) == ' ') || ((lineSign_size <= length) && buffer.substring(length - lineSign_size, length).equals(lineSign))) ? 0 : 1; for (int i = 0; i < chars; i++) { character = string.charAt(i); switch (character) { case '\u0020': case '\u0009': case '\u000C': case '\u200B': case '\u00a0': case '\r': case '\n': if (0 != state) { state = 1; } break; default: if (1 == state) { buffer.append(' '); } state = 2; buffer.append(character); } } } return buffer.toString(); } /** * 设置图象连接 * @param nodeP * @param siteUrl */ private void setLinkImg(Node nodeP, String siteUrl) { NodeList nodeList = nodeP.getChildren(); try { for (NodeIterator e = nodeList.elements(); e.hasMoreNodes();) { Node node = (Node) e.nextNode(); if (node instanceof ImageTag) { ImageTag img = (ImageTag) node; if (img.getImageURL().toLowerCase().indexOf("http://") < 0) { img.setImageURL(siteUrl + img.getImageURL()); } else { img.setImageURL(img.getImageURL()); } } } } catch (Exception e) { return; } return; }}现在可以成功的把html解析为纯文本了