读书人

lucene3.0+版本汉语分词测试+搜索结果

发布时间: 2012-08-27 21:21:57 作者: rapoo

lucene3.0+版本中文分词测试+搜索结果+创建索引测试

lucene3.0+版本中文分词测试+搜索结果+创建索引测试


import java.io.File;import java.io.IOException;import java.io.StringReader;import org.apache.lucene.analysis.Analyzer;import org.apache.lucene.analysis.TokenStream;import org.apache.lucene.analysis.tokenattributes.TermAttribute;import org.apache.lucene.document.Document;import org.apache.lucene.document.Field;import org.apache.lucene.document.Field.Index;import org.apache.lucene.document.Field.Store;import org.apache.lucene.index.CorruptIndexException;import org.apache.lucene.index.IndexWriter;import org.apache.lucene.index.IndexWriter.MaxFieldLength;import org.apache.lucene.queryParser.ParseException;import org.apache.lucene.queryParser.QueryParser;import org.apache.lucene.search.IndexSearcher;import org.apache.lucene.search.Query;import org.apache.lucene.search.ScoreDoc;import org.apache.lucene.search.TopDocs;import org.apache.lucene.search.highlight.Highlighter;import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;import org.apache.lucene.search.highlight.QueryScorer;import org.apache.lucene.search.highlight.SimpleFragmenter;import org.apache.lucene.search.highlight.SimpleHTMLFormatter;import org.apache.lucene.store.FSDirectory;import org.apache.lucene.store.LockObtainFailedException;import org.apache.lucene.util.Version;import org.wltea.analyzer.lucene.IKAnalyzer;public class AnalzyerTest {/** * lucene3.0开始已经抛弃了原来的分词方式,转而使用新的分词方式<br> * 本方法以SmartChineseAnalyzer为例,演示如何分词以及取得分词之后的term * http://blog.csdn.net/yjflinchong/article/details/7906116 * @throws Exception */public static void analysis() throws Exception {Analyzer analyzer = new IKAnalyzer();String string = "据外媒报道,菲律宾国防部长加斯明9日称,多种新式战机、船只将于年内陆续交付军方,菲国防实力将得到大幅增强。但加斯明同时强调,此次军备采购与黄岩岛争端无关。";StringReader reader = new StringReader(string);TokenStream ts = analyzer.tokenStream("", reader);TermAttribute termAttribute = ts.getAttribute(TermAttribute.class);while (ts.incrementToken()) {System.out.println(termAttribute.term() + "  ");}System.out.println();}/** * 建索引 * 在构造IndexWriter时必须使用Directory作为参数了 *  * @throws CorruptIndexException * @throws LockObtainFailedException * @throws IOException */private static void build() throws CorruptIndexException, LockObtainFailedException, IOException {String path = "index";IndexWriter writer = new IndexWriter(FSDirectory.open(new File(path)), new IKAnalyzer(), true, MaxFieldLength.LIMITED);Document document = new Document();document.add(new Field("text", "中国人民银行采取了一系列措施防止人民币升值,但是很遗憾,这些措施在今天看来其作用是微乎其微的。难道真的就没有什么别的措施防止人民币再次疯狂升值了吗?", Store.YES, Index.ANALYZED));writer.addDocument(document);writer.optimize();writer.close();}/** *  * @param keyword * @throws CorruptIndexException * @throws IOException * @throws ParseException * @throws InvalidTokenOffsetsException */private static void search(String keyword) throws CorruptIndexException, IOException, ParseException, InvalidTokenOffsetsException {Analyzer analyzer = new IKAnalyzer();QueryParser parser = new QueryParser(Version.LUCENE_30, "text", analyzer);IndexSearcher searcher = new IndexSearcher(FSDirectory.open(new File("index")));Query query = parser.parse(keyword);System.out.println(query);TopDocs topDocs = searcher.search(query, 10);ScoreDoc[] scoreDocs = topDocs.scoreDocs;System.out.println("hits:" + topDocs.totalHits);for (ScoreDoc scoreDoc : scoreDocs) {Document doc = searcher.doc(scoreDoc.doc);String text = doc.get("text");System.out.println(highlight(text, query, analyzer));}}/** * 高亮关键词 * http://blog.csdn.net/yjflinchong/article/details/7906116 * @param content *            需要高亮的内容 * @param query *            搜索时使用的Query对象 * @param analyzer *            分词器 * @return 高亮之后的文本 * @throws IOException * @throws InvalidTokenOffsetsException */private static String highlight(String content, Query query, Analyzer analyzer) throws IOException, InvalidTokenOffsetsException {SimpleHTMLFormatter formatter = new SimpleHTMLFormatter("<b>", "</b>");Highlighter highlighter = new Highlighter(formatter, new QueryScorer(query));highlighter.setTextFragmenter(new SimpleFragmenter(100));String resultString = highlighter.getBestFragment(analyzer.tokenStream("", new StringReader(content)), content);return resultString + "...";}public static void main(String[] args) throws Exception {analysis();build();search("人民币 升值");}}


注明来源:http://blog.csdn.net/yjflinchong/article/details/7906116

lucene3.0以上版本中文分词测试+搜索结果+创建索引测试



1楼yjflinchong昨天 09:08
新手入门很好的资料

读书人网 >编程

热点推荐