lucene初探(二):创建索引,查询索引
package com.iris.scm.lucene.test;import java.io.File;import java.io.IOException;import java.util.ArrayList;import java.util.List;import org.apache.lucene.analysis.Analyzer;import org.apache.lucene.analysis.en.EnglishAnalyzer;import org.apache.lucene.document.Document;import org.apache.lucene.document.Field;import org.apache.lucene.document.Field.Index;import org.apache.lucene.document.Field.Store;import org.apache.lucene.index.CorruptIndexException;import org.apache.lucene.index.IndexReader;import org.apache.lucene.index.IndexWriter;import org.apache.lucene.index.IndexWriterConfig;import org.apache.lucene.queryParser.MultiFieldQueryParser;import org.apache.lucene.queryParser.ParseException;import org.apache.lucene.queryParser.QueryParser;import org.apache.lucene.search.IndexSearcher;import org.apache.lucene.search.Query;import org.apache.lucene.search.ScoreDoc;import org.apache.lucene.search.TopDocs;import org.apache.lucene.search.highlight.Formatter;import org.apache.lucene.search.highlight.Fragmenter;import org.apache.lucene.search.highlight.Highlighter;import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;import org.apache.lucene.search.highlight.QueryScorer;import org.apache.lucene.search.highlight.Scorer;import org.apache.lucene.search.highlight.SimpleFragmenter;import org.apache.lucene.search.highlight.SimpleHTMLFormatter;import org.apache.lucene.store.Directory;import org.apache.lucene.store.FSDirectory;import org.apache.lucene.util.Version;import org.wltea.analyzer.lucene.IKAnalyzer;import com.iris.scm.lucene.model.Publication;public class LuceneTest {// 分词器private Analyzer analyzerEn;private Analyzer analyzerZh;// 索引存放目录private Directory directoryZh;// 索引存放目录private Directory directoryEn;public static void main(String[] args) throws Exception {LuceneTest test = new LuceneTest();test.initDir();// test.createIndex();test.searchZhPub();test.searchEnPub();}/** * 初始化Analyzer和Directory. 
* * @throws IOException */public void initDir() throws IOException {// 建立一个标准分词器// Version.LUCENE_36 表示匹配Lucene3.6版本,使用英文分词解析工具analyzerEn = new EnglishAnalyzer(Version.LUCENE_36);analyzerZh = new IKAnalyzer();// 使用智能分词((IKAnalyzer) analyzerZh).setUseSmart(true);// 在当前路径下建立一个目录叫indexDirFile indexDirZh = new File("d:/lucene/LuceneTestZh");File indexDirEn = new File("d:/lucene/LuceneTestEn");// 创建索引目录directoryZh = FSDirectory.open(indexDirZh);directoryEn = FSDirectory.open(indexDirEn);}/** * 建立索引文件 * * @throws IOException */public void createIndex() throws IOException {// 获取文献信息// 中文Publication pubZh1 = new Publication();pubZh1.setId(123456L);pubZh1.setPublishYear(2010);pubZh1.setZhTitle("金刚石薄膜抗激光破坏研究");pubZh1.setZhAbstract("介绍了金刚石优异的光学和力学特性,对金刚石薄膜在从紫外到红外波段以及不同脉宽激光参数下的激光损伤行为和损伤阈值进行了评述。");// 英文Publication pubEn1 = new Publication();pubEn1.setId(123456L);pubEn1.setPublishYear(2010);pubEn1.setEnTitle("Laser induced damage for diamond films");pubEn1.setEnAbstract("The outstanding optical and mechanical properties of diamond films are summarized. ");// 中文Publication pubZh2 = new Publication();pubZh2.setId(68545L);pubZh2.setPublishYear(2009);pubZh2.setZhTitle("激光破坏金刚石薄膜研究");pubZh2.setZhAbstract("分析了不同激光工作参数对金刚石薄膜的激光损伤机理,认为石墨化导致晶格失稳是金刚石薄膜激光损伤的主要原因。金刚石薄膜石墨化有两种方式:垂直表面向体层方向石墨化和平行表面按分层的方式逐层石墨化。");// 英文Publication pubEn2 = new Publication();pubEn2.setId(68545L);pubEn2.setPublishYear(2009);pubEn2.setEnTitle("Laser destruction of the diamond thin films");pubEn2.setEnAbstract(" Laser damage for films irradiated with different wave lengths and pulse width are reviewed and the laser damage mechanism analyzed for different parameters. It is found that graphitization induced instability of the crystal lattice is the main reason for laser induced damage. There are two ways that lead to graphitized damage on the surface of diamond films under long and short laser pulses. 
For nanosecond or longer laser pul...");// 建立DocumentDocument docZh1 = new Document();// Store指定Field是否需要存储,Index指定Field是否需要分词索引docZh1.add(new Field("id", pubZh1.getId().toString(), Store.YES, Index.NOT_ANALYZED));docZh1.add(new Field("publish_year", pubZh1.getPublishYear().toString(), Store.YES, Index.NOT_ANALYZED));docZh1.add(new Field("zh_title", pubZh1.getZhTitle(), Store.YES, Index.ANALYZED));docZh1.add(new Field("zh_abstract", pubZh1.getZhAbstract(), Store.YES, Index.ANALYZED));// 建立DocumentDocument docZh2 = new Document();docZh2.add(new Field("id", pubZh2.getId().toString(), Store.YES, Index.NOT_ANALYZED));docZh2.add(new Field("publish_year", pubZh2.getPublishYear().toString(), Store.YES, Index.NOT_ANALYZED));docZh2.add(new Field("zh_title", pubZh2.getZhTitle(), Store.YES, Index.ANALYZED));docZh2.add(new Field("zh_abstract", pubZh2.getZhAbstract(), Store.YES, Index.ANALYZED));Document docEn1 = new Document();docEn1.add(new Field("id", pubEn1.getId().toString(), Store.YES, Index.NOT_ANALYZED));docEn1.add(new Field("publish_year", pubEn1.getPublishYear().toString(), Store.YES, Index.NOT_ANALYZED));docEn1.add(new Field("en_title", pubEn1.getEnTitle(), Store.YES, Index.ANALYZED));docEn1.add(new Field("en_abstract", pubEn1.getEnAbstract(), Store.YES, Index.ANALYZED));Document docEn2 = new Document();docEn2.add(new Field("id", pubEn2.getId().toString(), Store.YES, Index.NOT_ANALYZED));docEn2.add(new Field("publish_year", pubEn2.getPublishYear().toString(), Store.YES, Index.NOT_ANALYZED));docEn2.add(new Field("en_title", pubEn2.getEnTitle(), Store.YES, Index.ANALYZED));docEn2.add(new Field("en_abstract", pubEn2.getEnAbstract(), Store.YES, Index.ANALYZED));// 建立一个IndexWriter配置,指定匹配的版本,以及分词器IndexWriterConfig indexWriterConfigZh = new IndexWriterConfig(Version.LUCENE_36, analyzerZh);IndexWriterConfig indexWriterConfigEn = new IndexWriterConfig(Version.LUCENE_36, analyzerEn);// 创建IndexWriter,它负责索引的创建和维护IndexWriter indexWriterZh = new IndexWriter(directoryZh, 
indexWriterConfigZh);IndexWriter indexWriterEn = new IndexWriter(directoryEn, indexWriterConfigEn);// 把Document加入到索引中indexWriterZh.addDocument(docZh1);indexWriterZh.addDocument(docZh2);indexWriterEn.addDocument(docEn1);indexWriterEn.addDocument(docEn2);// 提交改变到索引,然后关闭indexWriterZh.close();indexWriterEn.close();}/** * 搜索文献中文内容. * * @throws ParseException * @throws CorruptIndexException * @throws IOException * @throws InvalidTokenOffsetsException */public void searchZhPub() throws ParseException, CorruptIndexException, IOException, InvalidTokenOffsetsException {// 搜索的关键词String queryKeyWord = "金刚石薄膜";// 创建查询分析器,把查询关键词转化为查询对象Query(单个Field中搜索)// 在标题的索引中搜索// QueryParser queryParser = new QueryParser(Version.LUCENE_36, "zh_title", analyzerZh);String[] fields = { "zh_title", "zh_abstract" };// (在多个Filed中搜索)QueryParser queryParser = new MultiFieldQueryParser(Version.LUCENE_36, fields, analyzerZh);Query query = queryParser.parse(queryKeyWord);// 获取访问索引的接口,进行搜索IndexReader indexReader = IndexReader.open(directoryZh);IndexSearcher indexSearcher = new IndexSearcher(indexReader);// TopDocs 搜索返回的结果TopDocs topDocs = indexSearcher.search(query, 100);// 只返回前100条记录int totalCount = topDocs.totalHits; // 搜索结果总数量System.out.println("搜索到的结果总数量为:" + totalCount);ScoreDoc[] scoreDocs = topDocs.scoreDocs; // 搜索的结果列表// 创建高亮器,使搜索的关键词突出显示Formatter formatter = new SimpleHTMLFormatter("<font color='red'>", "</font>");Scorer fragmentScore = new QueryScorer(query);Highlighter highlighter = new Highlighter(formatter, fragmentScore);Fragmenter fragmenter = new SimpleFragmenter(100);highlighter.setTextFragmenter(fragmenter);List<Publication> pubs = new ArrayList<Publication>();// 把搜索结果取出放入到集合中for (ScoreDoc scoreDoc : scoreDocs) {int docID = scoreDoc.doc;// 当前结果的文档编号float score = scoreDoc.score;// 当前结果的相关度得分System.out.println("score is : " + score);Document document = indexSearcher.doc(docID);Publication pubZh = new Publication();pubZh.setId(Long.parseLong(document.get("id")));// 高亮显示titleString zhTitle 
= document.get("zh_title");String highlighterTitle = highlighter.getBestFragment(analyzerZh, "zh_title", zhTitle);// 如果title中没有找到关键词if (highlighterTitle == null) {highlighterTitle = zhTitle;}pubZh.setZhTitle(highlighterTitle);// 高亮显示abstractString zhAbstract = document.get("zh_abstract");String highlighterAbstract = highlighter.getBestFragment(analyzerZh, "zh_abstract", zhAbstract);// 如果Abstract中没有找到关键词if (highlighterAbstract == null) {highlighterAbstract = zhAbstract;}pubZh.setZhAbstract(highlighterAbstract);pubZh.setPublishYear(Integer.parseInt(document.get("publish_year")));pubs.add(pubZh);}// 关闭indexReader.close();indexSearcher.close();for (Publication pub : pubs) {System.out.println("pub'id is : " + pub.getId());System.out.println("pub'publish year is : " + pub.getPublishYear());System.out.println("pub'title is : " + pub.getZhTitle());System.out.println("pub'abstract is : " + pub.getZhAbstract());}}/** * 搜索文献英文内容. * * @throws ParseException * @throws CorruptIndexException * @throws InvalidTokenOffsetsException */public void searchEnPub() throws ParseException, CorruptIndexException, IOException, InvalidTokenOffsetsException {// 搜索的关键词String queryKeyWord = "diamond films";// 创建查询分析器,把查询关键词转化为查询对象Query(单个Field中搜索)// 在标题的索引中搜索// QueryParser queryParser = new QueryParser(Version.LUCENE_36, "en_title", analyzerEn);String[] fields = { "en_title", "en_abstract" };// (在多个Filed中搜索)QueryParser queryParser = new MultiFieldQueryParser(Version.LUCENE_36, fields, analyzerEn);Query query = queryParser.parse(queryKeyWord);// 获取访问索引的接口,进行搜索IndexReader indexReader = IndexReader.open(directoryEn);IndexSearcher indexSearcher = new IndexSearcher(indexReader);// TopDocs 搜索返回的结果TopDocs topDocs = indexSearcher.search(query, 100);// 只返回前100条记录int totalCount = topDocs.totalHits; // 搜索结果总数量System.out.println("搜索到的结果总数量为:" + totalCount);ScoreDoc[] scoreDocs = topDocs.scoreDocs; // 搜索的结果列表// 创建高亮器,使搜索的关键词突出显示Formatter formatter = new SimpleHTMLFormatter("<font color='red'>", 
"</font>");Scorer fragmentScore = new QueryScorer(query);Highlighter highlighter = new Highlighter(formatter, fragmentScore);Fragmenter fragmenter = new SimpleFragmenter(100);highlighter.setTextFragmenter(fragmenter);List<Publication> pubs = new ArrayList<Publication>();// 把搜索结果取出放入到集合中for (ScoreDoc scoreDoc : scoreDocs) {int docID = scoreDoc.doc;// 当前结果的文档编号float score = scoreDoc.score;// 当前结果的相关度得分System.out.println("score is : " + score);Document document = indexSearcher.doc(docID);Publication pubEn = new Publication();pubEn.setId(Long.parseLong(document.get("id")));// 高亮显示titleString enTitle = document.get("en_title");String highlighterTitle = highlighter.getBestFragment(analyzerEn, "en_title", enTitle);// 如果title中没有找到关键词if (highlighterTitle == null) {highlighterTitle = enTitle;}pubEn.setEnTitle(highlighterTitle);// 高亮显示abstractString enAbstract = document.get("en_abstract");String highlighterAbstract = highlighter.getBestFragment(analyzerEn, "en_abstract", enAbstract);// 如果Abstract中没有找到关键词if (highlighterAbstract == null) {highlighterAbstract = enAbstract;}pubEn.setEnAbstract(highlighterAbstract);pubEn.setPublishYear(Integer.parseInt(document.get("publish_year")));pubs.add(pubEn);}// 关闭indexReader.close();indexSearcher.close();for (Publication pub : pubs) {System.out.println("pub'id is : " + pub.getId());System.out.println("pub'publish year is : " + pub.getPublishYear());System.out.println("pub'title is : " + pub.getEnTitle());System.out.println("pub'abstract is : " + pub.getEnAbstract());}}}
3、结果
写道
加载扩展词典:ext.dic
加载扩展停止词典:stopword.dic
加载扩展停止词典:stopword_chinese.dic
搜索到的结果总数量为:2
score is : 0.30121902
score is : 0.24961227
pub'id is : 68545
pub'publish year is : 2009
pub'title is : 激光破坏<font color='red'>金刚石薄膜</font>研究
pub'abstract is : 分析了不同激光工作参数对<font color='red'>金刚石薄膜</font>的激光损伤机理,认为石墨化导致晶格失稳是<font color='red'>金刚石薄膜</font>激光损伤的主要原因。<font color='red'>金刚石薄膜</font>石墨化有两种方式:垂直表面向体层方向石墨化和平行表面按分层的方式逐层石墨化。
pub'id is : 123456
pub'publish year is : 2010
pub'title is : <font color='red'>金刚石薄膜</font>抗激光破坏研究
pub'abstract is : 介绍了金刚石优异的光学和力学特性,对<font color='red'>金刚石薄膜</font>在从紫外到红外波段以及不同脉宽激光参数下的激光损伤行为和损伤阈值进行了评述。
搜索到的结果总数量为:2
score is : 0.48305953
score is : 0.34981734
pub'id is : 123456
pub'publish year is : 2010
pub'title is : Laser induced damage for <font color='red'>diamond</font> <font color='red'>films</font>
pub'abstract is : The outstanding optical and mechanical properties of <font color='red'>diamond</font> <font color='red'>films</font> are summarized.
pub'id is : 68545
pub'publish year is : 2009
pub'title is : Laser destruction of the <font color='red'>diamond</font> thin <font color='red'>films</font>
pub'abstract is : that lead to graphitized damage on the surface of <font color='red'>diamond</font> <font color='red'>films</font> under long and short laser pulses
?