读书人

lucene搜索引擎简略应用

发布时间: 2012-08-31 12:55:03 作者: rapoo

lucene搜索引擎简单应用



还用lucene架了个搜索引擎,对pdf进行全文搜索(联合pdfbox)。

核心类是一个Agent,使用开源的庖丁中文分词器

代码:

?

package gov.jsgs.ssgs.service;

import gov.jsgs.ssgs.form.PdfForm;
import gov.jsgs.ssgs.model.Ssgs_pdfModel;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import net.paoding.analysis.analyzer.PaodingAnalyzer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.StaleReaderException;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.LockObtainFailedException;

import common.Logger;

/**
?* 搜索引擎代理,目前设置为在c:\pdf_index目录下保存索引,可以通过setIndexDir()修改.
?*? <p>
?*? <b>注意:必须用单例模式运行,使用init()初始化,destroy()释放资源</b>
?*
?* @author tedeyang
?*
?*/
public class LuceneAgent {

?? ?private static Logger log = Logger.getLogger(LuceneAgent.class);
?? ?static Object lock = new Object();
?? ?Analyzer analyzer = null;

?? ?Directory ramDir = null;

?? ?IndexWriter writer = null;

?? ?String indexDir = "c:/pdf_index";

?? ?IndexReader reader;

?? ?QueryParser parser;

?? ?Searcher searcher;

?? ?public void init() throws CorruptIndexException, LockObtainFailedException,
?? ??? ??? ?IOException {
?? ??? ?log.info("初始化Lucene搜索引擎...");
?? ??? ?log.debug("初始化分词器...");
?? ??? ?analyzer = new PaodingAnalyzer();
?? ??? ?ramDir = FSDirectory.getDirectory(indexDir);
?? ??? ?if (ramDir.fileExists("write.lock")) {
?? ??? ??? ?ramDir.deleteFile("write.lock");
?? ??? ??? ?log.debug("清除引擎文件锁 ...");
?? ??? ?}
?? ??? ?try {
?? ??? ??? ?writer = new IndexWriter(ramDir, analyzer, false);
?? ??? ?} catch (Exception e) {
?? ??? ??? ?writer = new IndexWriter(ramDir, analyzer, true);
?? ??? ?}
?? ??? ?reader = IndexReader.open(ramDir);
?? ??? ?parser = new QueryParser(LucenePDFDocument.CONTENT, analyzer);
?? ??? ?searcher = new IndexSearcher(ramDir);

?? ?}

?? ?public void destroy() {
?? ??? ?log.info("关闭搜索引擎...");
?? ??? ?try {
?? ??? ??? ?reader.close();
?? ??? ?} catch (IOException e) {
?? ??? ??? ?e.printStackTrace();
?? ??? ?}
?? ??? ?try {
?? ??? ??? ?writer.close();
?? ??? ?} catch (CorruptIndexException e) {
?? ??? ??? ?e.printStackTrace();
?? ??? ?} catch (IOException e) {
?? ??? ??? ?e.printStackTrace();
?? ??? ?}
?? ?}

?? ?public String getIndexDir() {
?? ??? ?return indexDir;
?? ?}

?? ?/**
?? ? * @param indexDir
?? ? *??????????? 存放索引文件的目录,默认c:/pdf_index
?? ? */
?? ?public void setIndexDir(String indexDir) {
?? ??? ?this.indexDir = indexDir;
?? ?}

?? ?public void resetIndex() {

?? ?}

?? ?/**
?? ? * 添加索引文档
?? ? *
?? ? * @param doc
?? ? * @throws CorruptIndexException
?? ? * @throws IOException
?? ? */
?? ?public void addPdf(Document doc) throws CorruptIndexException, IOException {
?? ??? ?writer.addDocument(doc, analyzer);
?? ??? ?writer.optimize();
?? ??? ?writer.flush();
?? ?}

?? ?/**
?? ? * 添加pdf文件索引。完成后会关闭pdf的inputstream
?? ? *
?? ? * @param pdf
?? ? * @throws CorruptIndexException
?? ? * @throws IOException
?? ? */
?? ?public void addPdf(Ssgs_pdfModel pdf) throws CorruptIndexException,
?? ??? ??? ?IOException {
?? ??? ?writer.addDocument(LucenePDFDocument.getDocument(pdf), analyzer);
?? ??? ?writer.optimize();
?? ??? ?writer.flush();
?? ?}

?? ?/**
?? ? * 查询
?? ? *
?? ? * @param keyword
?? ? * @return list[PdfForm]
?? ? * @throws ParseException
?? ? * @throws IOException
?? ? */
?? ?public List search(String keyword) throws ParseException, IOException {
?? ??? ?if (keyword == null || keyword.matches("^\\s*$")) {
?? ??? ??? ?return null;
?? ??? ?}
?? ??? ?Hits hits = null;
?? ??? ?synchronized (lock) {
?? ??? ??? ?Query query = parser.parse(keyword).rewrite(reader);
?? ??? ??? ?hits = searcher.search(query);
?? ??? ?}
?? ??? ?List pdfs = new ArrayList(hits.length());
?? ??? ?for (int i = 0; i < hits.length(); i++) {
?? ??? ???? PdfForm pdf = new PdfForm();
?? ??? ??? ?pdf.setFile_name(hits.doc(i).get(LucenePDFDocument.FILE_NAME));
?? ??? ??? ?pdf.setId(Integer.parseInt(hits.doc(i).get(LucenePDFDocument.ID)));
?? ??? ??? ?try {
?? ??? ??? ??? ?pdf.setM_time( hits.doc(i).get( LucenePDFDocument.MODIFIED));
?? ??? ??? ?} catch (Exception e) {
?? ??? ??? ??? ?pdf.setM_time(null);
?? ??? ??? ?}
?? ??? ??? ?pdf.setSummary(hits.doc(i).get(LucenePDFDocument.SUMMARY));
?? ??? ??? ?pdfs.add(pdf);
?? ??? ?}
?? ??? ?return pdfs;
?? ?}

?? ?/**
?? ? * 根据唯一主键删除索引
?? ? *
?? ? * @param id
?? ? * @throws StaleReaderException
?? ? * @throws CorruptIndexException
?? ? * @throws LockObtainFailedException
?? ? * @throws IOException
?? ? */
?? ?public synchronized void delete(String id) throws StaleReaderException,
?? ??? ??? ?CorruptIndexException, LockObtainFailedException, IOException {
?? ??? ?Term term = new Term(LucenePDFDocument.ID, id);
?? ??? ?synchronized (lock) {
?? ??? ??? ?writer.deleteDocuments(term);
?? ??? ??? ?writer.optimize();
?? ??? ??? ?writer.flush();
?? ??? ?}
?? ?}
}

?

?

1 楼 gongji 2010-05-04 有机会得研究研究lucene了

读书人网 >网络基础

热点推荐