Lucene Notes
Reference: http://blog.csdn.net/zzpchina/archive/2006/01/15/579875.aspx
IR (Information Retrieval) is the term used to describe search tools like Lucene.
Lucene in Action, 2nd edition: couldn't get a copy on Amazon,
so I went straight to the source code: http://www.manning.com/hatcher3/LIAsourcecode.zip
The version used here is lucene-core-3.0.2.jar.
-------------------------------------------------------
Evaluating search quality:
D.5.1 Precision and recall

Precision and recall are standard metrics in the information retrieval community for objectively measuring relevance of search results. Precision measures what subset of the documents returned for each query were relevant. For example, if a query has 20 hits and only 1 is relevant, precision is 0.05. If only 1 hit was returned and it was relevant, precision is 1.0. Recall measures what percentage of the relevant documents for that query was actually returned. So if the query listed 8 documents as being relevant, but 6 were in the result set, that's a recall of 0.75.

In a properly configured search application, these two measures are naturally at odds with one another. Let's say, on one extreme, you only show the user the very best (top 1) document matching their query. With such an approach, your precision will typically be high, because the first result has a good chance of being relevant, while your recall would be very low, because if there are many relevant documents for a given query you have only returned one of them. If we increase top 1 to top 10, then suddenly we will be returning many documents for each query. The precision will necessarily drop because most likely you are now allowing some non-relevant documents into the result set. But recall should increase because each query should return a larger subset of its relevant documents.

Still, you'd like the relevant documents to be higher up in the ranking. To measure this, average precision is computed. This measure computes precision at each of the N cutoffs, where N ranges from 1 to a maximum value, and then takes the average. So this measure is higher if your search application generally returns relevant documents earlier in the result set. Mean average precision, or MAP, then measures the mean of average precision across a set of queries. A related measure, mean reciprocal rank or MRR, measures 1/M where M is the first rank that had a relevant document. You want both of these numbers to be as high as possible!
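To make the arithmetic concrete, here is a minimal self-contained sketch (mine, not from the book; all names and document IDs are illustrative). It computes precision, recall, and average precision for a single query, given a ranked result list and the set of relevant document names. Note that average precision has several variants; this one averages precision at each rank holding a relevant document, divided by the total number of relevant documents:

import java.util.*;

public class RelevanceMetrics {
    // Average precision: precision@k at every rank k that holds a
    // relevant document, averaged over the total number of relevant docs.
    static double averagePrecision(List<String> ranked, Set<String> relevant) {
        double sum = 0;
        int hits = 0;
        for (int k = 0; k < ranked.size(); k++) {
            if (relevant.contains(ranked.get(k))) {
                hits++;
                sum += (double) hits / (k + 1); // precision at cutoff k+1
            }
        }
        return relevant.isEmpty() ? 0.0 : sum / relevant.size();
    }

    public static void main(String[] args) {
        List<String> ranked = Arrays.asList("d1", "d2", "d3", "d4"); // result list, best first
        Set<String> relevant = new HashSet<String>(Arrays.asList("d1", "d3", "d9"));

        int found = 0;
        for (String d : ranked) if (relevant.contains(d)) found++;

        double precision = (double) found / ranked.size();   // 2/4 = 0.5
        double recall    = (double) found / relevant.size(); // 2/3 = 0.67
        System.out.println("P=" + precision + " R=" + recall
            + " AP=" + averagePrecision(ranked, relevant));  // AP = (1/1 + 2/3) / 3 = 0.56
    }
}

Lucene's contrib/benchmark quality package automates this measurement over a whole query set; the book's harness: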
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.PrintWriter;

import org.apache.lucene.search.*;
import org.apache.lucene.store.*;
import org.apache.lucene.benchmark.quality.*;
import org.apache.lucene.benchmark.quality.utils.*;
import org.apache.lucene.benchmark.quality.trec.*;

public class PrecisionRecall {

  public static void main(String[] args) throws Throwable {
    File topicsFile = new File("D:/Workspaces/suanfa/sohu3/src/lia/benchmark/topics.txt");
    File qrelsFile = new File("D:/Workspaces/suanfa/sohu3/src/lia/benchmark/qrels.txt");
    Directory dir = FSDirectory.open(new File("indexes/MeetLucene"));
    org.apache.lucene.search.Searcher searcher = new IndexSearcher(dir, true);

    String docNameField = "filename";
    PrintWriter logger = new PrintWriter(System.out, true);

    TrecTopicsReader qReader = new TrecTopicsReader();   // #1 read TREC topics as quality queries
    QualityQuery qqs[] = qReader.readQueries(
        new BufferedReader(new FileReader(topicsFile))); // #1

    Judge judge = new TrecJudge(new BufferedReader(
        new FileReader(qrelsFile)));                     // #2 create relevance judge from the qrels file
    judge.validateData(qqs, logger);                     // #3 verify that queries and judgments match

    QualityQueryParser qqParser =
        new SimpleQQParser("title", "contents");         // #4 parse each topic's title against the contents field

    QualityBenchmark qrun =
        new QualityBenchmark(qqs, qqParser, searcher, docNameField);
    SubmissionReport submitLog = null;
    QualityStats stats[] =
        qrun.execute(judge, submitLog, logger);          // #5 run the benchmark

    QualityStats avg = QualityStats.average(stats);      // #6 average and report precision/recall measures
    avg.log("SUMMARY", 2, logger, "  ");
    dir.close();
  }
}
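For reference, TrecTopicsReader parses TREC-style topic blocks and TrecJudge parses standard qrels lines. The shapes below follow the usual TREC conventions; the IDs, titles, and file names are made up for illustration:

topics.txt, one <top> block per quality query:

<top>
<num> Number: 1
<title> apache
<desc> Description:
Documents about the Apache web server.
<narr> Narrative:
...
</top>

qrels.txt, one judgment per line (topic ID, an ignored iteration column, document name, 0/1 relevance):

1 0 apache1.0.txt 1
1 0 apache1.1.txt 1
2 0 LICENSE.txt 0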
-----------------------------------------------------
The hello-world example is in
LIAsourcecode\lia2e\src\lia\meetlucene\Indexer.java
Simplified a bit:
import java.io.File;
import java.io.FileFilter;
import java.io.FileReader;
import java.io.IOException;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

public class Indexer {

  public static void main(String[] args) throws IOException {
    String indexDir = "D:\\Workspaces\\suanfa\\sohu3\\src\\lia\\meetlucene\\index"; // args[0];
    String dataDir = "D:\\Workspaces\\suanfa\\sohu3\\src\\lia\\meetlucene\\data";   // args[1];

    long start = System.currentTimeMillis();
    Directory dir = FSDirectory.open(new File(indexDir));
    IndexWriter writer = new IndexWriter(dir,              // 3: create the IndexWriter
        new StandardAnalyzer(Version.LUCENE_30),
        true,                                              // overwrite any existing index
        IndexWriter.MaxFieldLength.UNLIMITED);
    int numIndexed = 0;
    try {
      TextFilesFilter filter = new TextFilesFilter();
      File[] files = new File(dataDir).listFiles();
      for (File f : files) {
        if (!f.isDirectory() && !f.isHidden() && f.exists()
            && f.canRead() && filter.accept(f)) {
          System.out.println("Indexing " + f.getCanonicalPath());
          Document doc = new Document();
          doc.add(new Field("contents", new FileReader(f))); // 7: index file content (tokenized, not stored)
          doc.add(new Field("filename", f.getName(),
              Field.Store.YES, Field.Index.NOT_ANALYZED));   // 8: store file name as a single token
          doc.add(new Field("fullpath", f.getCanonicalPath(),
              Field.Store.YES, Field.Index.NOT_ANALYZED));   // 9: store full path as a single token
          writer.addDocument(doc);
          numIndexed = writer.numDocs();
        }
      }
    } finally {
      writer.close();
    }
    long end = System.currentTimeMillis();
    System.out.println("Indexing " + numIndexed + " files took "
        + (end - start) + " milliseconds");
  }

  private static class TextFilesFilter implements FileFilter {
    public boolean accept(File path) {
      return path.getName().toLowerCase().endsWith(".txt"); // 6: index .txt files only
    }
  }
}
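A quick sanity check after indexing: open the index read-only and compare the document count with what Indexer reported. This is a minimal sketch against the same Lucene 3.0 API; the index path is the one hard-coded above:

import java.io.File;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class IndexStats {

  public static void main(String[] args) throws Exception {
    Directory dir = FSDirectory.open(
        new File("D:\\Workspaces\\suanfa\\sohu3\\src\\lia\\meetlucene\\index"));
    IndexReader reader = IndexReader.open(dir, true);       // true = read-only
    System.out.println("documents: " + reader.numDocs());   // should equal numIndexed
    System.out.println("deletions: " + reader.hasDeletions());
    reader.close();
    dir.close();
  }
}

Searching the index then looks like this: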
import java.io.File;
import java.io.IOException;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

public class Searcher {

  public static void main(String[] args)
      throws IllegalArgumentException, IOException, ParseException {
    String indexDir = "D:\\Workspaces\\suanfa\\sohu3\\src\\lia\\meetlucene\\index"; // args[0]; 1: index to search
    String q = "Redistri*"; // args[1]; 2: query string (a wildcard query)

    Directory dir = FSDirectory.open(new File(indexDir));     // 3: open the index
    IndexSearcher is = new IndexSearcher(dir);                // 3
    QueryParser parser = new QueryParser(Version.LUCENE_30,
        "contents", new StandardAnalyzer(Version.LUCENE_30)); // 4: parse the query string
    Query query = parser.parse(q);                            // 4
    long start = System.currentTimeMillis();
    TopDocs hits = is.search(query, 10);                      // 5: search, keeping the top 10 hits
    long end = System.currentTimeMillis();

    System.err.println("Found " + hits.totalHits
        + " document(s) (in " + (end - start)
        + " milliseconds) that matched query '" + q + "':");  // 6
    for (ScoreDoc scoreDoc : hits.scoreDocs) {
      Document doc = is.doc(scoreDoc.doc);                    // 7: retrieve the matching document
      System.out.println(doc.get("fullpath"));                // 8: print its stored full path
    }
    is.close();
  }
}
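QueryParser is only one way to build a Query; a TermQuery can be constructed programmatically and skips parsing and analysis entirely. A minimal sketch follows; the term text must match what StandardAnalyzer produced at index time (i.e. lower-cased), so the wildcard string above will not work here verbatim, and the token "redistributable" is just a guess at what it matches — substitute any token from your data:

import java.io.File;

import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;

public class TermSearch {

  public static void main(String[] args) throws Exception {
    IndexSearcher is = new IndexSearcher(FSDirectory.open(
        new File("D:\\Workspaces\\suanfa\\sohu3\\src\\lia\\meetlucene\\index")), true);
    // Term text is not analyzed: supply the already-lower-cased token.
    TermQuery query = new TermQuery(new Term("contents", "redistributable"));
    TopDocs hits = is.search(query, 10);
    System.out.println("totalHits = " + hits.totalHits);
    is.close();
  }
}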
----------------------------------
Without Lucene: use plain I/O streams to count how many times each word occurs in a text file.
package com.hao;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileReader;
import java.io.FileWriter;
import java.util.Map;
import java.util.TreeMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class UserTreeMap {

  public static void main(String[] args) throws Exception {
    // test();
    Map<String, Integer> map =
        getMapFromFile("D:\\Workspaces\\suanfa\\sohu3\\src\\english.txt");
    for (Map.Entry<String, Integer> entry : map.entrySet()) {
      System.out.println(entry.getKey() + "--" + entry.getValue());
    }
  }

  public static Map<String, Integer> getMapFromFile(String filepath) throws Exception {
    BufferedReader buf = new BufferedReader(new FileReader(filepath));
    StringBuffer sbuf = new StringBuffer();            // buffer for the whole file
    String line = null;
    while ((line = buf.readLine()) != null) {
      sbuf.append(line);                               // append each line
    }
    buf.close();                                       // done reading

    Pattern expression = Pattern.compile("[a-zA-Z]+"); // regex matching one word
    String string1 = sbuf.toString();                  // add .toLowerCase() to fold case
    Matcher matcher = expression.matcher(string1);
    TreeMap<String, Integer> myTreeMap =
        new TreeMap<String, Integer>();                // word -> count, sorted by word
    int n = 0;                                         // total number of words
    while (matcher.find()) {                           // for each word found
      String word = matcher.group();
      n++;
      Integer num = myTreeMap.get(word);               // previous count, null on first occurrence
      myTreeMap.put(word, num == null ? 1 : num + 1);
    }
    return myTreeMap;
  }

  public static void test() throws Exception {
    BufferedReader buf = new BufferedReader(
        new FileReader("D:\\sohu3\\english.txt"));
    System.out.println("Reading english.txt ...");
    StringBuffer sbuf = new StringBuffer();            // buffer for the whole file
    String line = null;
    while ((line = buf.readLine()) != null) {
      sbuf.append(line);
    }
    buf.close();

    Pattern expression = Pattern.compile("[a-zA-Z]+");
    String string1 = sbuf.toString().toLowerCase();    // fold to lower case
    Matcher matcher = expression.matcher(string1);
    TreeMap<String, Integer> myTreeMap = new TreeMap<String, Integer>();
    int n = 0;                                         // total number of words
    while (matcher.find()) {
      String word = matcher.group();
      n++;
      Integer num = myTreeMap.get(word);
      myTreeMap.put(word, num == null ? 1 : num + 1);
    }

    System.out.println("Word count summary:");
    System.out.println("  total words: " + n);
    System.out.println("Details are written to result.txt in the current directory");

    BufferedWriter bufw = new BufferedWriter(new FileWriter("result.txt"));
    for (Map.Entry<String, Integer> entry : myTreeMap.entrySet()) {
      bufw.write(entry.getKey() + ":" + entry.getValue()); // one word:count per line
      bufw.newLine();
    }
    bufw.write("total words in english.txt: " + n);
    bufw.newLine();
    bufw.write("distinct words in english.txt: " + myTreeMap.size());
    bufw.close();
  }
}
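For contrast with the plain-stream approach, a similar per-term statistic can be read straight out of the Lucene index built earlier. This is a sketch against the 3.0 TermEnum API; note that docFreq() counts the number of documents containing a term, not its total number of occurrences, so it is not a drop-in replacement for the word counter above:

import java.io.File;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.store.FSDirectory;

public class LuceneTermCounts {

  public static void main(String[] args) throws Exception {
    IndexReader reader = IndexReader.open(FSDirectory.open(
        new File("D:\\Workspaces\\suanfa\\sohu3\\src\\lia\\meetlucene\\index")), true);
    // terms(t) positions the enumeration at the first term >= t,
    // so iterate with do/while rather than while.
    TermEnum terms = reader.terms(new Term("contents", ""));
    do {
      Term t = terms.term();
      if (t == null || !"contents".equals(t.field())) break; // past the contents field
      System.out.println(t.text() + "--" + terms.docFreq()); // docs containing the term
    } while (terms.next());
    terms.close();
    reader.close();
  }
}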