Lucene Chinese Analyzers
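A side-by-side comparison of Chinese analyzers commonly used with Lucene: the built-in StandardAnalyzer, CJKAnalyzer, and ChineseAnalyzer, plus the third-party Paoding (庖丁) and IK analyzers. Note that the listing is written against the pre-3.0 Lucene token API (TokenStream.next() returning a Token with termText()), so it requires a Lucene 2.x jar, with the Paoding and IK jars also on the classpath.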
package analyzer;

import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.cn.ChineseAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.mira.lucene.analysis.IK_CAnalyzer;
import org.mira.lucene.analysis.MIK_CAnalyzer;

import com.sohospace.lucene.analysis.xanalyzer.XAnalyzer;
import com.sohospace.lucene.analysis.xanalyzer.XFactory;

// Chinese segmentation uses Paoding's tokenizer -- many thanks to its authors.
public class TestCJKAnalyzer
{
    // "The People's Republic of China was founded in 1949; thus began the great chapter of New China"
    private static String testString1 = "中华人民共和国在1949年建立,从此开始了新中国的伟大篇章";
    // "Bill Gates works in the catering and service industries"
    private static String testString2 = "比尔盖茨从事餐饮业和服务业方面的工作";
    public static void testStandard(String testString) throws Exception
    {
        Analyzer analyzer = new StandardAnalyzer();
        Reader r = new StringReader(testString);
        TokenStream ts = analyzer.tokenStream("", r);
        System.err.println("=====standard analyzer====");
        System.err.println("Analysis method: by default produces single characters, not words");
        Token t;
        while ((t = ts.next()) != null)
        {
            System.out.println(t.termText());
        }
    }
    public static void testCJK(String testString) throws Exception
    {
        Analyzer analyzer = new CJKAnalyzer();
        Reader r = new StringReader(testString);
        TokenStream ts = analyzer.tokenStream("", r);
        System.err.println("=====cjk analyzer====");
        System.err.println("Analysis method: overlapping two-character (bigram) segmentation");
        Token t;
        while ((t = ts.next()) != null)
        {
            System.out.println(t.termText());
        }
    }
    public static void testChinese(String testString) throws Exception
    {
        Analyzer analyzer = new ChineseAnalyzer();
        Reader r = new StringReader(testString);
        TokenStream ts = analyzer.tokenStream("", r);
        System.err.println("=====chinese analyzer====");
        System.err.println("Analysis method: essentially the same as StandardAnalyzer");
        Token t;
        while ((t = ts.next()) != null)
        {
            System.out.println(t.termText());
        }
    }
    public static void testPaoding(String testString) throws Exception
    {
        XAnalyzer analyzer = XFactory.getQueryAnalyzer();
        Reader r = new StringReader(testString);
        TokenStream ts = analyzer.tokenStream("", r);
        System.err.println("=====paoding analyzer====");
        System.err.println("Analysis method: dictionary-based segmentation with stop words removed; falls back to CJKAnalyzer-style splitting when the dictionary has no match");
        Token t;
        while ((t = ts.next()) != null)
        {
            System.out.println(t.termText());
        }
    }
    public static void testJe(String testString) throws Exception
    {
//        Analyzer analyzer = new MIK_CAnalyzer();
        Analyzer analyzer = new IK_CAnalyzer();
        Reader r = new StringReader(testString);
        TokenStream ts = analyzer.tokenStream("", r);
        System.err.println("=====je analyzer====");
        System.err.println("Analysis method: dictionary-based segmentation with bidirectional (forward/backward) matching; details unclear");
        Token t;
        while ((t = ts.next()) != null)
        {
            System.out.println(t.termText());
        }
    }
    public static void main(String[] args) throws Exception
    {
        String testString = testString1; // switch to testString2 for the second sample
        System.out.println(testString);

        testStandard(testString);
        testCJK(testString);
        testPaoding(testString);

//        testChinese(testString);
//        testJe(testString);
    }
}
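For comparison, Lucene 3.1 and later dropped the Token-returning next() loop in favor of the attribute-based API. Below is a minimal sketch of the same "print every token" loop against a modern Lucene, assuming Lucene 5.x or later (where StandardAnalyzer takes no Version argument); the class name PrintTokens and the field name "f" are arbitrary choices for this sketch.

package analyzer;

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class PrintTokens
{
    public static void printTokens(Analyzer analyzer, String text) throws IOException
    {
        TokenStream ts = analyzer.tokenStream("f", text); // analysis chain for the text
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();                  // required before the first incrementToken()
        while (ts.incrementToken())  // advance to the next token, if any
        {
            System.out.println(term.toString());
        }
        ts.end();
        ts.close();
    }

    public static void main(String[] args) throws IOException
    {
        printTokens(new StandardAnalyzer(), "中华人民共和国在1949年建立");
    }
}

For dictionary-based Chinese segmentation on current Lucene, the lucene-analyzers-smartcn module's SmartChineseAnalyzer fills the role that Paoding and IK play in the listing above.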