elasticsearch 实现联想输入搜索
import java.io.Serializable;
import java.math.BigDecimal;

/**
 * Value object for one ICD (disease classification) entry that is indexed
 * into Elasticsearch.
 *
 * @author donlianli@126.com
 */
public class ICD implements Serializable {
    private static final long serialVersionUID = 6934803011248581109L;

    /** Flag text meaning "yes"; any other value, including null, is treated as false. */
    private static final String YES = "是";

    /** Disease ID. */
    private int id;
    /** Disease code. */
    private String code;
    /** Disease name. */
    private String diseaseName;
    /** Disease name and pinyin joined with a comma. */
    private String mergeName;
    /** Pinyin abbreviation of the disease name. */
    private String pinyin;
    /** Whether the disease is a malignant tumor. */
    private boolean isTherioma;
    /** Whether the disease is an inpatient special disease category. */
    private boolean isSpecialDisease;

    /**
     * Builds an entry from raw column values.
     *
     * @param id               disease ID; must not be null
     * @param diseaseName      disease name, may be null
     * @param code             disease code, may be null
     * @param pinyin           pinyin abbreviation, may be null
     * @param isTherioma       "是" marks a malignant tumor; anything else is false
     * @param isSpecialDisease "是" marks a special disease; anything else is false
     * @throws NullPointerException if {@code id} is null
     */
    public ICD(BigDecimal id, String diseaseName, String code, String pinyin,
            String isTherioma, String isSpecialDisease) {
        if (id == null) {
            throw new NullPointerException("id must not be null");
        }
        this.id = id.intValue();
        this.diseaseName = diseaseName;
        this.code = code;
        this.pinyin = pinyin;
        this.isTherioma = YES.equals(isTherioma);
        this.isSpecialDisease = YES.equals(isSpecialDisease);
        this.mergeName = merge(diseaseName, pinyin);
    }

    /**
     * Joins name and pinyin with a comma, skipping null parts so the literal
     * text "null" never ends up in the merged (and indexed) value.
     */
    private static String merge(String diseaseName, String pinyin) {
        if (diseaseName == null) {
            return pinyin == null ? "" : pinyin;
        }
        if (pinyin == null) {
            return diseaseName;
        }
        return diseaseName + "," + pinyin;
    }

    // Getters and setters are omitted here, as in the original listing.
}
?
?
第二步,将数据存储到elasticsearch里面。索引(index)名称取为code,type名称取为icd。ICD大概有2万条数据,我使用默认的bulkIndex,存到es大概用了3秒。
我这里是把数据从oracle导入到elasticsearch。
import java.math.BigDecimal;import java.sql.Connection;import java.sql.PreparedStatement;import java.sql.ResultSet;import java.util.ArrayList;import java.util.List;import org.elasticsearch.action.bulk.BulkRequestBuilder;import org.elasticsearch.action.bulk.BulkResponse;import org.elasticsearch.action.index.IndexRequestBuilder;import org.elasticsearch.client.Client;import com.donlianli.es.ESUtils;import com.donlianli.es.db.DatabaseUtils;public class ICDManager {public static void main(String[] argvs){ICDManager manager = new ICDManager();manager.indexDataDirect();}/** * 直接将数据初始化到ES中 * 不创建mapping */private void indexDataDirect() {List<ICD> icdList = getIcdListFromDB();System.out.println(" get icd from db finish,size:" + icdList.size());bulkIndex(icdList);}private void bulkIndex(List<ICD> icdList) {Client client = ESUtils.getCodeClient();BulkRequestBuilder bulkRequest = client.prepareBulk();long b = System.currentTimeMillis();for(int i=0,l=icdList.size();i<l;i++){//业务对象ICD icd = icdList.get(i);String json = ESUtils.toJson(icd);IndexRequestBuilder indexRequest = client.prepareIndex("code","icd") .setSource(json).setId(String.valueOf(icd.getId()));//添加到builder中bulkRequest.add(indexRequest);}BulkResponse bulkResponse = bulkRequest.execute().actionGet();if (bulkResponse.hasFailures()) {System.out.println(bulkResponse.buildFailureMessage());}long useTime = System.currentTimeMillis()-b;System.out.println("useTime:" + useTime);}private List<ICD> getIcdListFromDB() {Connection conn = DatabaseUtils.getOracleConnection();String sql = "select * from icd_11";PreparedStatement st = null;ResultSet rs = null;List<ICD> list = new ArrayList<ICD>();try{st = conn.prepareStatement(sql);rs = st.executeQuery();while(rs.next()){BigDecimal id = rs.getBigDecimal("ID");String diseaseName = rs.getString("DISEASE_NAME");String code = rs.getString("CODE");String pinyin = rs.getString("PINYIN");String isTherioma = rs.getString("THERIOMA_FLAG");String isSpecialDisease = 
rs.getString("OTHER_FLAG");list.add(new ICD(id,diseaseName,code,pinyin,isTherioma,isSpecialDisease));}return list;}catch(Exception e){e.printStackTrace();}finally{try{if(rs!= null){rs.close();}if(st!= null){st.close();}conn.close();}catch(Exception e){e.printStackTrace();}}return null;}}?
?
第三步,搜索接口,跑测试用例。
import org.elasticsearch.action.search.SearchResponse;import org.elasticsearch.client.Client;import org.elasticsearch.index.query.MultiMatchQueryBuilder;import org.elasticsearch.index.query.QueryBuilders;import org.elasticsearch.search.SearchHit;import org.elasticsearch.search.SearchHits;import com.donlianli.es.ESUtils;public class PinyinSearchTest {public static void main(String[] args) {Client client = ESUtils.getCodeClient();String keyWord = "高血压";//String keyWord = "老年 高血压";//String keyWord = "gxy";//多个字段匹配MultiMatchQueryBuilder query = QueryBuilders.multiMatchQuery(keyWord, "diseaseName","pinyin");long b = System.currentTimeMillis();SearchResponse response = client.prepareSearch("code").setTypes("icd").setQuery(query).setFrom(0)//前20个.setSize(20).execute().actionGet();long useTime = System.currentTimeMillis()-b;System.out.println("search use time:" + useTime + " ms");SearchHits shs = response.getHits();for (SearchHit hit : shs) {System.out.println("分数:" + hit.getScore()+ ",ID:"+ hit.getId()+ ", 疾病名称:"+ hit.getSource().get("diseaseName")+ ",拼音:" + hit.getSource().get("pinyin"));}client.close();}}?
3.1,关键字:'高血压'
search use time:174 ms分数:2.3859928,ID:6904, 疾病名称:高血压病,拼音:gxyb分数:2.136423,ID:6907, 疾病名称:高血压I期,拼音:gxyyq分数:2.12253,ID:6908, 疾病名称:高血压Ⅱ期,拼音:gxyeq分数:2.12253,ID:6910, 疾病名称:高血压危象,拼音:gxywx分数:2.0906634,ID:6917, 疾病名称:肾性高血压,拼音:sxgxy分数:2.0877438,ID:6909, 疾病名称:高血压Ⅲ期,拼音:gxysq分数:2.0821526,ID:18767, 疾病名称:高原性高血压,拼音:gyxgxy分数:1.9905697,ID:6906, 疾病名称:恶性高血压,拼音:exgxy分数:1.9510978,ID:7260, 疾病名称:高血压脑出血,拼音:gxyncx分数:1.9078629,ID:6923, 疾病名称:肾血管性高血压,拼音:sxgxgxy分数:1.8312198,ID:6914, 疾病名称:高血压性肾病,拼音:gxyxsb分数:1.8193114,ID:7367, 疾病名称:高血压性脑病,拼音:gxyxnb分数:1.8193114,ID:13470, 疾病名称:妊娠引起高血压,拼音:rsyqgxy分数:1.7919972,ID:6905, 疾病名称:临界性高血压,拼音:ljxgxy分数:1.7919972,ID:6912, 疾病名称:高血压性心脏病,拼音:gxyxxzb分数:1.7894946,ID:6928, 疾病名称:继发性高血压,拼音:jfxgxy分数:1.7062025,ID:6913, 疾病名称:高血压性肾衰竭,拼音:gxyxssj分数:1.7062025,ID:13485, 疾病名称:孕产妇高血压,拼音:ycfgxy分数:1.7062025,ID:14534, 疾病名称:新生儿高血压,拼音:xsegxy分数:1.7062025,ID:16181, 疾病名称:应激性高血压,拼音:yjxgxy
3.2,关键字:'老年 高血压'
search use time:144 ms分数:1.1089094,ID:6904, 疾病名称:高血压病,拼音:gxyb分数:0.99291986,ID:6907, 疾病名称:高血压I期,拼音:gxyyq分数:0.9864628,ID:6908, 疾病名称:高血压Ⅱ期,拼音:gxyeq分数:0.9864628,ID:6910, 疾病名称:高血压危象,拼音:gxywx分数:0.9716526,ID:6917, 疾病名称:肾性高血压,拼音:sxgxy分数:0.97029567,ID:6909, 疾病名称:高血压Ⅲ期,拼音:gxysq分数:0.96769714,ID:18767, 疾病名称:高原性高血压,拼音:gyxgxy分数:0.9251333,ID:6906, 疾病名称:恶性高血压,拼音:exgxy分数:0.9067884,ID:7260, 疾病名称:高血压脑出血,拼音:gxyncx分数:0.8866946,ID:6923, 疾病名称:肾血管性高血压,拼音:sxgxgxy分数:0.8510741,ID:6914, 疾病名称:高血压性肾病,拼音:gxyxsb分数:0.8455395,ID:7367, 疾病名称:高血压性脑病,拼音:gxyxnb分数:0.8455395,ID:13470, 疾病名称:妊娠引起高血压,拼音:rsyqgxy分数:0.8328451,ID:6905, 疾病名称:临界性高血压,拼音:ljxgxy分数:0.8328451,ID:6912, 疾病名称:高血压性心脏病,拼音:gxyxxzb分数:0.831682,ID:6928, 疾病名称:继发性高血压,拼音:jfxgxy分数:0.8074301,ID:6820, 疾病名称:老年耳聋,拼音:lnel分数:0.80348647,ID:7612, 疾病名称:老年痣,拼音:lnz分数:0.7929714,ID:6913, 疾病名称:高血压性肾衰竭,拼音:gxyxssj分数:0.7929714,ID:13485, 疾病名称:孕产妇高血压,拼音:ycfgxy
高血压和老年的相关病都出来了。只可惜“老年高血压”没有列入ICD。
3.3拼音:'gxy'
呃?怎么没有出来?
这个问题折腾了我一天。一开始我以为是被es列入了禁用词。后来,找到是因为没有设置analyzer导致,在设置analyzer的过程中竟然还犯了好几个低级错误,导致我非常怀疑设置analyzer是否管用。
这个问题涉及到分词,而分词我还没有好好研究过。总之,在创建索引及mapping的时候,指定一个analyzer就可以解决这个问题。
创建index及mapping的代码如下:
import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;import org.elasticsearch.action.admin.indices.create.CreateIndexResponse;import org.elasticsearch.client.Client;import org.elasticsearch.common.settings.ImmutableSettings;import org.elasticsearch.common.settings.ImmutableSettings.Builder;import org.elasticsearch.common.xcontent.XContentBuilder;import com.donlianli.es.ESUtils;/** * 创建code的mapping * @author donlianli@126.com */public class CodeMappingTest {static final String INDEX_NAME="code";static final String TYPE_NAME="icd";public static void main(String[] argv) throws Exception{Client client = ESUtils.getCodeClient();Builder settings = ImmutableSettings.settingsBuilder() .loadFromSource(getAnalysisSettings());//首先创建索引库CreateIndexResponse indexresponse = client.admin().indices()//这个索引库的名称还必须不包含大写字母.prepareCreate(INDEX_NAME).setSettings(settings)//这里直接添加type的mapping.addMapping(TYPE_NAME, getMapping()).execute().actionGet();System.out.println("success:"+indexresponse.isAcknowledged());}private static String getAnalysisSettings() throws Exception {XContentBuilder mapping = jsonBuilder() .startObject() //主分片数量 .field("number_of_shards",5) .field("number_of_replicas",0) .startObject("analysis") .startObject("filter") //创建分词过滤器 .startObject("pynGram") .field("type","nGram") //从1开始 .field("min_gram",1) .field("max_gram",15) .endObject() .endObject() .startObject("analyzer") //拼音analyszer .startObject("pyAnalyzer") .field("type","custom") .field("tokenizer","standard") .field("filter", new String[]{ "lowercase","pynGram"}) .endObject() .endObject() .endObject() .endObject(); System.out.println(mapping.string());return mapping.string();}/** * mapping 一旦定义,之后就不能修改。 * @return * @throws Exception */private static XContentBuilder getMapping() throws Exception{XContentBuilder mapping = jsonBuilder() .startObject() .startObject("icd") //指定分词器 .field("index_analyzer","pyAnalyzer") .startObject("properties") .startObject("id") .field("type", "long") 
.field("store", "yes") .endObject() .startObject("code") .field("type", "string") .field("store", "yes") .field("index", "analyzed") .endObject() .startObject("diseaseName") .field("type", "string") .field("store", "yes") .field("index", "analyzed") .endObject() .startObject("mergeName") .field("type", "string") .field("store", "yes") .field("index", "analyzed") .endObject() .startObject("pinyin") .field("type", "string") .field("store", "yes") .field("index", "analyzed") .endObject() .startObject("isTherioma") .field("type", "boolean") .field("store", "yes") .endObject() .startObject("isSpecialDisease") .field("type", "boolean") .field("store", "yes") .endObject() .endObject() .endObject() .endObject(); return mapping;}}?(PS:其实还有一种简单的方法,不用创建analyzer,在搜索的时候,使用'*gxy*'进行搜索也可以)
最后,我还把这个检索跟oracle的like进行了比较。结果发现oracle只用20ms就能算出结果,而es却用了将近100ms。可见这种吹捧的nosql,性能不见得比oracle强大啊,但是毋庸置疑的是,功能确实强大了。
?
?
?
对这类话题感兴趣?欢迎发送邮件至donlianli@126.com。关于我:邯郸人,擅长Java、Javascript、Extjs、oracle sql。更多我之前的文章,可以访问我的空间。