使用余弦相似性原理计算文本的相似度
原理参考:http://www.ruanyifeng.com/blog/2013/03/cosine_similarity.html
/**
 * Text similarity demo based on cosine similarity of term-frequency vectors.
 */
package com.text;

import java.io.IOException;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;

import org.apache.commons.collections.MapUtils;
import org.apache.commons.lang3.tuple.MutablePair;
import org.apache.commons.lang3.tuple.Pair;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

/**
 * Computes the cosine similarity of two texts.
 *
 * <p>Each text is split into terms with {@link IKSegmenter}, the terms are counted,
 * the two count maps are merged into one map of (count in text 1, count in text 2)
 * pairs, and the cosine of the angle between the two count vectors is computed.
 *
 * @author Riching
 *
 * @date 2013-8-10
 */
public class IKMainTest {

    /**
     * Prints the cosine similarity of two hard-coded sample sentences.
     *
     * @param args unused
     * @throws IOException if the segmenter fails while reading a sentence
     */
    public static void main(String[] args) throws IOException {
        String str1 = "我喜欢看电视,不喜欢看电影。";
        String str2 = "我不喜欢看电视,也不喜欢看电影。";
        Map<String, Integer> tf1 = getTF(str1);
        Map<String, Integer> tf2 = getTF(str2);
        Map<String, MutablePair<Integer, Integer>> tfs = mergeTermFrequencies(tf1, tf2);
        double d = caclIDF(tfs);
        System.out.println(d);
    }

    /**
     * Merges two term-frequency maps into a single map of paired counts.
     *
     * @param tf1 term counts of the first text
     * @param tf2 term counts of the second text
     * @return for every term occurring in either map, a pair whose left value is
     *         the count in {@code tf1} and whose right value is the count in
     *         {@code tf2}; a term missing from one map gets count 0 on that side
     */
    private static Map<String, MutablePair<Integer, Integer>> mergeTermFrequencies(
            Map<String, Integer> tf1, Map<String, Integer> tf2) {
        Map<String, MutablePair<Integer, Integer>> merged =
                new HashMap<String, MutablePair<Integer, Integer>>();
        for (Map.Entry<String, Integer> entry : tf1.entrySet()) {
            merged.put(entry.getKey(), new MutablePair<Integer, Integer>(entry.getValue(), 0));
        }
        for (Map.Entry<String, Integer> entry : tf2.entrySet()) {
            MutablePair<Integer, Integer> pair = merged.get(entry.getKey());
            if (null == pair) {
                // Term occurs only in the second text: it must still be stored,
                // otherwise it would not contribute to the second vector's norm.
                merged.put(entry.getKey(), new MutablePair<Integer, Integer>(0, entry.getValue()));
            } else {
                pair.setRight(entry.getValue());
            }
        }
        return merged;
    }

    /**
     * Counts how often each segmented term occurs in the given text.
     *
     * <p>The segmenter is created with its second constructor argument set to
     * {@code true} (presumably IK Analyzer's "smart" segmentation mode; confirm
     * against the library version in use).
     *
     * @param str the text to segment; {@code null} is treated as empty text
     * @return a map from term text to its number of occurrences, empty if no
     *         terms were produced
     * @throws IOException if the segmenter fails while reading the text
     */
    public static Map<String, Integer> getTF(String str) throws IOException {
        Map<String, Integer> map = new HashMap<String, Integer>();
        if (null == str) {
            return map;
        }
        IKSegmenter ikSegmenter = new IKSegmenter(new StringReader(str), true);
        Lexeme lexeme = null;
        while ((lexeme = ikSegmenter.next()) != null) {
            String key = lexeme.getLexemeText();
            Integer count = map.get(key);
            map.put(key, null == count ? 1 : count + 1);
        }
        return map;
    }

    /**
     * Computes the cosine similarity of the two count vectors stored in the map.
     *
     * <p>Despite its name, this method does not compute an inverse document
     * frequency; the name is kept so existing callers continue to work.
     *
     * @param tf map from term to a pair of counts (left: first text, right:
     *           second text)
     * @return the dot product of the two vectors divided by the product of their
     *         Euclidean norms; 0 if the map is null or empty, or if either
     *         vector has zero norm
     */
    public static double caclIDF(Map<String, MutablePair<Integer, Integer>> tf) {
        if (MapUtils.isEmpty(tf)) {
            return 0;
        }
        double dotProduct = 0;
        double sqdoc1 = 0;
        double sqdoc2 = 0;
        for (Pair<Integer, Integer> count : tf.values()) {
            // Widen to double before multiplying so large counts cannot overflow int.
            double left = count.getLeft();
            double right = count.getRight();
            dotProduct += left * right;
            sqdoc1 += left * left;
            sqdoc2 += right * right;
        }
        if (sqdoc1 == 0 || sqdoc2 == 0) {
            // A zero vector has no direction; avoid 0/0 producing NaN.
            return 0;
        }
        return dotProduct / (Math.sqrt(sqdoc1) * Math.sqrt(sqdoc2));
    }
}