请大家推荐一下判断两篇文章相似度的方法。我尝试用 Lucene 的模糊搜索,可以很好地判断五六个字的标题是否相似,但用于文章内容就不行了。有什么框架或方法可以实现?
我需要从各个网站采集一些内容,并判断它们是否重复。已经确定仅凭标题和其他信息达不到很好的效果,只能通过文章内容来判断;但内容也可能有些字被修改过,所以并不完全一样,可能多几个字,也可能少几个字。通过什么方法或者什么框架可以判断两篇文章的相似度?
[解决办法]
或许可以用经典的最长公共子序列(LCS,Longest Common Subsequence)算法:它可以找出两个字符串中最长的公共子序列(字符的先后顺序一致,但不要求连续),其长度可以用来衡量两段文本的相似程度。
- Java code
/*
 * @author talent_marquis<甜菜侯爵>
 * Email: talent_marquis@163.com
 * Copyright (C) 2007 talent_marquis<甜菜侯爵>
 * All rights reserved.
 *
 * Originally declared in package ustc.mse.algorithms.dynamicProgramming; the
 * declaration is left out here so the snippet compiles as a single standalone file.
 */

/**
 * LCS, Longest-Common-Subsequence.
 *
 * <p>Computes the longest common subsequence of two strings with the classic
 * dynamic-programming table. The comparison works on {@code char} values, and the
 * common characters must appear in the same order in both strings but do not have
 * to be contiguous.
 *
 * <p>Both tables are filled once in the constructor and have
 * {@code (str1.length() + 1) x (str2.length() + 1)} cells, so memory use grows with
 * the product of the two input lengths.
 */
public class LCS {

    /** The neighbouring cell a table entry was derived from; followed when backtracking. */
    public enum DIRECTION { TOP, TOP_LEFT, LEFT }

    private final char[] first;
    private final char[] second;
    private final int[][] lcsTable;
    private final DIRECTION[][] lcsAssistTable;
    private final int lcsLength;

    /** Cached result of the backtracking step; {@code null} until first requested. */
    private String lcsString;

    /**
     * Creates the solver and fills the length and direction tables for the two inputs.
     *
     * @param str1 first string (rows of the table)
     * @param str2 second string (columns of the table)
     * @throws NullPointerException if either argument is {@code null}
     */
    public LCS(String str1, String str2) {
        if (str1 == null || str2 == null) {
            throw new NullPointerException("str1 and str2 must not be null");
        }
        first = str1.toCharArray();
        second = str2.toCharArray();
        // Row 0 and column 0 stand for the empty prefix; Java already zero-fills them.
        lcsTable = new int[first.length + 1][second.length + 1];
        lcsAssistTable = new DIRECTION[first.length + 1][second.length + 1];
        lcsLength = fillTables();
    }

    /** Small demo: prints the length and the text of the LCS of two sample sentences. */
    public static void main(String[] args) {
        String a = "我抄我抄我抄抄抄:明月几时有,把酒问青天,不知天上宫阙,今夕是何年";
        String b = "苏轼曾经写过“明月几时有,把酒问青天”的千古名句";
        LCS lcs = new LCS(a, b);
        println("最大相似子字符串长度是:" + lcs.getLCSLength());
        println("最大相似子字符串为:" + lcs.getLCS());
    }

    /**
     * Returns the length of the longest common subsequence.
     *
     * @return number of {@code char} values in the LCS; 0 if either input is empty
     */
    public int getLCSLength() {
        return lcsLength;
    }

    /**
     * Fills {@code lcsTable} and {@code lcsAssistTable} bottom-up.
     *
     * <p>Cell {@code [i + 1][j + 1]} holds the LCS length of the first {@code i + 1}
     * chars of {@code first} and the first {@code j + 1} chars of {@code second}.
     * When the chars differ and both neighbours hold the same length, TOP wins.
     *
     * @return the LCS length of the two complete inputs
     */
    private int fillTables() {
        for (int i = 0; i < first.length; i++) {
            for (int j = 0; j < second.length; j++) {
                if (first[i] == second[j]) {
                    lcsTable[i + 1][j + 1] = lcsTable[i][j] + 1;
                    lcsAssistTable[i + 1][j + 1] = DIRECTION.TOP_LEFT;
                } else if (lcsTable[i][j + 1] >= lcsTable[i + 1][j]) {
                    lcsTable[i + 1][j + 1] = lcsTable[i][j + 1];
                    lcsAssistTable[i + 1][j + 1] = DIRECTION.TOP;
                } else {
                    lcsTable[i + 1][j + 1] = lcsTable[i + 1][j];
                    lcsAssistTable[i + 1][j + 1] = DIRECTION.LEFT;
                }
            }
        }
        return lcsTable[first.length][second.length];
    }

    /**
     * Reconstructs the LCS text by walking the direction table back from the
     * bottom-right cell. The result is available through {@link #getLCS()}.
     * Calling this method repeatedly always produces the same result.
     */
    public void runLCS() {
        lcsString = backtrack();
    }

    /**
     * Walks the direction table iteratively, so the stack depth does not grow with
     * the input length.
     *
     * @return the LCS text, possibly empty
     */
    private String backtrack() {
        // Every TOP_LEFT step contributes one char, and the walk visits them last to
        // first, so the output array is filled from its end.
        char[] out = new char[lcsLength];
        int next = lcsLength;
        int i = first.length;
        int j = second.length;
        while (i > 0 && j > 0) {
            DIRECTION step = lcsAssistTable[i][j];
            if (step == DIRECTION.TOP_LEFT) {
                out[--next] = first[i - 1];
                i--;
                j--;
            } else if (step == DIRECTION.TOP) {
                i--;
            } else {
                j--;
            }
        }
        return new String(out);
    }

    /**
     * Renders the direction table as text: one line per row, each filled cell as an
     * arrow followed by a space (left arrow for LEFT, up arrow for TOP, up-left
     * arrow for TOP_LEFT). Cells in row 0 and column 0 carry no direction and
     * produce no output.
     *
     * @return the rendered table, each row terminated by a newline
     */
    public String getLCSAssistMatrixString() {
        StringBuilder text = new StringBuilder();
        for (DIRECTION[] row : lcsAssistTable) {
            for (DIRECTION element : row) {
                // Arrows are written as Unicode escapes so the source file's
                // encoding cannot mangle them.
                if (element == DIRECTION.LEFT) {
                    text.append("\u2190 ");
                } else if (element == DIRECTION.TOP) {
                    text.append("\u2191 ");
                } else if (element == DIRECTION.TOP_LEFT) {
                    text.append("\u2196 ");
                }
            }
            text.append("\n");
        }
        return text.toString();
    }

    /**
     * Renders the length table as text: one line per row, each value followed by a
     * space.
     *
     * @return the rendered table, each row terminated by a newline
     */
    public String getLCSMatrixString() {
        StringBuilder text = new StringBuilder();
        for (int[] row : lcsTable) {
            for (int element : row) {
                text.append(element).append(" ");
            }
            text.append("\n");
        }
        return text.toString();
    }

    /** Prints {@code o} to standard output without a line break. */
    public static void print(Object o) {
        System.out.print(o);
    }

    /** Prints {@code o} to standard output followed by a line break. */
    public static void println(Object o) {
        System.out.println(o);
    }

    /**
     * Returns the text of the longest common subsequence, reconstructing it on the
     * first call if {@link #runLCS()} has not been called yet.
     *
     * @return the LCS text; never {@code null}, empty when nothing is shared
     */
    public String getLCS() {
        if (lcsString == null) {
            runLCS();
        }
        return lcsString;
    }

    /**
     * @return the chars of the first string; this is the internal array, not a copy,
     *         so treat it as read-only
     */
    public char[] getFirstCharArray() {
        return first;
    }

    /**
     * @return the chars of the second string; this is the internal array, not a copy,
     *         so treat it as read-only
     */
    public char[] getSecondCharArray() {
        return second;
    }

    /**
     * @return the direction table; this is the internal array, not a copy, so treat
     *         it as read-only
     */
    public DIRECTION[][] getLcsAssistTable() {
        return lcsAssistTable;
    }

    /**
     * @return the length table; this is the internal array, not a copy, so treat it
     *         as read-only
     */
    public int[][] getLcsTable() {
        return lcsTable;
    }
}