读书人

大家推荐一下判断两篇文章相似度方法?

发布时间: 2012-01-15 22:57:48 作者: rapoo

请大家推荐一下判断两篇文章相似度的方法。我尝试用Lucene的模糊搜索,可以很好地判断5、6个字的标题,但对文章内容就不行了。有什么框架或方法可以实现?
我需要从各个网站采集一些内容,并判断它们是否重复。可以确定的是,只用标题和其他信息达不到很好的效果,只能通过文章内容来判断;但内容也可能有些字被修改过,所以不会完全一样,可能多几个字,也可能少几个字。通过什么方法或者什么框架可以判断两篇文章的相似度?

[解决办法]
用动态规划中求最长公共子序列(LCS, Longest Common Subsequence)的方法或许可以,它可以找到两个字符串的最长公共子序列(字符顺序一致,但不要求连续),其长度可以用来衡量两篇文章的相似程度

Java code
/* * @author talent_marquis<甜菜侯爵> * Email: talent_marquis@163.com * Copyright (C) 2007 talent_marquis<甜菜侯爵> * All rights reserved. */package ustc.mse.algorithms.dynamicProgramming;/* * LCS, Longest-Common-Subsequence */public class LCS{        public enum DIRECTION{ TOP, TOP_LEFT, LEFT };    private char[] first;    private char[] second;    private int[][] lcsTable;    private DIRECTION[][] lcsAssistTable;    private int lcsLength;    private String lcs_str, lcsMatrix_str, lcsAssistMatrix_str;    private StringBuffer str_buffer;        public LCS( String str1, String str2 )    {        first = str1.toCharArray();        second = str2.toCharArray();        lcsTable = new int[ first.length + 1 ][ second.length + 1 ];        lcsAssistTable = new DIRECTION[ first.length + 1 ][ second.length + 1];        lcs_str = null;        str_buffer = new StringBuffer();    }            public static void main(String[] args)    {        String a = "我抄我抄我抄抄抄:明月几时有,把酒问青天,不知天上宫阙,今夕是何年";        String b = "苏轼曾经写过“明月几时有,把酒问青天”的千古名句";                LCS lcs = new LCS( a, b );                        lcs.getLCSLength();        lcs.runLCS();        println( "最大相似子字符串长度是:" + lcs.getLCSLength() );        println( "最大相似子字符串为:" + lcs.getLCS() );    }        public int getLCSLength()    {        lcsLength = getLCSLength( first, second );        return lcsLength;    }        private int getLCSLength( char[] one, char[] two )    {        lcsTable = new int[ one.length + 1 ][ two.length + 1 ];        lcsAssistTable = new DIRECTION[ one.length + 1 ][ two.length + 1];                for ( int i = 0; i < one.length ; i++ )        {            lcsTable[ i ][ 0 ] = 0;        }                for ( int j = 0; j < two.length - 1; j++ )        {            lcsTable[ 0 ][ j ] = 0;        }                for ( int i = 0; i < one.length; i++ )        {            for ( int j = 0; j < two.length; j++ )            {                if ( one[ i ] == two[ j ] )                {                    lcsTable[ i + 1 
][ j + 1 ] = lcsTable[ i ][ j ] + 1;                    lcsAssistTable[ i + 1 ][ j + 1 ] = DIRECTION.TOP_LEFT;                    }                else if ( lcsTable[ i ][ j + 1 ] >= lcsTable[ i + 1 ][ j ] )                {                    lcsTable[ i + 1 ][ j + 1 ] = lcsTable[ i ][ j + 1 ];                    lcsAssistTable[ i + 1 ][ j + 1 ] = DIRECTION.TOP;                }                else                {                    lcsTable[ i + 1 ][ j + 1 ] = lcsTable[ i + 1 ][ j ];                    lcsAssistTable[ i + 1 ][ j + 1 ] = DIRECTION.LEFT;                }            }        }                lcsLength = lcsTable[ one.length ][ two.length ];                return lcsLength;    }        public void runLCS()    {        runLCS( lcsAssistTable, first, first.length, second.length );        lcs_str = str_buffer.toString();    }        private void runLCS( DIRECTION[][] lcsAssistTable, char[] one, int oneLength, int twoLength )    {        if( oneLength == 0 || twoLength == 0 )        {            return;        }                            int i = oneLength ;        int j = twoLength ;                if( lcsAssistTable[ i ][ j ] == DIRECTION.TOP_LEFT )        {            runLCS( lcsAssistTable, one, i - 1, j -1 );            str_buffer.append( one[ i - 1 ] );        }        else if ( lcsAssistTable[ i ][ j ] == DIRECTION.TOP )        {            runLCS( lcsAssistTable, one, i - 1, j );        }        else        {            runLCS( lcsAssistTable, one, i, j -1 );        }            }        public String getLCSAssistMatrixString()    {        str_buffer = new StringBuffer();        for( DIRECTION[] row: lcsAssistTable)        {            for( DIRECTION element : row )            {                if( element == DIRECTION.LEFT )                {                    str_buffer.append( "?? " );                }                else if (element == DIRECTION.TOP )                {                    str_buffer.append( "?? 
" );                }                else if (element == DIRECTION.TOP_LEFT)                {                    str_buffer.append( "?I " );                }                else                {                    //str_buffer.append( "\t" );                }            }            str_buffer.append( "\n" );        }        lcsAssistMatrix_str = str_buffer.toString();                return lcsAssistMatrix_str;    }        public String getLCSMatrixString()    {        str_buffer = new StringBuffer();        for( int[] row: lcsTable)        {            for( int element : row )            {                str_buffer.append( element + " " );            }            str_buffer.append( "\n" );        }        lcsMatrix_str = str_buffer.toString();        return lcsMatrix_str;    }        public static void print( Object o )    {        System.out.print( o );    }        public static void println( Object o )    {        System.out.println( o );    }    public String getLCS()    {        return lcs_str;    }    /**     * @return first     */    public char[] getFirstCharArray()    {        return first;    }        /**     * @return second     */    public char[] getSecondCharArray()    {        return second;    }    /**     * @return lcsAssistTable     */    public DIRECTION[][] getLcsAssistTable()    {        return lcsAssistTable;    }    /**     * @return lcsTable     */    public int[][] getLcsTable()    {        return lcsTable;    }} 

读书人网 >J2EE开发

热点推荐