读书人

利用前缀树处理数据重复出现的有关问题

发布时间: 2012-12-19 14:13:14 作者: rapoo

利用前缀树处理数据重复出现的问题

之前在做网页采集时,需要检查网址是否已经访问过,当时做这个功能时,考虑到只是字符串的搜索功能,第一想到的是用前缀树来处理,并找到了一个叫SimpleStringSearch字符串搜索组件,这个组件的文本搜索实现也是类似前缀树(大概浏览了下源码),关于前缀词的内容有兴趣的可以自己上网搜索下。下面看测试代码,例子是用jsoup抓取网页的链接,并把链接记录下来,如果链接重复,则不记录。

import?java.io.StringReader;
import?java.util.LinkedList;
import?java.util.List;
import?net.sourceforge.sstringsearch.StringSearchFactory;
import?net.sourceforge.sstringsearch.search.Searcher;
import?net.sourceforge.sstringsearch.search.SearchingHitListener;
import?org.jsoup.Jsoup;
import?org.jsoup.nodes.Document;
import?org.jsoup.nodes.Element;
import?org.jsoup.select.Elements;

public?class?WebSearch?{
????public?static?void?main(String[]?args)?{
????????try?{
????????????String?url?=?"http://www.163.com/";
????????????Document?doc?=?Jsoup.connect(url).get();
????????????Elements?links?=?doc.select("a[href]");
????????????
????????????long?start1?=?System.currentTimeMillis();
????????????Searcher?s?=?StringSearchFactory.getSearcher();
????????????MyExampleHitListener?listener?=?new?MyExampleHitListener();
????????????s.addSearchTerm(url,?listener);
????????????for?(Element?link?:?links)?{
????????????????String?l?=?link.attr("abs:href");
????????????????s.search(new?StringReader(l));
????????????????if?(listener.getCount()?==?0)?{
????????????????????s.addSearchTerm(l,?listener);
????????????????}
????????????}
????????????start1?=?System.currentTimeMillis()?-?start1;
????????????
????????????long?start2?=?System.currentTimeMillis();
????????????List<String>?list?=?new?LinkedList<String>();
????????????for?(Element?link?:?links)?{
????????????????String?l?=?link.attr("abs:href");
????????????????boolean?b?=?list.contains(l);
????????????????if(b?==?false){
????????????????????list.add(l);
????????????????}
????????????}
????????????start2?=?System.currentTimeMillis()?-?start2;
????????????
????????????long?start3?=?System.currentTimeMillis();
????????????StringBuffer?sff?=?new?StringBuffer();
????????????for?(Element?link?:?links)?{
????????????????String?l?=?link.attr("abs:href");
????????????????int?i?=?sff.toString().indexOf(l);
????????????????if(i?>?-1){
????????????????????sff.append(l);
????????????????}
????????????}
????????????start3?=?System.currentTimeMillis()?-?start3;
????????????System.out.println("start1:"?+?start1?+?"??start2:"?+?start2?+?"??start3:"?+?start3);
????????}?catch?(Exception?e)?{
????????????e.printStackTrace();
????????}
????}
}

class?MyExampleHitListener?implements?SearchingHitListener?{
????private?int?count;
????public?MyExampleHitListener()?{
????????count?=?0;
????}
????public?boolean?foundAt(long?startIndex,?long?endIndex,?String?term,
????????????Object利用前缀树处理数据重复出现的有关问题?callbackArgs)?{
????????count++;
????????return?true;
????}
????public?int?getCount()?{
????????if(count?>?0?){
????????????count?=?0;
????????????return?1;
????????}
????????return?0;
????}
}


在我本机运行3次的输出结果:
start1:201? start2:50? start3:59
start1:172? start2:40? start3:50
start1:160? start2:40? start3:40

运行结果一出来,发现用SimpleStringSearch的方法,效率最差,忘记了SimpleStringSearch只适合大量的搜索条件下的查找,而且它需要先把查询条件建成树,在建树时比较耗时间。最后决定用链表来实现该功能。

SimpleStringSearch在有大量搜索条件的情况下进行文本查找和过滤是比较不错的,下面贴两个例子出来给大家看下。
文本查找例子

import?java.io.IOException;
import?java.io.StringReader;

import?net.sourceforge.sstringsearch.StringSearchFactory;
import?net.sourceforge.sstringsearch.search.Searcher;
import?net.sourceforge.sstringsearch.search.SearchingHitListener;

public?class?ExampleSearch?{
????public?static?void?main(String[]?args)?{
????????try?{
????????????Searcher?s?=?StringSearchFactory.getSearcher();
????????????ExampleHitListener1?listener?=?new?ExampleHitListener1();
????????????s.addSearchTerm("姚明",?listener);
????????????s.addSearchTerm("姚明离队",?listener);
????????????s.addSearchTerm("火箭队",?listener);
????????????s.search(new?StringReader(
????????????????????"海耶斯意外受伤,看似将加速姚明离队,毕竟现在火箭队在5号位的用人上已经是捉襟见肘,他们急需一位防守能力出众的纯正中锋。不过,现在似乎并不是交易姚明的最佳时机,莫雷没有必要那么早的亮出底牌。美国《篮圈世界》专栏作家阿莱克斯-肯尼迪今日撰文“火箭正在等待重磅交易”,透露火箭不会立刻着急送走姚明,他们期待一笔更一鸣惊人的交易,预计要等到2月份左右,才会用姚明换取一位全明星。"));
????????????System.out.println("Count1?is:?"?+?listener.getCount());
????????}?catch?(IOException?ex)?{
????????????ex.printStackTrace();
????????}
????}
}
class?ExampleHitListener1?implements?SearchingHitListener?{
????private?int?count;
????public?ExampleHitListener1()?{
????????count?=?0;
????}
????public?boolean?foundAt(long?startIndex,?long?endIndex,?String?term,
????????????Object利用前缀树处理数据重复出现的有关问题?callbackArgs)?{
????????count++;
????????return?true;
????}
????public?int?getCount()?{
????????return?count;
????}
}

运行结果:
Count1 is: 6

文本过滤例子

import?java.io.IOException;
import?java.io.StringReader;
import?java.io.StringWriter;

import?net.sourceforge.sstringsearch.StringSearchFactory;
import?net.sourceforge.sstringsearch.replace.Replacer;
import?net.sourceforge.sstringsearch.replace.ReplacingHitListener;

public?class?ExampleSearchAndReplace?{
????public?static?void?main(String[]?args)?{
????????Replacer?r?=?StringSearchFactory.getReplacer();
????????StringWriter?result?=?new?StringWriter();
????????ExampleReplacingHitListener?listener?=?new?ExampleReplacingHitListener();
????????r.addSearchTerm("防守能力出众",?listener);
????????r.addSearchTerm("全明星",?listener);
????????try?{
????????????r.searchAndReplace(
????????????????????new?StringReader(
????????????????????????????"海耶斯意外受伤,看似将加速姚明离队,毕竟现在火箭队在5号位的用人上已经是捉襟见肘,他们急需一位防守能力出众的纯正中锋。不过,现在似乎并不是交易姚明的最佳时机,莫雷没有必要那么早的亮出底牌。美国《篮圈世界》专栏作家阿莱克斯-肯尼迪今日撰文“火箭正在等待重磅交易”,透露火箭不会立刻着急送走姚明,他们期待一笔更一鸣惊人的交易,预计要等到2月份左右,才会用姚明换取一位全明星。"),
????????????????????result);
????????????System.out.println(result.toString());
????????}?catch?(IOException?ex)?{
????????????ex.printStackTrace();
????????}
????}
}

class?ExampleReplacingHitListener?implements?ReplacingHitListener?{
????public?String?replace(String?stringToReplace,?Object利用前缀树处理数据重复出现的有关问题?callbackArguments)?{
????????char[]?ch?=?new?char[stringToReplace.length()];
?????????for?(int?i?=?0;?i?<?ch.length;?i++)?{
?????????ch[i]?=?'*';
?????????}
????????return?new?String(ch);
????}
}

运行结果:
海耶斯意外受伤,看似将加速姚明离队,毕竟现在火箭队在5号位的用人上已经是捉襟见肘,他们急需一位******的纯正中锋。不过,现在似乎并不是交易姚明的最佳时机,莫雷没有必要那么早的亮出底牌。美国《篮圈世界》专栏作家阿莱克斯-肯尼迪今日撰文“火箭正在等待重磅交易”,透露火箭不会立刻着急送走姚明,他们期待一笔更一鸣惊人的交易,预计要等到2月份左右,才会用姚明换取一位***。

读书人网 >编程

热点推荐