读书人

nutch源码阅读(8)-Generator

发布时间: 2013-06-26 14:29:32 作者: rapoo

nutch源码阅读(8)-Generator

接着看一下 Generator 中的最后一个 Job:CrawlDbUpdater。

?

?

  /**   * Update the CrawlDB so that the next generate won't include the same URLs.   */  public static class CrawlDbUpdater extends MapReduceBase implements      Mapper<Text,CrawlDatum,Text,CrawlDatum>, Reducer<Text,CrawlDatum,Text,CrawlDatum> {    long generateTime;    public void configure(JobConf job) {      generateTime = job.getLong(Nutch.GENERATE_TIME_KEY, 0L);    }    public void map(Text key, CrawlDatum value, OutputCollector<Text,CrawlDatum> output,        Reporter reporter) throws IOException {      output.collect(key, value);    }    private CrawlDatum orig = new CrawlDatum();    private LongWritable genTime = new LongWritable(0L);    public void reduce(Text key, Iterator<CrawlDatum> values,        OutputCollector<Text,CrawlDatum> output, Reporter reporter) throws IOException {      genTime.set(0L);      //遍历相同url的crawlDatum      while (values.hasNext()) {        CrawlDatum val = values.next();        //判断是否生成过        if (val.getMetaData().containsKey(Nutch.WRITABLE_GENERATE_TIME_KEY)) {          LongWritable gt = (LongWritable) val.getMetaData().get(              Nutch.WRITABLE_GENERATE_TIME_KEY);          genTime.set(gt.get());          if (genTime.get() != generateTime) {            orig.set(val);            genTime.set(0L);            continue;          }        } else {          orig.set(val);        }      }      if (genTime.get() != 0L) {        //设置新的生成时间        orig.getMetaData().put(Nutch.WRITABLE_GENERATE_TIME_KEY, genTime);      }      output.collect(key, orig);    }  }

?

读书人网 >开源软件

热点推荐