nutch源码阅读(8)-Generator
接着看下最后一个 Job,即 CrawlDbUpdater:
/** * Update the CrawlDB so that the next generate won't include the same URLs. */ public static class CrawlDbUpdater extends MapReduceBase implements Mapper<Text,CrawlDatum,Text,CrawlDatum>, Reducer<Text,CrawlDatum,Text,CrawlDatum> { long generateTime; public void configure(JobConf job) { generateTime = job.getLong(Nutch.GENERATE_TIME_KEY, 0L); } public void map(Text key, CrawlDatum value, OutputCollector<Text,CrawlDatum> output, Reporter reporter) throws IOException { output.collect(key, value); } private CrawlDatum orig = new CrawlDatum(); private LongWritable genTime = new LongWritable(0L); public void reduce(Text key, Iterator<CrawlDatum> values, OutputCollector<Text,CrawlDatum> output, Reporter reporter) throws IOException { genTime.set(0L); //遍历相同url的crawlDatum while (values.hasNext()) { CrawlDatum val = values.next(); //判断是否生成过 if (val.getMetaData().containsKey(Nutch.WRITABLE_GENERATE_TIME_KEY)) { LongWritable gt = (LongWritable) val.getMetaData().get( Nutch.WRITABLE_GENERATE_TIME_KEY); genTime.set(gt.get()); if (genTime.get() != generateTime) { orig.set(val); genTime.set(0L); continue; } } else { orig.set(val); } } if (genTime.get() != 0L) { //设置新的生成时间 orig.getMetaData().put(Nutch.WRITABLE_GENERATE_TIME_KEY, genTime); } output.collect(key, orig); } }?