读书人

Grep源代码注释

发布时间: 2012-12-20 09:53:21 作者: rapoo

Grep源代码注释

package org.apache.hadoop.examples;import java.util.Random;import org.apache.hadoop.conf.Configuration;import org.apache.hadoop.conf.Configured;import org.apache.hadoop.fs.FileSystem;import org.apache.hadoop.fs.Path;import org.apache.hadoop.io.LongWritable;import org.apache.hadoop.io.Text;import org.apache.hadoop.mapred.*;import org.apache.hadoop.mapred.lib.*;import org.apache.hadoop.util.Tool;import org.apache.hadoop.util.ToolRunner;/* 对输入文件按正则表达式查找,把结果写到输出文件上。查找用到了RegexMapper,LongSumReducer,InverseMapper系统自带的工具类。这个源代码运行了两个job,一个查找,一个是排序。Extracts matching regexs from input files and counts them. */public class Grep extends Configured implements Tool {  private Grep() {}                               // 单例模式singleton  public int run(String[] args) throws Exception {    if (args.length < 3) {      System.out.println("Grep <inDir> <outDir> <regex> [<group>]");      ToolRunner.printGenericCommandUsage(System.out);      return -1;    }    Path tempDir =      new Path("grep-temp-"+          Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));    JobConf grepJob = new JobConf(getConf(), Grep.class);        try {            grepJob.setJobName("grep-search");      FileInputFormat.setInputPaths(grepJob, args[0]);      grepJob.setMapperClass(RegexMapper.class);//设置系统自带的mapper类来查找      grepJob.set("mapred.mapper.regex", args[2]);      if (args.length == 4)        grepJob.set("mapred.mapper.regex.group", args[3]);      grepJob.setCombinerClass(LongSumReducer.class);//设置系统自带的reducer来做合并      grepJob.setReducerClass(LongSumReducer.class);     //设置系统自带的reducer。      FileOutputFormat.setOutputPath(grepJob, tempDir);      grepJob.setOutputFormat(SequenceFileOutputFormat.class);//设置输出格式是二进制文件      grepJob.setOutputKeyClass(Text.class);//输出的key是Text类型      grepJob.setOutputValueClass(LongWritable.class);//输出的value是long类型      JobClient.runJob(grepJob);      JobConf sortJob = new JobConf(Grep.class);      sortJob.setJobName("grep-sort");      
FileInputFormat.setInputPaths(sortJob, tempDir);      sortJob.setInputFormat(SequenceFileInputFormat.class);//设置输入的文件格式二进制文件      sortJob.setMapperClass(InverseMapper.class);//设置自带的排序mapper      sortJob.setNumReduceTasks(1);                 // write a single file      FileOutputFormat.setOutputPath(sortJob, new Path(args[1]));      sortJob.setOutputKeyComparatorClass           // 输出结果是降序排列sort by decreasing freq      (LongWritable.DecreasingComparator.class);      JobClient.runJob(sortJob);    }    finally {      FileSystem.get(grepJob).delete(tempDir, true);    }    return 0;  }  public static void main(String[] args) throws Exception {    int res = ToolRunner.run(new Configuration(), new Grep(), args);    System.exit(res);  }}
?

读书人网 >编程

热点推荐