
Hadoop Series A: Multiple File Output

Published: 2012-12-23 11:28:15  Author: rapoo

1. Define an abstract MultipleOutputFormat class that extends FileOutputFormat. Subclasses override generateFileNameForKeyValue to decide which file each <Key, Value> pair is written to:
package org.myorg;

import java.io.DataOutputStream;
import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.ReflectionUtils;

public abstract class MultipleOutputFormat<K extends WritableComparable, V extends Writable>
        extends FileOutputFormat<K, V> {

    private MultiRecordWriter writer = null;

    @Override
    public RecordWriter<K, V> getRecordWriter(TaskAttemptContext job)
            throws IOException, InterruptedException {
        if (writer == null) {
            writer = new MultiRecordWriter(job, getTaskOutputPath(job));
        }
        return writer;
    }

    private Path getTaskOutputPath(TaskAttemptContext conf) throws IOException {
        Path workPath = null;
        OutputCommitter committer = super.getOutputCommitter(conf);
        if (committer instanceof FileOutputCommitter) {
            workPath = ((FileOutputCommitter) committer).getWorkPath();
        } else {
            Path outputPath = super.getOutputPath(conf);
            if (outputPath == null) {
                throw new IOException("Undefined job output-path");
            }
            workPath = outputPath;
        }
        return workPath;
    }

    // Subclasses decide which file a given <key, value> pair is written to.
    protected abstract String generateFileNameForKeyValue(K key, V value, TaskAttemptContext job);

    public class MultiRecordWriter extends RecordWriter<K, V> {
        // One underlying RecordWriter per generated file name, so each
        // output file is opened only once per task.
        private HashMap<String, RecordWriter<K, V>> recordWriters = null;
        private TaskAttemptContext job = null;
        private Path workPath = null;

        public MultiRecordWriter(TaskAttemptContext job, Path workPath) {
            super();
            this.job = job;
            this.workPath = workPath;
            recordWriters = new HashMap<String, RecordWriter<K, V>>();
        }

        @Override
        public void close(TaskAttemptContext context) throws IOException, InterruptedException {
            Iterator<RecordWriter<K, V>> values = this.recordWriters.values().iterator();
            while (values.hasNext()) {
                values.next().close(context);
            }
            this.recordWriters.clear();
        }

        @Override
        public void write(K key, V value) throws IOException, InterruptedException {
            String baseName = generateFileNameForKeyValue(key, value, job);
            RecordWriter<K, V> rw = this.recordWriters.get(baseName);
            if (rw == null) {
                rw = getBaseRecordWriter(job, baseName);
                this.recordWriters.put(baseName, rw);
            }
            rw.write(key, value);
        }

        private RecordWriter<K, V> getBaseRecordWriter(TaskAttemptContext job, String baseName)
                throws IOException, InterruptedException {
            Configuration conf = job.getConfiguration();
            boolean isCompressed = getCompressOutput(job);
            String keyValueSeparator = "\t";
            // The author's baseName layout: chars 0-10 are the file name,
            // char 11 is a separator, and chars 12 onward name a subdirectory.
            String pathname = baseName.substring(12);
            RecordWriter<K, V> recordWriter = null;
            if (isCompressed) {
                Class<? extends CompressionCodec> codecClass =
                        getOutputCompressorClass(job, GzipCodec.class);
                CompressionCodec codec = ReflectionUtils.newInstance(codecClass, conf);
                Path file = new Path(workPath + "/" + pathname,
                        baseName.substring(0, 11) + codec.getDefaultExtension());
                FSDataOutputStream fileOut = file.getFileSystem(conf).create(file, false);
                recordWriter = new LineRecordWriter<K, V>(
                        new DataOutputStream(codec.createOutputStream(fileOut)), keyValueSeparator);
            } else {
                Path file = new Path(workPath + "/" + pathname, baseName.substring(0, 11));
                FSDataOutputStream fileOut = file.getFileSystem(conf).create(file, false);
                recordWriter = new LineRecordWriter<K, V>(fileOut, keyValueSeparator);
            }
            return recordWriter;
        }
    }
}

2. Extract LineRecordWriter from TextOutputFormat so it can be used as a standalone public class. LineRecordWriter is an implementation of RecordWriter that serializes each <Key, Value> pair as one line of text. In Hadoop it exists as a protected inner class of TextOutputFormat, so ordinary programs cannot access it directly.

The code is as follows:

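A standalone version can be lifted almost verbatim from the Hadoop source; the sketch below follows the LineRecordWriter inner class of TextOutputFormat from the Hadoop 0.20/1.x line this article targets:

package org.myorg;

import java.io.DataOutputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

public class LineRecordWriter<K, V> extends RecordWriter<K, V> {
    private static final String utf8 = "UTF-8";
    private static final byte[] newline;
    static {
        try {
            newline = "\n".getBytes(utf8);
        } catch (UnsupportedEncodingException uee) {
            throw new IllegalArgumentException("can't find " + utf8 + " encoding");
        }
    }

    protected DataOutputStream out;
    private final byte[] keyValueSeparator;

    public LineRecordWriter(DataOutputStream out, String keyValueSeparator) {
        this.out = out;
        try {
            this.keyValueSeparator = keyValueSeparator.getBytes(utf8);
        } catch (UnsupportedEncodingException uee) {
            throw new IllegalArgumentException("can't find " + utf8 + " encoding");
        }
    }

    public LineRecordWriter(DataOutputStream out) {
        this(out, "\t");
    }

    // Write the object as UTF-8 text; Text instances avoid an extra copy.
    private void writeObject(Object o) throws IOException {
        if (o instanceof Text) {
            Text to = (Text) o;
            out.write(to.getBytes(), 0, to.getLength());
        } else {
            out.write(o.toString().getBytes(utf8));
        }
    }

    public synchronized void write(K key, V value) throws IOException {
        boolean nullKey = key == null || key instanceof NullWritable;
        boolean nullValue = value == null || value instanceof NullWritable;
        if (nullKey && nullValue) {
            return;
        }
        if (!nullKey) {
            writeObject(key);
        }
        if (!(nullKey || nullValue)) {
            out.write(keyValueSeparator);
        }
        if (!nullValue) {
            writeObject(value);
        }
        out.write(newline);
    }

    public synchronized void close(TaskAttemptContext context) throws IOException {
        out.close();
    }
}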

3. Implement the generateFileNameForKeyValue method in your own output format subclass:

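As an illustration, a subclass might route records by key. The class name myOutput matches the driver call in the next step; the fixed "part-000000" file prefix and the per-key subdirectory are assumptions chosen here to satisfy the substring(0, 11)/substring(12) layout expected by getBaseRecordWriter above:

package org.myorg;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

// Illustrative subclass: writes each record into a subdirectory named
// after its key. Assumes Text keys and values.
public class myOutput extends MultipleOutputFormat<Text, Text> {
    @Override
    protected String generateFileNameForKeyValue(Text key, Text value, TaskAttemptContext job) {
        // getBaseRecordWriter expects: chars 0-10 = file name, char 11 = a
        // separator, chars 12+ = subdirectory. "part-000000" is 11 chars.
        return "part-000000" + "/" + key.toString();
    }
}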

In the main function you also need to add job.setOutputFormatClass(myOutput.class);
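For context, a minimal driver sketch showing where that call goes; the job name, paths, and the commented-out mapper/reducer classes are placeholders, not from the original article:

package org.myorg;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class Driver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = new Job(conf, "multiple file output"); // new Job() was current in the 0.20/1.x era
        job.setJarByClass(Driver.class);
        // Plug in your own mapper/reducer here:
        // job.setMapperClass(MyMapper.class);
        // job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        // The key line: use the custom multi-file output format.
        job.setOutputFormatClass(myOutput.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}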

