读书人

MapReduce实战-分析Apache日志访问页面大小

发布时间: 2013-10-29 12:07:57 作者: rapoo

MapReduce实战--分析Apache日志访问页面大小

日志文件:

220.181.108.151 - - [31/Jan/2012:00:02:32 +0800] "GET /home.php?mod=space&uid=158&do=album&view=me&from=space HTTP/1.1" 200 8784 "-" "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)"

208.115.113.82 - - [31/Jan/2012:00:07:54 +0800] "GET /robots.txt HTTP/1.1" 200 582 "-" "Mozilla/5.0 (compatible; Ezooms/1.0; ezooms.bot@gmail.com)"

220.181.94.221 - - [31/Jan/2012:00:09:24 +0800] "GET /home.php?mod=spacecp&ac=pm&op=showmsg&handlekey=showmsg_3&touid=3&pmid=0&daterange=2&pid=398&tid=66 HTTP/1.1" 200 10070 "-" "Sogou web spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)"

112.97.24.243 - - [31/Jan/2012:00:14:48 +0800] "GET /data/cache/style_2_common.css?AZH HTTP/1.1" 200 57752 "http://forum-58-1.html" "Mozilla/5.0 (iPhone; CPU iPhone OS 5_0_1 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Mobile/9A406"

112.97.24.243 - - [31/Jan/2012:00:14:48 +0800] "GET /data/cache/style_2_widthauto.css?AZH HTTP/1.1" 200 1024 "http://forum-58-1.html" "Mozilla/5.0 (iPhone; CPU iPhone OS 5_0_1 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Mobile/9A406"


计算页面大小的平均值、最大值和最小值

代码:

import java.io.IOException;

import java.text.DateFormat;

import java.text.SimpleDateFormat;

import java.util.Date;

import java.util.Iterator;

import java.util.regex.Matcher;

import java.util.regex.Pattern;

import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.fs.Path;

import org.apache.hadoop.io.IntWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapred.JobConf;

import org.apache.hadoop.mapreduce.Job;

import org.apache.hadoop.mapreduce.Mapper;

import org.apache.hadoop.mapreduce.Reducer;

import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

import org.apache.hadoop.util.GenericOptionsParser;

import org.apache.hadoop.util.Tool;

import org.apache.hadoop.util.ToolRunner;

public class WebLogMessageSizeAggregator {


public static final Pattern httplogPattern = Pattern

.compile("([^\\s]+) - - \\[(.+)\\] \"([^\\s]+) (/[^\\s]*) HTTP/[^\\s]+\" [^\\s]+ ([0-9]+)");

public static class AMapper extends Mapper<Object, Text, Text, IntWritable> {


public void map(Object key, Text value, Context context)throws IOException, InterruptedException {

Matcher matcher = httplogPattern.matcher(value.toString());

while (matcher.find())//查找符合pattern的字符串

{

int size = Integer.parseInt(matcher.group(5));

context.write(new Text("msgSize"),new IntWritable(size));

}

}

}

publicstaticclass AReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

public void reduce(Text key, Iterable<IntWritable> values, Context context)throws IOException,

InterruptedException {

double tot = 0;

int count = 0;

int min = Integer.MAX_VALUE;

int max = 0;

Iterator<IntWritable> iterator = values.iterator();

while (iterator.hasNext()) {

int value = iterator.next().get();

tot = tot + value;

count++;

if (value < min) {

min = value;

}

if (value > max) {

max = value;

}

}

context.write(new Text("Mean"),new IntWritable((int) tot / count));

context.write(new Text("Max"),new IntWritable(max));

context.write(new Text("Min"),new IntWritable(min));

}

}


public static void main(String[] args) throws Exception {

JobConf conf = new JobConf();

String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();

if (otherArgs.length != 2) {

System.err.println("Usage: <in> <out>");

System.exit(2);

}


Job job = new Job(conf, "WebLogMessageSizeAggregator");

job.setJarByClass(WebLogMessageSizeAggregator.class);

job.setMapperClass(AMapper.class);

job.setReducerClass(AReducer.class);

job.setMapOutputKeyClass(Text.class);

job.setMapOutputValueClass(IntWritable.class);

FileInputFormat.addInputPath(job, new Path(otherArgs[0]));

FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

System.exit(job.waitForCompletion(true) ? 0 : 1);

}

}

运行结果:

Mean 13221

Max 10240000

Min 1


读书人网 >云计算

热点推荐