Hadoop 根据web日志计算页面访问的独立IP数, 也即计算count(unique)

浏览: 1971

web日志

120.197.87.216 - - [04/Jan/2012:00:00:02 +0800] "GET /home.php?mod=space&uid=563413&mobile=yes HTTP/1.1" 200 3388 "-" "-"
123.126.50.73 - - [04/Jan/2012:00:00:02 +0800] "GET /thread-679411-1-1.html HTTP/1.1" 200 5251 "-" "Sogou web spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)"
203.208.60.187 - - [04/Jan/2012:00:00:02 +0800] "GET /archiver/tid-3003.html HTTP/1.1" 200 2056 "-" "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
114.112.141.6 - - [04/Jan/2012:00:00:02 +0800] "GET /ctp080113.php?action=getgold HTTP/1.1" 200 13886 "-" "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; InfoPath.3; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)"
114.112.141.6 - - [04/Jan/2012:00:00:02 +0800] "GET /ctp080113.php?action=getmedal HTTP/1.1" 200 13882 "-" "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; InfoPath.3; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)"
110.6.179.88 - - [04/Jan/2012:00:00:02 +0800] "GET /forum.php?mod=attachment&aid=NTczNzU3fDFjNDdjZTgzfDEzMjI4NzgwMDV8MTMzOTc4MDB8MTEwMTcxMA%3D%3D&mobile=no HTTP/1.1" 200 172 "http://www.itpub.net/forum.php?mod=attachment&aid=NTczNzU3fDFjNDdjZTgzfDEzMjI4NzgwMDV8MTMzOTc4MDB8MTEwMTcxMA%3D%3D&mobile=yes" "Mozilla/5.0 (Linux; U; Android 2.2; zh-cn; ZTE-U V880 Build/FRF91) UC AppleWebKit/530+ (KHTML, like Gecko) Mobile Safari/530"
116.205.130.2 - - [04/Jan/2012:00:00:02 +0800] "GET /popwin_js.php?fid=6 HTTP/1.1" 200 32 "http://www.itpub.net/forum-6-1.html?ts=28" "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; QQDownload 702; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; AskTbPTV/5.11.3.15590; .NET4.0E)"
114.112.141.6 - - [04/Jan/2012:00:00:02 +0800] "GET /popwin_js.php?fid=133 HTTP/1.1" 200 11 "http://www.itpub.net/thread-1558574-3-9.html" "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; InfoPath.3; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)"
114.112.141.6 - - [04/Jan/2012:00:00:02 +0800] "GET /ctp080113.php?tid=1558574 HTTP/1.1" 200 5 "http://www.itpub.net/thread-1558574-3-9.html" "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; InfoPath.3; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)"
110.75.173.35 - - [04/Jan/2012:00:00:02 +0800] "GET /forum.php?goto=lastpost&mod=redirect&tid=1380214 HTTP/1.1" 302 5 "-" "Yahoo! Slurp China"

MapReducer类

package com.jack.hadoop.weblog.kpi;
import java.io.IOException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;

public class KPIIP {

public static class KPIIPMapper extends MapReduceBase implements Mapper<Object, Text, Text, Text> {
private Text word = new Text();
private Text ips = new Text();

@Override
public void map(Object key, Text value, OutputCollector<Text, Text> output, Reporter reporter) throws IOException {
KPI kpi = KPI.filterIPs(value.toString());
if (kpi.isValid()) {
word.set(kpi.getRequest());
ips.set(kpi.getRemote_addr());
output.collect(word, ips);
}
}
}

public static class KPIIPReducer extends MapReduceBase implements Reducer<Text, Text, Text, Text> {

//private Set<String> count = new HashSet<String>();

@Override
public void reduce(Text key, Iterator<Text> values, OutputCollector<Text, Text> output, Reporter reporter) throws IOException {
Text result = new Text();
Set<String> count = new HashSet<String>();
while (values.hasNext()) {
count.add(values.next().toString());
// System.out.println(values.next().toString());
}
result.set(String.valueOf(count.size()));
System.out.println(key.toString() + ":" + String.valueOf(count.size()));
output.collect(key, result);
}
}

public static void main(String[] args) {
String input = "/tmp/access.20120104-1.log";
String output = "/tmp/ipout";

JobConf conf = new JobConf(KPIIP.class);
conf.setJobName("KPIIP");
conf.setMapOutputKeyClass(Text.class);
conf.setMapOutputValueClass(Text.class);

conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(Text.class);

conf.setMapperClass(KPIIPMapper.class);
//conf.setCombinerClass(KPIIPReducer.class);
conf.setReducerClass(KPIIPReducer.class);

conf.setInputFormat(TextInputFormat.class);
conf.setOutputFormat(TextOutputFormat.class);

FileInputFormat.addInputPath(conf, new Path(input));
FileOutputFormat.setOutputPath(conf, new Path(output));


try {
JobClient.runJob(conf);
} catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
}

System.exit(0);
}

public static RunningJob myCustomRunJob(JobConf job) throws Exception {
JobClient jc = new JobClient(job);
RunningJob rj = jc.submitJob(job);
if (!jc.monitorAndPrintJob(job, rj)) {
throw new IOException("Job failed with infoaaa: " + rj.getFailureInfo());
}
return rj;
}

}

KPI类

package com.jack.hadoop.weblog.kpi;

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;

/*
* KPI Object
*/
public class KPI {
private String remote_addr;// 记录客户端的ip地址
private String remote_user;// 记录客户端用户名称,忽略属性"-"
private String time_local;// 记录访问时间与时区
private String request;// 记录请求的url与http协议
private String status;// 记录请求状态;成功是200
private String body_bytes_sent;// 记录发送给客户端文件主体内容大小
private String http_referer;// 用来记录从那个页面链接访问过来的
private String http_user_agent;// 记录客户浏览器的相关信息
private boolean valid = true;// 判断数据是否合法

private static KPI parser(String line) {
//System.out.println(line);
KPI kpi = new KPI();
String[] arr = line.split(" ");
if (arr.length > 11) {
kpi.setRemote_addr(arr[0]);
kpi.setRemote_user(arr[1]);
kpi.setTime_local(arr[3].substring(1));
kpi.setRequest(arr[6]);
kpi.setStatus(arr[8]);
kpi.setBody_bytes_sent(arr[9]);
kpi.setHttp_referer(arr[10]);
if (arr.length > 12) {
kpi.setHttp_user_agent(arr[11] + " " + arr[12]);
} else {
kpi.setHttp_user_agent(arr[11]);
}
if (Integer.parseInt(kpi.getStatus()) >= 400) {// 大于400,HTTP错误
kpi.setValid(false);
}
} else {
kpi.setValid(false);
}
return kpi;
}

/**
* 按page的pv分类
*/
public static KPI filterPVs(String line) {
/*KPI kpi = parser(line);
Set<String> pages = new HashSet<String>();
pages.add("/forum-46-1.html");
pages.add("/forum-58-1.html");
pages.add("/forum-61-1.html");
if (!pages.contains(kpi.getRequest())) {
kpi.setValid(false);
}
return kpi;*/
return parser(line);
}

/**
* 按page的独立ip分类
*/
public static KPI filterIPs(String line) {
/*KPI kpi = parser(line);
Set<String> pages = new HashSet<String>();
pages.add("/forum-46-1.html");
pages.add("/forum-58-1.html");
pages.add("/forum-61-1.html");
if (!pages.contains(kpi.getRequest())) {
kpi.setValid(false);
}
return kpi;*/
return parser(line);
}

/**
* PV按浏览器分类
*/
public static KPI filterBroswer(String line) {
return parser(line);
}

/**
* PV按小时分类
*/
public static KPI filterTime(String line) {
return parser(line);
}

/**
* PV按访问域名分类
*/
public static KPI filterDomain(String line) {
return parser(line);
}

@Override
public String toString() {
StringBuilder sb = new StringBuilder();
sb.append("valid:" + this.valid);
sb.append("\nremote_addr:" + this.remote_addr);
sb.append("\nremote_user:" + this.remote_user);
sb.append("\ntime_local:" + this.time_local);
sb.append("\nrequest:" + this.request);
sb.append("\nstatus:" + this.status);
sb.append("\nbody_bytes_sent:" + this.body_bytes_sent);
sb.append("\nhttp_referer:" + this.http_referer);
sb.append("\nhttp_user_agent:" + this.http_user_agent);
return sb.toString();
}

public String getRemote_addr() {
return remote_addr;
}

public void setRemote_addr(String remote_addr) {
this.remote_addr = remote_addr;
}

public String getRemote_user() {
return remote_user;
}

public void setRemote_user(String remote_user) {
this.remote_user = remote_user;
}

public String getTime_local() {
return time_local;
}

public Date getTime_local_Date() throws ParseException {
SimpleDateFormat df = new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss",
Locale.US);
return df.parse(this.time_local);
}

public String getTime_local_Date_hour() throws ParseException {
SimpleDateFormat df = new SimpleDateFormat("yyyyMMddHH");
return df.format(this.getTime_local_Date());
}

public void setTime_local(String time_local) {
this.time_local = time_local;
}

public String getRequest() {
return request;
}

public void setRequest(String request) {
this.request = request;
}

public String getStatus() {
return status;
}

public void setStatus(String status) {
this.status = status;
}

public String getBody_bytes_sent() {
return body_bytes_sent;
}

public void setBody_bytes_sent(String body_bytes_sent) {
this.body_bytes_sent = body_bytes_sent;
}

public String getHttp_referer() {
return http_referer;
}

public String getHttp_referer_domain() {
if (http_referer.length() < 8) {
return http_referer;
}
String str = this.http_referer.replace("\"", "").replace("http://", "")
.replace("https://", "");
return str.indexOf("/") > 0 ? str.substring(0, str.indexOf("/")) : str;
}

public void setHttp_referer(String http_referer) {
this.http_referer = http_referer;
}

public String getHttp_user_agent() {
return http_user_agent;
}

public void setHttp_user_agent(String http_user_agent) {
this.http_user_agent = http_user_agent;
}

public boolean isValid() {
return valid;
}

public void setValid(boolean valid) {
this.valid = valid;
}

public static void main(String args[]) {
String line = "112.97.24.243 - - [31/Jan/2012:00:14:48 +0800] \"GET /data/cache/style_2_common.css?AZH HTTP/1.1\" 200 57752 \"http://f.dataguru.cn/forum-58-1.html\" \"Mozilla/5.0 (iPhone; CPU iPhone OS 5_0_1 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Mobile/9A406\"";
System.out.println(line);
KPI kpi = new KPI();
String[] arr = line.split(" ");
kpi.setRemote_addr(arr[0]);
kpi.setRemote_user(arr[1]);
kpi.setTime_local(arr[3].substring(1));
kpi.setRequest(arr[6]);
kpi.setStatus(arr[8]);
kpi.setBody_bytes_sent(arr[9]);
kpi.setHttp_referer(arr[10]);
kpi.setHttp_user_agent(arr[11] + " " + arr[12]);
System.out.println(kpi);
try {
SimpleDateFormat df = new SimpleDateFormat("yyyy.MM.dd:HH:mm:ss z",Locale.US);
System.out.println(df.format(kpi.getTime_local_Date()));
System.out.println(kpi.getTime_local_Date_hour());
System.out.println(kpi.getHttp_referer_domain());
} catch (ParseException e) {
e.printStackTrace();
}
}
}

这里 有2个地方请大家注意

1、在Reducer类里面,Set 不能作为reducer类的 私有变量, 要写在reducer方法里面,可见我注释的地方

2、运行时,不能设置Combiner类

推荐 0
本文由 策马行空 创作,采用 知识共享署名-相同方式共享 3.0 中国大陆许可协议 进行许可。
转载、引用前需联系作者,并署名作者且注明文章出处。
本站文章版权归原作者及原出处所有 。内容为作者个人观点, 并不代表本站赞同其观点和对其真实性负责。本站是一个个人学习交流的平台,并不用于任何商业目的,如果有任何问题,请及时联系我们,我们将根据著作权人的要求,立即更正或者删除有关内容。本站拥有对此声明的最终解释权。

0 个评论

要回复文章请先登录注册