一. Tools:
IntelliJ IDEA, Hadoop, fastjson
二. Requirements Analysis:
1. Analyze the front-end data and find the ten streamers with the longest total broadcast time on the platform.
2. Collect the following statistics for these streamers:
- Streamer ID (uid)
- Number of gold coins (gold)
- Total view PV (watchnumpv)
- Number of followers (follower)
- Total broadcast time (length)
三. Data Cleaning
The log file received from the front end is in JSON format, with one JSON object per line.
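For reference, here is a hypothetical raw log line together with the cleaned record that DataCleanMap (step 4 below) emits for it; all field values are invented for illustration, and any extra fields present in the real logs are simply ignored by the cleaning step:

Raw JSON line:  {"uid":"8416573","gold":520,"watchnumpv":3024,"follower":912,"length":365}
Cleaned record: 8416573 <tab> 520 <tab> 3024 <tab> 912 <tab> 365 (uid as the key, tab-separated gold/watchnumpv/follower/length as the value)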
1. We can therefore use fastjson to parse each line, extract the core fields we need, and check them for abnormal values.
2. Since no aggregation is needed here, only a simple filter, a map phase is enough and no reduce phase is required.
3. Under the Maven project's src/main/java directory, create a new package named DataClean and create two classes inside it:
- DataCleanJob
- DataCleanMap
4. DataCleanMap:
package DataClean;

import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class DataCleanMap extends Mapper<LongWritable, Text, Text, Text> {
    /**
     * 1: Extract the required fields from the raw data.
     * 2: Check the core fields for abnormal values.
     * @param k1
     * @param v1
     * @param context
     * @throws IOException
     * @throws InterruptedException
     */
    @Override
    protected void map(LongWritable k1, Text v1, Context context) throws IOException, InterruptedException {
        // Get the content of the current line
        String line = v1.toString();
        // Parse the JSON string into a JSON object
        JSONObject jsonObject = JSON.parseObject(line);
        // Get the required fields from the JSON object
        // (use getIntValue for numeric fields so a missing field yields 0 instead of an exception)
        String id = jsonObject.getString("uid");
        int gold = jsonObject.getIntValue("gold");
        int watchnumpv = jsonObject.getIntValue("watchnumpv");
        int follower = jsonObject.getIntValue("follower");
        int length = jsonObject.getIntValue("length");
        // Filter out abnormal records
        if (gold >= 0 && watchnumpv >= 0 && follower >= 0 && length >= 0) {
            // Assemble k2, v2
            Text k2 = new Text();
            k2.set(id);
            Text v2 = new Text();
            v2.set(gold + "\t" + watchnumpv + "\t" + follower + "\t" + length);
            context.write(k2, v2);
        }
    }
}
5. DataCleanJob:
package DataClean;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class DataCleanJob {
    public static void main(String[] args) {
        try {
            if (args.length != 2) {
                // Expect exactly two arguments: <input path> <output path>
                System.exit(100);
            }
            // Configuration parameters needed by the job
            Configuration conf = new Configuration();
            // Create a job
            Job job = Job.getInstance(conf);
            // This line is required, otherwise the class cannot be found when running on the cluster
            job.setJarByClass(DataCleanJob.class);
            // Specify the input path
            FileInputFormat.setInputPaths(job, new Path(args[0]));
            // Specify the output directory
            FileOutputFormat.setOutputPath(job, new Path(args[1]));
            // Specify the map class
            job.setMapperClass(DataCleanMap.class);
            // Specify the type of k2
            job.setMapOutputKeyClass(Text.class);
            // Specify the type of v2
            job.setMapOutputValueClass(Text.class);
            // Disable the reduce phase
            job.setNumReduceTasks(0);
            job.waitForCompletion(true);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
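The cleaning job can then be packaged and submitted with hadoop jar; the jar name and HDFS paths below are only placeholders, so adjust them to your own environment:

hadoop jar videoinfo-analysis.jar DataClean.DataCleanJob /data/videoinfo/raw /data/videoinfo/clean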
四. Per-Streamer Aggregation
1. Create a new package named VideoInfo.
2. To make it easier to aggregate each streamer's metrics, it is best to bundle these fields into a single object, which is also easier to maintain. This calls for a custom Writable class, VideoInfoWritable, with the serialization methods overridden.
The setGold/getGold style accessors can be generated automatically in IDEA via Generate > Getter / Setter.
VideoInfoWritable:
package VideoInfo;

import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public class VideoInfoWritable implements Writable {
    private long gold;
    private long watchnumpv;
    private long follower;
    private long length;

    public void set(long gold, long watchnumpv, long follower, long length) {
        this.gold = gold;
        this.watchnumpv = watchnumpv;
        this.follower = follower;
        this.length = length;
    }

    public void setGold(long gold) {
        this.gold = gold;
    }

    public void setWatchnumpv(long watchnumpv) {
        this.watchnumpv = watchnumpv;
    }

    public void setFollower(long follower) {
        this.follower = follower;
    }

    public void setLength(long length) {
        this.length = length;
    }

    public long getGold() {
        return gold;
    }

    public long getWatchnumpv() {
        return watchnumpv;
    }

    public long getFollower() {
        return follower;
    }

    public long getLength() {
        return length;
    }

    // Note: readFields must read the fields in exactly the same order that write writes them
    @Override
    public void readFields(DataInput dataInput) throws IOException {
        this.gold = dataInput.readLong();
        this.watchnumpv = dataInput.readLong();
        this.follower = dataInput.readLong();
        this.length = dataInput.readLong();
    }

    @Override
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeLong(gold);
        dataOutput.writeLong(watchnumpv);
        dataOutput.writeLong(follower);
        dataOutput.writeLong(length);
    }

    @Override
    public String toString() {
        return gold + "\t" + watchnumpv + "\t" + follower + "\t" + length;
    }
}
3. Write the map function to split each line of the cleaned data and assemble k2 and v2, where k2 is the streamer id (type Text) and v2 is of type VideoInfoWritable.
VideoInfoMap:
package VideoInfo;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class VideoInfoMap extends Mapper<LongWritable, Text, Text, VideoInfoWritable> {
    @Override
    protected void map(LongWritable k1, Text v1, Context context) throws IOException, InterruptedException {
        // Read one line of the cleaned data
        String line = v1.toString();
        // Split the line on tabs
        String[] fields = line.split("\t");
        String id = fields[0];
        long gold = Long.parseLong(fields[1]);
        long watchnumpv = Long.parseLong(fields[2]);
        long follower = Long.parseLong(fields[3]);
        long length = Long.parseLong(fields[4]);
        // Assemble k2, v2
        Text k2 = new Text();
        k2.set(id);
        VideoInfoWritable v2 = new VideoInfoWritable();
        v2.set(gold, watchnumpv, follower, length);
        context.write(k2, v2);
    }
}
4. Write the reduce function to aggregate the metrics:
VideoInfoReduce:
package VideoInfo;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class VideoInfoReduce extends Reducer<Text, VideoInfoWritable, Text, VideoInfoWritable> {
    @Override
    protected void reduce(Text k2, Iterable<VideoInfoWritable> v2s, Context context) throws IOException, InterruptedException {
        // Sum up all values in v2s that share the same key
        long goldsum = 0;
        long watchnumpvsum = 0;
        long followersum = 0;
        long lengthsum = 0;
        for (VideoInfoWritable v2 : v2s) {
            goldsum += v2.getGold();
            watchnumpvsum += v2.getWatchnumpv();
            followersum += v2.getFollower();
            lengthsum += v2.getLength();
        }
        // Assemble k3, v3
        Text k3 = k2;
        VideoInfoWritable v3 = new VideoInfoWritable();
        v3.set(goldsum, watchnumpvsum, followersum, lengthsum);
        context.write(k3, v3);
    }
}
5. Assemble the map and reduce into a job:
VideoInfoJob:
package VideoInfo;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class VideoInfoJob {
    public static void main(String[] args) {
        try {
            if (args.length != 2) {
                // Expect exactly two arguments: <input path> <output path>
                System.exit(100);
            }
            // Configuration parameters needed by the job
            Configuration conf = new Configuration();
            // Create a job
            Job job = Job.getInstance(conf);
            // This line is required, otherwise the class cannot be found when running on the cluster
            job.setJarByClass(VideoInfoJob.class);
            // Specify the input path
            FileInputFormat.setInputPaths(job, new Path(args[0]));
            // Specify the output directory
            FileOutputFormat.setOutputPath(job, new Path(args[1]));
            // Specify the map class
            job.setMapperClass(VideoInfoMap.class);
            // Specify the type of k2
            job.setMapOutputKeyClass(Text.class);
            // Specify the type of v2
            job.setMapOutputValueClass(VideoInfoWritable.class);
            // Specify the reduce class
            job.setReducerClass(VideoInfoReduce.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(VideoInfoWritable.class);
            job.waitForCompletion(true);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
五. Finding the Top 10 Streamers by Broadcast Time
VideoInfoTop10Map.java:
package Top10;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class VideoInfoTop10Map extends Mapper<LongWritable, Text, Text, LongWritable> {
    @Override
    protected void map(LongWritable k1, Text v1, Context context) throws IOException, InterruptedException {
        // Read one line of the cleaned data
        String line = v1.toString();
        String[] fields = line.split("\t");
        String id = fields[0];
        long length = Long.parseLong(fields[4]);
        // Assemble k2, v2
        Text k2 = new Text();
        k2.set(id);
        LongWritable v2 = new LongWritable();
        v2.set(length);
        context.write(k2, v2);
    }
}
In the reduce step we need to compute each streamer's total broadcast time for the day and rank the streamers by that total.
Hadoop does sort the data between the map and reduce phases, but that sort is by key, in lexicographic order, and the key arriving at the reducer is the streamer id, so it does not give us the ordering we want. Instead, we collect the per-streamer totals in a HashMap and sort it by value ourselves; for this we implement a MapUtils class that handles the sorting.
To make the log files easier to manage, we adopt the convention that the input path ends with the date, and we write both the date and the streamer id into k3. For that we need a small date-handling class, DateUtils.
DateUtils.java:
package Top10;

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;

public class DateUtils {
    public static SimpleDateFormat sdft1 = new SimpleDateFormat("yyyyMMdd");
    public static SimpleDateFormat sdft2 = new SimpleDateFormat("yyyy-MM-dd");

    /**
     * Convert a date string from yyyyMMdd to yyyy-MM-dd.
     */
    public static String transDateFormat(String dt) {
        String res = "1970-01-01";
        try {
            Date date = sdft1.parse(dt);
            res = sdft2.format(date);
        } catch (ParseException e) {
            System.out.println("Failed to parse date: " + dt);
        }
        return res;
    }
}
MapUtils.java:
package Top10;

import java.util.*;

public class MapUtils {
    /**
     * Sort a map by value in descending order and return a LinkedHashMap
     * that preserves the sorted order.
     */
    public static <K, V extends Comparable<? super V>> Map<K, V> sortValue(Map<K, V> map) {
        List<Map.Entry<K, V>> list = new ArrayList<>(map.entrySet());
        Collections.sort(list, new Comparator<Map.Entry<K, V>>() {
            @Override
            public int compare(Map.Entry<K, V> o1, Map.Entry<K, V> o2) {
                // Descending order
                int compare = (o1.getValue()).compareTo(o2.getValue());
                return -compare;
            }
        });
        Map<K, V> returnMap = new LinkedHashMap<>();
        for (Map.Entry<K, V> entry : list) {
            returnMap.put(entry.getKey(), entry.getValue());
        }
        return returnMap;
    }
}
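A quick sanity check of sortValue; the class name MapUtilsDemo, the streamer ids, and the durations below are all made up for illustration:

package Top10;

import java.util.HashMap;
import java.util.Map;

public class MapUtilsDemo {
    public static void main(String[] args) {
        // Made-up streamer ids mapped to total broadcast times
        Map<String, Long> totals = new HashMap<>();
        totals.put("1001", 350L);
        totals.put("1002", 980L);
        totals.put("1003", 120L);
        // Prints {1002=980, 1001=350, 1003=120}, i.e. descending by value
        System.out.println(MapUtils.sortValue(totals));
    }
}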
VideoInfoTop10Reduce.java:
package Top10;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;

public class VideoInfoTop10Reduce extends Reducer<Text, LongWritable, Text, LongWritable> {
    // Holds every streamer's id and total broadcast time
    HashMap<String, Long> map = new HashMap<>();

    @Override
    protected void reduce(Text k2, Iterable<LongWritable> v2s, Context context) throws IOException, InterruptedException {
        long lengthsum = 0;
        for (LongWritable v2 : v2s) {
            lengthsum += v2.get();
        }
        map.put(k2.toString(), lengthsum);
    }

    /**
     * Runs exactly once when the task starts; typically used to open resources.
     * @param context
     * @throws IOException
     * @throws InterruptedException
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        super.setup(context);
    }

    /**
     * Runs exactly once when the task finishes; typically used to release resources.
     * Here it sorts the collected totals and writes out the top 10.
     * @param context
     * @throws IOException
     * @throws InterruptedException
     */
    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        Configuration conf = context.getConfiguration();
        // Read the dt parameter from the configuration
        String dt = conf.get("dt");
        // Sort the map by value (total broadcast time), descending
        Map<String, Long> sortedMap = MapUtils.sortValue(map);
        Set<Map.Entry<String, Long>> entries = sortedMap.entrySet();
        Iterator<Map.Entry<String, Long>> it = entries.iterator();
        int count = 1;
        while (count <= 10 && it.hasNext()) {
            Map.Entry<String, Long> entry = it.next();
            String key = entry.getKey();
            Long value = entry.getValue();
            // Assemble k3, v3
            Text k3 = new Text();
            k3.set(dt + "\t" + key);
            LongWritable v3 = new LongWritable();
            v3.set(value);
            context.write(k3, v3);
            count++;
        }
    }
}
Finally, assemble the map and reduce into a job:
VideoInfoJobTop10:
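VideoInfoJobTop10 follows the same pattern as VideoInfoJob; the only new piece is that it must put the date into the configuration under the key "dt", because VideoInfoTop10Reduce reads conf.get("dt") in cleanup(). The sketch below is a minimal outline under that assumption: it takes the date from the last segment of the input path, following the naming convention described above, rather than being a definitive implementation.

package Top10;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class VideoInfoJobTop10 {
    public static void main(String[] args) {
        try {
            if (args.length != 2) {
                // Expect exactly two arguments: <input path> <output path>
                System.exit(100);
            }
            // Configuration parameters needed by the job
            Configuration conf = new Configuration();
            // Assumption: the input path ends with the date (e.g. .../20260201),
            // so take the last path segment, convert it with DateUtils,
            // and store it as "dt" for VideoInfoTop10Reduce to read in cleanup()
            String[] pathSegments = args[0].split("/");
            String rawDate = pathSegments[pathSegments.length - 1];
            conf.set("dt", DateUtils.transDateFormat(rawDate));
            // Create a job
            Job job = Job.getInstance(conf);
            // This line is required, otherwise the class cannot be found when running on the cluster
            job.setJarByClass(VideoInfoJobTop10.class);
            // Specify the input path and output directory
            FileInputFormat.setInputPaths(job, new Path(args[0]));
            FileOutputFormat.setOutputPath(job, new Path(args[1]));
            // Specify the map class and the types of k2/v2
            job.setMapperClass(VideoInfoTop10Map.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(LongWritable.class);
            // Specify the reduce class and the types of k3/v3
            job.setReducerClass(VideoInfoTop10Reduce.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(LongWritable.class);
            job.waitForCompletion(true);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}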