对于给定的video日志数据,先利用mapreduce程序进行数据清洗,把数据的存储格式按我们的要求存入文件。

一、数据清洗代码

mapper端对数据清洗后直接输出,不需要reduce阶段 

public class ETLMapper extends Mapper<LongWritable, Text,Text, NullWritable>{
    private Counter pass;
    private Counter fail;

    private StringBuilder sb = new StringBuilder();
    private Text result = new Text();

    @Override
    protected void setup(Mapper<LongWritable, Text, Text, NullWritable>.Context context) throws IOException, InterruptedException {
        pass = context.getCounter("ETL", "Pass");
        fail = context.getCounter("ETL","Fail");
    }

    /**日志样例:SDNkMu8ZT68	w00dy911	630	People & Blogs	186	10181	3.49	494	257	rjnbgpPJUks
     * 将一行的日志进行处理,字段不够的抛弃,将第四个字段中的空格去掉,将最后的相关视频的分隔符改成‘&’
     * @param key 行偏移量
     * @param value 这行内容
     * @param context
     * @throws IOException
     * @throws InterruptedException
     */
    @Override
    protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, NullWritable>.Context context) throws IOException, InterruptedException {
        //一行数据
        String line = value.toString();
        //这样切分会把最后一个字段的数据切开,需要最后做拼接
        String[] fields = line.split("\t");
        //每一行数据都会进入map一次,sb是可变字符串,所以每次进来需要清零。
        sb.setLength(0);
        //先判断字段个数够不够
        if (fields.length >= 9){
            //去掉第四个字段的空格(视频标签字段的数组用&连接)  样例:People & Blogs
            fields[3] = fields[3].replace(" ","");
            //拼接字段
            for (int i = 0; i < fields.length; i++) {
                if (i == fields.length-1){  //相关视频的最后一个,不用做处理,直接添加
                    sb.append(fields[i]);
                }else if (i <= 8){  //前九个字段之间用\t隔开
                    sb.append(fields[i]).append("\t");
                }else{  //相关视频字段用&隔开
                    sb.append(fields[i]).append("&");
                }
            }
            result.set(sb.toString());
            context.write(result,NullWritable.get());
            pass.increment(1);
        }else {
            //如果字段不够,舍弃不要
            fail.increment(1);
        }
    }
}

driver端

public class ETLToolDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        //使用tez引擎
//        conf.set("mapreduce.framework.name","yarn-tez");

        Job job = Job.getInstance(conf);

        job.setJarByClass(ETLToolDriver.class);

        job.setMapperClass(ETLMapper.class);

        job.setNumReduceTasks(0);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        boolean b = job.waitForCompletion(true);
        System.exit(b ? 0 : 1);


    }
}

①程序编写完成后,现在本地模式下跑一遍看看有没有问题,没问题的话,直接打包上传到集群

②将要清洗数据的文件提前上传到hdfs

在hadoop服务器端执行如下代码实现数据清洗

yarn jar /opt/module/hadoop/上传的jar包名  driver类的reference引用 hdfs上的文件目录 清洗后存放的目录

我的代码
yarn jar /opt/module/hadoop/etltool-1.0-SNAPSHOT.jar com.atguigui.etl.ETLToolDriver /gulivideo/video /gulivideo/video_etl

在hive端创建数据库并使用该数据库

create database gulivideo;
use gulivideo;

创建两张外部表

create external table video_ori(
    videoId string, 
    uploader string, 
    age int, 
    category array<string>, 
    length int, 
    views int, 
    rate float, 
    ratings int, 
    comments int,
    relatedId array<string>)
row format delimited fields terminated by "\t"
collection items terminated by "&"
location '/gulivideo/video_etl';
create external table user_ori(
    uploader string,
    videos int,
    friends int)
row format delimited fields terminated by "\t" 
location '/gulivideo/user';

在创建两张内部表并添加数据到内部表

create table video_orc(
    videoId string, 
    uploader string, 
    age int, 
    category array<string>, 
    length int, 
    views int, 
    rate float, 
    ratings int, 
    comments int,
    relatedId array<string>)
row format delimited fields terminated by "\t"
collection items terminated by '&'
stored as orc
tblproperties("orc.compress"="snappy");
create external table user_orc(
    uploader string,
    videos int,
    friends int)
row format delimited fields terminated by "\t" 
stored as orc
tblproperties("orc.compress"="snappy");
insert into table video_orc select * from video_ori;
insert into table user_orc select * from user_ori;

需求①:统计视频播放量Top10 

select 
    videoid,views
from
    video_orc
order by
    views desc
limit 10;

 

hive清洗数据难点 hive数据清洗步骤_hadoop

需求②:统计视频类别热度Top10

第一步:把类别字段的数组炸开
select 
    videoid,cate
from 
    video_orc
lateral view 
    explode(category) tbl as cate;

第二步:根据第一步的结果,分类统计各个类别的数目并作排序输出目标数据
select 
    cate,count(videoid) as counter
from(
    select 
        videoid,cate
    from 
        video_orc
    lateral view 
        explode(category) tbl as cate) tbl
group by cate
order by counter desc
limit 10;

hive清洗数据难点 hive数据清洗步骤_大数据_02

需求③:统计视频观看数最高的20个视频的所属类别以及类别包含Top20视频的个数 

第一步:统计观看数最高的20个视频以及类别
select 
    videoid,category,views
from
    video_orc
order by views desc
limit 20;

//第二步:根据①的结果,将category炸开。
select cate,videoid
from(
    select 
        videoid,category,views
    from
        video_orc
    order by views desc
    limit 20) t1
lateral view
    explode(category) tbl as cate;

第三步:按照炸开后的类别分组,统计个数
select cate,count(videoid) n
from(
    select cate,videoid
    from(
        select 
            videoid,category,views
        from
            video_orc
        order by views desc
        limit 20) t1
    lateral view
        explode(category) tbl as cate
) t2
group by cate
order by n desc;

 

hive清洗数据难点 hive数据清洗步骤_hadoop_03

需求④:统计视频观看数Top50 所关联视频的所属类别排序

第一步:统计Top50的关联视频
select 
    videoid,views,relatedid
from
    video_orc
order by views desc
limit 50;


第二步:炸开相关视频的数组
select 
    explode(relatedid) videoid
from(
    select 
        videoid,views,relatedid
    from
        video_orc
    order by views desc
    limit 50
    ) t1;

第三步:和原表join获取category
select distinct t2.videoid,v.category
from(
    select 
        explode(relatedid) videoid
    from(
        select 
            videoid,views,relatedid
        from
            video_orc
        order by views desc
        limit 50
    ) t1
) t2
join video_orc v
on t2.videoid=v.videoid;

第四步:炸开第三步中得到的category
select videoid,cate
from(
    select distinct t2.videoid,v.category
    from(
        select 
            explode(relatedid) videoid
        from(
            select 
                videoid,views,relatedid
            from
                video_orc
            order by views desc
            limit 50
        ) t1
    ) t2
join video_orc v
on t2.videoid=v.videoid
) t3
lateral view
    explode(category) tbl as cate;
第五步:将第四步的结果按照cate分组并计数排序后输出
select cate,count(videoid) n
from(
    select videoid,cate
    from(
        select distinct t2.videoid,v.category
            from(
                select 
                    explode(relatedid) videoid
                from(
                    select 
                        videoid,views,relatedid
                    from
                       video_orc
                    order by views desc
                    limit 50
                ) t1
            ) t2
        join video_orc v
        on t2.videoid=v.videoid
        ) t3
        lateral view
            explode(category) tbl as cate
) t4
group by
    cate
order by n desc;

hive清洗数据难点 hive数据清洗步骤_java_04

需求⑤:统计每个类别中的视频热度Top10,以music为例

//创建一张中间表格,把category字段炸开
create table video_category
stored as orc
tblproperties("orc.compress"="snappy") as 
select
    videoid,uploader,age,cate,length,views,rate,ratings,comments,relatedid
from
    video_orc
lateral view
    explode(category) tbl as cate;

//从中间表格查询
select 
    videoid,views,cate
from 
    video_category
where 
    cate='Music'
order by views desc
limit 10;

hive清洗数据难点 hive数据清洗步骤_hadoop_05

 需求⑥:统计每个类别中视频流量Top10,以Music为例

//直接在需求⑤生成的中间表查询
select videoid,ratings
from video_category
where cate='Music'
order by ratings desc
limit 10;

 

hive清洗数据难点 hive数据清洗步骤_hive_06

需求⑦:统计上传视频最多的用户Top10 以及他们上传的观看次数在前20的视频

理解一:Top用户和他们的播放量Top20

第一步:统计上传视频数Top10
select 
    uploader,videos
from 
    user_orc
order by videos desc
limit 10;

第二步:和video_orc表join查看这些用户都上传了哪些视频和视频观看数
select t1.uploader,v.videoid,
    rank() over(partition by t1.uploader order by v.views desc) hot
from(
    select 
        uploader,videos
    from 
        user_orc
    order by videos desc
    limit 10
    ) t1
left join video_orc v
on t1.uploader = v.uploader;

第三步:根据第二步的结果排序
select t2.uploader,t2.videoid,t2.hot
from(
    select t1.uploader,v.videoid,
        rank() over(partition by t1.uploader order by v.views desc) hot
    from(
        select 
            uploader,videos
        from 
            user_orc
        order by videos desc
        limit 10
        ) t1
    left join video_orc v
    on t1.uploader = v.uploader
) t2
where t2.hot <= 20;

理解二: 播放量Top20哪些是由上传量Top10用户上传的

//上传量Top10用户
select 
    uploader,videos
from 
    user_orc
order by videos desc
limit 10;

//播放量Top20
select 
    videoid,uploader,views
from 
    video_orc
order by views desc
limit 20;

//两表join查看
select t1.uploader,t2.videoid,t2.views
from(
    select 
        uploader,videos
    from 
        user_orc
    order by videos desc
    limit 10 
) t1
left join (
    select 
        videoid,uploader,views
    from 
        video_orc
    order by views desc
    limit 20
) t2
on t1.uploader=t2.uploader;

 需求⑧:统计每个类别视频的观看数Top10

//从video_category表中查videoid,views和按类别分组的播放量排名
select cate,videoid,views,
    rank() over(partition by cate order by views desc) hot
from 
    video_category;

//从①的结果中输出各个类别的播放量Top10
select cate,videoid,views
from(
    select cate,videoid,views,
        rank() over(partition by cate order by views desc) hot
    from 
        video_category
) t1
where hot <= 10;