Preparation
Environment setup
Before starting, you need a working Spark environment and a Python environment; having a Hadoop environment as well is even better.
Environment setup reference links: Spark environment, Hadoop cluster setup
Prerequisite knowledge
- Solid grasp of basic Python syntax and programming practice
- Familiarity with Spark RDD programming
- Basic knowledge of Scala
Problem 1: Analysis of the school-wide class-of-2018 student roster
1. Using RDD programming, find your own row of information.
Write the Spark script:
from pyspark import SparkConf, SparkContext
import sys
cf = SparkConf()                                    # create the SparkConf object
cf.setMaster("local")
cf.setAppName("my-book-tongji")
sc = SparkContext(conf=cf)
rdd1 = sc.textFile("file://" + sys.argv[1])         # use the first script argument as the input path
rdd2 = rdd1.map(lambda x: x.split(","))             # split each line into fields
rdd3 = rdd2.filter(lambda x: ("哇" in x[6]))        # keep the rows whose name field contains the given character, returning a new RDD
names = rdd3.collect()
print("===================================================")
print("Result:")
print(names)
print("===================================================\n")
Run the script with spark-submit:
./spark-submit /home/user/tasks/one.py /caifl/spark/data/stuinfo/
Final output:
2. Group the students by college, count the number of students in each college, and sort by the count.
Write the Spark script:
from pyspark import SparkConf, SparkContext
conf = SparkConf().setMaster('local').setAppName('Grouping')   # create the SparkConf object
sc = SparkContext(conf=conf)                                   # create the SparkContext object
rdd1 = sc.textFile("file:///caifl/spark/data/stuinfo")         # read the data
rdd2 = rdd1.map(lambda x: x.split(","))                        # split each line into fields
rdd3 = rdd2.map(lambda x: [x[2]])                              # extract the college field (the data is already labelled by college)
rdd3.foreach(print)                                            # debug output
rdd4 = rdd3.map(lambda x: (x[0], 1))                           # build (college, 1) pairs
rdd5 = rdd4.reduceByKey(lambda a, b: a + b)                    # aggregate the pairs by key and sum the counts
result = rdd5.sortBy(lambda a: a[1], False).collect()          # sort by count in descending order
print("=====================================================")
print("Result:")
print(result)
print("=====================================================")
sc.stop()
Output:
3. Group by college, then by gender, and compute the male and female student proportions for each college (excluding students who are not currently enrolled).
Note: all students other than those marked "在校" (enrolled) are excluded; only enrolled students are counted.
Write the script:
from pyspark import SparkConf, SparkContext
conf = SparkConf().setMaster('local').setAppName('Grouping')   # create the SparkConf object
sc = SparkContext(conf=conf)                                   # create the SparkContext object
rdd1 = sc.textFile("file:///caifl/spark/data/stuinfo")         # read the data
rdd2 = rdd1.map(lambda x: x.split(","))                        # split each line into fields
rdd_xueyuan = rdd2.groupBy(lambda t: t[2])                     # group the records by college
print("================ college grouping test ===================")
rdd_xueyuan.foreach(print)                                     # debug output of the grouping
list1 = rdd_xueyuan.collect()
print(list1)
# rdd_xueyuan_total = rdd_xueyuan.map(lambda t: (t[0], len(list(t[1]))))
# print("================ per-college head count ===================")
# rdd_xueyuan_total.foreach(print)                             # per-college head count
def foo(it):
    s = ''
    ls = tuple(it)
    ls_nan = []                                    # male students
    ls_nv = []                                     # female students
    for item in ls:
        if item[8] == '在校':                      # keep only enrolled students, drop the rest
            if item[7] == '男':
                ls_nan.append(item)                # add a male student
            elif item[7] == '女':
                ls_nv.append(item)                 # add a female student
    if len(ls_nan) == 0:                           # the divisor must not be 0, otherwise we get an error
        s = "0%"
    else:
        s = str(round(len(ls_nv) / len(ls_nan) * 100, 2)) + "%"   # percentage, rounded to two decimals
    return [len(ls_nan), len(ls_nv), s]            # enrolled male count, female count, and the ratio female/male * 100%
rdd3 = rdd_xueyuan.mapValues(foo)                  # apply foo to each group: male count, female count and female/male * 100% (two decimals)
print("================ Result ================")
list2 = rdd3.collect()
for t in list2:
    print(t)
sc.stop()
Run with the command:
/caifl/spark/spark-2.4.7/bin/spark-submit /home/user/tasks/three.py
Run the script; output:
4. Find the smallest and largest student IDs.
Then analyze and infer the rules by which student IDs are assigned, including but not limited to:
whether ID ranges correspond to majors/classes, whether ID ranges correspond to colleges,
and whether there are unused (idle) ID ranges (a sketch of this exploration follows the script's output below).
Write the script:
from pyspark import SparkConf, SparkContext
cf = SparkConf()                                   # create the SparkConf object
cf.setMaster("local")
cf.setAppName("my-book-tongji")
sc = SparkContext(conf=cf)
rdd1 = sc.textFile("file:///caifl/spark/data/stuinfo")   # read the data
rdd2 = rdd1.map(lambda x: x.split(","))                  # split each line into fields
# =================== Method 1 =====================
rdd_num = rdd2.map(lambda x: x[5])
r3 = rdd_num.reduce(lambda a, b: max(a, b))        # largest student ID
r4 = rdd_num.reduce(lambda a, b: min(a, b))        # smallest student ID
rdd4 = rdd2.filter(lambda x: str(x[5]) == str(r3)) # look up the record with the largest ID
rdd5 = rdd2.filter(lambda x: str(x[5]) == str(r4)) # look up the record with the smallest ID
print("Records:")
t3 = rdd4.collect()
t4 = rdd5.collect()
print("Largest and smallest student IDs:")
print("Largest:", t3)
print("Smallest:", t4)
# =================== Method 2 ======================
# t3 = rdd2.max(lambda x: x[5])                    # record with the largest student ID
# t4 = rdd2.min(lambda x: x[5])                    # record with the smallest student ID
# print("Largest and smallest student IDs:")
# print("Largest:", t3)
# print("Smallest:", t4)
sc.stop()
Output:
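The script above only finds the two extreme student IDs. For the pattern analysis the problem also asks for, one possible starting point is sketched below; it is not part of the original solution, it continues from the rdd2 defined above (so it would have to run before sc.stop()), and it assumes the same column layout (college at index 2, student ID at index 5) and that student IDs are purely numeric.
def id_range(ids):                                 # summarise one college's IDs
    ids = list(ids)
    return min(ids), max(ids), len(set(ids))
pairs = rdd2.map(lambda x: (x[2], int(x[5])))      # (college, numeric student ID)
ranges = pairs.groupByKey().mapValues(id_range)    # per-college min / max / distinct count
for college, (lo, hi, n) in ranges.collect():
    span = hi - lo + 1
    print(college, "min:", lo, "max:", hi, "students:", n, "unused IDs in range:", span - n)
A large "unused IDs in range" value hints at idle ID segments, and comparing the per-college (min, max) intervals shows whether ID ranges correspond to colleges or to classes.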
5. Find the top-5 surnames by number of students and their counts (only enrolled, non-international students; be careful with compound surnames).
The program is as follows:
from pyspark import SparkConf, SparkContext
conf = SparkConf().setMaster('local').setAppName('Grouping')   # create the SparkConf object
sc = SparkContext(conf=conf)                                   # create the SparkContext object
rdd1 = sc.textFile("file:///caifl/spark/data/stuinfo")         # read the data
rdd2 = rdd1.map(lambda x: x.split(","))                        # split each line into fields
# ===================== Method 1 =======================
list1 = rdd2.collect()
# rdd_xueyuan = rdd2.groupBy(lambda t: t[2])                   # group the records by college
# rdd_xueyuan = sc.parallelize(rdd2.collect())
# rdd_xueyuan = rdd2.groupBy(lambda x: x)
lastname = ['欧阳', '上官', '皇甫', '令狐', '诸葛', '司徒', '司马', '申屠', '夏侯', '贺兰', '完颜', '慕容', '尉迟', '长孙']   # common compound surnames
def dotit(it):
    last_name = {}
    for item in it:
        if item[4] != "留学本科生" and item[8] == '在校':      # enrolled, non-international students only
            name = item[6]                     # student name
            length = len(name)                 # name length
            if length == 2:                    # a two-character name cannot have a compound surname
                s1 = name[0]
                if s1 in last_name:            # surname already seen?
                    last_name[s1] = last_name[s1] + 1   # yes: increment its count
                else:
                    last_name[s1] = 1          # no: initialise the count to 1
            elif length == 3:
                s1 = name[0]
                s2 = name[0] + name[1]
                if s2 in lastname:             # are the first two characters a compound surname?
                    if s2 in last_name:
                        last_name[s2] = last_name[s2] + 1
                    else:
                        last_name[s2] = 1
                else:
                    if s1 in last_name:
                        last_name[s1] = last_name[s1] + 1
                    else:
                        last_name[s1] = 1
            elif length >= 4:
                s1 = name[0] + name[1]         # treat the first two characters as the surname
                if s1 in last_name:
                    last_name[s1] = last_name[s1] + 1
                else:
                    last_name[s1] = 1
    a1 = sorted(last_name.items(), key=lambda x: x[1], reverse=True)   # sort by count, descending
    return a1                                  # return the sorted (surname, count) pairs
# rdd3 = rdd_xueyuan.mapValues(dotit)
last_name = dotit(list1)
rdd3 = sc.parallelize(last_name)               # load the result back into an RDD
output = rdd3.collect()
print("============== Result (surnames sorted by count, descending) ============")
print(output)
print("================ Top-5 surnames by count ===============")
for i in range(0, 5):
    print(output[i], end=" ")
print()
Output:
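For comparison, a more distributed sketch of the same top-5 surname count is shown below. It is not the original solution: it reuses rdd2 and the compound-surname list lastname from the script above, assumes the status column at index 8 as in problem 3, and treats the first two characters as the surname only when they appear in lastname.
def surname(row):                                   # extract one student's surname
    name = row[6]
    if len(name) >= 3 and name[:2] in lastname:     # a compound surname takes priority
        return name[:2]
    return name[0]
top5 = (rdd2.filter(lambda x: x[4] != "留学本科生" and x[8] == '在校')   # enrolled, non-international students
            .map(lambda x: (surname(x), 1))
            .reduceByKey(lambda a, b: a + b)                            # count each surname on the executors
            .takeOrdered(5, key=lambda kv: -kv[1]))                     # top 5 by count
print(top5)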
Problem 2: Character statistics for Dream of the Red Chamber
1. Count every Chinese character used in the full text of the novel and its number of occurrences, then sort to obtain the 500 most frequently used characters.
The program is as follows:
import re
from pyspark import SparkConf, SparkContext
from collections import Counter
conf = SparkConf().setMaster('local').setAppName('WordCount')  # create the SparkConf object
sc = SparkContext(conf=conf)                                   # create the SparkContext object
lines = sc.textFile("file:///caifl/spark/data/红楼梦txt")       # read the data
punc = '~`!#$%^&*()_+-=|‘’\'.”;":/.,?><~·!@#¥%……&*()——+-=“:’;、。,?》《{}\u3000, '   # punctuation characters to strip
def clean(w):                                   # clean one line of text
    w = w.replace(' ', '')
    w = w.replace(',', '')
    w = w.replace("'", "")
    w = re.sub('[a-zA-Z]', '', w)               # remove English letters
    w = re.sub(r"[%s]+" % punc, "", w)          # remove punctuation
    return w
def tongji(w):                                  # count character occurrences in one line
    hanzi = {}
    for s in w:
        if s in hanzi:                          # character already in the dict?
            hanzi[s] = hanzi[s] + 1
        else:
            hanzi[s] = 1
    return hanzi
def jian_add(a, b):                             # merge two count dictionaries
    A, B = Counter(a), Counter(b)
    result = dict(A + B)
    return result
# ==================== Method 1 ====================
rdd1 = lines.map(clean)                         # clean the data
rdd2 = rdd1.map(tongji)                         # per-line counts  # alternative: .reduceByKey(lambda x, y: x + y)
list1 = rdd2.reduce(jian_add)                   # merge all per-line dictionaries into one
list2 = sorted(list1.items(), key=lambda x: x[1], reverse=True)    # sort by count, descending
print(list2)
print("========================= The 500 most frequent characters ===================")
for i in range(0, 500):                         # print the top 500 characters
    print(list2[i], end=" ")
Output:
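A more Spark-idiomatic alternative sketch (not the original solution) reuses lines and the clean() function defined above and does the counting with flatMap and reduceByKey instead of merging Python dictionaries on the driver:
top500 = (lines.map(clean)                              # clean each line
               .flatMap(list)                           # one record per character
               .map(lambda ch: (ch, 1))
               .reduceByKey(lambda a, b: a + b)         # count each character
               .takeOrdered(500, key=lambda kv: -kv[1]))  # 500 most frequent characters
print(top500)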
Problem 3: IoT SIM card data/SMS activity analysis
Steps for data analysis with Spark SQL (a minimal sketch of the whole pipeline follows the list):
- 1) Read a text file to create an RDD
- 2) Preprocess the RDD so a DataFrame can be created: split each line into fields and remove the header row
- 3) Convert every RDD element into a Row object
- 4) Create the DataFrame, e.g. df1 = spark.createDataFrame(rdd)  # every element of the input RDD must be a Row object
- 5) Create a temporary view from the DataFrame: df1.createOrReplaceTempView("tb_name??")
- 6) Run SQL queries for the analysis: spark.sql("select … from tb_??? …")
- 7) spark.sql() returns a new DataFrame, which can be kept in a variable: df8 = spark.sql("select …"); df8.printSchema()
- 8) A DataFrame can be converted back to an RDD: rdd_df8 = df8.rdd
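For orientation, a minimal sketch of these eight steps is shown below; the file path, view name and column names (people, name, age) are hypothetical placeholders.
from pyspark.sql import SparkSession, Row
spark = SparkSession.builder.master("local").appName("sql-steps-demo").getOrCreate()
sc = spark.sparkContext
rdd = sc.textFile("file:///tmp/people.csv")                      # 1) read a text file into an RDD
header = rdd.first()
rows = (rdd.filter(lambda line: line != header)                  # 2) drop the header row
           .map(lambda line: line.split(","))                    #    and split each line into fields
           .map(lambda f: Row(name=f[0], age=int(f[1]))))        # 3) convert each record into a Row
df1 = spark.createDataFrame(rows)                                # 4) create the DataFrame
df1.createOrReplaceTempView("people")                            # 5) register a temporary view
df8 = spark.sql("select name, age from people where age > 18")   # 6)/7) spark.sql() returns a new DataFrame
df8.printSchema()
rdd_df8 = df8.rdd                                                # 8) convert back to an RDD
print(rdd_df8.collect())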
Sub-problems
1. For each EC, among the cards belonging to that EC, compute the proportion with data usage vs. without data usage (the proportion of active cards). (Student IDs ending in 1, 2, 3)
2. For each EC, among the cards belonging to that EC, compute the proportion with SMS vs. without SMS (the proportion of active cards). (Student IDs ending in 4, 5, 6)
3. For each province, compute the activity proportion of cards with data usage vs. without data usage. (Student IDs ending in 7, 8)
4. For each province, compute the activity proportion of cards with SMS vs. without SMS. (Student IDs ending in 9, 0)
Problem analysis:
Sub-problems 1 to 4 are very similar, so working through one of them gives the approach for the others.
A solution to sub-problem 4 is given below.
Sub-problem 4:
Table analysis
Data relationship diagram:
card_info: card definition table; each OID corresponds to exactly one card and is unique.
prod_sublist: product subscription table; one OID together with one prod_pkg and one prod_id determines one prod_instid; prod_instid is unique within this table and identifies a card's subscription to a product.
provice_def: province information table.
ec_info: EC (enterprise customer) information table.
prod_info: product information table; a card with SMS capability must have subscribed to both I90011 and I90012, and a card with data capability must have subscribed to I90032 plus one of the remaining two (I90036, I90037).
flow_daily: daily data-usage activity table; a card that appears here necessarily has data capability.
sms_daily: daily SMS activity table; a card that appears here necessarily has SMS capability.
Activity means: a card that consumed data within a given statistics period is active; a card with no consumption in that period is inactive.
What we are asked for is the activity proportion: within each group (by EC ID or by province), the total number of cards is the denominator and the number of active cards is the numerator; compute the percentage.
Write the PySpark script:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import Row
conf = SparkConf().setMaster('local').setAppName('SQLName')
sc = SparkContext(conf=conf)
spark = SparkSession(sc)
'''Load the files'''
card_info = sc.textFile("file:///caifl/spark/data/iot_card/card_info.csv")
province_def = sc.textFile("file:///caifl/spark/data/iot_card/provice_def.csv")
sms_daily = sc.textFile("file:///caifl/spark/data/iot_card/sms_daily.csv")
prod_sublist = sc.textFile("file:///caifl/spark/data/iot_card/prod_sublist.csv")
ec_info = sc.textFile("file:///caifl/spark/data/iot_card/ec_info.csv")
prod_info = sc.textFile("file:///caifl/spark/data/iot_card/prod_info.csv")
# Split every line into fields
rdd_card_info = card_info.map(lambda x: x.split(","))
rdd_province_def = province_def.map(lambda x: x.split(","))
rdd_sms_daily = sms_daily.map(lambda x: x.split(","))
rdd_prod_sublist = prod_sublist.map(lambda x: x.split(","))
rdd_ec_info = ec_info.map(lambda x: x.split(","))
rdd_prod_info = prod_info.map(lambda x: x.split(","))
# Remove the header row of each file (it contains the column names)
rdd1_card_info = rdd_card_info.filter(lambda x: "status" not in x)
rdd1_province_def = rdd_province_def.filter(lambda x: "province_name" not in x)
rdd1_sms_daily = rdd_sms_daily.filter(lambda x: "sms_used" not in x)
rdd1_prod_sublist = rdd_prod_sublist.filter(lambda x: "status" not in x)
rdd1_ec_info = rdd_ec_info.filter(lambda x: "ecname" not in x)
rdd1_prod_info = rdd_prod_info.filter(lambda x: "prod_pkg" not in x)
# print("=============== debug output ==============")
# for i in rdd1_province_def.collect():
#     print(i)
# Convert every record into a Row object
rdd2_card_info = rdd1_card_info.map(
    lambda x: Row(OID=x[0], province=x[1], msisdn=x[2], active_day=x[3], active_time=x[4], status=int(x[5]), ecid=x[6]))
rdd2_sms_daily = rdd1_sms_daily.map(lambda x: Row(prod_instid=x[0], total=x[1], sms_used=x[2], stats_day=x[3]))
rdd2_prod_sublist = rdd1_prod_sublist.map(lambda x: Row(OID=x[0], prod_pkg=x[1], prod_id=x[2], prod_instid=x[3],
                                                        APN=x[4], status=int(x[5]), sub_day=x[6], sub_time=x[7]))
rdd2_province_def = rdd1_province_def.map(lambda x: Row(province_id=x[0], province_name=x[1]))
rdd2_ec_info = rdd1_ec_info.map(lambda x: Row(ecid=x[0], ecname=x[1], level=x[2]))
rdd2_prod_info = rdd1_prod_info.map(lambda x: Row(prod_pkg=x[0], prod_id=x[1], prod_name=x[2]))
print("=============== debug output ==============")
print(rdd_prod_sublist.first())
'''Convert to DataFrames'''
rdd3_sms_daily = spark.createDataFrame(rdd2_sms_daily)
rdd3_card_info = spark.createDataFrame(rdd2_card_info)
rdd3_prod_sublist = spark.createDataFrame(rdd2_prod_sublist)
rdd3_province_def = spark.createDataFrame(rdd2_province_def)
rdd3_ec_info = spark.createDataFrame(rdd2_ec_info)
rdd3_prod_info = spark.createDataFrame(rdd2_prod_info)
'''Print schema information'''
print("============ schema test ==========")
rdd3_prod_sublist.printSchema()
# Register temporary views
rdd3_card_info.createTempView('card_info')
rdd3_prod_sublist.createTempView('prod_sublist')
rdd3_sms_daily.createTempView('sms_daily')
rdd3_province_def.createTempView('province_def')
rdd3_ec_info.createTempView('ec_info')
rdd3_prod_info.createTempView('prod_info')
'''Tests'''
rdd4 = spark.sql("select * from card_info where province=100")
rdd4 = rdd4.rdd
r5 = rdd4.collect()
print("=============== query result ==========")
for i in r5:
    print(i)
rdd4_1 = spark.sql("select * from prod_info")
r4_1 = rdd4_1.rdd
r5_1 = r4_1.collect()
print("=============== prod_info query result ==========")
for i in r5_1:
    print(i)
'''Analysis queries'''
# Find the distinct OIDs that have SMS and are active
rdd4_sms_active_oid = spark.sql("select distinct OID from prod_sublist where prod_instid in (select distinct prod_instid from sms_daily)")
rdd4_sms_active_oid = rdd4_sms_active_oid.rdd
# Register the active OIDs as a table
rdd4_sms_active_oid_temp = rdd4_sms_active_oid.map(lambda x: Row(OID=x[0]))
active_oid = spark.createDataFrame(rdd4_sms_active_oid_temp)
active_oid.createTempView("active_oid")
# Output
t1 = rdd4_sms_active_oid.collect()
# print("=============== active OIDs with SMS: ==========")
# print(t1)
# print("============================== number of active OIDs with SMS:", len(t1), "==============================")
# Find all OIDs that have SMS (subscribed to both I90011 and I90012)
rdd4_sms_oid = spark.sql("select distinct OID from prod_sublist where OID in (select OID from prod_sublist where prod_id='I90011')"
                         " and OID in (select OID from prod_sublist where prod_id='I90012')")
rdd4_sms_oid = rdd4_sms_oid.rdd
# Register the OIDs with SMS as a table
rdd4_sms_oid_temp = rdd4_sms_oid.map(lambda x: Row(OID=x[0]))
sms_oid = spark.createDataFrame(rdd4_sms_oid_temp)
sms_oid.createTempView("sms_oid")
t2 = rdd4_sms_oid.collect()
print("=============== OIDs with SMS: ==========")
print(t2)
print("============================== number of active OIDs with SMS:", len(t1), "==============================")
print("============================== number of OIDs with SMS:", len(t2), "==============================")
# Count the active OIDs and all OIDs in one query (this statement is still broken: IN is used incorrectly here)
# select count(OID in (select OID from active_oid) or NULL) as active, count(OID in (select OID from sms_oid) or NULL) as sms, province from card_info group by province
# Number of active OIDs with SMS per province
province_active_oid = spark.sql("select count(*), province from card_info where OID in (select OID from active_oid) group by province")
province_active_oid = province_active_oid.rdd
# province_active_oid.sortBy(lambda x: x[1], False)
t3 = province_active_oid.collect()
print("=============== active OIDs with SMS per province: ==========")
print(t3)
print("============================== number of provinces with active SMS OIDs:", len(t3), "==============================")
# Remaining step (not completed here): count the OIDs with SMS per province
Output:
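The script above stops before the ratio itself is computed. The sketch below is one hedged way to finish the per-province percentage using the card_info, province_def, sms_oid and active_oid views created above; it follows the definition given earlier (active cards over total cards in the group) and uses LEFT JOINs instead of the IN-inside-COUNT construct noted as broken in the comments.
# Hedged sketch (not in the original script): per-province SMS activity ratio.
ratio_df = spark.sql("""
    select p.province_name,
           count(c.OID)                                as total_cards,
           count(s.OID)                                as sms_cards,
           count(a.OID)                                as active_cards,
           round(count(a.OID) / count(c.OID) * 100, 2) as active_pct
    from card_info c
    join province_def p    on c.province = p.province_id
    left join sms_oid s    on c.OID = s.OID
    left join active_oid a on c.OID = a.OID
    group by p.province_name
""")
ratio_df.show()
Because sms_oid and active_oid each contain distinct OIDs, the left joins add at most one matching row per card, so count(s.OID) and count(a.OID) count exactly the cards with SMS and the active cards in each province.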
Problem 4: Statistical analysis of user behavior logs of an e-commerce platform
Requirements:
The top-10 categories by click count are computed over the actions of all users and all sessions. Since these are user/session click actions, a user may click the same category repeatedly, so we then find which users clicked each of those top-10 categories the most times.
The final result should contain 10 * 10 = 100 rows.
Fields such as 2021-06-10 22:00:45 are strings that must be converted to a date/time type, which requires importing two functions:
from pyspark.sql.functions import to_date, to_timestamp
These two functions can only be used on DataFrames or inside SQL statements.
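As a quick illustration, a minimal hedged sketch of to_timestamp is shown below; the DataFrame, column name and sample value are hypothetical.
from pyspark.sql import SparkSession
from pyspark.sql.functions import to_timestamp
spark = SparkSession.builder.master("local").appName("ts-demo").getOrCreate()
df = spark.createDataFrame([("2021-06-10 22:00:45",)], ["action_time"])
df = df.withColumn("action_ts", to_timestamp("action_time"))      # string -> timestamp
df.printSchema()
df.createOrReplaceTempView("t")                                    # the same function also works inside SQL
spark.sql("select action_time, cast(to_timestamp(action_time) as long) as epoch_seconds from t").show()
Casting the timestamp to long yields seconds since the epoch, which is exactly how the session duration is computed in the SQL that follows.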
1) Compute the duration of each user session and find the user IDs of the top-10 longest sessions.
Write the script:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import Row
conf = SparkConf().setMaster('local').setAppName('Session')
sc = SparkContext(conf=conf)
spark = SparkSession(sc)
sc.setLogLevel("ERROR")
info_lines = sc.textFile("file:///caifl/spark/data/info.txt")
rdd1 = info_lines.map(lambda x: x.split(","))
print(rdd1.collect())
rdd2 = rdd1.map(lambda x: Row(date=x[0], user_id=x[1], session_id=x[2], page_id=x[3], action_time=x[4], search_keyword=x[5],
                              click_category_id=x[6], click_product_id=x[7], order_category_ids=x[8], order_product_ids=x[9],
                              pay_category_ids=x[10], pay_product_ids=x[11]))
rdd3 = spark.createDataFrame(rdd2)
# Register the temporary view
rdd3.printSchema()
rdd3.createTempView("session")
'''
select * from (
    select D.user_id, C.session_id, C.timeLength
    from (select A.session_id, cast(A.maxTime as long) - cast(A.minTime as long) as timeLength
          from (select session_id, max(to_timestamp(action_time)) as maxTime, min(to_timestamp(action_time)) as minTime
                from session group by session_id) as A) as C,
         (select distinct user_id, session_id from session) as D
    where C.session_id = D.session_id
    order by C.timeLength DESC
) limit 10
'''
rdd4 = spark.sql("select D.user_id, C.session_id, C.timeLength from (select A.session_id, cast(A.maxTime as long) - cast(A.minTime as long) as timeLength from (select session_id, max(to_timestamp(action_time)) as maxTime, min(to_timestamp(action_time)) as minTime from session group by session_id) as A) as C, (select distinct user_id,session_id from session) as D where C.session_id = D.session_id order by C.timeLength DESC")
rdd4 = rdd4.rdd
rows = rdd4.collect()
print("============= Query result ============")
for i in range(0, 10):
    print("user ID:", rows[i]['user_id'], "session ID:", rows[i]['session_id'], "session duration:", rows[i]['timeLength'])
SQL design notes:
'''
date: the date on which this user action took place
user_id: which user performed this action
session_id: uniquely identifies one visit session of a user
page_id: the id of the page the user entered after clicking some product/category or searching a keyword
action_time: the point in time at which this action happened
search_keyword: if the action was a search (e.g. searching a keyword on the site/app and jumping to a product list page), the keyword that was searched
click_category_id: a category clicked, for example on the site home page (food, electronics, computers)
click_product_id: a product clicked, on the home page or on a product list page (e.g. a 3-person hotpot set at a XiabuXiabu branch, an iPhone 6s)
order_category_ids: the categories involved in one order (several products may be added to the cart and ordered at once); e.g. 6 products may map to only 2 categories, such as 3 sausages (food) and 3 batteries (daily goods)
order_product_ids: the specific products in an order
pay_category_ids: the categories covered by one payment action for one or several orders
pay_product_ids: the specific products covered by a payment action
'''
# cast converts a value to the specified type
# to_timestamp() converts a date string to a timestamp
# max() finds the maximum value
# min() finds the minimum value
# SQL approach
# Find each session's maximum and minimum time (i.e. its latest and earliest action) as timestamps
select session_id, max(to_timestamp(action_time)) as maxTime, min(to_timestamp(action_time)) as minTime from session group by session_id
# Use the query above as a temporary table A
# cast(A.maxTime as long) converts the timestamp into a long (seconds since the epoch)
# timeLength is the difference between each session's maximum and minimum time as long values, i.e. the session's duration
select A.session_id, cast(A.maxTime as long) - cast(A.minTime as long) as timeLength
from (select session_id, max(to_timestamp(action_time)) as maxTime, min(to_timestamp(action_time)) as minTime from session group by session_id) as A
# Use the query above as a temporary table C
(select A.session_id, cast(A.maxTime as long) - cast(A.minTime as long) as timeLength
from (select session_id, max(to_timestamp(action_time)) as maxTime, min(to_timestamp(action_time)) as minTime from session group by session_id) as A) as C
# Deduplicate the (user_id, session_id) pairs
select distinct user_id, session_id from session
# Use the query above as a temporary table D
(select distinct user_id, session_id from session) as D
# Tables C and D share the session_id column, so they can be joined on it
# Combine everything into one complete SQL statement, sorting with order by timeLength DESC
# TOP is not supported, so the script prints the first 10 rows from the RDD; the rows are sorted by session duration in descending order
select * from (select D.user_id, C.session_id, C.timeLength from (select A.session_id, cast(A.maxTime as long) - cast(A.minTime as long) as timeLength from (select session_id, max(to_timestamp(action_time)) as maxTime, min(to_timestamp(action_time)) as minTime from session group by session_id) as A) as C, (select distinct user_id,session_id from session) as D where C.session_id = D.session_id order by C.timeLength DESC) limit 10
Output screenshot:
2) Get the top-10 categories by click count, and for each of those top-10 categories the top-10 users by click count (a session/user may click the same category several times).
Write the script:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import Row
conf = SparkConf().setMaster('local').setAppName('Session')
sc = SparkContext(conf=conf)
spark = SparkSession(sc)
sc.setLogLevel("ERROR")
info_lines = sc.textFile("file:///caifl/spark/data/info.txt")
rdd1 = info_lines.map(lambda x: x.split(","))                  # split each line into fields
print(rdd1.collect())
rdd2 = rdd1.filter(lambda x: x[5] == 'null')                   # keep only click actions (no search keyword)
rdd2 = rdd2.filter(lambda x: x[6] != 'null' or x[7] != 'null') # keep only click actions (a category or a product was clicked)
rdd3 = rdd2.map(
    lambda x: Row(date=x[0], user_id=x[1], session_id=x[2], page_id=x[3], action_time=x[4], search_keyword=x[5],
                  click_category_id=x[6], click_product_id=x[7], order_category_ids=x[8], order_product_ids=x[9],
                  pay_category_ids=x[10], pay_product_ids=x[11]))
rdd4 = spark.createDataFrame(rdd3)                             # create the DataFrame
# Register the temporary view
rdd4.printSchema()
rdd4.createTempView("user_action")
rdd5 = spark.sql(
    "select page_id, num, user_id, num_all, user_num from (select page_id, num, user_id, num_all, ROW_NUMBER() over(PARTITION by page_id ORDER BY num_all desc) as user_num from (select B.page_id as page_id, B.num as num, user_id as user_id, A.num_all as num_all from (select page_id, count(page_id) as num from user_action group by page_id order by num desc ) as B,(select page_id ,user_id,COUNT(*) as num_all from user_action group by page_id ,user_id ) as A where B.page_id=A.page_id order by B.num DESC, A.num_all DESC) as C) as D where user_num<11 ORDER BY num desc")
rows = rdd5.collect()
print("============================= Final result ==============================")
for item in rows:
    print("page_id:", item['page_id'], " total clicks:", item['num'], " user_id:", item['user_id'], " clicks by this user on this page_id:", item['num_all'],
          " user rank:", item['user_num'])
SQL notes:
# SQL approach
# Count how many times each page_id appears, sorted in descending order
select page_id, count(page_id) as num from user_action group by page_id order by num desc
# Count occurrences of each (page_id, user_id) pair, i.e. how many times each user clicked the same page_id
select page_id, user_id, COUNT(*) as num_all from user_action group by page_id, user_id
-- Rank the users and pull the counts computed above into one table
select page_id, num, user_id, num_all, ROW_NUMBER() over(PARTITION by page_id ORDER BY num_all desc) as user_num from (select B.page_id as page_id, B.num as num, user_id as user_id, A.num_all as num_all from (select page_id, count(page_id) as num from user_action group by page_id order by num desc ) as B,(select page_id ,user_id,COUNT(*) as num_all from user_action group by page_id ,user_id ) as A where B.page_id=A.page_id order by B.num DESC, A.num_all DESC) as C
-- Wrap the result in an outer query and sort by each page_id's total click count
-- ROW_NUMBER() over(PARTITION by page_id ORDER BY num_all desc) as user_num ranks the users within each page_id by num_all (the user's click count on that page_id)
-- Finally the outer query sorts by num (the page_id's total click count) in descending order
Final SQL:
select page_id, num, user_id, num_all, user_num from (select page_id, num, user_id, num_all, ROW_NUMBER() over(PARTITION by page_id ORDER BY num_all desc) as user_num from (select B.page_id as page_id, B.num as num, user_id as user_id, A.num_all as num_all from (select page_id, count(page_id) as num from user_action group by page_id order by num desc ) as B,(select page_id ,user_id,COUNT(*) as num_all from user_action group by page_id ,user_id ) as A where B.page_id=A.page_id order by B.num DESC, A.num_all DESC) as C) as D where user_num<11 ORDER BY num desc
Output: