文章目录
- 0.前言
- 1.百分比堆叠柱状图
- 1.1 导入包,连接数据库
- 1.2 查看数据
- 1.3 数据预处理
- 1.4 生成图表
- 2.复合柱状图和折线图
- 2.1 数据预处理
- 2.2 生成图表
- 3.竖直缩放条
- 3.1 数据预处理
- 3.2 生成图表
- 4.标记点
- 5.根据数据范围划分颜色
- 6.渐变圆柱
- 7.内外缩放
- 8.随时间动态变化
- 8.1 数据预处理
- 8.2 生成图表
0.前言
- 该分析系列使用爬取到的网易云歌单数据,对所学的Excel、SQL、Pandas、Pyecharts等数据分析及数据可视化工具进行巩固。
1.百分比堆叠柱状图
1.1 导入包,连接数据库
import numpy as np
import pandas as pd
import pymysql
from pyecharts import options as opts
from pyecharts.charts import Bar, Line
from pyecharts.commons.utils import JsCode
from pyecharts.globals import ThemeType
# Open a MySQL connection to the local `cloudmusic` database.
# NOTE(review): the credentials are hard-coded here; consider reading them
# from the environment or a config file instead.
conn = pymysql.connect(
    host='127.0.0.1',
    user='root',
    password='123',
    database='cloudmusic',
    charset='utf8',
)
try:
    # Load the whole `playlist` table into a DataFrame.
    df = pd.read_sql("select * from playlist", con=conn)
finally:
    # No later visible code queries the database, so release the connection
    # even when the read fails instead of leaving it open.
    conn.close()
1.2 查看数据
# Preview the first five rows of the loaded table.
df.head(5)
# (row count, column count) of the loaded table.
df.shape
共有51203行数据,16个字段
1.3 数据预处理
# Group by playlist type and average the four engagement metrics, rounded to
# two decimals. The columns are selected with a list: tuple-style selection on
# a groupby (`groupby(...)['a', 'b']`) is deprecated and rejected by newer
# pandas releases.
type_group = (
    df.groupby("type")[['play_count', 'subscribed_count', 'share_count', 'comment_count']]
    .mean()
    .round(2)
)
# Order the types by average share count (descending) and keep the first 10.
sorted_type_group = type_group.sort_values(by='share_count', ascending=False)[:10]
sorted_type_group
统计总数
# Column totals over the selected types; they are the denominators of the
# percentages computed below.
share_count_sum = sorted_type_group["share_count"].sum()
comment_count_sum = sorted_type_group["comment_count"].sum()
print(share_count_sum, comment_count_sum)

# Category labels for the x axis.
xdata = sorted_type_group.index.tolist()

# Each data point carries its raw value plus its share of the column total
# (in percent, two decimals).
ydata1 = []
for share_value in sorted_type_group['share_count'].tolist():
    share_percent = round((share_value / share_count_sum) * 100, 2)
    ydata1.append({"value": share_value, "percent": share_percent})

ydata2 = []
for comment_value in sorted_type_group['comment_count'].tolist():
    comment_percent = round((comment_value / comment_count_sum) * 100, 2)
    ydata2.append({"value": comment_value, "percent": comment_percent})

ydata1
1.4 生成图表
# Stacked bar chart: share counts and comment counts of the selected types
# are drawn on the same stack ("stack1").
bar1 = (
    Bar(init_opts=opts.InitOpts(width="1000px"))
    .add_xaxis(xdata)
    .add_yaxis("转发量", ydata1, stack="stack1", category_gap="50%", color="#009db2")
    .add_yaxis("评论数", ydata2, stack="stack1", category_gap="50%", color="#f47a75")
    .set_global_opts(
        # Title
        title_opts=opts.TitleOpts(
            title="各类型歌单的转发量和评论数",
            subtitle="列举了前十名",
        ),
        # X axis
        xaxis_opts=opts.AxisOpts(
            name="类型",
        ),
        # Y axis; the name gap is a number, so pass an int rather than a string.
        yaxis_opts=opts.AxisOpts(
            name="数量",
            name_location="center",
            name_gap=40,
        ),
        # Brush (area selection) component
        brush_opts=opts.BrushOpts(),
        # Data-zoom component
        datazoom_opts=opts.DataZoomOpts(),
        # Toolbox component
        toolbox_opts=opts.ToolboxOpts(),
    )
    .set_series_opts(
        # Label each bar with the `percent` field stored on its data point;
        # toFixed() without arguments rounds it to a whole number.
        label_opts=opts.LabelOpts(
            position="right",
            formatter=JsCode(
                "function(x){return Number(x.data.percent).toFixed() + '%';}"
            ),
        )
    )
)
bar1.render_notebook()
图表特色:
- X轴缩放
- 可框选区域查看数据
- 可转为折线图和并排柱状图
2.复合柱状图和折线图
2.1 数据预处理
# Preview the first ten rows of the per-type averages (existing row order).
type_group.head(10)
# Keep those same ten rows for the charts of this section.
type_group10 = type_group[:10]
2.2 生成图表
# Bar part of the combined chart: average share and comment counts on the
# primary y axis, plus a secondary y axis (index 1) reserved for the
# subscription line series.
bar2 = (
    Bar(init_opts=opts.InitOpts(width="1000px"))
    .add_xaxis(xaxis_data=type_group10.index.tolist())
    .add_yaxis(
        series_name="平均转发量",
        yaxis_data=type_group10['share_count'].tolist(),
        color="#009db2",
        label_opts=opts.LabelOpts(is_show=False),
    )
    .add_yaxis(
        series_name="平均评论数",
        yaxis_data=type_group10['comment_count'].tolist(),
        color="#f47a75",
        label_opts=opts.LabelOpts(is_show=False),
    )
    .extend_axis(
        yaxis=opts.AxisOpts(
            name="收藏量",
            type_="value",
            # Show the secondary axis ticks in thousands, e.g. 2000 -> "2K".
            axislabel_opts=opts.LabelOpts(
                formatter=JsCode(
                    "function(y){return Number(y / 1000) + 'K';}"
                ),
            ),
            axisline_opts=opts.AxisLineOpts(linestyle_opts=opts.LineStyleOpts(color="#e75840")),
        )
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(
            title="复合柱状图与折线图",
        ),
        tooltip_opts=opts.TooltipOpts(
            is_show=True,
            trigger="axis",
            axis_pointer_type="cross",
        ),
        xaxis_opts=opts.AxisOpts(
            type_="category",
            axislabel_opts={'interval': '0'},
            axispointer_opts=opts.AxisPointerOpts(is_show=True, type_="shadow"),
        ),
        yaxis_opts=opts.AxisOpts(
            name="转发量与评论数",
            type_="value",
            min_=0,
            max_=400,
            interval=100,
            axisline_opts=opts.AxisLineOpts(linestyle_opts=opts.LineStyleOpts(color="#024b51")),
            # Tick options belong in `axistick_opts`; the original passed the
            # AxisTickOpts object to `axislabel_opts` by mistake.
            axistick_opts=opts.AxisTickOpts(is_show=True),
            splitline_opts=opts.SplitLineOpts(is_show=True),
        ),
    )
)
# Line part of the combined chart: average subscription count, bound to the
# secondary y axis (index 1) that bar2 declares through extend_axis().
line2 = (
    Line()
    .add_xaxis(xaxis_data=type_group10.index.tolist())
    .add_yaxis(
        series_name="平均收藏量",
        yaxis_index=1,
        y_axis=type_group10['subscribed_count'].tolist(),
        linestyle_opts=opts.LineStyleOpts(color="#e75840", width=1),
        z=10,
        label_opts=opts.LabelOpts(color="#e75840", is_show=True),
    )
)
# The line refers to an axis that only exists on bar2, so it has to be
# overlapped onto bar2 to be drawn; render the combined chart like bar1 is.
bar2.overlap(line2).render_notebook()
图表特色:
- 同时展示柱状图和折线图
- 增加双轴,且轴颜色不同
- 鼠标在图表区内移动时,X、Y轴上有定位,鼠标放置点有信息显示
3.竖直缩放条
3.1 数据预处理
# First 30 rows of the per-type averages with every column cast to int.
type_group30 = type_group[:30].astype(int)
type_group30.head()
3.2 生成图表
# Bar chart of the average share count with a vertically oriented data-zoom
# control.
bar3 = Bar()
bar3.add_xaxis(xaxis_data=type_group30.index.tolist())
bar3.add_yaxis(
    series_name="平均转发量",
    yaxis_data=type_group30['share_count'].tolist(),
    color="#009db2",
)
bar3.set_global_opts(
    title_opts=opts.TitleOpts(title="垂直数据缩放条"),
    datazoom_opts=opts.DataZoomOpts(orient="vertical"),
)
图表特色:
- 可拉动垂直缩放条选取数据范围
4.标记点
# Bar chart of the average share count with mark points on the maximum,
# minimum and average values; per-bar labels are hidden.
bar4 = Bar()
bar4.add_xaxis(xaxis_data=type_group30.index.tolist())
bar4.add_yaxis(
    series_name="平均转发量",
    yaxis_data=type_group30['share_count'].tolist(),
    color="#009db2",
)
bar4.set_global_opts(title_opts=opts.TitleOpts(title="最大最小平均值标记"))
bar4.set_series_opts(
    label_opts=opts.LabelOpts(is_show=False),
    markpoint_opts=opts.MarkPointOpts(
        data=[
            opts.MarkPointItem(type_=mark_kind, name=mark_name)
            for mark_kind, mark_name in (
                ("max", "最大值"),
                ("min", "最小值"),
                ("average", "平均值"),
            )
        ]
    ),
)
图表特色:
- 标记了最大值、最小值和平均值的位置
5.根据数据范围划分颜色
# JavaScript callback that picks a bar colour from its value:
#   value <= 100        -> green  (low)
#   100 < value <= 200  -> orange (medium)
#   value > 200         -> red    (high)
# The low band has no lower bound, so a value of 0 is coloured as "low"
# instead of falling through to the "high" colour.
color_function = """
function (params) {
    if (params.value <= 100) {
        return '#71c16f';
    } else if (params.value <= 200) {
        return '#f7af59';
    }
    return '#f06464';
}
"""
# Bar chart of the average share count, coloured per bar by the callback.
bar5 = (
    Bar()
    .add_xaxis(xaxis_data=type_group30.index.tolist())
    .add_yaxis(
        series_name="平均转发量",
        yaxis_data=type_group30['share_count'].tolist(),
        itemstyle_opts=opts.ItemStyleOpts(color=JsCode(color_function)),
    )
)
图表特色:
- 根据数值的高中低显示不同的柱子颜色
6.渐变圆柱
# Bar chart of the average share count whose bars get a top-to-bottom linear
# gradient fill and a border radius of 30 on every corner.
bar6 = Bar()
bar6.add_xaxis(xaxis_data=type_group10.index.tolist())
bar6.add_yaxis("平均转发量", yaxis_data=type_group10["share_count"].tolist(), category_gap="50%")
bar6.set_global_opts(
    title_opts=opts.TitleOpts(title="渐变圆柱"),
    xaxis_opts=opts.AxisOpts(
        type_="category",
        axislabel_opts={'interval': '0'},
    ),
)
bar6.set_series_opts(
    itemstyle_opts={
        "normal": {
            # The gradient is built on the JavaScript side by ECharts.
            "color": JsCode(
                """new echarts.graphic.LinearGradient(0,0,0,1,
[{offset: 0, color: '#0780cf'},
{offset: 1, color: '#47aee3'}
],
false)"""
            ),
            "barBorderRadius": [30, 30, 30, 30],
        }
    }
)
图表特色:
- 柱子为圆角柱子
- 颜色渐变
7.内外缩放
# Bar chart of the average share count with a delayed "elasticOut" animation
# and two data-zoom controls: the default one and one of type "inside".
bar7_animation = opts.AnimationOpts(animation_delay=1000, animation_easing="elasticOut")
bar7 = Bar(init_opts=opts.InitOpts(animation_opts=bar7_animation))
bar7.add_xaxis(xaxis_data=type_group30.index.tolist())
bar7.add_yaxis("平均转发量", yaxis_data=type_group30['share_count'].tolist(), color="#009db2")
bar7.set_global_opts(
    title_opts=opts.TitleOpts(title="内外缩放"),
    datazoom_opts=[opts.DataZoomOpts(), opts.DataZoomOpts(type_="inside")],
)
图表特色:
- 在内部滑动鼠标滚轮可以缩放数据
- 在外部拖动滑动条也可以缩放数据
8.随时间动态变化
8.1 数据预处理
(1)替换省份字段信息
根据年份和省份对数据进行分组,并规范省份的名称
import re
def replace_str(x):
    """Strip administrative suffixes and ethnic qualifiers from a province name.

    Every occurrence of the fragments below is removed, so that e.g.
    '广东省' becomes '广东' and '广西壮族自治区' becomes '广西'.

    Args:
        x: Province name. Non-string values (e.g. None or NaN) are returned
            unchanged instead of raising.

    Returns:
        The name with all listed fragments removed.
    """
    if not isinstance(x, str):
        return x
    # Longer fragments come before their prefixes: removing '维吾尔' before
    # '维吾尔族' would leave a stray '族' behind.
    rep_list = ['特别行政区', '维吾尔族', '维吾尔', '自治区', '壮族', '回族', '省', '市']
    for rep in rep_list:
        # The fragments are plain text, so str.replace is enough.
        x = x.replace(rep, '')
    return x
# Sum the numeric metrics per (first four characters of create_time,
# normalized province name).
# NOTE(review): assumes create_time is a string that starts with the year - confirm.
# numeric_only=True keeps text columns out of the aggregation; newer pandas
# no longer drops them silently, and the later steps expect only the numeric
# metric columns next to the two group keys.
time_df = df.groupby(
    [df['create_time'].str[:4], df['province'].apply(replace_str)]
).sum(numeric_only=True)
time_df
(2)重建索引
# Turn the (create_time, province) group index back into ordinary columns.
re_time_df = time_df.reset_index()
re_time_df
(3)获取所有省份
# Distinct province names, in order of first appearance.
province = re_time_df['province'].unique().tolist()
province
对各年度的省份数据进行计数,发现2013年和2014年有缺失数据
# Number of rows (i.e. provinces present) for each year.
re_time_df['create_time'].value_counts()
(4)处理缺失数据
def add_province(df_data, all_provinces=None):
    """Add an all-zero row for every (year, province) pair that is missing.

    Args:
        df_data: DataFrame with a 'create_time' (year) column, a 'province'
            column and any number of value columns.
        all_provinces: Province names every year must contain. Defaults to
            the distinct provinces found in ``df_data`` itself.

    Returns:
        ``df_data`` itself when nothing is missing; otherwise a new DataFrame
        made of the original rows followed by the filler rows (index reset).
        In a filler row every column except 'create_time' and 'province' is 0.
    """
    if all_provinces is None:
        all_provinces = df_data['province'].drop_duplicates().tolist()

    # All years, in order of first appearance.
    years = df_data['create_time'].drop_duplicates().tolist()

    filler_rows = []
    for year in years:
        # Provinces that already have a row for this year.
        present = set(df_data.loc[df_data['create_time'] == year, 'province'])
        for name in all_provinces:
            if name in present:
                continue
            # Zero in every column, then fill in the two key columns; this
            # works for any number and order of value columns.
            row = dict.fromkeys(df_data.columns, 0)
            row['create_time'] = year
            row['province'] = name
            filler_rows.append(row)

    if not filler_rows:
        return df_data
    # Concatenate once instead of once per year.
    filler_df = pd.DataFrame(filler_rows, columns=df_data.columns)
    return pd.concat([df_data, filler_df], ignore_index=True)
# Fill the missing (year, province) rows in one call.
re_time_df2 = add_province(re_time_df)
re_time_df2
已填充缺失数据
我们也可以分步来做这个过程
先处理2013年
# Provinces that already have a row for 2013.
new_province2013 = (
    re_time_df.loc[re_time_df['create_time'] == '2013', 'province']
    .drop_duplicates()
    .tolist()
)
# Provinces without a 2013 row.
rest_province2013 = [name for name in province if name not in new_province2013]
rest_province2013
# One all-zero row per missing province, using the same columns as re_time_df.
rest_df2013 = pd.DataFrame(
    [['2013', name] + [0] * 4 for name in rest_province2013],
    columns=re_time_df.columns,
)
rest_df2013
# Append the filler rows and renumber the index.
re_time_df1 = pd.concat([re_time_df, rest_df2013], ignore_index=True)
再处理2014年
# Provinces that already have a row for 2014. The frame being filtered and
# the boolean mask must come from the same DataFrame (re_time_df1); mixing
# re_time_df with a mask built from re_time_df1 only works through index
# alignment.
new_province2014 = re_time_df1.loc[re_time_df1['create_time']=='2014',:]['province'].drop_duplicates().tolist()
# Provinces without a 2014 row.
rest_province2014 = [x for x in province if x not in new_province2014]
# One all-zero row per missing province.
rest_df2014 = pd.DataFrame([['2014',x,0,0,0,0] for x in rest_province2014], columns=re_time_df.columns)
rest_df2014
# Append the 2014 filler rows to the frame that already holds the 2013 ones.
re_time_df2 = pd.concat([re_time_df1, rest_df2014], ignore_index=True)
(5)按年份和省份排序并重置索引,得到最终数据
# Sort by year, then province, and renumber the rows from 0.
final_time_df = re_time_df2.sort_values(by=['create_time', 'province'])
final_time_df = final_time_df.reset_index(drop=True)
(6)提取图表所需数据
省份
# Distinct provinces, in the order they appear in the sorted frame.
final_province = final_time_df['province'].unique().tolist()
年份
# Distinct years, in the order they appear in the sorted frame.
final_year = final_time_df['create_time'].unique().tolist()
播放量
# Play counts: year -> list of that year's play_count values, in row order.
data_play_count = {
    year: final_time_df.loc[final_time_df['create_time'] == year, 'play_count'].tolist()
    for year in final_year
}
收藏量
# Subscription counts: year -> list of that year's subscribed_count values,
# in row order.
data_subscribed_count = {
    year: final_time_df.loc[final_time_df['create_time'] == year, 'subscribed_count'].tolist()
    for year in final_year
}
转发量
# Share counts: year -> list of that year's share_count values, in row order.
data_share_count = {
    year: final_time_df.loc[final_time_df['create_time'] == year, 'share_count'].tolist()
    for year in final_year
}
评论数
# Comment counts: year -> list of that year's comment_count values, in row
# order.
data_comment_count = {
    year: final_time_df.loc[final_time_df['create_time'] == year, 'comment_count'].tolist()
    for year in final_year
}
汇总到一个字典中
# Container for the formatted per-metric data; filled in after format_data is defined.
total_data = {}
def format_data(data: dict, years=None, provinces=None) -> dict:
    """Turn per-year value lists into chart-ready records, in place.

    For every year, each raw value is replaced by
    ``{"name": <province>, "value": <value>}`` and two summary entries are
    added to ``data``: ``<year>max`` (the largest value rounded down to a
    multiple of 100) and ``<year>sum`` (the total of the values).

    Args:
        data: Maps each year to a list of values; the i-th value belongs to
            the i-th province. The dict and its lists are modified in place.
        years: Years to process. Defaults to the module-level ``final_year``.
        provinces: Province names matching the value positions. Defaults to
            the module-level ``final_province``.

    Returns:
        The same ``data`` object that was passed in.
    """
    if years is None:
        years = final_year
    if provinces is None:
        provinces = final_province

    for year in years:
        values = data[year]
        # Take the aggregates before the numbers are overwritten with dicts.
        # default=0 keeps a year without any rows from raising.
        max_data = max(values, default=0)
        sum_data = sum(values)
        for i, value in enumerate(values):
            values[i] = {"name": provinces[i], "value": value}
        data[year + "max"] = int(max_data / 100) * 100
        data[year + "sum"] = sum_data
    return data
# Format each metric's per-year data and store it under the metric's name.
total_data.update(
    play_count=format_data(data=data_play_count),
    subscribed_count=format_data(data=data_subscribed_count),
    share_count=format_data(data=data_share_count),
    comment_count=format_data(data=data_comment_count),
)
8.2 生成图表
from pyecharts.charts import Timeline, Pie
def get_year_overlap_chart(year: str) -> Bar:
    """Build one year's chart: a four-series bar chart with a ring pie on top.

    The bar chart shows play, subscription, share and comment counts per
    province; the first two series start deselected. The pie shows that
    year's total share count against its total comment count.

    Args:
        year: Key of the year to draw, as used in ``total_data``.

    Returns:
        The bar chart with the pie overlapped onto it.
    """
    bar = (
        Bar()
        .add_xaxis(xaxis_data=final_province)
        .add_yaxis(
            series_name="播放量",
            yaxis_data=total_data['play_count'][year],
            is_selected=False,
            label_opts=opts.LabelOpts(is_show=False),
        )
        .add_yaxis(
            series_name="收藏量",
            yaxis_data=total_data['subscribed_count'][year],
            is_selected=False,
            label_opts=opts.LabelOpts(is_show=False),
        )
        .add_yaxis(
            series_name="转发量",
            yaxis_data=total_data['share_count'][year],
            label_opts=opts.LabelOpts(is_show=False),
        )
        .add_yaxis(
            series_name="评论数",
            yaxis_data=total_data['comment_count'][year],
            label_opts=opts.LabelOpts(is_show=False),
        )
        .set_global_opts(
            title_opts=opts.TitleOpts(
                title="{}年网易云音乐热门歌单数据".format(year),
                subtitle="数据来源于网易云音乐",
            ),
            tooltip_opts=opts.TooltipOpts(
                is_show=True, trigger="axis", axis_pointer_type="shadow"
            ),
        )
    )
    # Key under which the year's total is stored for each metric.
    sum_key = "{}sum".format(year)
    pie = (
        Pie()
        .add(
            # The pie holds only share and comment totals, so the series name
            # lists exactly those two metrics.
            series_name="转发量/评论数占比",
            data_pair=[
                ["转发量", total_data["share_count"][sum_key]],
                ["评论数", total_data["comment_count"][sum_key]],
            ],
            center=["80%", "30%"],
            radius=["14%", "28%"],
            color=["#f47a75", "#009db2"],
        )
        .set_series_opts(tooltip_opts=opts.TooltipOpts(is_show=True, trigger="item"))
        .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
        .set_colors(["#f47a75", "#009db2"])
    )
    return bar.overlap(pie)
# Build the timeline: one overlapped chart per year, keyed by that year.
timeline = Timeline(init_opts=opts.InitOpts(width="1000px"))
for y in final_year:
    yearly_chart = get_year_overlap_chart(year=y)
    timeline.add(yearly_chart, time_point=y)
# Auto-play with a play interval of 2000.
timeline.add_schema(is_auto_play=True, play_interval=2000)
图表特色:
- 可随时间变化动态地显示图表
- 不同的年份对应不同的标题
- 可以在几种数据中选择展示
- 圆环图与柱状图联动,柱状图变化时,圆环图也相应变化