pandas+pyecharts实战1(附代码)

  • 导入模块
  • 数据准备&预处理
  • 数据可视化
  • 1、评分等级分布
  • 2、每日评论量
  • 3、每小时评论量
  • 4、一周各天评论量
  • 5、角色热度
  • 6、观众地域分布


导入模块

# 导入模块
import pandas as pd
import numpy as np
from collections import Counter
from pyecharts.charts import Geo,Bar,Line,Pie,Timeline
from pyecharts import options as opts
from pyecharts.globals import ThemeType,ChartType

数据准备&预处理

# Load the review dataset; `path` must already hold the location of the
# Excel file (it is not defined in this snippet).
df = pd.read_excel(path)
df.info()

# Fill missing user names with a placeholder.
# The result is assigned back instead of calling fillna(inplace=True) on the
# column selection: the chained in-place form is deprecated in pandas 2.x and
# does not modify `df` when copy-on-write is enabled.
df['用户名'] = df['用户名'].fillna('未知')
# Per-column count of remaining nulls (shown when this is the last
# expression of a notebook cell).
df.isnull().sum()

数据集展示如下:

[图1:数据集预览]

数据可视化

1、评分等级分布

# Rating distribution: horizontal bar chart of the review count per rating.
# Count the non-null 'id' values for each rating, ordered from the least to
# the most frequent rating.
df_star = df.groupby('评分')['id'].count().sort_values(ascending=True)

x_data = list(map(str, df_star.index))
y_data = df_star.values.tolist()

bar_title = opts.TitleOpts(title='评分等级分布', pos_left='45%', pos_top='5%')
bar_legend = opts.LegendOpts(
    type_="scroll", pos_left="85%", pos_top="28%", orient="vertical"
)

b1 = Bar(init_opts=opts.InitOpts(theme=ThemeType.LIGHT))
b1.add_xaxis(x_data)
b1.add_yaxis('', y_data)
# Swap the axes so the bars run horizontally; this is why the rating name is
# put on the y axis and the count name on the x axis below.
b1.reversal_axis()
b1.set_series_opts(label_opts=opts.LabelOpts(position='right'))
b1.set_global_opts(
    yaxis_opts=opts.AxisOpts(name='评分等级'),
    xaxis_opts=opts.AxisOpts(name='人/次'),
    title_opts=bar_title,
    legend_opts=bar_legend,
)

# Rating share: a ring-shaped pie chart that is overlaid on the rating bar
# chart `b1` (which must already exist) and rendered together with it.
df_star = df.groupby('评分').count()['id']
x_data = [str(i) for i in list(df_star.index)]
y_data = df_star.values.tolist()
p1 = (
    Pie(init_opts=opts.InitOpts(width='800px', height='600px'))
    .add(
        '',
        [list(z) for z in zip(x_data, y_data)],
        radius=['10%', '30%'],
        center=['65%', '60%'],
        label_opts=opts.LabelOpts(is_show=True),
    )
    # `position` is a label option, so it belongs inside LabelOpts. Passed as
    # a bare keyword to set_series_opts it is only copied onto the series
    # as an unrelated key and never reaches the label configuration.
    .set_series_opts(
        label_opts=opts.LabelOpts(formatter='评分{b}: {d}%', position='outside')
    )
)

b1.overlap(p1)
b1.render_notebook()

[图2:评分等级分布]

2、每日评论量

# Daily review volume: line chart of the number of reviews per calendar day.
# Convert to text BEFORE slicing: if read_excel parsed the column as
# datetimes, the values are Timestamps and cannot be sliced directly.
# Assumes the textual form starts with a 10-character 'YYYY-MM-DD' date
# -- TODO confirm against the data.
df['评论日期'] = df['评论时间'].astype(str).str[:10]
df_comment = df.groupby('评论日期').count()['id']
day_x_data = df_comment.index.tolist()
day_y_data = df_comment.values.tolist()

line1 = (
    Line()
    .add_xaxis(xaxis_data=day_x_data)
    .add_yaxis(
        series_name="",
        y_axis=day_y_data,
        is_smooth=True,
        symbol='circle',
        symbol_size=6,
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(
            title="八月每日评论量",
            pos_top="5%",
            pos_left="center",
            # NOTE(review): the title is drawn in white while no background
            # colour is set on this chart -- confirm it is visible in the
            # target notebook theme.
            title_textstyle_opts=opts.TextStyleOpts(color="#fff", font_size=16),
        )
    )
)
line1.render_notebook()

[图3:每日评论量]

3、每小时评论量

# Hourly review volume: one line chart per day, played back on a timeline.
# Requires the '评论日期' column to be present on `df`.
# Convert to text before slicing so datetime-typed values work as well.
# Assumes characters 11-12 of the textual timestamp are the zero-padded hour
# ('YYYY-MM-DD HH:MM:SS') -- TODO confirm against the data.
df['小时'] = df['评论时间'].astype(str).str[11:13]

df_hour = df.groupby(by=['评论日期', '小时']).count()['id']
df_hour = pd.DataFrame(df_hour).reset_index().rename(columns={'id': '评论数'})

# Zero-padded keys matching the '小时' column, one per x-axis slot.
hour_keys = [f'{h:02d}' for h in range(24)]
hour_labels = [str(h) for h in range(24)]

timeline = Timeline(init_opts=opts.InitOpts(width='1200px', height='600px'))
timeline.add_schema(axis_type='category', orient='horizontal', symbol='circle', symbol_size=10,
                    play_interval=3000, is_auto_play=True, is_loop_play=True, is_timeline_show=True,
                    control_position='left', pos_left='10%', pos_bottom='0%', width='800px', height='30px')
for i in sorted(set(df_hour['评论日期'])):
    day_rows = df_hour.loc[df_hour['评论日期'] == i]
    # Align the counts with the 24 x-axis slots. Hours without any review
    # are missing from the grouped data; without this step every later
    # point would shift left and be plotted against the wrong hour.
    hourly_counts = (
        pd.Series(day_rows['评论数'].to_numpy(), index=day_rows['小时'].to_numpy())
        .reindex(hour_keys, fill_value=0)
        .tolist()
    )
    line = (
        Line(init_opts=opts.InitOpts(width='1200px', height='600px'))
        .add_xaxis(hour_labels)
        .add_yaxis('', y_axis=hourly_counts)
        .set_global_opts(
            title_opts=opts.TitleOpts('每小时评论数', pos_left='50%'),
            legend_opts=opts.LegendOpts(is_show=True, pos_top='50%', pos_right='0%', orient='vertical'))
    )

    timeline.add(chart=line, time_point=i)

timeline.render_notebook()

[图4:每小时评论量]

4、一周各天评论量

# Review volume per weekday: one line chart per ISO week on a timeline.
# Requires the '评论日期' column to be present on `df`.
comment_dates = pd.to_datetime(df['评论日期'])
df['第几周'] = comment_dates.dt.isocalendar().week
df['星期几'] = comment_dates.dt.day_name()

df_week = df.groupby(by=['第几周', '星期几']).count()['id']
df_week = pd.DataFrame(df_week).reset_index().rename(columns={'id': '评论数'})

# groupby orders the weekday names alphabetically (Friday, Monday, ...).
# Re-sort each week in calendar order so the x axis reads Monday -> Sunday.
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                 'Friday', 'Saturday', 'Sunday']
df_week['星期几'] = pd.Categorical(df_week['星期几'], categories=weekday_order, ordered=True)
df_week = df_week.sort_values(['第几周', '星期几']).reset_index(drop=True)

timeline2 = Timeline(init_opts=opts.InitOpts(width='1200px', height='600px'))
timeline2.add_schema(axis_type='category', orient='horizontal', symbol='circle', symbol_size=10,
                     play_interval=3000, is_auto_play=True, is_loop_play=True, is_timeline_show=True,
                     control_position='left', pos_left='10%', pos_bottom='0%', width='800px', height='30px')
for i in sorted(set(df_week['第几周'])):
    week_rows = df_week.loc[df_week['第几周'] == i]
    line = (
        Line(init_opts=opts.InitOpts(width='1200px', height='600px'))
        # Hand plain strings (not categorical values) to the chart.
        .add_xaxis(week_rows['星期几'].astype(str).tolist())
        .add_yaxis('', y_axis=week_rows['评论数'].tolist())
        .set_global_opts(
            title_opts=opts.TitleOpts('一周各天评论数', pos_left='50%'),
            legend_opts=opts.LegendOpts(is_show=True, pos_top='50%', pos_right='0%', orient='vertical'))
    )
    timeline2.add(chart=line, time_point=i)

timeline2.render_notebook()

[图5:一周各天评论量]

5、角色热度

# Character popularity: how often each role name appears in the review text.
roles = ['小白', '小青', '许仙', '法海', '司马', '孙姐', '牛头帮主', '蒙面男子', '宝青坊主', '书生']
# Join the reviews with a newline rather than an empty string: with no
# separator, the end of one review and the start of the next could together
# spell a role name and be counted as a mention.
content = '\n'.join(str(text) for text in df['评论'])
roles_num = [(role, content.count(role)) for role in roles]
roles_num = pd.DataFrame(roles_num, columns=['名称', '出现次数'])
# Most frequently mentioned role first.
roles_num = roles_num.sort_values(by='出现次数', ascending=False)

x_data = roles_num['名称']
y_data = roles_num['出现次数']

b2 = (
    Bar(init_opts=opts.InitOpts(theme=ThemeType.LIGHT))
    .add_xaxis(list(x_data))
    .add_yaxis('频次', list(y_data))
    .set_global_opts(
        title_opts=opts.TitleOpts(title='影评角色频次分布', pos_top='2%', pos_left='center'),
        legend_opts=opts.LegendOpts(is_show=False),
        yaxis_opts=opts.AxisOpts(
            name="频次",
            name_location='middle',
            name_gap=50,
            name_textstyle_opts=opts.TextStyleOpts(font_size=16),
        ),
    )
)
b2.render_notebook()

[图6:角色热度]

6、观众地域分布

# Audience geographic distribution: heat map of review counts per city,
# drawn on a map of China.
cities = df['城市'].values.tolist()
# Keep the 80 most frequent cities as (city, count) pairs.
data = Counter(cities).most_common(80)

# Border and shadow styling applied to the map regions.
region_style = {
    'normal': {
        'shadowColor': 'rgba(0, 0, 0, .5)',
        'shadowBlur': 5,
        'shadowOffsetY': 0,
        'shadowOffsetX': 0,
        'borderColor': '#fff'
    }
}

# Colour buckets for the piecewise visual map, highest range first.
count_pieces = [
    {'max': 500, 'min': 401, 'label': '401-500', 'color': '#990000'},
    {'max': 400, 'min': 301, 'label': '301-400', 'color': '#CD5C5C'},
    {'max': 300, 'min': 201, 'label': '201-300', 'color': '#F08080'},
    {'max': 200, 'min': 101, 'label': '101-200', 'color': '#FFCC99'},
    {'max': 100, 'min': 0, 'label': '0-100', 'color': '#FFE4E1'},
]

geo_title = opts.TitleOpts(
    title="地理位置分布",
    pos_top="2%",
    pos_left="center",
    title_textstyle_opts=opts.TextStyleOpts(color="#fff", font_size=16),
)
geo_visualmap = opts.VisualMapOpts(
    is_show=True,
    is_piecewise=True,
    min_=0,
    max_=500,
    split_number=5,
    series_index=0,
    pos_bottom='5%',
    pos_left='5%',
    textstyle_opts=opts.TextStyleOpts(color="#fff"),
    pieces=count_pieces,
)

geo = Geo(init_opts=opts.InitOpts(width="1000px", height="600px", bg_color="#404a59"))
geo.add_schema(maptype="china", itemstyle_opts=region_style)
geo.add("评论数量", data, type_=ChartType.HEATMAP)
geo.set_series_opts(label_opts=opts.LabelOpts(is_show=False))
geo.set_global_opts(
    title_opts=geo_title,
    legend_opts=opts.LegendOpts(is_show=False),
    visualmap_opts=geo_visualmap,
)
geo.render_notebook()