目录

1.数据组合

 1.1连接

1.1.1添加行

1.1.2添加列

 1.1.3不同索引下的连接操作

1.2合并多个数据集

 1.2.1一对一合并

 1.2.2多对一合并

 1.2.3多对多合并

2.缺失数据

2.1查找和统计缺失数据

2.2清理缺失数据

2.2.1重新编码/替换

2.2.2前值填充

 2.2.3后值填充

 2.2.4插值

2.2.5删除缺失值

2.3缺失值计算

3.整理数据

3.1包含值而非变量的列

 3.1.1固定一列

 3.1.2固定多列

3.2包含多个变量的列

3.2.1单独拆分和添加列

3.2.2在单个步骤中进行拆分和组合

3.3行与列中的变量

3.4一张表中的多个观测单元(归一化)

3.5跨多张表的观测单元

 3.5.1使用循环加载多个文件

 3.5.2使用列表推导加载多个文件


1.数据组合

 1.1连接

1.1.1添加行

import pandas as pd

df1=pd.read_csv('concat_1.csv')

df2=pd.read_csv('concat_2.csv')

df3=pd.read_csv('concat_3.csv')

print(df1)

print(df2)

print(df3)

#concat将3个dataframe连接在一起,是原始索引的简单堆叠

row_concat=pd.concat([df1,df2,df3])

print(row_concat)

#获取第四行

print(row_concat.iloc[3,])

#将series添加到dataframe,要先将series转化为dataframe

new_row_df=pd.DataFrame([['n1','n2','n3','n4']],columns=['A','B','C','D'])

print(new_row_df)

print(pd.concat([df1,new_row_df]))

#concat连接多个对象,append连接一个对象

print(df1.append(df2))

print(df1.append(new_row_df))

#使用python字典添加数据行

data_dict={'A':'n1','B':'n2','C':'n3','D':'n4'}

print(df1.append(data_dict,ignore_index=True))

#当向dataframe添加一个字典时,必须使用参数ignore_index

#简单的连接或追加数据,可以使用ignore_index参数重置连接后的行索引

row_concat_i=pd.concat([df1,df2,df3],ignore_index=True)

print(row_concat_i)


1.1.2添加列

#axis=0.按行连接数据;axis=1,按列连接数据

col_concat=pd.concat([df1,df2,df3],axis=1)

print(col_concat)

#按列明提取数据子集

print(col_concat['A'])

#直接添加一列

col_concat['new_col_list']=['n1','n2','n3','n4']

print(col_concat)

col_concat['new_col_series']=pd.Series(['n1','n2','n3','n4'])

print(col_concat)

#ignore_index重置列索引

print(pd.concat([df1,df2,df3],axis=1,ignore_index=True))

 1.1.3不同索引下的连接操作

1连接具有不同列的行

# 1连接具有不同列的行

df1.columns=['A','B','C','D']

df2.columns=['E','F','G','H']

df3.columns=['A','C','F','H']

print(df1)

print(df2)

print(df3)

row_concat=pd.concat([df1,df2,df3])

print(row_concat)

print(pd.concat([df1,df2,df3],join='inner'))

print(pd.concat([df1,df3],ignore_index=False,join='inner'))

2.连接具有不同行的列

df1.index=[0,1,2,3]

df2.index=[4,5,6,7]

df3.index=[0,2,5,7]

print(df1)

print(df2)

print(df3)

col_concat=pd.concat([df1,df2,df3],axis=1)

print(col_concat)

print(pd.concat([df1,df3],axis=1,join='inner'))

1.2合并多个数据集

person=pd.read_csv('survey_person.csv')

site=pd.read_csv('survey_site.csv')

survey=pd.read_csv('survey_survey.csv')

visited=pd.read_csv('survey_visited.csv')

print(person)

print(site)

print(survey)

print(visited)

 

 1.2.1一对一合并

 

visited_subset=visited.loc[[0,2,6]]

#merge函数的参数how默认值为inner

o2o_merge=site.merge(visited_subset,left_on='name',right_on='site')

print(o2o_merge)

1.2.2多对一合并

m2o_merge=site.merge(visited,left_on='name',right_on='site')

print(m2o_merge)

1.2.3多对多合并

ps=person.merge(survey,left_on='ident',right_on='person')

vs=visited.merge(survey,left_on='ident',right_on='taken')

print(ps)

print(vs)

ps_vs=ps.merge(vs,left_on=['ident','taken','quant','reading'],

              right_on=['person','ident','quant','reading'])

#print(ps_vs)

print(ps_vs.loc[0,])

2.缺失数据

2.1查找和统计缺失数据

 

ebola=pd.read_csv('country_timeseries.csv')

#统计非缺失值的个数

print(ebola.count())

num_rows=ebola.shape[0]

num_missing=num_rows-ebola.count()

print(num_missing)

#统计缺失值的总数或特定列中缺失值的个数,count_nonzero函数和isnull方法

import numpy as np

print(np.count_nonzero(ebola.isnull()))

print(np.count_nonzero(ebola['Cases_Guinea'].isnull()))

#value_counts方法获取缺失值个数

print(ebola.Cases_Guinea.value_counts(dropna=False).head())

 2.2清理缺失数据

2.2.1重新编码/替换

 

#把缺失值重新编码为0

print(ebola.fillna(0).iloc[0:10,0:5])

2.2.2前值填充

print(ebola.fillna(method='ffill').iloc[0:30,0:5])

 2.2.3后值填充

print(ebola.fillna(method='bfill').iloc[:,0:5].tail())

 2.2.4插值

print(ebola.interpolate().iloc[0:10,0:5])

2.2.5删除缺失值

print(ebola.shape)

ebola_dropna=ebola.dropna()

print(ebola_dropna.shape)

2.3缺失值计算

ebola['Cases_multiple']=ebola['Cases_Guinea']+ebola['Cases_Liberia']+ebola['Cases_SierraLeone']

ebola_subset=ebola.loc[:,['Cases_Guinea','Cases_Liberia','Cases_SierraLeone','Cases_multiple']]

print(ebola_subset.head(n=10))

print(ebola.Cases_Guinea.sum(skipna=True))

print(ebola.Cases_Guinea.sum(skipna=False))

3.整理数据

3.1包含值而非变量的列

 3.1.1固定一列

import pandas as pd

pew=pd.read_csv('pew.csv')

#只显示前几列

print(pew.iloc[:,0:6])

#melt函数可以把dataframe重塑成整洁的数据格式

pew_long=pd.melt(pew,id_vars='religion')

print(pew_long)

pew_long=pd.melt(pew,id_vars='religion',var_name='income',value_name='count')

print(pew_long)

 3.1.2固定多列

billboard=pd.read_csv('billboard.csv')

#查看前几行和前几列

print(billboard.iloc[0:5,0:16])

billboard_long=pd.melt(billboard,id_vars=['year','artist','track','time','date.entered'],var_name='week',value_name='rating')

print(billboard_long.head())

3.2包含多个变量的列

ebola=pd.read_csv('country_timeseries.csv')

print(ebola.columns)

#输出所选行

print(ebola.iloc[:5,[0,1,2,3,10,11]])

ebola_long=pd.melt(ebola,id_vars=['Date','Day'])

print(ebola_long)

3.2.1单独拆分和添加列

#获取variable列

#访问字符串方法

#依据分隔符拆分列

variable_split=ebola_long.variable.str.split('_')

print(variable_split[:5])

#整个容器

print(type(variable_split))

#容器中的第一个元素

print(type(variable_split[0]))

#使用get方法对各行取想要的索引

status_values=variable_split.str.get(0)

country_values=variable_split.str.get(1)

print(status_values[5:])

print(country_values[-5:])

ebola_long['status']=status_values

ebola_long['country']=country_values

print(ebola_long.head())

 

 3.2.2在单个步骤中进行拆分和组合

variable_split=ebola_long.variable.str.split('_',expand=True)

variable_split.columns=['status','country']

ebola_parsed=pd.concat([ebola_long,variable_split],axis=1)

print(ebola_parsed)

ebola_long['status'],ebola_long['country']=zip(*ebola_long.variable.str.split('_'))

print(ebola_long.head())

 3.3行与列中的变量

weather=pd.read_csv('weather.csv')

print(weather.iloc[:5,:11])

weather_melt=pd.melt(weather,id_vars=['id','year','month','element'],var_name='day',value_name='temp')

print(weather_melt)

weather_tidy=weather_melt.pivot_table(index=['id','year','month','day'],columns='element',values='temp')

weather_tidy_flat=weather_tidy.reset_index()

print(weather_tidy_flat.head())

weather_tidy=weather_melt.pivot_table(index=['id','year','month','day'],columns='element',values='temp').reset_index()

print(weather_tidy.head())

 

 3.4一张表中的多个观测单元(归一化)

 

print(billboard_long.head())

billboard_songs=billboard_long[['year','artist','track','time']]

print(billboard_songs.shape)

billboard_songs=billboard_songs.drop_duplicates()

print(billboard_songs.shape)billboard_songs['id']=range(len(billboard_songs))

print(billboard_songs.head(10))

#把歌曲dataframe合并到源数据集

billboard_ratings=billboard_long.merge(billboard_songs,on=['year','artist','track','time'])

print(billboard_ratings.shape)

print(billboard_ratings.head())

billboard_ratings=billboard_ratings[['id','date.entered','week','rating']]

print(billboard_ratings.head())

3.5跨多张表的观测单元

import os

import urllib

#只下载前2个数据集

with open('raw_data_urls.txt','r') as data_urls:

    for line,url in enumerate(data_urls):

        if line==2:

            break

        fn=url.split('/')[-1].strip()

        fp=os.path.join(fn)

        print(url)

        print(fp)

        urllib.request.urlretrieve(url,fp)


import glob

nyc_taxi_data=glob.glob('fhv_*')

print(nyc_taxi_data)

taxi1=pd.read_csv(nyc_taxi_data[0])

taxi2=pd.read_csv(nyc_taxi_data[1])

print(taxi1.head(n=2))

print(taxi2.head(n=2))

print(taxi1.shape)

print(taxi2.shape)

taxi=pd.concat([taxi1,taxi2])

print(taxi.shape)

 

 3.5.1使用循环加载多个文件

#创建一个待添加元素的空列表

list_taxi_df=[]

#循环遍历每个csv文件名

for csv_filename in nyc_taxi_data:

    #可以选择输出文件名以便调试

    print(csv_filename)

    #把csv文件加载到dataframe中

    df=pd.read_csv(csv_filename)

    #把dataframe添加到列表中

    list_taxi_df.append(df)

#输出列表中dataframe的个数

print(len(list_taxi_df))

#第一个元素的类型

print(type(list_taxi_df[0]))

#查看第一个dataframe的前5行数据

print(list_taxi_df[0].head())

taxi_loop_concat=pd.concat(list_taxi_df)

print(taxi_loop_concat.shape)

#手动加载连接和循环加载连接得到的结果一样吗

print(taxi.equals(taxi_loop_concat))

3.5.2使用列表推导加载多个文件

#不带注释的循环代码

list_taxi_df=[]

for csv_filename in nyc_taxi_data:

    df=pd.read_csv(csv_filename)

    list_taxi_df.append(df)

#使用列表推导式重写

list_taxi_df_comp=[pd.read_csv(data) for data in nyc_taxi_data]

print(type(list_taxi_df))

taxi_loop_concat_comp=pd.concat(list_taxi_df_comp)

print(taxi_loop_concat_comp.equals(taxi_loop_concat))