本节内容:

  • Pandas数据读取
  • Pandas索引与计算
  • Pandas数据预处理实例
  • Pandas常用预处理方法
  • Pandas自定义函数
  • Series结构

 

1、Pandas数据读取

import pandas
food_info = pandas.read_csv("food_info.csv")#读取一个以逗号分隔开的文件
#print(type(food_info))  #<class 'pandas.core.frame.DataFrame'>
print(food_info.dtypes)
print(help(pandas.read_csv))

first_rows = food_info.head()#不加参数默认显示前5条数据,指定参数后可根据参数进行显示
print(first_rows )
print(food_info.head(3))#显示前3条数据
print(food_info.tail(3))#显示后3条数据
print(food_info.columns)#显示列名
print(food_info.shape)#多少行多少列

 

 

2、Pandas索引与计算

索引:

#要执行下列程序必先执行上面的程序
#pandas uses zero-indexing
#Series object representing the row at index 0.
print(food_info.loc[0])#通过调用loc这个函数来进行索引,取出第一行的元素

# Series object representing the seventh row.
print(food_info.loc[6])#通过调用loc这个函数来进行索引,取出第7行的元素

# Will throw an error: "KeyError: 'the label [8620] is not in the [index]'"
print(food_info.loc[8620])#当行数超出时会报错(8618, 36)
#The object dtype is equivalent to a string in Python
#object - For string values
#int - For integer values
#float - For float values
#datetime - For time values
#bool - For Boolean values
print(food_info.dtypes)#看每个元素的数据类型
# Returns a DataFrame containing the rows at indexes 3, 4, 5, and 6.
food_info.loc[3:6]

# Returns a DataFrame containing the rows at indexes 2, 5, and 10. Either of the following approaches will work.
# Method 1
two_five_ten = [2,5,10] 
food_info.loc[two_five_ten]

# Method 2
food_info.loc[[2,5,10]]#返回一个包含了2,5,10行的数据框
# Series object representing the "NDB_No" column.
# ndb_col = food_info["NDB_No"]#根据"NDB_No"这个列名来打印此列
# print(ndb_col)
# Alternatively, you can access a column by passing in a string variable.
col_name = "NDB_No"
ndb_col = food_info[col_name]
print(ndb_col)
import pandas
food_info = pandas.read_csv("food_info.csv")
# columns = ["Zinc_(mg)", "Copper_(mg)"]#指定要拿的两个列,存在一个列表中
# zinc_copper = food_info[columns]
# print(zinc_copper)
zinc_copper = food_info[["Zinc_(mg)", "Copper_(mg)"]]
print(zinc_copper)
#print(food_info.columns)
#print(food_info.head(2))
col_names = food_info.columns.tolist()  #把所有的列名取出来并放在一个列表中
print(col_names)
gram_columns = []  #构造一个空列表

for c in col_names:  #对存放了列名的列表进行遍历
    if c.endswith("(g)"):  #对每一个col_names中的列名进行查看,如果是以"(g)"结尾的列名,则把它加到gram_columns列表里
        gram_columns.append(c)
print(gram_columns)        
gram_df = food_info[gram_columns]   #根据列名,输出对应的每一列
print(gram_df.head(3))#只输出前三行

 

加减乘除计算:

print(food_info["Iron_(mg)"])#打印列名为"Iron_(mg)"的这一列
div_1000 = food_info["Iron_(mg)"] / 1000  #将这一列的值都除以1000
print(div_1000)
# Adds 100 to each value in the column and returns a Series object.
add_100 = food_info["Iron_(mg)"] + 100
print(add_100 )
# Subtracts 100 from each value in the column and returns a Series object.
sub_100 = food_info["Iron_(mg)"] - 100
print(sub_100 )
# Multiplies each value in the column by 2 and returns a Series object.
mult_2 = food_info["Iron_(mg)"]*2
print(mult_2  )

#It applies the arithmetic operator to the first value in both columns, the second value in both columns, and so on
water_energy = food_info["Water_(g)"] * food_info["Energ_Kcal"]#对应位置上的两个数相乘

weighted_protein = food_info["Protein_(g)"] * 2
weighted_fat = -0.75 * food_info["Lipid_Tot_(g)"]
initial_rating = weighted_protein + weighted_fat
print(weighted_protein,weighted_fat,initial_rating)

 

# the "Vit_A_IU" column ranges from 0 to 100000, while the "Fiber_TD_(g)" column ranges from 0 to 79
#For certain calculations, columns like "Vit_A_IU" can have a greater effect on the result, 
#due to the scale of the values
# The largest value in the "Energ_Kcal" column.
max_calories = food_info["Energ_Kcal"].max()  #取列名为"Energ_Kcal"这一列最大的数
# Divide the values in "Energ_Kcal" by the largest value.
normalized_calories = food_info["Energ_Kcal"] / max_calories
normalized_protein = food_info["Protein_(g)"] / food_info["Protein_(g)"].max()
normalized_fat = food_info["Lipid_Tot_(g)"] / food_info["Lipid_Tot_(g)"].max()
food_info["Normalized_Protein"] = normalized_protein  #在food_info中加入新的列,列名为"Normalized_Protein"
food_info["Normalized_Fat"] = normalized_fat

 

print(food_info.shape) #打印food_info.csv的行列数,(8618, 36)
iron_grams = food_info["Iron_(mg)"] / 1000  
food_info["Iron_(g)"] = iron_grams  #在food_info中加入新的列,列名为Iron_(g)
print(food_info.shape) #(8618, 37)

 

#By default, pandas will sort the data by the column we specify in ascending order and return a new DataFrame
# Sorts the DataFrame in-place, rather than returning a new DataFrame.对DataFrame进行就地排序,而不是返回新的DataFrame。
print(food_info["Sodium_(mg)"])
food_info.sort_values("Sodium_(mg)", inplace=True)#默认对"Sodium_(mg)"这一列从小到大进行排序
print(food_info["Sodium_(mg)"])
#Sorts by descending order, rather than ascending.按降序排序,而不是升序排序。
food_info.sort_values("Sodium_(mg)", inplace=True, ascending=False)
print(food_info["Sodium_(mg)"])

 

3、Pandas数据预处理实例

泰坦尼克号:

把数据集读取进来

import pandas as pd
import numpy as np
titanic_survival = pd.read_csv("titanic_train.csv")
titanic_survival.head()

一些数据的处理:

#The Pandas library uses NaN(缺失值), which stands for "not a number", to indicate a missing value.
#we can use the pandas.isnull() function which takes a pandas series and returns a series of True and False values
age = titanic_survival["Age"]  #把"Age"这一列的值放在age这个列表中
print(age.loc[0:10])   #看一下age这一列的前11个值
age_is_null = pd.isnull(age)  #判断age中那些样本的值是空的(缺失)
print(age_is_null)  #返回一系列真值和假值
age_null_true = age[age_is_null]  #把返回ture的这个位置拿出来做索引,找出这些空缺值
print(age_null_true)
age_null_count = len(age_null_true)   #看空缺值的个数
print(age_null_count)

 

#The result of this is that mean_age would be nan. This is because any calculations we do with a null value also result in a null value
mean_age = sum(titanic_survival["Age"]) / len(titanic_survival["Age"])
print(mean_age)  #nan  因为空缺值的存在导致无法计算均值

把空缺值去掉:(并不需要把所有的空缺值去掉,有时候会用中位数,均值来替换空缺值以保证数据的完整)

#we have to filter out the missing values before we calculate the mean.在计算平均值之前,我们必须过滤掉遗漏的值。
good_ages = titanic_survival["Age"][age_is_null == False]  #把"Age"中不是空缺的值拿出来放在新的列表“good_ages”中
#print good_ages
correct_mean_age = sum(good_ages) / len(good_ages) #计算新列表的均值
print(correct_mean_age)   #29.6991176471
# missing data is so common that many pandas methods automatically filter for it丢失的数据是如此常见,以至于许多熊猫方法会自动为其过滤
correct_mean_age = titanic_survival["Age"].mean()  #通过调用.mean()自动过滤空缺值,再进行求均值
print(correct_mean_age)  #29.69911764705882

 

#mean fare for each class  每个船舱等级(1,2,3)的平均票价
passenger_classes = [1, 2, 3]
fares_by_class = {}
for this_class in passenger_classes:  #对船舱等级进行遍历
    pclass_rows = titanic_survival[titanic_survival["Pclass"] == this_class]  #先取出1等级的数据,再取2等级的数据,最后取三等级的数据
    pclass_fares = pclass_rows["Fare"]  #把1等级的船费票价取出来放pclass_fares,然后再是2...,3
    fare_for_class = pclass_fares.mean()  #求1等级的船费票价的均值
    fares_by_class[this_class] = fare_for_class  #求1,2,3等级的平均票价放在fares_by_class这个字典里面。
print(fares_by_class)   #{1: 84.15468749999992, 2: 20.66218315217391, 3: 13.675550101832997}

.pivot_table()

#index tells the method which column to group by               index告诉方法按哪个列分组
#values is the column that we want to apply the calculation to 值是我们要应用计算的列
#aggfunc specifies the calculation we want to perform  aggfunc指定我们要执行的计算
passenger_survival = titanic_survival.pivot_table(index="Pclass", values="Survived", aggfunc=np.mean) #.pivot_table()统计一个量与其他量之间的关系
print(passenger_survival)

###
看船舱等级与获救乘客之间有什么联系,1等舱平均获救多少人,2等舱平均获救多少人,3等舱平均获救多少人

Pclass
1    0.629630
2    0.472826
3    0.242363
Name: Survived, dtype: float64

船舱等级越高与获救率越高
###
passenger_age = titanic_survival.pivot_table(index="Pclass", values="Age")  #未指定 aggfunc,默认按照求均值的方法去计算
print(passenger_age)

###
看船舱等级与年龄之间的一个联系
Pclass
1    38.233441
2    29.877630
3    25.140620
Name: Age, dtype: float64

船舱等级越高坐着年龄越大的人
###
port_stats = titanic_survival.pivot_table(index="Embarked", values=["Fare","Survived"], aggfunc=np.sum)
#看一个量与另外两个量之间的关系
print(port_stats)


###
看登船地点C、Q、S与票价收费总额和获救总人数的关系

                Fare  Survived
Embarked                      
C         10072.2962        93
Q          1022.2543        30
S         17439.3988       217
###

 

.dropna()函数

#specifying axis=1 or axis='columns' will drop any columns that have null values指定axis=1或axis='columns'将删除任何具有null值的列
drop_na_columns = titanic_survival.dropna(axis=1)   #删除任何具有null值的列
new_titanic_survival = titanic_survival.dropna(axis=0,subset=["Age", "Sex"])#将"Age","Sex"这两列的中有null的行删除
print(new_titanic_survival)

 

具体定位到一个值:

row_index_83_age = titanic_survival.loc[83,"Age"]  #定位带编号为83的样本,特征量为"Age"的这一个值
row_index_1000_pclass = titanic_survival.loc[766,"Pclass"]
print(row_index_83_age) #28.0
print(row_index_1000_pclass) #1

 

.reset_index(drop=True)

new_titanic_survival = titanic_survival.sort_values("Age",ascending=False)#对"Age"这一列进行降序排列
print(new_titanic_survival[0:10])
titanic_reindexed = new_titanic_survival.reset_index(drop=True) #对索引值进行重新排序 drop=True:原来的不要了,形成新的值
print(titanic_reindexed.iloc[0:10])

 

4、自定义函数并应用:.apply()

# This function returns the hundredth item from a series 这个函数返回系列中的第一百项
def hundredth_row(column):
    # Extract the hundredth item  提取第一百项
    hundredth_item = column.iloc[99]
    return hundredth_item

# Return the hundredth item from each column
hundredth_row = titanic_survival.apply(hundredth_row)
print(hundredth_row)
def not_null_count(column):
    column_null = pd.isnull(column)#查看列值是否空缺,将真假值返回
    null = column[column_null]#用真值做索引,取出空缺值放到null中
    return len(null)

column_null_count = titanic_survival.apply(not_null_count)#通过.apply()调用自定义得函数
print(column_null_count )
#By passing in the axis=1 argument, we can use the DataFrame.apply() method to iterate over rows instead of columns.
#通过传入axis=1参数,我们可以使用DataFrame.apply()方法遍历行而不是列。
def which_class(row):  #通过自定义的which_class函数对船舱等级进行数据类型转换
    pclass = row['Pclass']
    if pd.isnull(pclass):
        return "Unknown"
    elif pclass == 1:
        return "First Class"
    elif pclass == 2:
        return "Second Class"
    elif pclass == 3:
        return "Third Class"

classes = titanic_survival.apply(which_class, axis=1)
print(classes)
def is_minor(row):
    if row["Age"] < 18:
        return True
    else:
        return False

minors = titanic_survival.apply(is_minor, axis=1)
#print(minors)

def generate_age_label(row):
    age = row["Age"]
    if pd.isnull(age):
        return "unknown"
    elif age < 18:
        return "minor"
    else:
        return "adult"

age_labels = titanic_survival.apply(generate_age_label, axis=1)
print(age_labels)

#把连续值转换为离散值

 

titanic_survival['age_labels'] = age_labels
age_group_survival = titanic_survival.pivot_table(index="age_labels", values="Survived")
print(age_group_survival )

###

age_labels
adult      0.381032
minor      0.539823
unknown    0.293785
Name: Survived, dtype: float64
###

 

5、Series结构

#Series (collection of values)  一行或一列
#DataFrame (collection of Series objects)
#Panel (collection of DataFrame objects)
#A Series object can hold many data types, including
#float - for representing float values
#int - for representing integer values
#bool - for representing Boolean values
#datetime64[ns] - for representing date & time, without time-zone
#datetime64[ns, tz] - for representing date & time, with time-zone
#timedelta[ns] - for representing differences in dates & times (seconds, minutes, etc.)
#category - for representing categorical values
#object - for representing String values

#FILM - film name
#RottenTomatoes - Rotten Tomatoes critics average score
#RottenTomatoes_User - Rotten Tomatoes user average score
#RT_norm - Rotten Tomatoes critics average score (normalized to a 0 to 5 point system)
#RT_user_norm - Rotten Tomatoes user average score (normalized to a 0 to 5 point system)
#Metacritic - Metacritic critics average score
#Metacritic_User - Metacritic user average score

 

import pandas as pd
fandango = pd.read_csv('fandango_score_comparison.csv')
series_film = fandango['FILM']
print(series_film[0:5])
series_rt = fandango['RottenTomatoes']
print (series_rt[0:5])

###


0    Avengers: Age of Ultron (2015)
1                 Cinderella (2015)
2                    Ant-Man (2015)
3            Do You Believe? (2015)
4     Hot Tub Time Machine 2 (2015)
Name: FILM, dtype: object
0    74
1    85
2    80
3    18
4    14
Name: RottenTomatoes, dtype: int64
###

 

# Import the Series object from pandas
from pandas import Series

film_names = series_film.values   #通过.values 取film这一列的值
print(type(film_names))    #<class 'numpy.ndarray'>,可以证明pandas是在numpy上进行封装
# print(film_names)
print(len(film_names))  #146
rt_scores = series_rt.values
# print(rt_scores)
print(len(rt_scores))   #146
series_custom = Series(rt_scores , index=film_names)  #可用‘str’型来做索引,给出对应评分
series_custom[['Minions (2015)', 'Leviathan (2014)']]

###
Minions (2015)      54
Leviathan (2014)    99
dtype: int64
###
# int index is also aviable
series_custom = Series(rt_scores , index=film_names)
series_custom[['Minions (2015)', 'Leviathan (2014)']]
fiveten = series_custom[5:10]
print(fiveten)
###
The Water Diviner (2015)        63
Irrational Man (2015)           42
Top Five (2014)                 86
Shaun the Sheep Movie (2015)    99
Love & Mercy (2015)             89
dtype: int64
###
original_index = series_custom.index.tolist()
print(type(original_index))  #<class 'list'>
print(original_index)
sorted_index = sorted(original_index)
sorted_by_index = series_custom.reindex(sorted_index)
print(sorted_by_index)#按字母进行排序
sc2 = series_custom.sort_index()
sc3 = series_custom.sort_values()
print(sc2[0:10])
print(sc3[0:10])

 

#The values in a Series object are treated as an ndarray, the core data type in NumPy
import numpy as np
# Add each value with each other
print(np.add(series_custom, series_custom) )
# Apply sine function to each value
np.sin(series_custom)
# Return the highest value (will return a single value not a Series)
np.max(series_custom)

 

#will actually return a Series object with a boolean value for each film
series_custom > 50
series_greater_than_50 = series_custom[series_custom > 50]

criteria_one = series_custom > 50
criteria_two = series_custom < 75
both_criteria = series_custom[criteria_one & criteria_two]
print(both_criteria)
#data alignment same index
rt_critics = Series(fandango['RottenTomatoes'].values, index=fandango['FILM'])
rt_users = Series(fandango['RottenTomatoes_User'].values, index=fandango['FILM'])
rt_mean = (rt_critics + rt_users)/2#求两个媒体评分的均值

print(rt_mean)

 

 

 

import pandas as pd
#will return a new DataFrame that is indexed by the values in the specified column 
#and will drop that column from the DataFrame
#without the FILM column dropped 
fandango = pd.read_csv('fandango_score_comparison.csv')
print type(fandango)
fandango_films = fandango.set_index('FILM', drop=False)
#print(fandango_films.index)
# Slice using either bracket notation or loc[]
fandango_films["Avengers: Age of Ultron (2015)":"Hot Tub Time Machine 2 (2015)"]
fandango_films.loc["Avengers: Age of Ultron (2015)":"Hot Tub Time Machine 2 (2015)"]

# Specific movie
fandango_films.loc['Kumiko, The Treasure Hunter (2015)']

# Selecting list of movies
movies = ['Kumiko, The Treasure Hunter (2015)', 'Do You Believe? (2015)', 'Ant-Man (2015)']
fandango_films.loc[movies]

#When selecting multiple rows, a DataFrame is returned, 
#but when selecting an individual row, a Series object is returned instead
#The apply() method in Pandas allows us to specify Python logic
#The apply() method requires you to pass in a vectorized operation 
#that can be applied over each Series object.
import numpy as np

# returns the data types as a Series
types = fandango_films.dtypes
#print types
# filter data types to just floats, index attributes returns just column names
float_columns = types[types.values == 'float64'].index
# use bracket notation to filter columns to just float columns
float_df = fandango_films[float_columns]
#print float_df
# `x` is a Series object representing a column
deviations = float_df.apply(lambda x: np.std(x))

print(deviations)
rt_mt_user = float_df[['RT_user_norm', 'Metacritic_user_nom']]
rt_mt_user.apply(lambda x: np.std(x), axis=1)