python tornado 调用流式接口并流式输出 python流处理库

转载

墨舞青云 2024-06-08 22:20:27

本节内容：

Pandas数据读取
Pandas索引与计算
Pandas数据预处理实例
Pandas常用预处理方法
Pandas自定义函数
Series结构

1、Pandas数据读取

import pandas
food_info = pandas.read_csv("food_info.csv")#读取一个以逗号分隔开的文件
#print(type(food_info))  #<class 'pandas.core.frame.DataFrame'>
print(food_info.dtypes)
print(help(pandas.read_csv))

first_rows = food_info.head()#不加参数默认显示前5条数据，指定参数后可根据参数进行显示
print(first_rows )
print(food_info.head(3))#显示前3条数据
print(food_info.tail(3))#显示后3条数据
print(food_info.columns)#显示列名
print(food_info.shape)#多少行多少列

2、Pandas索引与计算

索引：

#要执行下列程序必先执行上面的程序
#pandas uses zero-indexing
#Series object representing the row at index 0.
print(food_info.loc[0])#通过调用loc这个函数来进行索引，取出第一行的元素

# Series object representing the seventh row.
print(food_info.loc[6])#通过调用loc这个函数来进行索引，取出第7行的元素

# Will throw an error: "KeyError: 'the label [8620] is not in the [index]'"
print(food_info.loc[8620])#当行数超出时会报错(8618, 36)

#The object dtype is equivalent to a string in Python

#object - For string values
#int - For integer values
#float - For float values
#datetime - For time values
#bool - For Boolean values
print(food_info.dtypes)#看每个元素的数据类型

# Returns a DataFrame containing the rows at indexes 3, 4, 5, and 6.
food_info.loc[3:6]

# Returns a DataFrame containing the rows at indexes 2, 5, and 10. Either of the following approaches will work.
# Method 1
two_five_ten = [2,5,10] 
food_info.loc[two_five_ten]

# Method 2
food_info.loc[[2,5,10]]#返回一个包含了2，5，10行的数据框

# Series object representing the "NDB_No" column.
# ndb_col = food_info["NDB_No"]#根据"NDB_No"这个列名来打印此列
# print(ndb_col)
# Alternatively, you can access a column by passing in a string variable.
col_name = "NDB_No"
ndb_col = food_info[col_name]
print(ndb_col)

import pandas
food_info = pandas.read_csv("food_info.csv")
# columns = ["Zinc_(mg)", "Copper_(mg)"]#指定要拿的两个列，存在一个列表中
# zinc_copper = food_info[columns]
# print(zinc_copper)
zinc_copper = food_info[["Zinc_(mg)", "Copper_(mg)"]]
print(zinc_copper)

#print(food_info.columns)
#print(food_info.head(2))
col_names = food_info.columns.tolist()  #把所有的列名取出来并放在一个列表中
print(col_names)
gram_columns = []  #构造一个空列表

for c in col_names:  #对存放了列名的列表进行遍历
    if c.endswith("(g)"):  #对每一个col_names中的列名进行查看，如果是以"(g)"结尾的列名，则把它加到gram_columns列表里
        gram_columns.append(c)
print(gram_columns)        
gram_df = food_info[gram_columns]   #根据列名，输出对应的每一列
print(gram_df.head(3))#只输出前三行

加减乘除计算：

print(food_info["Iron_(mg)"])#打印列名为"Iron_(mg)"的这一列
div_1000 = food_info["Iron_(mg)"] / 1000  #将这一列的值都除以1000
print(div_1000)
# Adds 100 to each value in the column and returns a Series object.
add_100 = food_info["Iron_(mg)"] + 100
print(add_100 )
# Subtracts 100 from each value in the column and returns a Series object.
sub_100 = food_info["Iron_(mg)"] - 100
print(sub_100 )
# Multiplies each value in the column by 2 and returns a Series object.
mult_2 = food_info["Iron_(mg)"]*2
print(mult_2  )

#It applies the arithmetic operator to the first value in both columns, the second value in both columns, and so on
water_energy = food_info["Water_(g)"] * food_info["Energ_Kcal"]#对应位置上的两个数相乘

weighted_protein = food_info["Protein_(g)"] * 2
weighted_fat = -0.75 * food_info["Lipid_Tot_(g)"]
initial_rating = weighted_protein + weighted_fat
print(weighted_protein,weighted_fat,initial_rating)

# the "Vit_A_IU" column ranges from 0 to 100000, while the "Fiber_TD_(g)" column ranges from 0 to 79
#For certain calculations, columns like "Vit_A_IU" can have a greater effect on the result, 
#due to the scale of the values
# The largest value in the "Energ_Kcal" column.
max_calories = food_info["Energ_Kcal"].max()  #取列名为"Energ_Kcal"这一列最大的数
# Divide the values in "Energ_Kcal" by the largest value.
normalized_calories = food_info["Energ_Kcal"] / max_calories
normalized_protein = food_info["Protein_(g)"] / food_info["Protein_(g)"].max()
normalized_fat = food_info["Lipid_Tot_(g)"] / food_info["Lipid_Tot_(g)"].max()
food_info["Normalized_Protein"] = normalized_protein  #在food_info中加入新的列，列名为"Normalized_Protein"
food_info["Normalized_Fat"] = normalized_fat

print(food_info.shape) #打印food_info.csv的行列数，(8618, 36)
iron_grams = food_info["Iron_(mg)"] / 1000  
food_info["Iron_(g)"] = iron_grams  #在food_info中加入新的列，列名为Iron_(g)
print(food_info.shape) #(8618, 37)

#By default, pandas will sort the data by the column we specify in ascending order and return a new DataFrame
# Sorts the DataFrame in-place, rather than returning a new DataFrame.对DataFrame进行就地排序，而不是返回新的DataFrame。
print(food_info["Sodium_(mg)"])
food_info.sort_values("Sodium_(mg)", inplace=True)#默认对"Sodium_(mg)"这一列从小到大进行排序
print(food_info["Sodium_(mg)"])
#Sorts by descending order, rather than ascending.按降序排序，而不是升序排序。
food_info.sort_values("Sodium_(mg)", inplace=True, ascending=False)
print(food_info["Sodium_(mg)"])

3、Pandas数据预处理实例

泰坦尼克号：

把数据集读取进来

import pandas as pd
import numpy as np
titanic_survival = pd.read_csv("titanic_train.csv")
titanic_survival.head()

一些数据的处理：

#The Pandas library uses NaN(缺失值), which stands for "not a number", to indicate a missing value.
#we can use the pandas.isnull() function which takes a pandas series and returns a series of True and False values
age = titanic_survival["Age"]  #把"Age"这一列的值放在age这个列表中
print(age.loc[0:10])   #看一下age这一列的前11个值
age_is_null = pd.isnull(age)  #判断age中那些样本的值是空的（缺失）
print(age_is_null)  #返回一系列真值和假值
age_null_true = age[age_is_null]  #把返回ture的这个位置拿出来做索引，找出这些空缺值
print(age_null_true)
age_null_count = len(age_null_true)   #看空缺值的个数
print(age_null_count)

#The result of this is that mean_age would be nan. This is because any calculations we do with a null value also result in a null value
mean_age = sum(titanic_survival["Age"]) / len(titanic_survival["Age"])
print(mean_age)  #nan  因为空缺值的存在导致无法计算均值

把空缺值去掉：（并不需要把所有的空缺值去掉，有时候会用中位数，均值来替换空缺值以保证数据的完整）

#we have to filter out the missing values before we calculate the mean.在计算平均值之前，我们必须过滤掉遗漏的值。
good_ages = titanic_survival["Age"][age_is_null == False]  #把"Age"中不是空缺的值拿出来放在新的列表“good_ages”中
#print good_ages
correct_mean_age = sum(good_ages) / len(good_ages) #计算新列表的均值
print(correct_mean_age)   #29.6991176471

# missing data is so common that many pandas methods automatically filter for it丢失的数据是如此常见，以至于许多熊猫方法会自动为其过滤
correct_mean_age = titanic_survival["Age"].mean()  #通过调用.mean()自动过滤空缺值，再进行求均值
print(correct_mean_age)  #29.69911764705882

#mean fare for each class  每个船舱等级（1,2,3）的平均票价
passenger_classes = [1, 2, 3]
fares_by_class = {}
for this_class in passenger_classes:  #对船舱等级进行遍历
    pclass_rows = titanic_survival[titanic_survival["Pclass"] == this_class]  #先取出1等级的数据，再取2等级的数据，最后取三等级的数据
    pclass_fares = pclass_rows["Fare"]  #把1等级的船费票价取出来放pclass_fares，然后再是2...,3
    fare_for_class = pclass_fares.mean()  #求1等级的船费票价的均值
    fares_by_class[this_class] = fare_for_class  #求1，2，3等级的平均票价放在fares_by_class这个字典里面。
print(fares_by_class)   #{1: 84.15468749999992, 2: 20.66218315217391, 3: 13.675550101832997}

.pivot_table()

#index tells the method which column to group by               index告诉方法按哪个列分组
#values is the column that we want to apply the calculation to 值是我们要应用计算的列
#aggfunc specifies the calculation we want to perform  aggfunc指定我们要执行的计算
passenger_survival = titanic_survival.pivot_table(index="Pclass", values="Survived", aggfunc=np.mean) #.pivot_table()统计一个量与其他量之间的关系
print(passenger_survival)

###
看船舱等级与获救乘客之间有什么联系，1等舱平均获救多少人，2等舱平均获救多少人，3等舱平均获救多少人

Pclass
1    0.629630
2    0.472826
3    0.242363
Name: Survived, dtype: float64

船舱等级越高与获救率越高
###

passenger_age = titanic_survival.pivot_table(index="Pclass", values="Age")  #未指定 aggfunc，默认按照求均值的方法去计算
print(passenger_age)

###
看船舱等级与年龄之间的一个联系
Pclass
1    38.233441
2    29.877630
3    25.140620
Name: Age, dtype: float64

船舱等级越高坐着年龄越大的人
###

port_stats = titanic_survival.pivot_table(index="Embarked", values=["Fare","Survived"], aggfunc=np.sum)
#看一个量与另外两个量之间的关系
print(port_stats)


###
看登船地点C、Q、S与票价收费总额和获救总人数的关系

                Fare  Survived
Embarked                      
C         10072.2962        93
Q          1022.2543        30
S         17439.3988       217
###

.dropna（）函数

#specifying axis=1 or axis='columns' will drop any columns that have null values指定axis=1或axis='columns'将删除任何具有null值的列
drop_na_columns = titanic_survival.dropna(axis=1)   #删除任何具有null值的列
new_titanic_survival = titanic_survival.dropna(axis=0,subset=["Age", "Sex"])#将"Age"，"Sex"这两列的中有null的行删除
print(new_titanic_survival)

具体定位到一个值：

row_index_83_age = titanic_survival.loc[83,"Age"]  #定位带编号为83的样本，特征量为"Age"的这一个值
row_index_1000_pclass = titanic_survival.loc[766,"Pclass"]
print(row_index_83_age) #28.0
print(row_index_1000_pclass) #1

.reset_index(drop=True)

new_titanic_survival = titanic_survival.sort_values("Age",ascending=False)#对"Age"这一列进行降序排列
print(new_titanic_survival[0:10])
titanic_reindexed = new_titanic_survival.reset_index(drop=True) #对索引值进行重新排序 drop=True：原来的不要了，形成新的值
print(titanic_reindexed.iloc[0:10])

4、自定义函数并应用：.apply()

# This function returns the hundredth item from a series 这个函数返回系列中的第一百项
def hundredth_row(column):
    # Extract the hundredth item  提取第一百项
    hundredth_item = column.iloc[99]
    return hundredth_item

# Return the hundredth item from each column
hundredth_row = titanic_survival.apply(hundredth_row)
print(hundredth_row)

def not_null_count(column):
    column_null = pd.isnull(column)#查看列值是否空缺，将真假值返回
    null = column[column_null]#用真值做索引，取出空缺值放到null中
    return len(null)

column_null_count = titanic_survival.apply(not_null_count)#通过.apply()调用自定义得函数
print(column_null_count )

#By passing in the axis=1 argument, we can use the DataFrame.apply() method to iterate over rows instead of columns.
#通过传入axis=1参数，我们可以使用DataFrame.apply()方法遍历行而不是列。
def which_class(row):  #通过自定义的which_class函数对船舱等级进行数据类型转换
    pclass = row['Pclass']
    if pd.isnull(pclass):
        return "Unknown"
    elif pclass == 1:
        return "First Class"
    elif pclass == 2:
        return "Second Class"
    elif pclass == 3:
        return "Third Class"

classes = titanic_survival.apply(which_class, axis=1)
print(classes)

def is_minor(row):
    if row["Age"] < 18:
        return True
    else:
        return False

minors = titanic_survival.apply(is_minor, axis=1)
#print(minors)

def generate_age_label(row):
    age = row["Age"]
    if pd.isnull(age):
        return "unknown"
    elif age < 18:
        return "minor"
    else:
        return "adult"

age_labels = titanic_survival.apply(generate_age_label, axis=1)
print(age_labels)

#把连续值转换为离散值

titanic_survival['age_labels'] = age_labels
age_group_survival = titanic_survival.pivot_table(index="age_labels", values="Survived")
print(age_group_survival )

###

age_labels
adult      0.381032
minor      0.539823
unknown    0.293785
Name: Survived, dtype: float64
###

5、Series结构

#Series (collection of values)  一行或一列
#DataFrame (collection of Series objects)
#Panel (collection of DataFrame objects)

#A Series object can hold many data types, including
#float - for representing float values
#int - for representing integer values
#bool - for representing Boolean values
#datetime64[ns] - for representing date & time, without time-zone
#datetime64[ns, tz] - for representing date & time, with time-zone
#timedelta[ns] - for representing differences in dates & times (seconds, minutes, etc.)
#category - for representing categorical values
#object - for representing String values

#FILM - film name
#RottenTomatoes - Rotten Tomatoes critics average score
#RottenTomatoes_User - Rotten Tomatoes user average score
#RT_norm - Rotten Tomatoes critics average score (normalized to a 0 to 5 point system)
#RT_user_norm - Rotten Tomatoes user average score (normalized to a 0 to 5 point system)
#Metacritic - Metacritic critics average score
#Metacritic_User - Metacritic user average score

import pandas as pd
fandango = pd.read_csv('fandango_score_comparison.csv')
series_film = fandango['FILM']
print(series_film[0:5])
series_rt = fandango['RottenTomatoes']
print (series_rt[0:5])

###


0    Avengers: Age of Ultron (2015)
1                 Cinderella (2015)
2                    Ant-Man (2015)
3            Do You Believe? (2015)
4     Hot Tub Time Machine 2 (2015)
Name: FILM, dtype: object
0    74
1    85
2    80
3    18
4    14
Name: RottenTomatoes, dtype: int64
###

# Import the Series object from pandas
from pandas import Series

film_names = series_film.values   #通过.values 取film这一列的值
print(type(film_names))    #<class 'numpy.ndarray'>，可以证明pandas是在numpy上进行封装
# print(film_names)
print(len(film_names))  #146
rt_scores = series_rt.values
# print(rt_scores)
print(len(rt_scores))   #146
series_custom = Series(rt_scores , index=film_names)  #可用‘str’型来做索引，给出对应评分
series_custom[['Minions (2015)', 'Leviathan (2014)']]

###
Minions (2015)      54
Leviathan (2014)    99
dtype: int64
###

# int index is also aviable
series_custom = Series(rt_scores , index=film_names)
series_custom[['Minions (2015)', 'Leviathan (2014)']]
fiveten = series_custom[5:10]
print(fiveten)
###
The Water Diviner (2015)        63
Irrational Man (2015)           42
Top Five (2014)                 86
Shaun the Sheep Movie (2015)    99
Love & Mercy (2015)             89
dtype: int64
###

original_index = series_custom.index.tolist()
print(type(original_index))  #<class 'list'>
print(original_index)
sorted_index = sorted(original_index)
sorted_by_index = series_custom.reindex(sorted_index)
print(sorted_by_index)#按字母进行排序

sc2 = series_custom.sort_index()
sc3 = series_custom.sort_values()
print(sc2[0:10])
print(sc3[0:10])

#The values in a Series object are treated as an ndarray, the core data type in NumPy
import numpy as np
# Add each value with each other
print(np.add(series_custom, series_custom) )
# Apply sine function to each value
np.sin(series_custom)
# Return the highest value (will return a single value not a Series)
np.max(series_custom)

#will actually return a Series object with a boolean value for each film
series_custom > 50
series_greater_than_50 = series_custom[series_custom > 50]

criteria_one = series_custom > 50
criteria_two = series_custom < 75
both_criteria = series_custom[criteria_one & criteria_two]
print(both_criteria)

#data alignment same index
rt_critics = Series(fandango['RottenTomatoes'].values, index=fandango['FILM'])
rt_users = Series(fandango['RottenTomatoes_User'].values, index=fandango['FILM'])
rt_mean = (rt_critics + rt_users)/2#求两个媒体评分的均值

print(rt_mean)

import pandas as pd
#will return a new DataFrame that is indexed by the values in the specified column 
#and will drop that column from the DataFrame
#without the FILM column dropped 
fandango = pd.read_csv('fandango_score_comparison.csv')
print type(fandango)
fandango_films = fandango.set_index('FILM', drop=False)
#print(fandango_films.index)

# Slice using either bracket notation or loc[]
fandango_films["Avengers: Age of Ultron (2015)":"Hot Tub Time Machine 2 (2015)"]
fandango_films.loc["Avengers: Age of Ultron (2015)":"Hot Tub Time Machine 2 (2015)"]

# Specific movie
fandango_films.loc['Kumiko, The Treasure Hunter (2015)']

# Selecting list of movies
movies = ['Kumiko, The Treasure Hunter (2015)', 'Do You Believe? (2015)', 'Ant-Man (2015)']
fandango_films.loc[movies]

#When selecting multiple rows, a DataFrame is returned, 
#but when selecting an individual row, a Series object is returned instead

#The apply() method in Pandas allows us to specify Python logic
#The apply() method requires you to pass in a vectorized operation 
#that can be applied over each Series object.
import numpy as np

# returns the data types as a Series
types = fandango_films.dtypes
#print types
# filter data types to just floats, index attributes returns just column names
float_columns = types[types.values == 'float64'].index
# use bracket notation to filter columns to just float columns
float_df = fandango_films[float_columns]
#print float_df
# `x` is a Series object representing a column
deviations = float_df.apply(lambda x: np.std(x))

print(deviations)

rt_mt_user = float_df[['RT_user_norm', 'Metacritic_user_nom']]
rt_mt_user.apply(lambda x: np.std(x), axis=1)

本文章为转载内容，我们尊重原作者对文章享有的著作权。如有内容错误或侵权问题，欢迎原作者联系我们进行内容更正或删除文章。

上一篇：linux驱动如何获取所有android应用程序的uid 获取linux文件

下一篇：cups docker安装 docker安装教程

提问和评论都可以，用心的回复会被更多人看到评论

发布评论

相关文章

官方博客	全部文章	热门标签	班级博客
了解我们	网站地图	意见反馈

鸿蒙开发者社区	51CTO学堂
51CTO	软考资讯

python tornado 调用流式接口 并流式输出 python流处理库

python tornado 调用流式接口 并流式输出 python流处理库

51CTO博客

python tornado 调用流式接口并流式输出 python流处理库

python tornado 调用流式接口并流式输出 python流处理库