十分钟入门 Pandas

原创

村雨遥 2021-09-01 15:03:05 ©著作权

文章标签 Pandas Python 数据字符串时间戳 文章分类 代码人生

©著作权归作者所有：来自51CTO博客作者村雨遥的原创作品，请联系作者获取转载授权，否则将追究法律责任

定义

Pandas是基于Numpy的一种工具，目的是解决数据分析任务。通过纳入大量库和一些标准数据模型，提供了高效操作大型数据集所需工具；

安装

pip install pandas

数据类型

Series

定义
一维的数组类型，其中每个元素有各自标签；可当作一个由带标签元素组成的numpy数组，标签可以是数字或字符；
关键点
- 均匀数据；
- 尺寸大小不变；
- 数据的值可变；

Dataframe

定义
二维、表格型的数组结构，可存储许多不同类型的数据，且每个轴都有标签，可当作一个series的字典；
关键点
- 异构数据；
- 大小可变；
- 数据可变；
功能特点
- 各列可以是不同的数据类型；
- 大小可变；
- 标记轴（行和列）；
- 可对行和列执行算术运算；

Panel

定义
三维、大小可变的数组（注意：Panel 自 pandas 0.20 起已被弃用，并在 0.25 版本中被移除，新版本中请改用带 MultiIndex 的 DataFrame）；
关键点
- 异构数据；
- 大小可变；
- 数据可变；

三者区别与共性

可变性：三者的值都是可变的；除了 Series 之外，其余两者的大小也是可变的；
较高维数据结构是较低维数据结构的容器，Panel 是 DataFrame 的容器，DataFrame是 Series 的容器；

如何使用Pandas

#!/usr/bin/python3
# -*- coding:utf-8 -*-
# @Time    : 2018-12-15 14:29
# @Author  : Cunyu
# @Site    : 
# @File    : panda.py
# @Software: PyCharm

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Object creation
seri = pd.Series([1, 3, 5, np.nan, 9, 10])
print(seri)

dates = pd.date_range('20181215', periods=10)
print(dates)

df = pd.DataFrame(np.random.randn(10, 4), index=dates, columns=list('ABCD'))
print(df)

# Viewing data
print('All:\n', df.head())  # head() without an argument shows the first five rows
print('前三行：\n', df.head(3))
print('后三行\n', df.tail(3))
print('index:\n', df.index)
print('col:\n', df.columns)
print('values:\n', df.values)
print('desc:\n', df.describe())
print('转置：\n', df.T)
print('sort by an axis:\n', df.sort_index(axis=1, ascending=False))
print('sort by values:\n', df.sort_values(by='B'))

# Selection; the usual accessors are .at, .iat, .loc and .iloc
print("df['A']:\n", df['A'])  # selecting a single column yields a Series
print('df[0:3]:\n', df[0:3])
# Selection by label
print(df.loc[dates[0]])
print(df.loc[:, ['A', 'B']])
print('获取某一个特定值：\n', df.at[dates[0], 'A'])
# Selection by position
print('获取每个特定位置的值：\n', df.iloc[3])
print('切片操作：\n', df.iloc[3:5, 0:2])
print(df.iat[1, 1])
# Boolean indexing
print(df[df.A > 0])
# isin() only matches values the column can actually hold.  The numeric column
# D never equals 'two' or 'four', so the filter is demonstrated on a copy that
# carries a string label column E.
df2 = df.copy()
df2['E'] = ['one', 'one', 'two', 'three', 'four',
            'three', 'one', 'two', 'four', 'three']
print('filter:\n', df2)
filtered = df2[df2['E'].isin(['two', 'four'])]
print('demo:\n', filtered)

"""
Series
"""
# Constructor signature: pandas.Series(data, index, dtype, copy)
# An empty Series
print('Null Series:\n', pd.Series())
# A Series built from an ndarray
data = np.array(['a', 'b', 'c', 'd'])
print('ndarray Series:\n', pd.Series(data))
# A Series built from a dict: the keys become the index labels.
# Pass the dict instance `data`, not the builtin `dict` type.
data = {'a': 0, 'b': 3, 'c': 4}
series_from_dict = pd.Series(data)
print('dict Series:\n', series_from_dict)
# A Series built from a scalar, repeated for every index label
print('scalar Series:\n', pd.Series(5, index=[1, 2, 4, 8, 0]))

# Access by position
s = pd.Series([1, 2, 3, 4, 5], index=['a', 'b', 'c', 'd', 'e'])
print('Search :\n', s[:3])
# Access by label (same Series as above)
print('s["d"]: ', s['d'])


# Series attributes and methods, shown on a Series of five random values
seri = pd.Series(np.random.randn(5))
series_overview = (
    ('axes:\n', seri.axes),       # 1. axes: list of the row-axis labels
    ('dtype:\n', seri.dtype),     # 2. dtype: data type of the values
    ('empty:\n', seri.empty),     # 3. empty: True when the Series has no elements
    ('ndim:\n', seri.ndim),       # 4. ndim: number of dimensions of the data
    ('size:\n', seri.size),       # 5. size: number of elements
    ('values:\n', seri.values),   # 6. values: the data as an ndarray
    ('head:\n', seri.head(3)),    # 7. head(n): first n rows
    ('tail:\n', seri.tail(3)),    # 8. tail(n): last n rows
)
for caption, value in series_overview:
    print(caption, value)

"""
DataFrame
"""
# Constructor signature: pandas.DataFrame(data, index, columns, dtype, copy)

# An empty DataFrame
print(pd.DataFrame())
# A DataFrame built from a list
print('List DataFrame:\n', pd.DataFrame([1, 3, 5, 7, 9]))
# A DataFrame built from a dict of lists.  The variable is called `people`
# so that the builtin `dict` is not shadowed for the rest of the script.
people = {'name': ['Manu', 'Tim', 'Paker'], 'age': [41, 42, 36]}
print('Dict DataFrame:\n', pd.DataFrame(people))
# A DataFrame built from a dict of Series with different indexes
dict_series = {'First': pd.Series([1, 2, 3], index=['a', 'b', 'c']),
               'Second': pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])}
print('Series DataFrame:\n', pd.DataFrame(dict_series))
# Column selection, addition and deletion
df = pd.DataFrame(dict_series)
print('列选择：\n', df['First'])
df['Third'] = pd.Series([11, 22, 44], index=['a', 'b', 'c'])
print('列添加：\n', df)
del df['First']  # delete a column with the del statement ...
df.pop('Third')  # ... or with pop(), which also returns the removed column
print('列删除：\n', df)


# DataFrame attributes and methods, shown on a small Name/Age/Rating table.
# The frame is built once and then inspected; it is not replaced by random
# data, so every attribute below describes these three columns.
profile = {'Name': pd.Series(['Tom', 'James', 'Ricky', 'Vin', 'Steve', 'Minsu', 'Jack']),
           'Age': pd.Series([25, 26, 25, 23, 30, 29, 23]),
           'Rating': pd.Series([4.23, 3.24, 3.98, 2.56, 3.20, 4.6, 3.8])}
dataFrame = pd.DataFrame(profile)
# 1. axes: list holding the row-axis labels and the column-axis labels
print('axes:\n', dataFrame.axes)
# 2. dtypes: data type of every column
print('dtype:\n', dataFrame.dtypes)
# 3. empty: True when the DataFrame has no elements
print('empty:\n', dataFrame.empty)
# 4. ndim: number of dimensions of the data
print('ndim:\n', dataFrame.ndim)
# 5. size: number of elements
print('size:\n', dataFrame.size)
# 6. values: the data as an ndarray
print('values:\n', dataFrame.values)
# 7. head(n): first n rows
print('head:\n', dataFrame.head(3))
# 8. tail(n): last n rows
print('tail:\n', dataFrame.tail(3))
# 9. T: transpose
print('T:\n', dataFrame.T)
# 10. shape: tuple with the dimensions of the DataFrame
print('shape:\n', dataFrame.shape)


"""
Panel
"""
# pandas.Panel(data, items, major_axis, minor_axis, dtype, copy) was
# deprecated in pandas 0.20 and removed in 0.25, so calling it fails on
# current pandas.  Three-dimensional data is shown here in the replacement
# form: a DataFrame whose rows carry a two-level MultiIndex (item, major axis)
# and whose columns are the minor axis.

# An empty "panel"
empty_index = pd.MultiIndex.from_product([[], []], names=['item', 'major'])
print('Null Panel:\n', pd.DataFrame(index=empty_index))
# From a 3-D ndarray: fold the first two axes into the row MultiIndex
data = np.random.rand(3, 4, 5)
n_items, n_major, n_minor = data.shape
panel_index = pd.MultiIndex.from_product(
    [range(n_items), range(n_major)], names=['item', 'major'])
panel_like = pd.DataFrame(data.reshape(n_items * n_major, n_minor),
                          index=panel_index)
print('3D narray:\n', panel_like)
# From a dict of DataFrames: the dict keys become the outer index level
data = {'Item1': pd.DataFrame(np.random.randn(4, 4)),
        'Item2': pd.DataFrame(np.random.randn(4, 5))}
panel_from_dict = pd.concat(data, names=['item', 'major'])
print('DataFrame的Dict:\n', panel_from_dict)

"""
描述性统计
"""
# Sample table for the descriptive statistics below.  It is named `profile`
# so that the builtin `dict` is not shadowed.
profile = {'Name': pd.Series(['Tom', 'James', 'Ricky', 'Vin', 'Steve', 'Minsu', 'Jack',
                              'Lee', 'David', 'Gasper', 'Betina', 'Andres']),
           'Age': pd.Series([25, 26, 25, 23, 30, 29, 23, 34, 40, 30, 51, 46]),
           'Rating': pd.Series([4.23, 3.24, 3.98, 2.56, 3.20, 4.6, 3.8, 3.78, 2.98, 4.80, 4.10, 3.65])}

dataFrame = pd.DataFrame(profile)
# Sum of every column
print('sum:\n', dataFrame.sum())
# Mean, standard deviation and median are only defined for numbers.
# numeric_only=True skips the Name column explicitly; pandas >= 2.0 no longer
# drops non-numeric columns silently and raises without it.
mean_values = dataFrame.mean(numeric_only=True)
print('avg:\n', mean_values)
# Standard deviation
print('std:\n', dataFrame.std(numeric_only=True))
# Number of non-null observations
print('count:\n', dataFrame.count())
# Median
print('median:\n', dataFrame.median(numeric_only=True))
# Mode
print('mode:\n', dataFrame.mode())
# Cumulative sum
print('cumsum:\n', dataFrame.cumsum())
# Cumulative product
print('cumprod:\n', dataFrame['Age'].cumprod())
# Summary statistics for all columns, numeric or not
print('describe:\n', dataFrame.describe(include='all'))

"""
函数应用
"""
# Table-wise function application: pipe()
def adder(elem1, elem2):
    """Return ``elem1 + elem2``.

    Used below with ``DataFrame.pipe`` to add a constant to a whole table.
    """
    return elem1 + elem2


dataFrame = pd.DataFrame(np.random.randn(10, 3), columns=['col1', 'col2', 'col3'])
# pipe() returns a new object and leaves dataFrame unchanged, so it is called
# once, where its result is actually used.
print('pipe:\n', dataFrame.pipe(adder, 10))
# Row- or column-wise function application: apply()
print('apply:\n', dataFrame.apply(np.median))
# Element-wise function application: Series.map() calls the lambda per value.
# The lambda given to apply() receives one column at a time; multiplying the
# column by 100 produces the same element-wise result.
print('map:\n', dataFrame['col1'].map(lambda x: x * 100))
print('applymap:\n', dataFrame.apply(lambda x: x * 100))

"""
重建索引
"""
# Two random tables with the same columns but a different number of rows
column_names = ['col1', 'col2', 'col3']
df1 = pd.DataFrame(np.random.randn(10, 3), columns=column_names)
df2 = pd.DataFrame(np.random.randn(7, 3), columns=column_names)
# reindex_like() aligns one object to the labels of another.  Fill methods:
# pad/ffill (forward fill), bfill/backfill (backward fill), nearest.
df1 = df1.reindex_like(df2)
print('reindex_like:\n', df1)
forward_filled = df2.reindex_like(df1, method='ffill')
print('ffill:\n', forward_filled)
# `limit` caps how many consecutive labels may be filled while reindexing
nearest_filled = df2.reindex_like(df1, method='nearest', limit=2)
print('limit:\n', nearest_filled)
# Renaming columns
renamed = df1.rename(columns={'col1': 'c1', 'col2': 'c2'})
print('rename:\n', renamed)

"""
迭代
"""
SIZE = 20

dataFrame = pd.DataFrame({
    'A': pd.date_range(start='2016-01-01', periods=SIZE, freq='D'),
    'x': np.linspace(0, stop=SIZE - 1, num=SIZE),
    'y': np.random.rand(SIZE),
    'C': np.random.choice(['Low', 'Medium', 'High'], SIZE).tolist(),
    'D': np.random.normal(100, 10, size=(SIZE)).tolist()
    })

# items(): iterate over (column label, column as a Series) pairs.
# It replaces iteritems(), which was removed in pandas 2.0.
print('iteritems:')
for key, value in dataFrame.items():
    print(key, value)

# iterrows(): iterate over (index label, row as a Series) pairs.
# The loop runs over the small frame built for this demonstration.
print('iterrow:')
dataFramea = pd.DataFrame(np.random.randn(5, 3), columns=['col1', 'col2', 'col3'])
for row_index, row in dataFramea.iterrows():
    print(row_index, row)

# itertuples(): iterate over rows as named tuples; the first element is the
# row's index label and the remaining elements are the row values.
print('itertuples:')
for row in dataFrame.itertuples():
    print(row)

"""
排序
"""
# A random table whose row labels and column labels are both out of order
shuffled_labels = [1, 4, 6, 2, 3, 5, 9, 8, 0, 7]
unsorted_df = pd.DataFrame(np.random.randn(10, 2),
                           index=shuffled_labels,
                           columns=['col2', 'col1'])
print(unsorted_df)
# sort_index(): sort by label (descending here)
by_label_desc = unsorted_df.sort_index(ascending=False)
print('sort_index:\n', by_label_desc)
# sort_values(): sort by the values of a column
by_col2 = unsorted_df.sort_values(by='col2')
print('sort_values:\n', by_col2)
# `kind` selects the sorting algorithm: mergesort (the only stable one),
# heapsort or quicksort
by_label_heapsort = unsorted_df.sort_index(kind='heapsort')
print('sort algorithm:\n', by_label_heapsort)

"""
字符串和文本数据
"""
# String methods available through the ``.str`` accessor of a Series/Index:
#  1. lower()              convert the strings to lowercase
#  2. upper()              convert the strings to uppercase
#  3. len()                length of each string
#  4. strip()              strip whitespace (including newlines) from both sides
#  5. split(' ')           split each string on the given pattern
#  6. cat(sep=' ')         concatenate the elements with the given separator
#  7. get_dummies()        DataFrame of one-hot encoded values
#  8. contains(pattern)    True for each element that contains the pattern
#  9. replace(a, b)        replace value a with value b
# 10. repeat(value)        repeat each element the given number of times
# 11. count(pattern)       number of occurrences of the pattern in each element
# 12. startswith(pattern)  True when the element starts with the pattern
# 13. endswith(pattern)    True when the element ends with the pattern
# 14. find(pattern)        position of the first occurrence of the pattern
# 15. findall(pattern)     list of all occurrences of the pattern
# 16. swapcase()           swap upper and lower case
# 17. islower()            whether all characters of each string are lowercase
# 18. isupper()            whether all characters of each string are uppercase
# 19. isnumeric()          whether all characters of each string are numeric
strings = pd.Series(['Tim ', ' Rick', 'Joson', 'Albert'])
one_hot = strings.str.get_dummies()
print('get_dummies:\n', one_hot)
matches = strings.str.findall('e')
print('findall:\n', matches)
numeric_flags = strings.str.isnumeric()
print('isnumeric:\n', numeric_flags)

"""
选项和自定义
"""
# get_option(param): read the current value of an option
for caption, option_name in (
    ('display max rows:', 'display.max_rows'),
    ('display max columns:', 'display.max_columns'),
):
    print(caption, pd.get_option(option_name))

# set_option(param, value): assign a new value to an option
pd.set_option('display.max_rows', 90)
print('display max rows: ', pd.get_option('display.max_rows'))

# reset_option(param): restore an option to its default value
pd.reset_option('display.max_rows')
print('display max rows: ', pd.get_option('display.max_rows'))

# describe_option(param): print the description of an option
print('description:')
pd.describe_option('display.max_rows')

# option_context(): context manager that sets options only inside the block
print('option_context:')
with pd.option_context('display.max_rows', 10):
    for option_name in (
        'display.max_rows',           # maximum number of rows
        'display.max_columns',        # maximum number of columns
        'display.max_colwidth',       # maximum column width
        'display.precision',          # decimal precision
        'display.expand_frame_repr',  # whether wide frames stretch across the page
    ):
        print(pd.get_option(option_name))

"""
索引与数据选择
"""
# 1. .loc  - label-based selection
# 2. .iloc - integer-position-based selection
# The mixed label/position indexer .ix was deprecated in pandas 0.20 and
# removed in 1.0.  On this frame's default integer labels .ix[:5] was resolved
# by label, so .loc[:5] selects the same rows: labels 0 through 5, inclusive.
dataFrame = pd.DataFrame(np.random.randn(10, 4), columns=['A', 'B', 'C', 'D'])
first_rows = dataFrame.loc[:5]
print(first_rows)

"""
统计函数
"""
# pct_change(): compare every element with the previous one and compute the
# percentage change
df = pd.DataFrame(np.random.randn(5, 2))
print('pct_change:\n', df.pct_change())

# Covariance between two Series (the Series here have 10 and 30 elements)
seri1 = pd.Series(np.random.randn(10))
seri2 = pd.Series(np.random.randn(30))
print('cov:\n', seri1.cov(seri2))

# Correlation between two columns
frame = pd.DataFrame(np.random.randn(20, 5), columns=['a', 'b', 'c', 'd', 'e'])
print('相关性：\n', frame['a'].corr(frame['c']))

# Ranking.  Call np.random.randn directly: numpy.random has no `np` attribute
# on current NumPy releases.
s = pd.Series(np.random.randn(5), index=list('abcde'))
s['d'] = s['e']  # create a tie so that two entries share a rank
print('rank:\n', s.rank())


"""
合并/连接
"""
# Two tables that share the join key `subject_id`
left = pd.DataFrame({
    'id': [1, 2, 3, 4, 5],
    'Name': ['Alex', 'Amy', 'Allen', 'Alice', 'Ayoung'],
    'subject_id': ['sub1', 'sub2', 'sub4', 'sub6', 'sub5'],
})
right = pd.DataFrame({
    'id': [1, 2, 3, 4, 5],
    'Name': ['Billy', 'Brian', 'Bran', 'Bryce', 'Betty'],
    'subject_id': ['sub2', 'sub4', 'sub3', 'sub6', 'sub5'],
})
# One merge per join strategy, written with the DataFrame.merge method
left_merge = left.merge(right, how='left', on='subject_id')
right_merge = left.merge(right, how='right', on='subject_id')
outer_merge = left.merge(right, how='outer', on='subject_id')
inner_merge = left.merge(right, how='inner', on='subject_id')
for caption, merged in (
    ('left:\t', left_merge),
    ('right:\t', right_merge),
    ('outer:\t', outer_merge),
    ('inner:\t', inner_merge),
):
    print(caption, merged)

"""
级联
"""
# pd.concat(objs, axis, join, ignore_index) stacks the given objects
one_data = {
    'Name': ['Alex', 'Amy', 'Allen', 'Alice', 'Ayoung'],
    'subject_id': ['sub1', 'sub2', 'sub4', 'sub6', 'sub5'],
    'Marks_scored': [98, 90, 87, 69, 78],
}
two_data = {
    'Name': ['Billy', 'Brian', 'Bran', 'Bryce', 'Betty'],
    'subject_id': ['sub2', 'sub4', 'sub3', 'sub6', 'sub5'],
    'Marks_scored': [89, 80, 79, 97, 88],
}
one = pd.DataFrame(one_data, index=[1, 2, 3, 4, 5])
two = pd.DataFrame(two_data, index=[1, 2, 3, 4, 5])
# ignore_index=True discards the original row labels and renumbers the rows
str_concat = pd.concat((one, two), ignore_index=True)
print('级联:\n', str_concat)

""""
时间序列
"""
# Current time.  The pd.datetime alias was deprecated in pandas 1.0 and later
# removed, so pd.Timestamp.now() is used instead.
print('time now:\n', pd.Timestamp.now())
# Create a timestamp
print('创建时间戳：\n', pd.Timestamp('2018-11-11'))
# Convert to timestamps.  The strings use different separators, so each one
# is parsed on its own: pandas >= 2.0 infers a single format from the first
# element of a list and rejects the others.  None becomes NaT.
raw_dates = ['2018/11/23', '2010.12.31', None]
parsed_dates = pd.to_datetime([pd.to_datetime(value) for value in raw_dates])
print('转换时间戳：\n', parsed_dates)
# Change the frequency: one entry per hour.  A Timedelta is passed as the
# frequency to avoid the deprecated "H" alias.
hourly_times = pd.date_range("12:00", "19:59", freq=pd.Timedelta(hours=1)).time
print('改变频率：\n', hourly_times)
# Time difference
time_delta = pd.Timedelta('60 days 11 hours 33 minutes 30 seconds')
print('时间差:\n', time_delta)

"""
绘图
"""
# Ten days of random values in five columns, used for every chart below
plot_dates = pd.date_range('2018/12/16', periods=10)
df = pd.DataFrame(np.random.randn(10, 5), index=plot_dates, columns=list('ABCDE'))
# Default plot
df.plot()
# Bar chart
df.plot.bar()
# Histogram
df.plot.hist()
# Box plot
df.boxplot()
# Display all figures created above
plt.show()


""""
IO工具
"""
# Signature (abridged): pandas.read_csv(filepath_or_buffer, sep=',',
#     delimiter=None, header='infer', names=None, index_col=None, usecols=None)
# Read the CSV file `read.csv` from the current working directory and print it
file = pd.read_csv(filepath_or_buffer='read.csv')
print(file)

how的参数

left
使用左侧对象的键；
right
使用右侧对象的键；
outer
使用键的联合；
inner
使用键的交集

# --*--coding:utf-8--*--
import pandas as pd
# Two small tables that share the join key `subject_id`
left = pd.DataFrame({
    'id': [1, 2, 3, 4, 5],
    'Name': ['Alex', 'Amy', 'Allen', 'Alice', 'Ayoung'],
    'subject_id': ['sub1', 'sub2', 'sub4', 'sub6', 'sub5'],
})
right = pd.DataFrame({
    'id': [1, 2, 3, 4, 5],
    'Name': ['Billy', 'Brian', 'Bran', 'Bryce', 'Betty'],
    'subject_id': ['sub2', 'sub4', 'sub3', 'sub6', 'sub5'],
})
# Run the same merge once for each value of `how`:
#   left  - use the keys of the left table
#   right - use the keys of the right table
#   outer - use the union of the keys
#   inner - use the intersection of the keys
left_merge, right_merge, outer_merge, inner_merge = (
    pd.merge(left, right, on='subject_id', how=how)
    for how in ('left', 'right', 'outer', 'inner')
)
print('left:\t', left_merge)
print('right:\t', right_merge)
print('outer:\t', outer_merge)
print('inner:\t', inner_merge)