数据合并merge

#参数解释
merge(
        self,
        right: DataFrame | Series,
        how: str = "inner",#连接方式:‘inner’(默认);还有,‘outer’、‘left’、‘right’ 
        on: IndexLabel | None = None,#用于连接的列名,必须同时存在于左右两个DataFrame对象中,如果位指定,则以left和right列名的交集作为连接键 
        left_on: IndexLabel | None = None,#左侧DataFarme中用作连接键的列 
        right_on: IndexLabel | None = None,#右侧DataFarme中用作连接键的列
        left_index: bool = False,#将左侧的行索引用作其连接键
        right_index: bool = False,#将右侧的行索引用作其连接键
        sort: bool = False,
        suffixes: Suffixes = ("_x", "_y"),#字符串值元组,用于追加到重叠列名的末尾,默认为(‘_x’,‘_y’).例如,左右两个DataFrame对象都有‘data’,则结果中就会出现‘data_x’,‘data_y’ 
        copy: bool = True,
        indicator: bool = False,#会在结果中新增加一列来标明合并的结果是左右两边,还是仅在左边或仅在右边。参数为True时,新增列的标签为“_merge”当indicator=字符串时,返回的结果为字符串
        validate: str | None = None,
    )
  • merge的5种拼接方式inner/left/outer/right/cross

python的dataframe需要进行merge操作很慢_数据

a = {"name":["lemon","jack","peter","Emma","james"],
     "city":["长沙","上海","深圳","北京","北京"],
     "a":[80,90,60,73,89],
     "b":[80,75,80,85,83],
     "c":[70,75,80,73,62]}
df1 = pd.DataFrame(data=a,index=["d","e","f","g","h"])
a2 = {"name":["lemon","jack","peter","jacob",],
      "city":["长沙","上海","深圳","北京"],
      "e":[85,92,70,83],
      "f":[83,75,82,87]}
df2 = pd.DataFrame(data=a2,index=["e","f","g","h"])

df1.merge(df2,left_on="name",right_on="name")#2个数据框中的“name”列中有相同元素值得数据行参与拼接,其他数据行不
# 参与拼接,同时在结果中其他列如有相同的列名称,则会以添加后缀的方式进行重命名,同时新的数据框的行索引标签会根据
# 行的数量进行重置

"""
#输出结果:
	name	city_x	a	b	c	city_y	e	f
0	lemon	长沙	80	80	70	长沙	85	83
1	jack	上海	90	75	75	上海	92	75
2	peter	深圳	60	80	80	深圳	70	82
"""

df1.merge(df2,left_on="name",right_on="name",how="outer")#并集方式呈现
"""
#输出结果
	name	city_x	a	b	c	city_y	e	f
0	lemon	长沙	80.0	80.0	70.0	长沙	85.0	83.0
1	jack	上海	90.0	75.0	75.0	上海	92.0	75.0
2	peter	深圳	60.0	80.0	80.0	深圳	70.0	82.0
3	Emma	北京	73.0	85.0	73.0	NaN	NaN	NaN
4	james	北京	89.0	83.0	62.0	NaN	NaN	NaN
5	jacob	NaN	NaN	NaN	NaN	北京	83.0	87.0
"""

df1.merge(df2,left_on="name",right_on="name",how="left")#只保留第一个的
"""
#输出结果
	name	city_x	a	b	c	city_y	e	f
0	lemon	长沙	80	80	70	长沙	85.0	83.0
1	jack	上海	90	75	75	上海	92.0	75.0
2	peter	深圳	60	80	80	深圳	70.0	82.0
3	Emma	北京	73	85	73	NaN	NaN	NaN
4	james	北京	89	83	62	NaN	NaN	NaN
"""

df1.merge(df2,how="cross")#此时不需要设置left_on与right_on,是将左右2个数据框的数据行以乘积的方式分别进行匹配
"""
name_x	city_x	a	b	c	name_y	city_y	e	f
0	lemon	长沙	80	80	70	lemon	长沙	85	83
1	lemon	长沙	80	80	70	jack	上海	92	75
2	lemon	长沙	80	80	70	peter	深圳	70	82
3	lemon	长沙	80	80	70	jacob	北京	83	87
4	jack	上海	90	75	75	lemon	长沙	85	83
5	jack	上海	90	75	75	jack	上海	92	75
6	jack	上海	90	75	75	peter	深圳	70	82
7	jack	上海	90	75	75	jacob	北京	83	87
8	peter	深圳	60	80	80	lemon	长沙	85	83
9	peter	深圳	60	80	80	jack	上海	92	75
10	peter	深圳	60	80	80	peter	深圳	70	82
11	peter	深圳	60	80	80	jacob	北京	83	87
12	Emma	北京	73	85	73	lemon	长沙	85	83
13	Emma	北京	73	85	73	jack	上海	92	75
14	Emma	北京	73	85	73	peter	深圳	70	82
15	Emma	北京	73	85	73	jacob	北京	83	87
16	james	北京	89	83	62	lemon	长沙	85	83
17	james	北京	89	83	62	jack	上海	92	75
18	james	北京	89	83	62	peter	深圳	70	82
19	james	北京	89	83	62	jacob	北京	83	87
"""

#也可以设置“on”参数
df1.merge(df2,on="name")
"""
	name	city_x	a	b	c	city_y	e	f
0	lemon	长沙	80	80	70	长沙	85	83
1	jack	上海	90	75	75	上海	92	75
2	peter	深圳	60	80	80	深圳	70	82
"""
#“on”参数也可以是列表
df1.merge(df2,on=["name","city"])#有图可知name与city中的重复值已经完全合并

"""
	name	city	a	b	c	e	f
0	lemon	长沙	80	80	70	85	83
1	jack	上海	90	75	75	92	75
2	peter	深圳	60	80	80	70	82
"""

#当结果中有列索引标签重复的数据列时,可以设置参数suffixes来对列索引标签添加后缀
df1.merge(df2,on = "name",suffixes=["_left","_right"])
"""
	name	city_left	a	b	c	city_right	e	f
0	lemon	长沙	80	80	70	长沙	85	83
1	jack	上海	90	75	75	上海	92	75
2	peter	深圳	60	80	80	深圳	70	82
"""

#如果有列索引标签重复时,可以设置参数suffixes为False,则会报错
df1.merge(df2,on = "name",suffixes=[False,False])
"""
df1.merge(df2,on = "name",suffixes=[False,False])
"""

#indicator参数,会在结果中新增加一列来标明合并的结果是左右两边,还是仅在左边或仅在右边。参数为True时,新增列的标签为“_merge”
df1.merge(df2,on = "name",how="outer",indicator=True)
"""

name	city_x	a	b	c	city_y	e	f	_merge
0	lemon	长沙	80.0	80.0	70.0	长沙	85.0	83.0	both
1	jack	上海	90.0	75.0	75.0	上海	92.0	75.0	both
2	peter	深圳	60.0	80.0	80.0	深圳	70.0	82.0	both
3	Emma	北京	73.0	85.0	73.0	NaN	NaN	NaN	left_only
4	james	北京	89.0	83.0	62.0	NaN	NaN	NaN	left_only
5	jacob	NaN	NaN	NaN	NaN	北京	83.0	87.0	right_only
"""

#当indicator=字符串时,返回的结果为字符串
df1.merge(df2,on = "name",how="outer",indicator="重复的列")
"""
name	city_x	a	b	c	city_y	e	f	重复的列
0	lemon	长沙	80.0	80.0	70.0	长沙	85.0	83.0	both
1	jack	上海	90.0	75.0	75.0	上海	92.0	75.0	both
2	peter	深圳	60.0	80.0	80.0	深圳	70.0	82.0	both
3	Emma	北京	73.0	85.0	73.0	NaN	NaN	NaN	left_only
4	james	北京	89.0	83.0	62.0	NaN	NaN	NaN	left_only
5	jacob	NaN	NaN	NaN	NaN	北京	83.0	87.0	right_only
"""

#merge还有另外一种形式为pandas.merge,下案例使用pd.merge,将左右两个数据框作为参数在函数中设置得到的结果与dataframe.merge是一样的
pd.merge(df1,df2,on=["name","city"],how="outer")
"""
	name	city	a	b	c	e	f
0	lemon	长沙	80.0	80.0	70.0	85.0	83.0
1	jack	上海	90.0	75.0	75.0	92.0	75.0
2	peter	深圳	60.0	80.0	80.0	70.0	82.0
3	Emma	北京	73.0	85.0	73.0	NaN	NaN
4	james	北京	89.0	83.0	62.0	NaN	NaN
5	jacob	北京	NaN	NaN	NaN	83.0	87.0
"""

#参数validate可以设置对齐方式,其值可以是“one_to_one”、“one_to_many”、“many_to_one”、“many_to_many”

pd.merge(df1,df2,on="city",how="outer",validate="many_to_one")

"""
name_x	city	a	b	c	name_y	e	f
0	lemon	长沙	80	80	70	lemon	85	83
1	jack	上海	90	75	75	jack	92	75
2	peter	深圳	60	80	80	peter	70	82
3	Emma	北京	73	85	73	jacob	83	87
4	james	北京	89	83	62	jacob	83	87
"""

pd.merge(df1,df2,on="city",how="outer")
"""
	name_x	city	a	b	c	name_y	e	f
0	lemon	长沙	80	80	70	lemon	85	83
1	jack	上海	90	75	75	jack	92	75
2	peter	深圳	60	80	80	peter	70	82
3	Emma	北京	73	85	73	jacob	83	87
4	james	北京	89	83	62	jacob	83	87
"""

#例子data3中"bought_by":[1,1,3]},[1,1,3]为data4中的行索引,将2个数据框连接起来
data3 = {"product":["computer","phone","TV"],"bought_by":[1,1,3]}
df3 = pd.DataFrame(data3,index=["A","B","C"])
data4 = {"name":["lemmon","jack","emma"],"age":[23,20,32]}
df4 = pd.DataFrame(data4,index=[1,2,3])


df3.merge(df4,how="inner",right_index=True,left_on="bought_by")#left_on 右侧DataFarme中用作连接键的列,right_index=True将右侧的行索引用作其连接键 

"""
	product	bought_by	name	age
A	computer	1	lemmon	23
B	phone	1	lemmon	23
C	TV	3	emma	32
"""

记录学习的点点滴滴