代码
list_ZFI077 = df_1.columns.tolist()
df_ZFI077 = df_1.columns.to_frame(name= "列名") # 先不重置索引 drop依据索引
df_ZFI077_1 = df_ZFI077.copy().reset_index(drop= True) # index.to_frame()后需重置索引 方便赋值
for i,j in enumerate(list_ZFI077):
if df_ZFI077.列名.drop(j).str.contains(j).any():
df_ZFI077_1.loc[i,"列名"] = j + "th"
df_ZFI077_1.列名.str.contains("/").to_frame("判断").join(df_ZFI077).query("判断")
seri_ZFI077 = df_ZFI077_1.列名.str.replace("/", "")
'''检查'''
if seri_ZFI077.str.contains("/").any() == False: # 不能 is False
print("无'/'")
dict_ZFI077 = {}
for j in seri_ZFI077: # 若使用list_ZFI077需替换"/" 即 [ i.replace("/", "") for i in list_ZFI077 ] 或参考 https://blog.51cto.com/u_14246112/3140471
if seri_ZFI077.drop(
index= seri_ZFI077.to_frame("列名").query("列名 == @j").index
).str.contains(j).any():
print(j)
else:
dict_ZFI077[j] = "无重复"
if set(dict_ZFI077.values()) == set(["无重复"]): # 不能写 set("无重复") 会将字符串拆成单个字符
print("无重复")
背景
# 问题4 关键词有重复部分 例如 '采购入库数量' '无采购入库数量'
# 解决4.1 函数改进
# 解决4.2 源数据调整
# 思路4.2.1 将重复的识别出来
# 思路4.2.2 将重复的修改命名
# 4.2.2.1 列标题替换 Series.replace
# 4.2.2.2 长字符串内替换 Series.str.replace
# 4.2.2.2.1 单个字符串
# 4.2.2.2.2 +组合字符串
# 4.2.2.2.2.1 +可能在前可能在后 避免重复替换 新方法
# 4.3 特殊性大于通用性时 特殊做法
# 问题5 '/'替换为''
相关笔记 重要性从大到小顺序
https://blog.51cto.com/u_16055028/6316988
https://blog.51cto.com/u_16055028/6317062
https://blog.51cto.com/u_16055028/6309992
https://blog.51cto.com/u_16055028/6317193
应用
df_1.columns, df_2.columns = seri_ZFI077, seri_ZFI077
list_th = df_ZFI077_1.query("列名.str.contains('th')").列名.to_list()
dict_th = {}
for i in list_th:
dict_th[i.replace("th","")] = i
dict_th
seri = df_0.iloc[2] # 公式模板
list_seri = seri.tolist()
list_spli = [ i.split('+') for i in list_seri] # 根据"+"拆分
df_spli = pd.DataFrame(list_spli) # 将二维列表转化为表格
df_spli.index = seri.index
df_spli.columns = [ "列名"+str(i) for i in range(df_spli.shape[1]) ]
df_sprp = df_spli.copy()
for i in range(df_spli.shape[1]):
seri_colu = df_spli.iloc[:,i].str.replace("/","")
df_sprp.iloc[:,i] = seri_colu
df_rprp = df_sprp.copy()
for i in dict_th:
df_rprp = df_rprp.replace(i, dict_th[i])
df_rprp = df_rprp.replace("-"+i, "-"+dict_th[i])
# '''检查''' # 注意replace是完整替换 str.replace是字符替换
# for i in range(df_rprp.shape[1]):
# print(df_rprp.iloc[:,i].str.contains("th").any())
list_rprp = []
for i in df_rprp.index:
df_row = df_rprp.loc[i].astype(str).to_frame(name= "列名").query("列名!= 'None'")
list_row = df_row.列名.to_list()
list_rprp.append(list_row)
seri_form = seri.copy()
for i,j in enumerate(list_rprp):
str_form = "df_1." + "+df_1.".join(j)
seri_form.iloc[i] = str_form
seri_rp = seri_form.str.replace("df_1.-", "-df_1.", regex= False)
df_nebu_2030 = pd.DataFrame(index= range(df_1.shape[0]), columns= seri.index)
seri_rp.iloc[-2:] = seri_rp.iloc[-2:].str.replace("df_1","df_nebu_2030")
for i in seri_rp.index:
val_ = eval(seri_rp.loc[i])
df_nebu_2030[i] = val_
历史记录(错误思路)
seri = df_0.iloc[2]
for i in list_chfu:
for j in seri:
if i in j:
# 出现几次
# 每次在第几个
list_seri = seri.tolist() # 公式
list_spli = [ i.split('+') for i in list_seri] # +拆分
df_nebu_2030 = pd.DataFrame() # 新表格
for i in range(df.shape[1]):
colu_ = seri.index[i] # 新列名
str_ = seri.iloc[i] # 公式
coun_ = str_.count('+') + 1 # 计数
form_ = ''.join(["+df_1{}" ] * coun_) # 格式
list_j = [] # 定位
for j,k in enumerate(list_spli[i]):
if '-' in k:
list_j.append(j)
form_ = sub_(string= form_, position= list_j) # 替换
list_str = str_.replace('-','').split('+') # 替换
form_ = sub_mul_loo(string= form_, pattern= '{}', list_str= list_str, str_add= '.') # 替换
exec("df_nebu_2030['{}'] = {}".format(colu_, form_)) # 生成
def qixi_():
'''提取组合字符串'''
seri = df_0.iloc[2]
'''提取列名'''
colu_ZFI077 = df_1.columns
seri_ZFI077 = pd.Series(colu_ZFI077)
'''查找重复元素'''
list_chfu = []
for i in colu_ZFI077:
for j in colu_ZFI077:
if (i in j) and (i != j):
list_chfu.append([i,j])
'''替换重复元素'''
df_chfu = pd.DataFrame(list_chfu)
list_chfu = pd.DataFrame(list_chfu).iloc[:,0].tolist()
list_tihu = [k+'th' for k in list_chfu]
dict_tihu = dict(zip(list_chfu, list_tihu))
seri_th = seri_ZFI077.replace(dict_tihu)
'''用'th'来修改列名 应该不会有新的重复了'''
df_1.columns = seri_th
df_2.columns = seri_th
'''组合字符串内部的替换'''
'''非组合字符串替换 replace'''
seri = seri.replace(dict_tihu)
'''组合字符串替换 str.replace'''
for i in list_chfu:
for j in seri:
qixi_()