0 Data Analysis and Preparation -- Meeting and Discussion


1 Data Cleaning and Format Conversion

import warnings
warnings.filterwarnings('ignore')  # suppress warnings
  • Step 1: Load the CSV with pandas and take a first look. The dataset has 3,333 rows and 21 columns; the last column is the class label.
import pandas as pd
import numpy as np
# Load the dataset
churn_df = pd.read_csv('churn.csv')
col_names = churn_df.columns.tolist()  # list all column names

print("Column names:")
print(col_names)
Column names:
['State', 'Account Length', 'Area Code', 'Phone', "Int'l Plan", 'VMail Plan', 'VMail Message', 'Day Mins', 'Day Calls', 'Day Charge', 'Eve Mins',
'Eve Calls', 'Eve Charge', 'Night Mins', 'Night Calls', 'Night Charge', 'Intl Mins', 'Intl Calls', 'Intl Charge', 'CustServ Calls', 'Churn?']
  • Step 2: Basic information and column types
to_show = col_names[:6] + col_names[-6:]  # the first 6 and last 6 columns

print("\nSample data:")
churn_df[to_show].head(6)
Sample data:

[Output: first 6 rows of the selected columns]

 

 

churn_df.info()

[Output: churn_df.info() summary -- 3333 entries, 21 columns]

 

 

 

churn_df.describe()
# describe() returns summary statistics for every numeric column:
# count, mean, std, min, the 25%/50%/75% quantiles, and max.
# The count row also lets you see how many values (and what share) are missing (NA).
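
To get the missing-value counts and shares explicitly rather than inferring them from describe(), a minimal sketch (assuming churn_df is loaded as above):

# Number and share of missing values per column
na_count = churn_df.isnull().sum()
print(pd.concat([na_count, na_count / len(churn_df)], axis=1, keys=['na_count', 'na_ratio']))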

2 Exploratory Data Analysis

# First look at the churn ratio and the distribution of customer-service calls
import matplotlib.pyplot as plt  # plotting
%matplotlib inline

fig = plt.figure()
fig.set(alpha=0.3)  # set the figure's alpha parameter
# subplot2grid(shape, loc)
plt.subplot2grid((1,2),(0,0))  # 1-row-by-2-column grid; this axes at row 0, column 0

# bar: bar chart
churn_df['Churn?'].value_counts().plot(kind='bar')  # counts of churned vs. retained customers
plt.title("stat for churn")
plt.ylabel("number")  # of the 3,333 customers, roughly 2,700 retained and about 500 churned


plt.subplot2grid((1,2),(0,1))
churn_df['CustServ Calls'].value_counts().plot(kind='bar')  # customers who call support a lot may be more likely to churn
plt.title("stat for cusServCalls")
plt.ylabel("number")  # about 1,400 customers made exactly one service call; the bars sum to 3,333

plt.show()

[Figure: bar charts of churn counts (left) and customer-service-call counts (right)]

 

 

import matplotlib.pyplot as plt

%matplotlib inline
fig = plt.figure()
fig.set(alpha=0.2)  # set the figure's alpha parameter

plt.subplot2grid((1,3),(0,0))  # several subplots in one figure
churn_df['Day Mins'].plot(kind='kde')  # daytime minutes, as a kernel density estimate
plt.xlabel(u"Mins")  # x-axis: minutes
plt.ylabel(u"density")
plt.title(u"dis for day mins")


plt.subplot2grid((1,3),(0,1))
churn_df['Day Calls'].plot(kind='kde')  # number of daytime calls
plt.xlabel(u"call")
plt.ylabel(u"density")
plt.title(u"dis for day calls")

plt.subplot2grid((1,3),(0,2))
churn_df['Day Charge'].plot(kind='kde')  # daytime charges
plt.xlabel(u"Charge")
plt.ylabel(u"density")
plt.title(u"dis for day charge")

plt.show()

[Figure: KDE distributions of Day Mins, Day Calls, and Day Charge]

 

 

#import matplotlib.pyplot as plt

fig = plt.figure()
fig.set(alpha=0.2)  # set the figure's alpha parameter

# Relationship between churn and the international plan
int_yes = churn_df['Churn?'][churn_df['Int\'l Plan'] == 'yes'].value_counts()  # customers on the international plan
int_no = churn_df['Churn?'][churn_df['Int\'l Plan'] == 'no'].value_counts()    # customers not on the plan

# Build a DataFrame so the legend labels (top right) come out correctly
df_int=pd.DataFrame({u'int plan':int_yes, u'no int plan':int_no})

df_int.plot(kind='bar', stacked=True)
plt.title(u"statistic between int plan and churn")
plt.xlabel(u"int or not") 
plt.ylabel(u"number")

plt.show()

# In the figure below, out of 3,333 rows: the False bar (retained, ~2,700 customers)
# contains roughly 100 on the international plan and ~2,600 not on it;
# the True bar (churned, ~400 customers) contains roughly 100 on the plan and ~300 not.
# Conclusion: customers on the international plan churn at a noticeably higher rate.

[Figure: stacked bar chart of churn vs. international plan]
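
To put a number on that conclusion, a minimal sketch (assuming churn_df from above) that computes the churn rate within each plan group:

# Churn rate within each international-plan group (each row sums to 1)
print(pd.crosstab(churn_df["Int'l Plan"], churn_df['Churn?'], normalize='index'))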

 

 

# Relationship between customer-service calls and churn
fig = plt.figure()
fig.set(alpha=0.2)  # set the figure's alpha parameter

cus_0 = churn_df['CustServ Calls'][churn_df['Churn?'] == 'False.'].value_counts()  # call counts for retained customers
cus_1 = churn_df['CustServ Calls'][churn_df['Churn?'] == 'True.'].value_counts()   # call counts for churned customers

df=pd.DataFrame({u'churn':cus_1, u'retain':cus_0})
df.plot(kind='bar', stacked=True)
plt.title(u"Statistic between customer service call and churn")
plt.xlabel(u"Call service")  # number of service calls
plt.ylabel(u"Num")           # churned + retained total per call count

plt.show()

# At 3 service calls there are ~400 customers: ~300 retained, ~100 churned.
# At 4 service calls there are ~180 customers: ~80 retained, ~100 churned --
# once a customer calls support four or more times, churn becomes more likely than not.

[Figure: stacked bar chart of churn vs. number of customer-service calls]
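
The churn rate at each call count can also be computed directly; a minimal sketch (assuming churn_df from above):

# Observed churn rate for each number of customer-service calls
churn_rate = (churn_df['Churn?'] == 'True.').groupby(churn_df['CustServ Calls']).mean()
print(churn_rate)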

 

3 Feature Selection

# Consolidate the label column
ds_result = churn_df['Churn?']

# np.where(condition, x, y): condition is a boolean array; each element selects from x or y

# churned ('True.') becomes 1, retained ('False.') becomes 0
Y = np.where(ds_result == 'True.',1,0) 
# dummy variables == one-hot encoding
dummies_int = pd.get_dummies(churn_df['Int\'l Plan'], prefix='_int\'l Plan')  # prefix names the new columns
# VMail Plan: the voicemail plan
dummies_voice = pd.get_dummies(churn_df['VMail Plan'], prefix='VMail')

# concat: concatenate two or more frames
ds_tmp=pd.concat([churn_df, dummies_int, dummies_voice], axis=1)

# Drop state, area code, phone number, the label, and the original plan columns
to_drop = ['State','Area Code','Phone','Churn?', 'Int\'l Plan', 'VMail Plan']
df = ds_tmp.drop(to_drop,axis=1)

print("after convert ")
df.head(5)

4 Feature Engineering

# The features are on very different scales; a scaler removes the effect of magnitude/units
# Before training, convert the DataFrame to a float ndarray
X = df.values.astype(np.float64)  # df.as_matrix() was removed from pandas; use .values


from sklearn.preprocessing import StandardScaler  # standardization

scaler = StandardScaler()

X = scaler.fit_transform(X)

print("Feature space holds %d observations and %d features" % X.shape)  # 3333 rows x 19 columns
print("---------------------------------")
print("Unique target labels:", np.unique(Y))  # unique label values
print("---------------------------------")
print(len(Y[Y==0]))  # 2850 retained
print("---------------------------------")
print(len(Y[Y==1]))  # 483 churned
Feature space holds 3333 observations and 19 features
---------------------------------
Unique target labels: [0 1]
---------------------------------
2850
---------------------------------
483
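
StandardScaler rescales each column to z = (x - mean) / std; a quick sanity check (assuming the standardized X from above):

# Each standardized column should now have mean ~0 and std ~1
print(np.allclose(X.mean(axis=0), 0), np.allclose(X.std(axis=0), 1))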
# The same preprocessing, consolidated in one place
churn_result = churn_df['Churn?']
y = np.where(churn_result == 'True.',1,0)
to_drop = ['State','Area Code','Phone','Churn?']
churn_feat_space = churn_df.drop(to_drop,axis=1)
yes_no_cols = ["Int'l Plan","VMail Plan"]
churn_feat_space[yes_no_cols] = churn_feat_space[yes_no_cols] == 'yes'  # map yes/no to booleans
features = churn_feat_space.columns
X = churn_feat_space.values.astype(np.float64)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(X)

print("Feature space holds %d observations and %d features" % X.shape)
print("---------------------------------")
print("Unique target labels:", np.unique(y))
print("---------------------------------")
print(X[0])  # first row
print("---------------------------------")
print(len(y[y == 0]))
Feature space holds 3333 observations and 17 features
---------------------------------
Unique target labels: [0 1]
---------------------------------
[ 0.67648946 -0.32758048  1.6170861   1.23488274  1.56676695  0.47664315
  1.56703625 -0.07060962 -0.05594035 -0.07042665  0.86674322 -0.46549436
  0.86602851 -0.08500823 -0.60119509 -0.0856905  -0.42793202]
---------------------------------
2850

5 Building Multiple Baseline Models and Trying Several Algorithms

# A hand-rolled cross-validation loop
from sklearn.model_selection import KFold

def run_cv(X,y,clf_class,**kwargs):
    # Construct a KFold object
    kf = KFold(5,shuffle=True)  # 5 folds
    y_pred = y.copy()  # buffer for the out-of-fold predictions

    # 5 splits in total: train on 4 folds, validate on the remaining one
    for train_index, test_index in kf.split(X):
    
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]
        
        clf = clf_class(**kwargs)
        clf.fit(X_train,y_train)
        y_pred[test_index] = clf.predict(X_test)
    return y_pred
# Try the hand-rolled CV with three classifiers
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression as LR
from sklearn.neighbors import KNeighborsClassifier as KNN

def accuracy(y_true,y_pred):
    
    return np.mean(y_true == y_pred)  # True counts as 1 and False as 0, averaged over all 3,333 samples

print("Support vector machines:")
print("%.3f" % accuracy(y, run_cv(X,y,SVC)))
print("----------------------------")
print("LogisticRegression :")
print("%.3f" % accuracy(y, run_cv(X,y,LR)))
print("----------------------------")
print("K-nearest-neighbors:")
print("%.3f" % accuracy(y, run_cv(X,y,KNN)))
Support vector machines:
0.918
----------------------------
LogisticRegression :
0.861
----------------------------
K-nearest-neighbors:
0.897
# Import the tools
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score,KFold
from sklearn.neighbors import KNeighborsClassifier 
import matplotlib.pyplot as plt


# Initialize the models
models = []
models.append(('KNN', KNeighborsClassifier()))

models.append(('LR', LogisticRegression()))

models.append(('SVM', SVC()))

# Initialize containers
results = []
names = []
scoring = 'accuracy'
for name, model in models:

    kfold = KFold(5,shuffle=True,random_state = 0)  # 5 folds, fixed seed for reproducibility
    cv_results = cross_val_score(model, X, Y, cv=kfold)  # scoring=scoring; the default scorer here is accuracy
    results.append(cv_results)  # per-fold cross-validation scores
    names.append(name)
    # The std shows how much the score fluctuates across folds; smaller means more stable
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
    print("------------------------------")
# boxplot algorithm comparison
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)

plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()

# Takeaway: of these baselines, SVM scores best
KNN: 0.894088 (0.009717)
------------------------------
LR: 0.861384 (0.015574)
------------------------------
SVM: 0.919288 (0.011112)
------------------------------

[Figure: boxplot comparing KNN, LR, and SVM cross-validation accuracy]

6 Model Tuning / Improving the Model

from sklearn.ensemble import RandomForestClassifier as RF
num_trees = 100
max_features = 3
kfold = KFold(n_splits=10, shuffle=True, random_state=7)  # random_state only takes effect with shuffle=True
model = RF(n_estimators=num_trees, max_features=max_features)
results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())
0.9537942133750515
from sklearn.ensemble import GradientBoostingClassifier
seed = 7
num_trees = 100
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)

model = GradientBoostingClassifier(n_estimators=num_trees, random_state=seed)
results = cross_val_score(model, X, Y, cv=kfold)

print(results.mean())
0.9525966085846325
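
The forest above uses hand-picked hyperparameters. For actual tuning, a minimal GridSearchCV sketch; the parameter grid below is illustrative, not tuned values from this post:

from sklearn.model_selection import GridSearchCV, KFold
from sklearn.ensemble import RandomForestClassifier

# Illustrative grid -- widen or narrow it to fit your compute budget
param_grid = {'n_estimators': [50, 100, 200], 'max_features': [3, 5, 'sqrt']}
grid = GridSearchCV(RandomForestClassifier(random_state=7), param_grid,
                    cv=KFold(n_splits=5, shuffle=True, random_state=7))
grid.fit(X, Y)
print(grid.best_params_, grid.best_score_)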

7 Evaluation and Reporting

def run_prob_cv(X, y, clf_class, **kwargs):
    kf = KFold(5, shuffle=True)  # shuffle is keyword-only in current scikit-learn
    y_prob = np.zeros((len(y),2))
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]
        clf = clf_class(**kwargs)
        clf.fit(X_train,y_train)
        
        y_prob[test_index] = clf.predict_proba(X_test)  # per-sample probabilities: P(class 0) and P(class 1)
    return y_prob
import warnings
warnings.filterwarnings('ignore')


pred_prob = run_prob_cv(X, y, RF, n_estimators=10)

pred_churn = pred_prob[:,1]  # keep only P(class 1): the churn probability is what we care about
is_churn = y == 1


counts = pd.Series(pred_churn).value_counts()  # group samples by predicted churn probability: pred_prob -> count


true_prob = {}
for prob in counts.index:
    true_prob[prob] = np.mean(is_churn[pred_churn == prob])  # observed churn rate among samples with this predicted probability
true_prob = pd.Series(true_prob)


counts = pd.concat([counts,true_prob], axis=1).reset_index()
counts.columns = ['pred_prob', 'count', 'true_prob']
counts

[Table: pred_prob, count, and true_prob -- observed churn rate at each predicted probability]
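
The same predicted-vs-observed comparison can be produced with scikit-learn's calibration utilities; a minimal sketch, assuming pred_churn and y from above:

from sklearn.calibration import calibration_curve

# Bin the predictions and compare the mean predicted probability with the observed churn rate per bin
frac_churned, mean_predicted = calibration_curve(y, pred_churn, n_bins=10)
for mp, fc in zip(mean_predicted, frac_churned):
    print("predicted=%.2f  observed=%.2f" % (mp, fc))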