基于生存分析模型的用户流失预测
小O:有没有什么很好的办法在预测用户流失的同时,提供一些建议帮助我们运营呢?
小H:这简单,如果我可以告诉你什么样的人群容易流失、什么时间点容易流失、用户的可能存活多节可以吗?
小O:这太可以了~
生存模型就能很好的地解决上面的问题,生存分析(Survival analysis)是指根据历史数据对人的生存时间进行分析和推断,研究生存情况与众多影响因素间的关系。本文参考自python数据分析案例-利用生存分析Kaplan-Meier法与COX比例风险回归模型进行客户流失分析与剩余价值预测。
数据探索
导入相关库
# pip install -i https://pypi.doubanio.com/simple/ lifelines # 采用镜像安装
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from lifelines import NelsonAalenFitter, CoxPHFitter, KaplanMeierFitter
from lifelines.statistics import logrank_test, multivariate_logrank_test
from sklearn.model_selection import train_test_split, GridSearchCV
from lifelines.utils.sklearn_adapter import sklearn_adapter
from sklearn import metrics
import time
import math
import toad
import itertools
from lifelines.plotting import add_at_risk_counts
from lifelines.statistics import pairwise_logrank_test
from sklearn import preprocessing
from lifelines.calibration import survival_probability_calibration
from sklearn.metrics import brier_score_loss
from lifelines.utils import median_survival_times, qth_survival_times
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
import warnings
warnings.filterwarnings("ignore")
数据准备
# 读取数据
raw_data = pd.read_csv('Customer-Churn.csv')
raw_data.head()
数据预处理
df = raw_data.copy()
# 删除tenure=0(This model does not allow for non-positive durations)
df = df[df['tenure']>0]
# 转换成连续型变量
df['TotalCharges'] = pd.to_numeric(df.TotalCharges, errors='coerce')
# 填充缺失值
df['TotalCharges'].fillna(0, inplace=True)
# 转换成类别变量
df['SeniorCitizen'] = df['SeniorCitizen'].astype("str")
# 将是否流失转换为01分布
df['Churn'] = df['Churn'].map({'No':0,'Yes':1})
生存分析探索
- 查看整体生存曲线
fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(12,10))
# 整体生存曲线
kmf = KaplanMeierFitter()
kmf.fit(df.tenure, # 代表生存时长
event_observed=df.Churn # 代表事件的终点
)
kmf.plot_survival_function(ax=ax[0], label='all')
ax[0].grid()
# 整体生存变化率
naf = NelsonAalenFitter()
naf.fit(df.tenure, event_observed=df.Churn)
naf.plot_hazard(ax=ax[1], bandwidth=20, label='all')
ax[1].grid()
plt.show()
- 截至观察期(最大70个月),整体还有60%的用户未流失。不存在半衰期,即当用户流失达到50%的时间节点
- 0-10个月用户流失加快,50-60个月的用户流失速率也有所提升
# 缩短时间查看前20个月
t = np.linspace(0, 20, 21)
fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(12,10))
# 整体生存曲线
kmf = KaplanMeierFitter()
kmf.fit(df.tenure, # 代表生存时长
event_observed=df.Churn, # 代表事件的终点
timeline=t
)
kmf.plot_survival_function(ax=ax[0], label='all')
# 整体生存变化率
naf = NelsonAalenFitter()
naf.fit(df.tenure, event_observed=df.Churn, timeline=t)
naf.plot_hazard(ax=ax[1], bandwidth=20, label='all')
plt.show()
- 第一个月是用户流失重灾时间
- 前10个月中,前5个月流失速率加快
- 查看单变量生存曲线差异
# 连续变量分箱
combiner = toad.transform.Combiner()
combiner.fit(df[['MonthlyCharges','TotalCharges']], df['Churn'], method='chi', # 卡方分箱
min_samples=0.05)
df_t = combiner.transform(df)
# 筛选变量
obj_list = df_t.select_dtypes(include="object").columns[1:].to_list()
obj_list += ['MonthlyCharges','TotalCharges']
# 计算行列数
num_plots = len(obj_list)
num_cols = math.ceil(np.sqrt(num_plots))
num_rows = math.ceil(num_plots/num_cols)
cr = [d for d in itertools.product(range(num_rows),range(num_cols))]
# 绘制生存曲线
fig, ax = plt.subplots(nrows=num_rows, ncols=num_cols, figsize=(48,36))
for ind,feature in enumerate(obj_list):
for i in df_t[feature].unique():
# KaplanMeier检验
kmf=KaplanMeierFitter()
df_tmp = df_t.loc[df_t[feature] == i]
kmf.fit(df_tmp.tenure, # 代表生存时长
event_observed=df_tmp.Churn, # 代表事件的终点
label=i)
# 绘制生存曲线
kmf.plot_survival_function(ci_show=False,
ax = ax[cr[ind][0]][cr[ind][1]])
# 对数秩检验,得到p值
p_value = multivariate_logrank_test(event_durations = df_t.tenure, # 代表生存时长
groups=df_t[feature], # 代表检验的组别
event_observed=df_t.Churn # 代表事件的终点
).p_value
p_value_text = ['p-value < 0.001' if p_value < 0.001 else 'p-value = %.4F'%p_value][0]
ax[cr[ind][0]][cr[ind][1]].set_title("{}\n{}".format(feature, p_value_text))
- 验证多分类单变量生存曲线差异
# 查看多分类变量的组间差异:以PaymentMethod为例
fig, ax = plt.subplots(figsize=(10,6))
feature = 'PaymentMethod'
for i in df_t[feature].unique():
# KaplanMeier检验
kmf=KaplanMeierFitter()
df_tmp = df_t.loc[df_t[feature] == i]
kmf.fit(df_tmp.tenure, # 代表生存时长
event_observed=df_tmp.Churn, # 代表事件的终点
label=i)
# 绘制生存曲线
kmf.plot_survival_function(ci_show=False,
ax = ax)
# 对数秩检验,得到p值
p_value = multivariate_logrank_test(event_durations = df_t.tenure, # 代表生存时长
groups=df_t[feature], # 代表检验的组别
event_observed=df_t.Churn # 代表事件的终点
).p_value
p_value_text = ['p-value < 0.001' if p_value < 0.001 else 'p-value = %.4F'%p_value][0]
ax.set_title("{}\n{}".format(feature, p_value_text))
plt.show()
pairwise_logrank_test(
event_durations = df_t.tenure,
groups=df_t.PaymentMethod,
event_observed=df_t.Churn
)
- 生存曲线表明PaymentMethod存在组间显著差异
- 检验表明除了Bank transfer (automatic)与Credit card (automatic)不存在显著差异,其余组间均显著差异
- 交互变量的生存曲线
# 以性别gender为例,查看与其他变量交互后,gender的流失情况是否存在显著差异
for feature in obj_list:
for i in df_t[feature].unique():
df_tmp = df_t.loc[df_t[feature] == i]
p_value = multivariate_logrank_test(event_durations = df_tmp.tenure,
groups=df_tmp.gender,
event_observed=df_tmp.Churn
).p_value
if p_value <= 0.05:
p_value_text = ['p-value < 0.001' if p_value < 0.001 else 'p-value = %.4F'%p_value][0]
print('gender : {}={}, logrank test {}'.format(feature, i, p_value_text))
gender : PaymentMethod=Credit card (automatic), logrank test p-value = 0.0288
gender : MonthlyCharges=3, logrank test p-value = 0.0360
gender : MonthlyCharges=4, logrank test p-value = 0.0181
当 PaymentMethod=Credit card (automatic)时,性别的流失情况有显著差异。同样的,当MonthlyCharges=3或者=4时,性别的流失情况有显著差异
# 查看生存曲线差异:不同性别在PaymentMethod=Credit card (automatic)时的流失差异
fig,ax = plt.subplots(figsize=(12,8))
# 绘制Male:PaymentMethod=Credit card (automatic)生存曲线
data_male = df_t.loc[(df_t.gender=='Male') & (df_t.PaymentMethod == 'Credit card (automatic)')]
kmf_male = KaplanMeierFitter().fit(data_male.tenure,
data_male.Churn,
label='Male : PaymentMethod=Credit card (automatic)')
ax = kmf_male.plot_survival_function(ax=ax)
# 绘制Female:PaymentMethod=Credit card (automatic)生存曲线
data_female = df_t.loc[(df_t.gender=='Female') & (df_t.PaymentMethod == 'Credit card (automatic)')]
kmf_female = KaplanMeierFitter().fit(data_female.tenure,
data_female.Churn,
label='Female : PaymentMethod=Credit card (automatic)')
ax = kmf_female.plot_survival_function(ax=ax)
# 调整位置
plt.grid()
plt.tight_layout()
在Credit card (automatic)客户中,Female更容易流失
特征工程
# 特征工程
df_model = pd.get_dummies(df.iloc[:,1:], drop_first=True) # 剔除客户ID,并对类别变量进行哑变量处理(剔除首列防止虚拟变量陷阱)
# 拆分训练集
train, test = train_test_split(df_model, test_size=0.2)
数据建模
模型拟合
# 模型拟合
cph = CoxPHFitter(penalizer = 0.01)
cph.fit(train, duration_col='tenure', event_col='Churn')
cph.print_summary(decimals=1)
model | lifelines.CoxPHFitter |
duration col | 'tenure' |
event col | 'Churn' |
penalizer | 0.01 |
l1 ratio | 0 |
baseline estimation | breslow |
number of observations | 5625 |
number of events observed | 1500 |
partial log-likelihood | -10138.3 |
time fit was run | 2022-09-16 15:52:24 UTC |
- Concordance=0.9,模型拟合不错。Concordance Index是AUC的推广,值越接近1效果越好
- 系数为正则表示该变量促进流失,系数为负表示该变量防止流失
- 显著影响的为p<0.05的变量。例如TotalCharges、Partner_Yes等
- exp(coef)即为相对危险度。对于连续变量TotalCharges,总费用增加一单位,流失概率是原来的1.00倍(基本没变。);对于分类变量Contract。表明One year和Two year是对照组(Month-to-month)的0.30和0.09倍
模型优化
# 剔除不显著变量
drop_col = list(cph.summary['p'][cph.summary['p'].values>0.05].index)
train_s = train.drop(drop_col, axis=1)
test_s = test.drop(drop_col, axis=1)
# 重新拟合
cph = CoxPHFitter(penalizer = 0.01)
cph.fit(train_s, duration_col='tenure', event_col='Churn')
cph.print_summary(decimals=1)
特征重要性
# 特征重要性
fig,ax = plt.subplots(figsize=(12,9))
cph.plot(ax=ax)
plt.show()
- 加速流失的变量有:InternetService_Fiber optic、PaymentMethod_Mailed check等
- 防止流失的变量有:Contract_Two year、Contract_One year等
风险观察
# 累积风险曲线
fig,ax = plt.subplots(nrows=2, ncols=1, figsize=(10,12))
cph.baseline_cumulative_hazard_.plot(ax=ax[0])
# 各时间节点的风险曲线
cph.baseline_hazard_.plot(ax=ax[1])
plt.show()
在60个月以后流失风险明显提高
模型评估
# 模型校准性
fig,ax = plt.subplots(figsize=(12,9))
survival_probability_calibration(cph, test_s, t0=50, ax=ax)
ax.grid()
ICI = 0.05854302858419381
E50 = 0.021989969295193035
- x轴为预测的流失概率,y轴为观测的流失概率
- 以50个月为例,模型与基准值(对角线)偏离较大,且一直高估了用户的流失情况
- 建议样本均衡处理,剔除具有相关性的特征等
# 使用brier score观测校准距离:Brier分数对于一组预测值越低,预测校准越好
loss_dict = {}
for i in range(1,73):
score = brier_score_loss(
test_s.Churn,
1 - np.array(cph.predict_survival_function(test_s).loc[i]),
pos_label=1 )
loss_dict[i] = [score]
loss_df = pd.DataFrame(loss_dict).T
fig, ax = plt.subplots(figsize=(12,9))
ax.plot(loss_df.index, loss_df)
ax.set(xlabel='Prediction Time', ylabel='Calibration Loss', title='Cox PH Model Calibration Loss / Time')
ax.grid()
模型在10月-20月的预测效果较好
模型应用
预测剩余价值
# 筛选未流失用户
churn0 = df_model.query("Churn == 0")
# 预测中位数生存时间
churn0_median_survive = cph.predict_median(churn0).rename('median_survive')
# 计算剩余价值=月消费*(预测中位数生存时间)-已存续时间
values = pd.concat([churn0_median_survive, churn0[['MonthlyCharges','tenure']]], axis=1)
values['RemainTenure'] = values['median_survive'] - values['tenure']
values['RemainingValue'] = values['MonthlyCharges'] * values['RemainTenure']
# 计算RemainingValue负值与inf的占比
inf_rate = values[values['RemainingValue']==float('inf')].shape[0]/values.shape[0]
neg_rate = values[values['RemainingValue']<0].shape[0]/values.shape[0]
print('{:*^60}'.format('inf_rate&neg_rate:'), '\n', 'inf_rate=%.2f' % inf_rate, 'neg_rate=%.2f' % neg_rate)
values.tail()
*********************inf_rate&neg_rate:*********************
inf_rate=0.32 neg_rate=0.04
- 如果用户生存曲线不超过0.5,预测的中位生存时间是inf,可以采用cph.predict_percentile(churn0,p=0.6)计算分为数存活时间
- 预测的最大存活时间为tenure的最大值,即无法预测到观测截面时间后的生存情况。因此也可以将inf定义为最大值
- 一些用户会在流失前被预测为流失,因此存在剩余生存时间为负。可以通过校准纠偏
# 预测未流失客户的生存曲线
unconditioned_sf = cph.predict_survival_function(churn0)
# 校准
conditioned_sf = unconditioned_sf.apply(lambda c: (c / c.loc[df.loc[c.name, 'tenure']]).clip(upper=1))
# 绘制校准曲线
fig, ax = plt.subplots(figsize=(12,9))
subject = 7038
tenures = df_model['tenure'].loc[subject,] # 查询index的位置,非自然位置
unconditioned_sf[subject].plot(ls="--", color="#A60628", label="unconditioned")
conditioned_sf[subject].plot(color="#A60628", label="conditioned on $T>{:.0f}$".format(tenures))
ax.legend()
ax.grid()
# 根据校准曲线纠偏
churn0_median_survive_new = median_survival_times(conditioned_sf)
values_new = pd.concat([churn0[['tenure','MonthlyCharges']], churn0_median_survive_new.T], axis=1)
values_new.rename(columns={0.5:'median_survive'}, inplace=True)
values_new['RemainTenure'] = values_new['median_survive'] - values_new['tenure']
values_new['RemainingValue'] = values_new['MonthlyCharges'] * values_new['RemainTenure']
# 计算RemainingValue负值与inf的占比
inf_rate = values_new[values_new['RemainingValue']==float('inf')].shape[0]/values_new.shape[0]
neg_rate = values_new[values_new['RemainingValue']<0].shape[0]/values_new.shape[0]
print('{:*^60}'.format('inf_rate&neg_rate:'), '\n', 'inf_rate=%.2f' % inf_rate, 'neg_rate=%.2f' % neg_rate)
values_new.tail()
*********************inf_rate&neg_rate:*********************
inf_rate=0.33 neg_rate=0.00
剩余价值提升
# 将之前的校准写成函数
def predict_median_func(churn0):
unconditioned_sf = cph.predict_survival_function(churn0)
conditioned_sf = unconditioned_sf.apply(lambda c: (c / c.loc[df.loc[c.name, 'tenure']]).clip(upper=1))
churn0_median_survive_new = median_survival_times(conditioned_sf).T
return churn0_median_survive_new
# 将月签合同提升至一年合同,若原先是两年合同则维持不变
tmp =df_model.loc[values_new.index,].copy()
tmp['Contract_One year'] = tmp.apply(lambda x: 1 if x['Contract_Two year'] == 0 else 0, axis=1)
values_new['contract_1y_tenure_pred'] = predict_median_func(tmp)
# 将月签合同或者一年合同提升至两年合同
tmp =df_model.loc[values_new.index,].copy()
tmp['Contract_Two year'] = 1
tmp['Contract_One year'] = 0
values_new['contract_2y_tenure_pred'] = predict_median_func(tmp)
# 提升在线安全服务
tmp =df_model.loc[values_new.index,].copy()
tmp['OnlineSecurity_Yes'] = 1
values_new['OnlineSecurity_tenure_pred'] = predict_median_func(tmp)
# 计算剩余价值
values_new['Contract_1y Diff'] = (values_new['contract_1y_tenure_pred'] - values_new['median_survive']) * values_new['MonthlyCharges']
values_new['Contract_2y Diff'] = (values_new['contract_2y_tenure_pred'] - values_new['median_survive']) * values_new['MonthlyCharges']
values_new['OnlineSecurity Diff'] = (values_new['OnlineSecurity_tenure_pred'] - values_new['median_survive']) * values_new['MonthlyCharges']
# 预览结果
values_new.head().T
# 客户基本信息
df.loc[0,]
customerID 7590-VHVEG
gender Female
SeniorCitizen 0
Partner Yes
Dependents No
tenure 1
PhoneService No
MultipleLines No phone service
InternetService DSL
OnlineSecurity No
OnlineBackup Yes
DeviceProtection No
TechSupport No
StreamingTV No
StreamingMovies No
Contract Month-to-month
PaperlessBilling Yes
PaymentMethod Electronic check
MonthlyCharges 29.85
TotalCharges 29.85
Churn 0
Name: 0, dtype: object
- 0号客户基本信息:使用时长(tenure)1个月,没有OnlineSecurity,Contract为月签。月费为29.85
- 0号客户基本预测信息:预测27个月,剩余价值805.95
- 0号客户价值提升信息:
- 更换1年合同后,预测43个月,剩余价值较月签合同提升了447.75
- 更换2年合同后,预测65个月,剩余价值较月签合同提升了1104.45
- 添加OnlineSecurity后,预测31个月,剩余价值较月签合同提升了89.55