导入数据分析相关库
# 导入标准库
import io, os, sys, types, time, datetime, math, random, requests, subprocess,io, tempfile, math
# 导入第三方库
# 数据处理
import numpy as np
import pandas as pd
# 数据可视化
import matplotlib.pyplot as plt
import missingno
import seaborn as sns
# from pandas.tools.plotting import scatter_matrix # No module named 'pandas.tools'
from mpl_toolkits.mplot3d import Axes3D
# plt.style.use('seaborn') # 改变图像风格
plt.rcParams['font.family'] = ['Arial Unicode MS', 'Microsoft Yahei', 'SimHei', 'sans-serif'] # 解决中文乱码
plt.rcParams['axes.unicode_minus'] = False # simhei黑体字 负号乱码 解决
# 特征选择和编码
from sklearn.feature_selection import RFE, RFECV
from sklearn.svm import SVR
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, label_binarize # Imputer
# from fancyimpute import BiScaler, KNN, NuclearNormMinimization, SoftImpute
# 机器学习
import sklearn.ensemble as ske
from sklearn import datasets, model_selection, tree, preprocessing, metrics, linear_model
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
# 网格搜索、随机搜索
import scipy.stats as st
from scipy.stats import randint as sp_randint
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
# 模型度量(分类)
from sklearn.metrics import precision_recall_fscore_support, roc_curve, auc
# 警告处理
import warnings
warnings.filterwarnings('ignore')
# 在Jupyter上画图
%matplotlib inline
# 数据预处理
import numpy as np
import scipy as sc
import sklearn as sk
import matplotlib.pyplot as plt
# 绘图工具包
import seaborn as sns
import pyecharts.options as opts
from pyecharts.charts import Line, Grid
一、赛题数据
赛题以预测用户贷款是否违约为任务,数据集报名后可见并可下载,该数据来自某信贷平台的贷款记录,总数据量超过120万条,包含47列变量信息,其中15列为匿名变量。为了保证比赛的公平性,将会从中抽取80万条作为训练集,20万条作为测试集A,20万条作为测试集B,同时会对employmentTitle、purpose、postCode和title等信息进行脱敏。
数据集导入
- train
- test
# Dataset locations: base directory plus the file names of the two CSVs
dataset_path = './'
train_path = 'train.csv'
test_path = 'testA.csv'
data_train_path = f'{dataset_path}{train_path}'
data_test_path = f'{dataset_path}{test_path}'

# Read the training set, then the test set, into DataFrames
train = pd.read_csv(data_train_path)
test = pd.read_csv(data_test_path)

# Print a per-column summary (dtype, non-null count, memory usage) for each
train.info()
test.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800000 entries, 0 to 799999
Data columns (total 47 columns):
id 800000 non-null int64
loanAmnt 800000 non-null float64
term 800000 non-null int64
interestRate 800000 non-null float64
installment 800000 non-null float64
grade 800000 non-null object
subGrade 800000 non-null object
employmentTitle 799999 non-null float64
employmentLength 753201 non-null object
homeOwnership 800000 non-null int64
annualIncome 800000 non-null float64
verificationStatus 800000 non-null int64
issueDate 800000 non-null object
isDefault 800000 non-null int64
purpose 800000 non-null int64
postCode 799999 non-null float64
regionCode 800000 non-null int64
dti 799761 non-null float64
delinquency_2years 800000 non-null float64
ficoRangeLow 800000 non-null float64
ficoRangeHigh 800000 non-null float64
openAcc 800000 non-null float64
pubRec 800000 non-null float64
pubRecBankruptcies 799595 non-null float64
revolBal 800000 non-null float64
revolUtil 799469 non-null float64
totalAcc 800000 non-null float64
initialListStatus 800000 non-null int64
applicationType 800000 non-null int64
earliesCreditLine 800000 non-null object
title 799999 non-null float64
policyCode 800000 non-null float64
n0 759730 non-null float64
n1 759730 non-null float64
n2 759730 non-null float64
n3 759730 non-null float64
n4 766761 non-null float64
n5 759730 non-null float64
n6 759730 non-null float64
n7 759730 non-null float64
n8 759729 non-null float64
n9 759730 non-null float64
n10 766761 non-null float64
n11 730248 non-null float64
n12 759730 non-null float64
n13 759730 non-null float64
n14 759730 non-null float64
dtypes: float64(33), int64(9), object(5)
memory usage: 286.9+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 46 columns):
id 200000 non-null int64
loanAmnt 200000 non-null float64
term 200000 non-null int64
interestRate 200000 non-null float64
installment 200000 non-null float64
grade 200000 non-null object
subGrade 200000 non-null object
employmentTitle 200000 non-null float64
employmentLength 188258 non-null object
homeOwnership 200000 non-null int64
annualIncome 200000 non-null float64
verificationStatus 200000 non-null int64
issueDate 200000 non-null object
purpose 200000 non-null int64
postCode 200000 non-null float64
regionCode 200000 non-null int64
dti 199939 non-null float64
delinquency_2years 200000 non-null float64
ficoRangeLow 200000 non-null float64
ficoRangeHigh 200000 non-null float64
openAcc 200000 non-null float64
pubRec 200000 non-null float64
pubRecBankruptcies 199884 non-null float64
revolBal 200000 non-null float64
revolUtil 199873 non-null float64
totalAcc 200000 non-null float64
initialListStatus 200000 non-null int64
applicationType 200000 non-null int64
earliesCreditLine 200000 non-null object
title 200000 non-null float64
policyCode 200000 non-null float64
n0 189889 non-null float64
n1 189889 non-null float64
n2 189889 non-null float64
n3 189889 non-null float64
n4 191606 non-null float64
n5 189889 non-null float64
n6 189889 non-null float64
n7 189889 non-null float64
n8 189889 non-null float64
n9 189889 non-null float64
n10 191606 non-null float64
n11 182425 non-null float64
n12 189889 non-null float64
n13 189889 non-null float64
n14 189889 non-null float64
dtypes: float64(33), int64(8), object(5)
memory usage: 70.2+ MB
字段表
- id 为贷款清单分配的唯一信用证标识
- loanAmnt 贷款金额
- term 贷款期限(year)
- interestRate 贷款利率
- installment 分期付款金额
- grade 贷款等级
- subGrade 贷款等级之子级
- employmentTitle 就业职称
- employmentLength 就业年限(年)
- homeOwnership 借款人在登记时提供的房屋所有权状况
- annualIncome 年收入
- verificationStatus 验证状态
- issueDate 贷款发放的月份
- purpose 借款人在贷款申请时的贷款用途类别
- postCode 借款人在贷款申请中提供的邮政编码的前3位数字
- regionCode 地区编码
- dti 债务收入比
- delinquency_2years 借款人过去2年信用档案中逾期30天以上的违约事件数
- ficoRangeLow 借款人在贷款发放时的fico所属的下限范围
- ficoRangeHigh 借款人在贷款发放时的fico所属的上限范围
- openAcc 借款人信用档案中未结信用额度的数量
- pubRec 贬损公共记录的数量
- pubRecBankruptcies 公开记录中的破产数量
- revolBal 信贷周转余额合计
- revolUtil 循环额度利用率,或借款人使用的相对于所有可用循环信贷的信贷金额
- totalAcc 借款人信用档案中当前的信用额度总数
- initialListStatus 贷款的初始列表状态
- applicationType 表明贷款是个人申请还是与两个共同借款人的联合申请
- earliesCreditLine 借款人最早报告的信用额度开立的月份
- title 借款人提供的贷款名称
- policyCode 公开可用的策略代码=1;新产品不公开可用的策略代码=2
- n系列匿名特征 匿名特征n0-n14,为一些贷款人行为计数特征的处理
观察各个字段含义和实际数值
# Dimensions (rows, columns) of the training and test sets
train.shape, test.shape
((800000, 47), (200000, 46))
# Preview the first 20 training rows for columns 13-25 (isDefault through
# revolUtil in the output below). Alternative view of the first 13 columns:
# train.head(20).iloc[:,:13]
train.head(20).iloc[:,13:26]
isDefault | purpose | postCode | regionCode | dti | delinquency_2years | ficoRangeLow | ficoRangeHigh | openAcc | pubRec | pubRecBankruptcies | revolBal | revolUtil | |
0 | 1 | 1 | 137.0 | 32 | 17.05 | 0.0 | 730.0 | 734.0 | 7.0 | 0.0 | 0.0 | 24178.0 | 48.9 |
1 | 0 | 0 | 156.0 | 18 | 27.83 | 0.0 | 700.0 | 704.0 | 13.0 | 0.0 | 0.0 | 15096.0 | 38.9 |
2 | 0 | 0 | 337.0 | 14 | 22.77 | 0.0 | 675.0 | 679.0 | 11.0 | 0.0 | 0.0 | 4606.0 | 51.8 |
3 | 0 | 4 | 148.0 | 11 | 17.21 | 0.0 | 685.0 | 689.0 | 9.0 | 0.0 | 0.0 | 9948.0 | 52.6 |
4 | 0 | 10 | 301.0 | 21 | 32.16 | 0.0 | 690.0 | 694.0 | 12.0 | 0.0 | 0.0 | 2942.0 | 32.0 |
5 | 0 | 9 | 512.0 | 21 | 17.14 | 0.0 | 730.0 | 734.0 | 19.0 | 0.0 | 0.0 | 4047.0 | 31.1 |
6 | 0 | 0 | 517.0 | 14 | 17.49 | 0.0 | 755.0 | 759.0 | 12.0 | 0.0 | 0.0 | 3111.0 | 8.5 |
7 | 0 | 0 | 100.0 | 4 | 32.60 | 0.0 | 665.0 | 669.0 | 8.0 | 1.0 | 1.0 | 14021.0 | 59.7 |
8 | 1 | 0 | 792.0 | 13 | 19.22 | 0.0 | 690.0 | 694.0 | 15.0 | 0.0 | 0.0 | 27176.0 | 46.0 |
9 | 0 | 0 | 59.0 | 11 | 24.39 | 0.0 | 725.0 | 729.0 | 7.0 | 0.0 | 0.0 | 2936.0 | 30.6 |
10 | 0 | 4 | 134.0 | 8 | 14.21 | 0.0 | 665.0 | 669.0 | 13.0 | 0.0 | 0.0 | 8653.0 | 47.5 |
11 | 0 | 0 | 893.0 | 49 | 34.63 | 0.0 | 710.0 | 714.0 | 10.0 | 0.0 | 0.0 | 16343.0 | 80.9 |
12 | 0 | 0 | 195.0 | 38 | 7.58 | 0.0 | 680.0 | 684.0 | 12.0 | 0.0 | 0.0 | 18866.0 | 35.7 |
13 | 0 | 2 | 134.0 | 8 | 5.68 | 0.0 | 690.0 | 694.0 | 7.0 | 0.0 | 0.0 | 4334.0 | 68.8 |
14 | 0 | 4 | 167.0 | 8 | 38.95 | 0.0 | 710.0 | 714.0 | 9.0 | 0.0 | 0.0 | 19023.0 | 60.8 |
15 | 0 | 2 | 194.0 | 38 | 17.27 | 0.0 | 660.0 | 664.0 | 16.0 | 1.0 | 1.0 | 220.0 | 3.6 |
16 | 0 | 2 | 492.0 | 36 | 21.02 | 0.0 | 705.0 | 709.0 | 16.0 | 0.0 | 0.0 | 36609.0 | 61.1 |
17 | 1 | 4 | 56.0 | 8 | 17.14 | 0.0 | 695.0 | 699.0 | 5.0 | 0.0 | 0.0 | 5463.0 | 76.9 |
18 | 1 | 3 | 140.0 | 8 | 28.95 | 3.0 | 660.0 | 664.0 | 6.0 | 0.0 | 0.0 | 6804.0 | 84.0 |
19 | 0 | 0 | 305.0 | 15 | 15.55 | 0.0 | 700.0 | 704.0 | 10.0 | 0.0 | 0.0 | 22859.0 | 57.0 |
数据处理
查看缺失值
- 可以看到employmentLength这一字段缺失情况严重
# Draw missingno's bar chart for the training set to inspect missing values per column
missingno.bar(train)
<matplotlib.axes._subplots.AxesSubplot at 0x20f802ad588>
# Distribution of the values in the employmentLength column.
# NOTE: the counts shown below add up to the column's non-null total, i.e.
# missing entries are not listed by value_counts() here.
train['employmentLength'].value_counts()
10+ years 262753
2 years 72358
< 1 year 64237
3 years 64152
1 year 52489
5 years 50102
4 years 47985
6 years 37254
8 years 36192
7 years 35407
9 years 30272
Name: employmentLength, dtype: int64
二、评测标准
提交结果为每个测试样本是1的概率,也就是y为1的概率。评价方法为AUC评估模型效果(越大越好)。
分类常用的评估指标是:
- Accuracy(准确率),AUC,Recall(召回率),Precision(精确率),F1,Kappa
本次学习赛使用的评估指标是AUC
- AUC也就是ROC曲线下与坐标轴围成的面积
- ROC空间将假正例率(FPR)定义为 X 轴,真正例率(TPR)定义为 Y 轴。
- TPR:在所有实际为正例的样本中,被正确地判断为正例之比率。
- FPR:在所有实际为负例的样本中,被错误地判断为正例之比率。
- AUC的取值范围一般在0.5和1之间,面积越大,精准度越高;因此AUC越接近1.0,模型精准率越高,AUC为1时精准率为100%。
三、结果提交
提交前请确保预测结果的格式与sample_submit.csv中的格式一致,以及提交文件后缀名为csv。