python分层随机采样 python分层抽样代码

转载

数据探索者 2024-02-19 10:25:01

文章标签 python分层随机采样 python 机器学习迭代器数据 文章分类 Python 后端开发

分层抽样划分训练集&测试集

StratifiedKFold函数
StratifiedKFold是指分层采样，确保训练集，验证集中各类别样本的比例与原始数据集中相同。因此一般使用StratifiedKFold。

from sklearn.model_selection import StratifiedKFold

# Build the stratified splitter
sfolder = StratifiedKFold(n_splits=3, shuffle=True, random_state=24) # the random_state value is arbitrary

sfolder 常用参数：

n_splits : 折叠次数，至少为2；scikit-learn 0.22 起默认为5（此前的版本默认为3）。
shuffle:是否在每次分割之前打乱顺序。
random_state:随机种子，在shuffle==True时使用，默认使用np.random。

split方法对数据进行分割

# Data to be split: a feature matrix and a label vector
import numpy as np
feature = np.random.rand(8,3)
labels = np.random.randint(0,2,8) # half-open interval: low is inclusive, high is exclusive

#In: feature
#Out: array([[0.85266761, 0.15080449, 0.85373155],
#       [0.76360769, 0.62172447, 0.79384236],
#       [0.63896475, 0.69680718, 0.60522843],
#       [0.06980377, 0.94279252, 0.33515437],
#       [0.50063964, 0.21751566, 0.99743367],
#       [0.00954613, 0.33695297, 0.82242697],
#       [0.7629595 , 0.31011859, 0.18316649],
#       [0.03301575, 0.36399613, 0.59616915]])
#In: labels
#Out: array([0, 0, 1, 1, 1, 0, 0, 0])

# Produce the (train, validation) index pairs for feature/labels; generator_data is an iterator
generator_data = sfolder.split(feature,labels)

# The index pairs can be pulled out with a for loop
for (trn_idx, val_idx) in generator_data :
    print((trn_idx, val_idx))

#Out: (array([0, 3, 4, 6, 7]), array([1, 2, 5]))
#     (array([0, 1, 2, 3, 5]), array([4, 6, 7]))
#     (array([1, 2, 4, 5, 6, 7]), array([0, 3]))

# The iterator can also be driven by hand, e.g. with next().
# split() is called again because the for loop above already consumed the previous iterator.
generator_data = sfolder.split(feature,labels)
trn_idx, val_idx = next(generator_data)
#Out: (array([0, 3, 4, 6, 7]), array([1, 2, 5]))

next(generator_data),next(generator_data)
#Out: ((array([0, 1, 2, 3, 5]), array([4, 6, 7])),
#      (array([1, 2, 4, 5, 6, 7]), array([0, 3])))

# Slice the data; trn_idx/val_idx still hold the first split bound by the first next() call
trn_feature, val_feature = feature[trn_idx], feature[val_idx] 
trn_labels, val_labels = labels[trn_idx], labels[val_idx]
print(val_feature,'\n',val_labels)
#Out:[[0.76360769 0.62172447 0.79384236]
#     [0.63896475 0.69680718 0.60522843]
#     [0.00954613 0.33695297 0.82242697]] 
#    [0 1 0]

定义一个函数划分训练集，测试集

import random
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold

def train_test_sfloder(feature,labels):
    """Split pandas features and labels into one stratified train/test pair.

    A 3-fold ``StratifiedKFold`` (shuffled, ``random_state=24``) is built and
    only its first split is used, so roughly two thirds of the rows go to the
    training part and one third to the test part.

    Args:
        feature: pandas object holding the features; rows are selected
            with ``.iloc``.
        labels: pandas object holding the class labels, aligned row by row
            with ``feature``; rows are selected with ``.iloc``.

    Returns:
        A tuple ``(train_feature, test_feature, train_labels, test_labels)``.
    """
    sfolder = StratifiedKFold(n_splits=3, shuffle=True, random_state=24)

    # Only the first of the three (train, test) index pairs is needed.
    train_index, test_index = next(sfolder.split(feature, labels))

    # Shuffle the row order inside each part. This uses the global ``random``
    # state, so the order is only reproducible if the caller seeds ``random``.
    random.shuffle(train_index)
    random.shuffle(test_index)

    # split() yields positional indices, so features AND labels must both be
    # selected with .iloc. Label-based .loc picks the wrong rows (or raises)
    # whenever the index is not the default 0..n-1 range.
    train_feature, test_feature = feature.iloc[train_index], feature.iloc[test_index]
    train_labels, test_labels = labels.iloc[train_index], labels.iloc[test_index]

    return train_feature, test_feature, train_labels, test_labels

XGBoost 模型调参

刚开始调参的时候，一般要先初始化一些值（初始值可以随机设置，也可以根据实际情况来设置）

import xgboost as xgb

# Commonly used XGBoost parameters, serving as the starting point for tuning
params = {
    'learning_rate': 0.05,
    'n_estimators': 140,
    'max_depth': 5,
    'min_child_weight': 1,
    'gamma': 0,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'nthread': 4,
    'scale_pos_weight': 1,
    'seed': 27,
    'objective': 'binary:logistic',
}

详细的XGBoost参数：XGBoost Parameters

先将初始化的值传入模型，此时注意xgb.XGBClassifier参数中params前面的两个星号，不然接下来使用网格搜索时会报错。原因：XGBoost issue

# ** unpacks the params dict into keyword arguments of the constructor
model = xgb.XGBClassifier(**params)

用网格搜索（GridSearchCV）进行调参

详细的GridSearchCV参数：GridSearchCV Parameters

from sklearn.model_selection import GridSearchCV

# Build the search grid: each key is the name of a parameter to search,
# each value is the collection of candidate values for that parameter
param_search = {'learning_rate':np.arange(0.01, 0.1, 0.01)} 
# The bare string below is a no-op note: range() only produces integers,
# so np.arange is used to step through fractional values
'''注： range生成的迭代器只能生成整数，所以用np.arange生成小数的迭代器'''


# Tune with GridSearchCV
gsearch = GridSearchCV(estimator = model, param_grid = param_search,scoring='roc_auc',n_jobs=4, cv=10)
gsearch.fit(feature,labels)
#In: gsearch
#Out:GridSearchCV(cv=10, error_score=nan,
#             estimator=XGBClassifier(base_score=None, booster=None,
#                                      colsample_bylevel=None,
#                                      colsample_bynode=None,
#                                      colsample_bytree=None, gamma=None,
#                                      gpu_id=None, importance_type='gain',
#                                      interaction_constraints=None,
#                                      learning_rate=None, max_delta_step=None,
#                                      max_depth=None, min_child_weight=None,
#                                      missing=nan, monotone_constraints=None,
#                                      n_esti...
#                                                 'seed': 27, 'subsample': 0.8},
#                                      random_state=None, reg_alpha=None,
#                                      reg_lambda=None, scale_pos_weight=None,
#                                      subsample=None, tree_method=None,
#                                      validate_parameters=None, verbosity=None),
#              iid='deprecated', n_jobs=4,
#              param_grid={'learning_rate': array([0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09])},
#              pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
#              scoring='roc_auc', verbose=0)

# Show the best parameter value and its cross-validated score
# (the score is ROC AUC, not accuracy, because scoring='roc_auc' was requested)
print(gsearch.best_params_, '\n',
      gsearch.best_score_, '\n')
#Out: {'learning_rate': 0.04} 
#     0.6669905462184874 

# Fold the best value back into params so later searches start from it.
# The result must be bound to params, not to param_search: param_search is
# the grid for the next search, and params would otherwise keep the old value.
params_updated = dict(params, **gsearch.best_params_)
params = params_updated

用同样的方法，依次对n_estimators，max_depth & min_child_weight，gamma， subsample 进行网格搜索。

定义一个函数对XGBoost参数进行网格搜索

def search_XGB_params(feature,labels,isTuning=True):
    """Return an XGBClassifier, optionally tuned by staged grid searches.

    Starting from a fixed set of initial parameters, each stage grid-searches
    one group of parameters (10-fold CV scored by ROC AUC) and merges the best
    values into the parameter set used by all following stages. The stages
    run in this order: learning_rate, n_estimators, max_depth together with
    min_child_weight, gamma, subsample.

    Args:
        feature: Training features, passed to ``GridSearchCV.fit``.
        labels: Training labels, passed to ``GridSearchCV.fit``.
        isTuning: When false, skip the search and return a classifier built
            from the initial parameters.

    Returns:
        An unfitted ``xgb.XGBClassifier`` configured with the final parameters.
    """
    params = {  'learning_rate' :0.05,
                'n_estimators':140,
                'max_depth':5, 'min_child_weight':1,
                'gamma':0, 'subsample':0.8,
                'colsample_bytree':0.8,
                'nthread':4, 'scale_pos_weight':1, 'seed':27,
                'objective': 'binary:logistic'}
    if not isTuning:
        return xgb.XGBClassifier(**params)

    def update_params(params, param_search):
        """Grid-search ``param_search`` and return ``params`` with the best values merged in."""
        print('searching %s'%list(param_search.keys()))
        # Build a fresh estimator from the current params so that values tuned
        # in earlier stages are actually in effect during this search.
        # Reusing one estimator created up front would run every stage
        # against the initial values.
        gsearch = GridSearchCV(estimator = xgb.XGBClassifier(**params),
                               param_grid = param_search,
                               scoring='roc_auc',n_jobs=4, cv=10)
        gsearch.fit(feature,labels)
        print(gsearch.best_params_, '\n',
              gsearch.best_score_, '\n')
        return dict(params, **gsearch.best_params_)

    search_stages = (
        {'learning_rate':np.arange(0.01, 0.1, 0.01)},
        {'n_estimators':range(20, 150, 10)},
        {'max_depth':range(3, 10, 2),'min_child_weight':range(1,10,2)},
        {'gamma':np.arange(0, 1, 0.2)},
        # subsample must lie in (0, 1]; the grid therefore starts at 0.1
        # instead of 0, which XGBoost rejects as an invalid value.
        {'subsample':np.arange(1, 10) / 10},
    )
    for param_search in search_stages:
        params = update_params(params, param_search)

    return xgb.XGBClassifier(**params)

补充

StratifiedKFold()与KFold()

KFold： KFold 将所有的样例划分为 k 个组，每组数据都具有相同的大小。每一次分割会将其中的 K-1 组作为训练数据，剩下的一组用作测试数据，一共会分割K次。可以通过下图直观的体现出来。（以四折交叉验证为例，即K取4）

python分层随机采样 python分层抽样代码_python

代码示例

import numpy as np
from sklearn.model_selection import KFold
# Six samples with two features each, and one distinct target per sample
X = np.arange(1, 13).reshape(6, 2)
y = np.arange(1, 7)

# Number of groups (folds) to split into
kf = KFold(n_splits=3)

# train_index and test_index are index arrays, not the training data itself
for train_index, test_index in kf.split(X):
    print("Train Index:", train_index, ",Test Index:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

StratifiedKFold：是KFold()的变种，采用分层分组的形式（有点类似分层抽样），使每个分组中各类别的比例同整体数据中各类别的比例尽可能的相同。（它相对于KFold()方法更完善）

代码示例

import numpy as np
from sklearn.model_selection import StratifiedKFold
# Six samples with two features each; two classes with three samples apiece
X = np.arange(1, 13).reshape(6, 2)
y = np.repeat([1, 2], 3)
skf = StratifiedKFold(n_splits=3)

# train_index and test_index are index arrays, not the training data itself
for train_index, test_index in skf.split(X, y):
    print("Train Index:", train_index, ",Test Index:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

Python迭代器

迭代器
迭代器是指用iter(obj)函数返回的对象(实例)
迭代器可以通过next(it)函数逐个获取可迭代对象中的数据

迭代器函数
iter(iterable)从可迭代对象中返回一个迭代器,iterable必须是能提供一个迭代器的对象
next(iterator) 从迭代器iterator中获取下一条记录,如果无法获取下一条记录,则触发StopIteration异常
说明:
1.迭代器只能往前取值,不会后退
2.用iter函数可以返回一个可迭代对象的迭代器

迭代器与可迭代对象的关系

python分层随机采样 python分层抽样代码_机器学习_02

示例

l = [1, 3, 5, 7]
# iter() asks the list for an iterator over its own elements
it = iter(l)

In[1]: it
Out[1]: <list_iterator at 0x1d003707f08>

In[2]: next(it)
Out[2]: 1

In[3]: next(it)
Out[3]: 3

In[4]: next(it)
Out[4]: 5

In[5]: next(it)
Out[5]: 7

In[6]: next(it)
Out[6]: StopIteration

练习:
有一个集合:
s = {'唐僧','悟空','悟能','悟净'}
用for语句来遍历所有的元素如下:
for x in s:
    print(x)
else:
    print('遍历结束')
将上面的for语句改写为while语句和迭代器实现

s = {'唐僧', '悟空', '悟能', '悟净'}
it = iter(s)
# Emulate the for/else loop: pull items by hand until the iterator is exhausted
while True:
    try:
        x = next(it)
    except StopIteration:
        # Exhaustion plays the role of the for loop's else branch
        print('遍历结束')
        break
    print(x)

Python字典的合并

字典合并的四种方式

for 循环的方式

即向一个字典中添加另一个字典的键值对

>>> a = {1: 'a', 2: 'aa'}
>>> b = {3: 'aaa', 4: 'aaaa'}
>>> for k, v in b.items():
...     a[k] = v
...
>>> a
{1: 'a', 2: 'aa', 3: 'aaa', 4: 'aaaa'}

dict.update(other_dict) 的方式

调用字典的 update() 方法

>>> a = {1: 'a', 2: 'aa'}
>>> b = {3: 'aaa', 4: 'aaaa'}
>>> a.update(b)
>>> a
{1: 'a', 2: 'aa', 3: 'aaa', 4: 'aaaa'}

dict(a.items() + b.items()) 的方式

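同样返回一个新的字典（注意：该写法仅适用于 Python 2；在 Python 3 中 dict.items() 返回的是视图对象，不支持 + 运算，需改写为 dict(list(a.items()) + list(b.items()))）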

>>> a = {1: 'a', 2: 'aa'}
>>> b = {3: 'aaa', 4: 'aaaa'}
>>> dict(a.items() + b.items())
{1: 'a', 2: 'aa', 3: 'aaa', 4: 'aaaa'}

dict(a, **b) 的方式

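该方式返回一个新的字典（注意：在 Python 3 中，b 的键必须全部是字符串，否则会抛出 TypeError；像本例这样的整数键只在 Python 2 中可用。Python 3 推荐使用 {**a, **b}，3.9 及以上版本还可以使用 a | b）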

>>> a = {1: 'a', 2: 'aa'}
>>> b = {3: 'aaa', 4: 'aaaa'}
>>> dict(a, **b)
{1: 'a', 2: 'aa', 3: 'aaa', 4: 'aaaa'}

注：如果添加字典已有的键，会覆盖（更新）对应的值。

本文章为转载内容，我们尊重原作者对文章享有的著作权。如有内容错误或侵权问题，欢迎原作者联系我们进行内容更正或删除文章。

上一篇：logistic sigmoid 函数 logistic函数推导

下一篇：mysql 添加 inndb 引擎 mysql配置innodb

提问和评论都可以，用心的回复会被更多人看到评论

发布评论

相关文章

官方博客	全部文章	热门标签	班级博客
了解我们	网站地图	意见反馈

鸿蒙开发者社区	51CTO学堂
51CTO	软考资讯