文章目录

  • pyspark实现(全网唯一)
  • sklearn实现1:划分训练集、测试集
  • sklearn实现2:train_test_split 划分数据集,并保证正负样本比一致


忍不住啰嗦一句,spark做数据处理,是真的麻烦,很麻烦!!!

直接上代码,希望能帮到你

pyspark实现(全网唯一)

pyspark 处理数据 pyspark sample_数据集

代码可以根据需求自己修改,测试没问题了把count去掉跑,能快半个多小时

'''  spark df 采样、分割train、val、test   '''
def data_sample_split_distribution(df, ss, n=1, rate_val=0.1, rate_test=0.1, rate_test_with=30, ifflag=True):
    '''
     首先 1:n 采样,再划分train、val、test
    :param df: dataframe
    :param ss: sparksession 用于添加自增id
    :param n: 1:n采样
    :param rate_val: 验证集划分比例
    :param rate_test: 测试集划分比例
    :param rate_test_with: 测试集分布和实际线上持平,例如这里是1:30
    :return: df_train、df_val、df_test
    '''
    df_c = df.count()
    print(' 样本总量:', df_c)
    df_pos = df.filter(df.y == 1)
    df_neg = df.filter(df.y == 0)
    df_pos_c = df_pos.count()
    df_neg_c = df_neg.count()
    print(' 正样本数:{},负样本数:{},正负样本比为:{} : {}'.format(df_pos_c, df_neg_c, 1, df_neg_c / df_pos_c))

    if ifflag:
        # 只对负样本添加自增id  tmpid
        df_neg = df_neg.withColumn("tmpid", fn.monotonically_increasing_id())

    # 1:n 采样
    rate = np.around((df_pos_c * n / df_neg_c), 4) + 0.0002
    print(' 负样本欠采样概率:', rate)
    df_neg_sample = df_neg.sample(fraction=rate)
    df_neg_sample_c = df_neg_sample.count()
    print(' 1:{} 采样后,样本量:{},正负样本比为:{} : {}'.format(n, df_neg_sample_c + df_pos_c, 1, np.around(df_neg_sample_c / df_pos_c, decimals=4)))

    if ifflag:
        # 将采样后的负样本tmpid取出,保证测试集负样本和训练、验证独立
        rmlst = [c[0] for c in df_neg_sample.select('tmpid').collect()]
        df_neg_sample = df_neg_sample.drop('tmpid')

        # 划分train、test
        df_pos_train, df_pos_test = df_pos.randomSplit(weights=[1 - rate_test, rate_test], seed=20)
        df_neg_train, _ = df_neg_sample.randomSplit(weights=[1 - rate_test, rate_test], seed=20)

        # 划分test负样本
        df_pos_test_c = df_pos_test.count()
        df_neg_test_c = df_pos_test_c * rate_test_with  # 测试集和实际线上分布持平
        df_neg_test = df_neg.filter(~df_neg['tmpid'].isin(rmlst))
        # 将自增id删除后再添加,目的是取df的 df_neg_test_c 行
        df_neg_test = df_neg_test.drop('tmpid')
        df_neg_test = df_neg_test.withColumn("tmpid", fn.monotonically_increasing_id())
        # df_neg_test = df_neg_test.filter(df_neg_test['tmpid'].between(0, df_neg_test_c + 1000))
        df_neg_test = ss.createDataFrame(df_neg_test.take(df_neg_test_c + 1000))
        df_neg_test = df_neg_test.drop('tmpid')
        dfTest = df_pos_test.unionAll(df_neg_test)
        dfTest.cache()

        # 划分train、val
        df_pos_train, df_pos_val = df_pos_train.randomSplit(weights=[1 - rate_val, rate_val], seed=20)
        df_neg_train, df_neg_val = df_neg_train.randomSplit(weights=[1 - rate_val, rate_val], seed=20)

        dfTrain = df_pos_train.unionAll(df_neg_train)
        dfVal = df_pos_val.unionAll(df_neg_val)
        dfTrain.cache()
        dfVal.cache()

        # 将采样后的样本洗牌
        dfTrain = df_shuffle(dfTrain)
        dfVal = df_shuffle(dfVal)
        dfTest = df_shuffle(dfTest)

        sample_cTrain, sample_pos_cTrain, sample_neg_cTrain = pos_neg_rate2(dfTrain, ifspark=True)
        sample_cVal, sample_pos_cValn, sample_neg_cVal = pos_neg_rate2(dfVal, ifspark=True)
        sample_cTest, sample_pos_cTest, sample_neg_cTest = pos_neg_rate2(dfTest, ifspark=True)

        print('  划分后,训练集样本量:{},正样本数:{},负样本数:{},正负样本比:{} : {}'.format(sample_cTrain, sample_pos_cTrain, sample_neg_cTrain, 1, np.around(sample_neg_cTrain / sample_pos_cTrain, decimals=4)))
        print('  划分后,验证集样本量:{},正样本数:{},负样本数:{},正负样本比:{} : {}'.format(sample_cVal, sample_pos_cValn, sample_neg_cVal, 1, np.around(sample_neg_cVal / sample_pos_cValn, decimals=4)))
        print('  划分后,测试集样本量:{},正样本数:{},负样本数:{},正负样本比:{} : {}'.format(sample_cTest, sample_pos_cTest, sample_neg_cTest, 1, np.around(sample_neg_cTest / sample_pos_cTest, decimals=4)))
        return dfTrain, dfVal, dfTest

    else:
        df_new = df_neg_sample.unionAll(df_pos)
        df_new.cache()

        return df_new, None, None
def pos_neg_rate2(df, ifspark=True):
    if ifspark:
        sample_c = df.count()
        sample_pos_c = df.filter(df.y == 1).count()
        sample_neg_c = df.filter(df.y == 0).count()
    else:
        sample_c = len(df)
        sample_pos_c = len(df[df['y'] == 1])
        sample_neg_c = len(df[df['y'] == 0])
    return sample_c, sample_pos_c, sample_neg_c

def flat(l):
    for k in l:
        if not isinstance(k, (list, tuple)):
            yield k
        else:
            yield from flat(k)

# 给df增加一列连续自增id,用于拼接
def mkdf_tojoin(df, ss):
    schema = df.schema.add(StructField("tmpid", LongType()))
    rdd = df.rdd.zipWithIndex()
    rdd = rdd.map(lambda x: list(flat(x)))
    df = ss.createDataFrame(rdd, schema)
    return df

def df_shuffle(df):
    df = df.withColumn('rand', fn.rand() * 10000)
    df = df.orderBy(df['rand'])
    df = df.drop('rand')
    return df

example:

样本总量: 10577260
 正样本数:300081,负样本数:10277179,正负样本比为:1 : 34.248016368913724
 负样本欠采样概率: 0.0294
 1:1 采样后,样本量:600662,正负样本比为:1 : 1.0017
  划分后,训练集样本量:485414,正样本数:242513,负样本数:242901,正负样本比:1 : 1.0016
  划分后,验证集样本量:54584,正样本数:27272,负样本数:27312,正负样本比:1 : 1.0015
  划分后,测试集样本量:940176,正样本数:30296,负样本数:909880,正负样本比:1 : 30.033

sklearn实现1:划分训练集、测试集

'''   pandas df 采样,针对直客数据,分割train、test   '''
def df_sample_split(df, n=1, rate_test_with=15):
    print('样本量:', len(df))
    df_pos = skl_shuffle(df[df['y'] == 1])
    df_neg = skl_shuffle(df[df['y'] == 0])
    df_pos_c = len(df_pos)
    df_neg_c = len(df_neg)
    print(' 正样本数:{},负样本数:{},正负样本比为:{} : {}'.format(df_pos_c, df_neg_c, 1, df_neg_c / df_pos_c))

    df_neg_train = df_neg.iloc[:df_pos_c * n + 1, :]  # epsilon

    # 测试集和实际线上分布持平
    df_neg_test = df_neg.iloc[df_pos_c * n + 2: df_pos_c * rate_test_with + 1, :]  # epsilon

    df_train = skl_shuffle(pd.concat([df_pos, df_neg_train], axis=0, ignore_index=True))
    df_test = skl_shuffle(pd.concat([df_pos, df_neg_test], axis=0, ignore_index=True))

    sample_cTrain, sample_pos_cTrain, sample_neg_cTrain = pos_neg_rate2(df_train, ifspark=False)
    sample_cTest, sample_pos_cTest, sample_neg_cTest = pos_neg_rate2(df_test, ifspark=False)

    print('  划分后,训练集样本量:{},正样本数:{},负样本数:{},正负样本比:{} : {}'.format(sample_cTrain, sample_pos_cTrain, sample_neg_cTrain, 1, np.around(sample_neg_cTrain / sample_pos_cTrain, decimals=4)))
    print('  划分后,测试集样本量:{},正样本数:{},负样本数:{},正负样本比:{} : {}'.format(sample_cTest, sample_pos_cTest, sample_neg_cTest, 1, np.around(sample_neg_cTest / sample_pos_cTest, decimals=4)))

    return df_train, df_test

sklearn实现2:train_test_split 划分数据集,并保证正负样本比一致