1. Convert a numeric or string column to a vector/array
from pyspark.sql.functions import col, udf
from pyspark.sql.types import ArrayType, StringType
from pyspark.ml.linalg import Vectors, VectorUDT, DenseVector
# Numeric values can be wrapped into a vector, but passing a string to DenseVector raises an error
to_vec = udf(lambda x: DenseVector([x]), VectorUDT())
# Convert a string column to an array
to_array = udf(lambda x: [x], ArrayType(StringType()))
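A minimal usage sketch (assuming a DataFrame df with a numeric column 'num' and a string column 's'; both names are hypothetical):
df = df.withColumn('num_vec', to_vec(col('num'))) \
       .withColumn('s_arr', to_array(col('s')))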
2. Get the value at a given position in a vector or array column
df = spark.createDataFrame([(1, [1,2,3]), (2, [4,5,6])], ['label', 'data'])
df.show()
df.printSchema()
+-----+---------+
|label| data|
+-----+---------+
| 1|[1, 2, 3]|
| 2|[4, 5, 6]|
+-----+---------+
root
|-- label: long (nullable = true)
|-- data: array (nullable = true)
| |-- element: long (containsNull = true)
# Use the value of one column as the index to pick the element at that position in another column
from pyspark.sql.functions import udf, col
from pyspark.sql.types import FloatType
firstelement = udf(lambda k, v: float(v[int(k)]), FloatType())
df.withColumn('value', firstelement(col('label'), col('data'))).show(4, truncate=False)
+-----+---------+-----+
|label|data |value|
+-----+---------+-----+
|1 |[1, 2, 3]|2.0 |
|2 |[4, 5, 6]|6.0 |
+-----+---------+-----+
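If a udf is not desired, the same lookup can be written as a SQL expression instead (a sketch; note that array indexing in the expression is 0-based, matching the udf above):
from pyspark.sql.functions import expr
df.withColumn('value', expr('data[label]')).show()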
3. Split a single list column into multiple columns
df = spark.createDataFrame(sc.parallelize([['a', [1,2,3]], ['b', [2,3,4]]]), ["key", "value"])
df.printSchema()
df.show()
root
|-- key: string (nullable = true)
|-- value: array (nullable = true)
| |-- element: long (containsNull = true)
+---+-------+
|key| value|
+---+-------+
| a|[1,2,3]|
| b|[2,3,4]|
+---+-------+
You can access the values like you would in Python, using []:
df.select("key", df.value[0], df.value[1], df.value[2]).show()
+---+--------+--------+--------+
|key|value[0]|value[1]|value[2]|
+---+--------+--------+--------+
| a| 1| 2| 3|
| b| 2| 3| 4|
+---+--------+--------+--------+
- If it is of type StructType() (maybe you built your DataFrame by reading JSON):
df2 = df.select("key", psf.struct(
df.value[0].alias("value1"),
df.value[1].alias("value2"),
df.value[2].alias("value3")
).alias("value"))
df2.printSchema()
df2.show()
root
|-- key: string (nullable = true)
|-- value: struct (nullable = false)
| |-- value1: long (nullable = true)
| |-- value2: long (nullable = true)
| |-- value3: long (nullable = true)
+---+-------+
|key| value|
+---+-------+
| a|[1,2,3]|
| b|[2,3,4]|
+---+-------+
You can directly 'split' the column using *:
df2.select('key', 'value.*').show()
+---+------+------+------+
|key|value1|value2|value3|
+---+------+------+------+
| a| 1| 2| 3|
| b| 2| 3| 4|
+---+------+------+------+
Another option is to split the array column with expr() in a list comprehension:
from pyspark.sql.types import *  # Needed to define DataFrame Schema.
from pyspark.sql.functions import expr
# Define schema to create DataFrame with an array typed column.
mySchema = StructType([StructField("V1", StringType(), True),
                       StructField("V2", ArrayType(IntegerType(), True))])
df = spark.createDataFrame([['A', [1, 2, 3, 4, 5, 6, 7]],
                            ['B', [8, 7, 6, 5, 4, 3, 2]]], schema=mySchema)
# Split list into columns using 'expr()' in a comprehension list.
arr_size = 7
df = df.select(['V1', 'V2']+[expr('V2[' + str(x) + ']') for x in range(0, arr_size)])
# It is possible to define new column names.
new_colnames = ['V1', 'V2'] + ['val_' + str(i) for i in range(0, arr_size)]
df = df.toDF(*new_colnames)
4. Get the top n records for each group
rdd = sc.parallelize([("user_1", "object_1", 3),
                      ("user_1", "object_2", 2),
                      ("user_2", "object_1", 5),
                      ("user_2", "object_2", 2),
                      ("user_2", "object_2", 6)])
df = spark.createDataFrame(rdd, ["user_id", "object_id", "score"])
from pyspark.sql.window import Window
from pyspark.sql.functions import rank, col
window = Window.partitionBy(df['user_id']).orderBy(df['score'].desc())
df.select('*', rank().over(window).alias('rank')) \
  .filter(col('rank') <= 2) \
  .show()
#+-------+---------+-----+----+
#|user_id|object_id|score|rank|
#+-------+---------+-----+----+
#| user_1| object_1| 3| 1|
#| user_1| object_2| 2| 2|
#| user_2| object_2| 6| 1|
#| user_2| object_1| 5| 2|
#+-------+---------+-----+----+
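Note that rank() can return more than n rows per group when scores tie; if exactly n rows per group are required, row_number() is the safer choice (a sketch reusing the same window):
from pyspark.sql.functions import row_number
df.select('*', row_number().over(window).alias('rn')) \
  .filter(col('rn') <= 2) \
  .show()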
5. Convert a string column to a date
import pyspark.sql.functions as F
from pyspark.sql.functions import min, max
from pyspark.sql.types import DateType
df = spark.createDataFrame([
    "2017-01-01", "2018-02-08", "2019-01-03", "2019-01-01"], "string"
).selectExpr("CAST(value AS date) AS date")
# or, equivalently
df = spark.createDataFrame([
    "2017-01-01", "2018-02-08", "2019-01-03", "2019-01-01"], "string"
).withColumn('date', F.col('value').cast(DateType()))
min_date, max_date = df.select(min("date"), max("date")).first()
min_date, max_date
# (datetime.date(2017, 1, 1), datetime.date(2019, 1, 3))
6. Detect missing values
from functools import reduce
import pyspark.sql.functions as F
d1 = spark.createDataFrame([(10, 'a', None), (20, 'b', 3), (30, 'c', 4),
                            (20, 'b', 5), (30, 'd', 6), (40, None, 7),
                            (None, 'e', 8)], ['value', 'key', 'v2'])
d1 = d1.select('key', 'value', 'v2')
# keep rows where any column is null
d1.where(reduce(lambda x, y: x | y, (F.col(x).isNull() for x in d1.columns))).show()
# or, written out explicitly
d1.where(F.col('key').isNull() | F.col('value').isNull() | F.col('v2').isNull()).show()
# +----+-----+----+
# | key|value| v2|
# +----+-----+----+
# | a| 10|null|
# |null| 40| 7|
# | e| null| 8|
# +----+-----+----+
Going a step further, also check for NaN and empty strings:
import numpy as np
d1 = spark.createDataFrame([(10, 'a', None), (20, 'b', 3.), (30, 'c', 4.),
                            (20, 'b', 5.), (30, 'd', np.nan), (40, None, 7.),
                            (None, 'e', 8.), (50, '', 8.)], ['value', 'key', 'v2'])
d1 = d1.select('key', 'value', 'v2')
d1.show()
# +----+-----+----+
# | key|value| v2|
# +----+-----+----+
# | a| 10|null|
# | b| 20| 3.0|
# | c| 30| 4.0|
# | b| 20| 5.0|
# | d| 30| NaN|
# |null| 40| 7.0|
# | e| null| 8.0|
# | | 50| 8.0|
# +----+-----+----+
# keep rows whose key is neither null nor empty and whose v2 is not NaN
# (a row with a null v2 still passes, because isnan(null) evaluates to false)
d1.where((F.col('key').isNotNull()) & (F.col('key') != '') & (~F.isnan(F.col('v2')))).show()
# +---+-----+----+
# |key|value| v2|
# +---+-----+----+
# | a| 10|null|
# | b| 20| 3.0|
# | c| 30| 4.0|
# | b| 20| 5.0|
# | e| null| 8.0|
# +---+-----+----+
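To get an overview instead of the offending rows, the number of missing values per column can be counted in one pass (a sketch; the NaN check is applied only to the numeric column v2):
d1.select([F.count(F.when(F.col(c).isNull(), c)).alias(c + '_nulls') for c in d1.columns] +
          [F.count(F.when(F.isnan('v2'), 'v2')).alias('v2_nans')]).show()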
7. Fill gaps in a non-continuous time series
import random
import numpy as np
import pandas as pd
data = {'readtime' : pd.date_range(start='1/15/2018', end='02/14/2018', freq='D')\
.append(pd.date_range(start='1/15/2018', end='02/14/2018', freq='D')),
'house' : ['house1' for i in range(31)] + ['house2' for i in range(31)],
'readvalue': [0.5+0.5*np.sin(2*np.pi/30*i) for i in range(31)]\
+ [0.5+0.5*np.cos(2*np.pi/30*i) for i in range(31)]}
df0 = pd.DataFrame(data, columns = ['readtime', 'house', 'readvalue'])
random.seed(42)
df0 = df0.drop(random.sample(range(df0.shape[0]), k=int(df0.shape[0]/2)))
df0.head()
readtime house readvalue
2 2018-01-17 house1 0.703368
5 2018-01-20 house1 0.933013
7 2018-01-22 house1 0.997261
8 2018-01-23 house1 0.997261
14 2018-01-29 house1 0.603956
import pyspark.sql.functions as func
from pyspark.sql.functions import col
df = spark.createDataFrame(df0)
df = df.withColumn("readtime", col('readtime')/1e9)\
.withColumn("readtime_existent", col("readtime"))
df.show(10, False)
+-----------+------+--------------------+-----------------+
|readtime |house |readvalue |readtime_existent|
+-----------+------+--------------------+-----------------+
|1.5161472E9|house1|0.7033683215379001 |1.5161472E9 |
|1.5164064E9|house1|0.9330127018922193 |1.5164064E9 |
|1.5165792E9|house1|0.9972609476841366 |1.5165792E9 |
|1.5166656E9|house1|0.9972609476841368 |1.5166656E9 |
|1.517184E9 |house1|0.6039558454088798 |1.517184E9 |
|1.5172704E9|house1|0.5000000000000001 |1.5172704E9 |
|1.5174432E9|house1|0.2966316784621001 |1.5174432E9 |
|1.5175296E9|house1|0.2061073738537635 |1.5175296E9 |
|1.5177024E9|house1|0.06698729810778081 |1.5177024E9 |
|1.5177888E9|house1|0.024471741852423234|1.5177888E9 |
+-----------+------+--------------------+-----------------+
only showing top 10 rows
Filling the time gaps takes three steps:
- In the first step, we group the data by ‘house’ and generate an array containing an equally spaced time grid for each house.
- In the second step, we create one row for each element of the arrays by using the Spark SQL function explode().
- In the third step, the resulting structure is used as a basis to which the existing read-value information is joined using a left outer join.
from pyspark.sql.types import *
# define function to create date range
def date_range(t1, t2, step=60*60*24):
    """Return a list of equally spaced points between t1 and t2 with stepsize step."""
    return [t1 + step*x for x in range(int((t2-t1)/step)+1)]
# define udf
date_range_udf = func.udf(date_range, ArrayType(LongType()))
# obtain min and max of time period for each house
df_base = df.groupBy('house')\
.agg(func.min('readtime').cast('integer').alias('readtime_min'),
func.max('readtime').cast('integer').alias('readtime_max'))
# generate timegrid and explode
df_base = df_base.withColumn("readtime", func.explode(date_range_udf("readtime_min", "readtime_max")))\
.drop('readtime_min', 'readtime_max')
# left outer join existing read values
df_all_dates = df_base.join(df, ["house", "readtime"], "leftouter")
tmp = df_all_dates.withColumn('readtime', func.from_unixtime(col('readtime')))
tmp.orderBy('house','readtime').show(20, False)
+------+-------------------+--------------------+-----------------+
|house |readtime |readvalue |readtime_existent|
+------+-------------------+--------------------+-----------------+
|house1|2018-01-17 08:00:00|0.7033683215379001 |1.5161472E9 |
|house1|2018-01-18 08:00:00|null |null |
|house1|2018-01-19 08:00:00|null |null |
|house1|2018-01-20 08:00:00|0.9330127018922193 |1.5164064E9 |
|house1|2018-01-21 08:00:00|null |null |
|house1|2018-01-22 08:00:00|0.9972609476841366 |1.5165792E9 |
|house1|2018-01-23 08:00:00|0.9972609476841368 |1.5166656E9 |
|house1|2018-01-24 08:00:00|null |null |
|house1|2018-01-25 08:00:00|null |null |
|house1|2018-01-26 08:00:00|null |null |
|house1|2018-01-27 08:00:00|null |null |
|house1|2018-01-28 08:00:00|null |null |
|house1|2018-01-29 08:00:00|0.6039558454088798 |1.517184E9 |
|house1|2018-01-30 08:00:00|0.5000000000000001 |1.5172704E9 |
|house1|2018-01-31 08:00:00|null |null |
|house1|2018-02-01 08:00:00|0.2966316784621001 |1.5174432E9 |
|house1|2018-02-02 08:00:00|0.2061073738537635 |1.5175296E9 |
|house1|2018-02-03 08:00:00|null |null |
|house1|2018-02-04 08:00:00|0.06698729810778081 |1.5177024E9 |
|house1|2018-02-05 08:00:00|0.024471741852423234|1.5177888E9 |
+------+-------------------+--------------------+-----------------+
only showing top 20 rows
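A possible next step, not shown in the original snippet, is to forward-fill the null readings with the last known value per house (a sketch using last with ignorenulls):
from pyspark.sql import Window
w = Window.partitionBy('house').orderBy('readtime') \
          .rowsBetween(Window.unboundedPreceding, 0)
df_filled = df_all_dates.withColumn('readvalue_ffill',
                                    func.last('readvalue', ignorenulls=True).over(w))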
8. Difference between the current row and the previous row
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql import functions as F
from pyspark.sql.window import Window
sc = SparkContext(appName="PrevRowDiffApp")
sqlc = SQLContext(sc)
rdd = sc.parallelize([(1, 65), (2, 66), (3, 65), (4, 68), (5, 71)])
df = sqlc.createDataFrame(rdd, ["id", "value"])
my_window = Window.partitionBy().orderBy("id")
df = df.withColumn("prev_value", F.lag(df.value).over(my_window))
df = df.withColumn("diff", F.when(F.isnull(df.value - df.prev_value), 0)
.otherwise(df.value - df.prev_value))
df.show()
+---+-----+----------+----+
| id|value|prev_value|diff|
+---+-----+----------+----+
| 1| 65| null| 0|
| 2| 66| 65| 1|
| 3| 65| 66| -1|
| 4| 68| 65| 3|
| 5| 71| 68| 3|
+---+-----+----------+----+
9. How can a udf take extra parameters, e.g. to compute different quantiles of a list column?
import numpy as np
import pyspark.sql.functions as F
from pyspark.sql.types import DoubleType

def quantile_udf(q=0.5):
    """Return a udf that computes the q-th quantile of an array column."""
    def quantile_(arr):
        return float(np.quantile(arr, q))
    return F.udf(quantile_, DoubleType())
history_stat = history_stat.withColumn('quantile_025', quantile_udf(0.25)(F.col('date_diff_list')))\
.withColumn('quantile_05', quantile_udf(0.5)(F.col('date_diff_list')))\
.withColumn('quantile_075', quantile_udf(0.75)(F.col('date_diff_list')))
history_stat.show(2)
+---------------+-----------------+----------+----------+----------+------------------+-------------------+--------------------+------------+-----------+------------+
|cust_store_dkey|cust_product_dkey| first_day| last_day|sale_times|max_sales_interval|mean_sales_interval| date_diff_list|quantile_025|quantile_05|quantile_075|
+---------------+-----------------+----------+----------+----------+------------------+-------------------+--------------------+------------+-----------+------------+
| 560| 211|2017-12-10|2019-03-08| 180| 16| 2.5166666666666666|[0, 11, 4, 11, 1,...| 1.0| 1.0| 3.0|
| 560| 990|2016-12-30|2017-03-17| 20| 26| 3.85|[0, 1, 1, 1, 2, 3...| 1.0| 2.0| 4.0|
+---------------+-----------------+----------+----------+----------+------------------+-------------------+--------------------+------------+-----------+------------+
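On Spark 3.x with PyArrow available, the same idea can be written as a vectorized pandas_udf, which is usually faster than a row-at-a-time udf (a sketch, not the original author's code):
import pandas as pd
import numpy as np
from pyspark.sql.functions import pandas_udf
from pyspark.sql.types import DoubleType

def quantile_pandas_udf(q=0.5):
    @pandas_udf(DoubleType())
    def quantile_(arrs: pd.Series) -> pd.Series:
        return arrs.apply(lambda arr: float(np.quantile(arr, q)))
    return quantile_

history_stat = history_stat.withColumn('quantile_05', quantile_pandas_udf(0.5)('date_diff_list'))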
10. Replace values that satisfy a condition
import numpy as np
df = spark.createDataFrame(
[(1, 1, None),
(1, 2, float(5)),
(1, 3, np.nan),
(1, 4, None),
(0, 5, float(10)),
(1, 6, float('nan')),
(0, 6, float('nan'))],
('session', "timestamp1", "id2"))
+-------+----------+----+
|session|timestamp1| id2|
+-------+----------+----+
| 1| 1|null|
| 1| 2| 5.0|
| 1| 3| NaN|
| 1| 4|null|
| 0| 5|10.0|
| 1| 6| NaN|
| 0| 6| NaN|
+-------+----------+----+
from pyspark.sql.functions import when
targetDf = df.withColumn("timestamp1", \
when(df["session"] == 0, 999).otherwise(df["timestamp1"]))
11. How to convert a sparse vector into an array?
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, FloatType
vector_udf = udf(lambda vector: vector.toArray().tolist(), ArrayType(FloatType()))
df = df.withColumn('col1', vector_udf('col2'))
Note that the tolist() call inside the udf is required, because Spark has no np.array type. Similarly, when a udf returns a value with a numpy dtype, it must be converted with float or int first.
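For instance, a udf that returns a numpy scalar fails at runtime unless the result is wrapped in float (a hypothetical example, column names as above):
mean_udf = udf(lambda vector: float(vector.toArray().mean()), FloatType())
df = df.withColumn('col2_mean', mean_udf('col2'))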
12. How to convert a pyspark sparse vector into a scipy sparse matrix and a pytorch sparse tensor?
First, convert a Spark sparse vector to a scipy csr matrix:
import numpy as np
from scipy.sparse import csr_matrix, vstack
import torch

def as_matrix(vec):
    """Turn one Spark sparse vector into a 1 x vec.size scipy csr matrix."""
    data, indices = vec.values, vec.indices
    shape = 1, vec.size
    return csr_matrix((data, indices, np.array([0, vec.values.size])), shape)

# cv_cols are the sparse-vector columns produced by CountVectorizer in Spark;
# csr_cols are the names of the new columns that will hold the csr matrices
train_pd[csr_cols] = train_pd[cv_cols].applymap(lambda x: as_matrix(x))
# The code above converts the Spark sparse vector in each row into a scipy csr matrix.
# The code below stacks all row-level csr matrices of a column into one big csr matrix.
csr_col1 = vstack(train_pd['csr_col1'])
The code above turns the sparse vectors into a scipy sparse matrix, specifically a scipy csr matrix.
Next, convert the scipy csr matrix into a pytorch sparse tensor.
def sparse2tensor(tmpdf):
    """
    tmpdf is a scipy csr matrix, e.g. csr_col1 obtained above.
    """
    tmpdf_coo = vstack(tmpdf).tocoo()
    # torch.Size below makes sure the resulting sparse tensor keeps the same shape
    sptensor = torch.sparse.FloatTensor(torch.LongTensor([tmpdf_coo.row.tolist(), tmpdf_coo.col.tolist()]),
                                        torch.FloatTensor(tmpdf_coo.data), torch.Size(tmpdf_coo.shape))
    return sptensor
spt = sparse2tensor(csr_col1)
When the data is very high-dimensional and sparse, using sparse matrices/tensors drastically reduces memory usage, which makes this a very practical technique.
13. Sum the values of a sparse vector
def sum_vector(vector):
    """Sum the stored values of a sparse vector."""
    return float(vector.values.sum())
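To use it, wrap it as a udf and apply it to a vector column (a sketch; 'features' is a hypothetical column name):
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType
sum_vector_udf = udf(sum_vector, DoubleType())
df = df.withColumn('vector_sum', sum_vector_udf('features'))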
14. Convert a bigint to a timestamp
from pyspark.sql import functions as Func
from pyspark.sql.functions import col
# cast to double (not float) so the millisecond epoch keeps its precision
df = df.withColumn('ttl2',
                   Func.from_unixtime(col("ttl").cast('double') / 1000.0, 'yyyyMMdd'))
15. string2index (label encoding) with a huge number of categories
For label encoding, StringIndexer is the usual tool. But when the number of categories is huge, on the order of tens of millions to hundreds of millions (for example when encoding the IDs for matrix factorization in a recommender system), StringIndexer can easily run out of memory, because all those label strings have to be serialized and broadcast, which requires a lot of memory. In that case we need a different way to do the encoding.
One approach borrows from window functions: order a window by the column to be encoded and give every distinct value one number with dense_rank:
# The original data:
id | col |
1  | a   |
2  | a   |
3  | b   |
4  | c   |
5  | c   |
from pyspark.sql import window as W
import pyspark.sql.functions as f
df.select('id', f.dense_rank().over(W.Window.orderBy('col')).alias('group')).show(truncate=False)
# The result:
+---+-----+
|id |group|
+---+-----+
|1 |1 |
|2 |1 |
|3 |2 |
|4 |3 |
|5 |3 |
+---+-----+
Another approach uses the window function row_number: first give each distinct value a code, then join it back:
df2 = data.select("col").distinct()
.withColumn("group", row_number().over(Window.orderBy("col")))
result = data.join(df2, on="col", how="inner")