1. read files
# define schema
from pyspark.sql.types import StructType,StructField
from pyspark.sql.types import DoubleType,StringType,IntegerType
schema = StructType([
    StructField('x1', StringType(), True),
    StructField('x2', DoubleType(), True)
])
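The same schema can also be supplied as a DDL-formatted string, which spark.read.schema() accepts as well; this is a more compact equivalent of the StructType above:
# equivalent schema written as a DDL string
schema_ddl = 'x1 STRING, x2 DOUBLE'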
# read csv
sel_col = ['x1']
xs = spark.read.schema(schema)\
    .option('header', 'false')\
    .csv(path.format(s3_bucket), sep='\t')\
    .select(*sel_col)
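A quick sanity check after the read (assuming the load above succeeded) confirms the selected column and its type:
# verify the schema and preview a few rows
xs.printSchema()
xs.show(5, truncate=False)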
2. add columns
from pyspark.sql.window import Window as W
from pyspark.sql import functions as F
# define a window spec for row_number (ordering by 'x1' as an example)
windowSpec = W.orderBy('x1')
# add columns
df = df.withColumn('new_col', F.monotonically_increasing_id())\
    .withColumn('row_number', F.row_number().over(windowSpec))
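A window spec can also be partitioned so the row number restarts within each group; the sketch below assumes 'x1' as the grouping key and 'x2' as the ordering key (the example columns from section 1):
# row_number restarts for each distinct value of x1, ordered by x2
grouped_window = W.partitionBy('x1').orderBy('x2')
df = df.withColumn('rank_in_group', F.row_number().over(grouped_window))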
# rename columns
df = df.withColumnRenamed('x1', 'newname')
# sort by a column, then drop it (assign the result: transformations return a new DataFrame)
df = df.sort('x1').drop('x1')
3. join tables
dfx = df1.join(df2, df1.col1 == df2.col2, how='inner')
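The same pattern covers other join types ('left', 'right', 'full', 'left_anti', ...); joining on a shared column name keeps a single copy of the key column. The 'key' column below is only illustrative:
# left join on a common column name keeps one 'key' column in the result
dfy = df1.join(df2, on='key', how='left')
# anti join: rows of df1 with no match in df2
df_missing = df1.join(df2, on='key', how='left_anti')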