Sparkling Water

2 Sparkling Water provides a way to initialize H2O services on each node in the Spark cluster and to access data stored in the data structures of both Spark and H2O.
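
As a quick illustration of that bridge, here is a minimal sketch (the app name, the frame name "demo_frame", and the toy data are placeholders of mine) that starts an H2OContext on top of a Spark session and moves a small DataFrame into H2O and back:

from pyspark.sql import SparkSession
from pysparkling import *

spark = SparkSession.builder.appName("sw-bridge-demo").getOrCreate()
hc = H2OContext.getOrCreate(spark)          # starts H2O services on the Spark executors

# Spark DataFrame -> H2OFrame, and back again
df = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "label"])
hf = hc.as_h2o_frame(df, "demo_frame")      # now visible to H2O algorithms
df_back = hc.as_spark_frame(hf)             # and queryable again from Spark SQL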

3 The internal backend is the easiest to deploy; however, when Spark or YARN kills an executor - which is not an unusual case - the entire H2O cluster goes down, because H2O does not support high availability.

4 The internal backend is the default behavior for Sparkling Water. The backend type can also be changed by calling the setExternalClusterMode() or setInternalClusterMode() method on the H2OConf class. H2OConf is a simple wrapper around SparkConf and inherits all properties of the Spark configuration.
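
A hedged sketch of picking the backend through H2OConf might look like the following. The camelCase method names mirror the ones quoted above; some PySparkling releases expose snake_case equivalents (e.g. set_internal_cluster_mode), so adjust to the version you installed:

from pyspark.sql import SparkSession
from pysparkling import *

spark = SparkSession.builder.appName("backend-demo").getOrCreate()

conf = H2OConf(spark)                 # wraps SparkConf and inherits the Spark properties
conf.setInternalClusterMode()         # the default: H2O runs inside the Spark executors
# conf.setExternalClusterMode()       # alternative: H2O runs as a separate cluster

hc = H2OContext.getOrCreate(spark, conf)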

5 Installing Sparkling Water seems to install pyspark and H2O along with it: pip install h2o_pysparkling_2.3

=======================

1 Start Spark:  ./sbin/start-master.sh      ./sbin/start-slave.sh spark://zcy-VirtualBox:7077

2 First run a very simple script to check whether the environment is ready. For it to run successfully, the virtual machine's memory needs to be increased (I changed it to 2 GB).


 

from pysparkling import *
from pyspark.sql import SparkSession
import h2o

# Initiate SparkSession
spark = SparkSession.builder.appName("App name").getOrCreate()

# Initiate H2OContext
hc = H2OContext.getOrCreate(spark)

# Stop H2O and Spark services
h2o.cluster().shutdown()
spark.stop()
print("111111111111")

 ./bin/spark-submit --master spark://zcy-VirtualBox:7077  --conf "spark.executor.memory=1g" /home/zcy/working/tst.py

The output is as follows:

[output screenshot]


3 Run a slightly more complex script:

 

import h2o
from datetime import datetime

from pyspark import SparkConf, SparkFiles
from pyspark.sql import Row, SparkSession
import os
from pysparkling import *

# Refine date column
def refine_date_col(data, col):
    data["Day"] = data[col].day()
    data["Month"] = data[col].month()
    data["Year"] = data[col].year()
    data["WeekNum"] = data[col].week()
    data["WeekDay"] = data[col].dayOfWeek()
    data["HourOfDay"] = data[col].hour()
    
    # Create weekend and season cols
    # Spring = Mar, Apr, May. Summer = Jun, Jul, Aug. Autumn = Sep, Oct. Winter = Nov, Dec, Jan, Feb.
    # data["Weekend"]   = [1 if x in ("Sun", "Sat") else 0 for x in data["WeekDay"]]
    data["Weekend"] = ((data["WeekDay"] == "Sun") | (data["WeekDay"] == "Sat"))
    data["Season"] = data["Month"].cut([0, 2, 5, 7, 10, 12], ["Winter", "Spring", "Summer", "Autumn", "Winter"])


# This is just helper function returning path to data-files
def _locate(file_name):
    if os.path.isfile("/home/zcy/working/data_tst/" + file_name):
        return "/home/zcy/working/data_tst/" + file_name
    else:
        print("eeeeeeeeeeee")  # data file not found


spark = SparkSession.builder.appName("ChicagoCrimeTest").getOrCreate()
# Start H2O services
h2oContext = H2OContext.getOrCreate(spark)
# Define file names
chicagoAllWeather = "chicagoAllWeather.csv"
chicagoCensus = "chicagoCensus.csv"
chicagoCrimes10k = "chicagoCrimes10k.csv.zip"


# h2o.import_file expects a path visible to the H2O cluster, so upload the local files instead
f_weather = h2o.upload_file(_locate(chicagoAllWeather))
f_census = h2o.upload_file(_locate(chicagoCensus))
f_crimes = h2o.upload_file(_locate(chicagoCrimes10k))
print("111111111111")

# Transform weather table
# Remove 1st column (date)
f_weather = f_weather[1:]

# Transform census table
# Remove all spaces from column names (causing problems in Spark SQL)
col_names = list(map(lambda s: s.strip().replace(' ', '_').replace('+', '_'), f_census.col_names))

# Update column names in the table
# f_weather.names = col_names
f_census.names = col_names


# Transform crimes table
# Drop useless columns
f_crimes = f_crimes[2:]

# Set time zone to UTC for date manipulation
h2o.cluster().timezone = "Etc/UTC"

# Replace ' ' by '_' in column names
col_names = list(map(lambda s: s.replace(' ', '_'), f_crimes.col_names))
f_crimes.names = col_names
refine_date_col(f_crimes, "Date")
f_crimes = f_crimes.drop("Date")

# Expose H2O frames as Spark DataFrame
print("22222222222222")
df_weather = h2oContext.as_spark_frame(f_weather)
df_census = h2oContext.as_spark_frame(f_census)
df_crimes = h2oContext.as_spark_frame(f_crimes)

# Register DataFrames as tables
df_weather.createOrReplaceTempView("chicagoWeather")
df_census.createOrReplaceTempView("chicagoCensus")
df_crimes.createOrReplaceTempView("chicagoCrime")

crimeWithWeather = spark.sql("""SELECT
a.Year, a.Month, a.Day, a.WeekNum, a.HourOfDay, a.Weekend, a.Season, a.WeekDay,
a.IUCR, a.Primary_Type, a.Location_Description, a.Community_Area, a.District,
a.Arrest, a.Domestic, a.Beat, a.Ward, a.FBI_Code,
b.minTemp, b.maxTemp, b.meanTemp,
c.PERCENT_AGED_UNDER_18_OR_OVER_64, c.PER_CAPITA_INCOME, c.HARDSHIP_INDEX,
c.PERCENT_OF_HOUSING_CROWDED, c.PERCENT_HOUSEHOLDS_BELOW_POVERTY,
c.PERCENT_AGED_16__UNEMPLOYED, c.PERCENT_AGED_25__WITHOUT_HIGH_SCHOOL_DIPLOMA
FROM chicagoCrime a
JOIN chicagoWeather b
ON a.Year = b.year AND a.Month = b.month AND a.Day = b.day
JOIN chicagoCensus c
ON a.Community_Area = c.Community_Area_Number""")

# Publish Spark DataFrame as H2OFrame with given name
crimeWithWeatherHF = h2oContext.as_h2o_frame(crimeWithWeather, "crimeWithWeatherTable")
print("3333333333333333333")
# Transform selected String columns to categoricals
cat_cols = ["Arrest", "Season", "WeekDay", "Primary_Type", "Location_Description", "Domestic"]
for col in cat_cols:
    crimeWithWeatherHF[col] = crimeWithWeatherHF[col].asfactor()
    
# Split frame into two - we use one as the training frame and the second one as the validation frame
splits = crimeWithWeatherHF.split_frame(ratios=[0.8])
train = splits[0]
test = splits[1]
print("4444444444444444")
h2o.download_csv(train, '/home/zcy/working/data_tst/ret/train.csv')
h2o.download_csv(test, '/home/zcy/working/data_tst/ret/test.csv')

# stop H2O and Spark services
h2o.cluster().shutdown()
spark.stop()


4 Run the script:

./bin/spark-submit --master spark://zcy-VirtualBox:7077  --conf "spark.executor.memory=1g" /home/zcy/working/sparkH2O.py

[output screenshots]