Spark SQL: Built-in Functions and Daily UV
Java version:
package cn.spark.study.sql;
import java.util.Arrays;
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import scala.Tuple2;

/**
 * Spark SQL: built-in functions and daily UV
 */
public class DailyUV {
    public static void main(String[] args) {
        // Create the SparkConf for a local run
        SparkConf conf = new SparkConf()
                .setMaster("local")
                .setAppName("DailyUV");
        // Create the JavaSparkContext and the SQLContext
        JavaSparkContext sc = new JavaSparkContext(conf);
        SQLContext sqlContext = new SQLContext(sc);

        // Build the user access log data and create a DataFrame from it
        // Simulated access log: fields are comma-separated, the first column is the date,
        // the second column is the user id
        List<String> userAccessLog = Arrays.asList(
                "2015-10-01,1122",
                "2015-10-01,1122",
                "2015-10-01,1123",
                "2015-10-01,1124",
                "2015-10-01,1124",
                "2015-10-01,1128",
                "2015-10-01,1129",
                "2015-10-02,1122",
                "2015-10-02,1121",
                "2015-10-02,1123",
                "2015-10-02,1125",
                "2015-10-02,1123");
        // Create an RDD by parallelizing the collection, using the parallelize()
        // method of SparkContext (or one of its subclasses)
        JavaRDD<String> userAccessLogRDD = sc.parallelize(userAccessLog);

        // Split each log line into a (date, userId) tuple
        JavaRDD<Tuple2<String, Long>> userAccessLogMapRDD = userAccessLogRDD.map(
                new Function<String, Tuple2<String, Long>>() {

            private static final long serialVersionUID = 1L;

            @Override
            public Tuple2<String, Long> call(String line) throws Exception {
                return new Tuple2<String, Long>(line.split(",")[0], Long.valueOf(line.split(",")[1]));
            }
        });
        // Convert the tuple RDD into a JavaRDD<Row>, in preparation for creating the DataFrame
        JavaRDD<Row> userAccessLogRowRDD = userAccessLogMapRDD.map(
                new Function<Tuple2<String, Long>, Row>() {

            private static final long serialVersionUID = 1L;

            @Override
            public Row call(Tuple2<String, Long> tuple2) throws Exception {
                return RowFactory.create(tuple2._1, tuple2._2);
            }
        });

        // Define the schema (metadata), in preparation for creating the DataFrame
        List<StructField> structFields = Arrays.asList(
                DataTypes.createStructField("date", DataTypes.StringType, true),
                DataTypes.createStructField("userID", DataTypes.LongType, true));
        StructType structType = DataTypes.createStructType(structFields);

        // Create the DataFrame
        DataFrame userAccessLogDF = sqlContext.createDataFrame(userAccessLogRowRDD, structType);

        // Register the DataFrame as a temporary table so it can be queried later
        userAccessLogDF.registerTempTable("userAccessLogDF");

        // For each day, count the number of distinct users that logged in
        DataFrame everyDayUVDF = sqlContext.sql(
                "SELECT date, COUNT(DISTINCT(userID)) AS count FROM userAccessLogDF GROUP BY date");

        // Convert the DataFrame back to a JavaRDD so the result data can be traversed
        JavaRDD<Tuple2<String, Long>> userAccessLogSumRDD =
                everyDayUVDF.javaRDD().map(new Function<Row, Tuple2<String, Long>>() {

            private static final long serialVersionUID = 1L;

            @Override
            public Tuple2<String, Long> call(Row row) throws Exception {
                return new Tuple2<String, Long>(row.getString(0), row.getLong(1));
            }
        });

        // Print the computed results
        userAccessLogSumRDD.foreach(new VoidFunction<Tuple2<String, Long>>() {

            private static final long serialVersionUID = 1L;

            @Override
            public void call(Tuple2<String, Long> tt) throws Exception {
                System.out.println(tt._1 + " login: " + tt._2 + " times");
            }
        });

        // Close the SparkContext
        sc.close();
    }
}
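The Java version above computes the daily UV through an SQL string with COUNT(DISTINCT ...). Since this lecture is about the built-in functions, here is a minimal, hypothetical sketch (not part of the original lesson) of the same aggregation written against the Spark 1.x DataFrame API with the built-in countDistinct function; it assumes the userAccessLogDF created above and static imports of org.apache.spark.sql.functions.col and countDistinct:

        // Hypothetical alternative to the SQL string above (assumes the Spark 1.x DataFrame API,
        // the userAccessLogDF built earlier, and static imports of functions.col / functions.countDistinct)
        DataFrame everyDayUVDF2 = userAccessLogDF
                .groupBy(col("date"))
                .agg(countDistinct(col("userID")).alias("uv"));
        // Print the per-day distinct user counts
        everyDayUVDF2.show();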
Scala version:
package cn.spark.study.sql
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.types.StructField
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.types.IntegerType
// manually import the built-in functions
import org.apache.spark.sql.functions._
/**
 * Lecture 83: Spark SQL built-in functions and the daily UV example
 */
object DailyUV {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local")
      .setAppName("DailyUV")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // Important note: to use Spark SQL's built-in functions, the implicit
    // conversions under SQLContext must be imported here
    import sqlContext.implicits._

    // Build the user access log data and create a DataFrame from it
    // Simulated access log: fields are comma-separated, the first column is the date,
    // the second column is the user id
    val userAccessLog = Array(
      "2015-10-01,1122",
      "2015-10-01,1122",
      "2015-10-01,1123",
      "2015-10-01,1124",
      "2015-10-01,1124",
      "2015-10-02,1122",
      "2015-10-02,1121",
      "2015-10-02,1123",
      "2015-10-02,1125",
      "2015-10-02,1123")
    val userAccessLogRDD = sc.parallelize(userAccessLog, 5)
    // Convert the simulated access log RDD into a DataFrame.
    // First, turn the plain RDD into an RDD whose elements are Rows
    val userAccessLogRowRDD = userAccessLogRDD
      .map { log => Row(log.split(",")(0), log.split(",")(1).toInt) }

    // Define the DataFrame's schema
    val structType = StructType(Array(
      StructField("date", StringType, true),
      StructField("userid", IntegerType, true)))

    // Create the DataFrame with the SQLContext
    val userAccessLogRowDF = sqlContext.createDataFrame(userAccessLogRowRDD, structType)
    // A quick note on what UV means for the business:
    // many users visit each day, and each user may visit many times a day,
    // so UV (unique visitors) is the number of visits left after de-duplicating users,
    // i.e. how many distinct users visited.
    // Here we start using the built-in functions introduced in Spark 1.5.x,
    // in this case countDistinct.
    // How the aggregate functions are used:
    // first, call groupBy() on the DataFrame to group by a column;
    // then call agg(): its first argument must be a column that appeared in groupBy(),
    // and the following arguments are built-in functions provided by Spark,
    // such as countDistinct, sum or first.
    // The columns passed to the built-in functions are also referenced with the
    // single-quote prefix
    userAccessLogRowDF.groupBy("date")
      .agg('date, countDistinct('userid))
      .map { row => Row(row(1), row(2)) }
      .collect()
      .foreach(println)
  }
}
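With the sample data above, the Scala job should print two rows, one per day, roughly [2015-10-01,3] and [2015-10-02,4]: 2015-10-01 contains three distinct user ids (1122, 1123, 1124) and 2015-10-02 contains four (1121, 1122, 1123, 1125).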