Spark SQL:内置函数以及每日uv

java版本:

package cn.spark.study.sql;


import java.util.Arrays;
import java.util.List;


import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;


/**
 * Spark SQL:内置函数以及每日uv
 */


import scala.Tuple2;


public class DailyUV {


	public static void main(String[] args) {
		// 创建SparkConf,本地运行
		SparkConf conf = new SparkConf()
				.setMaster("local")
				.setAppName("DailyUV");
		
		// 创建JavaSparkContext
		JavaSparkContext sc = new JavaSparkContext(conf);
		SQLContext sqlContext = new SQLContext(sc);
		
	    // 构造用户访问日志数据,并创建DataFrame
	    // 模拟用户访问日志,日志用逗号隔开,第一列是日期,第二列是用户id
		List<String> userAccessLog = Arrays.asList(
				"2015-10-01,1122",
		        "2015-10-01,1122",
		        "2015-10-01,1123",
		        "2015-10-01,1124",
		        "2015-10-01,1124",
		        "2015-10-01,1128",
		        "2015-10-01,1129",
		        "2015-10-02,1122",
		        "2015-10-02,1121",
		        "2015-10-02,1123",
		        "2015-10-02,1125",
		        "2015-10-02,1123");
		
		// 并行化集合创建RDD,要通过并行化集合的方式创建RDD,那么就调用SparkContext以及其子类,的parallelize()方法
		JavaRDD<String> userAccessLogRDD = sc.parallelize(userAccessLog);
		
		// 将RDD数据转换为JavaRDD<Row>形式,为后续转换为DataFrame作准备
		JavaRDD<Tuple2<String, Long>> userAssessLogMapRDD = userAccessLogRDD.map(new Function<String, Tuple2<String, Long>>() {
			private static final long serialVersionUID = 1L;
			@Override
			public Tuple2<String, Long> call(String line) throws Exception {
				return new Tuple2<String, Long>(line.split(",")[0], Long.valueOf(line.split(",")[1]));
			}
		});
		
		// 将RDD数据转换为JavaRDD<Row>形式,为后续转换为DataFrame作准备
		JavaRDD<Row> userAccessLogRowRDD = userAssessLogMapRDD.map(new Function<Tuple2<String,Long>, Row>() {
			private static final long serialVersionUID = 1L;
			@Override
			public Row call(Tuple2<String, Long> tuple2) throws Exception {
				return RowFactory.create(tuple2._1, tuple2._2);
			}
		});
		
		// 创建元数据类型,为后续转换为DataFrame作准备
		List<StructField> structFields = Arrays.asList(
				DataTypes.createStructField("date", DataTypes.StringType, true),
				DataTypes.createStructField("userID", DataTypes.LongType, true));
		StructType structType = DataTypes.createStructType(structFields);
		
		// 转换为DataFrame
		DataFrame userAccessLogDF = sqlContext.createDataFrame(userAccessLogRowRDD, structType);
		
		// 将DataFrame注册为临时表,便于后面查询数据
		userAccessLogDF.registerTempTable("userAccessLogDF");
		
		// 查询每一天,不同用户登录次数
		DataFrame everyDayUVDF = sqlContext.sql(
				"SELECT date, COUNT(DISTINCT(userID)) AS count FROM userAccessLogDF GROUP BY date");
		
		// 将DataFrame转换为JavaRDD,为后续遍历数据做准备
		JavaRDD<Tuple2<String, Long>> userAccessLogSumRDD = 
			everyDayUVDF.javaRDD().map(new Function<Row, Tuple2<String, Long>>() {
				private static final long serialVersionUID = 1L;
				@Override
				public Tuple2<String, Long> call(Row row) throws Exception {
					return new Tuple2<String, Long>(row.getString(0), row.getLong(1));
			}
		});
		
		// 遍历计算后的结果数据
		userAccessLogSumRDD.foreach(new VoidFunction<Tuple2<String,Long>>() {
			private static final long serialVersionUID = 1L;
			@Override
			public void call(Tuple2<String, Long> tt) throws Exception {
				System.out.println(tt._1.toString() + " login: " + tt._2.toString() + " times");
			}
		});
		
		// 关闭SparkContext
		sc.close();
	}
}

scala版本:

package cn.spark.study.sql


import org.apache.spark.SparkConf;
import org.apache.spark.SparkContext;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.types.StructType;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StringType;
import org.apache.spark.sql.types.IntegerType;
// 手动导入一个函数
import org.apache.spark.sql.functions._


/**
 * 83讲,SparkSQL内置函数以及每日UV案例实战
 */
object DailyUV {
  def main(args:Array[String])
  {
    val conf = new SparkConf()
      .setMaster("local")
      .setAppName("DailyUV")
      
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc);
    
    // 这里着重说明一下!!!
    // 要使用SparkSQL的内置函数,就必须在这里导入SQLContext下的隐式转换
    import sqlContext.implicits._
    
    // 构造用户访问日志数据,并创建DataFrame
    // 模拟用户访问日志,日志用逗号隔开,第一列是日期,第二列是用户id
    val userAccessLog = Array(
        "2015-10-01,1122",
        "2015-10-01,1122",
        "2015-10-01,1123",
        "2015-10-01,1124",
        "2015-10-01,1124",
        "2015-10-02,1122",
        "2015-10-02,1121",
        "2015-10-02,1123",
        "2015-10-02,1125",
        "2015-10-02,1123");
    val userAccessLogRDD = sc.parallelize(userAccessLog, 5);
    
    // 将模拟出来的用户访问日志RDD,转换为DataFrame
    // 首先,将普通的RDD,转换为元素为Row的RDD
    val userAccessLogRowRDD = userAccessLogRDD
      .map{log => Row(log.split(",")(0), log.split(",")(1).toInt)}
    
    // 构造DataFrame的元数据
    val structType = StructType(Array(
        StructField("date", StringType, true),
        StructField("userid", IntegerType, true)))
        
    // 使用SQLContext创建DataFrame
    val userAccessLogRowDF = sqlContext.createDataFrame(userAccessLogRowRDD, structType)
    
    // 这里讲解一下uv的基本含义和业务
    // 每天都有很多用户来访问,但是每个用户可能每天都会该问很多次
    // 所以,uv, 指的是,对用户进行去重以后的访问总数
    
    // 这里,正式开始使用Spark 1.5.x版本提供的最新特性,内置函数,countDistinct
    // 讲解一下聚合函数的用法
    // 首先,对DataFrame调用groupBy()方法,对某一列进行分组
    // 然后,调用agg()方法,第一个参数,必须,必须,传入之前在groupBy()方法中出现的字段
    // 第二个参数,传入countDistinct、sum、first天天副,Spark提供的内置函数
    // 内置函数中,传入的参数,也是用单引号作为前缀的,其他的字段
    userAccessLogRowDF.groupBy("date")
      .agg('date, countDistinct('userid))
      .map{row => Row(row(1), row(2))}
      .collect()
      .foreach(println)
  }
}