Spark RDD Action算子的基本使用(Java)

最近在总结Spark RDD相关算子的使用,列出了一些基本使用方法,可供大家参考,快速上手。

package com.edward.spark.core;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.*;
import java.util.*;

import org.apache.spark.api.java.function.Function2;
import scala.Tuple2;

import java.io.Serializable;

/**
 * Runnable tour of the basic Spark RDD action operators through the Java API.
 *
 * <p>Every {@code api_*} method builds a small in-memory RDD on the shared local
 * {@link JavaSparkContext}, runs exactly the action it is named after and prints
 * the result to standard output. {@link #main(String[])} runs all of them in groups.
 */
public class ApiTestAction {
    /** Local single-threaded configuration used by every example. */
    private static final SparkConf conf = new SparkConf()
            .setMaster("local[1]")
            .setAppName("ApiTest");

    /** Context shared by all examples; stopped at the end of {@link #main(String[])}. */
    private static final JavaSparkContext jsc = new JavaSparkContext(conf);

    /**
     * count(): returns the number of elements in the RDD as a {@code long}.
     */
    private static void api_count() {
        List<String> names = Arrays.asList("Edward", "CiCi", "Della", "Mystique");
        JavaRDD<String> nameRdd = jsc.parallelize(names, 2);
        long total = nameRdd.count();
        System.out.println(total);
    }

    /**
     * countByValue(): returns how many times each distinct element occurs,
     * as a {@code Map<value, count>}.
     */
    private static void api_countByValue() {
        List<Integer> numbers = Arrays.asList(1, 2, 3, 4, 5, 11, 23);
        JavaRDD<Integer> numberRdd = jsc.parallelize(numbers, 1);
        Map<Integer, Long> counts = numberRdd.countByValue();
        counts.forEach((k, v) -> System.out.println("key:" + k + " value:" + v));
    }

    /**
     * countByKey(): returns how many elements exist per key, as a {@code Map<key, count>}.
     * Only available on {@link JavaPairRDD}. For comparison, countByValue() on the same
     * pair RDD counts whole (key, value) tuples.
     */
    private static void api_countByKey() {
        List<Tuple2<Integer, String>> pairs = Arrays.asList(
                new Tuple2<>(1, "Edward"), new Tuple2<>(1, "CiCi"), new Tuple2<>(3, "Della"));
        JavaPairRDD<Integer, String> pairRdd = jsc.parallelizePairs(pairs, 3);
        Map<Integer, Long> countsByKey = pairRdd.countByKey();
        Map<Tuple2<Integer, String>, Long> countsByPair = pairRdd.countByValue();
        countsByKey.forEach((k, v) -> System.out.println("key:" + k + " count:" + v));
        countsByPair.forEach((k, v) -> System.out.println("value:" + k + " count:" + v));
    }

    /**
     * max(comparator): returns the largest element according to the given comparator.
     */
    private static void api_max() {
        List<Integer> numbers = Arrays.asList(1, 2, 3, 4, 5, 11, 23);
        List<String> names = Arrays.asList("Edward", "CiCi", "Della", "Mystique");
        JavaRDD<Integer> numberRdd = jsc.parallelize(numbers, 3);
        JavaRDD<String> nameRdd = jsc.parallelize(names, 2);
        String value2 = nameRdd.max(Comparator.naturalOrder());
        int value = numberRdd.max(Comparator.naturalOrder());
        System.out.println("value=" + value);
        System.out.println("value2=" + value2);
    }

    /**
     * min(comparator): returns the smallest element according to the given comparator.
     */
    private static void api_min() {
        List<Integer> numbers = Arrays.asList(1, 2, 3, 4, 5, 11, 23);
        List<String> names = Arrays.asList("Edward", "CiCi", "Della", "Mystique");
        JavaRDD<Integer> numberRdd = jsc.parallelize(numbers, 3);
        JavaRDD<String> nameRdd = jsc.parallelize(names, 2);
        // Both RDDs must use min() here; the string RDD previously called max() by mistake.
        String value2 = nameRdd.min(Comparator.naturalOrder());
        int value = numberRdd.min(Comparator.naturalOrder());
        System.out.println("value=" + value);
        System.out.println("value2=" + value2);
    }

    /**
     * first(): returns the first element of the RDD.
     */
    private static void api_first() {
        List<String> names = Arrays.asList("Edward", "CiCi", "Della", "Mystique");
        JavaRDD<String> nameRdd = jsc.parallelize(names, 2);
        String firstValue = nameRdd.first();
        System.out.println(firstValue);
    }

    /**
     * collect(): returns every element of the RDD as a {@code List<T>}.
     * The whole result is held in driver memory, so use it only for tests
     * or for small data sets.
     */
    private static void api_collect() {
        List<Integer> numbers = Arrays.asList(1, 2, 3, 4, 5, 11, 23);
        JavaRDD<Integer> numberRdd = jsc.parallelize(numbers, 3);
        List<Integer> output = numberRdd.collect();
        System.out.println(output);
    }

    /**
     * collectPartitions(partitionIds): returns the elements of the requested partitions,
     * one {@code List<T>} per requested partition id.
     * Like collect(), the result is held in driver memory.
     */
    private static void api_collectPartitions() {
        List<Integer> numbers = Arrays.asList(1, 2, 3, 4, 5, 11, 23);
        JavaRDD<Integer> numberRdd = jsc.parallelize(numbers, 3);
        // Fetch only the first two of the three partitions.
        List<Integer>[] partitions = numberRdd.collectPartitions(new int[]{0, 1});
        System.out.println(Arrays.toString(partitions));
    }

    /**
     * lookup(key): returns all values stored under the given key as a {@code List<V>}.
     * Only available on {@link JavaPairRDD}.
     */
    private static void api_lookup() {
        List<Tuple2<Integer, String>> pairs = Arrays.asList(
                new Tuple2<>(1, "Edward"), new Tuple2<>(1, "CiCi"), new Tuple2<>(3, "Della"));
        JavaPairRDD<Integer, String> pairRdd = jsc.parallelizePairs(pairs, 3);
        // Run the lookup once and print that result instead of running the job twice.
        List<String> out = pairRdd.lookup(1);
        System.out.println(out);
    }

    /**
     * take(num): returns the first {@code num} elements as a {@code List<T>}.
     * Partitions are scanned one after another, so it can be slow when many
     * partitions have to be visited. The result is held in driver memory,
     * so keep {@code num} small.
     */
    private static void api_take() {
        List<String> names = Arrays.asList("Edward", "CiCi", "Della", "Mystique");
        JavaRDD<String> nameRdd = jsc.parallelize(names, 2);
        List<String> list = nameRdd.take(3);
        System.out.println(list);
    }

    /** Serializable comparator ordering (String, Integer) tuples by their Integer part. */
    private static class TestComparator implements Serializable, Comparator<Tuple2<String, Integer>> {
        @Override
        public int compare(Tuple2<String, Integer> o1, Tuple2<String, Integer> o2) {
            return o1._2.compareTo(o2._2);
        }
    }

    /** Serializable comparator ordering Integers by their natural order. */
    private static class TestComparator2 implements Serializable, Comparator<Integer> {
        @Override
        public int compare(Integer o1, Integer o2) {
            return o1.compareTo(o2);
        }
    }

    /** Serializable comparator ordering Strings by their natural order. */
    private static class TestComparator3 implements Serializable, Comparator<String> {
        @Override
        public int compare(String o1, String o2) {
            return o1.compareTo(o2);
        }
    }

    /**
     * takeOrdered(num[, comparator]): returns the {@code num} smallest elements according
     * to the comparator, or to natural ordering when no comparator is given.
     * The result is held in driver memory, so keep {@code num} small.
     */
    private static void api_takeOrdered() {
        List<Integer> numbers = Arrays.asList(1, 2, 3, 4, 5, 11, 23);
        List<String> names = Arrays.asList("Edward", "CiCi", "Della", "Mystique");
        JavaRDD<Integer> numberRdd = jsc.parallelize(numbers, 3);
        JavaRDD<String> nameRdd = jsc.parallelize(names, 2);
        List<Integer> output = numberRdd.takeOrdered(2);
        System.out.println(output);
        System.out.println(nameRdd.takeOrdered(2, new TestComparator3()));
        System.out.println(numberRdd.takeOrdered(3, new TestComparator2()));
    }

    /**
     * top(num[, comparator]): returns the {@code num} largest elements according to the
     * comparator, or to natural ordering when no comparator is given.
     * The result is held in driver memory, so keep {@code num} small.
     */
    private static void api_top() {
        List<Tuple2<String, Integer>> pairs = Arrays.asList(
                new Tuple2<>("A", 1), new Tuple2<>("A", 2), new Tuple2<>("B", 1));
        JavaPairRDD<String, Integer> pairRdd = jsc.parallelizePairs(pairs, 3);
        List<Tuple2<String, Integer>> result = pairRdd.top(2, new TestComparator());
        System.out.println(result);
    }

    /**
     * takeSample(withReplacement, num[, seed]): returns a random sample of fixed size.
     * <ul>
     *   <li>withReplacement: {@code true} puts each drawn element back, so duplicates are
     *       possible; {@code false} never returns the same element twice.</li>
     *   <li>num: size of the returned sample.</li>
     *   <li>seed: optional seed for the pseudo-random generator; it is omitted here,
     *       so the sample may differ between runs.</li>
     * </ul>
     */
    private static void api_takeSample() {
        List<Integer> numbers = Arrays.asList(1, 2, 3, 4, 5, 11, 23);
        JavaRDD<Integer> numberRdd = jsc.parallelize(numbers, 3);
        List<Integer> out = numberRdd.takeSample(false, 3);
        System.out.println(out);
    }

    /**
     * collectAsMap(): returns the pair RDD as a key-to-value {@code Map}.
     * The sample data contains key 1 twice; a map holds one value per key,
     * so only one of those two values appears in the printed result.
     */
    private static void api_collectAsMap() {
        List<Tuple2<Integer, String>> pairs = Arrays.asList(
                new Tuple2<>(1, "Edward"), new Tuple2<>(1, "CiCi"), new Tuple2<>(3, "Della"));
        JavaPairRDD<Integer, String> pairRdd = jsc.parallelizePairs(pairs, 2);
        Map<Integer, String> map = pairRdd.collectAsMap();
        System.out.println(map);
    }

    /**
     * aggregate(zeroValue, seqOp, combOp): seqOp folds the elements of each partition
     * starting from zeroValue, then combOp merges the per-partition results.
     * The result type may differ from the element type.
     *
     * <p>With the data below split into two partitions [1, 1] and [2, 3], the
     * (sum, count) example runs as:
     * partition 0: (0,0) -> (1,1) -> (2,2); partition 1: (0,0) -> (2,1) -> (5,2);
     * merged: (2+5, 2+2) = (7,4).
     */
    private static void api_aggregate() {
        List<Integer> numbers = Arrays.asList(1, 1, 2, 3);
        JavaRDD<Integer> numberRdd = jsc.parallelize(numbers, 2);
        Tuple2<Integer, Integer> result = numberRdd.aggregate(new Tuple2<>(0, 0),
                (x, y) -> new Tuple2<>(x._1 + y, x._2 + 1),
                (x, y) -> new Tuple2<>(x._1 + y._1, x._2 + y._2));
        System.out.println(result);
        int result2 = numberRdd.aggregate(0, (x, y) -> (x + y), (x, y) -> (x + y));
        System.out.println(result2);
    }

    /**
     * fold(zeroValue, op): aggregates the elements of every partition and then the
     * partition results with the same function.
     * Unlike aggregate, the result type must equal the element type.
     */
    private static void api_fold() {
        List<Integer> numbers = Arrays.asList(1, 1, 2, 3);
        JavaRDD<Integer> numberRdd = jsc.parallelize(numbers, 2);
        int result = numberRdd.fold(0, (x, y) -> (x + y));
        System.out.println(result);
    }

    /**
     * reduce(op): aggregates all elements with the given function.
     * Unlike fold there is no initial value; the result type must equal the element type.
     */
    private static void api_reduce() {
        List<Integer> numbers = Arrays.asList(1, 1, 2, 3);
        JavaRDD<Integer> numberRdd = jsc.parallelize(numbers, 2);
        int result = numberRdd.reduce((x, y) -> (x + y));
        System.out.println(result);
    }

    /**
     * foreach(f): runs the given function on every element.
     */
    private static void api_foreach() {
        List<Integer> numbers = Arrays.asList(1, 1, 2, 3);
        JavaRDD<Integer> numberRdd = jsc.parallelize(numbers, 2);
        // Keep this a lambda: the method reference System.out::println would capture the
        // PrintStream instance, which is not serializable and cannot be shipped to tasks.
        numberRdd.foreach(x -> System.out.println(x));
    }

    /**
     * foreachPartition(f): runs the given function once per partition, handing it an
     * iterator over that partition's elements.
     * Compared with foreach, this lets expensive resources such as database or
     * network connections be set up once per partition instead of once per element.
     */
    private static void api_foreachPartition() {
        List<Integer> numbers = Arrays.asList(1, 1, 2, 3);
        JavaRDD<Integer> numberRdd = jsc.parallelize(numbers, 2);
        numberRdd.foreachPartition(it -> it.forEachRemaining(y -> System.out.println(y)));
    }

    /** Actions that count elements or pick an extreme value. */
    private static void action_type1() {
        api_count();
        api_countByValue();
        api_countByKey();
        api_max();
        api_min();
    }

    /** Actions that bring elements (all, some, or by key) back to the driver. */
    private static void action_type2() {
        api_first();
        api_collect();
        api_collectPartitions();
        api_lookup();
        api_take();
        api_takeOrdered();
        api_top();
        api_takeSample();
        api_collectAsMap();
    }

    /** Actions that aggregate all elements into a single value. */
    private static void action_type3() {
        api_aggregate();
        api_fold();
        api_reduce();
    }

    /** Actions that run a side-effecting function over the elements. */
    private static void action_type4() {
        api_foreach();
        api_foreachPartition();
    }

    /**
     * Runs every example group and then stops the shared Spark context.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            action_type1();
            action_type2();
            action_type3();
            action_type4();
        } finally {
            // Stop the context even when one of the examples throws.
            jsc.stop();
        }
    }
}