合并两个数据集。两个数据集的类型要一致。
union 生成的 RDD 的分区数等于两个父 RDD 的分区数之和。
1. java实现
package transformations;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import java.util.Arrays;
/**
 * Demonstrates the Spark {@code union} transformation using the Java API.
 *
 * <p>Two string RDDs are created with 2 and 3 partitions respectively and
 * unioned. The partition count of each RDD is printed so the union's count can
 * be compared with its parents', then every element of the union is printed.
 *
 * @Author yqq
 * @Date 2021/12/09 17:10
 * @Version 1.0
 */
public class UnionTest {
    /**
     * Runs the union example on a local Spark master.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
                .setAppName("union")
                .setMaster("local");
        // JavaSparkContext is Closeable: try-with-resources guarantees the
        // context is stopped even when one of the jobs below throws.
        try (JavaSparkContext context = new JavaSparkContext(conf)) {
            context.setLogLevel("Error");
            JavaRDD<String> rdd = context.parallelize(Arrays.asList("a", "b", "c", "e", "f"), 2);
            JavaRDD<String> rdd1 = context.parallelize(Arrays.asList("a", "b", "f", "h", "g"), 3);
            JavaRDD<String> union = rdd.union(rdd1);
            System.out.println("rdd partition length = " + rdd.getNumPartitions());
            System.out.println("rdd1 partition length = " + rdd1.getNumPartitions());
            System.out.println("union partition length = " + union.getNumPartitions());
            // foreach is an action: it triggers the job and prints each element.
            union.foreach(e -> System.out.print(e + "\t"));
        }
    }
}
2. scala实现
package transformation
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Demonstrates the Spark `union` transformation using the Scala API.
 *
 * Two string RDDs are built from in-memory arrays, unioned, and every element
 * of the resulting RDD is printed.
 *
 * @Author yqq
 * @Date 2021/12/09 17:33
 * @Version 1.0
 */
object UnionTest {
  /**
   * Runs the union example on a local Spark master.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local")
      .setAppName("union")
    val context = new SparkContext(conf)
    try {
      context.setLogLevel("Error")
      val rdd: RDD[String] = context.makeRDD(Array[String]("a", "b", "c", "d", "e"))
      val rdd1: RDD[String] = context.parallelize(Array[String]("a", "b", "f", "g", "h"))
      val value: RDD[String] = rdd.union(rdd1)
      // foreach is an action: it triggers the job and prints each element.
      value.foreach(print)
    } finally {
      // Stop the context even when the job above fails, so it is not leaked.
      context.stop()
    }
  }
}