1. Scala version
package demo

import org.apache.spark.{SparkConf, SparkContext}

object SparkDemo {
  def main(args: Array[String]): Unit = {
    // Configuration: application name and master
    // Note: setMaster("local") hard-coded here takes precedence over the
    // --master flag passed to spark-submit; remove it for cluster runs
    val sparkConf = new SparkConf()
    sparkConf.setAppName("sparkdemo").setMaster("local")
    // Create the SparkContext
    val sc = new SparkContext(sparkConf)
    /* Step-by-step variant with hard-coded HDFS paths:
    val rdd1 = sc.textFile("hdfs://192.168.163.11:9000/data/data.txt")
    val rdd2 = rdd1.flatMap(_.split(" "))
    val rdd3 = rdd2.map((_, 1))
    println(rdd3.collect().mkString(", "))
    val rdd4 = rdd3.reduceByKey(_ + _)
    rdd4.saveAsTextFile("hdfs://192.168.163.11:9000/spark/data/1012/1")
    */
    // Run the job: read the input, split it into words, count each word,
    // and write the result back to HDFS
    sc.textFile(args(0))
      .flatMap(_.split(" "))
      .map((_, 1))
      .reduceByKey(_ + _)
      .saveAsTextFile(args(1))
    // Stop the context
    sc.stop()
  }
}
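For quick local checks it can be handier to sort the counts and print them to the console instead of writing back to HDFS. A minimal sketch, assuming the same sc and args as above (the sortBy step and the console printing are additions, not part of the original job):

// Hypothetical debugging variant: sort by count, descending, then print
val sorted = sc.textFile(args(0))
  .flatMap(_.split(" "))
  .map((_, 1))
  .reduceByKey(_ + _)
  .sortBy(_._2, ascending = false)
sorted.collect().foreach { case (word, n) => println(s"$word\t$n") }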
Then package the program into a jar with IDEA (if you run into problems, see the separate note on exporting a jar from IDEA).
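Alternatively, if you build with sbt rather than IDEA's artifact tool, a minimal build.sbt might look like the sketch below. The project name and version numbers are assumptions, chosen to match the spark-2.1.0-bin-hadoop2.7 distribution used in the run command:

// Hypothetical build.sbt; Spark 2.1.0 is built against Scala 2.11
name := "mysparkdemo"
version := "1.0"
scalaVersion := "2.11.8"
// "provided" keeps spark-core out of the jar; the cluster supplies it at runtime
libraryDependencies += "org.apache.spark" %% "spark-core" % "2.1.0" % "provided"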
Run the program:
[root@BigData11 spark-2.1.0-bin-hadoop2.7]# bin/spark-submit --master spark://BigData11:7077 --class demo.SparkDemo /root/tools/mysparkdemo.jar hdfs://192.168.163.11:9000/data/data.txt hdfs://192.168.163.11:9000/spark/1013
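To spot-check the result from a spark-shell afterwards, you could read the output directory back. A sketch, assuming the output path passed as the second argument above:

// Hypothetical check from spark-shell: print the saved counts
sc.textFile("hdfs://192.168.163.11:9000/spark/1013").collect().foreach(println)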
2. Java version
(Updated 2018-10-13 23:27:22)
package com.demo.spark;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;
/**
 * Cluster-mode command:
 * bin/spark-submit --master spark://BigData11:7077 --class com.demo.spark.JavaWordCount /root/tmp/sparkwc.jar hdfs://192.168.163.11:9000/data/data.txt
 * @author Dream
 */
public class JavaWordCount {
    public static void main(String[] args) {
        // Configuration: set the job name and master
        // Note: setMaster("local") hard-coded here takes precedence over the
        // --master flag passed to spark-submit; remove it for cluster runs
        SparkConf conf = new SparkConf();
        conf.setMaster("local");
        conf.setAppName("JavaWordCount");
        // Create the Java flavor of the SparkContext
        JavaSparkContext cont = new JavaSparkContext(conf);
        // Read the input data
        JavaRDD<String> textFile = cont.textFile(args[0]);
        // JavaRDD<String> textFile = cont.textFile("hdfs://192.168.163.11:9000/data/data.txt");
        /*
         * Split each line into words.
         * In FlatMapFunction<T, U>, T is the input element type and U is the
         * type of the elements in the returned iterator.
         */
        JavaRDD<String> flatMap = textFile.flatMap(new FlatMapFunction<String, String>() {
            @Override
            public Iterator<String> call(String data) throws Exception {
                return Arrays.asList(data.split(" ")).iterator();
            }
        });
        /*
         * Emit a count of 1 for each word, e.g. (Beijing, 1).
         * mapToPair plays the role of the Map output in MapReduce:
         * in PairFunction<String, K2, V2>, String is each word and
         * K2/V2 are the output key and value types.
         */
        JavaPairRDD<String, Integer> mapToPair = flatMap.mapToPair(new PairFunction<String, String, Integer>() {
            @Override
            public Tuple2<String, Integer> call(String word) throws Exception {
                return new Tuple2<String, Integer>(word, 1);
            }
        });
        /*
         * Reduce by key: sum the counts for each word, e.g. (Beijing, 3).
         */
        JavaPairRDD<String, Integer> reduceByKey = mapToPair.reduceByKey(new Function2<Integer, Integer, Integer>() {
            @Override
            public Integer call(Integer a, Integer b) throws Exception {
                return a + b;
            }
        });
        // collect() is an action: it triggers the computation and brings the
        // results back to the driver, where they are printed
        List<Tuple2<String, Integer>> collect = reduceByKey.collect();
        for (Tuple2<String, Integer> t2 : collect) {
            System.out.println(t2._1 + "\t" + t2._2);
        }
        // Stop the context
        cont.stop();
    }
}