Input file
1.txt
hello word hello ww lily hadoop hadoop spark hive spark hive hadoop hello word lily hadoop hadoop spark hive spark hive hadoop hello word lily hadoop hadoop spark hive spark hive hadoop hello word hello ww lily hadoop hadoop spark hive spark hive hadoop hello word hello ww lily hadoop hadoop spark hive spark hive hadoop lily hadoop hadoop spark hive spark hello word hello ww lily hadoop hadoop spark hive spark hive hadoop hadoop spark hive spark hive hadoop hadoop spark hive spark hive hadoop
Java implementation
Maven dependencies
<dependencies>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-common</artifactId>
        <version>2.7.6</version>
    </dependency>
    <!--
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-mapreduce-client-core</artifactId>
        <version>2.7.6</version>
    </dependency>
    -->
    <!-- Using the dependency above directly requires the HDFS configuration files;
         instead, borrow the mapreduce classes bundled inside spark-core, which needs no configuration files -->
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-core_2.11</artifactId>
        <version>2.4.0</version>
    </dependency>
</dependencies>
WordCount.java
package com.daniel.mapreduce;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * @Author Daniel
 * @Description MapReduce starter program: WordCount
 **/
public class WordCount {

    static class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        // The map output is key: word, value: occurrence count, so the types are Text and IntWritable
        Text mk = new Text();
        IntWritable mv = new IntWritable();

        // Override the map method
        @Override
        /*
         * The map method takes three parameters:
         * 1. the key (byte offset of the current line)
         * 2. the value (the line of text)
         * 3. the context, used to write out key/value pairs
         */
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, IntWritable>.Context context)
                throws IOException, InterruptedException {
            // Convert the Text value to a String
            String line = value.toString();
            // The words in the input file are tab-separated, so split on \t
            String[] words = line.split("\t");
            // Iterate over the array
            for (String w : words) {
                // Put the current word into the key
                mk.set(w);
                // Mark each occurrence with a count of 1
                mv.set(1);
                // Emit the pair
                context.write(mk, mv);
            }
        }
    }

    static class MyReduce extends Reducer<Text, IntWritable, Text, IntWritable> {
        // The reduce step counts how often each word occurs, so it needs an IntWritable for the total
        IntWritable rv = new IntWritable();

        @Override
        /*
         * The reduce method takes three parameters:
         * 1. the key emitted by the map phase
         * 2. the iterable of values grouped under that key
         * 3. the context, used to write out key/value pairs
         */
        protected void reduce(Text key, Iterable<IntWritable> values,
                              Reducer<Text, IntWritable, Text, IntWritable>.Context context)
                throws IOException, InterruptedException {
            // Accumulator for the count
            int sum = 0;
            // Add up all the 1s for this word
            for (IntWritable v : values) {
                sum += v.get();
            }
            // Set the total as the value; the key is written out unchanged
            rv.set(sum);
            // Emit the pair
            context.write(key, rv);
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // System.setProperty("hadoop.home.dir", "C:\\Development Files\\hadoop");
        // Load the configuration
        Configuration conf = new Configuration();
        // Create the job with the configuration
        Job job = Job.getInstance(conf);
        // Set the driver class
        job.setJarByClass(WordCount.class);
        // Set the mapper class
        job.setMapperClass(MyMapper.class);
        // Set the reducer class
        job.setReducerClass(MyReduce.class);
        // Set the map output key type
        job.setMapOutputKeyClass(Text.class);
        // Set the map output value type
        job.setMapOutputValueClass(IntWritable.class);
        // Set the reduce output key type
        job.setOutputKeyClass(Text.class);
        // Set the reduce output value type
        job.setOutputValueClass(IntWritable.class);
        // Read the paths from the command-line arguments, which suits running from a packaged jar
        Path inPath = new Path(args[0]);
        // Set the input path on the job
        FileInputFormat.addInputPath(job, inPath);
        Path outPath = new Path(args[1]);
        // Get the FileSystem for this configuration; the output path must not already exist when the job runs
        FileSystem fs = FileSystem.get(conf);
        // If the output path already exists, delete it recursively
        if (fs.exists(outPath)) {
            fs.delete(outPath, true);
        }
        // Set the output path on the job
        FileOutputFormat.setOutputPath(job, outPath);
        // Submit the job and print progress logs
        job.waitForCompletion(true);
    }
}
After packaging, upload the input file to the target directory and run the job with the command below.
- hadoop jar <jar file> <fully qualified class name> <input file or directory> <output directory>
hadoop jar wc.jar com.daniel.mapreduce.WordCount /words/1.txt /output
Python implementation
mapper.py
#!/usr/bin/env python
import sys

# sys.stdin carries the content of the input file
for line in sys.stdin:
    # Strip leading and trailing whitespace
    line = line.strip()
    # The words in the input file are tab-separated
    words = line.split("\t")
    # Emit each word with a count of 1, in the form: hadoop<TAB>1
    for word in words:
        print("%s\t%s" % (word, 1))
reducer.py
#!/usr/bin/env python
import sys

current_word = None
current_count = 0
word = None

# sys.stdin carries the sorted output of the mapper
for line in sys.stdin:
    # Strip leading and trailing whitespace
    line = line.strip()
    # The mapper emits tab-separated pairs, one occurrence counted as 1
    word, count = line.split('\t', 1)
    try:
        # Convert the count from string form to int
        count = int(count)
    # If count is not a number, skip the line
    except ValueError:
        continue
    # Identical words arrive together because the input is sorted, so accumulate their counts
    if current_word == word:
        current_count += count
    # A different word has arrived, so print the total for the previous one
    else:
        if current_word:
            print("%s\t%s" % (current_word, current_count))
        current_count = count
        current_word = word

# Finally, print the total for the last word
if word == current_word:
    print("%s\t%s" % (current_word, current_count))
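Before submitting the streaming job, the two scripts can be sanity-checked on the local machine by reproducing the map | sort | reduce pipeline that Hadoop Streaming runs. A minimal sketch of that check, assuming mapper.py, reducer.py, and a local copy of 1.txt sit in the current directory and python3 is on the PATH:

import subprocess

# Run the mapper over the local input file, capturing its stdout.
with open("1.txt", "rb") as infile:
    mapped = subprocess.run(["python3", "mapper.py"], stdin=infile,
                            capture_output=True, check=True).stdout

# Hadoop Streaming sorts the map output by key before it reaches the reducer;
# emulate that with a plain lexicographic sort of the emitted lines.
sorted_lines = b"\n".join(sorted(mapped.splitlines())) + b"\n"

# Feed the sorted lines into the reducer and print its output.
reduced = subprocess.run(["python3", "reducer.py"], input=sorted_lines,
                         capture_output=True, check=True).stdout
print(reduced.decode(), end="")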
Upload the two scripts to Linux and run the command below; $HADOOP_HOME is your own Hadoop installation directory. Any Python version will do; Python 3 is used here.
hadoop jar $HADOOP_HOME/share/hadoop/tools/lib/hadoop-*streaming*.jar -mapper "python3 mapper.py" -reducer "python3 reducer.py" -file mapper.py -file reducer.py -input /words/1.txt -output /output
Output
Both implementations produce the same result.
hdfs dfs -cat /output/*
hadoop	24
hello	10
hive	17
lily	7
spark	18
word	6
ww	4
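To double-check these numbers without the cluster, a quick local tally with Python's collections.Counter gives the same totals. This is only a sketch, assuming a local copy of the tab-separated 1.txt is available in the current directory:

from collections import Counter

# Tally words in a local copy of the input; the totals should match the job output above.
counts = Counter()
with open("1.txt") as f:
    for line in f:
        counts.update(line.strip().split("\t"))

for word in sorted(counts):
    print("%s\t%d" % (word, counts[word]))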