一、Purpose
Write MapReduce code on a remote client and configure the environment so the job can be debugged from that client.
二、Environment
1. IDEA
2. JDK 1.8
3. CDH 5.7.0
三、Steps
1. Create a Maven project
2. Add the dependencies required by the CDH version (pom.xml below)
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>BG</groupId>
<artifactId>Hadoop</artifactId>
<version>1.0-SNAPSHOT</version>
<repositories>
<repository>
<id>cloudera</id>
<url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
</repository>
</repositories>
<properties>
<hadoop.version>2.6.0-cdh5.7.0</hadoop.version>
<hbase.version>1.2.0-cdh5.7.0</hbase.version>
<maven.compiler.source>1.8</maven.compiler.source>
<maven.compiler.target>1.8</maven.compiler.target>
</properties>
<dependencies>
<!-- <dependency>-->
<!-- <groupId>org.apache.hadoop</groupId>-->
<!-- <artifactId>hadoop-common</artifactId>-->
<!-- <version>2.5.0-cdh5.2.0</version>-->
<!-- </dependency>-->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.10</version>
<scope>test</scope>
</dependency>
<!-- https://mvnrepository.com/artifact/commons-logging/commons-logging -->
<!-- <dependency>-->
<!-- <groupId>commons-logging</groupId>-->
<!-- <artifactId>commons-logging</artifactId>-->
<!-- <version>1.2</version>-->
<!-- </dependency>-->
</dependencies>
</project>
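With the POM in place, a quick compile from the project root verifies that the CDH artifacts actually resolve from the Cloudera repository before any code is written (standard Maven, nothing project-specific assumed):
mvn clean compile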
3. Copy hdfs-site.xml, core-site.xml, and log4j.properties from the cluster into the project's resources directory
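These configuration files are normally copied straight from the cluster (for example from $HADOOP_HOME/etc/hadoop). As a point of reference only, a minimal core-site.xml matching the NameNode address used later in this article could look like the sketch below; hdfs://hadoop01:8020 is taken from the WordCount code and must be adjusted to your own cluster:
<?xml version="1.0" encoding="UTF-8"?>
<configuration>
    <!-- Address of the NameNode the client connects to (adjust to your cluster) -->
    <property>
        <name>fs.defaultFS</name>
        <value>hdfs://hadoop01:8020</value>
    </property>
</configuration>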
4. Extract the hadoop-2.6.0-cdh5.7.0 tarball locally and configure the Hadoop environment variables
5. Put winutils.exe into the hadoop-2.6.0-cdh5.7.0\bin directory and add it to the system Path
6. Copy hadoop.dll into C:\Windows\System32
7. Start the NameNode and DataNode (a quick connectivity check is sketched after this list)
8. If the same error is still reported, a reboot of the machine may be necessary
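Before writing any MapReduce code it can be worth confirming that the client side is wired up correctly. The following is a minimal sketch (not part of the original walkthrough) that picks up core-site.xml/hdfs-site.xml from the resources directory and lists the HDFS root; if it prints paths, the local environment and the connection to the NameNode are working:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class HdfsSmokeTest {
    public static void main(String[] args) throws Exception {
        // Reads core-site.xml / hdfs-site.xml from the classpath (src/main/resources)
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // List the HDFS root; success means the client can reach the NameNode
        for (FileStatus status : fs.listStatus(new Path("/"))) {
            System.out.println(status.getPath());
        }
        fs.close();
    }
}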
四、Writing the Test Code
1. Write the WordCount code
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class WordCount {
    // Mapper class
    // Extends Mapper<KEYIN, VALUEIN, KEYOUT, VALUEOUT>: input key, input value, output key, output value
    public static class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        // Reusable IntWritable holding the constant count of 1
        IntWritable i = new IntWritable(1);
        Text keystr = new Text();
        @Override
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, IntWritable>.Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            // Print the key and value passed into each map() call
            System.out.println("key : " + key.get() + "--------- value : " + line);
            String[] strs = line.split(" ");
            for (String str : strs) {
                // Emit each word to the next stage as soon as it is seen
                keystr.set(str);
                System.out.println("map output: key : (" + str + ",1)");
                context.write(keystr, i);
            }
            /* StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                keystr.set(itr.nextToken());
                context.write(keystr, i);
            }*/
        }
    }
    // 1.5 Combiner implementation
    public static class MyCombiner extends Reducer<Text, IntWritable, Text, IntWritable> {
        IntWritable countwritable = new IntWritable();
        @Override
        // Receives the map output grouped by key, e.g. <word, (1,1,...)>, and pre-aggregates it before the shuffle
        protected void reduce(Text inputkey, Iterable<IntWritable> inputvalue,
                Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {
            // The key for this group
            String key = inputkey.toString();
            // Sum every value in the Iterable
            int count = 0;
            for (IntWritable intWritable : inputvalue) {
                count = count + intWritable.get();
            }
            // Store the sum in the IntWritable to be written out
            countwritable.set(count);
            System.out.println("combiner output: key : " + key + " , " + count);
            context.write(inputkey, countwritable);
        }
    }
    // Reducer class
    // The reducer's input is the output of the mapper (and combiner)
    // Input key, input value, output key, output value
    public static class MyReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        IntWritable countwritable = new IntWritable();
        @Override
        // The map output is grouped by key into <word, (1,1,...)> and each group is passed to this reduce method
        protected void reduce(Text inputkey, Iterable<IntWritable> inputvalue,
                Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {
            // The key for this group
            String key = inputkey.toString();
            // Sum every value in the Iterable
            int count = 0;
            for (IntWritable intWritable : inputvalue) {
                count = count + intWritable.get();
            }
            // Store the sum in the IntWritable to be written out
            countwritable.set(count);
            System.out.println("reduce output: key : " + key + " , " + count);
            context.write(inputkey, countwritable);
        }
    }
    // Driver: run() builds and submits the job; during testing it is invoked from main
    /**
     * @param args the arguments received by main, used inside run
     * @return 0 on success, 1 on failure
     * @throws Exception
     */
    public int run(String[] args) throws Exception {
        // Hadoop configuration context
        Configuration configuration = new Configuration();
        // Build a Job instance from the configuration and give it a name
        Job job = Job.getInstance(configuration, this.getClass().getSimpleName());
        // Required: without this the job runs locally but fails on the cluster
        job.setJarByClass(WordCount.class);
        // Where the job reads its input from: args[0] must hold the input path
        Path inputpath = new Path(args[0]);
        FileInputFormat.addInputPath(job, inputpath);
        // Where the job writes its results: args[1] must hold the output path
        Path outputpath = new Path(args[1]);
        FileOutputFormat.setOutputPath(job, outputpath);
        // Mapper settings
        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setNumReduceTasks(2);
        // Reducer settings
        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        job.setCombinerClass(MyCombiner.class);
        // submit job -> YARN
        boolean isSuccess = job.waitForCompletion(true);
        return isSuccess ? 0 : 1;
    }
    public static void main(String[] args) {
        // Hard-coded paths for local testing; comment out before packaging for the cluster
        args = new String[]{
                "hdfs://hadoop01:8020/core-site.xml",
                "hdfs://hadoop01:8020/output3"
        };
        WordCount mr = new WordCount();
        try {
            int success = -1;
            success = mr.run(args);
            System.out.println("success:" + success);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
2. Error: org.apache.hadoop.io.nativeio.NativeIO$Windows.access0(Ljava/lang/String;I)Z
19/04/13 08:59:32 INFO mapreduce.JobSubmitter: Cleaning up the staging area file:/opt/modules/hadoop-2.6.0-cdh5.7.0/data/tmp/mapred/staging/Lenovo1474735874/.staging/job_local1474735874_0001
Exception in thread "main" java.lang.UnsatisfiedLinkError: org.apache.hadoop.io.nativeio.NativeIO$Windows.access0(Ljava/lang/String;I)Z
at org.apache.hadoop.io.nativeio.NativeIO$Windows.access0(Native Method)
at org.apache.hadoop.io.nativeio.NativeIO$Windows.access(NativeIO.java:557)
at org.apache.hadoop.fs.FileUtil.canRead(FileUtil.java:980)
Causes: (1) hadoop.dll is missing from C:\Windows\System32.
(2) The NativeIO class misbehaves when run on Windows. The workaround is to locate its source (easy to find), copy the whole file, and paste it into the project's java/Hadoop directory so the class is regenerated there (import the required dependencies manually).
Then change the access method so that it simply returns true; a sketch of the change follows.
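For reference, the edit to the copied NativeIO.java looks roughly like the following. The method lives in the Windows inner class; the signature shown here is recalled from the 2.6.0 source, so keep whatever signature your copied file actually declares and only change the body:
// Inside the copied org.apache.hadoop.io.nativeio.NativeIO, Windows inner class.
// The original body delegated to the native access0 call that fails on Windows.
public static boolean access(String path, AccessRight desiredAccess)
        throws IOException {
    // Bypass the native check and report the path as always accessible
    // (acceptable for local debugging only).
    return true;
}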
3. Run the job again and it should now succeed.
五、Packaging and Deploying to the Server
1. Comment out the hard-coded input and output paths in the code
// args= new String[]{
// "hdfs://hadoop01:8020/core-site.xml",
// "hdfs://hadoop01:8020/output3"
// };
2. Rename the class to WordCountOnline as shown below (remember to update references such as job.setJarByClass to the new class name so the code still compiles)
    public static void main(String[] args) {
        // Hard-coded paths commented out for the cluster build; paths now come from the command line
        // args = new String[]{
        //         "hdfs://hadoop01:8020/core-site.xml",
        //         "hdfs://hadoop01:8020/output3"
        // };
        WordCountOnline mr = new WordCountOnline();
        try {
            int success = -1;
            success = mr.run(args);
            System.out.println("success:" + success);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
3. Build the jar with IDEA
Reference blog post: 《Idea打包:scala打成jar包》
4. Upload the jar to the Hadoop root directory on the server
/opt/modules/hadoop-2.6.0-cdh5.7.0/Hadoop.jar
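One way to copy it there from the development machine is scp; the user name below is an assumption, and hadoop01 is the host used throughout this article:
scp Hadoop.jar root@hadoop01:/opt/modules/hadoop-2.6.0-cdh5.7.0/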
5. Start the HDFS and YARN services (start commands are sketched after this list)
(1) NameNode and DataNode
(2) ResourceManager and NodeManager
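If the daemons are managed with the stock Hadoop scripts rather than Cloudera Manager, they can be started from the Hadoop root directory roughly as follows:
sbin/hadoop-daemon.sh start namenode
sbin/hadoop-daemon.sh start datanode
sbin/yarn-daemon.sh start resourcemanager
sbin/yarn-daemon.sh start nodemanager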
6. Run the command
bin/hadoop jar Hadoop.jar Hadoop.WordCountOnline /core-site.xml /output5
Result:
19/04/14 17:17:26 INFO mapreduce.Job: Running job: job_1555233349434_0001
19/04/14 17:17:36 INFO mapreduce.Job: Job job_1555233349434_0001 running in uber mode : false
19/04/14 17:17:36 INFO mapreduce.Job: map 0% reduce 0%
19/04/14 17:17:42 INFO mapreduce.Job: map 100% reduce 0%
19/04/14 17:17:49 INFO mapreduce.Job: map 100% reduce 50%
19/04/14 17:17:50 INFO mapreduce.Job: map 100% reduce 100%
19/04/14 17:17:50 INFO mapreduce.Job: Job job_1555233349434_0001 completed successfully
19/04/14 17:17:51 INFO mapreduce.Job: Counters: 49
File System Counters
FILE: Number of bytes read=1357
FILE: Number of bytes written=337515
FILE: Number of read operations=0
FILE: Number of large read operations=0
FILE: Number of write operations=0
HDFS: Number of bytes read=1080
HDFS: Number of bytes written=998
HDFS: Number of read operations=9
HDFS: Number of large read operations=0
HDFS: Number of write operations=4
Job Counters
Launched map tasks=1
Launched reduce tasks=2
Data-local map tasks=1
Total time spent by all maps in occupied slots (ms)=4229
Total time spent by all reduces in occupied slots (ms)=8113
Total time spent by all map tasks (ms)=4229
Total time spent by all reduce tasks (ms)=8113
Total vcore-seconds taken by all map tasks=4229
Total vcore-seconds taken by all reduce tasks=8113
Total megabyte-seconds taken by all map tasks=4330496
Total megabyte-seconds taken by all reduce tasks=8307712
Map-Reduce Framework
Map input records=28
Map output records=133
Map output bytes=1513
Map output materialized bytes=1357
Input split bytes=99
Combine input records=133
Combine output records=87
Reduce input groups=87
Reduce shuffle bytes=1357
Reduce input records=87
Reduce output records=87
Spilled Records=174
Shuffled Maps =2
Failed Shuffles=0
Merged Map outputs=2
GC time elapsed (ms)=356
CPU time spent (ms)=2960
Physical memory (bytes) snapshot=649351168
Virtual memory (bytes) snapshot=8271003648
Total committed heap usage (bytes)=649592832
Shuffle Errors
BAD_ID=0
CONNECTION=0
IO_ERROR=0
WRONG_LENGTH=0
WRONG_MAP=0
WRONG_REDUCE=0
File Input Format Counters
Bytes Read=981
File Output Format Counters
Bytes Written=998
success:0
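Since the job was configured with two reduce tasks, the default output naming produces two part files under /output5; the counts can be read back with the HDFS CLI:
bin/hdfs dfs -ls /output5
bin/hdfs dfs -cat /output5/part-r-00000 /output5/part-r-00001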
六、Utility Class
6.1 Recursive listing class
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, LocatedFileStatus, Path, RemoteIterator}
import scala.collection.mutable.ArrayBuffer

object HdfsUtil {
  val fs: FileSystem = FileSystem.get(new Configuration)

  def hdfs(filePath: String): FileSystem = {
    if (filePath.startsWith("oss://"))
      FileSystem.get(new Path(filePath).toUri, new Configuration)
    else
      FileSystem.get(new Configuration)
  }

  def delete(pathStr: String): Unit = {
    hdfs(pathStr).delete(new Path(pathStr), true)
  }

  def mkdir(dir: String): Unit = {
    hdfs(dir).mkdirs(new Path(dir))
  }

  def exists(pathStr: String): Boolean = {
    hdfs(pathStr).exists(new Path(pathStr))
  }

  def rename(srcPath: String, destPath: String): Boolean = {
    hdfs(srcPath).rename(new Path(srcPath), new Path(destPath))
  }

  def copyToLocalFile(srcPath: String, destPath: String): Unit = {
    hdfs(srcPath).copyToLocalFile(new Path(srcPath), new Path(destPath))
  }

  def copyFromLocalFile(srcPath: String, destPath: String): Unit = {
    hdfs(destPath).copyFromLocalFile(new Path(srcPath), new Path(destPath))
  }

  def localPath(hdfsPath: String): String = {
    hdfsPath.substring(hdfsPath.indexOf("/data/production"))
  }

  /**
   * Close the given FileSystem.
   * @param system the FileSystem to close
   */
  def closeFS(system: FileSystem): Unit = {
    system.close()
  }

  /**
   * Recursively collect the full paths of all files under a directory.
   * @param srcPath the directory to scan
   * @return the list of file paths found
   */
  def listFullFilePathNames(srcPath: String): List[String] = {
    val buffer = new ArrayBuffer[String]
    try {
      val iterator: RemoteIterator[LocatedFileStatus] = fs.listFiles(new Path(srcPath), true)
      while (iterator.hasNext) {
        val fileStatus = iterator.next()
        buffer.append(fileStatus.getPath.toString) // collect every file path
      }
    } finally {
      this.closeFS(fs)
    }
    buffer.toList
  }

  def main(args: Array[String]): Unit = {
    val allfiles = listFullFilePathNames("/data/test/metro/campaign/20200911")
    println(allfiles)
  }
}
七、Recommended Articles
Finally, two articles by other authors were a great help in getting everything in this post to run successfully:
1. zhengcongyi's 《解决Exception: org.apache.hadoop.io.nativeio.NativeIO$Windows.access0(Ljava/lang/String;I)Z 等一系列问题》, which covers both the fix for the problem above and the download of the required resources.