I. Purpose

Write MapReduce code on a remote client (the local development machine) and configure the environment so the code can be debugged against the cluster.

II. Environment

1. IDEA

2. JDK 1.8

3. CDH 5.7.0

III. Steps

1. Create a Maven project
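The project can be created through IDEA's Maven wizard; if you prefer the command line, an equivalent skeleton can be generated with Maven's quickstart archetype (the groupId/artifactId below match the pom in step 2):

mvn archetype:generate -DgroupId=BG -DartifactId=Hadoop -DarchetypeArtifactId=maven-archetype-quickstart -DinteractiveMode=false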

2. Add the dependencies matching the CDH version (pom.xml below)

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>BG</groupId>
    <artifactId>Hadoop</artifactId>
    <version>1.0-SNAPSHOT</version>


    <repositories>
        <repository>
            <id>cloudera</id>
            <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
        </repository>
    </repositories>
    <properties>
        <hadoop.version>2.6.0-cdh5.7.0</hadoop.version>
        <hbase.version>1.2.0-cdh5.7.0</hbase.version>

        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
    </properties>

    <dependencies>
        <!--            <dependency>-->
        <!--                <groupId>org.apache.hadoop</groupId>-->
        <!--                <artifactId>hadoop-common</artifactId>-->
        <!--                <version>2.5.0-cdh5.2.0</version>-->
        <!--            </dependency>-->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>${hadoop.version}</version>
        </dependency>



        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.10</version>
            <scope>test</scope>
        </dependency>


        <!-- https://mvnrepository.com/artifact/commons-logging/commons-logging -->
        <!--        <dependency>-->
        <!--            <groupId>commons-logging</groupId>-->
        <!--            <artifactId>commons-logging</artifactId>-->
        <!--            <version>1.2</version>-->
        <!--        </dependency>-->

    </dependencies>

</project>

3. Copy hdfs-site.xml, core-site.xml, and log4j.properties into the project's resources directory
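The simplest approach is to copy all three files from the cluster. If you only need console logging while debugging in IDEA, a minimal log4j.properties such as the following sketch (an assumption, not the cluster's file) is enough:

log4j.rootLogger=INFO, stdout
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{2}: %m%n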


4. Extract the hadoop-2.6.0-cdh5.7.0 tarball to the local machine and configure the Hadoop environment variables (see the sketch after step 5)


5. Place winutils.exe in the hadoop-2.6.0-cdh5.7.0\bin directory and add that directory to the system Path
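A sketch for steps 4 and 5 on Windows, assuming the tarball was extracted to C:\hadoop-2.6.0-cdh5.7.0 (adjust the path to your own location; the same values can also be set through System Properties → Environment Variables). Open a new terminal and restart IDEA afterwards so the variables take effect:

setx HADOOP_HOME "C:\hadoop-2.6.0-cdh5.7.0"
setx PATH "%PATH%;C:\hadoop-2.6.0-cdh5.7.0\bin"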


6. Copy hadoop.dll into C:\Windows\System32

7. Start the NameNode and DataNode
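These daemons run on the cluster (or single-node VM) and are typically started from the Hadoop home directory, for example:

sbin/hadoop-daemon.sh start namenode
sbin/hadoop-daemon.sh start datanode
jps   # should now list NameNode and DataNode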

8. If the same error is still reported, it may be necessary to reboot the machine

IV. Writing and Testing

1. Write the WordCount code

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class WordCount {
    // Mapper class
    // Extends Mapper<KEYIN, VALUEIN, KEYOUT, VALUEOUT>: input key, input value, output key, output value
    public static class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        // Reusable IntWritable holding the constant count 1
        IntWritable i = new IntWritable(1);
        Text keystr = new Text();
        @Override
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, IntWritable>.Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            // Print the key (byte offset) and value (line content) passed into each map() call
            System.out.println("key : " + key.get() + "--------- value : " + line);
            String[] strs = line.split(" ");
            for (String str : strs) {
                // Emit (word, 1) to the next stage for every word in the line
                keystr.set(str);
                System.out.println("map output: (" + str + ",1)");
                context.write(keystr, i);
            }

            /* Alternative tokenization using StringTokenizer:
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                keystr.set(itr.nextToken());
                context.write(keystr, i);
            }*/
        }
    }


    // 1.5 Combiner: pre-aggregates map output on the map side (same logic as the reducer)
    public static class MyCombiner extends Reducer<Text, IntWritable, Text, IntWritable> {
        IntWritable countwritable = new IntWritable();
        @Override
        // The framework groups the map output by key, so each call receives <word, (1, 1, ...)>
        protected void reduce(Text inputkey, Iterable<IntWritable> inputvalue,
                              Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {
            // Key as a plain String, used only for logging
            String key = inputkey.toString();
            // Sum every value in the Iterable
            int count = 0;
            for (IntWritable intWritable : inputvalue) {
                count = count + intWritable.get();
            }
            // Put the total into the reusable IntWritable and emit it
            countwritable.set(count);
            System.out.println("combiner output: key : " + key + " , " + count);
            context.write(inputkey, countwritable);
        }
    }

    // Reducer class
    // The reducer's input is exactly the mapper's (or combiner's) output
    // input key, input value, output key, output value
    public static class MyReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        IntWritable countwritable = new IntWritable();
        @Override
        // After the shuffle, each call receives <word, (count, count, ...)>
        protected void reduce(Text inputkey, Iterable<IntWritable> inputvalue,
                              Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {
            // Key as a plain String, used only for logging
            String key = inputkey.toString();
            // Sum every value in the Iterable
            int count = 0;
            for (IntWritable intWritable : inputvalue) {
                count = count + intWritable.get();
            }
            // Put the total into the reusable IntWritable and emit it
            countwritable.set(count);
            System.out.println("reduce output: key : " + key + " , " + count);
            context.write(inputkey, countwritable);
        }
    }
	
    // Driver: run() builds and submits the job; main() calls it when testing
    /**
     *
     * @param args arguments forwarded from main(); args[0] = input path, args[1] = output path
     * @return 0 if the job succeeded, 1 otherwise
     * @throws Exception
     */
    public int run(String[] args) throws Exception {
        // Hadoop configuration context (picks up core-site.xml / hdfs-site.xml from the classpath)
        Configuration configuration = new Configuration();
        // Build a Job instance from the configuration and name it after this class
        Job job = Job.getInstance(configuration, this.getClass().getSimpleName());

        // Required: without it the job runs fine locally but fails on the cluster
        job.setJarByClass(WordCount.class);

        // Where the job reads its input: args[0]
        Path inputpath = new Path(args[0]);
        FileInputFormat.addInputPath(job, inputpath);
        // Where the job writes its results: args[1] (the directory must not already exist)
        Path outputpath = new Path(args[1]);
        FileOutputFormat.setOutputPath(job, outputpath);

        // Mapper settings
        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setNumReduceTasks(2);

        // Reducer settings
        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        job.setCombinerClass(MyCombiner.class);
        // Submit the job (to YARN or the local runner) and wait for completion
        boolean isSuccess = job.waitForCompletion(true);
        return isSuccess ? 0 : 1;
    }


    public static void main(String[] args) {
        // Hard-coded input/output paths for local debugging; comment these out before packaging for the cluster
        args = new String[]{
                "hdfs://hadoop01:8020/core-site.xml",
                "hdfs://hadoop01:8020/output3"
        };

        WordCount mr = new WordCount();
        try {
            int success = -1;
            success = mr.run(args);
            System.out.println("success:" + success);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

2. Error: org.apache.hadoop.io.nativeio.NativeIO$Windows.access0(Ljava/lang/String;I)Z

19/04/13 08:59:32 INFO mapreduce.JobSubmitter: Cleaning up the staging area file:/opt/modules/hadoop-2.6.0-cdh5.7.0/data/tmp/mapred/staging/Lenovo1474735874/.staging/job_local1474735874_0001
Exception in thread "main" java.lang.UnsatisfiedLinkError: org.apache.hadoop.io.nativeio.NativeIO$Windows.access0(Ljava/lang/String;I)Z
	at org.apache.hadoop.io.nativeio.NativeIO$Windows.access0(Native Method)
	at org.apache.hadoop.io.nativeio.NativeIO$Windows.access(NativeIO.java:557)
	at org.apache.hadoop.fs.FileUtil.canRead(FileUtil.java:980)

Causes: (1) hadoop.dll is missing from C:\Windows\System32.

(2) The NativeIO class misbehaves when run on Windows. The fix is to locate the class's source code, copy it in full, and paste it directly into the project's java source directory; the class is generated automatically in its original package (you may need to add the corresponding imports/dependencies by hand).


Then modify the access method so that it simply returns true.
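For reference, a minimal sketch of that change inside the copied class (the signature below is taken from the Hadoop 2.6.0 NativeIO.Windows source; only the method body changes, and the change should stay on the local development machine):

// In the copied org.apache.hadoop.io.nativeio.NativeIO.Windows class:
public static boolean access(String path, AccessRight desiredAccess)
        throws IOException {
    // Original body: return access0(path, desiredAccess.accessRight());
    // Returning true bypasses the native access0 check, so the local Windows
    // run no longer throws UnsatisfiedLinkError. Do not ship this to the cluster.
    return true;
}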


3. Run the job again and it should now succeed.

V. Packaging and Deploying to the Server

1. Comment out the hard-coded input and output paths in the code

//        args= new String[]{
//                "hdfs://hadoop01:8020/core-site.xml",
//                "hdfs://hadoop01:8020/output3"
//        };

2. Rename the class to WordCountOnline; the main method then looks like this

public static void main(String[] args) {
//        Commented out when packaging for the cluster:
//        args= new String[]{
//                "hdfs://hadoop01:8020/core-site.xml",
//                "hdfs://hadoop01:8020/output3"
//        };

        WordCountOnline mr = new WordCountOnline();
        try {
            int success = -1;
            success = mr.run(args);
            System.out.println("success:"+success);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

3. Build the jar with IDEA

Reference: the blog post《Idea打包:scala打成jar包》(packaging a build into a jar from IDEA).

4. Upload the jar to the Hadoop home directory on the server

/opt/modules/hadoop-2.6.0-cdh5.7.0/Hadoop.jar
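For example with scp (the user name below is a placeholder; hadoop01 is the host name used elsewhere in this article):

scp Hadoop.jar user@hadoop01:/opt/modules/hadoop-2.6.0-cdh5.7.0/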

5. Start the HDFS and YARN services (example commands follow this list)

(1) NameNode and DataNode

(2) ResourceManager and NodeManager
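A minimal sketch, assuming the daemons are started one by one from the Hadoop home directory on the server (the bundled start-dfs.sh / start-yarn.sh scripts achieve the same thing):

sbin/hadoop-daemon.sh start namenode
sbin/hadoop-daemon.sh start datanode
sbin/yarn-daemon.sh start resourcemanager
sbin/yarn-daemon.sh start nodemanager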

6. Run the command

bin/hadoop jar Hadoop.jar Hadoop.WordCountOnline /core-site.xml /output5

Result:

19/04/14 17:17:26 INFO mapreduce.Job: Running job: job_1555233349434_0001
19/04/14 17:17:36 INFO mapreduce.Job: Job job_1555233349434_0001 running in uber mode : false
19/04/14 17:17:36 INFO mapreduce.Job:  map 0% reduce 0%
19/04/14 17:17:42 INFO mapreduce.Job:  map 100% reduce 0%
19/04/14 17:17:49 INFO mapreduce.Job:  map 100% reduce 50%
19/04/14 17:17:50 INFO mapreduce.Job:  map 100% reduce 100%
19/04/14 17:17:50 INFO mapreduce.Job: Job job_1555233349434_0001 completed successfully
19/04/14 17:17:51 INFO mapreduce.Job: Counters: 49
        File System Counters
                FILE: Number of bytes read=1357
                FILE: Number of bytes written=337515
                FILE: Number of read operations=0
                FILE: Number of large read operations=0
                FILE: Number of write operations=0
                HDFS: Number of bytes read=1080
                HDFS: Number of bytes written=998
                HDFS: Number of read operations=9
                HDFS: Number of large read operations=0
                HDFS: Number of write operations=4
        Job Counters 
                Launched map tasks=1
                Launched reduce tasks=2
                Data-local map tasks=1
                Total time spent by all maps in occupied slots (ms)=4229
                Total time spent by all reduces in occupied slots (ms)=8113
                Total time spent by all map tasks (ms)=4229
                Total time spent by all reduce tasks (ms)=8113
                Total vcore-seconds taken by all map tasks=4229
                Total vcore-seconds taken by all reduce tasks=8113
                Total megabyte-seconds taken by all map tasks=4330496
                Total megabyte-seconds taken by all reduce tasks=8307712
        Map-Reduce Framework
                Map input records=28
                Map output records=133
                Map output bytes=1513
                Map output materialized bytes=1357
                Input split bytes=99
                Combine input records=133
                Combine output records=87
                Reduce input groups=87
                Reduce shuffle bytes=1357
                Reduce input records=87
                Reduce output records=87
                Spilled Records=174
                Shuffled Maps =2
                Failed Shuffles=0
                Merged Map outputs=2
                GC time elapsed (ms)=356
                CPU time spent (ms)=2960
                Physical memory (bytes) snapshot=649351168
                Virtual memory (bytes) snapshot=8271003648
                Total committed heap usage (bytes)=649592832
        Shuffle Errors
                BAD_ID=0
                CONNECTION=0
                IO_ERROR=0
                WRONG_LENGTH=0
                WRONG_MAP=0
                WRONG_REDUCE=0
        File Input Format Counters 
                Bytes Read=981
        File Output Format Counters 
                Bytes Written=998
success:0
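With success:0 reported, the word counts can be inspected directly from HDFS. The job sets two reduce tasks, so there are two output part files (the part-r-NNNNN names below follow MapReduce's default naming convention):

bin/hdfs dfs -ls /output5
bin/hdfs dfs -cat /output5/part-r-00000 /output5/part-r-00001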

VI. Utility Class

6.1 Recursive file-listing utility

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, LocatedFileStatus, Path, RemoteIterator}

import scala.collection.mutable.ArrayBuffer

object HdfsUtil {

  val fs: FileSystem = FileSystem.get(new Configuration)

  def hdfs(filePath: String): FileSystem = {
    if (filePath.startsWith("oss://"))
      FileSystem.get(new Path(filePath).toUri, new Configuration)
    else
      FileSystem.get(new Configuration)
  }

  def delete(pathStr: String): Unit = {
    hdfs(pathStr).delete(new Path(pathStr), true)
  }

  def mkdir(dir: String): Unit = {
    hdfs(dir).mkdirs(new Path(dir))
  }

  def exists(pathStr: String): Boolean = {
    hdfs(pathStr).exists(new Path(pathStr))
  }

  def rename(srcPath: String, destPath: String): Boolean = {
    hdfs(srcPath).rename(new Path(srcPath), new Path(destPath))
  }

  def copyToLocalFile(srcPath: String, destPath: String): Unit = {
    hdfs(srcPath).copyToLocalFile(new Path(srcPath), new Path(destPath))
  }

  def copyFromLocalFile(srcPath: String, destPath: String): Unit = {
    hdfs(destPath).copyFromLocalFile(new Path(srcPath), new Path(destPath))
  }

  def localPath(hdfsPath: String): String = {
    hdfsPath.substring(hdfsPath.indexOf("/data/production"))
  }

  /**
   * Close the given FileSystem
   * @param system the FileSystem to close
   */
  def closeFS(system: FileSystem): Unit = {
    system.close()
  }

  /**
   * Recursively list the full paths of all files under a directory
   * @param srcPath the directory to scan
   * @return a List of fully qualified file paths
   */
  def listFullFilePathNames(srcPath: String): List[String] = {
    val buffer = new ArrayBuffer[String]
    try {
      val iterator: RemoteIterator[LocatedFileStatus] = fs.listFiles(new Path(srcPath), true)
      while (iterator.hasNext) {
        val fileStatus = iterator.next()
        buffer.append(fileStatus.getPath.toString) // collect every file path
      }
    } finally {
      this.closeFS(fs)
    }
    buffer.toList
  }

  def main(args: Array[String]): Unit = {
    val allfiles=listFullFilePathNames("/data/test/metro/campaign/20200911")
    println(allfiles)

  }
}

VII. Recommended Articles

Two excellent articles are recommended here; they were a great help in getting everything in this post to run successfully.

1. zhengcongyi's《解决Exception: org.apache.hadoop.io.nativeio.NativeIO$Windows.access0(Ljava/lang/String;I)Z 等一系列问题》, which covers both the fix and the required resource downloads.