数据及需求

数据


数据一
(数据一样例略——原文此处为示例数据截图;各字段以一个或多个空格分隔)
字段解释:年,月,日,小时,温度,湿度,气压,风向,风速,天气情况,1h降雨量,6h降雨量


数据二

0,cloudless
1,cumulus
2,cumulonimbus
3,stratocumulus 
4,stratus
5,nimbostratus 
6,altostratus
7,altocumulus
8,Cirrus
9,stratocirrus
10,cirrocumulus

字段解释:id,天气状况


需求及实现


需求

  1. 将分割符由一个或多个空格转换成逗号
  2. 清除不合法数据:字段长度不足,风向不在[0,360]的,风速为负的,气压为负的,天气情况不在[0,10],湿度不在[0,100],温度不在[-40,50]的数据
  3. 将数据一与数据二的数据以天气情况进行join操作,把天气情况变为其对应的云属;
  4. 对进入同一个分区的数据排序; 排序规则: (1)同年同月同天为key; (2)按每日温度升序; (3)若温度相同则按风速升序; (4)风速相同则按压强降序

解析

  1. 需求一的解决方法:用正则表达式 \s+ 作为分隔符进行 split,再以逗号重新拼接各字段
  2. 需求二就是判断语句
  3. 需求三将数据二缓存到内存里面
  4. 需求四在自定义类的时候,定义排序规则,然后自定义分组
代码实现

自定义的类

import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/**
 * Composite MapReduce key holding one cleaned weather record.
 *
 * <p>Sort contract (required by requirement 4): records are ordered first by
 * date (year, month, day) so that the grouping comparator — which only merges
 * <em>adjacent</em> keys in the sorted stream — can group whole days; within a
 * day, by temperature ascending, then wind speed ascending, then pressure
 * descending.
 *
 * <p>All fields except pressure are kept as Strings (they are written out
 * verbatim by {@link #toString()}), but numeric fields are compared
 * numerically, not lexicographically.
 */
public class Data implements WritableComparable<Data> {
    //年
    private String year;
    //月
    private String month;
    //日
    private String day;
    //小时 (hour)
    private String hour;
    //温度 (temperature)
    private String temperature;
    //湿度 (humidity)
    private String dew;
    //气压/压强 (pressure)
    private int pressure;
    //风向 (wind direction)
    private String wind_direction;
    //风速 (wind speed)
    private String wind_speed;
    //天气情况 (sky condition; replaced by cloud-genus name after the map-side join)
    private String sky_condition;
    //1小时降雨量 (1h rainfall)
    private String rain_1h;
    //6小时降雨量 (6h rainfall)
    private String rain_6h;

    /**
     * Orders keys by date, then temperature asc, then wind speed asc, then
     * pressure desc.
     *
     * <p>Fix vs. the original: the date is compared first (otherwise records
     * from different days interleave and per-day grouping is impossible), and
     * temperature/wind speed are parsed as integers — a String compare would
     * put "9" after "10" and misorder negative temperatures.
     */
    @Override
    public int compareTo(Data o) {
        int cmp = Integer.compare(Integer.parseInt(this.year), Integer.parseInt(o.year));
        if (cmp != 0) {
            return cmp;
        }
        cmp = Integer.compare(Integer.parseInt(this.month), Integer.parseInt(o.month));
        if (cmp != 0) {
            return cmp;
        }
        cmp = Integer.compare(Integer.parseInt(this.day), Integer.parseInt(o.day));
        if (cmp != 0) {
            return cmp;
        }
        // Temperature ascending (numeric).
        cmp = Integer.compare(Integer.parseInt(this.temperature), Integer.parseInt(o.temperature));
        if (cmp != 0) {
            return cmp;
        }
        // Wind speed ascending (numeric).
        cmp = Integer.compare(Integer.parseInt(this.wind_speed), Integer.parseInt(o.wind_speed));
        if (cmp != 0) {
            return cmp;
        }
        // Pressure descending; Integer.compare avoids the overflow risk of
        // the "o.pressure - this.pressure" idiom.
        return Integer.compare(o.pressure, this.pressure);
    }

    /** Serializes all fields in declaration order; must mirror {@link #readFields}. */
    @Override
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeUTF(year);
        dataOutput.writeUTF(month);
        dataOutput.writeUTF(day);
        dataOutput.writeUTF(hour);
        dataOutput.writeUTF(temperature);
        dataOutput.writeUTF(dew);
        dataOutput.writeInt(pressure);
        dataOutput.writeUTF(wind_direction);
        dataOutput.writeUTF(wind_speed);
        dataOutput.writeUTF(sky_condition);
        dataOutput.writeUTF(rain_1h);
        dataOutput.writeUTF(rain_6h);
    }

    /** Deserializes all fields in the exact order written by {@link #write}. */
    @Override
    public void readFields(DataInput dataInput) throws IOException {
        year = dataInput.readUTF();
        month = dataInput.readUTF();
        day = dataInput.readUTF();
        hour = dataInput.readUTF();
        temperature = dataInput.readUTF();
        dew = dataInput.readUTF();
        pressure = dataInput.readInt();
        wind_direction = dataInput.readUTF();
        wind_speed = dataInput.readUTF();
        sky_condition = dataInput.readUTF();
        rain_1h = dataInput.readUTF();
        rain_6h = dataInput.readUTF();
    }

    /** Comma-separated output line (requirement 1: space-delimited → comma-delimited). */
    @Override
    public String toString() {
        return year + "," + month + "," + day + "," + hour + "," + temperature + "," + dew + "," + pressure + ","
                + wind_direction + "," + wind_speed + "," + sky_condition + "," + rain_1h + "," + rain_6h;
    }

    /** Bulk setter so the mapper can reuse one Data instance per record. */
    public void set(String year, String month, String day, String hour, String temperature, String dew, int pressure, String wind_direction, String wind_speed, String sky_condition, String rain_1h, String rain_6h) {
        this.year = year;
        this.month = month;
        this.day = day;
        this.hour = hour;
        this.temperature = temperature;
        this.dew = dew;
        this.pressure = pressure;
        this.wind_direction = wind_direction;
        this.wind_speed = wind_speed;
        this.sky_condition = sky_condition;
        this.rain_1h = rain_1h;
        this.rain_6h = rain_6h;
    }

    public String getYear() {
        return year;
    }

    public void setYear(String year) {
        this.year = year;
    }

    public String getMonth() {
        return month;
    }

    public void setMonth(String month) {
        this.month = month;
    }

    public String getDay() {
        return day;
    }

    public void setDay(String day) {
        this.day = day;
    }

    public String getHour() {
        return hour;
    }

    public void setHour(String hour) {
        this.hour = hour;
    }

    public String getTemperature() {
        return temperature;
    }

    public void setTemperature(String temperature) {
        this.temperature = temperature;
    }

    public String getDew() {
        return dew;
    }

    public void setDew(String dew) {
        this.dew = dew;
    }

    public int getPressure() {
        return pressure;
    }

    public void setPressure(int pressure) {
        this.pressure = pressure;
    }

    public String getWind_direction() {
        return wind_direction;
    }

    public void setWind_direction(String wind_direction) {
        this.wind_direction = wind_direction;
    }

    public String getWind_speed() {
        return wind_speed;
    }

    public void setWind_speed(String wind_speed) {
        this.wind_speed = wind_speed;
    }

    public String getSky_condition() {
        return sky_condition;
    }

    public void setSky_condition(String sky_condition) {
        this.sky_condition = sky_condition;
    }

    public String getRain_1h() {
        return rain_1h;
    }

    public void setRain_1h(String rain_1h) {
        this.rain_1h = rain_1h;
    }

    public String getRain_6h() {
        return rain_6h;
    }

    public void setRain_6h(String rain_6h) {
        this.rain_6h = rain_6h;
    }
}

Mapper阶段

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;

/**
 * Cleans raw weather records and performs a map-side join with the cached
 * sky-condition dictionary (id → cloud genus).
 *
 * <p>Input: one space-delimited line per record with 12 fields
 * (year, month, day, hour, temperature, humidity, pressure, wind direction,
 * wind speed, sky condition id, 1h rain, 6h rain). Invalid records
 * (requirement 2) are dropped silently.
 */
public class MapTest extends Mapper<LongWritable, Text, Data, NullWritable> {
    // Reused output key — Hadoop serializes it on each write, so reuse is safe.
    private final Data k = new Data();
    // Join table loaded from the distributed cache: sky-condition id → name.
    private final Map<String, String> sky_status = new HashMap<String, String>();

    /**
     * Loads the cached dictionary file (id,name per line) into memory.
     * Uses try-with-resources so the reader is always closed (the original
     * leaked it), and splits each line only once.
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        URI[] uris = context.getCacheFiles();
        try (BufferedReader br = new BufferedReader(new FileReader(new File(uris[0])))) {
            String line;
            while ((line = br.readLine()) != null) {
                String[] parts = line.split(",");
                sky_status.put(parts[0], parts[1]);
            }
        }
    }

    /**
     * Validates one record and emits it with the sky-condition id replaced by
     * its cloud-genus name. Records failing any rule are skipped:
     * field count != 12, wind direction outside [0,360] (the original forgot
     * the upper bound), negative pressure or wind speed, sky condition outside
     * [0,10], humidity outside [0,100], temperature outside [-40,50], or any
     * non-numeric field (the original let NumberFormatException kill the task).
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String datas[] = value.toString().split("\\s+");
        if (datas.length != 12) {
            return;
        }
        int temperature;
        int humidity;
        int pressure;
        int windDirection;
        int windSpeed;
        int skyCondition;
        try {
            temperature = Integer.parseInt(datas[4]);
            humidity = Integer.parseInt(datas[5]);
            pressure = Integer.parseInt(datas[6]);
            windDirection = Integer.parseInt(datas[7]);
            windSpeed = Integer.parseInt(datas[8]);
            skyCondition = Integer.parseInt(datas[9]);
        } catch (NumberFormatException e) {
            // Malformed numeric field: treat as invalid data, not a task failure.
            return;
        }
        if (windDirection < 0 || windDirection > 360
                || windSpeed < 0
                || pressure < 0
                || skyCondition < 0 || skyCondition > 10
                || humidity < 0 || humidity > 100
                || temperature < -40 || temperature > 50) {
            return;
        }
        // Join: replace the numeric sky-condition id with its cloud-genus name.
        String status = sky_status.get(datas[9]);
        k.set(datas[0], datas[1], datas[2], datas[3], datas[4], datas[5], pressure, datas[7], datas[8], status, datas[10], datas[11]);
        context.write(k, NullWritable.get());
    }
}

自定义分组

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

/**
 * Grouping comparator: all records of the same year/month/day go to one
 * reduce() call (requirement 4, rule 1).
 *
 * <p>Fixes vs. the original: fields are compared individually instead of
 * comparing the concatenation year+month+day, which collides for unpadded
 * values ("2020"+"11"+"2" equals "2020"+"1"+"12") and would merge different
 * days into one group. The no-arg constructor is public so Hadoop can
 * instantiate the comparator reflectively.
 */
public class Group extends WritableComparator {
    public Group() {
        // true => create Data instances so compare() receives deserialized keys.
        super(Data.class, true);
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        Data d1 = (Data) a;
        Data d2 = (Data) b;
        int cmp = d1.getYear().compareTo(d2.getYear());
        if (cmp != 0) {
            return cmp;
        }
        cmp = d1.getMonth().compareTo(d2.getMonth());
        if (cmp != 0) {
            return cmp;
        }
        return d1.getDay().compareTo(d2.getDay());
    }
}

Reduce阶段

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Pass-through reducer: emits every record of a day group unchanged, in the
 * sorted order established by {@code Data.compareTo}.
 */
public class RedTest extends Reducer<Data, NullWritable, Data, NullWritable> {
    @Override
    protected void reduce(Data key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
        // Iterating the values advances Hadoop's reused key object through each
        // record of the group, so writing `key` each time emits every record.
        for (NullWritable ignored : values) {
            context.write(key, NullWritable.get());
        }
    }
}

Driver阶段

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.File;
import java.net.URI;

/**
 * Job driver: wires the mapper, reducer, grouping comparator and the cached
 * sky-condition dictionary, clearing any previous output directory first
 * (Hadoop fails if the output path already exists).
 */
public class DriTest {
    public static void main(String[] args) throws Exception {
        File file = new File("D:\\MP\\气象数据\\output");
        // Original called driver() in both branches; delete-if-present then run once.
        if (file.exists()) {
            delFile(file);
        }
        driver();
    }

    /**
     * Recursively deletes a file or directory tree.
     * Reports (to stderr) any entry that could not be deleted instead of
     * silently ignoring the failure like the original did.
     */
    public static void delFile(File file) {
        File[] files = file.listFiles();
        if (files != null) {
            for (File child : files) {
                delFile(child);
            }
        }
        if (!file.delete()) {
            System.err.println("Failed to delete: " + file.getAbsolutePath());
        }
    }

    /** Configures and runs the MapReduce job; exits with 0 on success, 1 on failure. */
    public static void driver() throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        job.setJarByClass(DriTest.class);
        job.setMapperClass(MapTest.class);
        job.setReducerClass(RedTest.class);

        job.setMapOutputKeyClass(Data.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(Data.class);
        job.setOutputValueClass(NullWritable.class);

        // Dictionary file for the map-side join (data set two).
        job.addCacheFile(new URI("file:///D:/MP/气象数据/input/sky.txt"));
        // Group all records of the same day into one reduce() call.
        job.setGroupingComparatorClass(Group.class);

        FileInputFormat.setInputPaths(job, "D:\\MP\\气象数据\\input\\data.txt");
        FileOutputFormat.setOutputPath(job, new Path("D:\\MP\\气象数据\\output"));
        boolean b = job.waitForCompletion(true);
        System.exit(b ? 0 : 1);
    }
}