1. Overriding ImageInputFormat

package org.example;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import java.io.IOException;

/**
 * @program: hadoop2
 * @description: Custom image input class. Extends the file input class and overrides how records are read:
 * the output key is the file name, the value is the raw bytes of the whole image.
 * @author: Brooke
 * @create: 2023-12-29 13:58
 **/
public class ImageInputFormat extends FileInputFormat<Text, BytesWritable> {
    /*
    Override the method that creates the RecordReader for a split.
    Because the reading logic has to be redefined, a dedicated ImageRecordReader class is created for it.
     */
    @Override
    public RecordReader<Text, BytesWritable> createRecordReader(InputSplit inputSplit, TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
        return new ImageRecordReader();// our custom reader, defined in the next section
    }

    /*
    Each image must be read as a single whole file, so splitting is disabled.
     */
    @Override
    protected boolean isSplitable(JobContext context, Path filename) {
        return false;
    }
}
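
Because isSplitable() returns false, each image file produces exactly one split regardless of the HDFS block size. A minimal local check is sketched below; the input path /images and the class name SplitCountCheck are only illustrative, not part of the original code.

package org.example;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import java.util.List;

public class SplitCountCheck {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration());
        FileInputFormat.addInputPath(job, new Path("/images"));// illustrative input directory

        // Because isSplitable() returns false, getSplits() yields one split per image file.
        List<InputSplit> splits = new ImageInputFormat().getSplits(job);
        System.out.println("splits = " + splits.size());
    }
}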

2. ImageRecordReader

package org.example;

import org.apache.commons.io.IOUtils;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import java.io.IOException;
import java.io.InputStream;

/**
 * @program: hadoop2
 * @description: The output key is the file name; the value is the raw bytes of the whole image.
 * @author: Brooke
 * @create: 2023-12-29 14:04
 **/
public class ImageRecordReader extends RecordReader<Text, BytesWritable> {

    private FSDataInputStream in;// input stream for the image
    private String imageName;// image file name
    private boolean isHandle;// whether the single record has been read

    /*
    initialize() does two things:
    1. opens an input stream for the file behind this split
    2. records the image's file name, which later becomes the key
     */
    @Override
    public void initialize(InputSplit inputSplit, TaskAttemptContext context) throws IOException, InterruptedException {
        FileSplit inputSplit1 = (FileSplit) inputSplit;// cast the generic split to a FileSplit so its path is available
        // obtain the file system from the job configuration
        FileSystem fs = FileSystem.get(context.getConfiguration());

        in = fs.open(inputSplit1.getPath());// open an input stream on the split's file
        imageName = inputSplit1.getPath().getName();// the image's file name becomes the key

        isHandle = false;
    }

    /*
    Is there another key/value pair to read?
    Returns true until the single record has been produced.
     */
    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        if (isHandle){ // set to true in getCurrentValue() once the image has been read
            return false; // no more key/value pairs to read
        }
        return true;
    }

    @Override
    public Text getCurrentKey() throws IOException, InterruptedException {
        // the key is simply the image file name
        return new Text(imageName);
    }

    /*
    The value is the raw bytes of the whole image.
     */
    @Override
    public BytesWritable getCurrentValue() throws IOException, InterruptedException {
        // available() reports the bytes remaining in the stream; since nothing has been read yet,
        // it is used here as the length of the whole file (this holds on HDFS; a variant based on
        // the split length is sketched after this class)
        int length = in.available();
        byte[] bytes = new byte[length];// buffer that will hold the image bytes
        // commons-io utility: read exactly `length` bytes from `in` into `bytes`
        IOUtils.read(in, bytes, 0, length);

        isHandle = true;// the single record has now been produced

        return new BytesWritable(bytes);
    }

    /*
    Reading progress: isHandle marks whether the record has been read; return 1.0f once it has.
     */
    @Override
    public float getProgress() throws IOException, InterruptedException {
        if (isHandle){ // true means the image has already been read
            return 1.0f;
        }
        return 0;
    }

    @Override
    public void close() throws IOException {
        in.close();
    }
}
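
Note: available() happens to report the remaining file length on HDFS, but the InputStream contract does not guarantee this for every FileSystem. A minimal alternative, assuming the FileSplit obtained in initialize() is kept in a field, sizes the buffer from the split instead:

    // Sketch of an alternative getCurrentValue(), assuming a field
    // `private FileSplit fileSplit;` is assigned in initialize().
    @Override
    public BytesWritable getCurrentValue() throws IOException, InterruptedException {
        int length = (int) fileSplit.getLength();// file length as reported by the split
        byte[] bytes = new byte[length];
        org.apache.hadoop.io.IOUtils.readFully(in, bytes, 0, length);// read the whole image
        isHandle = true;
        return new BytesWritable(bytes);
    }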

3. ImageJobMapper

package org.example;

import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * @program: hadoop2
 * @description:
 * @author: Brooke
 * @create: 2023-12-29 15:14
 **/
public class ImageJobMapper extends Mapper<Text, BytesWritable,Text,BytesWritable> {
    /*
    The map performs no transformation; each <file name, image bytes> pair is passed straight through.
     */
    @Override
    protected void map(Text key, BytesWritable value, Mapper<Text, BytesWritable, Text, BytesWritable>.Context context) throws IOException, InterruptedException {
        context.write(key,value);
    }
}

4. ImageJobReducer

package org.example;

import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.ByteArrayInputStream;
import java.io.IOException;

/**
 * @program: hadoop2
 * @description: The input kv pairs are the <image name, image bytes> read by ImageInputFormat;
 * the output is <image name, image attribute bean>.
 * @author: Brooke
 * @create: 2023-12-29 15:24
 **/
public class ImageJobReducer extends Reducer<Text, BytesWritable, Text, ImageBean> {

    ByteArrayInputStream byteArrayInputStream;
    @Override
    protected void reduce(Text key, Iterable<BytesWritable> values, Reducer<Text, BytesWritable, Text, ImageBean>.Context context) throws IOException, InterruptedException {
        // The key is still the image name; the value bytes are decoded back into an image to obtain its width and height.
        for (BytesWritable item : values) {
            // copyBytes() returns exactly getLength() valid bytes; getBytes() would also include
            // the unused tail of the backing array
            byte[] bytes = item.copyBytes();

            // ByteArrayInputStream wraps the byte array so it can be consumed through the InputStream API
            byteArrayInputStream = new ByteArrayInputStream(bytes);
        }
        // Decode the stream into an image.
        // BufferedImage is Java's in-memory image representation and carries the pixel data, width and height.
        BufferedImage bufferedImage = ImageIO.read(byteArrayInputStream);

        ImageBean imageBean = new ImageBean(bufferedImage.getWidth(), bufferedImage.getHeight());// this becomes the output value

        context.write(key, imageBean);

        byteArrayInputStream.close();// close the stream
    }
}

5. ImageBean

package org.example;

import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/**
 * @program: hadoop2
 * @description:
 * @author: Brooke
 * @create: 2023-12-29 15:16
 **/
public class ImageBean implements Writable {
    private int width;
    private int height;

    public ImageBean() {
    }

    public ImageBean(int width, int height) {
        this.width = width;
        this.height = height;
    }

    @Override
    public void write(DataOutput output) throws IOException {
        output.writeInt(this.width);
        output.writeInt(this.height);
    }

    @Override
    public void readFields(DataInput input) throws IOException {
        this.width = input.readInt();
        this.height = input.readInt();
    }

    @Override
    public String toString() {
        return "ImageBean{" +
                "width=" + width +
                ", height=" + height +
                '}';
    }

    public int getWidth() {
        return width;
    }

    public void setWidth(int width) {
        this.width = width;
    }

    public int getHeight() {
        return height;
    }

    public void setHeight(int height) {
        this.height = height;
    }
}
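
A quick way to confirm that write() and readFields() are symmetric is to round-trip an ImageBean through a byte buffer. The snippet below is only a local sanity check (it can live in any main method); the 1920x1080 size is arbitrary.

        ImageBean original = new ImageBean(1920, 1080);
        java.io.ByteArrayOutputStream buffer = new java.io.ByteArrayOutputStream();
        original.write(new java.io.DataOutputStream(buffer));// serialize with write()

        ImageBean restored = new ImageBean();
        restored.readFields(new java.io.DataInputStream(
                new java.io.ByteArrayInputStream(buffer.toByteArray())));// deserialize with readFields()
        System.out.println(restored);// ImageBean{width=1920, height=1080}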

6. ImageOutputFormat

package org.example;

import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * @program: hadoop2
 * @description: custom file output class
 * @author: Brooke
 * @create: 2023-12-29 15:54
 **/
public class ImageOutputFormat extends FileOutputFormat<Text,ImageBean> {
    @Override
    public RecordWriter<Text, ImageBean> getRecordWriter(TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
        // file system handle from the task configuration
        FileSystem fs = FileSystem.get(taskAttemptContext.getConfiguration());
        // open an output stream on the default work file for this task attempt
        FSDataOutputStream outputStream = fs.create(super.getDefaultWorkFile(taskAttemptContext, ""));
        // hand the stream to the RecordWriter, which writes each record to it
        ImageRecordWriter image = new ImageRecordWriter(outputStream);
        return image;
    }
}

7. ImageRecordWriter

package org.example;

import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

import java.io.IOException;
import java.nio.charset.StandardCharsets;

/**
 * @program: hadoop2
 * @description:
 * @author: Brooke
 * @create: 2023-12-29 16:11
 **/
public class ImageRecordWriter extends RecordWriter<Text,ImageBean> {
    // writing to HDFS requires an FSDataOutputStream instance
    private FSDataOutputStream outputStream;

    // the constructor lets ImageOutputFormat pass in the stream for this task's work file
    public ImageRecordWriter(FSDataOutputStream outputStream) {
        this.outputStream = outputStream;
    }

    /*
    Write the file name and the ImageBean to the task's output file, one record per line.
     */
    @Override
    public void write(Text text, ImageBean imageBean) throws IOException, InterruptedException {
        // writeUTF() would add a binary length prefix and no record separator,
        // so each record is written as plain UTF-8 text terminated by a newline instead
        outputStream.write((text.toString() + "," + imageBean.toString() + "\n").getBytes(StandardCharsets.UTF_8));
    }

    @Override
    public void close(TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
        outputStream.close();
    }
}
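
8. ImageJobDriver (sketch)

The sections above stop at the RecordWriter; a driver is still needed to wire the formats, mapper, and reducer into a Job. A minimal sketch follows. The class name ImageJobDriver and the use of args for the input and output paths are choices made here for illustration, not part of the original code.

package org.example;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class ImageJobDriver {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "image-size");
        job.setJarByClass(ImageJobDriver.class);

        job.setInputFormatClass(ImageInputFormat.class);// custom input: <file name, image bytes>
        job.setOutputFormatClass(ImageOutputFormat.class);// custom output: "name,ImageBean{...}" lines

        job.setMapperClass(ImageJobMapper.class);
        job.setReducerClass(ImageJobReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(BytesWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(ImageBean.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));// directory of images
        FileOutputFormat.setOutputPath(job, new Path(args[1]));// must not already exist

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}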