A UDTF (user-defined table-generating function) operates on a single input row and produces multiple rows as output. In short: one row goes in, several rows come out. Here I will introduce UDTFs through two examples.
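To see what "one row in, many rows out" means, Hive's built-in explode() is itself the classic UDTF. A quick illustration (on recent Hive versions a FROM-less SELECT like this works; on older versions you may need to select from an existing table):
select explode(array(1, 2, 3));
This returns three rows containing 1, 2 and 3.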
First, add the Maven dependency:
<dependency>
    <groupId>org.apache.hive</groupId>
    <artifactId>hive-exec</artifactId>
    <version>2.3.2</version>
</dependency>
Then write a class that extends GenericUDTF.
The work is mainly in overriding three methods of this class:
- initialize
- process
- close
Their roles are explained in the code comments below.
The first UDTF parses key-value style strings (key:value:value records separated by semicolons); the code is as follows:
package blog;

import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

import java.util.ArrayList;

/**
 * @Author Daniel
 * @Description Parse strings of the form key:value:value
 **/
public class UdtfExplodeMap extends GenericUDTF {

    // Override initialize to validate the argument and declare the output schema
    @Override
    public StructObjectInspector initialize(ObjectInspector[] args) throws UDFArgumentException {
        // Exactly one argument is expected
        if (args.length != 1) {
            throw new UDFArgumentLengthException("ExplodeMap takes only one argument");
        }
        // The argument must be a primitive (string) type
        if (args[0].getCategory() != ObjectInspector.Category.PRIMITIVE) {
            throw new UDFArgumentException("ExplodeMap takes string as a parameter");
        }
        // Output column names
        ArrayList<String> fieldNames = new ArrayList<String>();
        // ObjectInspectors describing each output column
        ArrayList<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>();
        // Declare a column name
        fieldNames.add("id");
        // Expose the column as a type Hive understands (Java String)
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        fieldNames.add("lower");
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        fieldNames.add("upper");
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        // Wrap the names and inspectors into a struct ObjectInspector for the output rows
        return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);
    }

    // Override process: the real logic lives here
    @Override
    public void process(Object[] args) throws HiveException {
        String input = args[0].toString();
        // Split the input into records
        String[] tokens = input.split(";");
        for (int i = 0; i < tokens.length; i++) {
            try {
                // Split a record into its fields
                String[] result = tokens[i].split(":");
                // Emit one output row to Hive
                forward(result);
            } catch (Exception e) {
                // Skip malformed records
                continue;
            }
        }
    }

    // Override close: normally used to release resources held in fields; nothing to release here
    @Override
    public void close() throws HiveException {
    }
}
The second UDTF turns a one-to-many relationship into one-to-one rows; the single input value is effectively a list of values. The code is as follows:
package blog;

import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

import java.util.ArrayList;
import java.util.Iterator;

/**
 * @Author Daniel
 * @Description Expand a one-to-many row into multiple one-to-one rows
 **/
public class UdtfExplodeRow extends GenericUDTF {

    // ObjectInspector for the input column; used in process() to read the value Hive passes in
    private PrimitiveObjectInspector stringOI = null;

    @Override
    public StructObjectInspector initialize(ObjectInspector[] args) throws UDFArgumentException {
        if (args.length != 1) {
            throw new UDFArgumentLengthException("ExplodeRow takes only one argument");
        }
        if (args[0].getCategory() != ObjectInspector.Category.PRIMITIVE) {
            throw new UDFArgumentException("ExplodeRow takes string as a parameter");
        }
        // stringOI must be initialized here, in initialize()
        stringOI = (PrimitiveObjectInspector) args[0];
        ArrayList<String> fieldNames = new ArrayList<String>();
        ArrayList<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>();
        fieldNames.add("id");
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        fieldNames.add("phone_number");
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);
    }

    @Override
    public void process(Object[] args) throws HiveException {
        // Read the raw row value from the table; skip NULL or empty rows
        Object raw = stringOI.getPrimitiveJavaObject(args[0]);
        if (raw == null) {
            return;
        }
        String id = raw.toString();
        if (id.isEmpty()) {
            return;
        }
        ArrayList<Object[]> result = new ArrayList<Object[]>();
        // Fields within a row are separated by commas: id,phone1,phone2,...
        String[] tokens = id.split(",");
        // Emit one (id, phone_number) pair for every phone number after the id,
        // so rows with any number of phone numbers are handled
        for (int i = 1; i < tokens.length; i++) {
            result.add(new Object[]{tokens[0], tokens[i]});
        }
        Iterator<Object[]> it = result.iterator();
        while (it.hasNext()) {
            Object[] r = it.next();
            forward(r);
        }
    }

    @Override
    public void close() throws HiveException {
    }
}
Next, package the project into a jar (e.g. with mvn clean package) and upload it to the Linux machine.
The test data is as follows (the letter file first, then the phone file):
letter
1:a:A;2:b:B;3:c:C;4:d:D;
phone
123,phone1,phone2,phone3
123,phone1,phone3
124,phone1
125,phone1,phone3
126,phone1,phone2
127,phone1
128,phone1,phone3
129,phone1
130,phone1,phone2
hql:
add jar /home/hadoop/hive_jar/udtf.jar;
create temporary function ExplodeMap as "blog.UdtfExplodeMap";
create temporary function ExplodeRow as "blog.UdtfExplodeRow";
create table if not exists letter(data string);
load data local inpath '/home/hadoop/hive_data/letter' into table letter;
select * from letter;
select ExplodeMap(data) from letter;
You can see that the string has been converted into a relational table: each id record now carries its own lower and upper columns.
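If you also need other columns from the source table alongside the UDTF output, the usual pattern is LATERAL VIEW. A sketch for ExplodeMap (the aliases t, id, lower, upper are just illustrative):
select t.id, t.lower, t.upper from letter lateral view ExplodeMap(data) t as id, lower, upper;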
Next, test the second UDTF.
hql:
create table if not exists phone(id string) row format delimited fields terminated by "\n";
load data local inpath '/home/hadoop/hive_data/phone' into table phone;
select * from phone;
select ExplodeRow(id) from phone;
You can see that the one-to-many relationship has been converted into a one-to-one relationship.
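As with the first function, ExplodeRow can be combined with LATERAL VIEW when you want the original row next to the exploded pairs. A sketch (the aliases raw_row and t are just illustrative):
select p.id as raw_row, t.id, t.phone_number from phone p lateral view ExplodeRow(p.id) t as id, phone_number;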