hive中first_value函数

转载

编程小天匠 2024-12-17 16:26:48

文章标签 hive中first_value函数数据库大数据 java sql 文章分类 Hive 大数据

hive一般用来执行离线统计分析相关的功能，然后将执行的结果导入到数据库的表中供前端报表可视化展现来查询。

导回数据库的方式有许多，sqoop、hive jdbc、mr jdbc等等，但是这几种方式都会有一个二次处理环节（数据需要人工）。

这次介绍另外一种处理方式，直接将对数据库的操作集成在udf中，这样直接写一个hql查询语句就可以了。

代码如下：

package com.taisenki.tools;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.Properties;

import org.apache.commons.io.FilenameUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ConstantObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.io.IntWritable;

/**
 * 在hive0.13版本之上才能注册永久函数，否则只能注册临时函数
 * @author taisenki
 *
 */
@Description(name = "batch_import",  value = "_FUNC_(sql, args1[, args2,...][, config_path]) - Return ret ")  
public class SqlBatchImportUDF extends GenericUDF {

    public static final String DEFAULT_CONFIG_ROOT_PATH = "/user/hive/udf/sjk/"; 
    public static final String DEFAULT_CONFIG_FILE_NAME = "sjk.properties";  
    public static final String DEFAULT_CONFIG_FILE_SUFFIX = "properties";  
    private IntObjectInspector retValInspector; 
    private String sql;   
    private PrimitiveObjectInspector[] paramsInspectors;  
    private int insert = 0;
    private Connection conn;
    private PreparedStatement psi;
    private int count = 0;
    
    @Override
    public void close() throws IOException {
        // TODO Auto-generated method stub
        try {if (insert > 0) {
                psi.executeBatch();
                conn.commit();
            }
        } catch (SQLException e) {
            e.printStackTrace();
        }finally{
            try {
                if(conn != null)
                    conn.close();
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
        super.close();
    }

    @Override
    public Object evaluate(DeferredObject[] arg0) throws HiveException {
        // TODO Auto-generated method stub
        try {
            for (int i = 0; i < count; i++) {  
                Object param = paramsInspectors[i].getPrimitiveJavaObject(arg0[i + 1].get());  
                psi.setObject(i + 1, param);  
            }  
            psi.addBatch();
            insert++;
            if(insert>1000){
                psi.executeBatch();
                conn.commit();
                insert = 0;
            }  
            IntWritable iw = new IntWritable(insert);  
            return retValInspector.getPrimitiveWritableObject(iw);  

        } catch (SQLException e) {
            e.printStackTrace();
            throw new HiveException(e);
        }
    }

    @Override
    public String getDisplayString(String[] arg0) {
        // TODO Auto-generated method stub
        return "batch_import(sql, args1[, args2,...][, config_path])"; 
    }

    @Override
    public ObjectInspector initialize(ObjectInspector[] arg0)
            throws UDFArgumentException {
        // TODO Auto-generated method stub
        if (arg0.length < 2) {  
            throw new UDFArgumentException(" Expecting  at least two arguments ");  
        }  
        insert = 0;
        //第一个参数校验，必须是一个非空的sql语句  
        if (arg0[0].getCategory() == Category.PRIMITIVE  
                && ((PrimitiveObjectInspector) arg0[0]).getPrimitiveCategory() == PrimitiveObjectInspector.PrimitiveCategory.STRING) {  
            if (!(arg0[0] instanceof ConstantObjectInspector)) {  
                throw new UDFArgumentException("the frist arg   must be a sql string constant");  
            }  
            ConstantObjectInspector sqlInsp = (ConstantObjectInspector) arg0[0];  
            this.sql = sqlInsp.getWritableConstantValue().toString();  
            int i = -1;
            count = 0;
            while (true) {
                i = sql.indexOf("?", i + 1);
                if (i == -1) {
                    break;
                }
                count++;
            }
            if (this.sql == null || this.sql.trim().length() == 0) {  
                throw new UDFArgumentException("the frist arg   must be a sql string constant and not nullable");  
            }  
        }  
        
        if (count+1 > arg0.length){
            throw new UDFArgumentException("arguments not enough with this sql["+(arg0.length-1)/count+"]");  
        }
        
        //默认情况
        String fileName1 = SqlBatchImportUDF.DEFAULT_CONFIG_ROOT_PATH + SqlBatchImportUDF.DEFAULT_CONFIG_FILE_NAME;
        //判断是否存在指定的配置文件路径
        if (count+1 < arg0.length){
            //第一个参数校验  
            if (arg0[count+1].getCategory() == Category.PRIMITIVE  
                    && ((PrimitiveObjectInspector) arg0[count+1]).getPrimitiveCategory() == PrimitiveObjectInspector.PrimitiveCategory.STRING) {  
                if (!(arg0[count+1] instanceof ConstantObjectInspector)) {  
                    throw new UDFArgumentException("mysql connection pool config path  must be constant");  
                }  
                ConstantObjectInspector propertiesPath = (ConstantObjectInspector) arg0[count+1];  
      
                fileName1 = propertiesPath.getWritableConstantValue().toString();  
                Path path1 = new Path(fileName1);  
                if (path1.toUri().getScheme() == null) {  
                    if (!"".equals(FilenameUtils.getExtension(fileName1)) && !DEFAULT_CONFIG_FILE_SUFFIX.equals(FilenameUtils.getExtension(fileName1))) {  
                        throw new UDFArgumentException("不支持的文件扩展名，目前只支持properties文件!");  
                    }  
                    //如果是相对路径,补齐根路径  
                    if (!fileName1.startsWith("/")) {  
                        fileName1 = SqlBatchImportUDF.DEFAULT_CONFIG_ROOT_PATH + fileName1;  
                    }  
                }  
                //如果只写了文件前缀的话，补上后缀  
                if (!FilenameUtils.isExtension(fileName1, DEFAULT_CONFIG_FILE_SUFFIX)) {  
                    fileName1 = fileName1 + FilenameUtils.EXTENSION_SEPARATOR_STR + DEFAULT_CONFIG_FILE_SUFFIX;  
                }  
            } 
        }

        Properties properties = new Properties();  
        Configuration conf = new Configuration();  
        Path path2 = new Path(fileName1);  

        try (FileSystem fs = FileSystem.newInstance(path2.toUri(), conf); //这里不能用FileSystem.get(path2.toUri(), conf)，必须得重新newInstance,get出来的是共享的连接，这边关闭的话，会导致后面执行完之后可能出现FileSystem is closed的异常  
             InputStream in = fs.open(path2)) {  
            properties.load(in);
        } catch (FileNotFoundException ex) {  
            throw new UDFArgumentException("在文件系统中或者是HDFS上没有找到对应的配置文件");  
        } catch (Exception e) {  
            e.printStackTrace();  
            throw new UDFArgumentException(e);  
        }  
        
        try {
            Class.forName(properties.getProperty("driverClassName"));
            System.out.println(properties.getProperty("driverClassName"));
            System.out.println(properties.getProperty("url"));
            conn = DriverManager.getConnection(properties.getProperty("url"), properties);
            psi = conn.prepareStatement(sql);
        } catch (SQLException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
            throw new UDFArgumentException(e);  
        } catch (ClassNotFoundException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
            throw new UDFArgumentException(e);  
        }  
        //中间为参数
        paramsInspectors = new PrimitiveObjectInspector[count];  
        for (int i = 0; i < count; i++) {  
            paramsInspectors[i] = (PrimitiveObjectInspector) arg0[i+1];  
        }  
        retValInspector = PrimitiveObjectInspectorFactory.writableIntObjectInspector; 
        return retValInspector;
    }

}

然后上传jar包，注册udf，注意此处需把对应数据库的驱动包一同进行注册操作：

如回写oracle: create function default.oracleSave as 'com.taisenki.tools.SqlBatchImportUDF' using jar 'hdfs://cdh5/data/lib/test.jar', jar 'hdfs://cdh5/data/lib/ojdbc6.jar';
然后写一个HQL测试一下：
select oracleSave('insert into test111 values (?)',b.id) from (select 2 id from dual) b;

UDF第一个参数是静态参数，是对应数据库的sql语句，描述入库方式，然后后面的参数就不固定了，一一对应sql语句中的占位符，比如我上面有1个占位符，然后我后面就跟了1个参数。

若传入的参数恰好比占位符多1个的时候，最后一个参数则为指定数据库配置文件名，里面配置了如何开启连接池连接哪个数据库什么的。

附上一个默认的sjk.properties：

driverClassName=oracle.jdbc.driver.OracleDriver
url=jdbc:oracle:thin:@host:port:inst
user=test
password=test

此处注意，如果是hive 0.13以下的版本，是不支持注册永久function的，请使用
create temporaryfunction来进行，而且只支持session级别的，断开后自动消失……

本文章为转载内容，我们尊重原作者对文章享有的著作权。如有内容错误或侵权问题，欢迎原作者联系我们进行内容更正或删除文章。

上一篇：LXC容器模版

下一篇：java 对方feign自己微服务之前拦截执行

提问和评论都可以，用心的回复会被更多人看到评论

发布评论

相关文章

官方博客	全部文章	热门标签	班级博客
了解我们	网站地图	意见反馈

鸿蒙开发者社区	51CTO学堂
51CTO	软考资讯

hive中first_value函数

hive中first_value函数

51CTO博客