关于 Spark 通过 pipeline 方式批量写入 Redis 集群,网上的资料比较少,能找到的大多是单机版 Redis 的写法(Spring 倒是有写入 Redis 集群的实现代码)。下面整理了 Spark 通过 pipeline 批量写入 Redis 集群的方式,速度确实快;不然一条条 set 进去,真的是天都要黑了。

需要用到的 Maven 依赖如下(Spark 相关依赖略;MyBatis 仅用于借助其反射工具类读取 JedisCluster 的内部字段):

<dependency>
            <groupId>org.mybatis</groupId>
            <artifactId>mybatis</artifactId>
            <version>3.5.2</version>
        </dependency>
        <dependency>
            <groupId>redis.clients</groupId>
            <artifactId>jedis</artifactId>
            <version>3.3.0</version>
        </dependency>

以下是 Spark 集成 Redis Cluster 的示例代码:

import org.apache.spark.SparkConf;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import redis.clients.jedis.HostAndPort;
import redis.clients.jedis.JedisCluster;
import redis.clients.jedis.JedisPoolConfig;

import java.util.HashSet;
import java.util.Set;

/**
 * Demo Spark job that writes every row of a small dataset into a Redis Cluster
 * hash ({@code TEST:PERSON}) using per-node pipelines.
 *
 * <p>One {@link JedisCluster} and one {@code JedisClusterPipeline} are created per
 * partition; both are released in {@code finally} blocks so that connections are
 * returned even when writing a row fails.
 */
public class SparkPiplineRedis {

    /**
     * Runs the demo job against a local Spark master.
     *
     * @param args unused
     * @throws IllegalArgumentException (from within the partition task) if the
     *         {@code hosts}/{@code ports} placeholders yield no cluster node
     */
    public static void main(String[] args) {
        SparkConf conf = new SparkConf();
        conf.setAppName("test").setMaster("local");
        SparkSession session = SparkSession.builder().config(conf).getOrCreate();

        try {
            Dataset<Row> dataset = session.sql("" +
                    "select '1001' id,'jeff' name,1 age " +
                    "union all " +
                    "select '1002' id,'kitty' name,2 age ");

            // Placeholders: comma-separated cluster hosts and ports, to be filled in
            // before running. Every host is combined with every port.
            String hosts = "";
            String ports = "";

            dataset.foreachPartition(iter -> {
                JedisPoolConfig jedisPoolConfig = new JedisPoolConfig();
                jedisPoolConfig.setMaxTotal(10);       // maximum connections per pool
                jedisPoolConfig.setMaxIdle(10);        // maximum idle connections per pool
                jedisPoolConfig.setMaxWaitMillis(2000); // wait at most 2s for a connection
                jedisPoolConfig.setTestOnBorrow(true);

                Set<HostAndPort> clusterNodes = parseClusterNodes(hosts, ports);
                JedisCluster jedisCluster = new JedisCluster(clusterNodes, jedisPoolConfig);
                try {
                    JedisClusterPipeline jedisClusterPipeline = new JedisClusterPipeline(jedisCluster);
                    try {
                        while (iter.hasNext()) {
                            Row row = iter.next();
                            String id = row.getAs("id").toString();
                            String name = row.getAs("name").toString();
                            jedisClusterPipeline.hsetByPipeline("TEST:PERSON", id, name);
                        }
                    } finally {
                        // Always hand the borrowed connections back, even if a row failed.
                        jedisClusterPipeline.releaseConnection();
                    }
                } finally {
                    // The cluster client owns the connection pools; without close() they
                    // would leak once per processed partition.
                    jedisCluster.close();
                }
            });
        } finally {
            session.stop();
        }
    }

    /**
     * Builds the set of cluster seed nodes as the cross product of the given hosts
     * and ports. Blank entries (for example from a trailing comma) are skipped.
     *
     * @param hosts comma-separated host names or IP addresses
     * @param ports comma-separated port numbers
     * @return a non-empty set of host/port pairs
     * @throws IllegalArgumentException if no host/port pair could be built
     * @throws NumberFormatException if a non-blank port is not a valid integer
     */
    private static Set<HostAndPort> parseClusterNodes(String hosts, String ports) {
        Set<HostAndPort> nodes = new HashSet<HostAndPort>();
        for (String rawHost : hosts.split(",")) {
            String host = rawHost.trim();
            if (host.isEmpty()) {
                continue;
            }
            for (String rawPort : ports.split(",")) {
                String port = rawPort.trim();
                if (port.isEmpty()) {
                    continue;
                }
                nodes.add(new HostAndPort(host, Integer.parseInt(port)));
            }
        }
        if (nodes.isEmpty()) {
            throw new IllegalArgumentException(
                    "No Redis cluster nodes configured: hosts='" + hosts + "', ports='" + ports + "'");
        }
        return nodes;
    }
}

 

JedisCluster 管道(pipeline)方式的实现代码如下(转载,原始出处已无从查证):

import org.apache.ibatis.reflection.MetaObject;
import org.apache.ibatis.reflection.SystemMetaObject;
import redis.clients.jedis.*;
import redis.clients.jedis.exceptions.JedisNoReachableClusterNodeException;
import redis.clients.jedis.util.JedisClusterCRC16;

import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.CopyOnWriteArrayList;

/**
 * Pipelined writes on top of {@link JedisCluster}.
 *
 * <p>The cluster's internal {@link JedisClusterInfoCache} and
 * {@link JedisSlotBasedConnectionHandler} are read reflectively from the
 * {@code JedisCluster} passed to the constructor. For each key the owning node's
 * {@link JedisPool} is looked up by hash slot, and one {@link Pipeline} per pool is
 * opened lazily and reused until {@link #releaseConnection()} is called.
 *
 * <p>NOTE(review): although concurrent collections are used, the lookup-then-insert
 * sequences in this class are not atomic, so an instance should be confined to a
 * single thread.
 *
 * <p>NOTE(review): pipelines are flushed with {@code sync()}, which does not return
 * the replies; per-command errors (e.g. cluster redirections) would presumably go
 * unnoticed - confirm against the Jedis documentation if that matters.
 */
public class JedisClusterPipeline {

    /** Number of commands queued on one pipeline after which it is flushed. */
    private static final int MAX_COUNT = 10000;

    /** Slot-to-pool cache taken from the wrapped {@link JedisCluster}. */
    private final JedisClusterInfoCache clusterInfoCache;
    /** Connection handler taken from the wrapped {@link JedisCluster}; used to refresh the slot cache. */
    private final JedisSlotBasedConnectionHandler connectionHandler;
    /** The wrapped cluster client. */
    private final JedisCluster jedisCluster;

    /** Every connection borrowed so far, kept so that all of them can be returned together. */
    private final CopyOnWriteArrayList<Jedis> jedisList = new CopyOnWriteArrayList<>();
    /** The pipeline opened for each node pool, so that each pool gets at most one pipeline. */
    private final ConcurrentHashMap<JedisPool, Pipeline> pipelines = new ConcurrentHashMap<>();
    /** Number of commands queued on each pipeline since its last flush. */
    private final ConcurrentHashMap<Pipeline, Integer> nums = new ConcurrentHashMap<>();

    /**
     * Wraps the given cluster client and extracts its connection handler and slot
     * cache via reflection.
     *
     * @param jedisCluster the cluster client to pipeline against; must not be null
     * @throws NullPointerException if {@code jedisCluster} is null
     * @throws IllegalStateException if the internal fields could not be read
     */
    public JedisClusterPipeline(JedisCluster jedisCluster) {
        if (jedisCluster == null) {
            throw new NullPointerException("jedisCluster");
        }
        this.jedisCluster = jedisCluster;
        MetaObject metaObject = SystemMetaObject.forObject(jedisCluster);
        clusterInfoCache = (JedisClusterInfoCache) metaObject.getValue("connectionHandler.cache");
        connectionHandler = (JedisSlotBasedConnectionHandler) metaObject.getValue("connectionHandler");
        if (clusterInfoCache == null || connectionHandler == null) {
            // Fail here with a clear message rather than with an NPE on the first write.
            throw new IllegalStateException(
                    "Could not read connectionHandler/cache from JedisCluster via reflection");
        }
    }

    /**
     * Queues an {@code HSET key field value} on the pipeline of the node that owns
     * {@code key}; the pipeline is flushed once {@code MAX_COUNT} commands are queued.
     *
     * @param key   hash key, also used to determine the target slot
     * @param field hash field
     * @param value value to store
     */
    public void hsetByPipeline(String key, String field, String value) {
        Pipeline pipeline = getPipeline(key);
        pipeline.hset(key, field, value);
        nums.merge(pipeline, 1, Integer::sum);
        this.maxSync(pipeline);
    }

    /**
     * Flushes every open pipeline and returns all borrowed connections.
     *
     * <p>Each pipeline is synced explicitly before its connection is closed, so
     * that commands queued since the last threshold flush are sent and their replies
     * drained. All connections are closed even if one sync or close fails; the first
     * failure is rethrown afterwards with later ones attached as suppressed. The
     * internal bookkeeping is cleared, so subsequent writes open fresh pipelines.
     */
    public void releaseConnection() {
        RuntimeException failure = null;
        for (Pipeline pipeline : pipelines.values()) {
            try {
                pipeline.sync();
            } catch (RuntimeException e) {
                failure = collect(failure, e);
            }
        }
        for (Jedis jedis : jedisList) {
            try {
                jedis.close();
            } catch (RuntimeException e) {
                failure = collect(failure, e);
            }
        }
        // Drop references to pipelines whose connections are no longer ours.
        jedisList.clear();
        pipelines.clear();
        nums.clear();
        if (failure != null) {
            throw failure;
        }
    }

    /** Keeps the first failure and records any later one as suppressed. */
    private static RuntimeException collect(RuntimeException first, RuntimeException next) {
        if (first == null) {
            return next;
        }
        first.addSuppressed(next);
        return first;
    }

    /**
     * Looks up the connection pool of the node that owns the slot of {@code key}.
     * If the cache has no pool for the slot, the slot cache is refreshed once and
     * the lookup is retried.
     *
     * @param key key whose slot determines the node
     * @return the pool for the owning node
     * @throws JedisNoReachableClusterNodeException if no pool is known for the slot
     *         even after refreshing the cache
     */
    private JedisPool getJedisPool(String key) {
        int slot = JedisClusterCRC16.getSlot(key);
        JedisPool jedisPool = clusterInfoCache.getSlotPool(slot);
        if (jedisPool != null) {
            return jedisPool;
        }
        // Unknown slot: refresh the slot-to-pool mapping and try once more.
        connectionHandler.renewSlotCache();
        jedisPool = clusterInfoCache.getSlotPool(slot);
        if (jedisPool == null) {
            throw new JedisNoReachableClusterNodeException("No reachable node in cluster for slot " + slot);
        }
        return jedisPool;
    }

    /**
     * Returns the pipeline for the node that owns {@code key}, opening it on first use.
     * On first use a connection is borrowed from the node's pool and registered in
     * {@code jedisList}, {@code pipelines} and {@code nums}.
     *
     * @param key key whose slot determines the node
     * @return the (possibly newly opened) pipeline for that node
     */
    private Pipeline getPipeline(String key) {
        JedisPool jedisPool = getJedisPool(key);
        Pipeline pipeline = pipelines.get(jedisPool);
        if (pipeline == null) {
            Jedis jedis = jedisPool.getResource();
            // Register the connection first so it is returned by releaseConnection()
            // even if opening the pipeline fails.
            jedisList.add(jedis);
            pipeline = jedis.pipelined();
            pipelines.put(jedisPool, pipeline);
            nums.put(pipeline, 0);
        }
        return pipeline;
    }

    /**
     * Flushes {@code pipeline} and resets its counter once {@code MAX_COUNT}
     * commands have been queued on it.
     *
     * @param pipeline the pipeline whose counter is checked
     */
    private void maxSync(Pipeline pipeline) {
        Integer num = nums.get(pipeline);
        if (num != null && num >= MAX_COUNT) {
            pipeline.sync();
            nums.put(pipeline, 0);
        }
    }
}