There is not much material online about batch-inserting into a Redis cluster from Spark via a pipeline; most of what you find covers the single-node case, and Spring does ship code for writing to a Redis cluster. Below is a write-up of batch writes from Spark through a pipeline. It really is fast; setting keys one by one, you would be waiting until dark.
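For contrast, the single-node case that most of the material online covers looks roughly like the sketch below (plain Jedis; the host, port and key names are placeholders). It shows why pipelining beats setting keys one by one: the whole batch costs a single network round trip instead of one per command.
import redis.clients.jedis.Jedis;
import redis.clients.jedis.Pipeline;

public class SingleNodePipelineDemo {
    public static void main(String[] args) {
        try (Jedis jedis = new Jedis("127.0.0.1", 6379)) {
            Pipeline p = jedis.pipelined();
            for (int i = 0; i < 10000; i++) {
                p.set("key:" + i, "value:" + i); // buffered client-side, not sent yet
            }
            p.sync(); // send everything and read the replies in one round trip
        }
    }
}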
The Maven dependencies involved are the following (Spark dependencies omitted):
<dependency>
    <groupId>org.mybatis</groupId>
    <artifactId>mybatis</artifactId>
    <version>3.5.2</version>
</dependency>
<dependency>
    <groupId>redis.clients</groupId>
    <artifactId>jedis</artifactId>
    <version>3.3.0</version>
</dependency>
Below is a partial example of Spark integrating with the Redis cluster:
import org.apache.spark.SparkConf;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import redis.clients.jedis.HostAndPort;
import redis.clients.jedis.JedisCluster;
import redis.clients.jedis.JedisPoolConfig;
import java.util.HashSet;
import java.util.Set;
public class SparkPiplineRedis {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf();
        conf.setAppName("test").setMaster("local");
        SparkSession session = SparkSession.builder().config(conf).getOrCreate();
        // build a small test dataset with two rows: (id, name, age)
        Dataset<Row> dataset = session.sql("" +
                "select '1001' id,'jeff' name,1 age " +
                "union all " +
                "select '1002' id,'kitty' name,2 age ");
        String hosts = ""; // comma-separated cluster node IPs
        String ports = ""; // comma-separated cluster node ports
        dataset.foreachPartition(iter -> {
            JedisPoolConfig jedisPoolConfig = new JedisPoolConfig();
            jedisPoolConfig.setMaxTotal(10);        // maximum number of connections, default is 8
            jedisPoolConfig.setMaxIdle(10);         // redis.maxIdle
            jedisPoolConfig.setMaxWaitMillis(2000); // 2s
            jedisPoolConfig.setTestOnBorrow(true);
            Set<HostAndPort> hostAndPortsSet = new HashSet<>();
            for (String ip : hosts.split(",")) {
                for (String port : ports.split(",")) {
                    hostAndPortsSet.add(new HostAndPort(ip, Integer.parseInt(port)));
                }
            }
            JedisCluster jedisCluster = new JedisCluster(hostAndPortsSet, jedisPoolConfig);
            JedisClusterPipeline jedisClusterPipeline = new JedisClusterPipeline(jedisCluster);
            while (iter.hasNext()) {
                Row row = iter.next();
                String id = row.getAs("id").toString();
                String name = row.getAs("name").toString();
                // write each row as a field of the hash TEST:PERSON through the pipeline
                jedisClusterPipeline.hsetByPipeline("TEST:PERSON", id, name);
            }
            // flush the remaining buffered commands and return the connections
            jedisClusterPipeline.releaseConnection();
            jedisCluster.close();
        });
        session.stop();
    }
}
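After the job has run, the write can be verified by reading the hash back with a plain JedisCluster client, for example (a quick sketch; the node address is a placeholder):
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import redis.clients.jedis.HostAndPort;
import redis.clients.jedis.JedisCluster;

public class CheckResult {
    public static void main(String[] args) {
        Set<HostAndPort> nodes = new HashSet<>();
        nodes.add(new HostAndPort("127.0.0.1", 7000)); // placeholder node address
        try (JedisCluster cluster = new JedisCluster(nodes)) {
            // should print {1001=jeff, 1002=kitty} once the Spark job above has run
            Map<String, String> person = cluster.hgetAll("TEST:PERSON");
            System.out.println(person);
        }
    }
}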
The pipeline implementation on top of JedisCluster is as follows (reposted from somewhere I have since forgotten):
import org.apache.ibatis.reflection.MetaObject;
import org.apache.ibatis.reflection.SystemMetaObject;
import redis.clients.jedis.*;
import redis.clients.jedis.exceptions.JedisNoReachableClusterNodeException;
import redis.clients.jedis.util.JedisClusterCRC16;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.CopyOnWriteArrayList;
/**
 * Pipeline support built on top of JedisCluster.
 * Core objects: JedisClusterInfoCache and JedisSlotBasedConnectionHandler.
 * The JedisCluster instance is passed in through the constructor.
 */
public class JedisClusterPipeline {
    /**
     * Constructor.
     * Obtains the JedisClusterInfoCache and JedisSlotBasedConnectionHandler from the
     * JedisCluster instance (via MyBatis' MetaObject reflection helper).
     * @param jedisCluster
     */
    public JedisClusterPipeline(JedisCluster jedisCluster) {
        this.jedisCluster = jedisCluster;
        MetaObject metaObject = SystemMetaObject.forObject(jedisCluster);
        clusterInfoCache = (JedisClusterInfoCache) metaObject.getValue("connectionHandler.cache");
        connectionHandler = (JedisSlotBasedConnectionHandler) metaObject.getValue("connectionHandler");
    }
    /** Threshold at which buffered pipeline commands are flushed */
    private final int MAX_COUNT = 10000;
    /** Cached cluster topology information, provided by Jedis */
    private JedisClusterInfoCache clusterInfoCache;
    /** Connection handler (extends JedisClusterConnectionHandler) exposing friendlier accessors, provided by Jedis */
    private JedisSlotBasedConnectionHandler connectionHandler;
    /** Cluster client, provided by Jedis */
    private JedisCluster jedisCluster;
    /** Jedis connections obtained so far, kept so they can all be released together */
    private CopyOnWriteArrayList<Jedis> jedisList = new CopyOnWriteArrayList<>();
    /** JedisPool -> opened Pipeline, ensures the pipeline for a slot's node is opened only once */
    private ConcurrentHashMap<JedisPool, Pipeline> pipelines = new ConcurrentHashMap<>();
    /** Number of commands buffered per pipeline; when the count reaches the threshold the pipeline is synced */
    private ConcurrentHashMap<Pipeline, Integer> nums = new ConcurrentHashMap<>();
    public void hsetByPipeline(String key, String field, String value) {
        Pipeline pipeline = getPipeline(key);
        pipeline.hset(key, field, value);
        nums.put(pipeline, nums.get(pipeline) + 1);
        this.maxSync(pipeline);
    }
    /**
     * Releases the Jedis connections that were obtained.
     * Any commands still buffered in the pipelines are synced before the connections are returned.
     */
    public void releaseConnection() {
        pipelines.values().forEach(pipeline -> pipeline.sync()); // flush whatever has not reached MAX_COUNT yet
        jedisList.forEach(jedis -> jedis.close());
    }
    /**
     * Looks up the JedisPool for a key.
     * If the first lookup misses, refreshes the cached slot-to-pool mapping and tries again.
     * @param key
     * @return
     */
    private JedisPool getJedisPool(String key) {
        // compute the slot from the key
        int slot = JedisClusterCRC16.getSlot(key);
        // look up the connection pool that owns this slot
        JedisPool jedisPool = clusterInfoCache.getSlotPool(slot);
        if (null != jedisPool) {
            return jedisPool;
        } else {
            // refresh the cached slot-to-pool mapping
            connectionHandler.renewSlotCache();
            jedisPool = clusterInfoCache.getSlotPool(slot);
            if (jedisPool != null) {
                return jedisPool;
            } else {
                throw new JedisNoReachableClusterNodeException("No reachable node in cluster for slot " + slot);
            }
        }
    }
    /**
     * Returns the Pipeline for the node that owns the key's slot.
     * Pipelines are cached in pipelines so that each cluster node's pipeline is opened only once.
     * When a pipeline is opened for the first time, jedisList, pipelines and nums are updated accordingly.
     * @param key
     * @return
     */
    private Pipeline getPipeline(String key) {
        JedisPool jedisPool = getJedisPool(key);
        // check whether a pipeline has already been opened for this node
        Pipeline pipeline = pipelines.get(jedisPool);
        if (null == pipeline) {
            Jedis jedis = jedisPool.getResource();
            pipeline = jedis.pipelined();
            jedisList.add(jedis);
            pipelines.put(jedisPool, pipeline);
            nums.put(pipeline, 0);
        }
        return pipeline;
    }
    /**
     * Checks the command count for a pipeline and syncs it once the threshold is reached.
     * The count is reset to zero after each sync.
     * @param pipeline
     */
    private void maxSync(Pipeline pipeline) {
        Integer num = nums.get(pipeline);
        if (null != num) {
            if (num % MAX_COUNT == 0) {
                pipeline.sync();
                nums.put(pipeline, 0);
            }
        }
    }
}
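The class works the same way outside of Spark: build a JedisCluster, wrap it, push commands, and release at the end. A minimal sketch (the node address and the number of rows are placeholders):
import java.util.HashSet;
import java.util.Set;
import redis.clients.jedis.HostAndPort;
import redis.clients.jedis.JedisCluster;
import redis.clients.jedis.JedisPoolConfig;

public class JedisClusterPipelineDemo {
    public static void main(String[] args) {
        Set<HostAndPort> nodes = new HashSet<>();
        nodes.add(new HostAndPort("127.0.0.1", 7000)); // placeholder node address
        JedisCluster jedisCluster = new JedisCluster(nodes, new JedisPoolConfig());
        JedisClusterPipeline pipeline = new JedisClusterPipeline(jedisCluster);
        for (int i = 0; i < 100000; i++) {
            // each command goes to the pipeline of the node that owns the key's slot
            pipeline.hsetByPipeline("TEST:PERSON", "id-" + i, "name-" + i);
        }
        pipeline.releaseConnection(); // syncs what is left in the buffers and returns the connections
        jedisCluster.close();
    }
}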