1. Submitting Spark jobs via the YARN API
When writing Spark jobs day to day, most of them are submitted to YARN through a Spark cluster, or with the Spark cluster acting as the client. That conventional submission flow is not very practical for an online service. You could, of course, invoke the submit script from a Java API, but that approach feels clumsy. After some research, it turns out you can work against the Spark YARN API directly, which makes it easy to submit and manage compute jobs dynamically.
Step 1: package the Spark compute job as a jar and upload it to a specific path on HDFS;
Step 2: submit the Spark job to YARN and obtain the appId:
public String dslSubmit(Task task) throws CoException {
String paramValue = task.getParamValue();
String[] runArgs = new String[]{
"--class", "classPath",
"--jar", "jarPath",
"--arg", "-m",
"--arg", "yarn",
"--arg", "-d",
"--arg", paramValue
};
// Build the Spark conf
SparkConf sparkConf = engineConfigService.buildEngineConfigInSpark();
try {
// Build the run-argument object
ClientArguments cArgs = new ClientArguments(runArgs);
// Build the Client
Client client = new Client(cArgs, sparkConf, null);
// Submit the Spark job to YARN and get the appId; the appId can later be used to control the running job and query its status
ApplicationId applicationId = client.submitApplication();
return applicationId.toString();
} catch (Exception e) {
throw new CoException(ErrorCodeEnum.ENGINE_SUBMIT_EXCEPTION.getCode(), "application submit error");
}
}
Full submission source code:
import com.alibaba.fastjson.JSONArray;
import com.zdww.cd.ecs.easycomputer.bean.Instance;
import com.zdww.cd.ecs.easycomputer.bean.Task;
import com.zdww.cd.ecs.easycomputer.common.constant.HandleEngineTypeEnum;
import com.zdww.cd.ecs.easycomputer.constant.ComputerErrorCodeEnum;
import com.zdww.cd.ecs.easycomputer.constant.HandleStatusEnum;
import com.zdww.cd.ecs.easycomputer.exception.ComputerException;
import com.zdww.cd.ecs.easycomputer.service.engine.EngineConfigService;
import com.zdww.cd.ecs.easycomputer.service.engine.IEngineService;
import com.zdww.cd.ecs.easyengine.spi.util.DslFormatUtil;
import javax.annotation.Resource;
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.ApplicationReport;
import org.apache.hadoop.yarn.api.records.YarnApplicationState;
import org.apache.hadoop.yarn.client.api.YarnClient;
import org.apache.spark.SparkConf;
import org.apache.spark.deploy.yarn.Client;
import org.apache.spark.deploy.yarn.ClientArguments;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Component;
/**
* @author heshifei
* @date 2021/3/18 19:40
*/
@Component
public class SparkBatchEngineServiceImpl implements IEngineService {
private static final Logger LOGGER = LoggerFactory.getLogger(SparkBatchEngineServiceImpl.class);
private static final String SPARK_YARN_QUEUE_KEY = "spark.yarn.queue";
private static final String DEFAULT_SPARK_YARN_QUEUE_VALUE = "default";
@Resource
private EngineConfigService engineConfigService;
/**
* engine task submit
*
* @param task the task info
* @return appId of the submitted application
* @throws ComputerException exception
*/
@Override
public String dslSubmit(Task task, Instance instance) throws ComputerException {
// Chinese characters in the raw DSL get garbled, so encode the DSL first
// added by heshifei 2022-01-18
String dslValue = DslFormatUtil.encodeValue(task.getDsl());
String[] runArgs = new String[]{
"--class", engineConfigService.getEngineBatchMainClass(),
"--jar", engineConfigService.getEngineBatchResource(),
"--arg", "-m",
"--arg", "yarn",
"--arg", "-d",
"--arg", dslValue
};
("dslSubmit runArgs value:{}", JSONArray.toJSONString(runArgs));
SparkConf sparkConf = engineConfigService.buildEngineConfigInSpark();
sparkConf.set(SPARK_YARN_QUEUE_KEY, DEFAULT_SPARK_YARN_QUEUE_VALUE);
try {
ClientArguments cArgs = new ClientArguments(runArgs);
Client client = new Client(cArgs, sparkConf, null);
ApplicationId applicationId = client.submitApplication();
return applicationId.toString();
} catch (Exception e) {
LOGGER.error("dslSubmit Exception msg:{}", e.getMessage(), e);
throw new ComputerException(ComputerErrorCodeEnum.ENGINE_SUBMIT_EXCEPTION.getCode(),
"application submit error");
}
}
/**
* get status
*
* @param appId application id
* @return HandleStatusEnum the mapped handle status
* @throws ComputerException Exception
*/
@Override
public HandleStatusEnum getTaskStatus(String appId) throws ComputerException {
ApplicationId applicationId = ApplicationId.fromString(appId);
YarnApplicationState yarnApplicationState;
YarnClient yarnClient = null;
try {
yarnClient = YarnClient.createYarnClient();
// Initialize with the Hadoop configuration
yarnClient.init(engineConfigService.buildHadoopBaseConfig());
// Start the YarnClient connection
yarnClient.start();
ApplicationReport applicationReport = yarnClient.getApplicationReport(applicationId);
yarnApplicationState = applicationReport.getYarnApplicationState();
} catch (Exception e) {
LOGGER.error("getTaskStatusByYarn Exception msg:{}", e.getMessage(), e);
throw new ComputerException(ComputerErrorCodeEnum.ENGINE_ACCESS_EXCEPTION.getCode(),
"application status get");
} finally {
try {
if (yarnClient != null) {
yarnClient.close();
}
} catch (Exception e) {
LOGGER.error("getTaskStatusByYarn Exception", e);
}
}
return transDriverStateByYarn(yarnApplicationState);
}
/**
* Cancel a task
*
* @param appId application id
* @return boolean result of the cancel call
* @throws ComputerException Exception
*/
@Override
public Boolean taskCancel(String appId) throws ComputerException {
ApplicationId applicationId = ApplicationId.fromString(appId);
YarnClient yarnClient = null;
try {
yarnClient = YarnClient.createYarnClient();
// Initialize with the Hadoop configuration
yarnClient.init(engineConfigService.buildHadoopBaseConfig());
// Start the YarnClient connection
yarnClient.start();
yarnClient.killApplication(applicationId);
return Boolean.TRUE;
} catch (Exception e) {
LOGGER.error("taskCancelByYarn Exception msg:{}", e.getMessage(), e);
throw new ComputerException(ComputerErrorCodeEnum.ENGINE_ACCESS_EXCEPTION.getCode(),
"yarn kill application fail");
} finally {
try {
if (yarnClient != null) {
yarnClient.close();
}
} catch (Exception e) {
LOGGER.error("taskCancelByYarn Exception", e);
}
}
}
@Override
public Object getTaskResult(String appId) throws ComputerException {
return null;
}
/**
* Convert a YARN application state into a HandleStatusEnum
*
* @param yarnApplicationState the application's execution state on YARN
* @return HandleStatusEnum the mapped handle status
*/
private HandleStatusEnum transDriverStateByYarn(YarnApplicationState yarnApplicationState) {
HandleStatusEnum statusEnum = HandleStatusEnum.WAITING;
if (yarnApplicationState != null) {
switch (yarnApplicationState) {
case FINISHED:
statusEnum = HandleStatusEnum.SUCCESS;
break;
case RUNNING:
statusEnum = HandleStatusEnum.RUNNING;
break;
case NEW:
case NEW_SAVING:
case SUBMITTED:
case ACCEPTED:
statusEnum = HandleStatusEnum.WAITING;
break;
case KILLED:
statusEnum = HandleStatusEnum.CANCEL;
break;
default:
statusEnum = HandleStatusEnum.FAIL;
}
}
return statusEnum;
}
@Override
public HandleEngineTypeEnum getEngineType() {
return HandleEngineTypeEnum.SPARK_BATCH;
}
}
The EngineConfigService it depends on:
import java.io.File;
import java.io.FileInputStream;
import java.lang.invoke.MethodHandles;
import javax.annotation.PostConstruct;
import javax.annotation.Resource;
import org.I0Itec.zkclient.ZkClient;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.spark.SparkConf;
import org.apache.zookeeper.data.Stat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Component;
/**
* @author heshifei
* @date 2021/5/21 15:15
*/
@Component
public class EngineConfigService {
private static final Logger LOGGER = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
/**
* ZooKeeper client
*/
@Resource
public ZkClient zkClient;
/**
* master host
*/
@Value("${spark.master.host}")
public String sparkMasterHost;
/**
* Master REST service port
*/
@Value("${spark.master.rest.port}")
public int sparkMasterRestPort;
/**
* Spark version
*/
@Value("${spark.version}")
public String sparkVersion;
/**
* Batch jar resource path
*/
@Value("${engine.batch.resource}")
public String engineBatchResource;
/**
* Batch main class
*/
@Value("${engine.batch.main.class}")
public String engineBatchMainClass;
/**
* Driver jar resource path
*/
@Value("${engine.driver.resource}")
public String engineDriverResource;
/**
* Driver main class
*/
@Value("${engine.driver.main.class}")
public String engineDriverMainClass;
/**
* ZooKeeper address
*/
@Value("${engine.driver.zk.addr}")
private String engineDriverZkAddr;
/**
* Engine plugin resource template path
*/
@Value("${engine.plugin.resource.template}")
public String enginePluginResourceTemplate;
/**
* Plugin ML model storage path
*/
@Value("${plugin.ml.model.path}")
public String pluginMlModelPath;
/**
* Plugin PMML model storage path
*/
@Value("${plugin.ml.pmml.path}")
public String pluginMlPmmlPath;
@Value("${plugin.udf.path}")
public String pluginUdfPath;
/**
* An archive containing the required Spark jars, for distribution to the YARN cache
*/
@Value("${spark.yarn.archive}")
public String sparkYarnArchive;
/**
* List of libraries containing Spark code to distribute to YARN containers. This lets YARN cache them on the nodes so they do not have to be distributed on every application run.
*/
@Value("${spark.yarn.jars}")
public String sparkYarnJars;
/**
* Staging directory used when submitting applications.
*/
@Value("${spark.yarn.stagingDir}")
public String sparkYarnStagingDir;
@Value("${spark.files}")
public String sparkFiles;
@Value("${hive.exec.scratchdir}")
public String hiveExecScratchdir;
@Value("${hadoop.security.user}")
public String hadoopSecurityUser;
@Value("${hadoop.security.user.keytab}")
public String hadoopSecurityUserKeytab;
@Value("${hadoop.security.krb5.conf}")
public String hadoopSecurityKrb5Conf;
@Value("${desensitize.control.flag}")
public String desensitizeControl;
@Value("${ad.hoc.server.start}")
public String adHocServerStart;
@Value("${ad.hoc.queue}")
public String adHocQueue;
@Value("${ad.hoc.limit}")
public String adHocLimit;
@Value("${hadoop.conf.files}")
public String hadoopConfFiles;
@Value("${yarn.ha.flag}")
public String yarnHaFlag;
@Value("${yarn.active.node.path}")
public String yarnActiveNodePath;
@Value("${yarn.active.node.pre}")
public String yarnActiveNodePre;
private Configuration configuration = new Configuration();
public String getEngineBatchResource() {
return engineBatchResource;
}
public String getEngineBatchMainClass() {
return engineBatchMainClass;
}
public String getEngineDriverMainClass() {
return engineDriverMainClass;
}
public String getEngineDriverResource() {
return engineDriverResource;
}
public String getEngineDriverZkAddr() {
return engineDriverZkAddr;
}
public String getHadoopSecurityUser() {
return hadoopSecurityUser;
}
public String getHadoopSecurityUserKeytab() {
return hadoopSecurityUserKeytab;
}
public Boolean isDesensitize() {
if ("true".equalsIgnoreCase(desensitizeControl)) {
return Boolean.TRUE;
}
return Boolean.FALSE;
}
public Boolean isAdHoc() {
if ("true".equalsIgnoreCase(adHocServerStart)) {
return Boolean.TRUE;
}
return Boolean.FALSE;
}
public String getAdHocQueue() {
return adHocQueue;
}
public int getAdHocLimit() {
int limit = 100;
if (StringUtils.isEmpty(adHocLimit)) {
return limit;
}
try {
limit = Integer.parseInt(adHocLimit);
} catch (Exception e) {
LOGGER.error(e.getMessage(), e);
}
return limit;
}
public String getYarnClusterApiAddr() {
// hostName
String hostNameKey = "yarn.resourcemanager.hostname";
String activeHost = configuration.get(hostNameKey, "");
String activeNode = getActiveNode();
if (StringUtils.isNotEmpty(activeNode)) {
hostNameKey = hostNameKey + "." + activeNode;
activeHost = configuration.get(hostNameKey, "");
}
String resourceManagerAddressKey = "yarn.resourcemanager.webapp.address";
return buildYarnResources(configuration, activeNode, activeHost, resourceManagerAddressKey);
}
public String getFsDefaultFS() {
return configuration.get("fs.defaultFS");
}
@PostConstruct
public void init() {
Configuration config = new Configuration();
try {
String[] confFiles = hadoopConfFiles.split(",|;");
for (String confFile : confFiles) {
config.addResource(new FileInputStream(new File(confFile)));
}
} catch (Exception e) {
LOGGER.error("init hadoop conf msg:{}", e.getMessage(), e);
}
configuration = config;
}
/**
* Build the Spark config
*
* @return sparkConf
*/
public SparkConf buildEngineConfigInSpark() {
Configuration conf = buildHadoopBaseConfig();
SparkConf sparkConf = new SparkConf();
sparkConf.set("spark.yarn.scheduler.heartbeat.interval-ms", "1000");
sparkConf.set("spark.submit.deployMode", "cluster");
sparkConf.set("spark.yarn.preserve.staging.files", "false");
sparkConf.set("mapreduce.app-submission.cross-platform", "true");
sparkConf.set("spark.yarn.isHadoopProvided", "true");
sparkConf.set("spark.yarn.archive", sparkYarnArchive);
sparkConf.set("spark.yarn.jars", sparkYarnJars);
sparkConf.set("spark.yarn.stagingDir", sparkYarnStagingDir);
sparkConf.set("spark.files", sparkFiles);
sparkConf.set("spark.hadoop.mapreduce.framework.name", "yarn");
sparkConf.set("fs.hdfs.impl", "org.apache.hadoop.hdfs.DistributedFileSystem");
if (StringUtils.isNotEmpty(hiveExecScratchdir)) {
sparkConf.set("hive.exec.scratchdir", hiveExecScratchdir);
}
buildYarnResourceInfo(sparkConf, conf);
// Apply Kerberos settings if security authentication is required
if (securityFlag()) {
sparkConf.set("spark.kerberos.keytab", hadoopSecurityUserKeytab);
sparkConf.set("spark.kerberos.principal", hadoopSecurityUser);
}
return sparkConf;
}
private void buildYarnResourceInfo(SparkConf sparkConf, Configuration config) {
String activeNode = getActiveNode();
sparkConf.set("fs.defaultFS", config.get("fs.defaultFS"));
// hostName
String hostNameKey = "yarn.resourcemanager.hostname";
String activeHost = config.get(hostNameKey, "");
if (StringUtils.isNotEmpty(activeNode)) {
hostNameKey = hostNameKey + "." + activeNode;
activeHost = config.get(hostNameKey, "");
}
// resourceManager.address
String resourceManagerAddressKey = "yarn.resourcemanager.address";
String resourceManagerAddress = buildYarnResources(config, activeNode, activeHost,
resourceManagerAddressKey);
// resourceManager.address scheduler
String resourceSchedulerAddressKey = "yarn.resourcemanager.scheduler.address";
String resourceSchedulerAddress = buildYarnResources(config, activeNode, activeHost,
resourceSchedulerAddressKey);
sparkConf.set("spark.hadoop.yarn.resourcemanager.hostname", activeHost);
sparkConf.set("spark.hadoop.yarn.resourcemanager.address",
resourceManagerAddress);
sparkConf
.set("spark.hadoop.yarn.resourcemanager.scheduler.address",
resourceSchedulerAddress);
}
private String buildYarnResources(Configuration config, String activeNode, String activeHost,
String resourceManagerAddressKey) {
String resourceSchedulerAddress = config.get(resourceManagerAddressKey, "");
if (StringUtils.isNotEmpty(resourceSchedulerAddress)) {
String[] rma = resourceSchedulerAddress.split(",|;");
for (String rmAddress : rma) {
if (rmAddress.contains(activeHost)) {
resourceSchedulerAddress = rmAddress;
break;
}
}
} else {
resourceSchedulerAddress = config.get(resourceManagerAddressKey + "." + activeNode, "");
}
return resourceSchedulerAddress;
}
/**
* Build the Hadoop config
*
* @return Conf
*/
public Configuration buildHadoopBaseConfig() {
Configuration conf = configuration;
try {
// Perform Kerberos login if security authentication is required
if (securityFlag()) {
System.setProperty("HADOOP_USER_NAME", hadoopSecurityUser.split("//")[0]);
System.setProperty("java.security.krb5.conf", hadoopSecurityKrb5Conf);
conf.set("hadoop.security.authentication", "Kerberos");
UserGroupInformation.setConfiguration(conf);
UserGroupInformation
.loginUserFromKeytab(hadoopSecurityUser, hadoopSecurityUserKeytab);
}
} catch (Exception e) {
LOGGER.error("buildHadoopBaseConfig file system error:{}", e.getMessage());
}
return conf;
}
/**
* Check whether security (Kerberos) authentication is required
*
* @return true if security authentication is required, false otherwise
*/
public boolean securityFlag() {
return StringUtils.isNotEmpty(getHadoopSecurityUser()) && StringUtils
.isNotEmpty(getHadoopSecurityUserKeytab());
}
public String getActiveNode() {
String node = "";
if (StringUtils.isNotEmpty(yarnHaFlag) && "true".equals(yarnHaFlag)) {
Pair<String, Stat> nodeInfo = ZkUtil.readDataMaybeNull(zkClient, yarnActiveNodePath);
String info = nodeInfo.getKey();
if (StringUtils.isNotEmpty(info)) {
// The value contains two special characters in the middle, e.g. ah3-yarnrm2 [DC2ETX]
node = info
.substring(info.indexOf(yarnActiveNodePre) + yarnActiveNodePre.length() + 2);
}
}
return node;
}
}
Step 3: use the appId obtained in step 2 to manage the running job: control it, query its running state, and so on.
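To make the lifecycle concrete, here is a minimal, hypothetical caller sketch built only on the interfaces shown above (the class name TaskLifecycleExample, the runAndTrack method and the 5-second polling interval are illustrative, not part of the project):

import javax.annotation.Resource;
import org.springframework.stereotype.Component;

@Component
public class TaskLifecycleExample {

    @Resource
    private IEngineService sparkBatchEngineService;

    /**
     * Hypothetical caller: submit, poll, and (optionally) cancel through the engine service.
     */
    public void runAndTrack(Task task, Instance instance) throws ComputerException, InterruptedException {
        // Step 2: submit and keep the returned appId
        String appId = sparkBatchEngineService.dslSubmit(task, instance);
        // Step 3: poll the status until the job leaves the WAITING/RUNNING states
        HandleStatusEnum status = sparkBatchEngineService.getTaskStatus(appId);
        while (status == HandleStatusEnum.WAITING || status == HandleStatusEnum.RUNNING) {
            Thread.sleep(5000L);
            status = sparkBatchEngineService.getTaskStatus(appId);
        }
        // Or kill the YARN application explicitly instead of waiting:
        // sparkBatchEngineService.taskCancel(appId);
    }
}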
2. Spark libs/conf
First of all, YARN and a Spark cluster have no inherent coupling: YARN is a general-purpose resource manager that is not tied to any particular engine. A Spark program running inside YARN therefore depends only on its libs package and its conf information.
Concretely, on three files: __app__.jar, __spark_libs__ and __spark_conf__. Each run packages and uploads them before the runtime environment is initialized. To speed this up, the libs can be packaged and uploaded to HDFS once, and their location can then be set through configuration keys:
spark.yarn.archive -> source/__spark_libs__.zip // zip archive of the Spark libs
spark.yarn.jars -> source/jars/*
In the Spark YARN client, the conf is generated dynamically on each submission, so the runtime configuration has to be set dynamically for every task execution.
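A minimal sketch of those settings in code, mirroring what buildEngineConfigInSpark does above (the HDFS paths are placeholders; note that spark.yarn.archive takes precedence over spark.yarn.jars when both are set):

import org.apache.spark.SparkConf;

public class SparkYarnLibsExample {

    /**
     * Hypothetical helper: point the submission at Spark libs pre-uploaded to HDFS
     * so they are not re-packaged and re-uploaded on every submit.
     */
    public static SparkConf buildLibsConf() {
        SparkConf sparkConf = new SparkConf();
        // Zip archive of the Spark libs, distributed via the YARN cache
        sparkConf.set("spark.yarn.archive", "hdfs:///source/__spark_libs__.zip");
        // Or list the individual Spark jars already cached on HDFS
        sparkConf.set("spark.yarn.jars", "hdfs:///source/jars/*");
        // Staging directory used while submitting the application
        sparkConf.set("spark.yarn.stagingDir", "hdfs:///user/spark/staging");
        return sparkConf;
    }
}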
3. The 0.0.0.0:8030 problem when submitting Spark jobs
While providing an online service that submits Spark jobs through the Spark YARN API, we kept failing to connect to the YARN ResourceManager; even after digging through quite a lot of material online, the attempts to reach 0.0.0.0:8030 persisted.
It was finally solved by setting the YARN ResourceManager host manually. The complete set of keys is:
spark.hadoop.yarn.resourcemanager.hostname
spark.hadoop.yarn.resourcemanager.address
spark.hadoop.yarn.resourcemanager.scheduler.address
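For example, setting them explicitly on the SparkConf (host and ports below are placeholders; buildYarnResourceInfo above derives the real values from yarn-site.xml, and from ZooKeeper when YARN HA is enabled):

import org.apache.spark.SparkConf;

public class YarnRmAddressExample {

    /**
     * Hypothetical fix for the 0.0.0.0:8030 symptom: set the ResourceManager
     * addresses explicitly instead of relying on the default 0.0.0.0 fallback.
     */
    public static void setResourceManager(SparkConf sparkConf) {
        sparkConf.set("spark.hadoop.yarn.resourcemanager.hostname", "rm-host.example.com");
        sparkConf.set("spark.hadoop.yarn.resourcemanager.address", "rm-host.example.com:8032");
        sparkConf.set("spark.hadoop.yarn.resourcemanager.scheduler.address", "rm-host.example.com:8030");
    }
}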
4. Hadoop HA causing "UnknownHostException" when submitting Spark jobs
HA belongs to the Hadoop system; outside of it, the HA nameservice cannot be resolved, so the corresponding configuration has to be loaded into the submission environment.
The fix is as follows:
Add the following to the SparkConf object:
sparkConf.set("spark.files", "hdfs:///conf/hdfs-site.xml,hdfs:///conf/core-site.xml");
5. Kerberos authentication for resource management in Spark YARN API mode
The official docs say Kerberos authentication can be enabled by adding two settings: spark.kerberos.keytab and spark.kerberos.principal.
In our project, however, permissions are partitioned quite finely, and this alone was still not enough.
The final solution was to add yarn-site.xml, hdfs-site.xml and core-site.xml to the resources directory of the service project.
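A minimal sketch of both pieces combined, assuming the principal and keytab path are placeholders for your environment and that yarn-site.xml, hdfs-site.xml and core-site.xml sit under resources so the Hadoop client configuration classes can pick them up from the classpath:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.spark.SparkConf;

public class KerberosSubmitExample {

    /**
     * Hypothetical setup: log the submitting process in from a keytab and pass the
     * same credentials on to the Spark YARN client. Principal and keytab path are placeholders.
     */
    public static SparkConf buildKerberosConf() throws Exception {
        String principal = "spark-user@EXAMPLE.COM";
        String keytab = "/etc/security/keytabs/spark-user.keytab";

        // Kerberos login for the service process itself
        Configuration conf = new Configuration();
        conf.set("hadoop.security.authentication", "Kerberos");
        UserGroupInformation.setConfiguration(conf);
        UserGroupInformation.loginUserFromKeytab(principal, keytab);

        // Settings the official docs recommend for the submitted application
        SparkConf sparkConf = new SparkConf();
        sparkConf.set("spark.kerberos.keytab", keytab);
        sparkConf.set("spark.kerberos.principal", principal);
        return sparkConf;
    }
}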