Background
In a dual-data-center deployment, disaster recovery drills are needed to verify the system's failover capability. The application services run in a Kubernetes cluster, and setting each workload's replica count to 0 through the web console to take one center offline is time-consuming. This article shows how to handle the replica counts of one center's services gracefully with a script, and how to restore them to their original values after the drill.
Test environment:
A Kubernetes 1.26.3 cluster built with kind, with two worker nodes. The exact setup does not matter; any Kubernetes environment will do:
kubectl get nodes
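In case it helps with reproducing the setup, a minimal kind invocation could look like the following sketch (the node image tag is an assumption chosen to match the 1.26.x version mentioned above):

# Two worker nodes plus one control-plane node
cat <<EOF > kind-config.yaml
kind: Cluster
apiVersion: kind.x-k8s.io/v1alpha4
nodes:
- role: control-plane
- role: worker
- role: worker
EOF
kind create cluster --image kindest/node:v1.26.3 --config kind-config.yaml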
A test namespace named develop was created:
kubectl get ns
Three Deployments and three StatefulSets were created:
root@ubuntu1:~/now# kubectl get deployments -n develop
NAME                READY   UP-TO-DATE   AVAILABLE   AGE
nginx2-deployment   1/1     1            1           23m
nginx4-deployment   3/3     3            3           28m
nginx6-deployment   0/0     0            0           25m
root@ubuntu1:~/now# kubectl get statefulsets -n develop
NAME              READY   AGE
nginx1-stateful   1/1     24m
nginx3-stateful   3/3     27m
nginx5-stateful   0/0     26m
root@ubuntu1:~/now# kubectl get pods -n develop
NAME                                 READY   STATUS    RESTARTS   AGE
nginx1-stateful-0                    1/1     Running   0          7m20s
nginx2-deployment-766f555bb5-j6xh6   1/1     Running   0          7m20s
nginx3-stateful-0                    1/1     Running   0          7m20s
nginx3-stateful-1                    1/1     Running   0          7m18s
nginx3-stateful-2                    1/1     Running   0          7m17s
nginx4-deployment-796d46cfbf-7zqkr   1/1     Running   0          7m20s
nginx4-deployment-796d46cfbf-d92zz   1/1     Running   0          7m20s
nginx4-deployment-796d46cfbf-hpj2h   1/1     Running   0          7m20s
Some of the Deployments and StatefulSets were deliberately left at 0 replicas, because my production environment has workloads like that and I want the test to mimic production as closely as possible.
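For reference, the Deployment half of this setup can be reproduced with plain kubectl. The names are the ones listed above, while the nginx image is an assumption; the StatefulSets need manifests (omitted here), since kubectl has no imperative create command for them:

kubectl create namespace develop
kubectl create deployment nginx2-deployment --image=nginx --replicas=1 -n develop
kubectl create deployment nginx4-deployment --image=nginx --replicas=3 -n develop
kubectl create deployment nginx6-deployment --image=nginx --replicas=1 -n develop
# Produce the deliberately empty workload
kubectl scale deployment nginx6-deployment -n develop --replicas=0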
What I want to achieve:
- With one argument, set the replica count of every workload in the develop namespace to 0.
- With one argument, restore every workload in the develop namespace to its original replica count.
- With an extra argument, exclude specific workloads (Deployments or StatefulSets) so their replica counts are not changed.
Implementation
Option 1: kubectl plus a backup file
This option saves the original replica counts to a backup file, scales everything down to 0, and later restores from that file. The script:
#!/bin/bash
# Arguments
namespace=$1   # namespace to operate on
action=$2      # action: scale / restore
exclude=$3     # services to exclude, comma-separated

# Argument check
if [ -z "$namespace" ] || [ -z "$action" ]; then
    echo "Usage: $0 <namespace> <scale|restore> [exclude_services]"
    echo "non-zero"
    exit 1
fi

# Backup file path
config_file="/tmp/replicas_backup.txt"

# Scale-down operation
scale_down() {
    # Get all Deployments and StatefulSets
    resources=$(kubectl get deploy,sts -n $namespace -o name)
    # Truncate the backup file
    > $config_file
    for resource in $resources; do
        name=$(echo $resource | cut -d/ -f2)
        # Check the exclusion list
        if [ -n "$exclude" ]; then
            echo "Checking service: $name"
            # Turn the exclusion list into an array
            IFS=',' read -ra EXCLUDE_ARRAY <<< "$exclude"
            for excluded in "${EXCLUDE_ARRAY[@]}"; do
                if [ "$name" = "$excluded" ]; then
                    echo "Skipping excluded service: $name"
                    continue 2
                fi
            done
        fi
        # Save the current replica count
        replicas=$(kubectl get $resource -n $namespace -o jsonpath='{.spec.replicas}')
        echo "$resource:$replicas" >> $config_file
        # Scale down to 0
        echo "Scaling down service: $name"
        kubectl scale $resource -n $namespace --replicas=0
    done
}

# Restore operation
restore() {
    if [ ! -f $config_file ]; then
        echo "Backup file not found!"
        echo "non-zero"
        exit 1
    fi
    while IFS=: read -r resource replicas
    do
        echo "Restoring service: $resource to $replicas replicas"
        kubectl scale $resource -n $namespace --replicas=$replicas
    done < $config_file
}

# Main flow
case $action in
    scale)
        scale_down
        ;;
    restore)
        restore
        ;;
    *)
        echo "Invalid action: $action"
        echo "non-zero"
        exit 1
        ;;
esac
echo "0"
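After a scale-down run, the backup file holds one resource:replicas line per workload. With the test environment above it would look roughly like this (illustrative, derived from the kubectl get ... -o name format the script uses):

deployment.apps/nginx2-deployment:1
deployment.apps/nginx4-deployment:3
deployment.apps/nginx6-deployment:0
statefulset.apps/nginx1-stateful:1
statefulset.apps/nginx3-stateful:3
statefulset.apps/nginx5-stateful:0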
Option 2: a Go program
This option talks to the API server through client-go and, instead of a local file, records each workload's original replica count in an original-replicas annotation on the object itself.
package main

import (
    "context"
    "flag"
    "fmt"
    "os"
    "strings"

    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/client-go/kubernetes"
    "k8s.io/client-go/tools/clientcmd"
)

func scaleDown(clientset *kubernetes.Clientset, namespace string, excludeList []string) {
    // Deployments
    deployments, err := clientset.AppsV1().Deployments(namespace).List(context.TODO(), metav1.ListOptions{})
    if err != nil {
        fmt.Printf("Error listing deployments: %v\n", err)
        fmt.Println("non-zero")
        os.Exit(1)
    }
    for _, deployment := range deployments.Items {
        excluded := false
        for _, excludeName := range excludeList {
            if excludeName != "" && deployment.Name == excludeName {
                excluded = true
                break
            }
        }
        if !excluded {
            // Remember the original replica count in an annotation
            annotations := deployment.Annotations
            if annotations == nil {
                annotations = make(map[string]string)
            }
            annotations["original-replicas"] = fmt.Sprintf("%d", *deployment.Spec.Replicas)
            deployment.Annotations = annotations
            var replicas int32 = 0
            deployment.Spec.Replicas = &replicas
            _, err = clientset.AppsV1().Deployments(namespace).Update(context.TODO(), &deployment, metav1.UpdateOptions{})
            if err != nil {
                fmt.Printf("Error scaling down deployment %s: %v\n", deployment.Name, err)
                fmt.Println("non-zero")
                os.Exit(1)
            }
        }
    }
    // StatefulSets
    statefulsets, err := clientset.AppsV1().StatefulSets(namespace).List(context.TODO(), metav1.ListOptions{})
    if err != nil {
        fmt.Printf("Error listing statefulsets: %v\n", err)
        fmt.Println("non-zero")
        os.Exit(1)
    }
    for _, statefulset := range statefulsets.Items {
        excluded := false
        for _, excludeName := range excludeList {
            if excludeName != "" && statefulset.Name == excludeName {
                excluded = true
                break
            }
        }
        if !excluded {
            annotations := statefulset.Annotations
            if annotations == nil {
                annotations = make(map[string]string)
            }
            annotations["original-replicas"] = fmt.Sprintf("%d", *statefulset.Spec.Replicas)
            statefulset.Annotations = annotations
            var replicas int32 = 0
            statefulset.Spec.Replicas = &replicas
            _, err = clientset.AppsV1().StatefulSets(namespace).Update(context.TODO(), &statefulset, metav1.UpdateOptions{})
            if err != nil {
                fmt.Printf("Error scaling down statefulset %s: %v\n", statefulset.Name, err)
                fmt.Println("non-zero")
                os.Exit(1)
            }
        }
    }
}

func restore(clientset *kubernetes.Clientset, namespace string) {
    // Restore Deployments
    deployments, err := clientset.AppsV1().Deployments(namespace).List(context.TODO(), metav1.ListOptions{})
    if err != nil {
        fmt.Printf("Error listing deployments: %v\n", err)
        fmt.Println("non-zero")
        os.Exit(1)
    }
    for _, deployment := range deployments.Items {
        if originalReplicas, exists := deployment.Annotations["original-replicas"]; exists {
            var replicas int32
            fmt.Sscanf(originalReplicas, "%d", &replicas)
            deployment.Spec.Replicas = &replicas
            _, err = clientset.AppsV1().Deployments(namespace).Update(context.TODO(), &deployment, metav1.UpdateOptions{})
            if err != nil {
                fmt.Printf("Error restoring deployment %s: %v\n", deployment.Name, err)
                fmt.Println("non-zero")
                os.Exit(1)
            }
        }
    }
    // Restore StatefulSets
    statefulsets, err := clientset.AppsV1().StatefulSets(namespace).List(context.TODO(), metav1.ListOptions{})
    if err != nil {
        fmt.Printf("Error listing statefulsets: %v\n", err)
        fmt.Println("non-zero")
        os.Exit(1)
    }
    for _, statefulset := range statefulsets.Items {
        if originalReplicas, exists := statefulset.Annotations["original-replicas"]; exists {
            var replicas int32
            fmt.Sscanf(originalReplicas, "%d", &replicas)
            statefulset.Spec.Replicas = &replicas
            _, err = clientset.AppsV1().StatefulSets(namespace).Update(context.TODO(), &statefulset, metav1.UpdateOptions{})
            if err != nil {
                fmt.Printf("Error restoring statefulset %s: %v\n", statefulset.Name, err)
                fmt.Println("non-zero")
                os.Exit(1)
            }
        }
    }
}

func main() {
    namespace := flag.String("namespace", "", "kubernetes namespace")
    action := flag.String("action", "", "scale or restore")
    exclude := flag.String("exclude", "", "excluded services")
    flag.Parse()
    if *namespace == "" || *action == "" {
        fmt.Println("Missing required parameters")
        fmt.Println("non-zero")
        os.Exit(1)
    }
    config, err := clientcmd.BuildConfigFromFlags("", os.Getenv("KUBECONFIG"))
    if err != nil {
        fmt.Printf("Error building config: %v\n", err)
        fmt.Println("non-zero")
        os.Exit(1)
    }
    clientset, err := kubernetes.NewForConfig(config)
    if err != nil {
        fmt.Printf("Error creating clientset: %v\n", err)
        fmt.Println("non-zero")
        os.Exit(1)
    }
    excludeList := strings.Split(*exclude, ",")
    switch *action {
    case "scale":
        scaleDown(clientset, *namespace, excludeList)
    case "restore":
        restore(clientset, *namespace)
    default:
        fmt.Println("Invalid action")
        fmt.Println("non-zero")
        os.Exit(1)
    }
    fmt.Println("0")
}
Usage
Option 1
# Scale down
./k8s.sh develop scale
# Restore
./k8s.sh develop restore
To test excluding specific workloads:
./k8s.sh develop scale "nginx3-stateful,nginx4-deployment"
# Restore
./k8s.sh develop restore
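Between the scale and restore steps, the result can be sanity-checked: the excluded workloads should keep their replicas, everything else should be at 0, and the excluded workloads should not appear in the backup file (since the script skips them before writing it):

kubectl get deploy,sts -n develop
cat /tmp/replicas_backup.txt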
Option 2
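The program needs client-go and its dependencies to be resolvable before it will build. Starting from an empty directory, something like the following works; the v0.26.3 pin is an assumption chosen to match the 1.26.x cluster, and behind a firewall a module proxy (for example GOPROXY=https://goproxy.cn) may also be needed:

go mod init k8s-scaler
go get k8s.io/client-go@v0.26.3
go mod tidy

The program can then be compiled: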
go build -o k8s-scaler main.go
Point KUBECONFIG at the kubeconfig file:
export KUBECONFIG=/root/.kube/config
Scale down all workloads in the namespace, then restore:
# Scale down
./k8s-scaler -namespace=develop -action=scale
# Restore
./k8s-scaler -namespace=develop -action=restore
# Scale down, excluding specific workloads
./k8s-scaler -namespace=develop -action=scale -exclude=nginx3-stateful,nginx4-deployment
# Restore
./k8s-scaler -namespace=develop -action=restore
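Because this option stores the original count in an original-replicas annotation rather than a file, the backup can be inspected on the object itself; for example, for one of the test Deployments above:

kubectl get deployment nginx4-deployment -n develop -o jsonpath='{.metadata.annotations.original-replicas}'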
Comparison
| Option | Pros | Cons |
| --- | --- | --- |
| Shell script | Simple to implement, few dependencies | Relatively basic error handling |
| Go program | More thorough error handling, good performance | Needs compilation and more dependencies, plus Go version and module proxy issues |
Notes
- Make sure you have sufficient permissions before running.
- Validate in a test environment first.
- Keep the backup file safe.
- Consider adding logging.
- Consider adding timeout control, as sketched below.
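For the shell version, a minimal way to add timeouts is kubectl's global --request-timeout flag, or wrapping the whole run in the coreutils timeout command; the 30s/60s values below are arbitrary assumptions:

# Abort an individual API call that hangs for more than 30 seconds
kubectl scale deployment/nginx4-deployment -n develop --replicas=0 --request-timeout=30s

# Or bound the whole scale-down run to 60 seconds
timeout 60 ./k8s.sh develop scale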
Summary
This article presented two implementations, and both meet the needs of a disaster recovery drill. The shell script is simple and easy to use, while the Go program is more robust. Pick whichever fits your situation.
I hope this article helps with your own disaster recovery drills!