最近公司机房新上了ssd存储盘,赶紧申请使用,已经无法忍受etcd超时打不开web页面的现象了。更换成ssd,瞬间感受丝滑的酸爽。记录下平台迁移的过程。

要做平台迁移,前提是新旧机器的网络要互通。当前kubernetes版本为1.16.3,版本比较旧了,回头再升级。

一、rancher管理平台

    

                      旧平台

ip

CPU

内存(G)

磁盘

172.26.179.146(master)

4

8

40G(ceph磁盘)

172.26.179.147

4

8

40G(ceph磁盘)

172.26.179.148

4

8

40G(ceph磁盘)

                     新平台

ip

CPU

内存(G)

磁盘

172.25.149.111(master)

4

8

40G(SSD磁盘)

172.25.149.112

4

8

40G(SSD磁盘)

172.25.149.113

4

8

40G(SSD磁盘)

二、新环境准备(在三台新机同时执行)

1.性能优化

echo "
net.ipv4.tcp_tw_reuse = 1
net.ipv4.tcp_tw_recycle = 0
net.bridge.bridge-nf-call-ip6tables=1
net.bridge.bridge-nf-call-iptables=1
net.ipv4.ip_forward=1
net.ipv4.conf.all.forwarding=1
net.ipv4.neigh.default.gc_thresh1=4096
net.ipv4.neigh.default.gc_thresh2=6144
net.ipv4.neigh.default.gc_thresh3=8192
net.ipv4.neigh.default.gc_interval=60
net.ipv4.neigh.default.gc_stale_time=120

# 参考 https://github.com/prometheus/node_exporter#disabled-by-default
kernel.perf_event_paranoid=-1

#sysctls for k8s node config
net.ipv4.tcp_slow_start_after_idle=0
net.core.rmem_max=16777216
fs.inotify.max_user_watches=524288
kernel.softlockup_all_cpu_backtrace=1
kernel.softlockup_panic=1
fs.file-max=2097152
fs.inotify.max_user_instances=8192
fs.inotify.max_queued_events=16384
vm.max_map_count=262144
fs.may_detach_mounts=1
net.core.netdev_max_backlog=16384
net.ipv4.tcp_wmem=4096 12582912 16777216
net.core.wmem_max=16777216
net.core.somaxconn=32768
net.ipv4.ip_forward=1
net.ipv4.tcp_max_syn_backlog=8096
net.ipv4.tcp_rmem=4096 12582912 16777216

net.ipv6.conf.all.disable_ipv6=1
net.ipv6.conf.default.disable_ipv6=1
net.ipv6.conf.lo.disable_ipv6=1

kernel.yama.ptrace_scope=0
vm.swappiness=0

# 可以控制core文件的文件名中是否添加pid作为扩展。
kernel.core_uses_pid=1

# Do not accept source routing
net.ipv4.conf.default.accept_source_route=0
net.ipv4.conf.all.accept_source_route=0

# Promote secondary addresses when the primary address is removed
net.ipv4.conf.default.promote_secondaries=1
net.ipv4.conf.all.promote_secondaries=1

# Enable hard and soft link protection
fs.protected_hardlinks=1
fs.protected_symlinks=1

# 源路由验证
# see details in https://help.aliyun.com/knowledge_detail/39428.html
net.ipv4.conf.all.rp_filter=0
net.ipv4.conf.default.rp_filter=0
net.ipv4.conf.default.arp_announce = 2
net.ipv4.conf.lo.arp_announce=2
net.ipv4.conf.all.arp_announce=2

# see details in https://help.aliyun.com/knowledge_detail/41334.html
net.ipv4.tcp_max_tw_buckets=5000
net.ipv4.tcp_syncookies=1
net.ipv4.tcp_fin_timeout=30
net.ipv4.tcp_synack_retries=2
kernel.sysrq=1

" >> /etc/sysctl.conf
sysctl -p
cat >> /etc/security/limits.conf <<EOF
* soft nofile 65535
* hard nofile 65536
EOF

2.安装docker

yum install -y yum-utils device-mapper-persistent-data lvm2
yum-config-manager --add-repo http://mirrors.aliyun.com/docker-ce/linux/centos/docker-ce.repo
yum install -y docker-ce-20.10.7-3.el7.x86_64 docker-ce-cli-20.10.7-3.el7.x86_64

启动docker

systemctl start docker
systemctl enable docker
systemctl status docker

优化docker的配置

cat > /etc/docker/daemon.json <<EOF
{
    "oom-score-adjust": -1000,
    "log-driver": "json-file",
    "log-opts": {
    "max-size": "500m",
    "max-file": "3"
    },
    "registry-mirrors": ["https://7bezldxe.mirror.aliyuncs.com"]
}
EOF

重新启动docker

systemctl restart docker

3.时区优化

时间同步
timedatectl status
timedatectl set-timezone Asia/Shanghai
timedatectl set-ntp yes
date

4.创建 centos账户

#创建centos用户
useradd centos
#配置密码
echo "新密码" | passwd --stdin centos
#将centos用户加入docker用户组
usermod -aG docker centos

5.配置私有镜像仓库

根据个人情况配置

6.配置hosts及免密登录

echo "172.25.149.122    k8s.example.com" >> /etc/hosts
# 从172.26.179.146免密登录3台新机
在172.26.179.146上执行 
ssh-copy-id centos@172.25.149.111
ssh-copy-id centos@172.25.149.112
ssh-copy-id centos@172.25.149.113
 根据提示输入密码即可完成免密配置
# 从172.25.149.111免密登录6台机器(3台旧机+3台新机)
在172.25.149.111上执行
ssh-keygen -t rsa #全部回车会生成密钥
ssh-copy-id centos@172.26.179.146 
ssh-copy-id centos@172.26.179.147
ssh-copy-id centos@172.26.179.148
ssh-copy-id centos@172.25.149.111
ssh-copy-id centos@172.25.149.112
ssh-copy-id centos@172.25.149.113
 根据提示输入密码即可完成免密配置

三、在线热迁移(172.26.179.146上执行)

1.修改cluster.yml 

将原来的配置文件修改为

nodes:
  - address: 172.26.179.146
    user: centos
    role: [controlplane,worker,etcd]
  - address: 172.26.179.147
    user: centos
    role: [controlplane,worker,etcd]
  - address: 172.26.179.148
    user: centos
    role: [controlplane,worker,etcd]
  - address: 172.25.149.111
    user: centos
    role: [controlplane,worker,etcd]
  - address: 172.25.149.112
    user: centos
    role: [controlplane,worker,etcd]
  - address: 172.25.149.113
    user: centos
    role: [controlplane,worker,etcd]
services:
  etcd:
    snapshot: true
    creation: 6h
    retention: 24h
private_registries:
- url: 10.15.128.38
  user: admin
  password: Abc123@#!ddd
  is_default: true
#rke up --config ./cluster.yml   
#根据网络情况而定耗时 ,最后成功时会看到如下字样
INFO[0339] [sync] Successfully synced nodes Labels and Taints 
INFO[0339] [network] Setting up network plugin: canal   
INFO[0339] [addons] Saving ConfigMap for addon rke-network-plugin to Kubernetes 
INFO[0339] [addons] Successfully saved ConfigMap for addon rke-network-plugin to Kubernetes 
INFO[0339] [addons] Executing deploy job rke-network-plugin 
INFO[0339] [addons] Setting up coredns                  
INFO[0339] [addons] Saving ConfigMap for addon rke-coredns-addon to Kubernetes 
INFO[0339] [addons] Successfully saved ConfigMap for addon rke-coredns-addon to Kubernetes 
INFO[0339] [addons] Executing deploy job rke-coredns-addon 
INFO[0339] [addons] CoreDNS deployed successfully..     
INFO[0339] [dns] DNS provider coredns deployed successfully 
INFO[0339] [addons] Setting up Metrics Server           
INFO[0339] [addons] Saving ConfigMap for addon rke-metrics-addon to Kubernetes 
INFO[0339] [addons] Successfully saved ConfigMap for addon rke-metrics-addon to Kubernetes 
INFO[0339] [addons] Executing deploy job rke-metrics-addon 
INFO[0339] [addons] Metrics Server deployed successfully 
INFO[0339] [ingress] Setting up nginx ingress controller 
INFO[0339] [addons] Saving ConfigMap for addon rke-ingress-controller to Kubernetes 
INFO[0339] [addons] Successfully saved ConfigMap for addon rke-ingress-controller to Kubernetes 
INFO[0339] [addons] Executing deploy job rke-ingress-controller 
INFO[0339] [ingress] ingress controller nginx deployed successfully 
INFO[0339] [addons] Setting up user addons              
INFO[0339] [addons] no user addons defined              
INFO[0339] Finished building Kubernetes cluster successfully

会在同目录下产生两个文件,kube_config_cluster.yml  cluster.rkestate ,保存好文件

连同cluster.yml、rke、kubectl拷贝至172.25.149.111的/home/rancher目录下

以下命令均在172.25.149.111上执行

在172.25.149.111上执行
sudo chmod +x rke kubectl
sudo mv {rke,kubectl} /bin/

修改cluster.yml(在172.25.149.111上面执行)

修改后的内容如下

nodes:
  - address: 172.25.149.111
    user: centos
    role: [controlplane,worker,etcd]
  - address: 172.25.149.112
    user: centos
    role: [controlplane,worker,etcd]
  - address: 172.25.149.113
    user: centos
    role: [controlplane,worker,etcd]
services:
  etcd:
    snapshot: true
    creation: 6h
    retention: 24h
private_registries:
- url: 10.15.128.38
  user: admin
  password: Abc123@#!ddd
  is_default: true

执行升级,将旧平台的机器从集群移除。

#rke up --config ./cluster.yml   
#根据网络情况而定耗时 ,最后成功时会看到如下字样
INFO[0339] [sync] Successfully synced nodes Labels and Taints 
INFO[0339] [network] Setting up network plugin: canal   
INFO[0339] [addons] Saving ConfigMap for addon rke-network-plugin to Kubernetes 
INFO[0339] [addons] Successfully saved ConfigMap for addon rke-network-plugin to Kubernetes 
INFO[0339] [addons] Executing deploy job rke-network-plugin 
INFO[0339] [addons] Setting up coredns                  
INFO[0339] [addons] Saving ConfigMap for addon rke-coredns-addon to Kubernetes 
INFO[0339] [addons] Successfully saved ConfigMap for addon rke-coredns-addon to Kubernetes 
INFO[0339] [addons] Executing deploy job rke-coredns-addon 
INFO[0339] [addons] CoreDNS deployed successfully..     
INFO[0339] [dns] DNS provider coredns deployed successfully 
INFO[0339] [addons] Setting up Metrics Server           
INFO[0339] [addons] Saving ConfigMap for addon rke-metrics-addon to Kubernetes 
INFO[0339] [addons] Successfully saved ConfigMap for addon rke-metrics-addon to Kubernetes 
INFO[0339] [addons] Executing deploy job rke-metrics-addon 
INFO[0339] [addons] Metrics Server deployed successfully 
INFO[0339] [ingress] Setting up nginx ingress controller 
INFO[0339] [addons] Saving ConfigMap for addon rke-ingress-controller to Kubernetes 
INFO[0339] [addons] Successfully saved ConfigMap for addon rke-ingress-controller to Kubernetes 
INFO[0339] [addons] Executing deploy job rke-ingress-controller 
INFO[0339] [ingress] ingress controller nginx deployed successfully 
INFO[0339] [addons] Setting up user addons              
INFO[0339] [addons] no user addons defined              
INFO[0339] Finished building Kubernetes cluster successfully

配置环境

#在profile文件末尾添加kube_config_cluster.yml文件路径并保存(文件位置可以自定义)
export KUBECONFIG=/home/rancher/kube_config_cluster.yml
[root@k8s-master ~]# echo "source <(kubectl completion bash)" >> ~/.bashrc
[root@k8s-master ~]# source ~/.bashrc
[root@k8s-master ~]# su - rancher
[rancher@k8s-master ~]# echo "source <(kubectl completion bash)" >> ~/.bashrc
[rancher@k8s-master ~]# source ~/.bashrc

测试集群

通过kubectl测试您的连接,并查看您的所有节点是否处于Ready状态
[centos@k8s-master ~]# kubectl get node
[centos@k8s-master ~]# kubectl get pods --all-namespaces

四、nginx切换

在nginx服务器上修改nginx.conf完成切换,修改配置前先备份一份nginx.conf。

切换完后,观察一周时间,由于使用了ssd盘,现在的性能要比之前好很多,不会再报etcd超时的问题,顺利解决了etcd超时管理web页面打不开的现象。瞬间丝滑!