操作系统初始化设置
操作系统:centos 7.4版本以上
修改ip地址
[root@localhost ~]# vim /etc/sysconfig/network-scripts/ifcfg-ens192
TYPE=Ethernet
BOOTPROTO=static
DEVICE=ens192
ONBOOT=yes
IPADDR=192.168.0.200
NETMASK=255.255.255.0
GATEWAY=192.168.0.254
DNS1=223.5.5.5
修改主机名
[root@lx-test ~]# vim /etc/hostname
lx-test
禁用不常用服务
systemctl disable postfix.service
systemctl disable firewalld.service
systemctl disable NetworkManager.service
systemctl disable kdump.service
systemctl disable cups.path
systemctl disable cups.socket
systemctl disable cups.service
systemctl disable rpcbind.socket
systemctl disable bluetooth.service
systemctl disable abrt-ccpp.service
systemctl disable abrt-oops.service
systemctl disable abrt-vmcore.service
systemctl disable abrt-xorg.service
systemctl disable abrtd.service
systemctl disable accounts-daemon.service
systemctl disable atd.service
systemctl disable avahi-daemon.service
systemctl disable chronyd.service
systemctl disable dbus-org.freedesktop.Avahi.service
systemctl disable dbus-org.freedesktop.ModemManager1.service
systemctl disable display-manager.service
systemctl disable dmraid-activation.service
systemctl disable gdm.service
systemctl disable hypervkvpd.service
systemctl disable hypervvssd.service
systemctl disable iscsi.service
systemctl disable ksm.service
systemctl disable ksmtuned.service
systemctl disable libstoragemgmt.service
systemctl disable libvirtd.service
systemctl disable mdmonitor.service
systemctl disable ModemManager.service
systemctl disable multipathd.service
systemctl disable packagekit-offline-update.service
systemctl disable rngd.service
systemctl disable rtkit-daemon.service
systemctl disable smartd.service
systemctl disable spice-vdagentd.service
systemctl disable sysstat.service
systemctl disable avahi-daemon.socket
systemctl disable iscsid.socket
systemctl disable iscsiuio.socket
# RHEL 8.4 额外添加 https://docs.rancher.cn/docs/rke/os/_index
systemctl disable nm-cloud-setup.service nm-cloud-setup.timer
关闭防火墙&selinux
(firewalld 已在上面“禁用不常用服务”步骤中禁用;对于有特殊要求的环境,是否禁用selinux请根据实际情况配置)
#禁用 firewalld 服务
systemctl stop firewalld
systemctl disable firewalld
#禁用 SELinux
setenforce 0
sed -i 's/^SELINUX=enforcing$/SELINUX=disabled/' /etc/selinux/config
禁用swap
[root@lx-test ~]# vim /etc/fstab #注释swap所在行
#/dev/mapper/centos-swap swap swap defaults 0 0
添加yum仓库 (无外网可忽略此步骤)
备份现有repo文件
[root@lx-test yum.repos.d]# tar -czvf CentOS-repo.tar.gz CentOS-* --remove-files
CentOS-Base.repo
CentOS-CR.repo
CentOS-Debuginfo.repo
CentOS-fasttrack.repo
CentOS-Media.repo
CentOS-Sources.repo
CentOS-Vault.repo
CentOS-x86_64-kernel.repo
常用yum仓库
包含centos base源、docker-ce源、epel源、kubernetes源
vim /etc/yum.repos.d/yum-out-all.repo
#============ centos ==============
[base]
name=CentOS-$releasever - Base - mirrors.aliyun.com
failovermethod=priority
baseurl=http://mirrors.aliyun.com/centos/$releasever/os/$basearch/
gpgcheck=0
gpgkey=http://mirrors.aliyun.com/centos/RPM-GPG-KEY-CentOS-7
#released updates
[updates]
name=CentOS-$releasever - Updates - mirrors.aliyun.com
failovermethod=priority
baseurl=http://mirrors.aliyun.com/centos/$releasever/updates/$basearch/
gpgcheck=0
gpgkey=http://mirrors.aliyun.com/centos/RPM-GPG-KEY-CentOS-7
#additional packages that may be useful
[extras]
name=CentOS-$releasever - Extras - mirrors.aliyun.com
failovermethod=priority
baseurl=http://mirrors.aliyun.com/centos/$releasever/extras/$basearch/
gpgcheck=0
gpgkey=http://mirrors.aliyun.com/centos/RPM-GPG-KEY-CentOS-7
#additional packages that extend functionality of existing packages
[centosplus]
name=CentOS-$releasever - Plus - mirrors.aliyun.com
failovermethod=priority
baseurl=http://mirrors.aliyun.com/centos/$releasever/centosplus/$basearch/
gpgcheck=0
enabled=1
gpgkey=http://mirrors.aliyun.com/centos/RPM-GPG-KEY-CentOS-7
#============ docker-ce ==============
[docker-ce-stable]
name=Docker CE Stable - $basearch
baseurl=https://mirrors.tuna.tsinghua.edu.cn/docker-ce/linux/centos/$releasever/$basearch/stable
enabled=1
gpgcheck=0
gpgkey=https://mirrors.tuna.tsinghua.edu.cn/docker-ce/linux/centos/gpg
#============ epel ==============
[epel]
name=Extra Packages for Enterprise Linux 7 - $basearch
baseurl=https://mirrors.tuna.tsinghua.edu.cn/epel/7/$basearch
#mirrorlist=https://mirrors.fedoraproject.org/metalink?repo=epel-7&arch=$basearch
failovermethod=priority
enabled=1
gpgcheck=0
gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-EPEL-7
#============ kubernetes ==============
[kubernetes]
name=kubernetes
baseurl=https://mirrors.tuna.tsinghua.edu.cn/kubernetes/yum/repos/kubernetes-el7-$basearch
enabled=1
gpgcheck=0
centos8 yum源更换 (可选操作)
修改所有的CentOS文件内容
sed -i 's/mirrorlist/#mirrorlist/g' /etc/yum.repos.d/CentOS-*
sed -i 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g' /etc/yum.repos.d/CentOS-*
更新yum源为阿里镜像
wget -O /etc/yum.repos.d/CentOS-Base.repo https://mirrors.aliyun.com/repo/Centos-vault-8.5.2111.repo
yum clean all
yum makecache
#yum安装测试是否可以yum安装 (可选)
yum install wget -y
安装常用工具
yum install vim wget git iotop net-tools telnet ntpdate lsof sysstat psmisc iftop nmon unzip -y
时间同步
2种方式二选一
(ntp方式)
[root@lx-test ~]# yum install ntp -y #安装ntp服务
[root@lx-test ~]# ntpdate ntp.aliyun.com #手动同步时间
22 Apr 17:32:58 ntpdate[9380]: adjust time server 203.107.6.88 offset -0.000697 sec
[root@lx-test ~]# vim /etc/ntp.conf
#修改国内常用时间服务器
server ntp.aliyun.com iburst
[root@lx-test ~]# systemctl enable --now ntpd
[root@lx-test ~]# systemctl status ntpd
[root@lx-test ~]# ntpq -p
(Chrony方式)
[root@lx-test ~]# yum install chrony -y #安装chrony服务
[root@lx-test ~]# chronyc -a makestep #手动同步时间
200 OK
[root@lx-test ~]# vim /etc/chrony.conf
#注释原有server,添加国内常用时间服务器
server ntp.aliyun.com iburst
server ntp.tencent.com iburst
server ntp.ntsc.ac.cn iburst
[root@lx-test ~]# systemctl enable --now chronyd
[root@lx-test ~]# systemctl status chronyd
[root@lx-test ~]# chronyc sources -v
节点OS调优
内核调优
执行以下命令
echo "
net.bridge.bridge-nf-call-ip6tables=1
net.bridge.bridge-nf-call-iptables=1
net.ipv4.ip_forward=1
net.ipv4.conf.all.forwarding=1
net.ipv4.neigh.default.gc_thresh1=4096
net.ipv4.neigh.default.gc_thresh2=6144
net.ipv4.neigh.default.gc_thresh3=8192
net.ipv4.neigh.default.gc_interval=60
net.ipv4.neigh.default.gc_stale_time=120
# 参考 https://github.com/prometheus/node_exporter#disabled-by-default
kernel.perf_event_paranoid=-1
#sysctls for k8s node config
net.ipv4.tcp_slow_start_after_idle=0
net.core.rmem_max=16777216
fs.inotify.max_user_watches=524288
kernel.softlockup_all_cpu_backtrace=1
kernel.softlockup_panic=0
kernel.watchdog_thresh=30
fs.file-max=2097152
fs.inotify.max_user_instances=8192
fs.inotify.max_queued_events=16384
vm.max_map_count=262144
fs.may_detach_mounts=1
net.core.netdev_max_backlog=16384
net.ipv4.tcp_wmem=4096 12582912 16777216
net.core.wmem_max=16777216
net.core.somaxconn=32768
net.ipv4.ip_forward=1
net.ipv4.tcp_max_syn_backlog=8096
net.ipv4.tcp_rmem=4096 12582912 16777216
net.ipv6.conf.all.disable_ipv6=1
net.ipv6.conf.default.disable_ipv6=1
net.ipv6.conf.lo.disable_ipv6=1
kernel.yama.ptrace_scope=0
vm.swappiness=0
# 可以控制core文件的文件名中是否添加pid作为扩展。
kernel.core_uses_pid=1
# Do not accept source routing
net.ipv4.conf.default.accept_source_route=0
net.ipv4.conf.all.accept_source_route=0
# Promote secondary addresses when the primary address is removed
net.ipv4.conf.default.promote_secondaries=1
net.ipv4.conf.all.promote_secondaries=1
# Enable hard and soft link protection
fs.protected_hardlinks=1
fs.protected_symlinks=1
# 源路由验证
# see details in https://help.aliyun.com/knowledge_detail/39428.html
net.ipv4.conf.all.rp_filter=0
net.ipv4.conf.default.rp_filter=0
net.ipv4.conf.default.arp_announce = 2
net.ipv4.conf.lo.arp_announce=2
net.ipv4.conf.all.arp_announce=2
# see details in https://help.aliyun.com/knowledge_detail/41334.html
net.ipv4.tcp_max_tw_buckets=5000
net.ipv4.tcp_syncookies=1
net.ipv4.tcp_fin_timeout=30
net.ipv4.tcp_synack_retries=2
kernel.sysrq=1
#redis优化
vm.overcommit_memory = 1
" >> /etc/sysctl.conf
#执行sysctl -p 生效
sysctl -p
nproc资源限制
修改添加普通用户资源限制
cat > /etc/security/limits.d/20-nproc.conf <<EOF
# Default limit for number of user's processes to prevent
# accidental fork bombs.
# See rhbz #432903 for reasoning.
* soft nproc 65536
* hard nproc 65536
* soft nofile 65536
* hard nofile 65536
root soft nproc unlimited
EOF
Docker安装
卸载docker
若docker版本非docker-ce版本,需卸载安装docker-ce版本
sudo yum -y remove docker \
docker-client \
docker-client-latest \
docker-common \
docker-latest \
docker-latest-logrotate \
docker-logrotate \
docker-selinux \
docker-engine-selinux \
docker-engine \
container* \
docker*
rm -rf /var/lib/docker/ /etc/docker/
安装docker
yum install docker-ce -y
二进制安装方式(可选)
针对离线环境无法访问yum仓库,可二进制方式安装
1.下载docker二进制安装包
下载地址>> https://download.docker.com/linux/static/stable/x86_64/
此次下载为: docker-20.10.16.tgz
2. 解压安装包
tar zxf docker-20.10.16.tgz
3. 复制二进制文件到/usr/local/bin目录下
cp docker/* /usr/local/bin
4. 编写docker启动脚本
vim /usr/lib/systemd/system/docker.service
[Unit]
Description=Docker Application Container Engine
Documentation=https://docs.docker.com
After=network-online.target firewalld.service
Wants=network-online.target
[Service]
OOMScoreAdjust=-1000
Type=notify
# the default is not to use systemd for cgroups because the delegate issues still
# exists and systemd currently does not support the cgroup feature set required
# for containers run by docker
ExecStart=/usr/local/bin/dockerd
ExecReload=/bin/kill -s HUP $MAINPID
#ubuntu系统注释下面一行
ExecStartPost=/usr/sbin/iptables -P FORWARD ACCEPT
# Having non-zero Limit*s causes performance problems due to accounting overhead
# in the kernel. We recommend using cgroups to do container-local accounting.
LimitNOFILE=infinity
LimitNPROC=infinity
LimitCORE=infinity
# Uncomment TasksMax if your systemd version supports it.
# Only systemd 226 and above support this version.
#TasksMax=infinity
TimeoutStartSec=0
# set delegate yes so that systemd does not reset the cgroups of docker containers
Delegate=yes
# kill only the docker process, not all processes in the cgroup
KillMode=process
# restart the docker process if it exits prematurely
Restart=on-failure
StartLimitBurst=3
StartLimitInterval=60s
[Install]
WantedBy=multi-user.target
5. 启动docker并设为开机自启
systemctl daemon-reload
systemctl enable --now docker
docker.service 配置调优
对于 CentOS 系统,docker.service 默认位于/usr/lib/systemd/system/docker.service;对于 Ubuntu 系统,docker.service 默认位于/lib/systemd/system/docker.service。编辑docker.service,添加以下参数。
- 防止 docker 服务 OOM:
OOMScoreAdjust=-1000
(配置文件可能已包含,更改值即可)
- 开启 iptables 转发链:
ExecStartPost=/usr/sbin/iptables -P FORWARD ACCEPT
docker默认root目录
如需切换docker默认数据存放目录(默认路径:/var/lib/docker),示例为切换为/data/docker目录(已启动的docker服务需先停止:systemctl stop docker、systemctl stop docker.socket)。
编辑docker配置文件,增加 --data-root=/data/docker参数
vim /usr/lib/systemd/system/docker.service
ExecStart=/usr/bin/dockerd -H fd:// --containerd=/run/containerd/containerd.sock --data-root=/data/docker
docker 参数调优
指定存储驱动、镜像加速,日志大小及限制配置
mkdir -p /etc/docker #已启动过docker服务会自动生成,无需手动创建
cat > /etc/docker/daemon.json <<EOF
{
"exec-opts": ["native.cgroupdriver=systemd"],
"log-driver": "json-file",
"log-opts": { "max-size": "100m", "max-file": "3"},
"max-concurrent-downloads": 10,
"max-concurrent-uploads": 10,
"registry-mirrors": ["https://3284ug2c.mirror.aliyuncs.com"],
"storage-driver": "overlay2",
"storage-opts": [
"overlay2.override_kernel_check=true"
]
}
EOF
#重载配置、重启docker服务并设置开机自启动
systemctl daemon-reload && systemctl restart docker && systemctl enable docker
GPU驱动安装
cuda安装
centos7 cuda和runtime安装
(安装cuda可以一并安装nvidia驱动,也可单独下载nvidia驱动安装,cuda可安装多个版本)
查询显卡(可选)
验证系统是否已识别NVIDIA显卡
lspci |grep -i nvidia (若没有lspci命令,执行安装:yum install pciutils -y)
对于能直接显示型号的显卡:Tesla T4卡
对于不能直接显示型号的GPU显卡,如图:25b6卡,查询结果为A2/A16卡
查询网址:http://pci-ids.ucw.cz/mods/PC/10de?action=help?help=pci
cuda支持列表与GPU 的计算能力
cuda与Driver兼容性列表:https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html
算力查询地址:https://developer.nvidia.com/cuda-gpus (CUDA GPUs | NVIDIA Developer)
安装所需的软件
yum install gcc-c++ kernel-devel-$(uname -r) kernel-headers-$(uname -r) -y
下载cuda
驱动下载官网地址:https://www.nvidia.cn/Download/index.aspx?lang=cn
cuda官网地址:https://developer.nvidia.com/cuda-toolkit-archive
当前下载为官网11.6.0版本,可根据需求安装相应版本
wget https://developer.download.nvidia.com/compute/cuda/11.6.0/local_installers/cuda_11.6.0_510.39.01_linux.run
禁用默认nouveau驱动
编辑dist-blacklist.conf文件在最后添加以下内容
vim /usr/lib/modprobe.d/dist-blacklist.conf
#######禁用默认nouveau###
blacklist nouveau
options nouveau modeset=0
#######################
备份原始initramfs镜像
mv /boot/initramfs-$(uname -r).img /boot/initramfs-$(uname -r).img.bak
重新生成
dracut /boot/initramfs-$(uname -r).img $(uname -r)
重启操作系统(确保当前启动的内核版本与上面安装的kernel-devel和kernel-headers版本号一致,可通过 rpm -qa |grep kernel 查看)
reboot
查看是否已禁用默认nouveau,无输出表示已禁用
lsmod | grep nouveau
查看系统运行级别
runlevel (若为5图形化模式,请切换至3多用户模式)
systemctl isolate runlevel3.target #切换到3多用户
systemctl set-default multi-user.target #runlevel3级别 (可选操作:设置系统开机默认启动级别为多用户模式)
安装cuda
开始安装
sudo sh cuda_11.6.0_510.39.01_linux.run
输入accept后可按空格取消示例demo,向下移动箭头选中Install安装(一般是取消后三项中的“X”)
##############
CUDA Installer
- [X] Driver
[X] 510.39.01
+ [X] CUDA Toolkit 11.6
[ ] CUDA Samples 11.6
[ ] CUDA Demo Suite 11.6
[ ] CUDA Documentation 11.6
Options
Install
###################
安装完成
添加配置变量
根据实际安装版本修改
vim /etc/profile 配置文件末尾添加
# cuda 默认安装路径为/usr/local/cuda-x版本号
export PATH=/usr/local/cuda-11.6/bin:$PATH
export LD_LIBRARY_PATH=/usr/local/cuda-11.6/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
配置生效
source /etc/profile
查看nvidia显卡信息
nvidia-smi
查看cuda版本
# nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Tue_Mar__8_18:18:20_PST_2022
Cuda compilation tools, release 11.6, V11.6.124
Build cuda_11.6.r11.6/compiler.31057947_0
启用GPU持久模式
提高GPU加载速度
nvidia-smi -pm 1 (也可使用nvidia-persistenced --persistence-mode 命令)仅当前生效
# nvidia-smi -pm 1
Enabled persistence mode for GPU 00000000:2F:00.0.
All done.
再次查看:Persistence-M 已从Off状态变成On
配置开机自启持久化模式
cat > /etc/systemd/system/multi-user.target.wants/nvidia-pm.service <<EOF
[Unit]
Description=Set NVIDIA Persistence Mode to Enable
Wants=syslog.target
[Service]
Type=forking
ExecStart=/usr/bin/nvidia-smi -pm 1
[Install]
WantedBy=multi-user.target
EOF
runtime安装
nvidia-container-runtime 下载
nvidia-container-runtime 地址:https://github.com/NVIDIA/nvidia-container-runtime/
nvidia-container-runtime.repo yum源
vim /etc/yum.repos.d/nvidia-container-runtime.repo
[libnvidia-container]
name=libnvidia-container
baseurl=https://nvidia.github.io/libnvidia-container/stable/centos7/$basearch
gpgcheck=0
enabled=1
安装nvidia-container-runtime
yum install nvidia-container-runtime -y
离线安装(离线环境提前下载的包)
上传runtime目录汇总rpm包
rpm -ivh docker-runtime/*.rpm
配置nvidia-runtime
vim /etc/docker/daemon.json 增添参数
{
"default-runtime": "nvidia",
"runtimes": {
"nvidia": {
"path": "/usr/bin/nvidia-container-runtime",
"runtimeArgs": []
}
},
完整daemon.json文件内容 建议复制粘贴至GPU服务器执行,一步到位!
#GPU服务器
mkdir -p /etc/docker
cat > /etc/docker/daemon.json <<EOF
{
"default-runtime": "nvidia",
"runtimes": {
"nvidia": {
"path": "/usr/bin/nvidia-container-runtime",
"runtimeArgs": []
}
},
"exec-opts": ["native.cgroupdriver=systemd"],
"log-driver": "json-file",
"log-opts": { "max-size": "100m", "max-file": "3"},
"max-concurrent-downloads": 10,
"max-concurrent-uploads": 10,
"registry-mirrors": ["https://3284ug2c.mirror.aliyuncs.com"],
"storage-driver": "overlay2",
"storage-opts": [
"overlay2.override_kernel_check=true"
]
}
EOF
#重载配置、重启docker服务并设置开机自启动
systemctl daemon-reload && systemctl restart docker && systemctl enable docker
重载docker 已执行可忽略
systemctl daemon-reload
systemctl restart docker
查看docker runtime
查看Default Runtime: 为nvidia表示切换成功
docker info