1.docker基础环境安装 yum -y install docker apt-get -y update ;apt-get -y install docker-compose systemctl enable docker systemctl start docker timedatectl timedatectl set-timezone Asia/Shanghai 宿主机时间设定 2.Prometheus安装 编辑配置文件prometheus.yml cat prometheus.yml global: scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute. evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
scrape_configs:
-
job_name: 'prometheus' static_configs:
- targets: ['localhost:9090']
-
job_name: 'kafka' static_configs:
- targets: ['localhost:9308'] labels: instance: kafka
-
job_name: elasticsearch scrape_interval: 5s metrics_path: "/_prometheus/metrics"
file_sd_configs:
- files:
- es.yml
- files:
sudo docker stop prometheus
sudo docker rm prometheus
sudo docker run -d --restart=always
-v /etc/localtime:/etc/localtime
-v /data/monitor/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
--name prometheus
--net=host docker.io/wang049718/prometheus --web.enable-lifecycle --config.file=/etc/prometheus/prometheus.yml
3.grafana部署
docker run -d --name grafana
--restart=always
-v /etc/localtime:/etc/localtime
-p 3000:3000
docker.io/wang049718/grafana
admin/admin
4.监控node_exporter部署
参数说明
--web.listen-address=":9200"
#node_exporter监听的端口,默认是9100,若需要修改则通过此参数。
--web.telemetry-path="/metrics"
#获取metric信息的url,默认是/metrics,若需要修改则通过此参数
--log.level="info"
#设置日志级别
--log.format="logger:stderr"
docker run -d --restart=always
-v /etc/localtime:/etc/localtime
--name node-exporter
--net=host docker.io/wang049718/node-exporter:0.18
Grafana导入模版id 10262
5.告警规则 Prometheus添加规则
alerting: #指定alertmanager报警组件地址 alertmanagers:
- static_configs:
- targets: [ '1.1.1.5:9093']
rule_files: #指定报警规则文件
- "rules.yml"
增加规则文件rules.yml groups:
- name: example #定义规则组
rules:
- alert: InstanceDown #定义报警名称 expr: up == 0 #Promql语句,触发规则 for: 1m # 一分钟 labels: #标签定义报警的级别和主机 name: instance severity: Critical annotations: #注解 summary: " {{ $labels.instance }}" #报警摘要,取报警信息的appname名称 description: " 服务停止运行 " #报警信息 value: "{{ $value }}%" # 当前报警状态值
- name: Host
rules:
- alert: HostMemory Usage expr: 1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) > 0.8 for: 1m labels: name: Memory severity: Warning annotations: summary: " {{ $labels.appname }} " description: "宿主机内存使用率超过80%." value: "{{ $value }}"
- alert: HostCPU Usage expr: sum(avg without (cpu)(irate(node_cpu_seconds_total{mode!='idle'}[5m]))) by (instance,appname) > 0.65 for: 1m labels: name: CPU severity: Warning annotations: summary: " {{ $labels.appname }} " description: "宿主机CPU使用率超过65%." value: "{{ $value }}"
- alert: HostLoad expr: node_load5 > 4 for: 1m labels: name: Load severity: Warning annotations: summary: "{{ $labels.appname }} " description: " 主机负载5分钟超过4." value: "{{ $value }}"
- alert: HostLoad expr: node_load1 > 10 for: 1m labels: name: Load severity: Warning annotations: summary: "{{ $labels.appname }} " description: " 主机负载1分钟超过10." value: "{{ $value }}"
- alert: HostFilesystem Usage expr: 1-(node_filesystem_free_bytes / node_filesystem_size_bytes) > 0.8 for: 1m labels: name: Disk severity: Warning annotations: summary: " {{ $labels.appname }} " description: " 宿主机 [ {{ $labels.mountpoint }} ]分区使用超过80%." value: "{{ $value }}%"
- alert: HostDiskio expr: irate(node_disk_writes_completed_total{job=~"Host"}[1m]) > 10 for: 1m labels: name: Diskio severity: Warning annotations: summary: " {{ $labels.appname }} " description: " 宿主机 [{{ $labels.device }}]磁盘1分钟平均写入IO负载较高." value: "{{ $value }}iops"
- alert: Network_receive expr: irate(node_network_receive_bytes_total{device!~"lo|bond[0-9]|eth[0-9]|cbr[0-9]|veth.|virbr.|ovs-system"}[5m]) / 1048576 > 3 for: 1m labels: name: Network_receive severity: Warning annotations: summary: " {{ $labels.appname }} " description: " 宿主机 [{{ $labels.device }}] 网卡5分钟平均接收流量超过3Mbps." value: "{{ $value }}3Mbps"
- alert: Network_transmit expr: irate(node_network_transmit_bytes_total{device!~"lo|bond[0-9]|eth[0-9]|cbr[0-9]|veth.|virbr.|ovs-system"}[5m]) / 1048576 > 3 for: 1m labels: name: Network_transmit severity: Warning annotations: summary: " {{ $labels.appname }} " description: " 宿主机 [{{ $labels.device }}] 网卡5分钟内平均发送流量超过3Mbps." value: "{{ $value }}3Mbps"
- name: Container
rules:
- alert: ContainerCPU Usage expr: (sum by(name,instance) (rate(container_cpu_usage_seconds_total{image!=""}[5m]))*100) > 60 for: 1m labels: name: CPU severity: Warning annotations: summary: "{{ $labels.name }} " description: " 容器CPU使用超过60%." value: "{{ $value }}%"
- alert: ContainerMem Usage expr: container_memory_usage_bytes{name=~".+"} / 1048576 > 1024 for: 1m labels: name: Memory severity: Warning annotations: summary: "{{ $labels.name }} " description: " 容器内存使用超过1GB." value: "{{ $value }}G"
- name: Kafka
rules:
- alert: kafka_lag expr: kafka_consumergroup_lag > 180 for: 1m labels: severity: Warning annotations: description: "剩余队列长度大于180." value: "{{ $value }}"
- name: Redis
rules:
- alert: rejected_connections expr: redis_rejected_connections_total > 0 for: 1m labels: severity: Warning annotations: description: "redis达到链接上限,拒绝的个数" value: "{{ $value }}"
- alert: blocked_clients expr: irate(redis_blocked_clients[5m]) > 10 for: 1m labels: severity: Warning annotations: description: "redis是单线程,5分钟阻塞大于10" value: "{{ $value }}"
- alert: slave expr: redis_connected_slaves == 1 for: 1m labels: severity: Warning annotations: description: "slave down" value: "{{ $value }}"
- name: ES
rules:
- alert: es_cluster_node expr: es_cluster_nodes_number < 3 for: 1m labels: severity: Warning annotations: description: "ES集群异常." value: "{{ $value }}"
- alert: es_cluster_datanodes_number expr: es_cluster_datanodes_number < 3 for: 1m labels: severity: Warning annotations: description: "ES集群数据节点异常." value: "{{ $value }}"
- alert: es内存使用率 expr: es_os_mem_used_bytes / es_os_mem_total_bytes * 100 > 80 for: 1m labels: severity: Warning annotations: description: "内存使用率大于80%" value: "{{ $value }}"
- alert: es cpu使用率 expr: es_os_cpu_percent > 0.6 for: 1m labels: severity: Warning annotations: description: "cpu使用率大于60%" value: "{{ $value }}"
- name: web
rules:
- alert: basevisitor expr: basevisitor != 200 for: 1m labels: severity: Warning annotations: description: "basevisitor 异常" value: "{{ $value }}"
- alert: km expr: km != 200 for: 1m labels: severity: Warning annotations: description: "km 异常" value: "{{ $value }}"
- alert: gtower expr: gtower != 200 for: 1m labels: severity: Warning annotations: description: "gtower 异常" value: "{{ $value }}"
- alert: im03 expr: im03 != 200 for: 1m labels: severity: Warning annotations: description: "im03 异常" value: "{{ $value }}"
- alert: immonitor expr: immonitor != 200 for: 1m labels: severity: Warning annotations: description: "immonitor 异常" value: "{{ $value }}"
- alert: volcano expr: volcano != 200 for: 1m labels: severity: Warning annotations: description: "volcano 异常" value: "{{ $value }}"
- alert: kfonline expr: kfonline != 302 for: 1m labels: severity: Warning annotations: description: "kfonline 异常" value: "{{ $value }}"
- alert: ocs expr: ocs != 403 for: 1m labels: severity: Warning annotations: description: "ocs 异常" value: "{{ $value }}"
- alert: fliter expr: fliter != 200 for: 1m labels: severity: Warning annotations: description: "fliter 异常" value: "{{ $value }}"
- name: mongo
rules:
- alert: cluster expr: mongodb_mongod_replset_my_state != 2 for: 1m labels: severity: Warning annotations: description: "集群异常" value: "{{ $value }}"
Prometheus加载告警规则
docker run -d --restart=always
-v /etc/localtime:/etc/localtime
-v /data/monitor/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
-v /data/monitor/prometheus/rules.yml:/etc/prometheus/rules.yml
--name prometheus
--net=host docker.io/wang049718/prometheus --web.enable-lifecycle --config.file=/etc/prometheus/prometheus.yml
启动告警服务插件 cat alertmanager.yml global: resolve_timeout: 2m smtp_smarthost: smtp.163.com:25 smtp_from: wang049718@163.com smtp_auth_username: wang049718@163.com smtp_auth_password: wang049718
templates: ##消息模板
- '/etc/alertmanager/template/wechat.tmpl' route: group_by: ['alertname_wechat'] group_wait: 30s group_interval: 60s receiver: 'email' # 优先使用wechat发送 repeat_interval: 1h routes: #子路由,使用email发送
- receiver: email match_re: serverity: email receivers:
- name: 'email'
email_configs:
- to: '1715498045@qq.com' send_resolved: true # 发送已解决通知 wechat.tmpl [root@localhost wang]# cat wechat.tmpl {{ define "wechat.default.message" }} {{ range $i, $alert :=.Alerts }} ========监控报警========== 告警状态:{{ .Status }} 告警级别:{{ $alert.Labels.severity }} 告警类型:{{ $alert.Labels.alertname }} 告警应用:{{ $alert.Annotations.summary }} 告警主机:{{ $alert.Labels.instance }} 告警详情:{{ $alert.Annotations.description }} 触发阀值:{{ $alert.Annotations.value }} 告警时间:{{ $alert.StartsAt.Format "2006-01-02 15:04:05" }} ========end============= {{ end }} {{ end }}
docker run -d -p 9093:9093 --name alertmanager
--restart always
-v /etc/localtime:/etc/localtime
-v /home/wang/alertmanager.yml:/etc/alertmanager/alertmanager.yml
-v /home/wang:/etc/alertmanager/template
docker.io/wang049718/alertmanager
6.监控mongo docker run -itd --net=host -v /etc/localtime:/etc/localtime --name mongo mongo --auth docker exec -it mongo mongo use admin db.createUser({ user: 'admin', pwd: '123456', roles: [{ "role": "userAdminAnyDatabase", "db": "admin" }, { "role": "dbAdminAnyDatabase", "db": "admin" }, { role: "root", db: "admin" } ] })
docker exec -it mongo mongo -uadmin -p123456
use col
db.col.insert({title: 'MongoDB 教程',
description: 'MongoDB 是一个 Nosql 数据库',
by: '菜鸟教程',
url: 'http://www.runoob.com',
tags: ['mongodb', 'database', 'NoSQL'],
likes: 100
})
sudo docker run -d --name mongo-explorer
--restart=always
-p 9105:9104
--cpuset-cpus=2,1
-m 300m
-v /etc/localtime:/etc/localtime
docker.io/wang049718/mongo --mongodb.uri "mongodb://admin:bdpass9937465@172.21.10.17:27017"
模版为json串
7.监控mysql
sudo docker run -d --restart=always
--net=host
--cpuset-cpus=0,1
-m 1200m
-e DATA_SOURCE_NAME="monitoring:monitoring@(172.21.10.22:3306)"/
-v /etc/localtime:/etc/localtime
--name mysql_exporter
docker.io/wang049718/mysqld-exporter:latest
8.监控redis
docker stop redis-server
docker rm redis-server
docker run -d --name redis-server -p 6379:6379
-v /etc/localtime:/etc/localtime
-v /home/redis:/data
--restart always redis
--requirepass "123456" --appendonly yes
docker run -d --name redis_exporter
-p 9121:9121
-v /etc/localtime:/etc/localtime
--restart always docker.io/wang049718/redis_exporter
--redis.addr redis://1.1.1.4:6379 -redis.password 123456
sudo docker run -d --name redis_exporter
--net=host
-v /etc/localtime:/etc/localtime
--cpuset-cpus=0,1
-m 1200m
--restart always
docker.io/wang049718/redis_exporter
--redis.addr redis://172.21.10.11:6379
9.监控kafka
sudo docker stop kafka
sudo docker rm kafka
sudo docker run -d --restart=always
--cpuset-cpus=3,2
-m 1200m
-p 9308:9308
-v /etc/localtime:/etc/localtime
--name kafka
docker.io/wang049718/kafka_exporter
/kafka_exporter-1.2.0.linux-amd64/kafka_exporter --kafka.server=172.21.10.4:9092
Json串
10.pushgateway
sudo docker stop pushgateway
sudo docker rm pushgateway
sudo docker run -d --restart=always
-v /etc/localtime:/etc/localtime
--name pushgateway -p 9091:9091
docker.io/wang049718/pushgateway:latest
访问9091端口(http://pushgatewayIP:9091)
打开prometheus的配置文件
- job_name: 'pushgateway'
static_configs:
- targets: ['pushgatewayIP:9091'] honor_labels: true #作用:如果没有设置instance标签,Prometheus服务器也会附加标签,否则instance标签值会为空
写入测试 单条 echo "test 123" | curl --data-binary @- http://localhost:9091/metrics/job/test 上述测试的目的是,在被监控的机器上,向pushgateway发送了一条数据,内容是“test 123”,指标名称是“test”,指标值是“123”; http://pushgatewayIP:9091/metrics/job/test,此处也声明了,在pushgateway处建立一个job为test的指标。
可以在prometheus图形查看test值
API格式:
http://pushgatewayIP/metrics/job/job名/标签名/标签值(一般 标签名 采用 instance)
例子:
http://pushgatewayIP/metrics/job/
/sb/instance/si
/testjob/abc/pushgateway1
/testjob/yyy/pushgateway1
分别触发上述三个API,打开pushgateway的web UI
cat <<EOF | curl --data-binary @- http://pushgatewayIP:9091/metrics/job/docker_runtime/instance/xa-lsr-billubuntu # TYPE docker_runtime counter docker_runtime{name="cadvisor"} 33 docker_runtime{name="nginx"} 331 docker_runtime{name="abc"} 332
echo "basevisitor curl -I -m 10 -o /dev/null -s -w %{http_code} http://kf-basevisitor.ziroom.com
" | curl --data-binary @- http://localhost:9091/metrics/job/web/instance/basevisitor
echo "km curl -I -m 10 -o /dev/null -s -w %{http_code} https://kf-km.ziroom.com/backend/health
" | curl --data-binary @- http://localhost:9091/metrics/job/web/instance/km
echo "gtower curl -I -m 10 -o /dev/null -s -w %{http_code} http://kf-gtower.ziroom.com:7002
" | curl --data-binary @- http://localhost:9091/metrics/job/web/instance/gtower
echo "im03 curl -I -m 10 -o /dev/null -s -w %{http_code} http://kf-im03.ziroom.com/health-check
" | curl --data-binary @- http://localhost:9091/metrics/job/web/instance/im03
echo "fliter curl -I -m 10 -o /dev/null -s -w %{http_code} http://kf-fliter.ziroom.com
" | curl --data-binary @- http://localhost:9091/metrics/job/web/instance/fliter
echo "immonitor curl -I -m 10 -o /dev/null -s -w %{http_code} http://kf-immonitor.ziroom.com/health-check
" | curl --data-binary @- http://localhost:9091/metrics/job/web/instance/immonitor
echo "volcano curl -I -m 10 -o /dev/null -s -w %{http_code} http://kf-volcano.ziroom.com/monitor/group_all_agent
" | curl --data-binary @- http://localhost:9091/metrics/job/web/instance/volcano
echo "kfonline curl -I -m 10 -o /dev/null -s -w %{http_code} http://kfonline.ziroom.com
" | curl --data-binary @- http://localhost:9091/metrics/job/web/instance/kfonline
echo "push curl -I -m 10 -o /dev/null -s -w %{http_code} http://kf-push.ziroom.com:7002
" | curl --data-binary @- http://localhost:9091/metrics/job/web/instance/push
echo "ocs curl -I -m 10 -o /dev/null -s -w %{http_code} http://kf-ocs.ziroom.com/minio/login
" | curl --data-binary @- http://localhost:9091/metrics/job/web/instance/ocs
11.监控es
es监控 /usr/local/services/elasticsearch/bin/elasticsearch-plugin install file:///home/webuser/package/elasticsearch-prometheus-exporter-5.6.4.0.zip
重启服务器 服务器端
-
job_name: elasticsearch scrape_interval: 5s metrics_path: "/_prometheus/metrics"
file_sd_configs:
- files:
- es.yml
- files:
es.yml文件,位置和prometheus.yml在一个文件夹
- targets:
- 172.21.10.10:9200
- 172.21.8.49:9200
- 172.21.10.12:9200 labels: server: c2-jenkins grafana导入266
sudo docker stop prometheus
sudo docker rm prometheus
sudo docker run -d --restart=always
-v /etc/localtime:/etc/localtime
-v /data/monitor/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
-v /data/monitor/prometheus/es.yml:/etc/prometheus/es.yml
-v /data/monitor/prometheus/rules.yml:/etc/prometheus/rules.yml
--name prometheus
--net=host prom/prometheus --web.enable-lifecycle --config.file=/etc/prometheus/prometheus.yml
12.监控nginx
1.10.3 apt-get -y update apt-get -y install libpcre3 libpcre3-dev gcc apt-get -y install openssl libssl-dev libxslt-dev libgd-dev libgeoip-dev
git clone git://github.com/vozlt/nginx-module-vts.git
wget http://nginx.org/download/nginx-1.10.3.tar.gz tar xvf nginx-1.10.3.tar.gz nginx/1.10.3 ./configure --with-cc-opt='-g -O2 -fPIE -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2' --with-ld-opt='-Wl,-Bsymbolic-functions -fPIE -pie -Wl,-z,relro -Wl,-z,now' --prefix=/usr/share/nginx --conf-path=/etc/nginx/nginx.conf --http-log-path=/var/log/nginx/access.log --error-log-path=/var/log/nginx/error.log --lock-path=/var/lock/nginx.lock --pid-path=/run/nginx.pid --http-client-body-temp-path=/var/lib/nginx/body --http-fastcgi-temp-path=/var/lib/nginx/fastcgi --http-proxy-temp-path=/var/lib/nginx/proxy --http-scgi-temp-path=/var/lib/nginx/scgi --http-uwsgi-temp-path=/var/lib/nginx/uwsgi --with-debug --with-pcre-jit --with-ipv6 --with-http_ssl_module --with-http_stub_status_module --with-http_realip_module --with-http_auth_request_module --with-http_addition_module --with-http_dav_module --with-http_geoip_module --with-http_gunzip_module --with-http_gzip_static_module --with-http_image_filter_module --with-http_v2_module --with-http_sub_module --with-http_xslt_module --with-stream --with-stream_ssl_module --with-mail --with-mail_ssl_module --with-threads --add-module=/home/webuser/nginx-module-vts
make && make install make upgrade
rm -rf /usr/sbin/nginx ln -s /usr/share/nginx/sbin/nginx /usr/sbin/ nginx -V
http
vhost_traffic_status_zone;
vhost_traffic_status_filter_by_host on;
server location /status { vhost_traffic_status_display; vhost_traffic_status_display_format html; } 不能是localhost不然不生效
wget -c https://github.com/hnlq715/nginx-vts-exporter/releases/download/v0.9.1/nginx-vts-exporter-0.9.1.linux-amd64.tar.gz tar -xvf nginx-vts-exporter-0.9.1.linux-amd64.tar.gz -C /usr/local/ cd /usr/local/nginx-vts-exporter-0.9.1.linux-amd64/ ./nginx-vts-exporter -nginx.scrape_uri http://172.21.10.3:7002/status/format/json &
9913
sudo docker stop nginx-vts-exporter
sudo docker rm nginx-vts-exporter
sudo docker run -d --name nginx-vts-exporter
--restart=always
-p 9913:9913
--cpuset-cpus=2,1
-e nginx.scrape_uri='http://172.21.10.3:7002/status/format/json'
-e NGINX_HOST=http://172.21.10.3:7002
-m 300m
-v /etc/localtime:/etc/localtime
docker.io/sophos/nginx-vts-exporter
9913端口 http://1.1.1.4:9913/metrics 查看数据
grafana配置 2949