1. Docker base environment installation

yum -y install docker                                   # CentOS/RHEL
apt-get -y update; apt-get -y install docker-compose    # Debian/Ubuntu
systemctl enable docker
systemctl start docker
timedatectl
timedatectl set-timezone Asia/Shanghai                  # set the host timezone

2. Prometheus installation

Edit the configuration file prometheus.yml:

cat prometheus.yml
global:
  scrape_interval: 15s     # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.

scrape_configs:
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  - job_name: 'kafka'
    static_configs:
      - targets: ['localhost:9308']
        labels:
          instance: kafka

  - job_name: elasticsearch
    scrape_interval: 5s
    metrics_path: "/_prometheus/metrics"
    file_sd_configs:
      - files:
        - es.yml

sudo docker stop prometheus
sudo docker rm prometheus
sudo docker run -d --restart=always \
  -v /etc/localtime:/etc/localtime \
  -v /data/monitor/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml \
  --name prometheus \
  --net=host docker.io/wang049718/prometheus --web.enable-lifecycle --config.file=/etc/prometheus/prometheus.yml
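Because the container is started with --web.enable-lifecycle, configuration changes can be picked up without restarting it. A minimal sketch, assuming Prometheus is reachable on the default port 9090 on this host:

# reload prometheus.yml after editing it (no container restart needed)
curl -X POST http://localhost:9090/-/reload
# confirm the configuration that is actually loaded
curl -s http://localhost:9090/api/v1/status/config | head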

3. Grafana deployment

docker run -d --name grafana \
  --restart=always \
  -v /etc/localtime:/etc/localtime \
  -p 3000:3000 \
  docker.io/wang049718/grafana

Default login: admin/admin

4. node_exporter deployment (host monitoring)

Parameter reference:
--web.listen-address=":9200"    # Port node_exporter listens on; the default is 9100, change it with this flag.
--web.telemetry-path="/metrics" # URL path for fetching metrics; the default is /metrics, change it with this flag.
--log.level="info"              # Log level.
--log.format="logger:stderr"    # Log output format.

docker run -d --restart=always \
  -v /etc/localtime:/etc/localtime \
  --name node-exporter \
  --net=host docker.io/wang049718/node-exporter:0.18
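To confirm the exporter is serving data, a quick check (assuming it listens on its default port 9100 with host networking):

# node_exporter metrics should be visible on the host
curl -s http://localhost:9100/metrics | grep '^node_cpu_seconds_total' | head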

Import Grafana dashboard template id 10262.
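Before importing the dashboard, Grafana needs Prometheus as a data source. A sketch using the Grafana HTTP API (assumes the default admin/admin credentials from above and that Prometheus runs on the same host on port 9090):

curl -s -u admin:admin -H "Content-Type: application/json" \
  -X POST http://localhost:3000/api/datasources \
  -d '{"name":"Prometheus","type":"prometheus","url":"http://localhost:9090","access":"proxy"}'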

5. Alerting rules

Add the following to the Prometheus configuration:

alerting:              # address of the Alertmanager component
  alertmanagers:
    - static_configs:
        - targets: ['1.1.1.5:9093']

rule_files:            # alerting rule files
  - "rules.yml"

Create the rule file rules.yml:

groups:
  - name: example                  # rule group name
    rules:
      - alert: InstanceDown        # alert name
        expr: up == 0              # PromQL expression that triggers the alert
        for: 1m                    # pending for one minute
        labels:                    # labels defining severity and target
          name: instance
          severity: Critical
        annotations:               # annotations
          summary: "{{ $labels.instance }}"   # alert summary (the affected instance)
          description: "Service is down"      # alert message
          value: "{{ $value }}%"              # value at the time the alert fires
  - name: Host
    rules:
      - alert: HostMemoryUsage
        expr: 1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) > 0.8
        for: 1m
        labels:
          name: Memory
          severity: Warning
        annotations:
          summary: "{{ $labels.appname }}"
          description: "Host memory usage is above 80%."
          value: "{{ $value }}"
      - alert: HostCPUUsage
        expr: sum(avg without (cpu)(irate(node_cpu_seconds_total{mode!='idle'}[5m]))) by (instance,appname) > 0.65
        for: 1m
        labels:
          name: CPU
          severity: Warning
        annotations:
          summary: "{{ $labels.appname }}"
          description: "Host CPU usage is above 65%."
          value: "{{ $value }}"
      - alert: HostLoad
        expr: node_load5 > 4
        for: 1m
        labels:
          name: Load
          severity: Warning
        annotations:
          summary: "{{ $labels.appname }}"
          description: "Host 5-minute load average is above 4."
          value: "{{ $value }}"
      - alert: HostLoad
        expr: node_load1 > 10
        for: 1m
        labels:
          name: Load
          severity: Warning
        annotations:
          summary: "{{ $labels.appname }}"
          description: "Host 1-minute load average is above 10."
          value: "{{ $value }}"
      - alert: HostFilesystemUsage
        expr: 1 - (node_filesystem_free_bytes / node_filesystem_size_bytes) > 0.8
        for: 1m
        labels:
          name: Disk
          severity: Warning
        annotations:
          summary: "{{ $labels.appname }}"
          description: "Host partition [ {{ $labels.mountpoint }} ] is more than 80% full."
          value: "{{ $value }}%"
      - alert: HostDiskio
        expr: irate(node_disk_writes_completed_total{job=~"Host"}[1m]) > 10
        for: 1m
        labels:
          name: Diskio
          severity: Warning
        annotations:
          summary: "{{ $labels.appname }}"
          description: "Host disk [{{ $labels.device }}] 1-minute average write IO load is high."
          value: "{{ $value }}iops"
      - alert: Network_receive
        expr: irate(node_network_receive_bytes_total{device!~"lo|bond[0-9]|eth[0-9]|cbr[0-9]|veth.|virbr.|ovs-system"}[5m]) / 1048576 > 3
        for: 1m
        labels:
          name: Network_receive
          severity: Warning
        annotations:
          summary: "{{ $labels.appname }}"
          description: "Host NIC [{{ $labels.device }}] 5-minute average receive traffic exceeds 3MB/s."
          value: "{{ $value }}MB/s"
      - alert: Network_transmit
        expr: irate(node_network_transmit_bytes_total{device!~"lo|bond[0-9]|eth[0-9]|cbr[0-9]|veth.|virbr.|ovs-system"}[5m]) / 1048576 > 3
        for: 1m
        labels:
          name: Network_transmit
          severity: Warning
        annotations:
          summary: "{{ $labels.appname }}"
          description: "Host NIC [{{ $labels.device }}] 5-minute average transmit traffic exceeds 3MB/s."
          value: "{{ $value }}MB/s"
  - name: Container
    rules:
      - alert: ContainerCPUUsage
        expr: (sum by(name,instance) (rate(container_cpu_usage_seconds_total{image!=""}[5m]))*100) > 60
        for: 1m
        labels:
          name: CPU
          severity: Warning
        annotations:
          summary: "{{ $labels.name }}"
          description: "Container CPU usage is above 60%."
          value: "{{ $value }}%"
      - alert: ContainerMemUsage
        expr: container_memory_usage_bytes{name=~".+"} / 1048576 > 1024
        for: 1m
        labels:
          name: Memory
          severity: Warning
        annotations:
          summary: "{{ $labels.name }}"
          description: "Container memory usage is above 1GB."
          value: "{{ $value }}MB"
  - name: Kafka
    rules:
      - alert: kafka_lag
        expr: kafka_consumergroup_lag > 180
        for: 1m
        labels:
          severity: Warning
        annotations:
          description: "Consumer group lag is greater than 180."
          value: "{{ $value }}"
  - name: Redis
    rules:
      - alert: rejected_connections
        expr: redis_rejected_connections_total > 0
        for: 1m
        labels:
          severity: Warning
        annotations:
          description: "Redis has reached its connection limit and is rejecting connections."
          value: "{{ $value }}"
      - alert: blocked_clients
        expr: irate(redis_blocked_clients[5m]) > 10
        for: 1m
        labels:
          severity: Warning
        annotations:
          description: "Redis is single-threaded; more than 10 clients blocked over 5 minutes."
          value: "{{ $value }}"
      - alert: slave
        expr: redis_connected_slaves == 1
        for: 1m
        labels:
          severity: Warning
        annotations:
          description: "A Redis slave is down."
          value: "{{ $value }}"
  - name: ES
    rules:
      - alert: es_cluster_node
        expr: es_cluster_nodes_number < 3
        for: 1m
        labels:
          severity: Warning
        annotations:
          description: "ES cluster node count is abnormal."
          value: "{{ $value }}"
      - alert: es_cluster_datanodes_number
        expr: es_cluster_datanodes_number < 3
        for: 1m
        labels:
          severity: Warning
        annotations:
          description: "ES cluster data node count is abnormal."
          value: "{{ $value }}"
      - alert: EsMemoryUsage
        expr: es_os_mem_used_bytes / es_os_mem_total_bytes * 100 > 80
        for: 1m
        labels:
          severity: Warning
        annotations:
          description: "ES memory usage is above 80%."
          value: "{{ $value }}"
      - alert: EsCpuUsage
        expr: es_os_cpu_percent > 0.6
        for: 1m
        labels:
          severity: Warning
        annotations:
          description: "ES CPU usage is above 60%."
          value: "{{ $value }}"
  - name: web
    rules:
      - alert: basevisitor
        expr: basevisitor != 200
        for: 1m
        labels:
          severity: Warning
        annotations:
          description: "basevisitor is abnormal"
          value: "{{ $value }}"
      - alert: km
        expr: km != 200
        for: 1m
        labels:
          severity: Warning
        annotations:
          description: "km is abnormal"
          value: "{{ $value }}"
      - alert: gtower
        expr: gtower != 200
        for: 1m
        labels:
          severity: Warning
        annotations:
          description: "gtower is abnormal"
          value: "{{ $value }}"
      - alert: im03
        expr: im03 != 200
        for: 1m
        labels:
          severity: Warning
        annotations:
          description: "im03 is abnormal"
          value: "{{ $value }}"
      - alert: immonitor
        expr: immonitor != 200
        for: 1m
        labels:
          severity: Warning
        annotations:
          description: "immonitor is abnormal"
          value: "{{ $value }}"
      - alert: volcano
        expr: volcano != 200
        for: 1m
        labels:
          severity: Warning
        annotations:
          description: "volcano is abnormal"
          value: "{{ $value }}"
      - alert: kfonline
        expr: kfonline != 302
        for: 1m
        labels:
          severity: Warning
        annotations:
          description: "kfonline is abnormal"
          value: "{{ $value }}"
      - alert: ocs
        expr: ocs != 403
        for: 1m
        labels:
          severity: Warning
        annotations:
          description: "ocs is abnormal"
          value: "{{ $value }}"
      - alert: fliter
        expr: fliter != 200
        for: 1m
        labels:
          severity: Warning
        annotations:
          description: "fliter is abnormal"
          value: "{{ $value }}"
  - name: mongo
    rules:
      - alert: cluster
        expr: mongodb_mongod_replset_my_state != 2
        for: 1m
        labels:
          severity: Warning
        annotations:
          description: "MongoDB replica set state is abnormal"
          value: "{{ $value }}"

Restart Prometheus so it loads the alerting rules:

docker run -d --restart=always \
  -v /etc/localtime:/etc/localtime \
  -v /data/monitor/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml \
  -v /data/monitor/prometheus/rules.yml:/etc/prometheus/rules.yml \
  --name prometheus \
  --net=host docker.io/wang049718/prometheus --web.enable-lifecycle --config.file=/etc/prometheus/prometheus.yml
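Optionally validate the rule file with promtool before (re)starting Prometheus; this sketch assumes the image is based on the official prom/prometheus image, which ships promtool:

sudo docker run --rm -v /data/monitor/prometheus/rules.yml:/rules.yml \
  --entrypoint promtool docker.io/wang049718/prometheus check rules /rules.yml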

Start the alerting service (Alertmanager). Its configuration:

cat alertmanager.yml
global:
  resolve_timeout: 2m
  smtp_smarthost: smtp.163.com:25
  smtp_from: wang049718@163.com
  smtp_auth_username: wang049718@163.com
  smtp_auth_password: wang049718

templates:                          # message templates
  - '/etc/alertmanager/template/wechat.tmpl'

route:
  group_by: ['alertname_wechat']
  group_wait: 30s
  group_interval: 60s
  receiver: 'email'                 # default receiver
  repeat_interval: 1h
  routes:                           # child route that delivers via email
    - receiver: email
      match_re:
        severity: email

receivers:
  - name: 'email'
    email_configs:
      - to: '1715498045@qq.com'
        send_resolved: true         # also notify when an alert is resolved

wechat.tmpl:

[root@localhost wang]# cat wechat.tmpl
{{ define "wechat.default.message" }}
{{ range $i, $alert := .Alerts }}
======== Monitoring alert ==========
Status:      {{ .Status }}
Severity:    {{ $alert.Labels.severity }}
Alert name:  {{ $alert.Labels.alertname }}
Application: {{ $alert.Annotations.summary }}
Host:        {{ $alert.Labels.instance }}
Details:     {{ $alert.Annotations.description }}
Value:       {{ $alert.Annotations.value }}
Started at:  {{ $alert.StartsAt.Format "2006-01-02 15:04:05" }}
======== end =============
{{ end }}
{{ end }}

docker run -d -p 9093:9093 --name alertmanager \
  --restart always \
  -v /etc/localtime:/etc/localtime \
  -v /home/wang/alertmanager.yml:/etc/alertmanager/alertmanager.yml \
  -v /home/wang:/etc/alertmanager/template \
  docker.io/wang049718/alertmanager
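To check the notification path end to end, a test alert can be pushed straight to Alertmanager's v2 API (the alert name and labels here are made up for testing):

curl -X POST http://localhost:9093/api/v2/alerts -H "Content-Type: application/json" -d '[
  {"labels": {"alertname": "TestAlert", "severity": "Warning", "instance": "test-host"},
   "annotations": {"summary": "test", "description": "manual test alert", "value": "0"}}
]'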

6. Monitoring MongoDB

docker run -itd --net=host -v /etc/localtime:/etc/localtime --name mongo mongo --auth

docker exec -it mongo mongo
use admin
db.createUser({
  user: 'admin',
  pwd: '123456',
  roles: [
    { role: "userAdminAnyDatabase", db: "admin" },
    { role: "dbAdminAnyDatabase", db: "admin" },
    { role: "root", db: "admin" }
  ]
})

docker exec -it mongo mongo -uadmin -p123456

use col
db.col.insert({
  title: 'MongoDB 教程',
  description: 'MongoDB 是一个 Nosql 数据库',
  by: '菜鸟教程',
  url: 'http://www.runoob.com',
  tags: ['mongodb', 'database', 'NoSQL'],
  likes: 100
})

sudo docker run -d --name mongo-explorer \
  --restart=always \
  -p 9105:9104 \
  --cpuset-cpus=2,1 \
  -m 300m \
  -v /etc/localtime:/etc/localtime \
  docker.io/wang049718/mongo --mongodb.uri "mongodb://admin:bdpass9937465@172.21.10.17:27017"
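The exporter's container port 9104 is published on host port 9105; Prometheus still needs a scrape job for it. A sketch of the block to add to prometheus.yml (exporterIP is a placeholder for the host running mongo-explorer):

  - job_name: 'mongodb'
    static_configs:
      - targets: ['exporterIP:9105']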

The Grafana dashboard template for MongoDB is imported as a JSON file.

7. Monitoring MySQL

sudo docker run -d --restart=always \
  --net=host \
  --cpuset-cpus=0,1 \
  -m 1200m \
  -e DATA_SOURCE_NAME="monitoring:monitoring@(172.21.10.22:3306)/" \
  -v /etc/localtime:/etc/localtime \
  --name mysql_exporter \
  docker.io/wang049718/mysqld-exporter:latest

8. Monitoring Redis

docker stop redis-server
docker rm redis-server
docker run -d --name redis-server -p 6379:6379 \
  -v /etc/localtime:/etc/localtime \
  -v /home/redis:/data \
  --restart always redis \
  --requirepass "123456" --appendonly yes

docker run -d --name redis_exporter \
  -p 9121:9121 \
  -v /etc/localtime:/etc/localtime \
  --restart always docker.io/wang049718/redis_exporter \
  --redis.addr redis://1.1.1.4:6379 -redis.password 123456

Alternatively, run the exporter with host networking and resource limits (here against a Redis instance without a password):

sudo docker run -d --name redis_exporter \
  --net=host \
  -v /etc/localtime:/etc/localtime \
  --cpuset-cpus=0,1 \
  -m 1200m \
  --restart always \
  docker.io/wang049718/redis_exporter \
  --redis.addr redis://172.21.10.11:6379
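A quick way to confirm the MySQL and Redis exporters are up (ports are the defaults used above):

# mysqld_exporter runs with --net=host and listens on its default port 9104
curl -s http://localhost:9104/metrics | grep '^mysql_up'
# redis_exporter is published on 9121
curl -s http://localhost:9121/metrics | grep '^redis_up'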

9. Monitoring Kafka

sudo docker stop kafka
sudo docker rm kafka
sudo docker run -d --restart=always \
  --cpuset-cpus=3,2 \
  -m 1200m \
  -p 9308:9308 \
  -v /etc/localtime:/etc/localtime \
  --name kafka \
  docker.io/wang049718/kafka_exporter \
  /kafka_exporter-1.2.0.linux-amd64/kafka_exporter --kafka.server=172.21.10.4:9092
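Port 9308 matches the 'kafka' job already defined in prometheus.yml. To verify the exporter is talking to the broker:

curl -s http://localhost:9308/metrics | grep '^kafka_brokers'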

The Grafana dashboard template for Kafka is imported as a JSON file.

10. Pushgateway

sudo docker stop pushgateway
sudo docker rm pushgateway
sudo docker run -d --restart=always \
  -v /etc/localtime:/etc/localtime \
  --name pushgateway -p 9091:9091 \
  docker.io/wang049718/pushgateway:latest

Visit port 9091 (http://pushgatewayIP:9091) to open the Pushgateway web UI.

Open the Prometheus configuration file and add a scrape job for the Pushgateway:

  - job_name: 'pushgateway'
    honor_labels: true   # keep the job/instance labels pushed by clients; without this,
                         # Prometheus would overwrite the instance label with the Pushgateway's own address
    static_configs:
      - targets: ['pushgatewayIP:9091']

Write test (a single metric):

echo "test 123" | curl --data-binary @- http://localhost:9091/metrics/job/test

This pushes one sample from the monitored machine to the Pushgateway: the metric name is "test" and the value is 123. The URL http://pushgatewayIP:9091/metrics/job/test also declares a job named "test" on the Pushgateway.

The test value can then be queried in the Prometheus graph UI.

API format:

http://pushgatewayIP:9091/metrics/job/<job_name>/<label_name>/<label_value>   (the label name is usually "instance")

Examples:

http://pushgatewayIP:9091/metrics/job/sb/instance/si
http://pushgatewayIP:9091/metrics/job/testjob/abc/pushgateway1
http://pushgatewayIP:9091/metrics/job/testjob/yyy/pushgateway1

Trigger each of these three endpoints, then open the Pushgateway web UI to see the resulting metric groups.

cat <<EOF | curl --data-binary @- http://pushgatewayIP:9091/metrics/job/docker_runtime/instance/xa-lsr-billubuntu
# TYPE docker_runtime counter
docker_runtime{name="cadvisor"} 33
docker_runtime{name="nginx"} 331
docker_runtime{name="abc"} 332
EOF

echo "basevisitor curl -I -m 10 -o /dev/null -s -w %{http_code} http://kf-basevisitor.ziroom.com" | curl --data-binary @- http://localhost:9091/metrics/job/web/instance/basevisitor echo "km curl -I -m 10 -o /dev/null -s -w %{http_code} https://kf-km.ziroom.com/backend/health" | curl --data-binary @- http://localhost:9091/metrics/job/web/instance/km echo "gtower curl -I -m 10 -o /dev/null -s -w %{http_code} http://kf-gtower.ziroom.com:7002" | curl --data-binary @- http://localhost:9091/metrics/job/web/instance/gtower echo "im03 curl -I -m 10 -o /dev/null -s -w %{http_code} http://kf-im03.ziroom.com/health-check" | curl --data-binary @- http://localhost:9091/metrics/job/web/instance/im03 echo "fliter curl -I -m 10 -o /dev/null -s -w %{http_code} http://kf-fliter.ziroom.com" | curl --data-binary @- http://localhost:9091/metrics/job/web/instance/fliter echo "immonitor curl -I -m 10 -o /dev/null -s -w %{http_code} http://kf-immonitor.ziroom.com/health-check" | curl --data-binary @- http://localhost:9091/metrics/job/web/instance/immonitor echo "volcano curl -I -m 10 -o /dev/null -s -w %{http_code} http://kf-volcano.ziroom.com/monitor/group_all_agent" | curl --data-binary @- http://localhost:9091/metrics/job/web/instance/volcano echo "kfonline curl -I -m 10 -o /dev/null -s -w %{http_code} http://kfonline.ziroom.com" | curl --data-binary @- http://localhost:9091/metrics/job/web/instance/kfonline echo "push curl -I -m 10 -o /dev/null -s -w %{http_code} http://kf-push.ziroom.com:7002" | curl --data-binary @- http://localhost:9091/metrics/job/web/instance/push echo "ocs curl -I -m 10 -o /dev/null -s -w %{http_code} http://kf-ocs.ziroom.com/minio/login" | curl --data-binary @- http://localhost:9091/metrics/job/web/instance/ocs 11.监控es

11. Monitoring Elasticsearch

Install the Prometheus exporter plugin on each Elasticsearch node:

/usr/local/services/elasticsearch/bin/elasticsearch-plugin install file:///home/webuser/package/elasticsearch-prometheus-exporter-5.6.4.0.zip

Restart Elasticsearch after installing the plugin. On the Prometheus server side, add the following scrape job:

  - job_name: elasticsearch
    scrape_interval: 5s
    metrics_path: "/_prometheus/metrics"
    file_sd_configs:
      - files:
        - es.yml

The es.yml file sits in the same directory as prometheus.yml:

- targets:
    - 172.21.10.10:9200
    - 172.21.8.49:9200
    - 172.21.10.12:9200
  labels:
    server: c2-jenkins

Import Grafana dashboard id 266.

sudo docker stop prometheus
sudo docker rm prometheus
sudo docker run -d --restart=always \
  -v /etc/localtime:/etc/localtime \
  -v /data/monitor/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml \
  -v /data/monitor/prometheus/es.yml:/etc/prometheus/es.yml \
  -v /data/monitor/prometheus/rules.yml:/etc/prometheus/rules.yml \
  --name prometheus \
  --net=host prom/prometheus --web.enable-lifecycle --config.file=/etc/prometheus/prometheus.yml
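To check that the plugin is actually exposing data on the path Prometheus scrapes (one of the nodes from es.yml as an example):

curl -s http://172.21.10.10:9200/_prometheus/metrics | head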

12. Monitoring Nginx

Build Nginx 1.10.3 with the VTS module. Install the build dependencies:

apt-get -y update
apt-get -y install libpcre3 libpcre3-dev gcc
apt-get -y install openssl libssl-dev libxslt-dev libgd-dev libgeoip-dev

git clone git://github.com/vozlt/nginx-module-vts.git

wget http://nginx.org/download/nginx-1.10.3.tar.gz
tar xvf nginx-1.10.3.tar.gz
cd nginx-1.10.3
./configure --with-cc-opt='-g -O2 -fPIE -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2' --with-ld-opt='-Wl,-Bsymbolic-functions -fPIE -pie -Wl,-z,relro -Wl,-z,now' --prefix=/usr/share/nginx --conf-path=/etc/nginx/nginx.conf --http-log-path=/var/log/nginx/access.log --error-log-path=/var/log/nginx/error.log --lock-path=/var/lock/nginx.lock --pid-path=/run/nginx.pid --http-client-body-temp-path=/var/lib/nginx/body --http-fastcgi-temp-path=/var/lib/nginx/fastcgi --http-proxy-temp-path=/var/lib/nginx/proxy --http-scgi-temp-path=/var/lib/nginx/scgi --http-uwsgi-temp-path=/var/lib/nginx/uwsgi --with-debug --with-pcre-jit --with-ipv6 --with-http_ssl_module --with-http_stub_status_module --with-http_realip_module --with-http_auth_request_module --with-http_addition_module --with-http_dav_module --with-http_geoip_module --with-http_gunzip_module --with-http_gzip_static_module --with-http_image_filter_module --with-http_v2_module --with-http_sub_module --with-http_xslt_module --with-stream --with-stream_ssl_module --with-mail --with-mail_ssl_module --with-threads --add-module=/home/webuser/nginx-module-vts

make && make install
make upgrade

rm -rf /usr/sbin/nginx
ln -s /usr/share/nginx/sbin/nginx /usr/sbin/
nginx -V

In the http block of nginx.conf:

http {
    vhost_traffic_status_zone;
    vhost_traffic_status_filter_by_host on;
    ...
}

In a server block, expose the status page (do not use localhost as the address, otherwise it does not take effect):

server {
    ...
    location /status {
        vhost_traffic_status_display;
        vhost_traffic_status_display_format html;
    }
}

wget -c https://github.com/hnlq715/nginx-vts-exporter/releases/download/v0.9.1/nginx-vts-exporter-0.9.1.linux-amd64.tar.gz
tar -xvf nginx-vts-exporter-0.9.1.linux-amd64.tar.gz -C /usr/local/
cd /usr/local/nginx-vts-exporter-0.9.1.linux-amd64/
./nginx-vts-exporter -nginx.scrape_uri http://172.21.10.3:7002/status/format/json &

The exporter listens on port 9913.

Or run the exporter as a container:

sudo docker stop nginx-vts-exporter
sudo docker rm nginx-vts-exporter
sudo docker run -d --name nginx-vts-exporter \
  --restart=always \
  -p 9913:9913 \
  --cpuset-cpus=2,1 \
  -e nginx.scrape_uri='http://172.21.10.3:7002/status/format/json' \
  -e NGINX_HOST=http://172.21.10.3:7002 \
  -m 300m \
  -v /etc/localtime:/etc/localtime \
  docker.io/sophos/nginx-vts-exporter

Check the exporter's data on port 9913: http://1.1.1.4:9913/metrics
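Prometheus also needs a scrape job for this exporter; a sketch of the block to add to prometheus.yml before reloading:

  - job_name: 'nginx'
    static_configs:
      - targets: ['1.1.1.4:9913']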

Grafana configuration: import dashboard id 2949.