01 在prometheus record机器上 安装confd
- 下载 带分片功能的confd二进制
wget https://github.com/ning1875/confd/releases/download/v0.16.0/confd_shard-0.16.0-linux-amd64.tar.gz
- 创建目录
mkdir -p /etc/confd/{conf.d,templates}
- 主配置文件/etc/confd/conf.d/records.yml.toml ,注意dest要和你的prometheus目录一致
cat <<-"EOF" > /etc/confd/conf.d/records.yml.toml
[template]
prefix = "/prometheus"
src = "records.yml.tmpl"
dest = "/opt/app/prometheus/confd_record.yml"
#shards=3
#num=0
keys = [
"/records"
]
reload_cmd = "curl -X POST http://localhost:9090/-/reload"
EOF
- shards代表分片总数,num代表当前实例是第几个分片(从0开始计数,例如 shards=2 时 num 取 0 或 1)
- record模板文件 /etc/confd/templates/records.yml.tmpl
每个record单独的group分组,好处是互相不影响,缺点是group过多
cat <<-"EOF" > /etc/confd/templates/records.yml.tmpl
groups:
{{range gets "/records/*"}}{{$item := json .Value}}
- name: {{$item.record}}
  rules:
  - record: {{$item.record}}
    expr: {{$item.expr}}
{{end}}
EOF
或者让所有record使用同一个分组:适用于需要按顺序执行record的场景(与上面的模板二选一,两条cat命令写入同一个模板文件,后执行的会覆盖前者)
cat <<-"EOF" > /etc/confd/templates/records.yml.tmpl
groups:
- name: confd_record
  interval: 30s
  rules:{{range gets "/records/*"}}{{$item := json .Value}}
  - record: {{$item.record}}
    expr: {{$item.expr}}{{end}}
EOF
指定consul backend 启动confd
- onetime代表运行一次
confd -onetime --backend consul --node localhost:8500 --log-level debug
cat <<EOF> /etc/systemd/system/confd.service
[Unit]
Description=confd server
Wants=network-online.target
After=network-online.target
[Service]
ExecStart=/usr/bin/confd --backend consul --node 172.20.70.205:8500 --log-level debug -interval=30
StandardOutput=syslog
StandardError=syslog
SyslogIdentifier=confd
[Install]
WantedBy=default.target
EOF
# 启动服务
systemctl daemon-reload && systemctl start confd
systemctl status confd
02 中控机上部署consul redis ansible
consul 安装
准备工作
# 下载consul
mkdir -p /opt/tgzs
wget -O /opt/tgzs/consul_1.9.4_linux_amd64.zip https://releases.hashicorp.com/consul/1.9.4/consul_1.9.4_linux_amd64.zip
cd /opt/tgzs/
unzip consul_1.9.4_linux_amd64.zip
/bin/cp -f consul /usr/bin/
启动单机版consul
#
mkdir -p /opt/app/consul
# 准备配置文件
cat <<EOF > /opt/app/consul/single_server.json
{
"datacenter": "dc1",
"node_name": "consul-svr-01",
"server": true,
"bootstrap_expect": 1,
"data_dir": "/opt/app/consul/",
"log_level": "INFO",
"log_file": "/opt/logs/",
"ui": true,
"bind_addr": "0.0.0.0",
"client_addr": "0.0.0.0",
"retry_interval": "10s",
"raft_protocol": 3,
"enable_debug": false,
"rejoin_after_leave": true,
"enable_syslog": false
}
EOF
# 多个ip地址时,将bind_addr 改为一个内网的ip
# 写入service文件
cat <<EOF > /etc/systemd/system/consul.service
[Unit]
Description=consul server
Wants=network-online.target
After=network-online.target
[Service]
ExecStart=/usr/bin/consul agent -config-file=/opt/app/consul/single_server.json
StandardOutput=syslog
StandardError=syslog
SyslogIdentifier=consul
[Install]
WantedBy=default.target
EOF
# 启动服务
systemctl daemon-reload && systemctl start consul
systemctl status consul
验证访问
03 将pre_query 放到中控机上
- all_prome_query 中的prometheus query ip改为自己的
- prometheus query 开启query log
global:
  query_log_file: /App/logs/prometheus_query.log
- config.yaml 填写相关配置项
04 执行pre_query中的分析record命令
将promtool 复制到当前目录用作 record promql的check
- /bin/cp -f /opt/app/prometheus/promtool .
pre_query目录下执行ansible命令
ansible-playbook -i all_prome_query prome_heavy_expr_parse.yaml
检查本地record yaml
[root@k8s-master01 pre_query]# ll local_record_yml_dir/
total 12
-rw-r--r-- 1 root root 551 Sep 13 15:53 record_2_2021-09-13.yml
-rw-r--r-- 1 root root 5455 Sep 13 15:53 record_26_2021-09-13.yml
[root@k8s-master01 pre_query]# head local_record_yml_dir/record_26_2021-09-13.yml
groups:
- name: heavy_expr_record
rules:
- record: hke:heavy_expr:082a631dfddb7cf65ddd0fb4923ab17e
expr: rate(mysql_global_status_sort_scan{instance=~"172.20.70.205:9104"}[5s])
or irate(mysql_global_status_sort_scan{instance=~"172.20.70.205:9104"}[5m])
- record: hke:heavy_expr:1416fc3de389e2a5c36aa5c8c376391f
expr: mysql_global_status_threads_cached{instance=~"172.20.70.205:9104"}
- record: hke:heavy_expr:14e8a540527123cc11ad96c5faa03f43
expr: irate(mysql_slave_status_relay_log_pos{instance=~"172.20.70.205:9104"}[5m])
检查consul中的记录
curl http://localhost:8500/v1/kv/prometheus/record?recurse= |python -m json.tool
{
"CreateIndex": 585468,
"Flags": 0,
"Key": "prometheus/records/6",
"LockIndex": 0,
"ModifyIndex": 585468,
"Value": "eyJyZWNvcmQiOiAiaGtlOmhlYXZ5X2V4cHI6MjY1YzUwMzMxZjRiNzk4MzRjMzc1MDY2ZTY2NWQ4NDYiLCAiZXhwciI6ICJyYXRlKG15c3FsX2dsb2JhbF9zdGF0dXNfY3JlYXRlZF90bXBfdGFibGVze2luc3RhbmNlPX5cIjE3Mi4yMC43MC4yMDU6OTEwNFwifVs1c10pIG9yIGlyYXRlKG15c3FsX2dsb2JhbF9zdGF0dXNfY3JlYXRlZF90bXBfdGFibGVze2luc3RhbmNlPX5cIjE3Mi4yMC43MC4yMDU6OTEwNFwifVs1bV0pIn0="
},
{
"CreateIndex": 585468,
"Flags": 0,
"Key": "prometheus/records/7",
"LockIndex": 0,
"ModifyIndex": 585468,
"Value": "eyJyZWNvcmQiOiAiaGtlOmhlYXZ5X2V4cHI6MjZkODYwNzY4NzcxOTUyOTc3ZGNiZjUzYzU3ZWZhNTUiLCAiZXhwciI6ICJyYXRlKG15c3FsX2dsb2JhbF9zdGF0dXNfcXVlcmllc3tpbnN0YW5jZT1+XCIxNzIuMjAuNzAuMjA1OjkxMDRcIn1bNXNdKSBvciBpcmF0ZShteXNxbF9nbG9iYWxfc3RhdHVzX3F1ZXJpZXN7aW5zdGFuY2U9flwiMTcyLjIwLjcwLjIwNTo5MTA0XCJ9WzVtXSkifQ=="
},
检测部署了confd的 prometheus record 上的record文件内容
[root@k8s-master01 pre_query]# cat /opt/app/prometheus/confd_record.yml |head
groups:
- name: hke:heavy_expr:082a631dfddb7cf65ddd0fb4923ab17e
rules:
- record: hke:heavy_expr:082a631dfddb7cf65ddd0fb4923ab17e
expr: rate(mysql_global_status_sort_scan{instance=~"172.20.70.205:9104"}[5s]) or irate(mysql_global_status_sort_scan{instance=~"172.20.70.205:9104"}[5m])
- name: hke:heavy_expr:4b93ce0bd3db2848e1b6d330a03272f7
rules:
- record: hke:heavy_expr:4b93ce0bd3db2848e1b6d330a03272f7
prometheus record页面上检查 聚合规则并查询数据
- 截图
检查redis中的key
[root@k8s-master01 pre_query]# redis-cli keys "hke:heavy_expr*"
1) "hke:heavy_expr:bc7775bb5e33bf84afa9a1d4c0c45a9a"
2) "hke:heavy_expr:de2548ae6a00a90b1c2f85f8d6d9f13b"
3) "hke:heavy_expr:d86e3aa799b6a84790e133aa8a306e96"
4) "hke:heavy_expr:4fe8ee091e7823b66b475ba05b5fd030"
5) "hke:heavy_expr:b96a96befac765f6c00743a82ffae053"
6) "hke:heavy_expr:513ddfbf6f83d1ba1dd9b0b4a21a43bf"
7) "hke:heavy_expr:2998d2677fc1873a0e46802cbdd1bfee"
8) "hke:heavy_expr:22ccf0a71b6651763d1b7c16f5c05365"
9) "hke:heavy_expr:0d8c4be4ea8dccb9f06389246a02c6b3"
10) "hke:heavy_expr:f30b7b481bb0fdee0466902b9abb3b35"
11) "hke:heavy_expr:298afe40c3479e217b0b0b3666bd6904"
12) "hke:heavy_expr:bebca671decc9d5954af35628a05baa2"
13) "hke:heavy_expr:db9f0c1be81f91c95d9eb617ab70da36"
14) "hke:heavy_expr:45d5dc64bef02cf3f515481747cccd80"
15) "hke:heavy_expr:d797f93ad8ec0f7c80a5617eb5e4f3d8"
16) "hke:heavy_expr:eb1637bfe8f1388e99659d4621a79367"
17) "hke:heavy_expr:26d860768771952977dcbf53c57efa55"
18) "hke:heavy_expr:25bc18bd90a1a69d950802d937d337a0"
19) "hke:heavy_expr:d8aaf244a86fcfae8e51aeeb6935a5a5"
20) "hke:heavy_expr:189831b5aaa2d688c49a9c717fbf8b3d"
05 confd分片功能演示
默认不开启分片 ,shards 和num注释掉就可以
- confd配置文件 /etc/confd/conf.d/records.yml.toml
[template]
prefix = "/prometheus"
src = "records.yml.tmpl"
dest = "/opt/app/prometheus/confd_record.yml"
#shards=2
#num=0
keys = [
"/records"
]
reload_cmd = "curl -X POST http://localhost:9090/-/reload"
- prometheus record 通过的结果 46个
[root@k8s-master01 conf.d]# confd -onetime --backend consul --node localhost:8500
2021-09-13T16:45:15+08:00 k8s-master01 confd[30010]: INFO Backend set to consul
2021-09-13T16:45:15+08:00 k8s-master01 confd[30010]: INFO Starting confd
2021-09-13T16:45:15+08:00 k8s-master01 confd[30010]: INFO Backend source(s) set to localhost:8500
2021-09-13T16:45:15+08:00 k8s-master01 confd[30010]: INFO t.shards:0,t.nums:0
[root@k8s-master01 conf.d]# /opt/app/prometheus/promtool check rules /opt/app/prometheus/confd_record.yml
Checking /opt/app/prometheus/confd_record.yml
SUCCESS: 46 rules found
开启分片 配置 shards=2 num=0 代表 2分片中的第一个
- confd配置文件 /etc/confd/conf.d/records.yml.toml
[template]
prefix = "/prometheus"
src = "records.yml.tmpl"
dest = "/opt/app/prometheus/confd_record.yml"
shards=2
num=0
keys = [
"/records"
]
reload_cmd = "curl -X POST http://localhost:9090/-/reload"
- prometheus record 通过的结果 23个
[root@k8s-master01 conf.d]# confd -onetime --backend consul --node localhost:8500
2021-09-13T16:47:16+08:00 k8s-master01 confd[32350]: INFO Backend set to consul
2021-09-13T16:47:16+08:00 k8s-master01 confd[32350]: INFO Starting confd
2021-09-13T16:47:16+08:00 k8s-master01 confd[32350]: INFO Backend source(s) set to localhost:8500
2021-09-13T16:47:16+08:00 k8s-master01 confd[32350]: INFO t.shards:2,t.nums:0
2021-09-13T16:47:16+08:00 k8s-master01 confd[32350]: INFO /opt/app/prometheus/confd_record.yml has md5sum a0c39c7a73d741ec911b64a6eb5d1b8c should be 50ad6045ba32557c64037702bbc2613c
2021-09-13T16:47:16+08:00 k8s-master01 confd[32350]: INFO Target config /opt/app/prometheus/confd_record.yml out of sync
2021-09-13T16:47:16+08:00 k8s-master01 confd[32350]: INFO Target config /opt/app/prometheus/confd_record.yml has been updated
[root@k8s-master01 conf.d]# /opt/app/prometheus/promtool check rules /opt/app/prometheus/confd_record.yml
Checking /opt/app/prometheus/confd_record.yml
SUCCESS: 23 rules found
[root@k8s-master01 conf.d]#
06 openresty和lua组件,新增grafana数据源
安装openresty ,准备lua环境
yum install yum-utils -y
yum-config-manager --add-repo https://openresty.org/package/centos/openresty.repo
yum install openresty openresty-resty -y
修改信息
- 修改prome_redirect.lua 文件中的 27 行 localhost redis地址为你自己的
- 修改ngx_prome_redirect.conf文件中的 real_prometheus 后端,改为你自己真实的prometheus地址(使用前必须修改)
将nginx配置和lua文件放到指定目录
mkdir -pv /usr/local/openresty/nginx/conf/conf.d/
mkdir -pv /usr/local/openresty/nginx/lua_files/
/bin/cp -f ngx_prome_redirect.conf /usr/local/openresty/nginx/conf/conf.d/
/bin/cp -f nginx.conf /usr/local/openresty/nginx/conf/
/bin/cp -f prome_redirect.lua /usr/local/openresty/nginx/lua_files/
启动openresty
systemctl enable openresty
systemctl start openresty
请求OpenResty 9992端口 ,出现/graph则正常
[root@k8s-master01 pre_query]# curl localhost:9992/
<a href="/graph">Found</a>.
openresty查看日志
tail -f /usr/local/openresty/nginx/logs/access.log
修改grafana数据源,将原来的指向真实prometheus地址改为指向openresty的9992端口
- 截图
之前查询慢的大盘导出一份,再导入,选择新的9992数据源 查看对比
- 截图
运维指南
# 查看redis中的heavy_query记录
redis-cli -h "$redis_host" keys "hke:heavy_expr*"
# 查看consul中的heavy_query记录
curl http://$consul_addr:8500/v1/kv/prometheus/record?recurse= |python -m json.tool
# 根据一个heavy_record文件恢复记录
python3 recovery_by_local_yaml.py local_record_yml/record_to_keep.yml
# 根据一个metric_name前缀删除record记录
bash -x recovery_heavy_metrics.sh $metric_name
总结
- 使用OpenResty的数据源 不会影响未配置预聚合的图
- 因为只是nginx代理了一下,如果redis中没有要替换的expr就会以原查询ql查询