01 在prometheus record机器上 安装confd

  • 下载 带分片功能的confd二进制
wget https://github.com/ning1875/confd/releases/download/v0.16.0/confd_shard-0.16.0-linux-amd64.tar.gz
  • 解压后将confd二进制放到 /usr/bin/ 下(后面的systemd service使用的路径是 /usr/bin/confd)
  • 创建目录
mkdir -p /etc/confd/{conf.d,templates}
  • 主配置文件/etc/confd/conf.d/records.yml.toml ,注意dest要和你的prometheus目录一致
cat <<-"EOF"  > /etc/confd/conf.d/records.yml.toml
[template] 
prefix = "/prometheus"
src = "records.yml.tmpl"
dest = "/opt/app/prometheus/confd_record.yml"
#shards=3
#num=0
keys = [
    "/records"
]
reload_cmd = "curl -X POST http://localhost:9090/-/reload"


EOF
  • shards代表分片总数,num代表当前实例是第几个分片(从0开始编号,num=0即第一个分片)
  • record模板文件 /etc/confd/templates/records.yml.tmpl

每个record单独的group分组,好处是互相不影响,缺点是group过多

cat <<-"EOF"  > /etc/confd/templates/records.yml.tmpl
groups:
{{range gets "/records/*"}}{{$item := json .Value}}
- name: {{$item.record}}
  rules:	
  - record: {{$item.record}}
    expr: {{$item.expr}}
{{end}}
EOF

所有record使用同一个group分组(与上面的模板二选一,两者写入同一个模板文件),同一group内的record规则会按顺序依次执行

cat <<-"EOF"  > /etc/confd/templates/records.yml.tmpl
groups:
- name: confd_record
  interval: 30s
  rules:{{range gets "/records/*"}}{{$item := json .Value}}
  - record: {{$item.record}}
    expr: {{$item.expr}}{{end}}
EOF

指定consul backend 启动confd

  • onetime代表运行一次
confd -onetime --backend consul --node localhost:8500 --log-level debug
cat <<EOF>  /etc/systemd/system/confd.service
[Unit]
Description=confd server
Wants=network-online.target
After=network-online.target

[Service]
ExecStart=/usr/bin/confd  --backend consul --node 172.20.70.205:8500 --log-level debug -interval=30
StandardOutput=syslog
StandardError=syslog
SyslogIdentifier=confd
[Install]
WantedBy=default.target
EOF

# 启动服务
systemctl daemon-reload && systemctl start confd   

systemctl status confd

02 中控机上部署consul redis ansible

consul 安装

准备工作
# 下载consul
wget -O /opt/tgzs/consul_1.9.4_linux_amd64.zip  https://releases.hashicorp.com/consul/1.9.4/consul_1.9.4_linux_amd64.zip 

cd /opt/tgzs/
unzip consul_1.9.4_linux_amd64.zip

/bin/cp -f consul /usr/bin/
启动单机版consul
# 
mkdir -p /opt/app/consul

# 准备配置文件
cat <<EOF > /opt/app/consul/single_server.json
{
    "datacenter": "dc1",
    "node_name": "consul-svr-01",
    "server": true,
    "bootstrap_expect": 1,
    "data_dir": "/opt/app/consul/",
    "log_level": "INFO",
    "log_file": "/opt/logs/",
    "ui": true,
    "bind_addr": "0.0.0.0",
    "client_addr": "0.0.0.0",
    "retry_interval": "10s",
    "raft_protocol": 3,
    "enable_debug": false,
    "rejoin_after_leave": true,
    "enable_syslog": false
}
EOF

# 多个ip地址时,将bind_addr 改为一个内网的ip

# 写入service文件
cat <<EOF > /etc/systemd/system/consul.service
[Unit]
Description=consul server
Wants=network-online.target
After=network-online.target

[Service]
ExecStart=/usr/bin/consul agent  -config-file=/opt/app/consul/single_server.json
StandardOutput=syslog
StandardError=syslog
SyslogIdentifier=consul
[Install]
WantedBy=default.target
EOF

# 启动服务
systemctl daemon-reload && systemctl start consul   

systemctl status consul
验证访问

03 将pre_query 放到中控机上

  • all_prome_query 中的prometheus query ip改为自己的
  • prometheus query 开启query log
global:
  query_log_file: /App/logs/prometheus_query.log
  • config.yaml 填写相关配置项

04 执行pre_query中的分析record命令

将promtool 复制到当前目录用作 record promql的check

  • /bin/cp -f /opt/app/prometheus/promtool .

pre_query目录下执行ansible命令

ansible-playbook -i all_prome_query  prome_heavy_expr_parse.yaml

检查本地record yaml

[root@k8s-master01 pre_query]# ll local_record_yml_dir/
total 12
-rw-r--r-- 1 root root  551 Sep 13 15:53 record_2_2021-09-13.yml
-rw-r--r-- 1 root root 5455 Sep 13 15:53 record_26_2021-09-13.yml
[root@k8s-master01 pre_query]# head local_record_yml_dir/record_26_2021-09-13.yml 
groups:
- name: heavy_expr_record
  rules:
  - record: hke:heavy_expr:082a631dfddb7cf65ddd0fb4923ab17e
    expr: rate(mysql_global_status_sort_scan{instance=~"172.20.70.205:9104"}[5s])
      or irate(mysql_global_status_sort_scan{instance=~"172.20.70.205:9104"}[5m])
  - record: hke:heavy_expr:1416fc3de389e2a5c36aa5c8c376391f
    expr: mysql_global_status_threads_cached{instance=~"172.20.70.205:9104"}
  - record: hke:heavy_expr:14e8a540527123cc11ad96c5faa03f43
    expr: irate(mysql_slave_status_relay_log_pos{instance=~"172.20.70.205:9104"}[5m])

检查consul中的记录

curl http://localhost:8500/v1/kv/prometheus/record?recurse= |python -m json.tool
    {
        "CreateIndex": 585468,
        "Flags": 0,
        "Key": "prometheus/records/6",
        "LockIndex": 0,
        "ModifyIndex": 585468,
        "Value": "eyJyZWNvcmQiOiAiaGtlOmhlYXZ5X2V4cHI6MjY1YzUwMzMxZjRiNzk4MzRjMzc1MDY2ZTY2NWQ4NDYiLCAiZXhwciI6ICJyYXRlKG15c3FsX2dsb2JhbF9zdGF0dXNfY3JlYXRlZF90bXBfdGFibGVze2luc3RhbmNlPX5cIjE3Mi4yMC43MC4yMDU6OTEwNFwifVs1c10pIG9yIGlyYXRlKG15c3FsX2dsb2JhbF9zdGF0dXNfY3JlYXRlZF90bXBfdGFibGVze2luc3RhbmNlPX5cIjE3Mi4yMC43MC4yMDU6OTEwNFwifVs1bV0pIn0="
    },
    {
        "CreateIndex": 585468,
        "Flags": 0,
        "Key": "prometheus/records/7",
        "LockIndex": 0,
        "ModifyIndex": 585468,
        "Value": "eyJyZWNvcmQiOiAiaGtlOmhlYXZ5X2V4cHI6MjZkODYwNzY4NzcxOTUyOTc3ZGNiZjUzYzU3ZWZhNTUiLCAiZXhwciI6ICJyYXRlKG15c3FsX2dsb2JhbF9zdGF0dXNfcXVlcmllc3tpbnN0YW5jZT1+XCIxNzIuMjAuNzAuMjA1OjkxMDRcIn1bNXNdKSBvciBpcmF0ZShteXNxbF9nbG9iYWxfc3RhdHVzX3F1ZXJpZXN7aW5zdGFuY2U9flwiMTcyLjIwLjcwLjIwNTo5MTA0XCJ9WzVtXSkifQ=="
    },

检测部署了confd的 prometheus record 上的record文件内容

[root@k8s-master01 pre_query]# cat /opt/app/prometheus/confd_record.yml  |head 
groups:

- name: hke:heavy_expr:082a631dfddb7cf65ddd0fb4923ab17e
  rules:
  - record: hke:heavy_expr:082a631dfddb7cf65ddd0fb4923ab17e
    expr: rate(mysql_global_status_sort_scan{instance=~"172.20.70.205:9104"}[5s]) or irate(mysql_global_status_sort_scan{instance=~"172.20.70.205:9104"}[5m])

- name: hke:heavy_expr:4b93ce0bd3db2848e1b6d330a03272f7
  rules:
  - record: hke:heavy_expr:4b93ce0bd3db2848e1b6d330a03272f7

prometheus record页面上检查 聚合规则并查询数据

  • 截图

检查redis中的key

[root@k8s-master01 pre_query]# redis-cli keys "hke:heavy_expr*"
 1) "hke:heavy_expr:bc7775bb5e33bf84afa9a1d4c0c45a9a"
 2) "hke:heavy_expr:de2548ae6a00a90b1c2f85f8d6d9f13b"
 3) "hke:heavy_expr:d86e3aa799b6a84790e133aa8a306e96"
 4) "hke:heavy_expr:4fe8ee091e7823b66b475ba05b5fd030"
 5) "hke:heavy_expr:b96a96befac765f6c00743a82ffae053"
 6) "hke:heavy_expr:513ddfbf6f83d1ba1dd9b0b4a21a43bf"
 7) "hke:heavy_expr:2998d2677fc1873a0e46802cbdd1bfee"
 8) "hke:heavy_expr:22ccf0a71b6651763d1b7c16f5c05365"
 9) "hke:heavy_expr:0d8c4be4ea8dccb9f06389246a02c6b3"
10) "hke:heavy_expr:f30b7b481bb0fdee0466902b9abb3b35"
11) "hke:heavy_expr:298afe40c3479e217b0b0b3666bd6904"
12) "hke:heavy_expr:bebca671decc9d5954af35628a05baa2"
13) "hke:heavy_expr:db9f0c1be81f91c95d9eb617ab70da36"
14) "hke:heavy_expr:45d5dc64bef02cf3f515481747cccd80"
15) "hke:heavy_expr:d797f93ad8ec0f7c80a5617eb5e4f3d8"
16) "hke:heavy_expr:eb1637bfe8f1388e99659d4621a79367"
17) "hke:heavy_expr:26d860768771952977dcbf53c57efa55"
18) "hke:heavy_expr:25bc18bd90a1a69d950802d937d337a0"
19) "hke:heavy_expr:d8aaf244a86fcfae8e51aeeb6935a5a5"
20) "hke:heavy_expr:189831b5aaa2d688c49a9c717fbf8b3d"

05 confd分片功能演示

默认不开启分片 ,shards 和num注释掉就可以

  • confd配置文件 /etc/confd/conf.d/records.yml.toml
[template] 
prefix = "/prometheus"
src = "records.yml.tmpl"
dest = "/opt/app/prometheus/confd_record.yml"
#shards=2
#num=0
keys = [
    "/records"
]
reload_cmd = "curl -X POST http://localhost:9090/-/reload"
  • prometheus record 通过的结果 46个
[root@k8s-master01 conf.d]# confd -onetime --backend consul --node localhost:8500 
2021-09-13T16:45:15+08:00 k8s-master01 confd[30010]: INFO Backend set to consul
2021-09-13T16:45:15+08:00 k8s-master01 confd[30010]: INFO Starting confd
2021-09-13T16:45:15+08:00 k8s-master01 confd[30010]: INFO Backend source(s) set to localhost:8500
2021-09-13T16:45:15+08:00 k8s-master01 confd[30010]: INFO t.shards:0,t.nums:0
[root@k8s-master01 conf.d]# /opt/app/prometheus/promtool check rules   /opt/app/prometheus/confd_record.yml 
Checking /opt/app/prometheus/confd_record.yml
  SUCCESS: 46 rules found

开启分片 配置 shards=2 num=0 代表 2分片中的第一个

  • confd配置文件 /etc/confd/conf.d/records.yml.toml
[template] 
prefix = "/prometheus"
src = "records.yml.tmpl"
dest = "/opt/app/prometheus/confd_record.yml"
shards=2
num=0
keys = [
    "/records"
]
reload_cmd = "curl -X POST http://localhost:9090/-/reload"
  • prometheus record 通过的结果 23个
[root@k8s-master01 conf.d]# confd -onetime --backend consul --node localhost:8500                               
2021-09-13T16:47:16+08:00 k8s-master01 confd[32350]: INFO Backend set to consul
2021-09-13T16:47:16+08:00 k8s-master01 confd[32350]: INFO Starting confd
2021-09-13T16:47:16+08:00 k8s-master01 confd[32350]: INFO Backend source(s) set to localhost:8500
2021-09-13T16:47:16+08:00 k8s-master01 confd[32350]: INFO t.shards:2,t.nums:0
2021-09-13T16:47:16+08:00 k8s-master01 confd[32350]: INFO /opt/app/prometheus/confd_record.yml has md5sum a0c39c7a73d741ec911b64a6eb5d1b8c should be 50ad6045ba32557c64037702bbc2613c
2021-09-13T16:47:16+08:00 k8s-master01 confd[32350]: INFO Target config /opt/app/prometheus/confd_record.yml out of sync
2021-09-13T16:47:16+08:00 k8s-master01 confd[32350]: INFO Target config /opt/app/prometheus/confd_record.yml has been updated
[root@k8s-master01 conf.d]# /opt/app/prometheus/promtool check rules   /opt/app/prometheus/confd_record.yml                                      
Checking /opt/app/prometheus/confd_record.yml
  SUCCESS: 23 rules found

[root@k8s-master01 conf.d]#

06 openresty和lua组件,新增grafana数据源

安装openresty ,准备lua环境

yum install yum-utils -y
yum-config-manager --add-repo https://openresty.org/package/centos/openresty.repo
yum install openresty openresty-resty -y

修改信息

  • 修改prome_redirect.lua 文件中的 27 行 localhost redis地址为你自己的
  • 修改ngx_prome_redirect.conf文件中 真实real_prometheus后端,使用前请修改

将nginx配置和lua文件放到指定目录

mkdir -pv /usr/local/openresty/nginx/conf/conf.d/
mkdir -pv /usr/local/openresty/nginx/lua_files/
/bin/cp -f  ngx_prome_redirect.conf /usr/local/openresty/nginx/conf/conf.d/
/bin/cp -f  nginx.conf /usr/local/openresty/nginx/conf/
/bin/cp -f prome_redirect.lua /usr/local/openresty/nginx/lua_files/

启动openresty

systemctl enable openresty
systemctl start openresty

请求OpenResty 9992端口 ,出现/graph则正常

[root@k8s-master01 pre_query]# curl localhost:9992/
<a href="/graph">Found</a>.

openresty查看日志

tail -f /usr/local/openresty/nginx/logs/access.log

修改grafana数据源,将原来的指向真实prometheus地址改为指向openresty的9992端口

  • 截图

之前查询慢的大盘导出一份,再导入,选择新的9992数据源 查看对比

  • 截图

运维指南

# 查看redis中的heavy_query记录
redis-cli -h $redis_host   keys hke:heavy_expr*
# 查看consul中的heavy_query记录
curl http://$consul_addr:8500/v1/kv/prometheus/record?recurse= |python -m json.tool
# 根据一个heavy_record文件恢复记录
python3 recovery_by_local_yaml.py local_record_yml/record_to_keep.yml
# 根据一个metric_name前缀删除record记录
bash -x recovery_heavy_metrics.sh  $metric_name

总结

  • 使用OpenResty的数据源 不会影响未配置预聚合的图
  • 因为只是nginx代理了一下,如果redis中没有要替换的expr就会以原查询ql查询