架构图
Prometheus Server: 收集指标和存储时间序列数据,并提供查询接口
ClientLibrary:客户端库
Push Gateway: 短期存储指标数据。主要用于临时性的任务
Exporters:采集已有的第三方服务监控指标并暴露metrics
Alertmanager:告警
Web UI :简单的web控制台
部署方式
包安装
RHEL系统:https://packagecloud.io/prometheus-rpm/release
Ubuntu和Debian系统可直接使用apt命令安装
二进制安装
https://prometheus.io/download/
基于docker运行
https://prometheus.io/docs/prometheus/latest/installation/
基于kubernetes operator安装
https://github.com/prometheus-operator/kube-prometheus
基于docker方式部署
下载yum源
wget https://mirrors.aliyun.com/docker-ce/linux/centos/docker-ce.repo -O /etc/yum.repos.d/docker-ce.repo
安装docker
yum -y install docker-ce-20.10.12-3.el7
启动docker
systemctl enable --now docker && systemctl status docker
运行prometheus容器
docker run -d --name prometheus -p 9090:9090 prom/prometheus
运行结果
[root@localhost ~]# docker run -d --name prometheus -p 9090:9090 prom/prometheus
Unable to find image 'prom/prometheus:latest' locally
latest: Pulling from prom/prometheus
4399114b4c59: Pull complete
225de5a6f1e7: Pull complete
ebd656a24cc0: Pull complete
e6170f30a38d: Pull complete
5aaacfdc4145: Pull complete
5420b5446248: Pull complete
8a2ab86e3ff1: Pull complete
3dab14bd0e8c: Pull complete
5bce46b63ac9: Pull complete
00cd3198f364: Pull complete
cbd4b912d9ee: Pull complete
7e5bc43c7502: Pull complete
Digest: sha256:d2ab0a27783fd4ad96a8853e2847b99a0be0043687b8a5d1ebfb2dd3fa4fd1b8
Status: Downloaded newer image for prom/prometheus:latest
a10dbec14a8fcc75758c8ba5ff785d17e31b7f6a6c1f148812925ba739f1005f
网页查看监控页面
二进制包部署
下载安装包
wget https://github.com/prometheus/prometheus/releases/download/v2.37.5/prometheus-2.37.5.linux-amd64.tar.gz
移动安装包
mv prometheus-2.37.5.linux-amd64.tar.gz /usr/local/
解压
tar -xvf prometheus-2.37.5.linux-amd64.tar.gz
设置软链接
[root@localhost local]# ln -sv /usr/local/prometheus-2.37.5.linux-amd64 /usr/local/prometheus
‘/usr/local/prometheus’ -> ‘/usr/local/prometheus-2.37.5.linux-amd64’
创建service文件
vim /etc/systemd/system/prometheus.service
[Unit]
Description=Prometheus Server
Documentation=https://prometheus.io/docs/introduction/overview/
After=network.target
[Service]
Restart=on-failure
WorkingDirectory=/usr/local/prometheus/
ExecStart=/usr/local/prometheus/prometheus --config.file=/usr/local/prometheus/prometheus.yml --storage.tsdb.path=/export/prometheus/prometheus.yaml
[Install]
WantedBy=multi-user.target
加载配置文件
[root@localhost prometheus]# systemctl daemon-reload
启动服务开机自启
[root@localhost prometheus]# systemctl start prometheus && systemctl enable prometheus
网页访问
http://IP:9090
Node exporter部署
安装Node exporter用于收集K8S node节点上的监控指标数据,监听端口为9100
下载链接:https://prometheus.io/download/
下载
wget https://github.com/prometheus/node_exporter/releases/download/v1.5.0/node_exporter-1.5.0.linux-amd64.tar.gz
解压安装包
tar -xvf node_exporter-1.5.0.linux-amd64.tar.gz
创建链接
ln -sv node_exporter-1.5.0.linux-amd64 node_exporter
创建service文件
vi /etc/systemd/system/node-exporter.service
[Unit]
Description=Prometheus Node Exporter
After=network.target
[Service]
ExecStart=/root/node_exporter/node_exporter
[Install]
WantedBy=multi-user.target
加载配置文件
[root@centos7-study ~]# systemctl daemon-reload
启动服务开机自启
[root@centos7-study ~]# systemctl restart node-exporter.service && systemctl enable node-exporter.service
prometheus采集node指标数据
配置prometheus通过node-exporter采集node节点的监控指标数据
vim /usr/local/prometheus/prometheus.yml
# my global config
global:
scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
# scrape_timeout is set to the global default (10s).
# Alertmanager configuration
alerting:
alertmanagers:
- static_configs:
- targets:
# - alertmanager:9093
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
# - "first_rules.yml"
# - "second_rules.yml"
# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
# The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
- job_name: "prometheus"
# metrics_path defaults to '/metrics'
# scheme defaults to 'http'.
static_configs:
- targets: ["localhost:9090"]
#global指标说明
global:
scrape_interval: 15s # 默认15s 数据收集的间隔时间,不配置默认1分钟
evaluation_interval: 15s # 规则扫描时间间隔是15秒,默认不填写是 1分钟
scrape_timeout: 10s #超时时间
external_labels: # 用于外部系统标签的,不是用于metrics(度量)数据
#rule_files说明
#设置告警规则
#scrape_configs配置采集目标
添加刚才安装node_exporter节点
- job_name: "prometheus-node"
static_configs:
- targets: ["192.168.160.129:9100","192.168.160.130:9100"]
添加完主机后检查yaml文件是否正确
./promtool check config prometheus.yml
重启prometheus服务
systemctl restart prometheus
登录网页查看监控主机是否添加
blackbox exporter是官方提供的一个exporter,可以通过http、https、dns、tcp和icmp对被监测节点进行监控和数据采集。
http/https:url/api可用性检测
tcp:端口监听检测
icmp:主机存活检测
DNS:域名解析
下载链接:https://prometheus.io/download/
下载
wget https://github.com/prometheus/blackbox_exporter/releases/download/v0.23.0/blackbox_exporter-0.23.0.linux-amd64.tar.gz
解压
tar xvf blackbox_exporter-0.23.0.linux-amd64.tar.gz
链接
ln -sv blackbox_exporter-0.23.0.linux-amd64 blackbox_exporter
创建service文件
vim /etc/systemd/system/blackbox-exporter.service
[Unit]
Description= Prometheus blackbox Exporter
After=network.target
[Service]
Type=simple
User=root
Group=root
ExecStart=/root/blackbox_exporter/blackbox_exporter \
--config.file=/root/blackbox_exporter/blackbox.yml \
--web.listen-address=:9115
Restart=on-failure
[Install]
WantedBy=multi-user.target
加载配置文件
systemctl daemon-reload
启动服务开机自启
systemctl restart blackbox-exporter.service && systemctl enable blackbox-exporter.service
blackbox exporter 实现url监控
编辑prometheus监控主机配置文件
vim /usr/local/prometheus/prometheus.yml
scrape_configs:
# The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
- job_name: "prometheus"
# metrics_path defaults to '/metrics'
# scheme defaults to 'http'.
static_configs:
- targets: ["localhost:9090"]
- job_name: "prometheus-node"
static_configs:
- targets: ["192.168.160.129:9100","192.168.160.130:9100"]
#网站监控
- job_name: 'http_status'
metrics_path: /probe
params:
module: [http_2xx]
static_configs:
- targets: ['http://www.xiaomi.com']
labels:
instance: http_status
group: web
relabel_configs:
- source_labels: [__address__] #relabel通过将__address__(当前目标地址)写入__param_target标签来创建一个label
target_label: __param_target #监控目标xiaomi.com,作为__address__的value
- source_labels: [__param_target] #监控目标
target_label: url #将监控目标与url创建一个label
- target_label: __address__
replacement: 192.168.160.130:9115
重启prometheus
systemctl restart prometheus
登录网页查看
需要其他网站直接添加
- targets: ['http://www.xiaomi.com','http://www.jd.com']
ICMP监控
编辑prometheus监控主机配置文件
vim /usr/local/prometheus/prometheus.yml
#ICMP检测
- job_name: 'ping_status'
metrics_path: /probe
params:
module: [icmp]
static_configs:
- targets: ['192.168.160.130','192.168.160.129']
labels:
instance: ping_status
group: icmp
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: ip
- target_label: __address__
replacement: 192.168.160.130:9115
端口监控
编辑prometheus监控主机配置文件
vim /usr/local/prometheus/prometheus.yml
#端口监控
- job_name: 'port_status'
metrics_path: /probe
params:
module: [tcp_connect]
static_configs:
- targets: ['192.168.160.129:9100', '192.168.160.130:9115']
labels:
instance: 'port_status'
group: 'port'
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: ip
- target_label: __address__
replacement: 192.168.160.130:9115
grafana 可视化界面
下载
wget https://dl.grafana.com/enterprise/release/grafana-enterprise-7.5.11-1.x86_64.rpm
配置文件
vim /etc/grafana/grafana.ini
启动服务
systemctl restart grafana-server.service
浏览器访问
http://192.168.160.129:3000/login
浏览器访问
默认账号密码都是admin
添加数据源
导入图形
PromQL简介
prometheus提供一个函数式的表达式语言promql,可以使用户实时的查找和聚合时间序列数据,表达式计算结果可以在图形中展示,也可以在prometheus表达式浏览器中以表格形式展示,或者作为数据源,以http api方式提供给外部系统使用。
使用promql语句,在grafana中,点击+创建dashboard 创建自定义图形,指定查询参数;
promql基本查询
node_memory_MemTotal_bytes #查询node节点内存大小
node_memory_MemFree_bytes #查询node节点剩余可用内存
node_memory_MemTotal_bytes{instance="192.168.160.129:9100"} #查询指定节点内存
node_memory_MemFree_bytes{instance="192.168.160.129:9100"} #查询指定节点可用内存
node_disk_io_time_seconds_total{device="sda"} #查询指定磁盘每秒磁盘IO
node_filesystem_free_bytes{device="/dev/sda1",fstype="xfs"} #查看指定磁盘的剩余空间
node_filesystem_free_bytes{device="/dev/sda1"}
时间范围
s 秒
m 分钟
h 小时
d 天
w 周
y 年
node_memory_MemFree_bytes{instance="192.168.160.129:9100"}[5m]
node_memory_MemFree_bytes{instance="192.168.160.129:9100"}[1h]
运算符
+ 加
- 减
* 乘
/ 除
% 模
^ 幂
node_memory_MemFree_bytes/1024/1024 #剩余内存单位转换
node_disk_read_bytes_total{device="sda"}+node_disk_written_bytes_total{device="sda"} #读写总字节数(累计值,计算速率需配合rate()使用)
监控pod
cadvisor可以搜集一台机器上所有运行的容器信息,还可以提供基础查询界面和http接口,方便其他组件进行数据抓取,cadvisor可以对节点机器上的资源及容器进行实时监控和性能数据采集,包括cpu使用情况,内存使用情况,网络吞吐量及文件系统使用情况。
https://github.com/google/cadvisor
https://github.com/google/cadvisor/releases/
docker pull hub.c.163.com/xbingo/cadvisor:latest
docker run \
--volume=/:/rootfs:ro \
--volume=/var/run:/var/run:ro \
--volume=/sys:/sys:ro \
--volume=/var/lib/docker/:/var/lib/docker:ro \
--volume=/dev/disk/:/dev/disk:ro \
--publish=8080:8080 \
--detach=true \
--name=cadvisor \
--privileged \
--device=/dev/kmsg \
hub.c.163.com/xbingo/cadvisor:latest
报错解决
WARNING: IPv4 forwarding is disabled. Networking will not work.
7f0e2a9c99483d6dec73eb1382ae1214a4d52d2106d0f7dcbb4c2a0888322df5
[root@localhost ~]# echo "net.ipv4.ip_forward=1" >>/usr/lib/sysctl.d/00-system.conf
[root@localhost ~]# systemctl restart network && systemctl restart docker
重新启动cadvisor容器
docker start cadvisor
编辑prometheus.yml配置文件
#监控容器
- job_name: 'prometheus-con'
static_configs:
- targets: ["192.168.160.129:8080","192.168.160.130:8080"]
重启prometheus服务
在grafana页面中导入容器监控界面
prometheus报警设置
prometheus-->触发阈值--->超出持续时间---->alertmanager--->分组|抑制|静默---->媒体类型---->邮件|钉钉|微信等。
分组:将类似性质的警报合并为单个通知,比如网络通知、主机通知、服务通知。
静默:是一种简单的特定时间静音的机制,例如:服务器要升级维护可以设置此时间段告警静默
抑制:当警报发出后,停止重复发送由此警报引发的其他警报,即合并一个故障引起的多个报警事件,可以消除冗余告警。
下载告警组件alertmanager
cd /usr/local/
下载链接
https://prometheus.io/download/
ln -sv alertmanager-0.25.0.linux-amd64 alertmanager
vim /etc/systemd/system/alertmanager.service
[Unit]
Description=Prometheus Server
Documentation=https://prometheus.io/docs/introduction/overview/
After=network.target
[Service]
Restart=on-failure
WorkingDirectory=/usr/local/alertmanager
ExecStart=/usr/local/alertmanager/alertmanager
[Install]
WantedBy=multi-user.target
systemctl start alertmanager.service&& systemctl enable alertmanager.service
告警配置
告警配置参考
https://prometheus.io/docs/alerting/latest/configuration/
global:
smtp_from: #发件人邮箱地址
smtp_smarthost: #邮箱smtp地址
smtp_auth_username: #发件人的登录用户名,默认和发件人地址一致
smtp_auth_password: #发件人的登录密码,有时候是授权码
smtp_require_tls: #是否需要tls协议,默认是true
wechat_api_url: #企业微信api地址
wechat_api_secret: #企业微信api secret
wechat_corp_id: #企业微信corp id信息
resolve_timeout: #在指定时间内没有产生新的事件就发送恢复通知
配置alertmanager目录下yaml文件
vim alertmanager.yml
global:
resolve_timeout: 5m #在指定时间内没有产生新的事件就发送恢复通知
smtp_smarthost: 'smtp.qq.com:465'
smtp_from: '1162727670@qq.com'
smtp_auth_username: '1162727670@qq.com'
smtp_auth_password: 'rzqarracbyxjhfhe'
smtp_hello: '@qq.com'
smtp_require_tls: false
route: #route用来设置报警的分发策略
group_by: ['alertname'] #采用哪个标签来作为分组依据
group_wait: 30s #组告警等待时间,就是告警产生后等待30s,如果有同组告警一起发出
group_interval: 5m #两组告警的间隔时间
repeat_interval: 1h #重复告警的间隔时间,减少邮件的发送频率
receiver: 'web.hook' #设置接收人
receivers:
- name: 'web.hook'
# webhook_configs:
# - url: 'http://127.0.0.1:5001/'
email_configs:
- to: '1162727670@qq.com'
inhibit_rules: #抑制的规则
- source_match: #源匹配级别,当匹配成功发出通知,但是其他的通知将被抑制
severity: 'critical'
target_match:
severity: 'warning'
equal: ['alertname', 'dev', 'instance']
配置完成重启服务
systemctl restart alertmanager.service
登录网页可以查看界面 端口9093
在prometheus主机上添加Alertmanager
vi prometheus.yml
# Alertmanager configuration
alerting:
alertmanagers:
- static_configs:
- targets:
- 192.168.160.130:9093 #Alertmanager主机
# - alertmanager:9093
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
- "/usr/local/prometheus/rule.yml" #告警规则所在目录文件
# - "first_rules.yml"
# - "second_rules.yml"
[root@localhost prometheus]# vim /usr/local/prometheus/rule.yml
[root@localhost prometheus]#
[root@localhost prometheus]# ls
console_libraries consoles data LICENSE NOTICE prometheus prometheus.yml promtool rule.yml
将告警规则写入rule.yml文件中
cat > /usr/local/prometheus/rule.yml << 'EOF'
groups:
- name: general.rules
rules:
- alert: InstanceDown
expr: up == 0 # 表达式当前被监控实例服务(node_exporter)状态,1为正常,0为异常
for: 10s # 告警持续时间10s
labels:
severity: warning # 告警级别warning| error
annotations:
summary: "Instance {{ $labels.instance }} down"
description: "{{ $labels.instance }} of job {{ $labels.job }} 已经停止工作10秒钟。"
EOF
之后重启prometheus服务
登录网页查看告警规则