16、prometheus + grafana + alertmanager

k8s 手撕方式安装 prometheus + grafana + alertmanager

k8s版本:k8s-1.29.1

prometheus + grafana + alertmanager 监控报警

1、k8s 手撕方式安装 prometheus

mkdir ~/prometheus-yml

kubectl create ns monitoring
cat > ~/prometheus-yml/prometheus-rbac.yml << 'EOF'
apiVersion: v1
kind: ServiceAccount
metadata:
  name: prometheus
  namespace: monitoring
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: prometheus
rules:
- apiGroups: [""]
  resources:
  - nodes
  - nodes/metrics
  - services
  - endpoints
  - pods
  verbs: ["get", "list", "watch"]
- apiGroups: [""]
  resources:
  - configmaps
  verbs: ["get"]
- apiGroups:
  - networking.k8s.io
  resources:
  - ingresses
  verbs: ["get", "list", "watch"]
- nonResourceURLs: ["/metrics"]
  verbs: ["get"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: prometheus
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: prometheus
subjects:
- kind: ServiceAccount
  name: prometheus
  namespace: monitoring
EOF
kubectl apply -f ~/prometheus-yml/prometheus-rbac.yml
cat > ~/prometheus-yml/prometheus-ConfigMap.yml << 'EOF'
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-config
  namespace: monitoring
data:
  prometheus.yml: |
    global:
      scrape_interval: 15s
      evaluation_interval: 15s
    scrape_configs:
      - job_name: prometheus
        static_configs:
          - targets: ['localhost:9090']
EOF
kubectl apply -f ~/prometheus-yml/prometheus-ConfigMap.yml

这里暂时只配置了对 prometheus 本身的监控

如果以后有新的资源需要被监控,只需要将 ConfigMap 对象更新即可

cat > ~/prometheus-yml/prometheus-ConfigMap.yml << 'EOF'
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-config
  namespace: monitoring
data:
  prometheus.yml: |
    global:
      scrape_interval: 15s
      evaluation_interval: 15s

    # 告警规则文件
    rule_files:
    - /etc/prometheus/rules.yml
    - /etc/prometheus/rules/*.rules.yml

    # 对接alertmanager
    alerting:
      alertmanagers:
        - static_configs:
          - targets: ["alertmanager-service.monitoring.svc.cluster.local:9093"]

    scrape_configs:

      #0、监控 prometheus
      - job_name: prometheus
        static_configs:
          - targets: ['localhost:9090']

      - job_name: 1.15.172.119
        static_configs:
          - targets: ['1.15.172.119:9100']

      #1、监控 k8s节点
      - job_name: 'k8s-nodes'
        kubernetes_sd_configs:
        - role: node
        relabel_configs:
        - source_labels: [__address__]
          regex: '(.*):10250'
          replacement: '${1}:9100'
          target_label: __address__
          action: replace
        - action: labelmap
          regex: __meta_kubernetes_node_label_(.+)

      #2、监控 k8s-etcd
      - job_name: 'k8s-etcd'
        metrics_path: metrics
        scheme: http
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        kubernetes_sd_configs:
        - role: endpoints
        relabel_configs:
        - source_labels: [__meta_kubernetes_service_name]
          regex: etcd-k8s
          action: keep
        - action: labelmap
          regex: __meta_kubernetes_pod_label_(.+)
          
      #3、监控 kube-apiserver
      - job_name: 'kube-apiserver'
        kubernetes_sd_configs:
        - role: endpoints
        scheme: https
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        relabel_configs:
        - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
          action: keep
          regex: default;kubernetes;https

      #4、监控 kube-controller-manager
      - job_name: 'kube-controller-manager'
        kubernetes_sd_configs:
        - role: endpoints
        scheme: https
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
          insecure_skip_verify: true
        relabel_configs:
        - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name]
          action: keep
          regex: kube-system;kube-controller-manager

      #5、监控 kube-scheduler
      - job_name: 'kube-scheduler'
        kubernetes_sd_configs:
        - role: endpoints
        scheme: https
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
          insecure_skip_verify: true
        relabel_configs:
        - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name]
          action: keep
          regex: kube-system;kube-scheduler

      #6、监控 kubelet
      - job_name: 'kubelet'
        kubernetes_sd_configs:
        - role: node
        scheme: https
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
          insecure_skip_verify: true
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        relabel_configs:
        - action: labelmap
          regex: __meta_kubernetes_node_label_(.+)
          replacement: $1

      #7、监控 kube-proxy
      - job_name: 'kube-proxy'
        metrics_path: metrics
        scheme: http
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
          insecure_skip_verify: false
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        kubernetes_sd_configs:
        - role: endpoints
        relabel_configs:
        - source_labels: [__meta_kubernetes_service_name]
          regex: kube-proxy
          action: keep
        - action: labelmap
          regex: __meta_kubernetes_pod_label_(.+)

      #8、监控 coredns
      - job_name: 'coredns'
        static_configs:
          - targets: ['kube-dns.kube-system.svc.cluster.local:9153']

      #9、监控容器
      - job_name: 'kubernetes-cadvisor'
        kubernetes_sd_configs:
        - role: node
        scheme: https
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
          insecure_skip_verify: true
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        relabel_configs:
        - action: labelmap
          regex: __meta_kubernetes_node_label_(.+)
          replacement: $1
        - source_labels: [__meta_kubernetes_node_name]
          regex: (.+)
          replacement: /metrics/cadvisor
          target_label: __metrics_path__

      #10、svc自动发现
      - job_name: 'k8s-service-endpoints'
        kubernetes_sd_configs:
        - role: endpoints
        relabel_configs:
        - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
          action: keep
          regex: true
        - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
          action: replace
          target_label: __scheme__
          regex: (https?)
        - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
          action: replace
          target_label: __metrics_path__
          regex: (.+)
        - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
          action: replace
          target_label: __address__
          regex: ([^:]+)(?::\d+)?;(\d+)
          replacement: $1:$2
        - action: labelmap
          regex: __meta_kubernetes_service_label_(.+)
        - source_labels: [__meta_kubernetes_namespace]
          action: replace
          target_label: kubernetes_namespace
        - source_labels: [__meta_kubernetes_service_name]
          action: replace
          target_label: kubernetes_name
        - source_labels: [__meta_kubernetes_pod_name]
          action: replace
          target_label: kubernetes_pod_name
          
      # 11、监控 kube-state-metrics
      - job_name: "kube-state-metrics"
        kubernetes_sd_configs:
        - role: endpoints
        relabel_configs:
        - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_endpoints_name]
          regex: kube-system;kube-state-metrics
          action: keep

  # 告警规则
  rules.yml: |
    groups:
    - name: test-node-mem
      rules:
      - alert: NodeMemoryUsage
        expr: (node_memory_MemTotal_bytes - (node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes)) / node_memory_MemTotal_bytes * 100 > 20
        for: 1m
        labels:
          cluster: RTG
          severity: P1
        annotations:
          summary: "{{$labels.instance}}: High Memory usage detected"
          description: "{{$labels.instance}}: Memory usage is above 20% (current value is: {{ $value }})"
          
    - name: Hosts.rules
      rules:
      ## Custom By huanghuanhui
      - alert: HostDown
        expr: up == 0
        for: 1m
        labels:
          cluster: RTG
          severity: P1
        annotations:
          Summary: '主机{{ $labels.instance }}  ${{ $labels.job }} down'
          description: "主机: 【{{ $labels.instance }}】has been down for more than 1 minute"

      - alert: HostCpuLoadAvage
        expr:  node_load5 /count by (instance, job) (node_cpu_seconds_total{mode="idle"}) >= 0.95
        for: 1m
        annotations:
          Summary: "主机{{ $labels.instance }} cpu 5分钟负载比率大于1 (当前值:{{ $value }})"
          description: "主机: 【{{ $labels.instance }}】 cpu_load5值大于核心数。 (当前比率值:{{ $value }})"
        labels:
          cluster: RTG
          severity: 'P3'

      - alert: HostCpuUsage
        expr: (1-((sum(increase(node_cpu_seconds_total{mode="idle"}[5m])) by (instance))/ (sum(increase(node_cpu_seconds_total[5m])) by (instance))))*100 > 80
        for: 1m
        annotations:
          Summary: "主机{{ $labels.instance }} CPU 5分钟使用率大于80% (当前值:{{ $value }})"
          description: "主机: 【{{ $labels.instance }}】 5五分钟内CPU使用率超过80% (当前值:{{ $value }})"
        labels:
          cluster: RTG
          severity: 'P1'

      - alert: HostMemoryUsage
        expr: (1-((node_memory_Buffers_bytes + node_memory_Cached_bytes + node_memory_MemFree_bytes)/node_memory_MemTotal_bytes))*100 > 80
        for: 1m
        annotations:
          Summary: "主机{{ $labels.instance }} 内存使用率大于80% (当前值:{{ $value }})"
          description: "主机: 【{{ $labels.instance }}】 内存使用率超过80% (当前使用率:{{ $value }}%)"
        labels:
          cluster: RTG
          severity: 'P3'

      - alert: HostIOWait
        expr: ((sum(increase(node_cpu_seconds_total{mode="iowait"}[5m])) by (instance))/(sum(increase(node_cpu_seconds_total[5m])) by (instance)))*100 > 10
        for: 1m
        annotations:
          Summary: "主机{{ $labels.instance }} iowait大于10% (当前值:{{ $value }})"
          description: "主机: 【{{ $labels.instance }}】 5五分钟内磁盘IO过高 (当前负载值:{{ $value }})"
        labels:
          cluster: RTG
          severity: 'P3'

      - alert: HostFileSystemUsage
        expr: (1-(node_filesystem_free_bytes{fstype=~"ext4|xfs",mountpoint!~".*tmp|.*boot" }/node_filesystem_size_bytes{fstype=~"ext4|xfs",mountpoint!~".*tmp|.*boot" }))*100 > 80
        for: 1m
        annotations:
          Summary: "主机{{ $labels.instance }} {{ $labels.mountpoint }} 磁盘空间使用大于80%  (当前值:{{ $value }})"
          description: "主机: 【{{ $labels.instance }}】 {{ $labels.mountpoint }}分区使用率超过80%, 当前值使用率:{{ $value }}%"
        labels:
          cluster: RTG
          severity: 'P3'

      - alert: HostSwapIsFillingUp
        expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80
        for: 2m
        labels:
          cluster: RTG
          severity: 'P4'
        annotations:
          Summary: "主机: 【{{ $labels.instance }}】 swap分区使用超过 (>80%), 当前值使用率: {{ $value }}%"
          description: "主机: 【{{ $labels.instance }}】 swap分区使用超过 (>80%), 当前值使用率: {{ $value }}%"

      - alert: HostNetworkConnection-ESTABLISHED
        expr:  sum(node_netstat_Tcp_CurrEstab) by (instance) > 2000
        for: 5m
        labels:
          cluster: RTG
          severity: 'P4'
        annotations:
          Summary: "主机{{ $labels.instance }} ESTABLISHED连接数过高 (当前值:{{ $value }})"
          description: "主机: 【{{ $labels.instance }}】 ESTABLISHED连接数超过2000, 当前ESTABLISHED连接数: {{ $value }}"

      - alert: HostNetworkConnection-TIME_WAIT
        expr:  sum(node_sockstat_TCP_tw) by (instance) > 1000
        for: 5m
        labels:
          cluster: RTG
          severity: 'P3'
        annotations:
          Summary: "主机{{ $labels.instance }} TIME_WAIT连接数过高 (当前值:{{ $value }})"
          description: "主机: 【{{ $labels.instance }}】 TIME_WAIT连接数超过1000, 当前TIME_WAIT连接数: {{ $value }}"

      - alert: HostUnusualNetworkThroughputIn
        expr:  sum by (instance, device) (rate(node_network_receive_bytes_total{device=~"eth.*"}[2m])) / 1024 / 1024 > 300
        for: 5m
        labels:
          cluster: RTG
          severity: 'P3'
        annotations:
          Summary: "主机{{ $labels.instance }} 入口流量超过 (> 300 MB/s)  (当前值:{{ $value }})"
          description: "主机: 【{{ $labels.instance }}】, 网卡: {{ $labels.device }} 入口流量超过 (> 300 MB/s), 当前值: {{ $value }}"

      - alert: HostUnusualNetworkThroughputOut
        expr: sum by (instance, device) (rate(node_network_transmit_bytes_total{device=~"eth.*"}[2m])) / 1024 / 1024 > 300
        for: 5m
        labels:
          cluster: RTG
          severity: 'P4'
        annotations:
          Summary: "主机{{ $labels.instance }} 出口流量超过 (> 300 MB/s)  (当前值:{{ $value }})"
          description: "主机: 【{{ $labels.instance }}】, 网卡: {{ $labels.device }} 出口流量超过 (> 300 MB/s), 当前值: {{ $value }}"

      - alert: HostUnusualDiskReadRate
        expr: sum by (instance, device) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50
        for: 5m
        labels:
          cluster: RTG
          severity: 'P4'
        annotations:
          Summary: "主机{{ $labels.instance }} 磁盘读取速率超过(50 MB/s)  (当前值:{{ $value }})"
          description: "主机: 【{{ $labels.instance }}】, 磁盘: {{ $labels.device }} 读取速度超过(50 MB/s), 当前值: {{ $value }}"

      - alert: HostUnusualDiskWriteRate
        expr: sum by (instance, device) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50
        for: 2m
        labels:
          cluster: RTG
          severity: 'P4'
        annotations:
          Summary: "主机{{ $labels.instance }} 磁盘读写入率超过(50 MB/s)  (当前值:{{ $value }})"
          description: "主机: 【{{ $labels.instance }}】, 磁盘: {{ $labels.device }} 写入速度超过(50 MB/s), 当前值: {{ $value }}"

      - alert: HostOutOfInodes
        expr: node_filesystem_files_free{fstype=~"ext4|xfs",mountpoint!~".*tmp|.*boot" } / node_filesystem_files{fstype=~"ext4|xfs",mountpoint!~".*tmp|.*boot" } * 100 < 10
        for: 2m
        labels:
          cluster: RTG
          severity: 'P3'
        annotations:
          Summary: "主机{{ $labels.instance }} {{ $labels.mountpoint }}分区主机Inode值小于5% (当前值:{{ $value }}) "
          description: "主机: 【{{ $labels.instance }}】 {{ $labels.mountpoint }}分区inode节点不足 (可用值小于{{ $value }}%)"

      - alert: HostUnusualDiskReadLatency
        expr: rate(node_disk_read_time_seconds_total[2m]) / rate(node_disk_reads_completed_total[2m])  * 1000 > 100 and rate(node_disk_reads_completed_total[2m]) > 0
        for: 5m
        labels:
          cluster: RTG
          severity: 'P4'
        annotations:
          Summary: "主机{{ $labels.instance }} 主机磁盘Read延迟大于100ms (当前值:{{ $value }}ms)"
          description: "主机: 【{{ $labels.instance }}】, 磁盘: {{ $labels.device }} Read延迟过高 (read operations > 100ms), 当前延迟值: {{ $value }}ms"

      - alert: HostUnusualDiskWriteLatency
        expr: rate(node_disk_write_time_seconds_total[2m]) / rate(node_disk_writes_completed_total[2m]) * 1000 > 100 and rate(node_disk_writes_completed_total[2m]) > 0
        for: 5m
        labels:
          cluster: RTG
          severity: 'P4'
        annotations:
          Summary: "主机{{ $labels.instance }} 主机磁盘write延迟大于100ms (当前值:{{ $value }}ms)"
          description: "主机: 【{{ $labels.instance }}】, 磁盘: {{ $labels.device }} Write延迟过高 (write operations > 100ms), 当前延迟值: {{ $value }}ms"

      - alert: NodeFilesystemFilesFillingUp
        annotations:
          description: '预计4小时后 分区:{{ $labels.device }}  主机:{{ $labels.instance }} 可用innode仅剩余 {{ printf "%.2f" $value }}%.'
          runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup
          Summary: '主机{{ $labels.instance }} 预计4小时后可用innode数会低于15% (当前值:{{ $value }})'
        labels:
          cluster: RTG
          severity: p3
        expr: |
          (
            node_filesystem_files_free{job="node-exporter|vm-node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter|vm-node-exporter",fstype!=""} * 100 < 15
          and
            predict_linear(node_filesystem_files_free{job="node-exporter|vm-node-exporter",fstype!=""}[6h], 4*60*60) < 0
          and
            node_filesystem_readonly{job="node-exporter|vm-node-exporter",fstype!=""} == 0
          )
        for: 1h

      - alert: NodeFileDescriptorLimit
        annotations:
          description: '主机:{{ $labels.instance }} 文件描述符使用率超过70% {{ printf "%.2f" $value }}%.'
          runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit
          Summary: '主机: {{ $labels.instance }}文件描述符即将被耗尽. (当前值:{{ $value }})'
        expr: |
          (
            node_filefd_allocated{job="node-exporter|vm-node-exporter"} * 100 / node_filefd_maximum{job="node-exporter|vm-node-exporter"} > 70
          )
        for: 15m
        labels:
          severity: p3
          action: monitor
          cluster: RTG

      - alert: NodeClockSkewDetected
        annotations:
          description: '主机: {{ $labels.instance }} 时钟延时超过 300s.'
          runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclockskewdetected
          Summary: '主机: {{ $labels.instance }}时钟延时超过 300s.(当前值:{{ $value }})'
        expr: |
          (
            node_timex_offset_seconds > 0.05
          and
            deriv(node_timex_offset_seconds[5m]) >= 0
          )
          or
          (
            node_timex_offset_seconds < -0.05
          and
            deriv(node_timex_offset_seconds[5m]) <= 0
          )
        for: 10m
        labels:
          severity: p3
          cluster: RTG

      - alert: NodeFilesystemFilesFillingUp
        annotations:
          description: '预计4小时后 分区:{{ $labels.device }}  主机:{{ $labels.instance }} 可用innode仅剩余 {{ printf "%.2f" $value }}%.'
          runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup
          Summary: '主机{{ $labels.instance }} 预计4小时后可用innode数会低于15% (当前值:{{ $value }})'
        expr: |
          (
            node_filesystem_files_free{job="node-exporter|vm-node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter|vm-node-exporter",fstype!=""} * 100 < 15
          and
            predict_linear(node_filesystem_files_free{job="node-exporter|vm-node-exporter",fstype!=""}[6h], 4*60*60) < 0
          and
            node_filesystem_readonly{job="node-exporter|vm-node-exporter",fstype!=""} == 0
          )
        for: 1h
        labels:
          severity: p3
          cluster: RTG


      - alert: NodeFilesystemSpaceFillingUp
        annotations:
          description: '主机: {{ $labels.instance }} 分区: {{ $labels.device }} 预计在4小时候只有 {{ printf "%.2f" $value }}%.'
          runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup
          Summary: "主机: {{ $labels.instance }}预计4小时候磁盘空闲会低于15% (当前值:{{ $value }})"
        expr: |
          (
            node_filesystem_avail_bytes{job="node-exporter|vm-node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter|vm-node-exporter",fstype!=""} * 100 < 15
          and
            predict_linear(node_filesystem_avail_bytes{job="node-exporter|vm-node-exporter",fstype!=""}[6h], 4*60*60) < 0
          and
            node_filesystem_readonly{job="node-exporter|vm-node-exporter",fstype!=""} == 0
          )
        for: 1h
        labels:
          severity: p3
          cluster: RTG
      - alert: NodeNetworkReceiveErrs
        annotations:
          description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered
            {{ printf "%.0f" $value }} receive errors in the last two minutes.'
          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodenetworkreceiveerrs
          Summary: "主机{{ $labels.instance }} 网卡{{ $labels.device }} Node网络接受错误  (当前值:{{ $value }})"
        expr: |
          increase(node_network_receive_errs_total[2m]) > 10
        for: 2h
        labels:
          severity: p3
          cluster: RTG
      - alert: NodeNetworkTransmitErrs
        annotations:
          description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered
            {{ printf "%.0f" $value }} transmit errors in the last two minutes.'
          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodenetworktransmiterrs
          Summary: "主机{{ $labels.instance }} 网卡{{ $labels.device }} Node网络传输错误  (当前值:{{ $value }})"
        expr: |
          increase(node_network_transmit_errs_total[2m]) > 10
        for: 1h
        labels:
          severity: p3
          cluster: RTG
      - alert: NodeHighNumberConntrackEntriesUsed
        annotations:
          description: '{{ $value | humanizePercentage }} of conntrack entries are used.'
          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodehighnumberconntrackentriesused
          Summary: 主机{{ $labels.instance }} Conntrack条目使用率大于75% (当前值:{{ $value }})
        expr: |
          (node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75
        labels:
          severity: p2
          cluster: RTG

      - alert: NodeTextFileCollectorScrapeError
        annotations:
          description: Node Exporter text file collector failed to scrape.
          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodetextfilecollectorscrapeerror
          Summary: 主机{{ $labels.instance }} 打开或读取文件时出错,(当前值:{{ $value }})
        expr: |
          node_textfile_scrape_error{job="node-exporter|vm-node-exporter"} == 1
        labels:
          severity: p2
          cluster: RTG
      - alert: NodeClockNotSynchronising
        annotations:
          message: Clock on {{ $labels.instance }} is not synchronising. Ensure NTP
            is configured on this host.
          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodeclocknotsynchronising
          Summary: 主机{{ $labels.instance }} 时间不同步(当前值:{{ $value }})
        expr: |
          min_over_time(node_timex_sync_status[5m]) == 0
        for: 10m
        labels:
          severity: p4
          cluster: RTG
EOF
kubectl apply -f ~/prometheus-yml/prometheus-ConfigMap.yml

prometheus_podIP=`kubectl get pods -n monitoring -o custom-columns='NAME:metadata.name,podIP:status.podIPs[*].ip' |grep prometheus |awk '{print $2}'`

curl -X POST "http://$prometheus_podIP:9090/-/reload"
# 因为告警规则是以ConfigMap挂载Prometheus上,为了可以后期可以方便加规则,这里先创建一个空的告警规则ConfigMap(目的:先让Prometheus正常启动)
kubectl create configmap prometheus-rules --from-literal=empty=empty
cat > ~/prometheus-yml/prometheus-Deployment.yml << 'EOF'
apiVersion: apps/v1
kind: Deployment
metadata:
  name: prometheus
  namespace: monitoring
  labels:
    app: prometheus
spec:
  replicas: 1
  selector:
    matchLabels:
     app: prometheus
  template:
    metadata:
      labels:
        app: prometheus
    spec:
      serviceAccountName: prometheus
      containers:
      - name: prometheus
        image: prom/prometheus:v2.49.1
        imagePullPolicy: IfNotPresent
        args:
        - "--config.file=/etc/prometheus/prometheus.yml"
        - "--storage.tsdb.path=/prometheus"
        - "--storage.tsdb.retention.time=30d"
        - "--web.enable-admin-api"
        - "--web.enable-lifecycle"
        ports:
        - containerPort: 9090
          name: http
        volumeMounts:
        - mountPath: "/prometheus"
          subPath: prometheus
          name: data
        - mountPath: "/etc/prometheus"
          name: config
        - mountPath: "/etc/prometheus/rules"
          name: rules
        - name: localtime
          mountPath: /etc/localtime
        resources:
          limits:
            cpu: "2"
            memory: "4Gi"
          requests:
            cpu: "1"
            memory: "2Gi"
      volumes:
      - name: data
        persistentVolumeClaim:
          claimName: prometheus-nfs-client-pvc
      - name: config
        configMap:
          name: prometheus-config
      - name: rules
        configMap:
          name: prometheus-rules
      - name: localtime
        hostPath:
          path: /etc/localtime
---
apiVersion: v1
kind:  PersistentVolumeClaim
metadata:
  name: prometheus-nfs-client-pvc
  namespace: monitoring
spec:
  storageClassName: nfs-storage
  accessModes: [ReadWriteOnce]
  resources:
    requests:
      storage: 2Ti
EOF
kubectl apply -f ~/prometheus-yml/prometheus-Deployment.yml
cat > ~/prometheus-yml/prometheus-Service.yml << 'EOF'
apiVersion: v1
kind: Service
metadata:
  name: prometheus-service
  namespace: monitoring
  labels:
    app: prometheus
  annotations:
    prometheus.io/port: "9090"
    prometheus.io/scrape: "true"
spec:
  selector:
    app: prometheus
  type: NodePort
  ports:
  - name: web
    port: 9090
    targetPort: http
    nodePort: 31111
EOF
kubectl apply -f ~/prometheus-yml/prometheus-Service.yml
cat > ~/prometheus-yml/prometheus-Ingress.yml << 'EOF'
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: prometheus-ingress
  namespace: monitoring
  annotations:
    nginx.ingress.kubernetes.io/ssl-redirect: 'true'
    nginx.ingress.kubernetes.io/proxy-body-size: '4G'
spec:
  ingressClassName: nginx
  rules:
  - host: prometheus.huanghuanhui.cloud
    http:
      paths:
      - path: /
        pathType: Prefix
        backend:
          service:
            name: prometheus-service
            port:
              number: 9090

  tls:
  - hosts:
    - prometheus.huanghuanhui.cloud
    secretName: prometheus-ingress-tls
EOF
kubectl create secret -n monitoring \
tls prometheus-ingress-tls \
--key=/root/ssl/huanghuanhui.cloud.key \
--cert=/root/ssl/huanghuanhui.cloud.crt
kubectl apply -f ~/prometheus-yml/prometheus-Ingress.yml

访问地址:prometheus.huanghuanhui.cloud

告警规则

更多告警规则查看:https://samber.github.io/awesome-prometheus-alerts/

mkdir -p ~/prometheus-yml/rules-yml

pod.rules

cat > ~/prometheus-yml/rules-yml/pod.rules.yml << 'EOF'
groups:
- name: pod.rules
  rules:

  - alert: PodDown
    expr: kube_pod_container_status_running != 1  
    for: 2s
    labels:
      severity: warning
      cluster: k8s
    annotations:
      summary: 'Container: {{ $labels.container }} down'
      description: 'Namespace: {{ $labels.namespace }}, Pod: {{ $labels.pod }} is not running'


  - alert: PodReady
    expr: kube_pod_container_status_ready != 1  
    for: 5m   # Ready持续5分钟,说明启动有问题
    labels:
      severity: warning
      cluster: k8s
    annotations:
      summary: 'Container: {{ $labels.container }} ready'
      description: 'Namespace: {{ $labels.namespace }}, Pod: {{ $labels.pod }} always ready for 5 minitue'


  - alert: PodRestart
    expr: changes(kube_pod_container_status_restarts_total[30m]) > 0 # 最近30分钟pod重启
    for: 2s
    labels:
      severity: warning
      cluster: k8s
    annotations:
      summary: 'Container: {{ $labels.container }} restart'
      description: 'namespace: {{ $labels.namespace }}, pod: {{ $labels.pod }} restart {{ $value }} times'


  - alert: PodFailed
    expr: sum (kube_pod_status_phase{phase="Failed"}) by (pod,namespace) > 0
    for: 5s
    labels:
      severity: error 
    annotations:
      summary: "命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }} Pod状态Failed (当前值: {{ $value }})"


  - alert: PodPending
    expr: sum (kube_pod_status_phase{phase="Pending"}) by (pod,namespace) > 0
    for: 1m
    labels:
      severity: error
    annotations:
      summary: "命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }} Pod状态Pending (当前值: {{ $value }})"


  - alert: PodErrImagePull
    expr: sum by(namespace,pod) (kube_pod_container_status_waiting_reason{reason="ErrImagePull"}) == 1
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: "命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }}  Pod状态ErrImagePull (当前值: {{ $value }})"


  - alert: PodImagePullBackOff
    expr: sum by(namespace,pod) (kube_pod_container_status_waiting_reason{reason="ImagePullBackOff"}) == 1
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: "命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }}  Pod状态ImagePullBackOff (当前值: {{ $value }})"


  - alert: PodCrashLoopBackOff
    expr: sum by(namespace,pod) (kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff"}) == 1
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: "命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }}  Pod状态CrashLoopBackOff (当前值: {{ $value }})"


  - alert: PodCPUUsage
    expr: sum by(pod, namespace) (rate(container_cpu_usage_seconds_total{image!=""}[5m]) * 100) > 5
    for: 5m
    labels:
      severity: warning 
    annotations:
      summary: "命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }} CPU使用大于80% (当前值: {{ $value }})"


  - alert: PodMemoryUsage
    expr: sum(container_memory_rss{image!=""}) by(pod, namespace) / sum(container_spec_memory_limit_bytes{image!=""}) by(pod, namespace) * 100 != +inf > 80
    for: 5m
    labels:
      severity: error 
    annotations:
      summary: "命名空间: {{ $labels.namespace }} | Pod名称: {{ $labels.pod }} 内存使用大于80% (当前值: {{ $value }})"


  - alert: PodStatusChange  # Pod 状态异常变更警报
    expr: changes(kube_pod_status_phase[5m]) > 5
    for: 5m
    annotations:
      summary: "Pod 状态异常变更"
      description: "Pod {{ $labels.pod }} 的状态异常变更次数超过 5 次."


  - alert: ContainerCrash  # Pod 容器崩溃警报
    expr: increase(container_cpu_cfs_throttled_seconds_total{container!="",pod!=""}[5m]) > 0
    for: 5m
    annotations:
      summary: "Pod 容器崩溃"
      description: "Pod {{ $labels.pod }} 中的容器发生崩溃."
EOF

svc.rules

cat > ~/prometheus-yml/rules-yml/svc.rules.yml << 'EOF'
groups:
- name: svc.rules
  rules:

  - alert: ServiceDown
    expr: avg_over_time(up[5m]) * 100 < 50
    annotations:
      description: The service {{ $labels.job }} instance {{ $labels.instance }} is not responding for more than 50% of the time for 5 minutes.
      summary: The service {{ $labels.job }} is not responding
EOF

pvc.rules

cat > ~/prometheus-yml/rules-yml/pvc.rules.yml << 'EOF'
groups:
- name: pvc.rules
  rules:

  - alert: PersistentVolumeClaimLost
    expr: sum by(namespace, persistentvolumeclaim) (kube_persistentvolumeclaim_status_phase{phase="Lost"}) == 1
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "PersistentVolumeClaim {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is lost\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


  - alert: PersistentVolumeClaimPendig
    expr: sum by(namespace, persistentvolumeclaim) (kube_persistentvolumeclaim_status_phase{phase="Pendig"}) == 1
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "PersistentVolumeClaim {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is pendig\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"


  - alert: HighPersistentVolumeUsage  # PersistentVolume 使用率过高警报
    expr: kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes * 100 > 90
    for: 5m
    annotations:
      summary: "PersistentVolume 使用率过高"
      description: "PersistentVolume {{ $labels.persistentvolume }} 的使用率超过 90%."


  - alert: HighPVUsageForPod   # Pod 挂载的 PersistentVolume 使用率过高警报
    expr: kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes * 100 > 90
    for: 5m
    annotations:
      summary: "Pod 挂载的 PersistentVolume 使用率过高"
      description: "Pod {{ $labels.pod }} 挂载的 PersistentVolume 使用率超过 90%."
EOF

kubeadm.rules

cat > ~/prometheus-yml/rules-yml/kubeadm.rules.yml << 'EOF'
groups:
- name: kubeadm.rules
  rules:

  # Kubelet 健康状态检查
  - alert: KubeletDown
    expr: up{job="kubelet"} == 0
    for: 1m
    annotations:
      summary: "Kubelet 不可用"
      description: "Kubelet {{ $labels.instance }} 不可用."


  # Node 不可用警报:
  - alert: NodeDown
    expr: up{job="k8s-nodes"} == 0
    for: 1m
    annotations:
      summary: "Node 不可用"
      description: "Node {{ $labels.node }} 不可用."


  # Kube Proxy 健康状态检查
  - alert: KubeProxyDown
    expr: up{job="kube-proxy"} == 0
    for: 1m
    annotations:
      summary: "Kube Proxy 不可用"
      description: "Kube Proxy {{ $labels.instance }} 不可用."


  # Kube Scheduler 健康状态检查
  - alert: KubeSchedulerDown
    expr: up{job="kube-scheduler"} == 0
    for: 1m
    annotations:
      summary: "Kube Scheduler 不可用"
      description: "Kube Scheduler 不可用."


  # Kube Controller Manager 健康状态检查
  - alert: KubeControllerManagerDown
    expr: up{job="kube-controller-manager"} == 0
    for: 1m
    annotations:
      summary: "Kube Controller Manager 不可用"
      description: "Kube Controller Manager 不可用."


  # Kube State Metrics 健康状态检查
  - alert: KubeStateMetricsDown
    expr: up{job="kube-state-metrics"} == 0
    for: 1m
    annotations:
      summary: "Kube State Metrics 不可用"
      description: "Kube State Metrics 不可用."


    # KubernetesNodeNotReady
  - alert: KubernetesNodeNotReady
    expr: sum(kube_node_status_condition{condition="Ready",status="true"}) by (node) == 0
    for: 10m
    labels:
      severity: critical
    annotations:
      summary: Kubernetes node is not ready
      description: A node in the cluster is not ready, which may cause issues with cluster functionality.
EOF
# 更新前面创建空的prometheus-rules的ConfigMap
kubectl create configmap prometheus-rules \
--from-file=pod.rules.yml \
--from-file=svc.rules.yml \
--from-file=pvc.rules.yml \
--from-file=kubeadm.rules.yml \
-o yaml --dry-run=client | kubectl apply -f -
prometheus_podIP=`kubectl get pods -n monitoring -o custom-columns='NAME:metadata.name,podIP:status.podIPs[*].ip' |grep prometheus |awk '{print $2}'`

curl -X POST "http://$prometheus_podIP:9090/-/reload"

0、对k8s-node的监控

cat > ~/prometheus-yml/node-exporter.yml << 'EOF'
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: node-exporter
  namespace: monitoring
  labels:
    app: node-exporter
spec:
  selector:
    matchLabels:
      app: node-exporter
  template:
    metadata:
      labels:
        app: node-exporter
    spec:
      hostPID: true
      hostIPC: true
      hostNetwork: true
      nodeSelector:
        kubernetes.io/os: linux
      containers:
      - name: node-exporter
        image: prom/node-exporter:v1.7.0
        args:
        - --web.listen-address=$(HOSTIP):9100
        - --path.procfs=/host/proc
        - --path.sysfs=/host/sys
        - --path.rootfs=/host/root
        - --collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+)($|/)
        - --collector.filesystem.ignored-fs-types=^(autofs|binfmt_misc|cgroup|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|mqueue|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|sysfs|tracefs)$
        ports:
        - containerPort: 9100
        env:
        - name: HOSTIP
          valueFrom:
            fieldRef:
              fieldPath: status.hostIP
        resources:
          requests:
            cpu: 150m
            memory: 180Mi
          limits:
            cpu: 150m
            memory: 180Mi
        securityContext:
          runAsNonRoot: true
          runAsUser: 65534
        volumeMounts:
        - name: proc
          mountPath: /host/proc
        - name: sys
          mountPath: /host/sys
        - name: root
          mountPath: /host/root
          mountPropagation: HostToContainer
          readOnly: true
        - name: localtime
          mountPath: /etc/localtime
      tolerations:
      - operator: "Exists"
      volumes:
      - name: proc
        hostPath:
          path: /proc
      - name: dev
        hostPath:
          path: /dev
      - name: sys
        hostPath:
          path: /sys
      - name: root
        hostPath:
          path: /
      - name: localtime
        hostPath:
          path: /etc/localtime
EOF
kubectl apply -f ~/prometheus-yml/node-exporter.yml
docker run -d \
--name node-exporter \
--restart=always \
--net="host" \
--pid="host" \
-v "/proc:/host/proc:ro" \
-v "/sys:/host/sys:ro" \
-v "/:/rootfs:ro" \
-e TZ=Asia/Shanghai \
-v /etc/localtime:/etc/localtime \
prom/node-exporter:v1.7.0 \
--path.procfs=/host/proc \
--path.rootfs=/rootfs \
--path.sysfs=/host/sys \
--collector.filesystem.ignored-mount-points='^/(sys|proc|dev|host|etc)($$|/)'

模版:8919、12159

方式1:手动配置 node-exporter

# prometheus-ConfigMap.yml
      - job_name: 192.168.1.100
        static_configs:
          - targets: ['192.168.1.100:9100']

方式2:基于 consul 自动发现 node-exporter

mkdir -p ~/prometheus-yml/consul-yml
cat > ~/prometheus-yml/consul-yml/consul.yaml << 'EOF'
---
apiVersion: v1
kind: Service
metadata:
  name: consul-server
  namespace: monitoring
  labels:
    name: consul-server
spec:
  selector:
    name: consul-server
  ports:
    - name: http
      port: 8500
      targetPort: 8500
    - name: https
      port: 8443
      targetPort: 8443
    - name: rpc
      port: 8400
      targetPort: 8400
    - name: serf-lan-tcp
      protocol: "TCP"
      port: 8301
      targetPort: 8301
    - name: serf-lan-udp
      protocol: "UDP"
      port: 8301
      targetPort: 8301
    - name: serf-wan-tcp
      protocol: "TCP"
      port: 8302
      targetPort: 8302
    - name: serf-wan-udp
      protocol: "UDP"
      port: 8302
      targetPort: 8302
    - name: server
      port: 8300
      targetPort: 8300
    - name: consul-dns
      port: 8600
      targetPort: 8600
---
apiVersion: v1
kind: Service
metadata:
  name: consul-server-http
  namespace: monitoring
spec:
  selector:
    name: consul-server
  type: NodePort
  ports:
    - protocol: TCP
      port: 8500
      targetPort: 8500
      nodePort: 32685
      name: consul-server-tcp
---
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: consul-server
  namespace: monitoring
  labels:
    name: consul-server
spec:
  serviceName: consul-server
  selector:
    matchLabels:
      name: consul-server
  replicas: 3
  template:
    metadata:
      labels:
        name: consul-server
      annotations:
        prometheus.io/scrape: "true"  # prometueus自动发现标签
        prometheus.io/path: "v1/agent/metrics" # consul的metrics路径
        prometheus.io/port: "8500"
    spec:
      affinity:
        podAntiAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            - labelSelector:
                matchExpressions:
                  - key: "name"
                    operator: In
                    values:
                      - consul-server
              topologyKey: "kubernetes.io/hostname"
      terminationGracePeriodSeconds: 10
      containers:
      - name: consul
        image: ccr.ccs.tencentyun.com/huanghuanhui/consul:1.15.4
        imagePullPolicy: IfNotPresent
        args:
          - "agent"
          - "-server"
          - "-bootstrap-expect=3"
          - "-ui"
          - "-data-dir=/consul/data"
          - "-bind=0.0.0.0"
          - "-client=0.0.0.0"
          - "-advertise=$(POD_IP)"
          - "-retry-join=consul-server-0.consul-server.$(NAMESPACE).svc.cluster.local"
          - "-retry-join=consul-server-1.consul-server.$(NAMESPACE).svc.cluster.local"
          - "-retry-join=consul-server-2.consul-server.$(NAMESPACE).svc.cluster.local"
          - "-domain=cluster.local"
          - "-disable-host-node-id"
        volumeMounts:
        - name: consul-nfs-client-pvc
          mountPath: /consul/data
        - name: localtime
          mountPath: /etc/localtime
        env:
        - name: POD_IP
          valueFrom:
            fieldRef:
              fieldPath: status.podIP
        - name: NAMESPACE
          valueFrom:
            fieldRef:
              fieldPath: metadata.namespace
        ports:
        - containerPort: 8500
          name: http
        - containerPort: 8400
          name: rpc
        - containerPort: 8443
          name: https-port
        - containerPort: 8301
          name: serf-lan
        - containerPort: 8302
          name: serf-wan
        - containerPort: 8600
          name: consul-dns
        - containerPort: 8300  
          name: server
      volumes:
      - name: localtime
        hostPath:
          path: /etc/localtime
  volumeClaimTemplates:
  - metadata:
      name: consul-nfs-client-pvc
    spec:
      accessModes: ["ReadWriteOnce"]
      storageClassName: nfs-storage
      resources:
        requests:
          storage: 20Gi
EOF
kubectl apply -f ~/prometheus-yml/consul-yml/consul.yaml
cat > ~/prometheus-yml/consul-yml/consul-Ingress.yml << 'EOF'
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: consul-ingress
  namespace: monitoring
  annotations:
    nginx.ingress.kubernetes.io/ssl-redirect: 'true'
    nginx.ingress.kubernetes.io/proxy-body-size: '4G'
spec:
  ingressClassName: nginx
  rules:
  - host: consul.huanghuanhui.cloud
    http:
      paths:
      - path: /
        pathType: Prefix
        backend:
          service:
            name: consul-server
            port:
              number: 8500

  tls:
  - hosts:
    - consul.huanghuanhui.cloud
    secretName: consul-ingress-tls
EOF
kubectl create secret -n monitoring \
tls consul-ingress-tls \
--key=/root/ssl/huanghuanhui.cloud.key \
--cert=/root/ssl/huanghuanhui.cloud.crt
kubectl apply -f ~/prometheus-yml/consul-yml/consul-Ingress.yml

ip访问地址:192.168.1.201:32685

域名访问地址:consul.huanghuanhui.cloud

# prometheus-ConfigMap.yml
      - job_name: 'consul-prometheus'
        consul_sd_configs:
          - server: 'consul-server-http.monitoring.svc.cluster.local:8500'
        relabel_configs:
          - source_labels: [__meta_consul_service_id]
            regex: (.+)
            target_label: 'node_name'
            replacement: '$1'
          - source_labels: [__meta_consul_service]
            regex: '.*(node-exporter|hosts).*'
            action: keep
# 服务注册(ip)
curl -X PUT -d '{"id": "1.15.172.119-node-exporter","name": "1.15.172.119-node-exporter","address": "1.15.172.119","port": 9100,"checks": [{"http": "http://1.15.172.119:9100/","interval": "5s"}]}' http://192.168.1.201:32685/v1/agent/service/register

curl -X PUT -d '{"id": "192.168.1.200-node-exporter","name": "192.168.1.200-node-exporter","address": "192.168.1.200","port": 9100,"checks": [{"http": "http://192.168.1.200:9100/","interval": "5s"}]}' http://192.168.1.201:32685/v1/agent/service/register


# 服务注册(域名)
curl -X PUT -d '{"id": "1.15.172.119-node-exporter","name": "1.15.172.119-node-exporter","address": "1.15.172.119","port": 9100,"checks": [{"http": "http://1.15.172.119:9100/","interval": "5s"}]}' https://consul.huanghuanhui.cloud/v1/agent/service/register

curl -X PUT -d '{"id": "192.168.1.200-node-exporter","name": "192.168.1.200-node-exporter","address": "192.168.1.200","port": 9100,"checks": [{"http": "http://192.168.1.200:9100/","interval": "5s"}]}' https://consul.huanghuanhui.cloud/v1/agent/service/register

id或者name要包含node-exporter|hosts标签才能自动发现

# 下线服务(ip)
curl -X PUT http://192.168.1.201:32685/v1/agent/service/deregister/1.15.172.119-node-exporter

curl -X PUT http://192.168.1.201:32685/v1/agent/service/deregister/192.168.1.200-node-exporter

# 下线服务(域名)
curl -X PUT https://consul.huanghuanhui.cloud/v1/agent/service/deregister/1.15.172.119-node-exporter

curl -X PUT https://consul.huanghuanhui.cloud/v1/agent/service/deregister/192.168.1.200-node-exporter

consul 批量注册脚本

mkdir -p ~/prometheus-yml/consul-yml/node-exporter-json

cat > ~/prometheus-yml/consul-yml/node-exporter-json/node-exporter-1.15.172.119.json << 'EOF'
{
    "id": "1.15.172.119-node-exporter",
    "name": "1.15.172.119-node-exporter",
    "address": "1.15.172.119",
    "port": 9100,
    "tags": ["node-exporter"],
    "checks": [{
        "http": "http://1.15.172.119:9100/metrics",
        "interval": "5s"
    }]
}
EOF


cat > ~/prometheus-yml/consul-yml/node-exporter-json/node-exporter-192.168.1.201.json << 'EOF'
{
    "id": "192.168.1.201-node-exporter",
    "name": "192.168.1.201-node-exporter",
    "address": "192.168.1.201",
    "port": 9100,
    "tags": ["node-exporter"],
    "checks": [{
        "http": "http://192.168.1.201:9100/metrics",
        "interval": "5s"
    }]
}
EOF


cat > ~/prometheus-yml/consul-yml/node-exporter-json/node-exporter-192.168.1.202.json << 'EOF'
{
    "id": "192.168.1.202-node-exporter",
    "name": "192.168.1.202-node-exporter",
    "address": "192.168.1.202",
    "port": 9100,
    "tags": ["node-exporter"],
    "checks": [{
        "http": "http://192.168.1.202:9100/metrics",
        "interval": "5s"
    }]
}
EOF


cat > ~/prometheus-yml/consul-yml/node-exporter-json/node-exporter-192.168.1.203.json << 'EOF'
{
    "id": "192.168.1.203-node-exporter",
    "name": "192.168.1.203-node-exporter",
    "address": "192.168.1.203",
    "port": 9100,
    "tags": ["node-exporter"],
    "checks": [{
        "http": "http://192.168.1.203:9100/metrics",
        "interval": "5s"
    }]
}
EOF


cat > ~/prometheus-yml/consul-yml/node-exporter-json/node-exporter-192.168.1.204.json << 'EOF'
{
    "id": "192.168.1.204-node-exporter",
    "name": "192.168.1.204-node-exporter",
    "address": "192.168.1.204",
    "port": 9100,
    "tags": ["node-exporter"],
    "checks": [{
        "http": "http://192.168.1.204:9100/metrics",
        "interval": "5s"
    }]
}
EOF


cat > ~/prometheus-yml/consul-yml/node-exporter-json/node-exporter-192.168.1.200.json << 'EOF'
{
    "id": "192.168.1.200-node-exporter",
    "name": "192.168.1.200-node-exporter",
    "address": "192.168.1.200",
    "port": 9100,
    "tags": ["node-exporter"],
    "checks": [{
        "http": "http://192.168.1.200:9100/metrics",
        "interval": "5s"
    }]
}
EOF


# 添加更多 JSON 文件,每个文件包含一个服务的信息
# 批量注册脚本
cat > ~/prometheus-yml/consul-yml/node-exporter-json/register-service.sh << 'EOF'
#!/bin/bash

CONSUL_API="https://consul.huanghuanhui.cloud/v1/agent/service/register"

declare -a SERVICES=(
    "node-exporter-1.15.172.119.json"
    "node-exporter-192.168.1.201.json"
    "node-exporter-192.168.1.202.json"
    "node-exporter-192.168.1.203.json"
    "node-exporter-192.168.1.204.json"
    "node-exporter-192.168.1.200.json"
    # 添加更多 JSON 文件,每个文件包含一个服务的信息
)

for SERVICE_FILE in "${SERVICES[@]}"; do
    curl -X PUT --data @"$SERVICE_FILE" "$CONSUL_API"
done
EOF
# 批量下线脚本
cat > ~/prometheus-yml/consul-yml/node-exporter-json/deregister-service.sh << 'EOF'
#!/bin/bash

CONSUL_API="https://consul.huanghuanhui.cloud/v1/agent/service/deregister"

declare -a SERVICES=(
    "node-exporter-1.15.172.119.json"
    "node-exporter-192.168.1.201.json"
    "node-exporter-192.168.1.202.json"
    "node-exporter-192.168.1.203.json"
    "node-exporter-192.168.1.204.json"
    "node-exporter-192.168.1.200.json"
    # 添加更多 JSON 文件,每个文件包含一个服务的信息
)

for SERVICE_FILE in "${SERVICES[@]}"; do
    SERVICE_ID=$(jq -r .id "$SERVICE_FILE")
    curl -X PUT "$CONSUL_API/$SERVICE_ID"
done
EOF
mkdir -p ~/prometheus-yml/kube-yml

1、对kube-controller-manager的监控

sed -i 's/bind-address=127.0.0.1/bind-address=0.0.0.0/g' /etc/kubernetes/manifests/kube-controller-manager.yaml
cat > ~/prometheus-yml/kube-yml/prometheus-kube-controller-manager-Service.yml << 'EOF'
apiVersion: v1
kind: Service
metadata:
  namespace: kube-system
  name: kube-controller-manager
  labels:
    app.kubernetes.io/name: kube-controller-manager
spec:
  selector:
    component: kube-controller-manager
  ports:
  - name: https-metrics
    port: 10257
    targetPort: 10257
EOF
kubectl apply -f ~/prometheus-yml/kube-yml/prometheus-kube-controller-manager-Service.yml

2、对kube-scheduler的监控

sed -i 's/bind-address=127.0.0.1/bind-address=0.0.0.0/g' /etc/kubernetes/manifests/kube-scheduler.yaml
cat > ~/prometheus-yml/kube-yml/prometheus-kube-scheduler-Service.yml << EOF
apiVersion: v1
kind: Service
metadata:
  namespace: kube-system
  name: kube-scheduler
  labels:
    app.kubernetes.io/name: kube-scheduler
spec:
  selector:
    component: kube-scheduler
  ports:
  - name: https-metrics
    port: 10259  
    targetPort: 10259
EOF
kubectl apply -f ~/prometheus-yml/kube-yml/prometheus-kube-scheduler-Service.yml

3、对kube-proxy的监控

kubectl get configmap kube-proxy -n kube-system -o yaml | \
sed -e 's/metricsBindAddress: ""/metricsBindAddress: "0.0.0.0:10249"/' | \
kubectl diff -f - -n kube-system

kubectl get configmap kube-proxy -n kube-system -o yaml | \
sed -e 's/metricsBindAddress: ""/metricsBindAddress: "0.0.0.0:10249"/' | \
kubectl apply -f - -n kube-system
kubectl rollout restart daemonset kube-proxy -n kube-system
netstat -tnlp |grep kube-proxy

netstat -antp|grep 10249
cat > ~/prometheus-yml/kube-yml/prometheus-kube-proxy-Service.yml << 'EOF'
apiVersion: v1
kind: Service
metadata:
  name: kube-proxy
  namespace: kube-system
  labels:
    k8s-app: kube-proxy
spec:
  selector:
    k8s-app: kube-proxy
  ports:
  - name: https-metrics
    port: 10249
    targetPort: 10249
    protocol: TCP
EOF
kubectl apply -f ~/prometheus-yml/kube-yml/prometheus-kube-proxy-Service.yml

4、对k8s-etcd的监控

sed -i 's/127.0.0.1:2381/0.0.0.0:2381/g' /etc/kubernetes/manifests/etcd.yaml
cat > ~/prometheus-yml/kube-yml/etcd-k8s-master-Service.yml << 'EOF'
apiVersion: v1
kind: Service
metadata:
  name: etcd-k8s
  namespace: kube-system
  labels:
    k8s-app: etcd
spec:
  type: ClusterIP
  clusterIP: None
  ports:
  - name: port
    port: 2381
---
apiVersion: v1
kind: Endpoints
metadata:
  name: etcd-k8s
  namespace: kube-system
  labels:
    k8s-app: etcd
subsets:
- addresses:
  - ip: 192.168.1.200
    nodeName: k8s-01
  ports:
  - name: port
    port: 2381
EOF
kubectl apply -f ~/prometheus-yml/kube-yml/etcd-k8s-master-Service.yml

https://grafana.com/grafana/dashboards/9733-etcd-for-k8s-cn/

模版:9733

2、k8s 手撕方式安装 grafana

cat > ~/prometheus-yml/grafana-ConfigMap.yml << 'EOF'
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-config
  namespace: monitoring
data:
  grafana.ini: |
    [smtp]
    enabled = false
    host = localhost:25
    user =
    password =
    skip_verify = false
    from_address = admin@grafana.localhost
    from_name = Grafana
    [alerting]
    enabled =
    execute_alerts = true
EOF
kubectl apply -f ~/prometheus-yml/grafana-ConfigMap.yml
cat > ~/prometheus-yml/grafana-Deployment.yml << 'EOF'
apiVersion: apps/v1
kind: Deployment
metadata:
  name: grafana
  namespace: monitoring
  labels:
    app: grafana
spec:
  selector:
    matchLabels:
      app: grafana
  template:
    metadata:
      labels:
        app: grafana
    spec:
      securityContext:
        fsGroup: 472
        supplementalGroups:
          - 0
      containers:
        - name: grafana
          image: grafana/grafana:10.2.3
          imagePullPolicy: IfNotPresent
          ports:
          - containerPort: 3000
            name: http-grafana
            protocol: TCP
          env:
          - name: TZ
            value: Asia/Shanghai
          - name: GF_SECURITY_ADMIN_USER
            value: admin
          - name: GF_SECURITY_ADMIN_PASSWORD
            value: Admin@2024
          readinessProbe:
            failureThreshold: 3
            httpGet:
              path: /robots.txt
              port: 3000
              scheme: HTTP
            initialDelaySeconds: 10
            periodSeconds: 30
            successThreshold: 1
            timeoutSeconds: 2
          livenessProbe:
            failureThreshold: 3
            initialDelaySeconds: 30
            periodSeconds: 10
            successThreshold: 1
            tcpSocket:
              port: 3000
            timeoutSeconds: 1
          resources:
            limits:
              cpu: "1"
              memory: "2Gi"
            requests:
              cpu: "0.5"
              memory: "1Gi"
          volumeMounts:
            - mountPath: /var/lib/grafana
              name: grafana-data
            - mountPath: /etc/grafana
              name: config
      volumes:
        - name: grafana-data
          persistentVolumeClaim:
            claimName: grafana-nfs-client-pvc
        - name: config
          configMap:
            name: grafana-config
---
apiVersion: v1
kind:  PersistentVolumeClaim
metadata:
  name: grafana-nfs-client-pvc
  namespace: monitoring
spec:
  storageClassName: nfs-storage
  accessModes: [ReadWriteOnce]
  resources:
    requests:
      storage: 2Ti
---
apiVersion: v1
kind: Service
metadata:
  name: grafana-service
  namespace: monitoring
  labels:
    app: grafana
spec:
  type: NodePort
  ports:
  - nodePort: 31300
    port: 3000
  selector:
    app: grafana
EOF
kubectl apply -f ~/prometheus-yml/grafana-Deployment.yml
cat > ~/prometheus-yml/grafana-Ingress.yml << 'EOF'
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: grafana-ingress
  namespace: monitoring
  annotations:
    nginx.ingress.kubernetes.io/ssl-redirect: 'true'
    nginx.ingress.kubernetes.io/proxy-body-size: '4G'
spec:
  ingressClassName: nginx
  rules:
  - host: grafana.huanghuanhui.cloud
    http:
      paths:
      - path: /
        pathType: Prefix
        backend:
          service:
            name: grafana-service
            port:
              number: 3000

  tls:
  - hosts:
    - grafana.huanghuanhui.cloud
    secretName: grafana-ingress-tls
EOF
kubectl create secret -n monitoring \
tls grafana-ingress-tls \
--key=/root/ssl/huanghuanhui.cloud.key \
--cert=/root/ssl/huanghuanhui.cloud.crt
kubectl apply -f ~/prometheus-yml/grafana-Ingress.yml

访问地址:grafana.huanghuanhui.cloud

模版:8919、12159、13105、9276、12006

3、k8s 手撕方式安装 alertmanager

与qq邮箱集成

cat > ~/prometheus-yml/alertmanager-qq-ConfigMap.yml << 'EOF'
apiVersion: v1
kind: ConfigMap
metadata:
  name: alertmanager-config
  namespace: monitoring
data:
  alertmanager.yml: |-
    global:
      resolve_timeout: 5m
      smtp_smarthost: 'smtp.qq.com:465'
      smtp_from: '1308470940@qq.com'
      smtp_auth_username: '1308470940@qq.com'
      smtp_auth_password: 'kgwsqpzsvhxvjjii'
      smtp_hello: 'qq.com'
      smtp_require_tls: false
    route:
      group_by: ['alertname', 'cluster']
      group_wait: 30s
      group_interval: 5m
      repeat_interval: 5m
      receiver: default
      routes:
      - receiver: email
        group_wait: 10s
        match:
          team: node
    templates:
      - '/etc/config/template/email.tmpl'
    receivers:
    - name: 'default'
      email_configs:
      - to: '1308470940@qq.com'
        html: '{{ template "email.html" . }}'
        headers: { Subject: "[WARN] Prometheus 告警邮件" }
    - name: 'email'
      email_configs:
      - to: '1308470940@qq.com'
        send_resolved: true
EOF

与钉钉集成(为例)

cat > ~/prometheus-yml/alertmanager-webhook-dingtalk-ConfigMap.yml << 'EOF'
apiVersion: v1
kind: ConfigMap
metadata:
  name: alertmanager-config
  namespace: monitoring
data:
  alertmanager.yml: |-
    global:
      resolve_timeout: 5m
    route:
      receiver: webhook
      group_wait: 30s
      group_interval: 5m
      repeat_interval: 4h
      group_by: [alertname]
      routes:
      - receiver: webhook
        group_wait: 10s
        match:
          team: node
    receivers:
    - name: webhook
      webhook_configs:
      - url: 'http://alertmanager-webhook-dingtalk.monitoring.svc.cluster.local:8060/dingtalk/webhook1/send'
        send_resolved: true
EOF
kubectl apply -f ~/prometheus-yml/alertmanager-webhook-dingtalk-ConfigMap.yml
cat > ~/prometheus-yml/alertmanager-Deployment.yaml << 'EOF'
apiVersion: apps/v1
kind: Deployment
metadata:
  name: alertmanager
  namespace: monitoring
spec:
  replicas: 1
  selector:
    matchLabels:
      app: alertmanager
  template:
    metadata:
      labels:
        app: alertmanager
    spec:
      containers:
      - name: alertmanager
        image: prom/alertmanager:v0.26.0
        ports:
        - containerPort: 9093
          name: http
        volumeMounts:
        - name: alertmanager-config
          mountPath: /etc/alertmanager
        - name: alertmanager-data
          mountPath: /alertmanager
        - name: localtime
          mountPath: /etc/localtime
        command:
        - "/bin/alertmanager"
        - "--config.file=/etc/alertmanager/alertmanager.yml"
        - "--storage.path=/alertmanager"
      volumes:
      - name: alertmanager-config
        configMap:
          name: alertmanager-config
      - name: alertmanager-data
        persistentVolumeClaim:
          claimName: alertmanager-nfs-client-pvc
      - name: localtime
        hostPath:
          path: /etc/localtime

---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: alertmanager-nfs-client-pvc
  namespace: monitoring
spec:
  storageClassName: nfs-storage
  accessModes:
  - ReadWriteOnce
  resources:
    requests:
      storage: "20Gi"
EOF
kubectl apply -f ~/prometheus-yml/alertmanager-Deployment.yaml
cat > ~/prometheus-yml/alertmanager-Service.yml << 'EOF'
apiVersion: v1
kind: Service
metadata:
  name: alertmanager-service
  namespace: monitoring
spec:
  selector:
    app: alertmanager
    
  type: NodePort
  ports:
  - name: web
    port: 9093
    targetPort: http
    nodePort: 30093
EOF
kubectl apply -f ~/prometheus-yml/alertmanager-Service.yml
cat > ~/prometheus-yml/alertmanager-Ingress.yml << 'EOF'
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: alertmanager-ingress
  namespace: monitoring
  annotations:
    nginx.ingress.kubernetes.io/ssl-redirect: 'true'
    nginx.ingress.kubernetes.io/proxy-body-size: '4G'
spec:
  ingressClassName: nginx
  rules:
  - host: alertmanager.huanghuanhui.cloud
    http:
      paths:
      - path: /
        pathType: Prefix
        backend:
          service:
            name: alertmanager-service
            port:
              number: 9093

  tls:
  - hosts:
    - prometheus.huanghuanhui.cloud
    secretName: alertmanager-ingress-tls
EOF
kubectl create secret -n monitoring \
tls alertmanager-ingress-tls \
--key=/root/ssl/huanghuanhui.cloud.key \
--cert=/root/ssl/huanghuanhui.cloud.crt
kubectl apply -f ~/prometheus-yml/alertmanager-Ingress.yml

访问地址:alertmanager.huanghuanhui.cloud

钉钉集成:alertmanager-webhook-dingtalk

cat > ~/prometheus-yml/alertmanager-webhook-dingtalk-Deployment.yaml << 'EOF'
apiVersion: v1
kind: ConfigMap
metadata:
  name: alertmanager-webhook-dingtalk
  namespace: monitoring
data:
  config.yaml: |-
    templates:
      - /config/template.tmpl

    targets:
      webhook1:
        url: https://oapi.dingtalk.com/robot/send?access_token=423eedfe3802198314e15f712f0578545b74a44cb982723623db2fb034bdc83e
        secret: SECd3c53fbbb1df76a987a658e0ca759ef371ae955ff731af8945219e99d143d3ae

  # 告警模版(也就是钉钉收到怎样的信息模板)
  template.tmpl: |-
    {{ define "__subject" }}
    [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}]
    {{ end }}


    {{ define "__alert_list" }}{{ range . }}
    ---
    {{ if .Labels.owner }}@{{ .Labels.owner }}{{ end }}

    >- **告警状态 :** {{   .Status }}

    >- **告警级别 :** **{{ .Labels.severity }}**

    >- **告警类型 :** {{ .Labels.alertname }}

    >- **告警主机 :** {{ .Labels.instance }} 

    >- **告警主题 :** {{ .Annotations.summary }}

    >- **告警信息 :** {{ index .Annotations "description" }}

    >- **告警时间 :** {{ dateInZone "2006.01.02 15:04:05" (.StartsAt) "Asia/Shanghai" }}
    {{ end }}{{ end }}

    {{ define "__resolved_list" }}{{ range . }}
    ---
    {{ if .Labels.owner }}@{{ .Labels.owner }}{{ end }}

    >- **告警状态 :** {{   .Status }}

    >- **告警类型 :** {{ .Labels.alertname }}

    >- **告警主机 :** {{ .Labels.instance }} 

    >- **告警主题 :** {{ .Annotations.summary }}

    >- **告警信息 :** {{ index .Annotations "description" }}

    >- **告警时间 :** {{ dateInZone "2006.01.02 15:04:05" (.StartsAt) "Asia/Shanghai" }}

    >- **恢复时间 :** {{ dateInZone "2006.01.02 15:04:05" (.EndsAt) "Asia/Shanghai" }}
    {{ end }}{{ end }}


    {{ define "default.title" }}
    {{ template "__subject" . }}
    {{ end }}

    {{ define "default.content" }}
    {{ if gt (len .Alerts.Firing) 0 }}
    **Prometheus-Alertmanager 监控到{{ .Alerts.Firing | len  }}个故障**
    {{ template "__alert_list" .Alerts.Firing }}
    ---
    {{ end }}

    {{ if gt (len .Alerts.Resolved) 0 }}
    **恢复{{ .Alerts.Resolved | len  }}个故障**
    {{ template "__resolved_list" .Alerts.Resolved }}
    {{ end }}
    {{ end }}


    {{ define "ding.link.title" }}{{ template "default.title" . }}{{ end }}
    {{ define "ding.link.content" }}{{ template "default.content" . }}{{ end }}
    {{ template "default.title" . }}
    {{ template "default.content" . }}

---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: alertmanager-webhook-dingtalk
  namespace: monitoring
spec:
  replicas: 1
  selector:
    matchLabels:
      app: alertmanager-webhook-dingtalk
  template:
    metadata:
      labels:
        app: alertmanager-webhook-dingtalk
    spec:
      volumes:
        - name: config
          configMap:
            name: alertmanager-webhook-dingtalk
      containers:
        - name: alertmanager-webhook-dingtalk
          image: ccr.ccs.tencentyun.com/huanghuanhui/prometheus-alertmanager-webhook-dingtalk:v1
          imagePullPolicy: Always
          args:
            - --web.listen-address=:8060
            - --config.file=/config/config.yaml
          volumeMounts:
            - name: config
              mountPath: /config
          resources:
            limits:
              cpu: 100m
              memory: 100Mi
          ports:
            - name: http
              containerPort: 8060

---
apiVersion: v1
kind: Service
metadata:
  name: alertmanager-webhook-dingtalk
  namespace: monitoring
spec:
  selector:
    app: alertmanager-webhook-dingtalk
  ports:
    - name: http
      port: 8060
      targetPort: http
EOF
kubectl apply -f ~/prometheus-yml/alertmanager-webhook-dingtalk-Deployment.yaml

yml 截图

grafana或条件 grafana的+可以create_prometheus