文章目录
- 引包
- 指标的定义
- 整理
- NewCounter
- NewGauge
- NewHistogram
- Summary
- NewCounterVec
很多时候,我们需要了解一个程序的运行状态,这时就需要用到监控。这里,我们使用的监控系统是 Prometheus。那么监控指标该怎么写呢?笔者找了点资料,写了几个简单的 Demo。
引包
import (
. "flag"
"github.com/gin-gonic/gin"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
"log"
"math/rand"
"strconv"
"sync"
"time"
)
指标的定义
// totalCounterVec counts processed jobs, with one time series per
// (worker_id, type) label pair. Namespace, Subsystem and Name are joined
// with "_", so it is exposed as worker_jobs_processed_total.
totalCounterVec = prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: "worker",
Subsystem: "jobs",
Name: "processed_total",
Help: "Total number of jobs processed by the workers",
},
[]string{"worker_id", "type"},
)
// http_request_total is a plain (label-free) counter of handled HTTP requests.
http_request_total = prometheus.NewCounter(
prometheus.CounterOpts{
Name: "http_request_total" ,
Namespace: "prometheus_front_server",
Help: "The total number of processed http requests",
})
// inflightCounterVec is a gauge (despite the "Counter" in its name) tracking
// jobs currently queued or being processed, partitioned by the type label.
inflightCounterVec = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: "worker",
Subsystem: "jobs",
Name: "inflight",
Help: "Number of jobs inflight",
},
[]string{"type"},
)
// processingTimeVec is a histogram of job processing time in seconds,
// partitioned by worker_id and type. No Buckets are given here.
processingTimeVec = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Namespace: "worker",
Subsystem: "jobs",
Name: "process_time_seconds",
Help: "Amount of time spent processing jobs",
},
[]string{"worker_id", "type"},
)
先来看这三个 vec 的指标:
// track the total number of jobs processed by the worker
totalCounterVec.WithLabelValues(strconv.FormatInt(int64(workerID), 10), job.Type).Inc()
// decrement the inflight tracker
inflightCounterVec.WithLabelValues(job.Type).Dec()
processingTimeVec.WithLabelValues(strconv.FormatInt(int64(workerID), 10), job.Type).Observe(time.Now().Sub(startTime).Seconds())
再来看这个普通的 http_request_total:
// test is a gin handler that increments http_request_total once per call.
// It does not write a response body itself.
func test(ctx *gin.Context){
http_request_total.Inc()
}
现在,我们再来看这个主函数的写法:
// Build a bare gin engine (no default middleware) and wire up the routes.
engine := gin.New()
engine.GET("/test", test)
engine.GET("/metrics", prometheusHttp)
// Run blocks while serving and only returns on failure (for example when the
// port is already in use), so the error must not be dropped silently.
if err := engine.Run("0.0.0.0:9010"); err != nil {
	log.Fatalf("[ERROR] http server stopped: %v", err)
}
prometheusHttp 里面的代码如下:
// 这里我是不是很清楚的,因为我觉得这个 gin 框架应该是可以不这么做
func prometheusHttp(ctx *gin.Context){
handler := promhttp.Handler()
handler.ServeHTTP(ctx.Writer , ctx.Request)
}
整理
NewCounter
我们利用 prometheus 包提供的 NewCounter 方法定义了一个 Counter 类型的监控指标(promauto 包中也有同名方法,区别是它会自动完成注册),只需要填充名字以及帮助信息,该指标就创建完成了。需要注意的是,Counter 类型数据的名字要尽量以 _total 作为后缀。否则当 Prometheus 与其他系统集成时,可能会出现指标无法识别的问题。在上面的例子中,每当有请求访问 /test 时,该指标就会调用 Inc() 方法加一,当然,我们也可以调用 Add() 方法累加任意的非负数。
NewGauge
监控累积的请求处理显然还是不够的,通常我们还想知道当前正在处理的请求的数量。Prometheus中的Gauge类型数据,与Counter不同,它既能增大也能变小。将正在处理的请求数量定义为Gauge类型是合适的。e.g:
// http_request_in_flight is a label-free gauge holding the number of HTTP
// requests that are currently being handled.
http_request_in_flight = prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: "prometheus_front_server" ,
Name: "http_request_in_flight",
Help: "Current number of http requests in flight",
},
)
// test2 is a gin handler that marks the request as in flight while it runs
// and counts it in http_request_total.
func test2(ctx *gin.Context) {
	http_request_in_flight.Inc()
	// Dec (not Desc) undoes the Inc above when the handler returns. Desc only
	// returns the metric descriptor, which would leave the gauge growing forever.
	defer http_request_in_flight.Dec()
	http_request_total.Inc()
}
Gauge和Counter类型的数据操作起来的差别并不大,唯一的区别是Gauge支持Dec()或者Sub()方法减小指标的值。
NewHistogram
对于一个网络服务来说,能够知道它的平均时延是重要的,不过很多时候我们更想知道响应时间的分布状况。Prometheus 中的 Histogram 类型就对此类需求提供了很好的支持。e.g:
// http_request_duration_seconds is a histogram of HTTP request latencies in
// seconds. ConstLabels and Buckets are left nil; with nil Buckets the client
// library falls back to its default bucket layout (prometheus.DefBuckets).
http_request_duration_seconds = prometheus.NewHistogram(
	prometheus.HistogramOpts{
		Namespace:   "prometheus_front_server",
		Subsystem:   "jobs",
		Name:        "http_request_duration_seconds",
		Help:        "Histogram of latencies for HTTP requests",
		ConstLabels: nil,
		Buckets:     nil,
	})
// 测试
http_request_in_flight.Inc()
defer http_request_in_flight.Desc()
http_request_total.Inc()
time.Sleep(time.Duration(rand.Intn(1000)) * time.Millisecond)
http_request_duration_seconds.Observe(time.Since(time.Now()).Seconds())
Summary
Summary 的用法和 Histogram 基本一样:用 prometheus.NewSummary 创建指标,同样通过 Observe() 方法记录每一次的观测值。
NewCounterVec
不过,有的时候,我们可能希望从更多的特征维度去衡量一个指标。例如,对于接收到的HTTP请求的数目,我们可能希望知道具体到每个路径接收到的请求数目。假设当前能够访问 / 和 /foo 目录,显然定义两个不同的 Counter,比如 http_request_root_total和 http_request_foo_total,并不是一个很好的方法。一方面扩展性比较差:如果定义更多的访问路径就需要创建更多新的监控指标,同时,我们定义的特征维度往往不止一个,可能我们想知道某个路径且返回码为XXX的请求数目是多少,这种方法就无能为力了;另一方面,PromQL也无法很好地对这些指标进行聚合分析。
Prometheus对于此类问题的方法是为指标的每个特征维度定义一个label,一个label本质上就是一组键值对。一个指标可以和多个label相关联,而一个指标和一组具体的label可以唯一确定一条时间序列。对于上述分别统计每条路径的请求数目的问题,标准的Prometheus的解决方法如下:e.g:
// totalCounterVec is a counter vector: one counter per distinct combination
// of the label values listed in the second argument.
totalCounterVec = prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: "worker",
Subsystem: "jobs",
Name: "processed_total",
Help: "Total number of jobs processed by the workers",
},
// For the per-path request example discussed in the text, the label set
// would be []string{"path"}; this demo partitions by worker and job type.
// []string{"path"}
[]string{"worker_id", "type"},
)
这里,我们是根据 worker_id 和当前任务的类型 type 这两个 label 来划分维度的:
// Label values are positional: they must be given in the order the vector
// was declared with, i.e. worker_id first, then type.
totalCounterVec.WithLabelValues(workerId, "type").Inc()
后面的几个 Vec 也是和这个的用法一样的。
这里的介绍就这么多,给一个我测试用的 Demo 的源码:
package main
import (
. "flag"
"github.com/gin-gonic/gin"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
"log"
"math/rand"
"strconv"
"sync"
"time"
)
// Package: awesomeProject
// Version: 1.0
//
// Created by SunYang on 2020/5/15 11:14
var (
	// types lists the job kinds a generated job can have; each value is used
	// as the "type" label on the worker metrics.
	types = []string{"email", "deactivation", "activation", "transaction", "customer_renew", "order_processed"}

	// workers is the number of worker goroutines; it is set from the
	// -workers flag registered in init.
	workers = 0

	// totalCounterVec counts processed jobs per (worker_id, type) pair.
	totalCounterVec = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Namespace: "worker",
			Subsystem: "jobs",
			Name:      "processed_total",
			Help:      "Total number of jobs processed by the workers",
		},
		[]string{"worker_id", "type"},
	)

	// http_request_total counts HTTP requests handled by the demo endpoints.
	http_request_total = prometheus.NewCounter(
		prometheus.CounterOpts{
			Name:      "http_request_total",
			Namespace: "prometheus_front_server",
			Help:      "The total number of processed http requests",
		})

	// http_request_in_flight tracks HTTP requests currently being handled.
	http_request_in_flight = prometheus.NewGauge(
		prometheus.GaugeOpts{
			Namespace: "prometheus_front_server",
			Name:      "http_request_in_flight",
			Help:      "Current number of http requests in flight",
		},
	)

	// http_request_duration_seconds is a histogram of HTTP request latencies
	// in seconds. With nil Buckets the client library uses its default
	// bucket layout (prometheus.DefBuckets).
	http_request_duration_seconds = prometheus.NewHistogram(
		prometheus.HistogramOpts{
			Namespace:   "prometheus_front_server",
			Subsystem:   "jobs",
			Name:        "http_request_duration_seconds",
			Help:        "Histogram of latencies for HTTP requests",
			ConstLabels: nil,
			Buckets:     nil,
		})

	// inflightCounterVec is a gauge (despite its name) of jobs that have been
	// created but not yet finished, partitioned by job type.
	inflightCounterVec = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Namespace: "worker",
			Subsystem: "jobs",
			Name:      "inflight",
			Help:      "Number of jobs inflight",
		},
		[]string{"type"},
	)

	// processingTimeVec is a histogram of job processing time in seconds per
	// (worker_id, type) pair.
	processingTimeVec = prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Namespace: "worker",
			Subsystem: "jobs",
			Name:      "process_time_seconds",
			Help:      "Amount of time spent processing jobs",
		},
		[]string{"worker_id", "type"},
	)
)
// init registers the -workers command-line flag (default 10) and binds it to
// the package-level workers variable. IntVar is flag.IntVar; it can be called
// unqualified because the file dot-imports "flag".
func init() {
IntVar(&workers, "workers", 10, "Number of workers to use")
}
// getType returns one entry of types chosen at random.
// rand.Intn is the idiomatic way to draw an index in [0, len(types)) and
// avoids the hand-rolled modulo on rand.Int().
func getType() string {
	return types[rand.Intn(len(types))]
}
// Job describes one unit of simulated work passed from the producer to the workers.
type Job struct {
// Type is the job category; workers use it as the "type" metric label value.
Type string
// Sleep is how long a worker sleeps to simulate processing this job.
Sleep time.Duration
}
// main parses the command-line flags, registers every metric with the default
// Prometheus registry, starts the job producer and the worker pool, and then
// serves the demo endpoints and /metrics on 0.0.0.0:9010.
func main() {
	// Parse is flag.Parse (the file dot-imports "flag").
	Parse()

	// Register all collectors; MustRegister panics if any registration fails.
	prometheus.MustRegister(
		totalCounterVec,
		http_request_total,
		http_request_in_flight,
		http_request_duration_seconds,
		inflightCounterVec,
		processingTimeVec)

	// create a channel with a 10,000 Job buffer
	jobsChannel := make(chan *Job, 10000)
	go startJobProcess(jobsChannel)
	// Start the producer as well: without it the channel stays empty, the
	// workers block forever and none of the worker_jobs_* series are updated.
	go createJobs(jobsChannel)

	engine := gin.New()
	engine.GET("/test", test)
	engine.GET("/test2", test2)
	engine.GET("/metrics", prometheusHttp)
	// Run blocks while serving and only returns on failure (for example when
	// the port is already in use), so report the error instead of dropping it.
	if err := engine.Run("0.0.0.0:9010"); err != nil {
		log.Fatalf("[ERROR] http server stopped: %v", err)
	}
}
func prometheusHttp(ctx *gin.Context){
handler := promhttp.Handler()
handler.ServeHTTP(ctx.Writer , ctx.Request)
}
// test is the gin handler for GET /test. It increments http_request_total
// once per request and does not write a response body itself.
func test(ctx *gin.Context){
http_request_total.Inc()
}
// test2 is the gin handler for GET /test2. It simulates a request that takes
// up to one second and updates three metrics: the in-flight gauge for the
// lifetime of the handler, the request counter, and the latency histogram.
func test2(ctx *gin.Context) {
	// Capture the start time first so the observed duration covers the whole request.
	start := time.Now()

	http_request_in_flight.Inc()
	// Dec (not Desc) undoes the Inc above when the handler returns. Desc only
	// returns the metric descriptor, which would leave the gauge growing forever.
	defer http_request_in_flight.Dec()

	http_request_total.Inc()

	// Fake work: sleep for a random duration in [0, 1000) milliseconds.
	time.Sleep(time.Duration(rand.Intn(1000)) * time.Millisecond)

	// time.Since(time.Now()) would always be ~0, so measure from start instead.
	http_request_duration_seconds.Observe(time.Since(start).Seconds())
}
// startJobProcess launches `workers` goroutines that each consume from jobs
// via startWorker, then blocks until every one of them has returned.
func startJobProcess(jobs <-chan *Job) {
	log.Printf("[INFO] starting %d workers\n", workers)

	var wg sync.WaitGroup
	// Account for all workers up front, before any of them is started.
	wg.Add(workers)

	for id := 0; id < workers; id++ {
		// Pass the index as an argument so each goroutine gets its own copy.
		go func(workerID int) {
			defer wg.Done()
			startWorker(workerID, jobs)
		}(id)
	}

	wg.Wait()
}
// startWorker consumes jobs from the channel until it is closed. For each job
// it sleeps for job.Sleep to simulate the work, logs the elapsed time, and
// updates the processed counter, the in-flight gauge and the processing-time
// histogram.
//
// Ranging over the channel replaces the former single-case for/select: it
// blocks in the same way while the channel is open, but returns cleanly when
// the channel is closed instead of receiving a nil job and dereferencing it.
func startWorker(workerID int, jobs <-chan *Job) {
	// The worker_id label value never changes, so format it once.
	id := strconv.Itoa(workerID)

	for job := range jobs {
		startTime := time.Now()
		// fake processing the request
		time.Sleep(job.Sleep)
		// Measure once so the logged and the observed durations agree.
		elapsed := time.Since(startTime).Seconds()
		log.Printf("[%d][%s] Processed job in %0.3f seconds", workerID, job.Type, elapsed)

		// track the total number of jobs processed by the worker
		totalCounterVec.WithLabelValues(id, job.Type).Inc()
		// decrement the inflight tracker
		inflightCounterVec.WithLabelValues(job.Type).Dec()
		processingTimeVec.WithLabelValues(id, job.Type).Observe(elapsed)
	}
}
// createJobs is the producer loop: it never returns. Each round it builds a
// random job, marks it as in flight, sends it on jobs, and then pauses briefly.
func createJobs(jobs chan<- *Job) {
	// Delay between two jobs so the queue does not pile up too quickly.
	const pause = 5 * time.Millisecond

	for {
		next := makeJob()
		// Count the job as in flight before handing it over to a worker.
		inflightCounterVec.WithLabelValues(next.Type).Inc()
		jobs <- next
		time.Sleep(pause)
	}
}
// makeJob returns a new Job with a random type (from getType) and a random
// simulated duration between 10 and 109 milliseconds inclusive.
// rand.Intn(100) replaces the hand-rolled rand.Int()%100.
func makeJob() *Job {
	return &Job{
		Type:  getType(),
		Sleep: time.Duration(rand.Intn(100)+10) * time.Millisecond,
	}
}