剖析elasticsearch的评分计算过程

  • es搜索结果是怎样的排序的?
  • 准备测试数据
  • 搜索
  • 剖析参数含义
  • 结论


es搜索结果是怎样的排序的?

es的排序准则是相关度:根据搜索关键词计算关键词在一个文档中的得分,得分越高结果越靠前。那么计算的准则是什么?

  1. TF/IDF
  2. BM25
这两种算法在这里我就先不做详细说明,看下图,两种算法的得分趋势图。TF/IDF的得分会随着关键词出现次数的增加而逐渐增高;BM25的得分随着关键词出现次数的增加会趋近一个极限(可以用两个参数进行调节:k1[默认1.2],b[默认0.75])。ES 5.0及以后版本默认使用BM25。

es _score 命中评分 es得分计算_es

准备测试数据

测试索引mapping结构如下

# PUT /es_test
{
  "mappings": {
    "zzq":{
       "properties" : {
          "date" : {
            "type" : "date",
            "format" : "yyyy/MM/dd HH:mm:ss"
          },
          "likes" : {
            "type" : "long"
          },
          "text" : {
            "type" : "text",
            "analyzer": "ik_max_word",
            "fields" : {
              "keyword" : {
                "type" : "keyword",
                "ignore_above" : 256
              }
            }
          },
          "title" : {
            "type" : "text",
            "analyzer": "ik_max_word",
            "fields" : {
              "keyword" : {
                "type" : "keyword",
                "ignore_above" : 256
              }
            }
          },
          "views" : {
            "type" : "long"
          }
        }
    }
  }
}

可以看到默认主分片是5个,每个主分片的副本分片是1个。

GET /es_test/_settings?pretty
{
  "es_test" : {
    "settings" : {
      "index" : {
        "creation_date" : "1564302790660",
        "number_of_shards" : "5",
        "number_of_replicas" : "1",
        "uuid" : "11dagZOrTEyaiCWZluFpdg",
        "version" : {
          "created" : "6050399"
        },
        "provided_name" : "es_test"
      }
    }
  }
}

测试数据如下

"hits" : [
      {
        "_index" : "es_test",
        "_type" : "zzq",
        "_id" : "2",
        "_score" : 1.0,
        "_source" : {
          "title" : "测试数据",
          "text" : "我爱您中国,中国",
          "date" : "2019/07/28 00:00:00",
          "views" : 1000,
          "likes" : 4000
        }
      },
      {
        "_index" : "es_test",
        "_type" : "zzq",
        "_id" : "4",
        "_score" : 1.0,
        "_source" : {
          "title" : "测试数据",
          "text" : "我是中国人,我爱中国。为中国自豪",
          "date" : "2019/07/28 00:00:00",
          "views" : 1000,
          "likes" : 4000
        }
      },
      {
        "_index" : "es_test",
        "_type" : "zzq",
        "_id" : "1",
        "_score" : 1.0,
        "_source" : {
          "title" : "测试数据",
          "text" : "我爱中国",
          "date" : "2019/07/28 00:00:00",
          "views" : 1000,
          "likes" : 4000
        }
      },
      {
        "_index" : "es_test",
        "_type" : "zzq",
        "_id" : "3",
        "_score" : 1.0,
        "_source" : {
          "title" : "测试数据",
          "text" : "中国",
          "date" : "2019/07/28 00:00:00",
          "views" : 1000,
          "likes" : 4000
        }
      }
    ]
  1. 我爱您中国,中国
####分词器 ik_max_word 分词个数 6   “中国”出现的次数2
GET _analyze
{
  "analyzer": "ik_max_word"
  , "text": "我爱您中国,中国"
}
###结果
{
  "tokens" : [
    {
      "token" : "我爱",
      "start_offset" : 0,
      "end_offset" : 2,
      "type" : "CN_WORD",
      "position" : 0
    },
    {
      "token" : "我",
      "start_offset" : 0,
      "end_offset" : 1,
      "type" : "CN_WORD",
      "position" : 1
    },
    {
      "token" : "爱",
      "start_offset" : 1,
      "end_offset" : 2,
      "type" : "CN_CHAR",
      "position" : 2
    },
    {
      "token" : "您",
      "start_offset" : 2,
      "end_offset" : 3,
      "type" : "CN_CHAR",
      "position" : 3
    },
    {
      "token" : "中国",
      "start_offset" : 3,
      "end_offset" : 5,
      "type" : "CN_WORD",
      "position" : 4
    },
    {
      "token" : "中国",
      "start_offset" : 6,
      "end_offset" : 8,
      "type" : "CN_WORD",
      "position" : 5
    }
  ]
}
  2. 我是中国人,我爱中国。为中国自豪
####分词器 ik_max_word 分词个数 14  “中国”出现的次数3
GET _analyze
{
  "analyzer": "ik_max_word"
  , "text": "我是中国人,我爱中国。为中国自豪"
}
###结果
{
  "tokens" : [
    {
      "token" : "我是",
      "start_offset" : 0,
      "end_offset" : 2,
      "type" : "CN_WORD",
      "position" : 0
    },
    {
      "token" : "我",
      "start_offset" : 0,
      "end_offset" : 1,
      "type" : "CN_WORD",
      "position" : 1
    },
    {
      "token" : "是中国人",
      "start_offset" : 1,
      "end_offset" : 5,
      "type" : "CN_WORD",
      "position" : 2
    },
    {
      "token" : "中国人",
      "start_offset" : 2,
      "end_offset" : 5,
      "type" : "CN_WORD",
      "position" : 3
    },
    {
      "token" : "中国",
      "start_offset" : 2,
      "end_offset" : 4,
      "type" : "CN_WORD",
      "position" : 4
    },
    {
      "token" : "国人",
      "start_offset" : 3,
      "end_offset" : 5,
      "type" : "CN_WORD",
      "position" : 5
    },
    {
      "token" : "人",
      "start_offset" : 4,
      "end_offset" : 5,
      "type" : "CN_WORD",
      "position" : 6
    },
    {
      "token" : "我爱",
      "start_offset" : 6,
      "end_offset" : 8,
      "type" : "CN_WORD",
      "position" : 7
    },
    {
      "token" : "我",
      "start_offset" : 6,
      "end_offset" : 7,
      "type" : "CN_WORD",
      "position" : 8
    },
    {
      "token" : "爱",
      "start_offset" : 7,
      "end_offset" : 8,
      "type" : "CN_CHAR",
      "position" : 9
    },
    {
      "token" : "中国",
      "start_offset" : 8,
      "end_offset" : 10,
      "type" : "CN_WORD",
      "position" : 10
    },
    {
      "token" : "为",
      "start_offset" : 11,
      "end_offset" : 12,
      "type" : "CN_CHAR",
      "position" : 11
    },
    {
      "token" : "中国",
      "start_offset" : 12,
      "end_offset" : 14,
      "type" : "CN_WORD",
      "position" : 12
    },
    {
      "token" : "自豪",
      "start_offset" : 14,
      "end_offset" : 16,
      "type" : "CN_WORD",
      "position" : 13
    }
  ]
}
  3. 我爱中国
####分词器 ik_max_word 分词个数 4  “中国”出现的次数1
GET _analyze
{
 "analyzer": "ik_max_word"
 , "text": " 我爱中国"
}
###结果
{
 "tokens" : [
   {
     "token" : "我爱",
     "start_offset" : 1,
     "end_offset" : 3,
     "type" : "CN_WORD",
     "position" : 0
   },
   {
     "token" : "我",
     "start_offset" : 1,
     "end_offset" : 2,
     "type" : "CN_WORD",
     "position" : 1
   },
   {
     "token" : "爱",
     "start_offset" : 2,
     "end_offset" : 3,
     "type" : "CN_CHAR",
     "position" : 2
   },
   {
     "token" : "中国",
     "start_offset" : 3,
     "end_offset" : 5,
     "type" : "CN_WORD",
     "position" : 3
   }
 ]
}
  4. 中国
####分词器 ik_max_word 分词个数 1  “中国”出现的次数1
GET _analyze
{
  "analyzer": "ik_max_word"
  , "text": "中国"
}
###结果
{
  "tokens" : [
    {
      "token" : "中国",
      "start_offset" : 1,
      "end_offset" : 3,
      "type" : "CN_WORD",
      "position" : 0
    }
  ]
}

分词器 ik_max_word
我爱您中国,中国 (分词个数 6 “中国”出现的次数2)
我是中国人,我爱中国。为中国自豪(分词个数 14 “中国”出现的次数3)
我爱中国(分词个数 4 “中国”出现的次数1)
中国(分词个数 1 “中国”出现的次数1)

搜索

使用 “中国” 做搜索词 explain开启

GET es_test/zzq/_search
{
  "explain": true, #此参数可以打印出算分过程
  "query": {
    "match": {
      "text": {
        "query": "中国",
        "analyzer": "ik_max_word"
      }
    }
  }
}
####结果
"hits" : [
      {
        "_shard" : "[es_test][3]",
        "_node" : "DIMbjVqATrW0c7QifsaiFw",
        "_index" : "es_test",
        "_type" : "zzq",
        "_id" : "1",
        "_score" : 0.2876821,
        "_source" : {
          "title" : "测试数据",
          "text" : "我爱中国",
          "date" : "2019/07/28 00:00:00",
          "views" : 1000,
          "likes" : 4000
        },
        "_explanation" : {
          "value" : 0.2876821,
          "description" : "weight(text:中国 in 0) [PerFieldSimilarity], result of:",
          "details" : [
            {
              "value" : 0.2876821,
              "description" : "score(doc=0,freq=1.0 = termFreq=1.0\n), product of:",
              "details" : [
                {
                  "value" : 0.2876821,
                  "description" : "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:",
                  "details" : [
                    {
                      "value" : 1.0,
                      "description" : "docFreq",
                      "details" : [ ]
                    },
                    {
                      "value" : 1.0,
                      "description" : "docCount",
                      "details" : [ ]
                    }
                  ]
                },
                {
                  "value" : 1.0,
                  "description" : "tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:",
                  "details" : [
                    {
                      "value" : 1.0,
                      "description" : "termFreq=1.0",
                      "details" : [ ]
                    },
                    {
                      "value" : 1.2,
                      "description" : "parameter k1",
                      "details" : [ ]
                    },
                    {
                      "value" : 0.75,
                      "description" : "parameter b",
                      "details" : [ ]
                    },
                    {
                      "value" : 4.0,
                      "description" : "avgFieldLength",
                      "details" : [ ]
                    },
                    {
                      "value" : 4.0,
                      "description" : "fieldLength",
                      "details" : [ ]
                    }
                  ]
                }
              ]
            }
          ]
        }
      },
      {
        "_shard" : "[es_test][4]",
        "_node" : "DIMbjVqATrW0c7QifsaiFw",
        "_index" : "es_test",
        "_type" : "zzq",
        "_id" : "3",
        "_score" : 0.2876821,
        "_source" : {
          "title" : "测试数据",
          "text" : "中国",
          "date" : "2019/07/28 00:00:00",
          "views" : 1000,
          "likes" : 4000
        },
        "_explanation" : {
          "value" : 0.2876821,
          "description" : "weight(text:中国 in 0) [PerFieldSimilarity], result of:",
          "details" : [
            {
              "value" : 0.2876821,
              "description" : "score(doc=0,freq=1.0 = termFreq=1.0\n), product of:",
              "details" : [
                {
                  "value" : 0.2876821,
                  "description" : "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:",
                  "details" : [
                    {
                      "value" : 1.0,
                      "description" : "docFreq",
                      "details" : [ ]
                    },
                    {
                      "value" : 1.0,
                      "description" : "docCount",
                      "details" : [ ]
                    }
                  ]
                },
                {
                  "value" : 1.0,
                  "description" : "tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:",
                  "details" : [
                    {
                      "value" : 1.0,
                      "description" : "termFreq=1.0",
                      "details" : [ ]
                    },
                    {
                      "value" : 1.2,
                      "description" : "parameter k1",
                      "details" : [ ]
                    },
                    {
                      "value" : 0.75,
                      "description" : "parameter b",
                      "details" : [ ]
                    },
                    {
                      "value" : 1.0,
                      "description" : "avgFieldLength",
                      "details" : [ ]
                    },
                    {
                      "value" : 1.0,
                      "description" : "fieldLength",
                      "details" : [ ]
                    }
                  ]
                }
              ]
            }
          ]
        }
      },
      {
        "_shard" : "[es_test][2]",
        "_node" : "DIMbjVqATrW0c7QifsaiFw",
        "_index" : "es_test",
        "_type" : "zzq",
        "_id" : "2",
        "_score" : 0.28247002,
        "_source" : {
          "title" : "测试数据",
          "text" : "我爱您中国,中国",
          "date" : "2019/07/28 00:00:00",
          "views" : 1000,
          "likes" : 4000
        },
        "_explanation" : {
          "value" : 0.28247002,
          "description" : "weight(text:中国 in 0) [PerFieldSimilarity], result of:",
          "details" : [
            {
              "value" : 0.28247002,
              "description" : "score(doc=0,freq=2.0 = termFreq=2.0\n), product of:",
              "details" : [
                {
                  "value" : 0.18232156,
                  "description" : "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:",
                  "details" : [
                    {
                      "value" : 2.0,
                      "description" : "docFreq",
                      "details" : [ ]
                    },
                    {
                      "value" : 2.0,
                      "description" : "docCount",
                      "details" : [ ]
                    }
                  ]
                },
                {
                  "value" : 1.5492958,
                  "description" : "tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:",
                  "details" : [
                    {
                      "value" : 2.0,
                      "description" : "termFreq=2.0",
                      "details" : [ ]
                    },
                    {
                      "value" : 1.2,
                      "description" : "parameter k1",
                      "details" : [ ]
                    },
                    {
                      "value" : 0.75,
                      "description" : "parameter b",
                      "details" : [ ]
                    },
                    {
                      "value" : 10.0,
                      "description" : "avgFieldLength",
                      "details" : [ ]
                    },
                    {
                      "value" : 6.0,
                      "description" : "fieldLength",
                      "details" : [ ]
                    }
                  ]
                }
              ]
            }
          ]
        }
      },
      {
        "_shard" : "[es_test][2]",
        "_node" : "DIMbjVqATrW0c7QifsaiFw",
        "_index" : "es_test",
        "_type" : "zzq",
        "_id" : "4",
        "_score" : 0.2638865,
        "_source" : {
          "title" : "测试数据",
          "text" : "我是中国人,我爱中国。为中国自豪",
          "date" : "2019/07/28 00:00:00",
          "views" : 1000,
          "likes" : 4000
        },
        "_explanation" : {
          "value" : 0.26388648,
          "description" : "weight(text:中国 in 0) [PerFieldSimilarity], result of:",
          "details" : [
            {
              "value" : 0.26388648,
              "description" : "score(doc=0,freq=3.0 = termFreq=3.0\n), product of:",
              "details" : [
                {
                  "value" : 0.18232156,
                  "description" : "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:",
                  "details" : [
                    {
                      "value" : 2.0,
                      "description" : "docFreq",
                      "details" : [ ]
                    },
                    {
                      "value" : 2.0,
                      "description" : "docCount",
                      "details" : [ ]
                    }
                  ]
                },
                {
                  "value" : 1.4473685,
                  "description" : "tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:",
                  "details" : [
                    {
                      "value" : 3.0,
                      "description" : "termFreq=3.0",
                      "details" : [ ]
                    },
                    {
                      "value" : 1.2,
                      "description" : "parameter k1",
                      "details" : [ ]
                    },
                    {
                      "value" : 0.75,
                      "description" : "parameter b",
                      "details" : [ ]
                    },
                    {
                      "value" : 10.0,
                      "description" : "avgFieldLength",
                      "details" : [ ]
                    },
                    {
                      "value" : 14.0,
                      "description" : "fieldLength",
                      "details" : [ ]
                    }
                  ]
                }
              ]
            }
          ]
        }
      }
    ]

剖析参数含义

{
        "_shard" : "[es_test][3]"
        }

_shard 代表文档所在的分片,通过检索结果我们来整理一下数据

| 文档内容 | 分词数 | 所在分片 | “中国”出现次数 |
| --- | --- | --- | --- |
| 我爱您中国,中国 | 6 | 2 | 2 |
| 我是中国人,我爱中国。为中国自豪 | 14 | 2 | 3 |
| 我爱中国 | 4 | 3 | 1 |
| 中国 | 1 | 4 | 1 |

分片文档数

| 分片 | 文档数 |
| --- | --- |
| 2 | 2 |
| 3 | 1 |
| 4 | 1 |

选取其中一个结果进行解析,各参数的说明见json中的注释

{
        "_shard" : "[es_test][3]",//文档所在的分片
        "_node" : "DIMbjVqATrW0c7QifsaiFw",
        "_index" : "es_test",
        "_type" : "zzq",
        "_id" : "1",
        "_score" : 0.2876821,//得分
        "_source" : {
          "title" : "测试数据",
          "text" : "我爱中国",
          "date" : "2019/07/28 00:00:00",
          "views" : 1000,
          "likes" : 4000
        },
        "_explanation" : {
          "value" : 0.2876821,
          "description" : "weight(text:中国 in 0) [PerFieldSimilarity], result of:",
          "details" : [
            {
              "value" : 0.2876821,
              "description" : "score(doc=0,freq=1.0 = termFreq=1.0\n), product of:",
              "details" : [
                {//idf
                  "value" : 0.2876821,
                  "description" : "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:",
                  "details" : [
                    {
                      "value" : 1.0,
                      "description" : "docFreq",//docFreq=1
                      "details" : [ ]
                    },
                    {
                      "value" : 1.0,
                      "description" : "docCount",//docCount=1
                      "details" : [ ]
                    }
                  ]
                },
                {//tfNorm
                  "value" : 1.0,
                  "description" : "tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:",
                  "details" : [
                    {
                      "value" : 1.0,
                      "description" : "termFreq=1.0",//termFreq=1
                      "details" : [ ]
                    },
                    {
                      "value" : 1.2,
                      "description" : "parameter k1",//k1=1.2
                      "details" : [ ]
                    },
                    {
                      "value" : 0.75,
                      "description" : "parameter b",//b=0.75
                      "details" : [ ]
                    },
                    {
                      "value" : 4.0,
                      "description" : "avgFieldLength",//avgFieldLength=4.0
                      "details" : [ ]
                    },
                    {
                      "value" : 4.0,
                      "description" : "fieldLength",//fieldLength=4.0
                      "details" : [ ]
                    }
                  ]
                }
              ]
            }
          ]
        }
      }

idf=log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) =log(1 + 0.5 / 1.5)≈0.2876821(log为自然对数)
docFreq=1
docCount=1

tfNorm=(freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength))=1
termFreq=1
k1=1.2
b=0.75
avgFieldLength=4.0
fieldLength=4.0

_score=idf * tfNorm=0.2876821 * 1=0.2876821
分析过程这里就不细致的描述,直接看结论即可。

结论

docFreq:当前分片包含关键词的文档个数
docCount:当前分片的文档总数(例如分片2中有2个文档,docCount=2)

termFreq:当前文档分词后的terms匹配关键词的个数
k1、b 是BM25算法的两个调节因子,文章开头已经简述
avgFieldLength:当前分片的平均字段长度(当前分片所有文档的分词个数之和/当前分片总文档数,例如分片2为(6+14)/2=10)
fieldLength:本文档的分词数

BM25 IDF函数图像