
Explaining Lucene's Scoring Algorithm

Lucene's IndexSearcher provides an explain method that shows how a Document's score was arrived at; every component of the score can be printed out in detail. Here we hand-verify Lucene's scoring algorithm on a Chinese-language example and walk through the relevant Lucene source code along the way.

A real-world example

First, an actual query from our production system: searching multiple fields for "湿疹" (eczema).

{
  "from": 0,
  "size": 1,
  "query": {
    "bool": {
      "filter": [
        {
          "term": {
            "is_delete": {
              "value": false,
              "boost": 1
            }
          }
        },
        {
          "bool": {
            "should": [
              {
                "term": {
                  "item_type": {
                    "value": 1601,
                    "boost": 1
                  }
                }
              }
            ],
            "disable_coord": false,
            "adjust_pure_negative": true,
            "boost": 1
          }
        }
      ],
      "should": [
        {
          "term": {
            "title.keyword": {
              "value": "湿疹",
              "boost": 400
            }
          }
        },
        {
          "term": {
            "title_alias.keyword": {
              "value": "湿疹",
              "boost": 200
            }
          }
        },
        {
          "constant_score": {
            "filter": {
              "query_string": {
                "query": "title.ik_gram:湿疹",
                "fields": [],
                "use_dis_max": true,
                "tie_breaker": 0,
                "default_operator": "or",
                "auto_generate_phrase_queries": false,
                "max_determinized_states": 10000,
                "enable_position_increments": true,
                "fuzziness": "AUTO",
                "fuzzy_prefix_length": 0,
                "fuzzy_max_expansions": 50,
                "phrase_slop": 0,
                "escape": false,
                "split_on_whitespace": true,
                "boost": 1
              }
            },
            "boost": 100
          }
        },
        {
          "constant_score": {
            "filter": {
              "query_string": {
                "query": "title_alias.ik_gram:湿疹",
                "fields": [],
                "use_dis_max": true,
                "tie_breaker": 0,
                "default_operator": "or",
                "auto_generate_phrase_queries": false,
                "max_determinized_states": 10000,
                "enable_position_increments": true,
                "fuzziness": "AUTO",
                "fuzzy_prefix_length": 0,
                "fuzzy_max_expansions": 50,
                "phrase_slop": 0,
                "escape": false,
                "split_on_whitespace": true,
                "boost": 1
              }
            },
            "boost": 50
          }
        },
        {
          "constant_score": {
            "filter": {
              "query_string": {
                "query": "title_sub_title:湿疹",
                "fields": [],
                "use_dis_max": true,
                "tie_breaker": 0,
                "default_operator": "or",
                "auto_generate_phrase_queries": false,
                "max_determinized_states": 10000,
                "enable_position_increments": true,
                "fuzziness": "AUTO",
                "fuzzy_prefix_length": 0,
                "fuzzy_max_expansions": 50,
                "phrase_slop": 0,
                "escape": false,
                "split_on_whitespace": true,
                "boost": 1
              }
            },
            "boost": 50
          }
        },
        {
          "constant_score": {
            "filter": {
              "query_string": {
                "query": "alias_sub_title:湿疹",
                "fields": [],
                "use_dis_max": true,
                "tie_breaker": 0,
                "default_operator": "or",
                "auto_generate_phrase_queries": false,
                "max_determinized_states": 10000,
                "enable_position_increments": true,
                "fuzziness": "AUTO",
                "fuzzy_prefix_length": 0,
                "fuzzy_max_expansions": 50,
                "phrase_slop": 0,
                "escape": false,
                "split_on_whitespace": true,
                "boost": 1
              }
            },
            "boost": 50
          }
        },
        {
          "query_string": {
            "query": "explain_introduce_exists:1",
            "fields": [],
            "use_dis_max": true,
            "tie_breaker": 0,
            "default_operator": "or",
            "auto_generate_phrase_queries": false,
            "max_determinized_states": 10000,
            "enable_position_increments": true,
            "fuzziness": "AUTO",
            "fuzzy_prefix_length": 0,
            "fuzzy_max_expansions": 50,
            "phrase_slop": 0,
            "escape": false,
            "split_on_whitespace": true,
            "boost": 0.8
          }
        },
        {
          "query_string": {
            "query": "-explain_introduce_exists:1",
            "fields": [],
            "use_dis_max": true,
            "tie_breaker": 0,
            "default_operator": "or",
            "auto_generate_phrase_queries": false,
            "max_determinized_states": 10000,
            "enable_position_increments": true,
            "fuzziness": "AUTO",
            "fuzzy_prefix_length": 0,
            "fuzzy_max_expansions": 50,
            "phrase_slop": 0,
            "escape": false,
            "split_on_whitespace": true,
            "boost": 0.5
          }
        }
      ],
      "disable_coord": false,
      "adjust_pure_negative": true,
      "boost": 1
    }
  },
  "min_score": 1,
  "explain": true
}

The full response is not shown here; below is the extracted _explanation field, and we will go through how each item's score is computed.

{
    "value": 500.8,
    "description": "sum of:",
    "details": [
        {
            "value": 400,
            "description": "weight(title.keyword:湿疹 in 1490) [PerFieldSimilarity], result of:",
            "details": [
                {
                    "value": 400,
                    "description": "score(doc=1490,freq=1.0), product of:",
                    "details": [
                        {
                            "value": 400,
                            "description": "queryWeight, product of:",
                            "details": [
                                {
                                    "value": 400,
                                    "description": "boost",
                                    "details": []
                                },
                                {
                                    "value": 1,
                                    "description": "idf, computed as log((docCount+1)/(docFreq+1)) + 1 from:",
                                    "details": [
                                        {
                                            "value": 3,
                                            "description": "docFreq",
                                            "details": []
                                        },
                                        {
                                            "value": 1491,
                                            "description": "docCount",
                                            "details": []
                                        }
                                    ]
                                },
                                {
                                    "value": 1,
                                    "description": "queryNorm",
                                    "details": []
                                }
                            ]
                        },
                        {
                            "value": 1,
                            "description": "fieldWeight in 1490, product of:",
                            "details": [
                                {
                                    "value": 1,
                                    "description": "tf(freq=1.0), with freq of:",
                                    "details": [
                                        {
                                            "value": 1,
                                            "description": "termFreq=1.0",
                                            "details": []
                                        }
                                    ]
                                },
                                {
                                    "value": 1,
                                    "description": "idf, computed as log((docCount+1)/(docFreq+1)) + 1 from:",
                                    "details": [
                                        {
                                            "value": 3,
                                            "description": "docFreq",
                                            "details": []
                                        },
                                        {
                                            "value": 1491,
                                            "description": "docCount",
                                            "details": []
                                        }
                                    ]
                                },
                                {
                                    "value": 1,
                                    "description": "fieldNorm(doc=1490)",
                                    "details": []
                                }
                            ]
                        }
                    ]
                }
            ]
        },
        {
            "value": 100,
            "description": "ConstantScore(title.ik_gram:湿疹), product of:",
            "details": [
                {
                    "value": 100,
                    "description": "boost",
                    "details": []
                },
                {
                    "value": 1,
                    "description": "queryNorm",
                    "details": []
                }
            ]
        },
        {
            "value": 0.8,
            "description": "explain_introduce_exists:[1 TO 1], product of:",
            "details": [
                {
                    "value": 0.8,
                    "description": "boost",
                    "details": []
                },
                {
                    "value": 1,
                    "description": "queryNorm",
                    "details": []
                }
            ]
        },
        {
            "value": 0,
            "description": "match on required clause, product of:",
            "details": [
                {
                    "value": 0,
                    "description": "# clause",
                    "details": []
                },
                {
                    "value": 1,
                    "description": "is_delete:F, product of:",
                    "details": [
                        {
                            "value": 1,
                            "description": "boost",
                            "details": []
                        },
                        {
                            "value": 1,
                            "description": "queryNorm",
                            "details": []
                        }
                    ]
                }
            ]
        },
        {
            "value": 0,
            "description": "match on required clause, product of:",
            "details": [
                {
                    "value": 0,
                    "description": "# clause",
                    "details": []
                },
                {
                    "value": 1,
                    "description": "item_type:[1601 TO 1601], product of:",
                    "details": [
                        {
                            "value": 1,
                            "description": "boost",
                            "details": []
                        },
                        {
                            "value": 1,
                            "description": "queryNorm",
                            "details": []
                        }
                    ]
                }
            ]
        }
    ]
}

Lucene's scoring formula

Let's look at Lucene's scoring formula:

score(q,d) = coord(q,d) × queryNorm(q) × Σ_{t in q} [ tf(t in d) × idf(t)² × t.getBoost() × norm(t,d) ]

The score is the sum, over every term t in the query q, of that term's match score against document d, with the various weighting factors applied. The meaning of each factor is as follows:

  • tf(t in d): frequency of term t in document d
  • idf(t): inverse document frequency of term t
  • t.getBoost(): the search-time boost set on term t in the query
  • norm(t,d): index-time boosts and the field-length norm, folded together
  • coord(q,d): a coordination factor rewarding documents that match more of the query's terms
  • queryNorm(q): a normalization factor that makes scores comparable across queries (it does not affect ranking)

Computing the total match score

For the test above, the final match score is the sum of the query's scores across the individual fields. The final result is 500.8 = 400 + 100 + 0.8 (a quick sanity check of this sum follows the list below):

  • 400: the exact term match on title.keyword
  • 100: the title.ik_gram clause inside constant_score
  • 0.8: explain_introduce_exists is present
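
A minimal sanity check of that sum (the clause values are copied from the explain output above; nothing here is recomputed by Elasticsearch):

public class ScoreSumCheck {
  public static void main(String[] args) {
    // Values reported by the explain output above.
    double titleKeyword = 400.0;            // exact term match on title.keyword
    double titleIkGram = 100.0;             // constant_score clause on title.ik_gram
    double introduceExists = 0.8;           // explain_introduce_exists:1
    double total = titleKeyword + titleIkGram + introduceExists;
    System.out.println(total);              // 500.8, comfortably above min_score = 1
  }
}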
Computing the query's match score in a field

Now let's analyze how the score in each field is computed, taking the 400 for title.keyword as the example, with "湿疹" as the search term. Since this is an exact term match and the field is not analyzed, the result is simply the match score of "湿疹" in title.keyword, which equals the boost of 400 set in the query.

Computing a term's match score in a field
Part 1: the should clauses

These consist of three kinds of clauses:

  • exact term matches on title and title_alias
  • constant_score queries on title.ik_gram, title_alias.ik_gram, title_sub_title and alias_sub_title
  • query_string queries on explain_introduce_exists
Exact term match

Next, let's see how the match score of 400 for "湿疹" in title.keyword is calculated. The query is an exact match and the field is not analyzed, so there is only one term, "湿疹". Its score in the field is score = queryWeight × fieldWeight, i.e. 400 = 400 × 1.

Let's look at the source:

private Explanation explainScore(int doc, Explanation freq, IDFStats stats, NumericDocValues norms) {
  Explanation queryExpl = explainQuery(stats);
  Explanation fieldExpl = explainField(doc, freq, stats, norms);
  if (queryExpl.getValue() == 1f) {
    return fieldExpl;
  }
  return Explanation.match(
      queryExpl.getValue() * fieldExpl.getValue(),
      "score(doc="+doc+",freq="+freq.getValue()+"), product of:",
      queryExpl, fieldExpl);
}

The final match score = queryExpl.getValue() * fieldExpl.getValue().

Computing queryWeight

How queryWeight is computed can be seen in TFIDFSimilarity's explainQuery and in the normalize(float queryNorm, float boost) method reached from TermQuery$TermWeight:

 private Explanation explainQuery(IDFStats stats) {
  List<Explanation> subs = new ArrayList<>();

  Explanation boostExpl = Explanation.match(stats.boost, "boost");
  if (stats.boost != 1.0f)
    subs.add(boostExpl);
  subs.add(stats.idf);

  Explanation queryNormExpl = Explanation.match(stats.queryNorm,"queryNorm");
  subs.add(queryNormExpl);

  return Explanation.match(
      boostExpl.getValue() * stats.idf.getValue() * queryNormExpl.getValue(),
      "queryWeight, product of:", subs);
}



@Override
public void normalize(float queryNorm, float boost) {
  this.boost = boost;
  this.queryNorm = queryNorm;
  queryWeight = queryNorm * boost * idf.getValue();
  value = queryWeight * idf.getValue();         // idf for document
}

queryWeight = queryNorm × boost × idf.getValue(); here the query sets boost = 400, so 400 = 1 × 400 × 1.

Computing idf

idf is the inverse document frequency of the term, i.e. how rare it is across the index. The default implementation is:

public float idf(long docFreq, long docCount) {
    return (float)(Math.log((double)(docCount + 1L) / (double)(docFreq + 1L)) + 1.0D);
}

docFreq is the number of documents that contain the search term; in our test docFreq = 3. docCount is the total number of documents in the index; here docCount = 1491 (strictly speaking, this is the document count of the individual shard). To avoid the skew that per-shard statistics introduce, we use a custom similarity plugin that fixes idf = 1:

@Override
public float idf(long l, long l1) {
    return 1f;
}
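
As a side check, if the stock idf formula were kept, these statistics would give a much larger value. A small sketch of what the default formula would produce (for illustration only; our plugin short-circuits it to 1):

public class IdfCheck {
  public static void main(String[] args) {
    long docFreq = 3;     // documents containing the search term
    long docCount = 1491; // documents in the shard
    // Stock formula: log((docCount + 1) / (docFreq + 1)) + 1
    float idf = (float) (Math.log((double) (docCount + 1) / (double) (docFreq + 1)) + 1.0);
    System.out.println(idf); // ≈ 6.92, versus the fixed 1 returned by our custom similarity
  }
}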

Computing queryNorm

queryNorm is implemented in the DefaultSimilarity class (ClassicSimilarity in newer Lucene versions), as shown below:

@Override
public float queryNorm(float sumOfSquaredWeights) {
  return (float)(1.0 / Math.sqrt(sumOfSquaredWeights));
}

Here, sumOfSquaredWeights is the value returned by getValueForNormalization in org.apache.lucene.search.TermQuery.TermWeight; IndexSearcher.createNormalizedWeight passes it to queryNorm:

/**
 * Creates a normalized weight for a top-level {@link Query}.
 * The query is rewritten by this method and {@link Query#createWeight} called,
 * afterwards the {@link Weight} is normalized. The returned {@code Weight}
 * can then directly be used to get a {@link Scorer}.
 * @lucene.internal
 */
public Weight createNormalizedWeight(Query query, boolean needsScores) throws IOException {
  query = rewrite(query);
  Weight weight = createWeight(query, needsScores);
  float v = weight.getValueForNormalization();
  float norm = getSimilarity(needsScores).queryNorm(v);
  if (Float.isInfinite(norm) || Float.isNaN(norm)) {
    norm = 1.0f;
  }
  weight.normalize(norm, 1.0f);
  return weight;
}

sumOfSquaredWeights=weight.getValueForNormalization()


@Override
public float getValueForNormalization() {
  // TODO: (sorta LUCENE-1907) make non-static class and expose this squaring via a nice method to subclasses?
  return queryWeight * queryWeight;  // sum of squared weights
}

In fact, by default sumOfSquaredWeights = idf × idf, because Lucene's default boost is 1.0.

For the test example above, sumOfSquaredWeights works out to:

sumOfSquaredWeights = 400 × 400 = 160000

Then queryNorm can be computed:

queryNorm = (float)(1.0 / Math.sqrt(400 × 400)) = 0.0025
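
Putting the two steps together, a small sketch of the default-behaviour arithmetic (boost and idf values as in our example; again, our deployment overrides queryNorm anyway):

public class QueryNormCheck {
  public static void main(String[] args) {
    float boost = 400f;
    float idf = 1f;                                // our custom similarity fixes idf to 1
    float rawQueryWeight = boost * idf;            // the value that gets squared for normalization
    float sumOfSquaredWeights = rawQueryWeight * rawQueryWeight; // 160000
    float queryNorm = (float) (1.0 / Math.sqrt(sumOfSquaredWeights));
    System.out.println(queryNorm);                 // 0.0025 under the default queryNorm
  }
}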

In practice we also override queryNorm to simply return 1:

@Override
public float queryNorm(float valueForNormalization) {
    return 1.0F;
}

Computing fieldWeight

In org/apache/lucene/search/similarities/TFIDFSimilarity.java, explainScore delegates to explainField:

private Explanation explainField(int doc, Explanation freq, IDFStats stats, NumericDocValues norms) {
  Explanation tfExplanation = Explanation.match(tf(freq.getValue()), "tf(freq="+freq.getValue()+"), with freq of:", freq);
  Explanation fieldNormExpl = Explanation.match(
      norms != null ? decodeNormValue(norms.get(doc)) : 1.0f,
      "fieldNorm(doc=" + doc + ")");

  return Explanation.match(
      tfExplanation.getValue() * stats.idf.getValue() * fieldNormExpl.getValue(),
      "fieldWeight in " + doc + ", product of:",
      tfExplanation, stats.idf, fieldNormExpl);
}

Expressed as a formula:

fieldWeight = tf × idf × fieldNorm = 1 × 1 × 1 = 1
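
With all the pieces in place (and recalling that, as the next two subsections show, our plugin also fixes tf and fieldNorm to 1), the 400 reported for title.keyword can be reproduced end to end. A compact sketch under those simplifications:

public class TitleKeywordScoreCheck {
  public static void main(String[] args) {
    // Query side
    float boost = 400f, idf = 1f, queryNorm = 1f;
    float queryWeight = queryNorm * boost * idf;   // 400
    // Field side
    float tf = 1f, fieldNorm = 1f;
    float fieldWeight = tf * idf * fieldNorm;      // 1
    System.out.println(queryWeight * fieldWeight); // 400.0, matching the explanation
  }
}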

Computing tf

tf reflects how often the term occurs in the document; the default implementation is:

/** Implemented as <code>sqrt(freq)</code>. */
@Override
public float tf(float freq) {
  return (float)Math.sqrt(freq);
}

Again, we customized this:

@Override
public float tf(float v) {
    return 1f;
}

Computing fieldNorm

The tf and idf calculations were covered above. fieldNorm, by contrast, is fixed at index time and simply read back (and decoded) from the index at query time, so the method above does not show the actual calculation. With ClassicSimilarity it is essentially lengthNorm: the longer the field, the smaller the norm. The code is buried a bit deeper, in MemoryIndex's getNormValues method:

public NumericDocValues getNormValues(String field) {
    MemoryIndex.Info info = (MemoryIndex.Info)MemoryIndex.this.fields.get(field);
    return info != null && !info.fieldInfo.omitsNorms() ? info.getNormDocValues() : null;
}


Its inner class Info calls MemoryIndex.this.normSimilarity.computeNorm(invertState), and TFIDFSimilarity has the corresponding implementation:

NumericDocValues getNormDocValues() {
    if (this.norms == null) {
        FieldInvertState invertState = new FieldInvertState(this.fieldInfo.name, this.fieldInfo.number, this.numTokens, this.numOverlapTokens, 0, this.boost);
        final long value = MemoryIndex.this.normSimilarity.computeNorm(invertState);
        this.norms = new NumericDocValues() {
            public long get(int docID) {
                if (docID != 0) {
                    throw new IndexOutOfBoundsException();
                } else {
                    return value;
                }
            }
        };
    }

    return this.norms;
}

TFIDFSimilarity implements it as:

public abstract float lengthNorm(FieldInvertState state);

@Override
public final long computeNorm(FieldInvertState state) {
  float normValue = lengthNorm(state);
  return encodeNormValue(normValue);
}

ClassicSimilarity extends TFIDFSimilarity and implements lengthNorm():

@Override
public float lengthNorm(FieldInvertState state) {
  final int numTerms;
  if (discountOverlaps)
    numTerms = state.getLength() - state.getNumOverlap();
  else
    numTerms = state.getLength();
  return state.getBoost() * ((float) (1.0 / Math.sqrt(numTerms)));
}

I won't re-verify this one: the final value is the reciprocal of the square root of the number of terms in the field, multiplied by the field's boost. In practice we customized this as well:

@Override
public float lengthNorm(FieldInvertState fieldInvertState) {
    return 1f;
}
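
For intuition about what the stock lengthNorm would do if we had not overridden it (hypothetical numbers, not taken from the example index): a 4-term field with boost 1 gets a norm of 0.5, which ClassicSimilarity then lossily encodes into a single byte via encodeNormValue.

public class LengthNormIntuition {
  public static void main(String[] args) {
    int numTerms = 4;          // hypothetical field length, overlaps already discounted
    float fieldBoost = 1f;     // index-time field boost
    float lengthNorm = fieldBoost * (float) (1.0 / Math.sqrt(numTerms));
    System.out.println(lengthNorm); // 0.5 - the longer the field, the smaller the norm
  }
}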
constant_score

A constant_score query can wrap a query or a filter and assigns every matching document a score of 1, ignoring TF/IDF entirely: term frequency (TF) has no influence on the ranking. Let's look at the calculation in the ConstantScoreWeight source:

@Override
public void normalize(float norm, float boost) {
  this.boost = boost;
  queryNorm = norm;
  queryWeight = queryNorm * boost;
}

At construction time, norm and boost are both initialized to 1:

protected ConstantScoreWeight(Query query) {
  super(query);
  normalize(1f, 1f);
}

In the example we set different boost values for title.ik_gram, title_alias.ik_gram, title_sub_title and alias_sub_title; only title.ik_gram matched, so it contributes its boost of 100.
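
So, with the normalize implementation above, the contribution of the one matching constant-score clause is just its boost (a one-line check):

public class ConstantScoreCheck {
  public static void main(String[] args) {
    float queryNorm = 1f;                   // fixed by the ConstantScoreWeight constructor
    float boost = 100f;                     // boost on the title.ik_gram clause
    System.out.println(queryNorm * boost);  // 100.0, matching "ConstantScore(title.ik_gram:湿疹)" above
  }
}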

query_string

The query_string query supports complex searches such as wildcards and matching across multiple fields. It is strict: if the query string contains any invalid syntax, an error is returned. I won't go through the syntax here; a separate article will cover it later.