Branch data Line data Source code
1 : : /** @file bm25weight.cc
2 : : * @brief Xapian::BM25Weight class - the BM25 probabilistic formula
3 : : */
4 : : /* Copyright (C) 2009,2010 Olly Betts
5 : : *
6 : : * This program is free software; you can redistribute it and/or
7 : : * modify it under the terms of the GNU General Public License as
8 : : * published by the Free Software Foundation; either version 2 of the
9 : : * License, or (at your option) any later version.
10 : : *
11 : : * This program is distributed in the hope that it will be useful
12 : : * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 : : * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 : : * GNU General Public License for more details.
15 : : *
16 : : * You should have received a copy of the GNU General Public License
17 : : * along with this program; if not, write to the Free Software
18 : : * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 : : */
20 : :
21 : : #include <config.h>
22 : :
23 : : #include "xapian/weight.h"
24 : :
25 : : #include "debuglog.h"
26 : : #include "omassert.h"
27 : : #include "serialise-double.h"
28 : :
29 : : #include "xapian/error.h"
30 : :
31 : : #include <algorithm>
32 : : #include <cmath>
33 : :
34 : : using namespace std;
35 : :
36 : : namespace Xapian {
37 : :
38 : : BM25Weight *
39 : 718057 : BM25Weight::clone() const
40 : : {
41 : : return new BM25Weight(param_k1, param_k2, param_k3, param_b,
42 : 718057 : param_min_normlen);
43 : : }
44 : :
45 : : void
46 : 472492 : BM25Weight::init(double factor)
47 : : {
48 : 472492 : Xapian::doccount tf = get_termfreq();
49 : :
50 : 472492 : Xapian::weight tw = 0;
51 [ + + ]: 472492 : if (get_rset_size() != 0) {
52 : 250 : Xapian::doccount reltermfreq = get_reltermfreq();
53 : :
54 : : // There can't be more relevant documents indexed by a term than there
55 : : // are documents indexed by that term.
56 : : AssertRel(reltermfreq,<=,tf);
57 : :
58 : : // There can't be more relevant documents indexed by a term than there
59 : : // are relevant documents.
60 : : AssertRel(reltermfreq,<=,get_rset_size());
61 : :
62 : 250 : Xapian::doccount reldocs_not_indexed = get_rset_size() - reltermfreq;
63 : :
64 : : // There can't be more relevant documents not indexed by a term than
65 : : // there are documents not indexed by that term.
66 : : AssertRel(reldocs_not_indexed,<=,get_collection_size() - tf);
67 : :
68 : 250 : Xapian::doccount Q = get_collection_size() - reldocs_not_indexed;
69 : :
70 : 250 : Xapian::doccount nonreldocs_indexed = tf - reltermfreq;
71 : 250 : double numerator = (reltermfreq + 0.5) * (Q - tf + 0.5);
72 : 250 : double denom = (reldocs_not_indexed + 0.5) * (nonreldocs_indexed + 0.5);
73 : 250 : tw = numerator / denom;
74 : : } else {
75 : 472242 : tw = (get_collection_size() - tf + 0.5) / (tf + 0.5);
76 : : }
77 : :
78 : : AssertRel(tw,>,0);
79 : :
80 : : // The "official" formula can give a negative termweight in unusual cases
81 : : // (without an RSet, when a term indexes more than half the documents in
82 : : // the database). These negative weights aren't actually helpful, and it
83 : : // is common for implementations to replace them with a small positive
84 : : // weight or similar.
85 : : //
86 : : // Truncating to zero doesn't seem a great approach in practice as it
87 : : // means that some terms in the query can have no affect at all on the
88 : : // ranking, and that some results can have zero weight, both of which
89 : : // are seem surprising.
90 : : //
91 : : // Xapian 1.0.x and earlier adjusted the termweight for any term indexing
92 : : // more than a third of documents, which seems rather "intrusive". That's
93 : : // what the code currently enabled does, but perhaps it would be better to
94 : : // do something else. (FIXME)
95 : : #if 0
96 : : if (rare(tw <= 1.0)) {
97 : : termweight = 0;
98 : : } else {
99 : : termweight = log(tw) * factor;
100 : : if (param_k3 != 0) {
101 : : double wqf_double = get_wqf();
102 : : termweight *= (param_k3 + 1) * wqf_double / (param_k3 + wqf_double);
103 : : }
104 : : }
105 : : #else
106 [ + + ]: 472492 : if (tw < 2) tw = tw * 0.5 + 1;
107 : 472492 : termweight = log(tw) * factor;
108 [ + + ]: 472492 : if (param_k3 != 0) {
109 : 472476 : double wqf_double = get_wqf();
110 : 472476 : termweight *= (param_k3 + 1) * wqf_double / (param_k3 + wqf_double);
111 : : }
112 : : #endif
113 : :
114 : : LOGVALUE(WTCALC, termweight);
115 : :
116 [ + + ][ - + ]: 472492 : if (param_b == 0 || param_k1 == 0) {
117 : : // If either param_b or param_k1 is 0 then the document length doesn't
118 : : // affect the weight.
119 : 16 : len_factor = 0;
120 : : } else {
121 : 472476 : len_factor = get_average_length();
122 : : // len_factor can be zero if all documents are empty (or the database
123 : : // is empty!)
124 [ + + ]: 472476 : if (len_factor != 0) len_factor = 1 / len_factor;
125 : : }
126 : :
127 : : LOGVALUE(WTCALC, len_factor);
128 : 472492 : }
129 : :
130 : : string
131 : 5672 : BM25Weight::name() const
132 : : {
133 : 5672 : return "Xapian::BM25Weight";
134 : : }
135 : :
136 : : string
137 : 4099 : BM25Weight::serialise() const
138 : : {
139 : 4099 : string result = serialise_double(param_k1);
140 : 4099 : result += serialise_double(param_k2);
141 : 4099 : result += serialise_double(param_k3);
142 : 4099 : result += serialise_double(param_b);
143 : 4099 : result += serialise_double(param_min_normlen);
144 : 0 : return result;
145 : : }
146 : :
147 : : BM25Weight *
148 : 4087 : BM25Weight::unserialise(const string & s) const
149 : : {
150 : 4087 : const char *ptr = s.data();
151 : 4087 : const char *end = ptr + s.size();
152 : 4087 : double k1 = unserialise_double(&ptr, end);
153 : 4087 : double k2 = unserialise_double(&ptr, end);
154 : 4087 : double k3 = unserialise_double(&ptr, end);
155 : 4087 : double b = unserialise_double(&ptr, end);
156 : 4087 : double min_normlen = unserialise_double(&ptr, end);
157 [ - + ]: 4087 : if (rare(ptr != end))
158 : 0 : throw Xapian::NetworkError("Extra data in BM25Weight::unserialise()");
159 : 4087 : return new BM25Weight(k1, k2, k3, b, min_normlen);
160 : : }
161 : :
162 : : Xapian::weight
163 : 66002153 : BM25Weight::get_sumpart(Xapian::termcount wdf, Xapian::termcount len) const
164 : : {
165 : : LOGCALL(WTCALC, Xapian::weight, "BM25Weight::get_sumpart", wdf | len);
166 : 66002153 : Xapian::doclength normlen = max(len * len_factor, param_min_normlen);
167 : :
168 : 66002153 : double wdf_double(wdf);
169 : 66002153 : double denom = param_k1 * (normlen * param_b + (1 - param_b)) + wdf_double;
170 : : AssertRel(denom,>,0);
171 : 66002153 : RETURN(termweight * (param_k1 + 1) * (wdf_double / denom));
172 : : }
173 : :
174 : : Xapian::weight
175 : 4288453 : BM25Weight::get_maxpart() const
176 : : {
177 : : LOGCALL(WTCALC, Xapian::weight, "BM25Weight::get_maxpart", NO_ARGS);
178 : : Xapian::doclength normlen_lb = max(get_doclength_lower_bound() * len_factor,
179 : 4288453 : param_min_normlen);
180 : 4288453 : double wdf_max(get_wdf_upper_bound());
181 : 4288453 : double denom = param_k1 * (normlen_lb * param_b + (1 - param_b)) + wdf_max;
182 : : AssertRel(denom,>,0);
183 : 4288453 : RETURN(termweight * (param_k1 + 1) * (wdf_max / denom));
184 : : }
185 : :
186 : : /* The BM25 formula gives:
187 : : *
188 : : * param_k2 * query_length * (1 - normlen) / (1 + normlen)
189 : : *
190 : : * To avoid negative sumextra we add the constant (param_k2 * query_length)
191 : : * to give:
192 : : *
193 : : * 2 * param_k2 * query_length / (1 + normlen)
194 : : */
195 : : Xapian::weight
196 : 26 : BM25Weight::get_sumextra(Xapian::termcount len) const
197 : : {
198 : : LOGCALL(WTCALC, Xapian::weight, "BM25Weight::get_sumextra", len);
199 : 26 : Xapian::weight num = (2.0 * param_k2 * get_query_length());
200 : 26 : RETURN(num / (1.0 + max(len * len_factor, param_min_normlen)));
201 : : }
202 : :
203 : : Xapian::weight
204 : 245552 : BM25Weight::get_maxextra() const
205 : : {
206 : : LOGCALL(WTCALC, Xapian::weight, "BM25Weight::get_maxextra", NO_ARGS);
207 : 245552 : Xapian::weight num = (2.0 * param_k2 * get_query_length());
208 : 245552 : RETURN(num / (1.0 + max(double(get_doclength_lower_bound()),
209 : : param_min_normlen)));
210 : : }
211 : :
212 : : }
|