Branch data Line data Source code
1 : : /** @file tradweight.cc
2 : : * @brief Xapian::TradWeight class - the "traditional" probabilistic formula
3 : : */
4 : : /* Copyright (C) 2009,2010 Olly Betts
5 : : *
6 : : * This program is free software; you can redistribute it and/or
7 : : * modify it under the terms of the GNU General Public License as
8 : : * published by the Free Software Foundation; either version 2 of the
9 : : * License, or (at your option) any later version.
10 : : *
11 : : * This program is distributed in the hope that it will be useful
12 : : * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 : : * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 : : * GNU General Public License for more details.
15 : : *
16 : : * You should have received a copy of the GNU General Public License
17 : : * along with this program; if not, write to the Free Software
18 : : * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 : : */
20 : :
21 : : #include <config.h>
22 : :
23 : : #include "xapian/weight.h"
24 : :
25 : : #include "debuglog.h"
26 : : #include "omassert.h"
27 : : #include "serialise-double.h"
28 : :
29 : : #include "xapian/error.h"
30 : :
31 : : #include <cmath>
32 : :
33 : : using namespace std;
34 : :
35 : : namespace Xapian {
36 : :
37 : : TradWeight *
38 : 135 : TradWeight::clone() const
39 : : {
40 : 135 : return new TradWeight(param_k);
41 : : }
42 : :
43 : : void
44 : 48 : TradWeight::init(double factor)
45 : : {
46 : 48 : Xapian::doccount tf = get_termfreq();
47 : :
48 : 48 : Xapian::weight tw = 0;
49 [ - + ]: 48 : if (get_rset_size() != 0) {
50 : 0 : Xapian::doccount reltermfreq = get_reltermfreq();
51 : :
52 : : // There can't be more relevant documents indexed by a term than there
53 : : // are documents indexed by that term.
54 : : AssertRel(reltermfreq,<=,tf);
55 : :
56 : : // There can't be more relevant documents indexed by a term than there
57 : : // are relevant documents.
58 : : AssertRel(reltermfreq,<=,get_rset_size());
59 : :
60 : 0 : Xapian::doccount reldocs_not_indexed = get_rset_size() - reltermfreq;
61 : :
62 : : // There can't be more relevant documents not indexed by a term than
63 : : // there are documents not indexed by that term.
64 : : AssertRel(reldocs_not_indexed,<=,get_collection_size() - tf);
65 : :
66 : 0 : Xapian::doccount Q = get_collection_size() - reldocs_not_indexed;
67 : :
68 : 0 : Xapian::doccount nonreldocs_indexed = tf - reltermfreq;
69 : 0 : double numerator = (reltermfreq + 0.5) * (Q - tf + 0.5);
70 : 0 : double denom = (reldocs_not_indexed + 0.5) * (nonreldocs_indexed + 0.5);
71 : 0 : tw = numerator / denom;
72 : : } else {
73 : 48 : tw = (get_collection_size() - tf + 0.5) / (tf + 0.5);
74 : : }
75 : :
76 : : AssertRel(tw,>,0);
77 : :
78 : : // The "official" formula can give a negative termweight in unusual cases
79 : : // (without an RSet, when a term indexes more than half the documents in
80 : : // the database). These negative weights aren't actually helpful, and it
81 : : // is common for implementations to replace them with a small positive
82 : : // weight or similar.
83 : : //
84 : : // Truncating to zero doesn't seem a great approach in practice as it
85 : : // means that some terms in the query can have no affect at all on the
86 : : // ranking, and that some results can have zero weight, both of which
87 : : // are seem surprising.
88 : : //
89 : : // Xapian 1.0.x and earlier adjusted the termweight for any term indexing
90 : : // more than a third of documents, which seems rather "intrusive". That's
91 : : // what the code currently enabled does, but perhaps it would be better to
92 : : // do something else. (FIXME)
93 : : #if 0
94 : : if (rare(tw <= 1.0)) {
95 : : termweight = 0;
96 : : } else {
97 : : termweight = log(tw) * factor;
98 : : }
99 : : #else
100 [ + - ]: 48 : if (tw < 2) tw = tw * 0.5 + 1;
101 : 48 : termweight = log(tw) * factor;
102 : : #endif
103 : :
104 : : LOGVALUE(WTCALC, termweight);
105 : :
106 [ + + ]: 48 : if (param_k == 0) {
107 : : // If param_k is 0 then the document length doesn't affect the weight.
108 : 32 : len_factor = 0;
109 : : } else {
110 : 16 : len_factor = get_average_length();
111 : : // len_factor can be zero if all documents are empty (or the database is
112 : : // empty!)
113 [ + - ]: 16 : if (len_factor != 0) len_factor = param_k / len_factor;
114 : : }
115 : :
116 : : LOGVALUE(WTCALC, len_factor);
117 : 48 : }
118 : :
119 : : string
120 : 1595 : TradWeight::name() const
121 : : {
122 : 1595 : return "Xapian::TradWeight";
123 : : }
124 : :
125 : : string
126 : 25 : TradWeight::serialise() const
127 : : {
128 : 25 : return serialise_double(param_k);
129 : : }
130 : :
131 : : TradWeight *
132 : 19 : TradWeight::unserialise(const string & s) const
133 : : {
134 : 19 : const char *ptr = s.data();
135 : 19 : const char *end = ptr + s.size();
136 : 19 : double k = unserialise_double(&ptr, end);
137 [ - + ]: 19 : if (rare(ptr != end))
138 : 0 : throw Xapian::NetworkError("Extra data in BM25Weight::unserialise()");
139 : 19 : return new TradWeight(k);
140 : : }
141 : :
142 : : Xapian::weight
143 : 5395 : TradWeight::get_sumpart(Xapian::termcount wdf, Xapian::termcount len) const
144 : : {
145 : 5395 : double wdf_double(wdf);
146 : 5395 : return termweight * (wdf_double / (len * len_factor + wdf_double));
147 : : }
148 : :
149 : : Xapian::weight
150 : 3203 : TradWeight::get_maxpart() const
151 : : {
152 : : // FIXME: need to force non-zero wdf_max to stop percentages breaking...
153 : 3203 : double wdf_max(max(get_wdf_upper_bound(), Xapian::termcount(1)));
154 : 3203 : Xapian::termcount doclen_lb = get_doclength_lower_bound();
155 : 3203 : return termweight * (wdf_max / (doclen_lb * len_factor + wdf_max));
156 : : }
157 : :
158 : : Xapian::weight
159 : 0 : TradWeight::get_sumextra(Xapian::termcount) const
160 : : {
161 : 0 : return 0;
162 : : }
163 : :
164 : : Xapian::weight
165 : 48 : TradWeight::get_maxextra() const
166 : : {
167 : 48 : return 0;
168 : : }
169 : :
170 : : }
|