Branch data Line data Source code
1 : : /* phrasepostlist.cc: Return only items where terms are near or form a phrase
2 : : *
3 : : * Copyright 1999,2000,2001 BrightStation PLC
4 : : * Copyright 2002 Ananova Ltd
5 : : * Copyright 2002,2003,2004,2007,2008,2009 Olly Betts
6 : : * Copyright 2009 Lemur Consulting Ltd
7 : : *
8 : : * This program is free software; you can redistribute it and/or
9 : : * modify it under the terms of the GNU General Public License as
10 : : * published by the Free Software Foundation; either version 2 of the
11 : : * License, or (at your option) any later version.
12 : : *
13 : : * This program is distributed in the hope that it will be useful,
14 : : * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 : : * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 : : * GNU General Public License for more details.
17 : : *
18 : : * You should have received a copy of the GNU General Public License
19 : : * along with this program; if not, write to the Free Software
20 : : * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
21 : : * USA
22 : : */
23 : :
24 : : #include <config.h>
25 : : #include "phrasepostlist.h"
26 : :
27 : : #include "debuglog.h"
28 : : #include "positionlist.h"
29 : : #include "omassert.h"
30 : : #include "str.h"
31 : :
32 : : #include <algorithm>
33 : :
34 : : /** Class providing an operator which returns true if a has a (strictly)
35 : : * smaller number of postings than b.
36 : : */
37 : : class PositionListCmpLt {
38 : : public:
39 : : /** Return true if and only if a is strictly shorter than b.
40 : : */
41 : 8307 : bool operator()(const PositionList *a, const PositionList *b) {
42 : 8307 : return a->get_size() < b->get_size();
43 : : }
44 : : };
45 : :
46 : :
47 : : /** Check if terms occur sufficiently close together in the current doc
48 : : */
49 : : bool
50 : 1287 : NearPostList::test_doc()
51 : : {
52 : : LOGCALL(MATCH, bool, "NearPostList::test_doc", NO_ARGS);
53 : 1287 : std::vector<PositionList *> plists;
54 : :
55 : 1287 : std::vector<PostList *>::const_iterator i;
56 [ + + ]: 4862 : for (i = terms.begin(); i != terms.end(); i++) {
57 : 3575 : PositionList * p = (*i)->read_position_list();
58 : : // If p is NULL, the backend doesn't support positionlists
59 [ - + ]: 3575 : if (!p) return false;
60 : 3575 : plists.push_back(p);
61 : : }
62 : :
63 : 1287 : std::sort(plists.begin(), plists.end(), PositionListCmpLt());
64 : :
65 : : Xapian::termpos pos;
66 [ + + ]: 1287 : do {
67 : 1586 : plists[0]->next();
68 [ + + ]: 1586 : if (plists[0]->at_end()) RETURN(false);
69 : 1287 : pos = plists[0]->get_position();
70 : : } while (!do_test(plists, 1, pos, pos));
71 : :
72 : 1287 : RETURN(true);
73 : : }
74 : :
75 : : bool
76 : 2093 : NearPostList::do_test(std::vector<PositionList *> &plists, Xapian::termcount i,
77 : : Xapian::termcount min, Xapian::termcount max)
78 : : {
79 : : LOGCALL(MATCH, bool, "NearPostList::do_test", plists | i | min | max);
80 : : LOGLINE(MATCH, "docid = " << get_docid() << ", window = " << window);
81 : 2093 : Xapian::termcount tmp = max + 1;
82 : : // take care to avoid underflow
83 [ + + ]: 2093 : if (window <= tmp) tmp -= window; else tmp = 0;
84 : 2093 : plists[i]->skip_to(tmp);
85 [ + + ]: 2093 : while (!plists[i]->at_end()) {
86 : 2080 : Xapian::termpos pos = plists[i]->get_position();
87 : : LOGLINE(MATCH, "[" << i << "]: " << max - window + 1 << " " << min <<
88 : : " " << pos << " " << max << " " << min + window - 1);
89 [ + + ]: 2080 : if (pos > min + window - 1) RETURN(false);
90 [ + + ]: 1794 : if (i + 1 == plists.size()) RETURN(true);
91 [ + + ]: 806 : if (pos < min) min = pos;
92 [ + - ]: 533 : else if (pos > max) max = pos;
93 [ + - ]: 806 : if (do_test(plists, i + 1, min, max)) RETURN(true);
94 : 0 : plists[i]->next();
95 : : }
96 : 2093 : RETURN(false);
97 : : }
98 : :
99 : : Xapian::termcount
100 : 0 : NearPostList::get_wdf() const
101 : : {
102 : : // Calculate an estimate for the wdf of a near postlist.
103 : : //
104 : : // The natural definition of the wdf for a near postlist is the number of
105 : : // times the terms occur in a group within the windowsize in the document -
106 : : // in other words, the number of times a routine like do_test() could find
107 : : // a match, if it kept going after finding the first one. However,
108 : : // calculating this would be expensive (in CPU time at least - the position
109 : : // lists are probably already read from disk), and the benefit is dubious
110 : : // (given that the estimate here is probably only going to be used for
111 : : // calculating the weight for synonym postlists, and it's not clear that
112 : : // the above definition is exactly appropriate in this situation, anyway).
113 : : //
114 : : // Instead, we could do an estimate of this value, based on the lengths
115 : : // of the position lists. Rather than having to open the position lists,
116 : : // we could use the wdfs, which will be the same value unless the wdfs have
117 : : // been artificially inflated - in which case we probably want to use the
118 : : // inflated value anyway (it will have been inflated to represent the fact
119 : : // that this term is more important than others, so we should make use of
120 : : // this hint).
121 : : //
122 : : // A reasonable way to calculate an estimate would be to consider the
123 : : // document to be split into W (overlapping) windows, each 1 term apart,
124 : : // and of the same length as the windowsize used for the phrase. Then,
125 : : // calculate the probability that each term is found in this window (as
126 : : // Prob = wdf / doclen), multiply these probabilities to get the
127 : : // probability that they're all found in the window, and then multiply by
128 : : // the windowsize again to get an estimate.
129 : : //
130 : : // However, this requires the doclen, which won't always be available (ie,
131 : : // if the weighting scheme doesn't use it, we won't have read it from
132 : : // disk).
133 : : //
134 : : // Rather than force an access to disk to get the doclen, we use an even
135 : : // rougher estimate: the minimum wdf in the sub-lists. This is actually
136 : : // the upper bound for the wdf (since the phrase can only occur the same
137 : : // number of times as the least frequent of its constituent terms).
138 : : //
139 : : // If this estimate proves to give bad results, we can revisit this and try
140 : : // a better approximation.
141 : :
142 : 0 : std::vector<PostList *>::const_iterator i = terms.begin();
143 : 0 : Xapian::termcount wdf = (*i)->get_wdf();
144 [ # # ]: 0 : for (; i != terms.end(); i++) {
145 : 0 : wdf = std::min(wdf, (*i)->get_wdf());
146 : : }
147 : :
148 : : // Ensure that we always return a wdf of at least 1, since we know there
149 : : // was at least one occurrence of the phrase.
150 : 0 : return std::max(wdf, Xapian::termcount(1));
151 : : }
152 : :
153 : : TermFreqs
154 : 0 : NearPostList::get_termfreq_est_using_stats(
155 : : const Xapian::Weight::Internal & stats) const
156 : : {
157 : : LOGCALL(MATCH, TermFreqs, "NearPostList::get_termfreq_est_using_stats", stats);
158 : : // No idea how to estimate this - FIXME
159 : 0 : TermFreqs result(source->get_termfreq_est_using_stats(stats));
160 : 0 : result.termfreq /= 2;
161 : 0 : result.reltermfreq /= 2;
162 : 0 : RETURN(result);
163 : : }
164 : :
165 : : std::string
166 : 0 : NearPostList::get_description() const
167 : : {
168 : 0 : return "(Near " + str(window) + " " + source->get_description() + ")";
169 : : }
170 : :
171 : :
172 : :
173 : : /** Check if terms form a phrase in the current doc
174 : : */
175 : : bool
176 : 1001 : PhrasePostList::test_doc()
177 : : {
178 : : LOGCALL(MATCH, bool, "PhrasePostList::test_doc", NO_ARGS);
179 : 1001 : std::vector<PositionList *> plists;
180 : :
181 : 1001 : std::vector<PostList *>::const_iterator i;
182 [ + + ]: 3887 : for (i = terms.begin(); i != terms.end(); i++) {
183 : 2886 : PositionList * p = (*i)->read_position_list();
184 : : // If p is NULL, the backend doesn't support positionlists
185 [ - + ]: 2886 : if (!p) return false;
186 : 2886 : p->index = i - terms.begin();
187 : 2886 : plists.push_back(p);
188 : : }
189 : :
190 : 1001 : std::sort(plists.begin(), plists.end(), PositionListCmpLt());
191 : :
192 : : Xapian::termpos pos;
193 : : Xapian::termpos idx, min;
194 [ + + ]: 1001 : do {
195 : 1859 : plists[0]->next();
196 [ + + ]: 1859 : if (plists[0]->at_end()) {
197 : : LOGLINE(MATCH, "--MISS--");
198 : 858 : RETURN(false);
199 : : }
200 : 1001 : pos = plists[0]->get_position();
201 : 1001 : idx = plists[0]->index;
202 : 1001 : min = pos + plists.size() - idx;
203 [ + + ]: 1001 : if (min > window) min -= window; else min = 0;
204 : : } while (!do_test(plists, 1, min, pos + window - idx));
205 : : LOGLINE(MATCH, "**HIT**");
206 : 1001 : RETURN(true);
207 : : }
208 : :
209 : : bool
210 : 1469 : PhrasePostList::do_test(std::vector<PositionList *> &plists, Xapian::termcount i,
211 : : Xapian::termcount min, Xapian::termcount max)
212 : : {
213 : : LOGCALL(MATCH, bool, "PhrasePostList::do_test", plists | i | min | max);
214 : : LOGLINE(MATCH, "docid = " << get_docid() << ", window = " << window);
215 : 1469 : Xapian::termpos idxi = plists[i]->index;
216 : : LOGLINE(MATCH, "my idx in phrase is " << idxi);
217 : :
218 : 1469 : Xapian::termpos mymin = min + idxi;
219 : 1469 : Xapian::termpos mymax = max - plists.size() + idxi;
220 : : LOGLINE(MATCH, "MIN = " << mymin << " MAX = " << mymax);
221 : : // FIXME: this is worst case O(n^2) where n = length of phrase
222 : : // Can we do better?
223 [ + + ]: 3406 : for (Xapian::termcount j = 0; j < i; j++) {
224 : 1937 : Xapian::termpos idxj = plists[j]->index;
225 [ - + ]: 1937 : if (idxj > idxi) {
226 : 0 : Xapian::termpos tmp = plists[j]->get_position() + idxj - idxi;
227 : : LOGLINE(MATCH, "ABOVE " << tmp);
228 [ # # ]: 0 : if (tmp < mymax) mymax = tmp;
229 : : } else {
230 : : AssertRel(idxi, !=, idxj);
231 : 1937 : Xapian::termpos tmp = plists[j]->get_position() + idxi - idxj;
232 : : LOGLINE(MATCH, "BELOW " << tmp);
233 [ + + ]: 1937 : if (tmp > mymin) mymin = tmp;
234 : : }
235 : : LOGLINE(MATCH, "min = " << mymin << " max = " << mymax);
236 : : }
237 : 1469 : plists[i]->skip_to(mymin);
238 : :
239 [ + + ]: 1846 : while (!plists[i]->at_end()) {
240 : 858 : Xapian::termpos pos = plists[i]->get_position();
241 : : LOGLINE(MATCH, " " << mymin << " " << pos << " " << mymax);
242 [ + + ]: 858 : if (pos > mymax) RETURN(false);
243 [ + + ]: 611 : if (i + 1 == plists.size()) RETURN(true);
244 : 468 : Xapian::termpos tmp = pos + window - idxi;
245 [ - + ]: 468 : if (tmp < max) max = tmp;
246 : 468 : tmp = pos + plists.size() - idxi;
247 [ + + ]: 468 : if (tmp > window) {
248 : 247 : tmp -= window;
249 [ + + ]: 247 : if (tmp > min) min = tmp;
250 : : }
251 [ + + ]: 468 : if (do_test(plists, i + 1, min, max)) RETURN(true);
252 : 377 : plists[i]->next();
253 : : }
254 : 1469 : RETURN(false);
255 : : }
256 : :
257 : : Xapian::termcount
258 : 0 : PhrasePostList::get_wdf() const
259 : : {
260 : : // Calculate an estimate for the wdf of a phrase postlist.
261 : : //
262 : : // We use the minimum wdf of a sub-postlist as our estimate. See the
263 : : // comment in NearPostList::get_wdf for justification of this estimate.
264 : : //
265 : : // We divide the value calculated for a NearPostList by 2, as a very rough
266 : : // heuristic to represent the fact that the words must occur in order, and
267 : : // phrases are therefore rarer than near matches.
268 : :
269 : 0 : std::vector<PostList *>::const_iterator i = terms.begin();
270 : 0 : Xapian::termcount wdf = (*i)->get_wdf();
271 [ # # ]: 0 : for (; i != terms.end(); i++) {
272 : 0 : wdf = std::min(wdf, (*i)->get_wdf());
273 : : }
274 : :
275 : : // Ensure that we always return a wdf of at least 1, since we know there
276 : : // was at least one occurrence of the phrase.
277 : 0 : return std::max(wdf / 2, Xapian::termcount(1));
278 : : }
279 : :
280 : : TermFreqs
281 : 0 : PhrasePostList::get_termfreq_est_using_stats(
282 : : const Xapian::Weight::Internal & stats) const
283 : : {
284 : : LOGCALL(MATCH, TermFreqs, "PhrasePostList::get_termfreq_est_using_stats", stats);
285 : : // No idea how to estimate this - FIXME
286 : 0 : TermFreqs result(source->get_termfreq_est_using_stats(stats));
287 : 0 : result.termfreq /= 3;
288 : 0 : result.reltermfreq /= 3;
289 : 0 : RETURN(result);
290 : : }
291 : :
292 : : std::string
293 : 0 : PhrasePostList::get_description() const
294 : : {
295 : : return "(Phrase " + str(window) + " "
296 : 0 : + source->get_description() + ")";
297 : : }
|