Branch data Line data Source code
1 : : /** @file queryoptimiser.cc
2 : : * @brief Convert a Xapian::Query::Internal tree into an optimal PostList tree.
3 : : */
4 : : /* Copyright (C) 2007,2008,2009,2010 Olly Betts
5 : : * Copyright (C) 2008,2009 Lemur Consulting Ltd
6 : : *
7 : : * This program is free software; you can redistribute it and/or
8 : : * modify it under the terms of the GNU General Public License as
9 : : * published by the Free Software Foundation; either version 2 of the
10 : : * License, or (at your option) any later version.
11 : : *
12 : : * This program is distributed in the hope that it will be useful,
13 : : * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 : : * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 : : * GNU General Public License for more details.
16 : : *
17 : : * You should have received a copy of the GNU General Public License
18 : : * along with this program; if not, write to the Free Software
19 : : * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20 : : */
21 : :
22 : : #include <config.h>
23 : :
24 : : #include "queryoptimiser.h"
25 : :
26 : : #include "andmaybepostlist.h"
27 : : #include "andnotpostlist.h"
28 : : #include "const_database_wrapper.h"
29 : : #include "debuglog.h"
30 : : #include "emptypostlist.h"
31 : : #include "exactphrasepostlist.h"
32 : : #include "externalpostlist.h"
33 : : #include "multiandpostlist.h"
34 : : #include "multimatch.h"
35 : : #include "multixorpostlist.h"
36 : : #include "omassert.h"
37 : : #include "omqueryinternal.h"
38 : : #include "orpostlist.h"
39 : : #include "phrasepostlist.h"
40 : : #include "postlist.h"
41 : : #include "valuegepostlist.h"
42 : : #include "valuerangepostlist.h"
43 : :
44 : : #include <algorithm>
45 : : #include <list>
46 : : #include <string>
47 : : #include <vector>
48 : :
49 : : using namespace std;
50 : :
51 : : PostList *
52 : 718854 : QueryOptimiser::do_subquery(const Xapian::Query::Internal * query, double factor)
53 : : {
54 : : LOGCALL(MATCH, PostList *, "QueryOptimiser::do_subquery", query | factor);
55 : :
56 : : // Handle QueryMatchNothing.
57 [ - + ]: 718854 : if (!query) RETURN(new EmptyPostList);
58 : :
59 [ + + + + + : 718854 : switch (query->op) {
+ + + + +
+ - ]
60 : : case Xapian::Query::Internal::OP_LEAF:
61 [ + + ]: 476180 : if (factor != 0.0) {
62 : 473908 : ++total_subqs;
63 [ + + ]: 473908 : if (query->tname.empty()) factor = 0.0;
64 : : }
65 : 476180 : RETURN(localsubmatch.postlist_from_op_leaf_query(query, factor));
66 : :
67 : : case Xapian::Query::Internal::OP_EXTERNAL_SOURCE: {
68 [ + + ]: 754 : if (factor != 0.0)
69 : 714 : ++total_subqs;
70 : : Assert(query->external_source);
71 : 754 : Xapian::Database wrappeddb(new ConstDatabaseWrapper(&db));
72 : 754 : RETURN(new ExternalPostList(wrappeddb,
73 : : query->external_source, factor,
74 : 754 : matcher));
75 : : }
76 : :
77 : : case Xapian::Query::OP_AND:
78 : : case Xapian::Query::OP_FILTER:
79 : : case Xapian::Query::OP_NEAR:
80 : : case Xapian::Query::OP_PHRASE:
81 : 1545 : RETURN(do_and_like(query, factor));
82 : :
83 : : case Xapian::Query::OP_OR:
84 : : case Xapian::Query::OP_XOR:
85 : : case Xapian::Query::OP_ELITE_SET:
86 : 232426 : RETURN(do_or_like(query, factor));
87 : :
88 : : case Xapian::Query::OP_SYNONYM: {
89 : : // Save and restore total_subqs so we only add one for the whole
90 : : // OP_SYNONYM subquery (or none if we're not weighted).
91 : 640 : Xapian::termcount save_total_subqs = total_subqs;
92 : 640 : PostList * pl = do_synonym(query, factor);
93 : 640 : total_subqs = save_total_subqs;
94 [ + + ]: 640 : if (factor != 0.0)
95 : 592 : ++total_subqs;
96 : 640 : RETURN(pl);
97 : : }
98 : :
99 : : case Xapian::Query::OP_AND_NOT: {
100 : : AssertEq(query->subqs.size(), 2);
101 : 128 : PostList * l = do_subquery(query->subqs[0], factor);
102 : 128 : PostList * r = do_subquery(query->subqs[1], 0.0);
103 : 128 : RETURN(new AndNotPostList(l, r, matcher, db_size));
104 : : }
105 : :
106 : : case Xapian::Query::OP_AND_MAYBE: {
107 : : AssertEq(query->subqs.size(), 2);
108 : 100 : PostList * l = do_subquery(query->subqs[0], factor);
109 : 100 : PostList * r = do_subquery(query->subqs[1], factor);
110 : 100 : RETURN(new AndMaybePostList(l, r, matcher, db_size));
111 : : }
112 : :
113 : : case Xapian::Query::OP_VALUE_RANGE: {
114 [ + - ]: 6067 : if (factor != 0.0)
115 : 6067 : ++total_subqs;
116 : 6067 : Xapian::valueno valno(query->parameter);
117 : 6067 : const string & range_begin = query->tname;
118 : 6067 : const string & range_end = query->str_parameter;
119 : 6067 : const string & lb = db.get_value_lower_bound(valno);
120 [ + + + + ]: 6067 : if (!lb.empty() &&
[ + + ][ + + ]
[ # # ][ + + ]
121 : : (range_end < lb ||
122 : 5177 : range_begin > db.get_value_upper_bound(valno))) {
123 : 279 : RETURN(new EmptyPostList);
124 : : }
125 : 6067 : RETURN(new ValueRangePostList(&db, valno, range_begin, range_end));
126 : : }
127 : :
128 : : case Xapian::Query::OP_VALUE_GE: {
129 [ + - ]: 208 : if (factor != 0.0)
130 : 208 : ++total_subqs;
131 : 208 : Xapian::valueno valno(query->parameter);
132 : 208 : const string & range_begin = query->tname;
133 : 208 : const string & lb = db.get_value_lower_bound(valno);
134 [ + + + + ]: 208 : if (!lb.empty() &&
[ + + ][ # # ]
[ + + ]
135 : 143 : range_begin > db.get_value_upper_bound(valno)) {
136 : 11 : RETURN(new EmptyPostList);
137 : : }
138 : 208 : RETURN(new ValueGePostList(&db, valno, range_begin));
139 : : }
140 : :
141 : : case Xapian::Query::OP_VALUE_LE: {
142 [ + - ]: 202 : if (factor != 0.0)
143 : 202 : ++total_subqs;
144 : 202 : Xapian::valueno valno(query->parameter);
145 : 202 : const string & range_end = query->tname;
146 [ + + ]: 202 : if (range_end < db.get_value_lower_bound(valno)) {
147 : 13 : RETURN(new EmptyPostList);
148 : : }
149 : 189 : RETURN(new ValueRangePostList(&db, valno, "", range_end));
150 : : }
151 : :
152 : : case Xapian::Query::OP_SCALE_WEIGHT: {
153 : : AssertEq(query->subqs.size(), 1);
154 : 604 : double sub_factor = factor;
155 [ + - ]: 604 : if (sub_factor != 0.0) sub_factor *= query->get_dbl_parameter();
156 : 604 : RETURN(do_subquery(query->subqs[0], sub_factor));
157 : : }
158 : :
159 : : default:
160 : : Assert(false);
161 : 718854 : RETURN(NULL);
162 : : }
163 : : }
164 : :
165 : 1024 : struct PosFilter {
166 : 1024 : PosFilter(Xapian::Query::Internal::op_t op_, size_t begin_, size_t end_,
167 : : Xapian::termcount window_)
168 : 1024 : : op(op_), begin(begin_), end(end_), window(window_) { }
169 : :
170 : : Xapian::Query::Internal::op_t op;
171 : :
172 : : /// Start and end indices for the PostLists this positional filter uses.
173 : : size_t begin, end;
174 : :
175 : : Xapian::termcount window;
176 : : };
177 : :
178 : : PostList *
179 : 1545 : QueryOptimiser::do_and_like(const Xapian::Query::Internal *query, double factor)
180 : : {
181 : : LOGCALL(MATCH, PostList *, "QueryOptimiser::do_and_like", query | factor);
182 : :
183 : 1545 : list<PosFilter> pos_filters;
184 : 1545 : vector<PostList *> plists;
185 : 1545 : do_and_like(query, factor, plists, pos_filters);
186 : : AssertRel(plists.size(), >=, 2);
187 : :
188 : : PostList * pl;
189 : 1545 : pl = new MultiAndPostList(plists.begin(), plists.end(), matcher, db_size);
190 : :
191 : : // Sort the positional filters to try to apply them in an efficient order.
192 : : // FIXME: We need to figure out what that is! Try applying lowest cf/tf
193 : : // first?
194 : :
195 : : // Apply any positional filters.
196 : 1545 : list<PosFilter>::iterator i;
197 [ + + ]: 2569 : for (i = pos_filters.begin(); i != pos_filters.end(); ++i) {
198 : 1024 : const PosFilter & filter = *i;
199 : :
200 : 1024 : vector<PostList *>::const_iterator terms_begin = plists.begin() + filter.begin;
201 : 1024 : vector<PostList *>::const_iterator terms_end = plists.begin() + filter.end;
202 : :
203 : 1024 : Xapian::termcount window = filter.window;
204 [ + + ]: 1024 : if (filter.op == Xapian::Query::OP_NEAR) {
205 : 288 : pl = new NearPostList(pl, window, terms_begin, terms_end);
206 [ + + ]: 736 : } else if (window == filter.end - filter.begin) {
207 : : AssertEq(filter.op, Xapian::Query::OP_PHRASE);
208 : 560 : pl = new ExactPhrasePostList(pl, terms_begin, terms_end);
209 : : } else {
210 : : AssertEq(filter.op, Xapian::Query::OP_PHRASE);
211 : 176 : pl = new PhrasePostList(pl, window, terms_begin, terms_end);
212 : : }
213 : : }
214 : :
215 : 1545 : RETURN(pl);
216 : : }
217 : :
218 : 4038 : inline bool is_and_like(Xapian::Query::Internal::op_t op) {
219 : : return op == Xapian::Query::OP_AND || op == Xapian::Query::OP_FILTER ||
220 [ + - ][ + - ]: 4038 : op == Xapian::Query::OP_NEAR || op == Xapian::Query::OP_PHRASE;
[ + + ][ + + ]
221 : : }
222 : :
223 : : void
224 : 1737 : QueryOptimiser::do_and_like(const Xapian::Query::Internal *query, double factor,
225 : : vector<PostList *> & and_plists,
226 : : list<PosFilter> & pos_filters)
227 : : {
228 : : LOGCALL_VOID(MATCH, "QueryOptimiser::do_and_like", query | factor | and_plists | pos_filters);
229 : :
230 : 1737 : Xapian::Query::Internal::op_t op = query->op;
231 : : Assert(is_and_like(op));
232 : :
233 : 1737 : bool positional = false;
234 [ + + ][ + + ]: 1737 : if (op == Xapian::Query::OP_PHRASE || op == Xapian::Query::OP_NEAR) {
235 : : // If this sub-database has no positional information, change
236 : : // OP_PHRASE/OP_NEAR into OP_AND so that we actually return some
237 : : // matches.
238 [ + + ]: 1064 : if (!db.has_positions()) {
239 : 40 : op = Xapian::Query::OP_AND;
240 : : } else {
241 : 1024 : positional = true;
242 : : }
243 : : }
244 : :
245 : 1737 : const Xapian::Query::Internal::subquery_list &queries = query->subqs;
246 : : AssertRel(queries.size(), >=, 2);
247 : :
248 [ + + ]: 5775 : for (size_t i = 0; i != queries.size(); ++i) {
249 : : // The second branch of OP_FILTER is always boolean.
250 [ + + ][ + + ]: 4038 : if (i == 1 && op == Xapian::Query::OP_FILTER) factor = 0.0;
251 : :
252 : 4038 : const Xapian::Query::Internal * subq = queries[i];
253 [ + + ]: 4038 : if (is_and_like(subq->op)) {
254 : 192 : do_and_like(subq, factor, and_plists, pos_filters);
255 : : } else {
256 : 3846 : PostList * pl = do_subquery(subq, factor);
257 : 3846 : and_plists.push_back(pl);
258 : : }
259 : : }
260 : :
261 [ + + ]: 1737 : if (positional) {
262 : : // Record the positional filter to apply higher up the tree.
263 : 1024 : size_t end = and_plists.size();
264 : 1024 : size_t begin = end - queries.size();
265 : 1024 : Xapian::termcount window = query->parameter;
266 : :
267 : 1024 : pos_filters.push_back(PosFilter(op, begin, end, window));
268 : : }
269 : 1737 : }
270 : :
271 : : /** Class providing an operator which sorts postlists to select max or terms.
272 : : * This returns true if a has a (strictly) greater termweight than b,
273 : : * unless a or b contain no documents, in which case the other one is
274 : : * selected.
275 : : */
276 : : struct CmpMaxOrTerms {
277 : : /** Return true if and only if a has a strictly greater termweight than b;
278 : : * with the proviso that if the termfrequency of a or b is 0, then the
279 : : * termweight is considered to be 0.
280 : : *
281 : : * We use termfreq_max() because we really don't want to exclude a
282 : : * postlist which has a low but non-zero termfrequency: the estimate
283 : : * is quite likely to be zero in this case.
284 : : */
285 : 790 : bool operator()(const PostList *a, const PostList *b) {
286 [ + + ]: 790 : if (a->get_termfreq_max() == 0) return false;
287 [ - + ]: 770 : if (b->get_termfreq_max() == 0) return true;
288 : :
289 : : #if (defined(__i386__) && !defined(__SSE2_MATH__)) || defined(__mc68000__) || defined(__mc68010__) || defined(__mc68020__) || defined(__mc68030__)
290 : : // On some architectures, most common of which is x86, floating point
291 : : // values are calculated and stored in registers with excess precision.
292 : : // If the two get_maxweight() calls below return identical values in a
293 : : // register, the excess precision may be dropped for one of them but
294 : : // not the other (e.g. because the compiler saves the first calculated
295 : : // weight to memory while calculating the second, then reloads it to
296 : : // compare). This leads to both a > b and b > a being true, which
297 : : // violates the antisymmetry property of the strict weak ordering
298 : : // required by nth_element(). This can have serious consequences (e.g.
299 : : // segfaults).
300 : : //
301 : : // Note that m68k only has excess precision in earlier models - 68040
302 : : // and later are OK:
303 : : // http://gcc.gnu.org/ml/gcc-patches/2008-11/msg00105.html
304 : : //
305 : : // To avoid this, we store each result in a volatile double prior to
306 : : // comparing them. This means that the result of this test should
307 : : // match that on other architectures with the same double format (which
308 : : // is desirable), and actually has less overhead than rounding both
309 : : // results to float (which is another approach which works).
310 : : volatile double a_max_wt = a->get_maxweight();
311 : : volatile double b_max_wt = b->get_maxweight();
312 : : return a_max_wt > b_max_wt;
313 : : #else
314 : 790 : return a->get_maxweight() > b->get_maxweight();
315 : : #endif
316 : : }
317 : : };
318 : :
319 : : template<class CLASS> struct delete_ptr {
320 [ + - ]: 270 : void operator()(CLASS *p) { delete p; }
321 : : };
322 : :
323 : : /// Comparison functor which orders PostList* by descending get_termfreq_est().
324 : : struct ComparePostListTermFreqAscending {
325 : : /// Order by descending get_termfreq_est().
326 : 237579 : bool operator()(const PostList *a, const PostList *b) {
327 : 237579 : return a->get_termfreq_est() > b->get_termfreq_est();
328 : : }
329 : : };
330 : :
331 : : PostList *
332 : 233066 : QueryOptimiser::do_or_like(const Xapian::Query::Internal *query, double factor)
333 : : {
334 : : LOGCALL(MATCH, PostList *, "QueryOptimiser::do_or_like", query | factor);
335 : :
336 : : // FIXME: we could optimise by merging OP_ELITE_SET and OP_OR like we do
337 : : // for AND-like operations.
338 : 233066 : Xapian::Query::Internal::op_t op = query->op;
339 : : Assert(op == Xapian::Query::OP_ELITE_SET || op == Xapian::Query::OP_OR ||
340 : : op == Xapian::Query::OP_XOR || op == Xapian::Query::OP_SYNONYM);
341 : :
342 : 233066 : const Xapian::Query::Internal::subquery_list &queries = query->subqs;
343 : : AssertRel(queries.size(), >=, 2);
344 : :
345 : 233066 : vector<PostList *> postlists;
346 : 233066 : postlists.reserve(queries.size());
347 : :
348 : 233066 : Xapian::Query::Internal::subquery_list::const_iterator q;
349 [ + + ]: 700662 : for (q = queries.begin(); q != queries.end(); ++q) {
350 : 467596 : postlists.push_back(do_subquery(*q, factor));
351 : : }
352 : :
353 [ + + ]: 233066 : if (op == Xapian::Query::OP_XOR) {
354 : 134 : RETURN(new MultiXorPostList(postlists.begin(), postlists.end(),
355 : : matcher, db_size));
356 : : }
357 : :
358 [ + + ]: 232932 : if (op == Xapian::Query::OP_ELITE_SET) {
359 : : // Select the best elite_set_size terms.
360 : 56 : Xapian::termcount elite_set_size = query->parameter;
361 : : Assert(elite_set_size > 0);
362 : :
363 [ + + ]: 56 : if (postlists.size() > elite_set_size) {
364 : : // Call recalc_maxweight() as otherwise get_maxweight()
365 : : // may not be valid before next() or skip_to()
366 : : for_each(postlists.begin(), postlists.end(),
367 : 40 : mem_fun(&PostList::recalc_maxweight));
368 : :
369 : : nth_element(postlists.begin(),
370 : : postlists.begin() + elite_set_size - 1,
371 : 40 : postlists.end(), CmpMaxOrTerms());
372 : :
373 : : for_each(postlists.begin() + elite_set_size, postlists.end(),
374 : 40 : delete_ptr<PostList>());
375 : :
376 [ + - ]: 40 : if (elite_set_size == 1) RETURN(postlists[0]);
377 : :
378 : 0 : postlists.resize(elite_set_size);
379 : : }
380 : : }
381 : :
382 : : // Make postlists into a heap so that the postlist with the greatest term
383 : : // frequency is at the top of the heap.
384 : : make_heap(postlists.begin(), postlists.end(),
385 : 232892 : ComparePostListTermFreqAscending());
386 : :
387 : : // Now build a tree of binary OrPostList objects.
388 : : //
389 : : // The algorithm used to build the tree is like that used to build an
390 : : // optimal Huffman coding tree. If we called next() repeatedly, this
391 : : // arrangement would minimise the number of method calls. Generally we
392 : : // don't actually do that, but this arrangement is still likely to be a
393 : : // good one, and it does minimise the work in the worst case.
394 : : AssertRel(postlists.size(), >=, 2);
395 : 1202 : while (true) {
396 : : // We build the tree such that at each branch:
397 : : //
398 : : // l.get_termfreq_est() >= r.get_termfreq_est()
399 : : //
400 : : // We do this so that the OrPostList class can be optimised assuming
401 : : // that this is the case.
402 : 234094 : PostList * r = postlists.front();
403 : : pop_heap(postlists.begin(), postlists.end(),
404 : 234094 : ComparePostListTermFreqAscending());
405 : 234094 : postlists.pop_back();
406 : 234094 : PostList * pl = new OrPostList(postlists.front(), r, matcher, db_size);
407 : :
408 [ + + ]: 234094 : if (postlists.size() == 1) RETURN(pl);
409 : :
410 : : pop_heap(postlists.begin(), postlists.end(),
411 : 1202 : ComparePostListTermFreqAscending());
412 : 1202 : postlists.back() = pl;
413 : : push_heap(postlists.begin(), postlists.end(),
414 : 1202 : ComparePostListTermFreqAscending());
415 : 233066 : }
416 : : }
417 : :
418 : : PostList *
419 : 640 : QueryOptimiser::do_synonym(const Xapian::Query::Internal *query, double factor)
420 : : {
421 : : LOGCALL(MATCH, PostList *, "QueryOptimiser::do_synonym", query | factor);
422 [ + + ]: 640 : if (factor == 0.0) {
423 : : // If we have a factor of 0, we don't care about the weights, so
424 : : // we're just like a normal OR query.
425 : 48 : RETURN(do_or_like(query, 0.0));
426 : : }
427 : :
428 : : // We currently assume wqf is 1 for calculating the synonym's weight
429 : : // since conceptually the synonym is one "virtual" term. If we were
430 : : // to combine multiple occurrences of the same synonym expansion into
431 : : // a single instance with wqf set, we would want to use the wqf.
432 : : AssertEq(query->get_wqf(), 0);
433 : :
434 : : // We build an OP_OR tree for OP_SYNONYM and then wrap it in a
435 : : // SynonymPostList, which supplies the weights.
436 : 640 : RETURN(localsubmatch.make_synonym_postlist(do_or_like(query, 0.0),
437 : : matcher, factor));
438 : : }
|