Branch data Line data Source code
1 : : /* multimatch.cc
2 : : *
3 : : * Copyright 1999,2000,2001 BrightStation PLC
4 : : * Copyright 2001,2002 Ananova Ltd
5 : : * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010 Olly Betts
6 : : * Copyright 2003 Orange PCS Ltd
7 : : * Copyright 2003 Sam Liddicott
8 : : * Copyright 2007,2008,2009 Lemur Consulting Ltd
9 : : *
10 : : * This program is free software; you can redistribute it and/or
11 : : * modify it under the terms of the GNU General Public License as
12 : : * published by the Free Software Foundation; either version 2 of the
13 : : * License, or (at your option) any later version.
14 : : *
15 : : * This program is distributed in the hope that it will be useful,
16 : : * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 : : * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 : : * GNU General Public License for more details.
19 : : *
20 : : * You should have received a copy of the GNU General Public License
21 : : * along with this program; if not, write to the Free Software
22 : : * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
23 : : * USA
24 : : */
25 : :
26 : : #include <config.h>
27 : :
28 : : #include "multimatch.h"
29 : :
30 : : #include "autoptr.h"
31 : : #include "collapser.h"
32 : : #include "debuglog.h"
33 : : #include "submatch.h"
34 : : #include "localsubmatch.h"
35 : : #include "omassert.h"
36 : : #include "omenquireinternal.h"
37 : :
38 : : #include "emptypostlist.h"
39 : : #include "branchpostlist.h"
40 : : #include "mergepostlist.h"
41 : :
42 : : #include "document.h"
43 : : #include "omqueryinternal.h"
44 : :
45 : : #include "submatch.h"
46 : :
47 : : #include "msetcmp.h"
48 : :
49 : : #include "valuestreamdocument.h"
50 : : #include "weightinternal.h"
51 : :
52 : : #include <xapian/errorhandler.h>
53 : : #include <xapian/matchspy.h>
54 : : #include <xapian/version.h> // For XAPIAN_HAS_REMOTE_BACKEND
55 : :
56 : : #ifdef XAPIAN_HAS_REMOTE_BACKEND
57 : : #include "remotesubmatch.h"
58 : : #include "remote-database.h"
59 : : #endif /* XAPIAN_HAS_REMOTE_BACKEND */
60 : :
61 : : #include <algorithm>
62 : : #include <cfloat> // For DBL_EPSILON.
63 : : #include <climits> // For UINT_MAX.
64 : : #include <vector>
65 : : #include <map>
66 : : #include <set>
67 : :
68 : : using namespace std;
69 : :
70 : : const Xapian::Enquire::Internal::sort_setting REL =
71 : : Xapian::Enquire::Internal::REL;
72 : : const Xapian::Enquire::Internal::sort_setting REL_VAL =
73 : : Xapian::Enquire::Internal::REL_VAL;
74 : : const Xapian::Enquire::Internal::sort_setting VAL =
75 : : Xapian::Enquire::Internal::VAL;
76 : : #if 0 // VAL_REL isn't currently used which causes a warning with SGI CC.
77 : : const Xapian::Enquire::Internal::sort_setting VAL_REL =
78 : : Xapian::Enquire::Internal::VAL_REL;
79 : : #endif
80 : :
81 : : /** Split an RSet into several sub rsets, one for each database.
82 : : *
83 : : * @param rset The RSet to split.
84 : : * @param number_of_subdbs The number of sub databases which exist.
85 : : * @param subrsets Vector of RSets which will the sub rsets will be placed in.
86 : : * This should be empty when the function is called.
87 : : */
88 : : static void
89 : 179504 : split_rset_by_db(const Xapian::RSet * rset,
90 : : Xapian::doccount number_of_subdbs,
91 : : vector<Xapian::RSet> & subrsets)
92 : : {
93 : : LOGCALL_STATIC_VOID(MATCH, "split_rset_by_db", rset | number_of_subdbs | subrsets);
94 [ + + ]: 179504 : if (rset) {
95 [ + + ]: 4494 : if (number_of_subdbs == 1) {
96 : : // The common case of a single database is easy to handle.
97 : 4438 : subrsets.push_back(*rset);
98 : : } else {
99 : : // Can't just use vector::resize() here, since that creates N
100 : : // copies of the same RSet!
101 : 56 : subrsets.reserve(number_of_subdbs);
102 [ + + ]: 168 : for (size_t i = 0; i < number_of_subdbs; ++i) {
103 : 112 : subrsets.push_back(Xapian::RSet());
104 : : }
105 : :
106 : 56 : const set<Xapian::docid> & rsetitems = rset->internal->get_items();
107 : 56 : set<Xapian::docid>::const_iterator j;
108 [ + + ]: 130 : for (j = rsetitems.begin(); j != rsetitems.end(); ++j) {
109 : 74 : Xapian::doccount local_docid = (*j - 1) / number_of_subdbs + 1;
110 : 74 : Xapian::doccount subdatabase = (*j - 1) % number_of_subdbs;
111 : 74 : subrsets[subdatabase].add_document(local_docid);
112 : : }
113 : : }
114 : : } else {
115 : : // NB vector::resize() creates N copies of the same empty RSet.
116 : 175010 : subrsets.resize(number_of_subdbs);
117 : : }
118 : : Assert(subrsets.size() == number_of_subdbs);
119 : 179504 : }
120 : :
121 : : /** Prepare some SubMatches.
122 : : *
123 : : * This calls the prepare_match() method on each SubMatch object, causing them
124 : : * to lookup various statistics.
125 : : *
126 : : * This method is rather complicated in order to handle remote matches
127 : : * efficiently. Instead of simply calling "prepare_match()" on each submatch
128 : : * and waiting for it to return, it first calls "prepare_match(true)" on each
129 : : * submatch. If any of these calls return false, indicating that the required
130 : : * information has not yet been received from the server, the method will try
131 : : * those which returned false again, passing "false" as a parameter to
132 : : * indicate that they should block until the information is ready.
133 : : *
134 : : * This should improve performance in the case of mixed local-and-remote
135 : : * searches - the local searchers will all fetch their statistics from disk
136 : : * without waiting for the remote searchers, so as soon as the remote searcher
137 : : * statistics arrive, we can move on to the next step.
138 : : */
139 : : static void
140 : 179462 : prepare_sub_matches(vector<Xapian::Internal::RefCntPtr<SubMatch> > & leaves,
141 : : Xapian::ErrorHandler * errorhandler,
142 : : Xapian::Weight::Internal & stats)
143 : : {
144 : : LOGCALL_STATIC_VOID(MATCH, "prepare_sub_matches", leaves | errorhandler | stats);
145 : : // We use a vector<bool> to track which SubMatches we're already prepared.
146 : 179462 : vector<bool> prepared;
147 : 179462 : prepared.resize(leaves.size(), false);
148 : 179462 : size_t unprepared = leaves.size();
149 : 179462 : bool nowait = true;
150 [ + + ]: 358924 : while (unprepared) {
151 [ + + ]: 430225 : for (size_t leaf = 0; leaf < leaves.size(); ++leaf) {
152 [ + + ]: 250763 : if (prepared[leaf]) continue;
153 : : try {
154 : 250762 : SubMatch * submatch = leaves[leaf].get();
155 [ + - + + ]: 250762 : if (!submatch || submatch->prepare_match(nowait, stats)) {
[ + + ]
156 : 250756 : prepared[leaf] = true;
157 : 250756 : --unprepared;
158 : : }
159 : 6 : } catch (Xapian::Error & e) {
160 [ + - ]: 3 : if (!errorhandler) throw;
161 : :
162 : : LOGLINE(EXCEPTION, "Calling error handler for prepare_match() on a SubMatch.");
163 : 0 : (*errorhandler)(e);
164 : : // Continue match without this sub-match.
165 : 0 : leaves[leaf] = NULL;
166 : 0 : prepared[leaf] = true;
167 : 0 : --unprepared;
168 : : }
169 : : }
170 : : // Use blocking IO on subsequent passes, so that we don't go into
171 : : // a tight loop.
172 : 179462 : nowait = false;
173 : 179462 : }
174 : 179459 : }
175 : :
176 : : /// Class which applies several match spies in turn.
177 [ # # ][ - + ]: 175211 : class MultipleMatchSpy : public Xapian::MatchSpy {
178 : : private:
179 : : /// List of match spies to call, in order.
180 : : const std::vector<Xapian::MatchSpy *> & spies;
181 : :
182 : : public:
183 : 175211 : MultipleMatchSpy(const std::vector<Xapian::MatchSpy *> & spies_)
184 : 175211 : : spies(spies_) {}
185 : :
186 : : /** Implementation of virtual operator().
187 : : *
188 : : * This implementation calls all the spies in turn.
189 : : */
190 : : void operator()(const Xapian::Document &doc, Xapian::weight wt);
191 : : };
192 : :
193 : : void
194 : 828 : MultipleMatchSpy::operator()(const Xapian::Document &doc, Xapian::weight wt) {
195 : : LOGCALL_VOID(MATCH, "MultipleMatchSpy::operator()", doc | wt);
196 : 828 : vector<Xapian::MatchSpy *>::const_iterator i;
197 [ + + ]: 3234 : for (i = spies.begin(); i != spies.end(); ++i) {
198 : 2406 : (**i)(doc, wt);
199 : : }
200 : 828 : }
201 : :
202 : : ////////////////////////////////////
203 : : // Initialisation and cleaning up //
204 : : ////////////////////////////////////
205 : 179569 : MultiMatch::MultiMatch(const Xapian::Database &db_,
206 : : const Xapian::Query::Internal * query_,
207 : : Xapian::termcount qlen,
208 : : const Xapian::RSet * omrset,
209 : : Xapian::doccount collapse_max_,
210 : : Xapian::valueno collapse_key_,
211 : : int percent_cutoff_, Xapian::weight weight_cutoff_,
212 : : Xapian::Enquire::docid_order order_,
213 : : Xapian::valueno sort_key_,
214 : : Xapian::Enquire::Internal::sort_setting sort_by_,
215 : : bool sort_value_forward_,
216 : : Xapian::ErrorHandler * errorhandler_,
217 : : Xapian::Weight::Internal & stats,
218 : : const Xapian::Weight * weight_,
219 : : const vector<Xapian::MatchSpy *> & matchspies_,
220 : : bool have_sorter, bool have_mdecider)
221 : : : db(db_), query(query_),
222 : : collapse_max(collapse_max_), collapse_key(collapse_key_),
223 : : percent_cutoff(percent_cutoff_), weight_cutoff(weight_cutoff_),
224 : : order(order_),
225 : : sort_key(sort_key_), sort_by(sort_by_),
226 : : sort_value_forward(sort_value_forward_),
227 : : errorhandler(errorhandler_), weight(weight_),
228 : : is_remote(db.internal.size()),
229 : 179569 : matchspies(matchspies_)
230 : : {
231 : : LOGCALL_CTOR(MATCH, "MultiMatch", db_ | query_ | qlen | omrset | collapse_max_ | collapse_key_ | percent_cutoff_ | weight_cutoff_ | int(order_) | sort_key_ | int(sort_by_) | sort_value_forward_ | errorhandler_ | stats | weight_ | matchspies_ | have_sorter | have_mdecider);
232 : :
233 [ + + # # ]: 179569 : if (!query) return;
234 : 179504 : query->validate_query();
235 : :
236 : 179504 : Xapian::doccount number_of_subdbs = db.internal.size();
237 : 179504 : vector<Xapian::RSet> subrsets;
238 : 179504 : split_rset_by_db(omrset, number_of_subdbs, subrsets);
239 : :
240 [ + + ][ # # ]: 430305 : for (size_t i = 0; i != number_of_subdbs; ++i) {
241 : 250801 : Xapian::Database::Internal *subdb = db.internal[i].get();
242 : : Assert(subdb);
243 : 250801 : Xapian::Internal::RefCntPtr<SubMatch> smatch;
244 : : try {
245 : : // There is currently only one special case, for network databases.
246 : : #ifdef XAPIAN_HAS_REMOTE_BACKEND
247 : 250801 : RemoteDatabase *rem_db = subdb->as_remotedatabase();
248 [ + + # # ]: 250801 : if (rem_db) {
249 [ + + ][ # # ]: 4446 : if (have_sorter) {
250 : 6 : throw Xapian::UnimplementedError("Xapian::KeyMaker not supported for the remote backend");
251 : : }
252 [ + + ][ # # ]: 4440 : if (have_mdecider) {
253 : 18 : throw Xapian::UnimplementedError("Xapian::MatchDecider not supported for the remote backend");
254 : : }
255 : : rem_db->set_query(query, qlen, collapse_max, collapse_key,
256 : : order, sort_key, sort_by, sort_value_forward,
257 : : percent_cutoff, weight_cutoff, weight,
258 : 4422 : subrsets[i], matchspies);
259 : : bool decreasing_relevance =
260 [ + + + + ]: 4404 : (sort_by == REL || sort_by == REL_VAL);
[ # # ][ # # ]
261 : 4404 : smatch = new RemoteSubMatch(rem_db, decreasing_relevance, matchspies);
262 : 4404 : is_remote[i] = true;
263 : : } else {
264 : 246355 : smatch = new LocalSubMatch(subdb, query, qlen, subrsets[i], weight);
265 : : }
266 : : #else
267 : : // Avoid unused parameter warnings.
268 : : (void)have_sorter;
269 : : (void)have_mdecider;
270 : : smatch = new LocalSubMatch(subdb, query, qlen, subrsets[i], weight);
271 : : #endif /* XAPIAN_HAS_REMOTE_BACKEND */
272 : 84 : } catch (Xapian::Error & e) {
273 [ + - ][ # # ]: 42 : if (!errorhandler) throw;
274 : : LOGLINE(EXCEPTION, "Calling error handler for creation of a SubMatch from a database and query.");
275 : 0 : (*errorhandler)(e);
276 : : // Continue match without this sub-postlist.
277 : 0 : smatch = NULL;
278 : : }
279 : 250759 : leaves.push_back(smatch);
280 : : }
281 : :
282 : 179462 : stats.mark_wanted_terms(*query);
283 : 179462 : prepare_sub_matches(leaves, errorhandler, stats);
284 : 179504 : stats.set_bounds_from_db(db);
285 : 179659 : }
286 : :
287 : : Xapian::weight
288 : 24559265 : MultiMatch::getorrecalc_maxweight(PostList *pl)
289 : : {
290 : : LOGCALL(MATCH, Xapian::weight, "MultiMatch::getorrecalc_maxweight", pl);
291 : : Xapian::weight wt;
292 [ + + ]: 24559265 : if (recalculate_w_max) {
293 : : LOGLINE(MATCH, "recalculating max weight");
294 : 223770 : wt = pl->recalc_maxweight();
295 : 223770 : recalculate_w_max = false;
296 : : } else {
297 : 24335495 : wt = pl->get_maxweight();
298 : : LOGLINE(MATCH, "pl = (" << pl->get_description() << ")");
299 : : AssertEqDoubleParanoid(wt, pl->recalc_maxweight());
300 : : }
301 : : LOGLINE(MATCH, "max possible doc weight = " << wt);
302 : 24559265 : RETURN(wt);
303 : : }
304 : :
305 : : void
306 : 179524 : MultiMatch::get_mset(Xapian::doccount first, Xapian::doccount maxitems,
307 : : Xapian::doccount check_at_least,
308 : : Xapian::MSet & mset,
309 : : const Xapian::Weight::Internal & stats,
310 : : const Xapian::MatchDecider *mdecider,
311 : : const Xapian::MatchDecider *matchspy_legacy,
312 : : const Xapian::KeyMaker *sorter)
313 : : {
314 : : LOGCALL_VOID(MATCH, "MultiMatch::get_mset", first | maxitems | check_at_least | Literal("mset") | stats | Literal("mdecider") | Literal("matchspy_legacy") | Literal("sorter"));
315 : : AssertRel(check_at_least,>=,maxitems);
316 : :
317 [ + + ]: 179524 : if (!query) {
318 : 65 : mset = Xapian::MSet(new Xapian::MSet::Internal());
319 : 65 : mset.internal->firstitem = first;
320 : 65 : return;
321 : : }
322 : :
323 : : Assert(!leaves.empty());
324 : :
325 : : #ifdef XAPIAN_HAS_REMOTE_BACKEND
326 : : // If there's only one database and it's remote, we can just unserialise
327 : : // its MSet and return that.
328 [ + + ][ + + ]: 179459 : if (leaves.size() == 1 && is_remote[0]) {
[ + + ]
329 : : RemoteSubMatch * rem_match;
330 : 4248 : rem_match = static_cast<RemoteSubMatch*>(leaves[0].get());
331 : 4248 : rem_match->start_match(first, maxitems, check_at_least, stats);
332 : 4248 : rem_match->get_mset(mset);
333 : 4248 : return;
334 : : }
335 : : #endif
336 : :
337 : : // Start matchers.
338 : : {
339 : 175211 : vector<Xapian::Internal::RefCntPtr<SubMatch> >::iterator leaf;
340 [ + + ]: 421719 : for (leaf = leaves.begin(); leaf != leaves.end(); ++leaf) {
341 [ - + ]: 246508 : if (!(*leaf).get()) continue;
342 : : try {
343 : : (*leaf)->start_match(0, first + maxitems,
344 : 246508 : first + check_at_least, stats);
345 : 0 : } catch (Xapian::Error & e) {
346 [ # # ]: 0 : if (!errorhandler) throw;
347 : : LOGLINE(EXCEPTION, "Calling error handler for "
348 : : "start_match() on a SubMatch.");
349 : 0 : (*errorhandler)(e);
350 : : // Continue match without this sub-match.
351 : 0 : *leaf = NULL;
352 : : }
353 : : }
354 : : }
355 : :
356 : : // Get postlists and term info
357 : 175211 : vector<PostList *> postlists;
358 : 175211 : map<string, Xapian::MSet::Internal::TermFreqAndWeight> termfreqandwts;
359 : : map<string, Xapian::MSet::Internal::TermFreqAndWeight> * termfreqandwts_ptr;
360 : 175211 : termfreqandwts_ptr = &termfreqandwts;
361 : :
362 : 175211 : Xapian::termcount total_subqs = 0;
363 : : // Keep a count of matches which we know exist, but we won't see. This
364 : : // occurs when a submatch is remote, and returns a lower bound on the
365 : : // number of matching documents which is higher than the number of
366 : : // documents it returns (because it wasn't asked for more documents).
367 : 175211 : Xapian::doccount definite_matches_not_seen = 0;
368 [ + + ]: 421719 : for (size_t i = 0; i != leaves.size(); ++i) {
369 : : PostList *pl;
370 : : try {
371 : : pl = leaves[i]->get_postlist_and_term_info(this,
372 : : termfreqandwts_ptr,
373 : 246508 : &total_subqs);
374 [ + + + + ]: 246508 : if (termfreqandwts_ptr && !termfreqandwts.empty())
[ + + ]
375 : 168636 : termfreqandwts_ptr = NULL;
376 [ + + ]: 246508 : if (is_remote[i]) {
377 [ + + ]: 156 : if (pl->get_termfreq_min() > first + maxitems) {
378 : : LOGLINE(MATCH, "Found " <<
379 : : pl->get_termfreq_min() - (first + maxitems)
380 : : << " definite matches in remote submatch "
381 : : "which aren't passed to local match");
382 : 12 : definite_matches_not_seen += pl->get_termfreq_min();
383 : 12 : definite_matches_not_seen -= first + maxitems;
384 : : }
385 : : }
386 : 0 : } catch (Xapian::Error & e) {
387 [ # # ]: 0 : if (!errorhandler) throw;
388 : : LOGLINE(EXCEPTION, "Calling error handler for "
389 : : "get_term_info() on a SubMatch.");
390 : 0 : (*errorhandler)(e);
391 : : // FIXME: check if *ALL* the remote servers have failed!
392 : : // Continue match without this sub-match.
393 : 0 : leaves[i] = NULL;
394 : 0 : pl = new EmptyPostList;
395 : : }
396 : 246508 : postlists.push_back(pl);
397 : : }
398 : : Assert(!postlists.empty());
399 : :
400 : 175211 : ValueStreamDocument vsdoc(db);
401 : 175211 : ++vsdoc.ref_count;
402 : 175211 : Xapian::Document doc(&vsdoc);
403 : :
404 : : // Get a single combined postlist
405 : 175211 : AutoPtr<PostList> pl;
406 [ + + ]: 175211 : if (postlists.size() == 1) {
407 : 103936 : pl.reset(postlists.front());
408 : : } else {
409 : 71275 : pl.reset(new MergePostList(postlists, this, vsdoc, errorhandler));
410 : : }
411 : :
412 : : LOGLINE(MATCH, "pl = (" << pl->get_description() << ")");
413 : :
414 : : #ifdef XAPIAN_DEBUG_LOG
415 : : {
416 : : map<string, Xapian::MSet::Internal::TermFreqAndWeight>::const_iterator tfwi;
417 : : for (tfwi = termfreqandwts.begin(); tfwi != termfreqandwts.end(); ++tfwi) {
418 : : LOGLINE(MATCH, "termfreqandwts[" << tfwi->first << "] = " << tfwi->second.termfreq << ", " << tfwi->second.termweight);
419 : : }
420 : : }
421 : : #endif
422 : :
423 : : // Empty result set
424 : 175211 : Xapian::doccount docs_matched = 0;
425 : 175211 : Xapian::weight greatest_wt = 0;
426 : 175211 : Xapian::termcount greatest_wt_subqs_matched = 0;
427 : : #ifdef XAPIAN_HAS_REMOTE_BACKEND
428 : 175211 : unsigned greatest_wt_subqs_db_num = UINT_MAX;
429 : : #endif
430 : 175211 : vector<Xapian::Internal::MSetItem> items;
431 : :
432 : : // maximum weight a document could possibly have
433 : 175211 : const Xapian::weight max_possible = pl->recalc_maxweight();
434 : :
435 : : LOGLINE(MATCH, "pl = (" << pl->get_description() << ")");
436 : 175211 : recalculate_w_max = false;
437 : :
438 : 175211 : Xapian::doccount matches_upper_bound = pl->get_termfreq_max();
439 : 175211 : Xapian::doccount matches_lower_bound = 0;
440 : 175211 : Xapian::doccount matches_estimated = pl->get_termfreq_est();
441 : :
442 [ + + + - ]: 175211 : if (mdecider == NULL && matchspy_legacy == NULL) {
443 : : // If we have a matcher decider or match spy, the lower bound must be
444 : : // set to 0 as we could discard all hits. Otherwise set it to the
445 : : // minimum number of entries which the postlist could return.
446 : 175134 : matches_lower_bound = pl->get_termfreq_min();
447 : : }
448 : :
449 : : // Prepare the matchspy
450 : 175211 : Xapian::MatchSpy *matchspy = NULL;
451 : 175211 : MultipleMatchSpy multispy(matchspies);
452 [ + + ]: 175211 : if (!matchspies.empty()) {
453 [ + + ]: 50 : if (matchspies.size() == 1) {
454 : 7 : matchspy = matchspies[0];
455 : : } else {
456 : 43 : matchspy = &multispy;
457 : : }
458 : : }
459 : :
460 : : // Check if any results have been asked for (might just be wanting
461 : : // maxweight).
462 [ + + ]: 175211 : if (check_at_least == 0) {
463 : 1731 : pl.reset(NULL);
464 : 1731 : Xapian::doccount uncollapsed_lower_bound = matches_lower_bound;
465 [ + + ]: 1731 : if (collapse_max) {
466 : : // Lower bound must be set to no more than collapse_max, since it's
467 : : // possible that all matching documents have the same collapse_key
468 : : // value and so are collapsed together.
469 [ + - ]: 78 : if (matches_lower_bound > collapse_max)
470 : 78 : matches_lower_bound = collapse_max;
471 : : }
472 : :
473 : : mset = Xapian::MSet(new Xapian::MSet::Internal(
474 : : first,
475 : : matches_upper_bound,
476 : : matches_lower_bound,
477 : : matches_estimated,
478 : : matches_upper_bound,
479 : : uncollapsed_lower_bound,
480 : : matches_estimated,
481 : : max_possible, greatest_wt, items,
482 : : termfreqandwts,
483 : 1731 : 0));
484 : : return;
485 : : }
486 : :
487 : : // Number of documents considered by a decider or matchspy_legacy.
488 : 173480 : Xapian::doccount decider_considered = 0;
489 : : // Number of documents denied by the decider or matchspy_legacy.
490 : 173480 : Xapian::doccount decider_denied = 0;
491 : :
492 : : // Set max number of results that we want - this is used to decide
493 : : // when to throw away unwanted items.
494 : 173480 : Xapian::doccount max_msize = first + maxitems;
495 : 173480 : items.reserve(max_msize + 1);
496 : :
497 : : // Tracks the minimum item currently eligible for the MSet - we compare
498 : : // candidate items against this.
499 : 173480 : Xapian::Internal::MSetItem min_item(0.0, 0);
500 : :
501 : : // Minimum weight an item must have to be worth considering.
502 : 173480 : Xapian::weight min_weight = weight_cutoff;
503 : :
504 : : // Factor to multiply maximum weight seen by to get the cutoff weight.
505 : 173480 : Xapian::weight percent_cutoff_factor = percent_cutoff / 100.0;
506 : : // Corresponding correction to that in omenquire.cc to account for excess
507 : : // precision on x86.
508 : 173480 : percent_cutoff_factor -= DBL_EPSILON;
509 : :
510 : : // Object to handle collapsing.
511 : 173480 : Collapser collapser(collapse_key, collapse_max);
512 : :
513 : : /// Comparison functor for sorting MSet
514 : 173480 : bool sort_forward = (order != Xapian::Enquire::DESCENDING);
515 : 173480 : MSetCmp mcmp(get_msetcmp_function(sort_by, sort_forward, sort_value_forward));
516 : :
517 : : // Perform query
518 : :
519 : : // We form the mset in two stages. In the first we fill up our working
520 : : // mset. Adding a new document does not remove another.
521 : : //
522 : : // In the second, we consider documents which rank higher than the current
523 : : // lowest ranking document in the mset. Each document added expels the
524 : : // current lowest ranking document.
525 : : //
526 : : // If a percentage cutoff is in effect, it can cause the matcher to return
527 : : // from the second stage from the first.
528 : :
529 : : // Is the mset a valid heap?
530 : 173480 : bool is_heap = false;
531 : :
532 [ + + ]: 65743772 : while (true) {
533 : : bool pushback;
534 : :
535 [ + + ]: 65917113 : if (rare(recalculate_w_max)) {
536 [ + + ]: 593397 : if (min_weight > 0.0) {
537 [ - + ]: 57588 : if (rare(getorrecalc_maxweight(pl.get()) < min_weight)) {
538 : : LOGLINE(MATCH, "*** TERMINATING EARLY (1)");
539 : 0 : break;
540 : : }
541 : : }
542 : : }
543 : :
544 : 65917113 : PostList * pl_copy = pl.get();
545 [ + + ]: 65917113 : if (rare(next_handling_prune(pl_copy, min_weight, this))) {
546 : 92596 : (void)pl.release();
547 : 92596 : pl.reset(pl_copy);
548 : : LOGLINE(MATCH, "*** REPLACING ROOT");
549 : :
550 [ + + ]: 92596 : if (min_weight > 0.0) {
551 : : // No need for a full recalc (unless we've got to do one
552 : : // because of a prune elsewhere) - we're just switching to a
553 : : // subtree.
554 [ - + ]: 91337 : if (rare(getorrecalc_maxweight(pl.get()) < min_weight)) {
555 : : LOGLINE(MATCH, "*** TERMINATING EARLY (2)");
556 : 0 : break;
557 : : }
558 : : }
559 : : }
560 : :
561 [ + + ]: 65917113 : if (rare(pl->at_end())) {
562 : : LOGLINE(MATCH, "Reached end of potential matches");
563 : 173341 : break;
564 : : }
565 : :
566 : : // Only calculate the weight if we need it for mcmp, or there's a
567 : : // percentage or weight cutoff in effect. Otherwise we calculate it
568 : : // below if we haven't already rejected this candidate.
569 : 65743772 : Xapian::weight wt = 0.0;
570 : 65743772 : bool calculated_weight = false;
571 [ + + ][ - + ]: 65743772 : if (sort_by != VAL || min_weight > 0.0) {
572 : 65716211 : wt = pl->get_weight();
573 [ + + ]: 65716211 : if (wt < min_weight) {
574 : : LOGLINE(MATCH, "Rejecting potential match due to insufficient weight");
575 : 18477742 : continue;
576 : : }
577 : 47238469 : calculated_weight = true;
578 : : }
579 : :
580 : 47266030 : Xapian::docid did = pl->get_docid();
581 : 47266030 : vsdoc.set_document(did);
582 : : LOGLINE(MATCH, "Candidate document id " << did << " wt " << wt);
583 : 47266030 : Xapian::Internal::MSetItem new_item(wt, did);
584 [ + + ]: 47266030 : if (sort_by != REL) {
585 [ + + ]: 56481 : if (sorter) {
586 : 870 : new_item.sort_key = (*sorter)(doc);
587 : : } else {
588 : 55611 : new_item.sort_key = vsdoc.get_value(sort_key);
589 : : }
590 : :
591 : : // We're sorting by value (in part at least), so compare the item
592 : : // against the lowest currently in the proto-mset. If sort_by is
593 : : // VAL, then new_item.wt won't yet be set, but that doesn't
594 : : // matter since it's not used by the sort function.
595 [ + + ]: 56481 : if (!mcmp(new_item, min_item)) {
596 [ + + ][ + + ]: 10929 : if (mdecider == NULL && !collapser && matchspy_legacy == NULL) {
[ + - ][ + + ]
597 : : // Document was definitely suitable for mset - no more
598 : : // processing needed.
599 : : LOGLINE(MATCH, "Making note of match item which sorts lower than min_item");
600 : 7019 : ++docs_matched;
601 [ + + ]: 7019 : if (!calculated_weight) wt = pl->get_weight();
602 [ + + ]: 7019 : if (matchspy) {
603 : 100 : matchspy->operator()(doc, wt);
604 : : }
605 [ + + ]: 7019 : if (wt > greatest_wt) goto new_greatest_weight;
606 : 6829 : continue;
607 : : }
608 [ + + ]: 3910 : if (docs_matched >= check_at_least) {
609 : : // We've seen enough items - we can drop this one.
610 : : LOGLINE(MATCH, "Dropping candidate which sorts lower than min_item");
611 : : // FIXME: hmm, match decider might have rejected this...
612 [ + - ]: 3808 : if (!calculated_weight) wt = pl->get_weight();
613 [ - + ]: 3808 : if (wt > greatest_wt) goto new_greatest_weight;
614 : 3808 : continue;
615 : : }
616 : : // We can't drop the item, because we need to show it
617 : : // to the matchspy_legacy, test whether the mdecider would
618 : : // accept it, and/or test whether it would be collapsed.
619 : : LOGLINE(MATCH, "Keeping candidate which sorts lower than min_item for further investigation");
620 : : }
621 : : }
622 : :
623 : : // Use the match spy and/or decision functors (if specified).
624 [ + + ][ + + ]: 47255203 : if (matchspy != NULL || mdecider != NULL || matchspy_legacy != NULL) {
[ - + ]
625 : 5236 : const unsigned int multiplier = db.internal.size();
626 : : Assert(multiplier != 0);
627 : 5236 : Xapian::doccount n = (did - 1) % multiplier; // which actual database
628 : : // If the results are from a remote database, then the functor will
629 : : // already have been applied there so we can skip this step.
630 [ + - ]: 5236 : if (!is_remote[n]) {
631 : 5236 : ++decider_considered;
632 [ - + ][ # # ]: 5236 : if (matchspy_legacy && !matchspy_legacy->operator()(doc)) {
[ - + ]
633 : 0 : ++decider_denied;
634 : 0 : continue;
635 : : }
636 [ + + ][ + + ]: 5236 : if (mdecider && !mdecider->operator()(doc)) {
[ + + ]
637 : 186 : ++decider_denied;
638 : 186 : continue;
639 : : }
640 [ + + ]: 5050 : if (matchspy) {
641 [ + + ]: 770 : if (!calculated_weight) {
642 : 150 : wt = pl->get_weight();
643 : 150 : new_item.wt = wt;
644 : 150 : calculated_weight = true;
645 : : }
646 : 770 : matchspy->operator()(doc, wt);
647 : : }
648 : : }
649 : : }
650 : :
651 [ + + ]: 47255017 : if (!calculated_weight) {
652 : : // we didn't calculate the weight above, but now we will need it
653 : 20762 : wt = pl->get_weight();
654 : 20762 : new_item.wt = wt;
655 : : }
656 : :
657 : 47255017 : pushback = true;
658 : :
659 : : // Perform collapsing on key if requested.
660 [ + + ]: 47255017 : if (collapser) {
661 : : collapse_result res;
662 : 11422 : res = collapser.process(new_item, pl.get(), vsdoc, mcmp);
663 [ + + ]: 11422 : if (res == REJECTED) {
664 : : // If we're sorting by relevance primarily, then we throw away
665 : : // the lower weighted document anyway.
666 [ + + ][ + - ]: 4777 : if (sort_by != REL && sort_by != REL_VAL) {
667 [ - + ]: 3753 : if (wt > greatest_wt) goto new_greatest_weight;
668 : : }
669 : 4777 : continue;
670 : : }
671 : :
672 [ + + ]: 6645 : if (res == REPLACED) {
673 : : // There was a previous item in the collapse tab so
674 : : // the MSet can't be empty.
675 : : Assert(!items.empty());
676 : :
677 : : const Xapian::Internal::MSetItem & old_item =
678 : 756 : collapser.old_item;
679 : : // This is one of the best collapse_max potential MSet entries
680 : : // with this key which we've seen so far. Check if the
681 : : // entry with this key which it displaced might still be in the
682 : : // proto-MSet. If it might be, we need to check through for
683 : : // it.
684 : 756 : Xapian::weight old_wt = old_item.wt;
685 [ + - ][ + + ]: 756 : if (old_wt >= min_weight && mcmp(old_item, min_item)) {
[ + + ]
686 : : // Scan through (unsorted) MSet looking for entry.
687 : : // FIXME: more efficient way than just scanning?
688 : 635 : Xapian::docid olddid = old_item.did;
689 : 635 : vector<Xapian::Internal::MSetItem>::iterator i;
690 [ + - ]: 3348 : for (i = items.begin(); i != items.end(); ++i) {
691 [ + + ]: 3348 : if (i->did == olddid) {
692 : : LOGLINE(MATCH, "collapse: removing " <<
693 : : olddid << ": " <<
694 : : new_item.collapse_key);
695 : : // We can replace an arbitrary element in O(log N)
696 : : // but have to do it by hand (in this case the new
697 : : // elt is bigger, so we just swap down the tree).
698 : : // FIXME: implement this, and clean up is_heap
699 : : // handling
700 : 635 : *i = new_item;
701 : 635 : pushback = false;
702 : 635 : is_heap = false;
703 : 635 : break;
704 : : }
705 : : }
706 : : }
707 : : }
708 : : }
709 : :
710 : : // OK, actually add the item to the mset.
711 [ + + ]: 47250240 : if (pushback) {
712 : 47249605 : ++docs_matched;
713 [ + + ]: 47249605 : if (items.size() >= max_msize) {
714 : 24410340 : items.push_back(new_item);
715 [ + + ]: 24410340 : if (!is_heap) {
716 : 161132 : is_heap = true;
717 : 161132 : make_heap(items.begin(), items.end(), mcmp);
718 : : } else {
719 : : push_heap<vector<Xapian::Internal::MSetItem>::iterator,
720 : 24249208 : MSetCmp>(items.begin(), items.end(), mcmp);
721 : : }
722 : : pop_heap<vector<Xapian::Internal::MSetItem>::iterator,
723 : 24410340 : MSetCmp>(items.begin(), items.end(), mcmp);
724 : 24410340 : items.pop_back();
725 : :
726 : 24410340 : min_item = items.front();
727 [ + + + + ]: 24410340 : if (sort_by == REL || sort_by == REL_VAL) {
728 [ + + ]: 24405337 : if (docs_matched >= check_at_least) {
729 [ + + ]: 24401023 : if (sort_by == REL) {
730 : : // We're done if this is a forward boolean match
731 : : // with only one database (bodgetastic, FIXME
732 : : // better if we can!)
733 [ + + ][ + + ]: 24400801 : if (rare(max_possible == 0 && sort_forward)) {
[ + + ]
734 : : // In the multi database case, MergePostList
735 : : // currently processes each database
736 : : // sequentially (which actually may well be
737 : : // more efficient) so the docids in general
738 : : // won't arrive in order.
739 [ - + ]: 45 : if (leaves.size() == 1) break;
740 : : }
741 : : }
742 [ + + ]: 24401023 : if (min_item.wt > min_weight) {
743 : : LOGLINE(MATCH, "Setting min_weight to " <<
744 : : min_item.wt << " from " << min_weight);
745 : 18399322 : min_weight = min_item.wt;
746 : : }
747 : : }
748 : : }
749 [ + + ]: 24410340 : if (rare(getorrecalc_maxweight(pl.get()) < min_weight)) {
750 : : LOGLINE(MATCH, "*** TERMINATING EARLY (3)");
751 : : break;
752 : : }
753 : : } else {
754 : 22839265 : items.push_back(new_item);
755 : 22839265 : is_heap = false;
756 [ + + + + ]: 22839265 : if (sort_by == REL && items.size() == max_msize) {
[ + + ]
757 [ + + ]: 161813 : if (docs_matched >= check_at_least) {
758 : : // We're done if this is a forward boolean match
759 : : // with only one database (bodgetastic, FIXME
760 : : // better if we can!)
761 [ + + ][ + + ]: 161709 : if (rare(max_possible == 0 && sort_forward)) {
[ + + ]
762 : : // In the multi database case, MergePostList
763 : : // currently processes each database
764 : : // sequentially (which actually may well be
765 : : // more efficient) so the docids in general
766 : : // won't arrive in order.
767 [ + + ]: 174 : if (leaves.size() == 1) break;
768 : : }
769 : : }
770 : : }
771 : : }
772 : : }
773 : :
774 : : // Keep a track of the greatest weight we've seen.
775 [ + + ]: 47250101 : if (wt > greatest_wt) {
776 : : new_greatest_weight:
777 : 1088143 : greatest_wt = wt;
778 : : #ifdef XAPIAN_HAS_REMOTE_BACKEND
779 : 1088143 : const unsigned int multiplier = db.internal.size();
780 : 1088143 : unsigned int db_num = (did - 1) % multiplier;
781 [ + + ]: 1088143 : if (is_remote[db_num]) {
782 : : // Note that the greatest weighted document came from a remote
783 : : // database, and which one.
784 : 84 : greatest_wt_subqs_db_num = db_num;
785 : : } else
786 : : #endif
787 : : {
788 : 1088059 : greatest_wt_subqs_matched = pl->count_matching_subqs();
789 : : #ifdef XAPIAN_HAS_REMOTE_BACKEND
790 : 1088059 : greatest_wt_subqs_db_num = UINT_MAX;
791 : : #endif
792 : : }
793 [ + + ]: 1088143 : if (percent_cutoff) {
794 : 156 : Xapian::weight w = wt * percent_cutoff_factor;
795 [ + + ]: 156 : if (w > min_weight) {
796 : 143 : min_weight = w;
797 [ + - ]: 143 : if (!is_heap) {
798 : 143 : is_heap = true;
799 : : make_heap<vector<Xapian::Internal::MSetItem>::iterator,
800 : 143 : MSetCmp>(items.begin(), items.end(), mcmp);
801 : : }
802 [ + - ][ + + ]: 47250304 : while (!items.empty() && items.front().wt < min_weight) {
[ + + ]
803 : : pop_heap<vector<Xapian::Internal::MSetItem>::iterator,
804 : 13 : MSetCmp>(items.begin(), items.end(), mcmp);
805 : : Assert(items.back().wt < min_weight);
806 : 13 : items.pop_back();
807 : : }
808 : : #ifdef XAPIAN_ASSERTIONS_PARANOID
809 : : vector<Xapian::Internal::MSetItem>::const_iterator i;
810 : : for (i = items.begin(); i != items.end(); ++i) {
811 : : Assert(i->wt >= min_weight);
812 : : }
813 : : #endif
814 : : }
815 : : }
816 : : }
817 : : }
818 : :
819 : : // done with posting list tree
820 : 173480 : pl.reset(NULL);
821 : :
822 : 173480 : double percent_scale = 0;
823 [ + + + + ]: 173480 : if (!items.empty() && greatest_wt > 0) {
[ + + ]
824 : : // Find the document with the highest weight, then total up the
825 : : // weights for the terms it contains
826 : 166289 : vector<Xapian::Internal::MSetItem>::const_iterator best;
827 : 166289 : best = min_element(items.begin(), items.end(), mcmp);
828 : :
829 : : #ifdef XAPIAN_HAS_REMOTE_BACKEND
830 [ + + ]: 166289 : if (greatest_wt_subqs_db_num != UINT_MAX) {
831 : 54 : const unsigned int n = greatest_wt_subqs_db_num;
832 : : RemoteSubMatch * rem_match;
833 : 54 : rem_match = static_cast<RemoteSubMatch*>(leaves[n].get());
834 : 54 : percent_scale = rem_match->get_percent_factor() / 100.0;
835 : : } else
836 : : #endif
837 : : {
838 : 166235 : percent_scale = greatest_wt_subqs_matched / double(total_subqs);
839 : 166235 : percent_scale /= greatest_wt;
840 : : }
841 : : Assert(percent_scale > 0);
842 [ + + ]: 166289 : if (percent_cutoff) {
843 : : // FIXME: better to sort and binary chop maybe?
844 : : // Or we could just use a linear scan here instead.
845 : :
846 : : // trim the mset to the correct answer...
847 : 91 : Xapian::weight min_wt = percent_cutoff_factor / percent_scale;
848 [ + + ]: 91 : if (!is_heap) {
849 : 34 : is_heap = true;
850 : : make_heap<vector<Xapian::Internal::MSetItem>::iterator,
851 : 34 : MSetCmp>(items.begin(), items.end(), mcmp);
852 : : }
853 [ + - ][ - + ]: 91 : while (!items.empty() && items.front().wt < min_wt) {
[ - + ]
854 : : pop_heap<vector<Xapian::Internal::MSetItem>::iterator,
855 : 0 : MSetCmp>(items.begin(), items.end(), mcmp);
856 : : Assert(items.back().wt < min_wt);
857 : 0 : items.pop_back();
858 : : }
859 : : #ifdef XAPIAN_ASSERTIONS_PARANOID
860 : : vector<Xapian::Internal::MSetItem>::const_iterator j;
861 : : for (j = items.begin(); j != items.end(); ++j) {
862 : : Assert(j->wt >= min_wt);
863 : : }
864 : : #endif
865 : : }
866 : 166289 : percent_scale *= 100.0;
867 : : }
868 : :
869 : : LOGLINE(MATCH,
870 : : "docs_matched = " << docs_matched <<
871 : : ", definite_matches_not_seen = " << definite_matches_not_seen <<
872 : : ", matches_lower_bound = " << matches_lower_bound <<
873 : : ", matches_estimated = " << matches_estimated <<
874 : : ", matches_upper_bound = " << matches_upper_bound);
875 : :
876 : : // Adjust docs_matched to take account of documents which matched remotely
877 : : // but weren't sent across.
878 : 173480 : docs_matched += definite_matches_not_seen;
879 : :
880 : 173480 : Xapian::doccount uncollapsed_lower_bound = matches_lower_bound;
881 : 173480 : Xapian::doccount uncollapsed_upper_bound = matches_upper_bound;
882 : 173480 : Xapian::doccount uncollapsed_estimated = matches_estimated;
883 [ + + ]: 173480 : if (items.size() < max_msize) {
884 : : // We have fewer items in the mset than we tried to get for it, so we
885 : : // must have all the matches in it.
886 : : LOGLINE(MATCH, "items.size() = " << items.size() <<
887 : : ", max_msize = " << max_msize << ", setting bounds equal");
888 : : Assert(definite_matches_not_seen == 0);
889 : : Assert(percent_cutoff || docs_matched == items.size());
890 : : matches_lower_bound = matches_upper_bound = matches_estimated
891 : 10479 : = items.size();
892 [ + + + + ]: 10479 : if (collapser && matches_lower_bound > uncollapsed_lower_bound)
[ + + ]
893 : 7 : uncollapsed_lower_bound = matches_lower_bound;
894 [ + + ][ + + ]: 163001 : } else if (!collapser && docs_matched < check_at_least) {
[ + + ]
895 : : // We have seen fewer matches than we checked for, so we must have seen
896 : : // all the matches.
897 : : LOGLINE(MATCH, "Setting bounds equal");
898 : : matches_lower_bound = matches_upper_bound = matches_estimated
899 : 286 : = docs_matched;
900 [ - + ][ # # ]: 286 : if (collapser && matches_lower_bound > uncollapsed_lower_bound)
[ - + ]
901 : 0 : uncollapsed_lower_bound = matches_lower_bound;
902 : : } else {
903 : : AssertRel(matches_estimated,>=,matches_lower_bound);
904 : : AssertRel(matches_estimated,<=,matches_upper_bound);
905 : :
906 : : // We can end up scaling the estimate more than once, so collect
907 : : // the scale factors and apply them in one go to avoid rounding
908 : : // more than once.
909 : 162715 : double estimate_scale = 1.0;
910 : 162715 : double unique_rate = 1.0;
911 : :
912 [ + + ]: 162715 : if (collapser) {
913 : : LOGLINE(MATCH, "Adjusting bounds due to collapse_key");
914 : 638 : matches_lower_bound = collapser.get_matches_lower_bound();
915 [ + + ]: 638 : if (matches_lower_bound > uncollapsed_lower_bound)
916 : 21 : uncollapsed_lower_bound = matches_lower_bound;
917 : :
918 : : // The estimate for the number of hits can be modified by
919 : : // multiplying it by the rate at which we've been finding
920 : : // unique documents.
921 : 638 : Xapian::doccount docs_considered = collapser.get_docs_considered();
922 : 638 : Xapian::doccount dups_ignored = collapser.get_dups_ignored();
923 [ + - ]: 638 : if (docs_considered > 0) {
924 : 638 : double unique = double(docs_considered - dups_ignored);
925 : 638 : unique_rate = unique / double(docs_considered);
926 : : }
927 : :
928 : : // We can safely reduce the upper bound by the number of duplicates
929 : : // we've ignored.
930 : 638 : matches_upper_bound -= dups_ignored;
931 : :
932 : : LOGLINE(MATCH, "matches_lower_bound=" << matches_lower_bound <<
933 : : ", matches_estimated=" << matches_estimated <<
934 : : "*" << estimate_scale << "*" << unique_rate <<
935 : : ", matches_upper_bound=" << matches_upper_bound);
936 : : }
937 : :
938 [ + + ][ - + ]: 162715 : if (mdecider || matchspy_legacy) {
939 [ + + ]: 35 : if (!percent_cutoff) {
940 [ + + ]: 21 : if (!collapser) {
941 : : // We're not collapsing or doing a percentage cutoff, so
942 : : // docs_matched is a lower bound on the total number of
943 : : // matches.
944 : 7 : matches_lower_bound = max(docs_matched, matches_lower_bound);
945 : : }
946 : : }
947 : :
948 : : // The estimate for the number of hits can be modified by
949 : : // multiplying it by the rate at which the match decider has
950 : : // been accepting documents.
951 [ + - ]: 35 : if (decider_considered > 0) {
952 : 35 : double accept = double(decider_considered - decider_denied);
953 : 35 : double accept_rate = accept / double(decider_considered);
954 : 35 : estimate_scale *= accept_rate;
955 : : }
956 : :
957 : : // If a document is denied by a match decider, it is not possible
958 : : // for it to found to be a duplicate, so it is safe to also reduce
959 : : // the upper bound by the number of documents denied by a match
960 : : // decider.
961 : 35 : matches_upper_bound -= decider_denied;
962 [ + + ]: 35 : if (collapser) uncollapsed_upper_bound -= decider_denied;
963 : : }
964 : :
965 [ + + ]: 162715 : if (percent_cutoff) {
966 : 27 : estimate_scale *= (1.0 - percent_cutoff_factor);
967 : : // another approach:
968 : : // Xapian::doccount new_est = items.size() * (1 - percent_cutoff_factor) / (1 - min_weight / greatest_wt);
969 : : // and another: items.size() + (1 - greatest_wt * percent_cutoff_factor / min_weight) * (matches_estimated - items.size());
970 : :
971 : : // Very likely an underestimate, but we can't really do better
972 : : // without checking further matches... Only possibility would be
973 : : // to track how many docs made the min weight test but didn't make
974 : : // the candidate set since the last greatest_wt change, which we
975 : : // could use if the top documents matched all the prob terms.
976 : 27 : matches_lower_bound = items.size();
977 [ + + ]: 27 : if (collapser) uncollapsed_lower_bound = matches_lower_bound;
978 : :
979 : : // matches_upper_bound could be reduced by the number of documents
980 : : // which fail the min weight test (FIXME)
981 : :
982 : : LOGLINE(MATCH, "Adjusted bounds due to percent_cutoff (" <<
983 : : percent_cutoff << "): now have matches_estimated=" <<
984 : : matches_estimated << ", matches_lower_bound=" <<
985 : : matches_lower_bound);
986 : : }
987 : :
988 [ + + ][ + + ]: 162715 : if (collapser && estimate_scale != 1.0) {
[ + + ]
989 : : uncollapsed_estimated =
990 : 31 : Xapian::doccount(uncollapsed_estimated * estimate_scale + 0.5);
991 : : }
992 : :
993 : 162715 : estimate_scale *= unique_rate;
994 : :
995 [ + + ]: 162715 : if (estimate_scale != 1.0) {
996 : : matches_estimated =
997 : 80 : Xapian::doccount(matches_estimated * estimate_scale + 0.5);
998 [ + + ]: 80 : if (matches_estimated < matches_lower_bound)
999 : 13 : matches_estimated = matches_lower_bound;
1000 : : }
1001 : :
1002 [ + + ][ + + ]: 162715 : if (collapser || mdecider || matchspy_legacy) {
[ - + ][ + + ]
1003 : : LOGLINE(MATCH, "Clamping estimate between bounds: "
1004 : : "matches_lower_bound = " << matches_lower_bound <<
1005 : : ", matches_estimated = " << matches_estimated <<
1006 : : ", matches_upper_bound = " << matches_upper_bound);
1007 : : AssertRel(matches_lower_bound,<=,matches_upper_bound);
1008 : 652 : matches_estimated = max(matches_estimated, matches_lower_bound);
1009 : 652 : matches_estimated = min(matches_estimated, matches_upper_bound);
1010 [ + - ]: 162063 : } else if (!percent_cutoff) {
1011 : : AssertRel(docs_matched,<=,matches_upper_bound);
1012 [ + + ]: 162063 : if (docs_matched > matches_lower_bound)
1013 : 170 : matches_lower_bound = docs_matched;
1014 [ + + ]: 162063 : if (docs_matched > matches_estimated)
1015 : 141 : matches_estimated = docs_matched;
1016 : : }
1017 : :
1018 [ + + ][ + + ]: 162715 : if (collapser && !mdecider && !percent_cutoff && !matchspy_legacy) {
[ + + ][ + - ]
[ + + ]
1019 : : AssertRel(docs_matched,<=,uncollapsed_upper_bound);
1020 [ - + ]: 604 : if (docs_matched > uncollapsed_lower_bound)
1021 : 0 : uncollapsed_lower_bound = docs_matched;
1022 : : }
1023 : : }
1024 : :
1025 [ + + ]: 173480 : if (collapser) {
1026 : : AssertRel(uncollapsed_lower_bound,<=,uncollapsed_upper_bound);
1027 [ + + ]: 1138 : if (uncollapsed_estimated < uncollapsed_lower_bound) {
1028 : 13 : uncollapsed_estimated = uncollapsed_lower_bound;
1029 [ - + ]: 1125 : } else if (uncollapsed_estimated > uncollapsed_upper_bound) {
1030 : 0 : uncollapsed_estimated = uncollapsed_upper_bound;
1031 : : }
1032 : : } else {
1033 : : // We're not collapsing, so the bounds must be the same.
1034 : 172342 : uncollapsed_lower_bound = matches_lower_bound;
1035 : 172342 : uncollapsed_upper_bound = matches_upper_bound;
1036 : 172342 : uncollapsed_estimated = matches_estimated;
1037 : : }
1038 : :
1039 : : LOGLINE(MATCH, items.size() << " items in potential mset");
1040 : :
1041 [ + + ]: 173480 : if (first > 0) {
1042 : : // Remove unwanted leading entries
1043 [ + + ]: 158306 : if (items.size() <= first) {
1044 : 32 : items.clear();
1045 : : } else {
1046 : : LOGLINE(MATCH, "finding " << first << "th");
1047 : : // We perform nth_element() on reverse iterators so that the
1048 : : // unwanted elements end up at the end of items, which means
1049 : : // that the call to erase() to remove them doesn't have to copy
1050 : : // any elements.
1051 : 158274 : vector<Xapian::Internal::MSetItem>::reverse_iterator nth;
1052 : 158274 : nth = items.rbegin() + first;
1053 : 158274 : nth_element(items.rbegin(), nth, items.rend(), mcmp);
1054 : : // Erase the trailing ``first'' elements
1055 : 158274 : items.erase(items.begin() + items.size() - first, items.end());
1056 : : }
1057 : : }
1058 : :
1059 : : LOGLINE(MATCH, "sorting " << items.size() << " entries");
1060 : :
1061 : : // Need a stable sort, but this is provided by comparison operator
1062 : 173480 : sort(items.begin(), items.end(), mcmp);
1063 : :
1064 : 173480 : if (!items.empty()) {
1065 : : LOGLINE(MATCH, "min weight in mset = " << items.back().wt);
1066 : : LOGLINE(MATCH, "max weight in mset = " << items[0].wt);
1067 : : }
1068 : :
1069 : : AssertRel(matches_estimated,>=,matches_lower_bound);
1070 : : AssertRel(matches_estimated,<=,matches_upper_bound);
1071 : :
1072 : : AssertRel(uncollapsed_estimated,>=,uncollapsed_lower_bound);
1073 : : AssertRel(uncollapsed_estimated,<=,uncollapsed_upper_bound);
1074 : :
1075 : : // We may need to qualify any collapse_count to see if the highest weight
1076 : : // collapsed item would have qualified percent_cutoff
1077 : : // We WILL need to restore collapse_count to the mset by taking from
1078 : : // collapse_tab; this is what comes of copying around whole objects
1079 : : // instead of taking references, we find it hard to update collapse_count
1080 : : // of an item that has already been pushed-back as we don't know where it
1081 : : // is any more. If we keep or find references we won't need to mess with
1082 : : // is_heap so much maybe?
1083 [ + + + + ]: 173480 : if (!items.empty() && collapser && !collapser.empty()) {
[ + + ][ + + ]
1084 : : // Nicked this formula from above, but for some reason percent_scale
1085 : : // has since been multiplied by 100 so we take that into account
1086 : 1092 : Xapian::weight min_wt = percent_cutoff_factor / (percent_scale / 100);
1087 : 1092 : Xapian::doccount entries = collapser.entries();
1088 : 1092 : vector<Xapian::Internal::MSetItem>::iterator i;
1089 [ + + ]: 5416 : for (i = items.begin(); i != items.end(); ++i) {
1090 : : // Skip entries without a collapse key.
1091 [ - + ]: 5377 : if (i->collapse_key.empty()) continue;
1092 : :
1093 : : // Set collapse_count appropriately.
1094 : 5377 : i->collapse_count = collapser.get_collapse_count(i->collapse_key, percent_cutoff, min_wt);
1095 [ + + ]: 5377 : if (--entries == 0) {
1096 : : // Stop once we've processed all items with collapse keys.
1097 : 1053 : break;
1098 : : }
1099 : : }
1100 : : }
1101 : :
1102 : : mset = Xapian::MSet(new Xapian::MSet::Internal(
1103 : : first,
1104 : : matches_upper_bound,
1105 : : matches_lower_bound,
1106 : : matches_estimated,
1107 : : uncollapsed_upper_bound,
1108 : : uncollapsed_lower_bound,
1109 : : uncollapsed_estimated,
1110 : : max_possible, greatest_wt, items,
1111 : : termfreqandwts,
1112 [ + + ][ + + ]: 179524 : percent_scale));
[ + + ][ + + ]
[ + + ][ + + ]
1113 : : }
|