LCOV - code coverage report
Current view: top level - matcher - multimatch.cc (source / functions) Hit Total Coverage
Test: Test Coverage for xapian-core r Lines: 362 386 93.8 %
Date: 2011-08-21 Functions: 8 10 80.0 %
Branches: 328 392 83.7 %

           Branch data     Line data    Source code
       1                 :            : /* multimatch.cc
       2                 :            :  *
       3                 :            :  * Copyright 1999,2000,2001 BrightStation PLC
       4                 :            :  * Copyright 2001,2002 Ananova Ltd
       5                 :            :  * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010 Olly Betts
       6                 :            :  * Copyright 2003 Orange PCS Ltd
       7                 :            :  * Copyright 2003 Sam Liddicott
       8                 :            :  * Copyright 2007,2008,2009 Lemur Consulting Ltd
       9                 :            :  *
      10                 :            :  * This program is free software; you can redistribute it and/or
      11                 :            :  * modify it under the terms of the GNU General Public License as
      12                 :            :  * published by the Free Software Foundation; either version 2 of the
      13                 :            :  * License, or (at your option) any later version.
      14                 :            :  *
      15                 :            :  * This program is distributed in the hope that it will be useful,
      16                 :            :  * but WITHOUT ANY WARRANTY; without even the implied warranty of
      17                 :            :  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      18                 :            :  * GNU General Public License for more details.
      19                 :            :  *
      20                 :            :  * You should have received a copy of the GNU General Public License
      21                 :            :  * along with this program; if not, write to the Free Software
      22                 :            :  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
      23                 :            :  * USA
      24                 :            :  */
      25                 :            : 
      26                 :            : #include <config.h>
      27                 :            : 
      28                 :            : #include "multimatch.h"
      29                 :            : 
      30                 :            : #include "autoptr.h"
      31                 :            : #include "collapser.h"
      32                 :            : #include "debuglog.h"
      33                 :            : #include "submatch.h"
      34                 :            : #include "localsubmatch.h"
      35                 :            : #include "omassert.h"
      36                 :            : #include "omenquireinternal.h"
      37                 :            : 
      38                 :            : #include "emptypostlist.h"
      39                 :            : #include "branchpostlist.h"
      40                 :            : #include "mergepostlist.h"
      41                 :            : 
      42                 :            : #include "document.h"
      43                 :            : #include "omqueryinternal.h"
      44                 :            : 
      45                 :            : #include "submatch.h"
      46                 :            : 
      47                 :            : #include "msetcmp.h"
      48                 :            : 
      49                 :            : #include "valuestreamdocument.h"
      50                 :            : #include "weightinternal.h"
      51                 :            : 
      52                 :            : #include <xapian/errorhandler.h>
      53                 :            : #include <xapian/matchspy.h>
      54                 :            : #include <xapian/version.h> // For XAPIAN_HAS_REMOTE_BACKEND
      55                 :            : 
      56                 :            : #ifdef XAPIAN_HAS_REMOTE_BACKEND
      57                 :            : #include "remotesubmatch.h"
      58                 :            : #include "remote-database.h"
      59                 :            : #endif /* XAPIAN_HAS_REMOTE_BACKEND */
      60                 :            : 
      61                 :            : #include <algorithm>
      62                 :            : #include <cfloat> // For DBL_EPSILON.
      63                 :            : #include <climits> // For UINT_MAX.
      64                 :            : #include <vector>
      65                 :            : #include <map>
      66                 :            : #include <set>
      67                 :            : 
      68                 :            : using namespace std;
      69                 :            : 
      70                 :            : const Xapian::Enquire::Internal::sort_setting REL =
      71                 :            :         Xapian::Enquire::Internal::REL;
      72                 :            : const Xapian::Enquire::Internal::sort_setting REL_VAL =
      73                 :            :         Xapian::Enquire::Internal::REL_VAL;
      74                 :            : const Xapian::Enquire::Internal::sort_setting VAL =
      75                 :            :         Xapian::Enquire::Internal::VAL;
      76                 :            : #if 0 // VAL_REL isn't currently used which causes a warning with SGI CC.
      77                 :            : const Xapian::Enquire::Internal::sort_setting VAL_REL =
      78                 :            :         Xapian::Enquire::Internal::VAL_REL;
      79                 :            : #endif
      80                 :            : 
      81                 :            : /** Split an RSet into several sub rsets, one for each database.
      82                 :            :  *
      83                 :            :  *  @param rset The RSet to split.
      84                 :            :  *  @param number_of_subdbs The number of sub databases which exist.
      85                 :            :  *  @param subrsets Vector of RSets which will the sub rsets will be placed in.
      86                 :            :  *                  This should be empty when the function is called.
      87                 :            :  */
      88                 :            : static void
      89                 :     179504 : split_rset_by_db(const Xapian::RSet * rset,
      90                 :            :                  Xapian::doccount number_of_subdbs,
      91                 :            :                  vector<Xapian::RSet> & subrsets)
      92                 :            : {
      93                 :            :     LOGCALL_STATIC_VOID(MATCH, "split_rset_by_db", rset | number_of_subdbs | subrsets);
      94         [ +  + ]:     179504 :     if (rset) {
      95         [ +  + ]:       4494 :         if (number_of_subdbs == 1) {
      96                 :            :             // The common case of a single database is easy to handle.
      97                 :       4438 :             subrsets.push_back(*rset);
      98                 :            :         } else {
      99                 :            :             // Can't just use vector::resize() here, since that creates N
     100                 :            :             // copies of the same RSet!
     101                 :         56 :             subrsets.reserve(number_of_subdbs);
     102         [ +  + ]:        168 :             for (size_t i = 0; i < number_of_subdbs; ++i) {
     103                 :        112 :                 subrsets.push_back(Xapian::RSet());
     104                 :            :             }
     105                 :            : 
     106                 :         56 :             const set<Xapian::docid> & rsetitems = rset->internal->get_items();
     107                 :         56 :             set<Xapian::docid>::const_iterator j;
     108         [ +  + ]:        130 :             for (j = rsetitems.begin(); j != rsetitems.end(); ++j) {
     109                 :         74 :                 Xapian::doccount local_docid = (*j - 1) / number_of_subdbs + 1;
     110                 :         74 :                 Xapian::doccount subdatabase = (*j - 1) % number_of_subdbs;
     111                 :         74 :                 subrsets[subdatabase].add_document(local_docid);
     112                 :            :             }
     113                 :            :         }
     114                 :            :     } else {
     115                 :            :         // NB vector::resize() creates N copies of the same empty RSet.
     116                 :     175010 :         subrsets.resize(number_of_subdbs);
     117                 :            :     }
     118                 :            :     Assert(subrsets.size() == number_of_subdbs);
     119                 :     179504 : }
     120                 :            : 
     121                 :            : /** Prepare some SubMatches.
     122                 :            :  *
     123                 :            :  *  This calls the prepare_match() method on each SubMatch object, causing them
     124                 :            :  *  to lookup various statistics.
     125                 :            :  *
     126                 :            :  *  This method is rather complicated in order to handle remote matches
     127                 :            :  *  efficiently.  Instead of simply calling "prepare_match()" on each submatch
     128                 :            :  *  and waiting for it to return, it first calls "prepare_match(true)" on each
     129                 :            :  *  submatch.  If any of these calls return false, indicating that the required
     130                 :            :  *  information has not yet been received from the server, the method will try
     131                 :            :  *  those which returned false again, passing "false" as a parameter to
     132                 :            :  *  indicate that they should block until the information is ready.
     133                 :            :  *
     134                 :            :  *  This should improve performance in the case of mixed local-and-remote
     135                 :            :  *  searches - the local searchers will all fetch their statistics from disk
     136                 :            :  *  without waiting for the remote searchers, so as soon as the remote searcher
     137                 :            :  *  statistics arrive, we can move on to the next step.
     138                 :            :  */
     139                 :            : static void
     140                 :     179462 : prepare_sub_matches(vector<Xapian::Internal::RefCntPtr<SubMatch> > & leaves,
     141                 :            :                     Xapian::ErrorHandler * errorhandler,
     142                 :            :                     Xapian::Weight::Internal & stats)
     143                 :            : {
     144                 :            :     LOGCALL_STATIC_VOID(MATCH, "prepare_sub_matches", leaves | errorhandler | stats);
     145                 :            :     // We use a vector<bool> to track which SubMatches we're already prepared.
     146                 :     179462 :     vector<bool> prepared;
     147                 :     179462 :     prepared.resize(leaves.size(), false);
     148                 :     179462 :     size_t unprepared = leaves.size();
     149                 :     179462 :     bool nowait = true;
     150         [ +  + ]:     358924 :     while (unprepared) {
     151         [ +  + ]:     430225 :         for (size_t leaf = 0; leaf < leaves.size(); ++leaf) {
     152         [ +  + ]:     250763 :             if (prepared[leaf]) continue;
     153                 :            :             try {
     154                 :     250762 :                 SubMatch * submatch = leaves[leaf].get();
     155   [ +  -  +  + ]:     250762 :                 if (!submatch || submatch->prepare_match(nowait, stats)) {
                 [ +  + ]
     156                 :     250756 :                     prepared[leaf] = true;
     157                 :     250756 :                     --unprepared;
     158                 :            :                 }
     159                 :          6 :             } catch (Xapian::Error & e) {
     160         [ +  - ]:          3 :                 if (!errorhandler) throw;
     161                 :            : 
     162                 :            :                 LOGLINE(EXCEPTION, "Calling error handler for prepare_match() on a SubMatch.");
     163                 :          0 :                 (*errorhandler)(e);
     164                 :            :                 // Continue match without this sub-match.
     165                 :          0 :                 leaves[leaf] = NULL;
     166                 :          0 :                 prepared[leaf] = true;
     167                 :          0 :                 --unprepared;
     168                 :            :             }
     169                 :            :         }
     170                 :            :         // Use blocking IO on subsequent passes, so that we don't go into
     171                 :            :         // a tight loop.
     172                 :     179462 :         nowait = false;
     173                 :     179462 :     }
     174                 :     179459 : }
     175                 :            : 
     176                 :            : /// Class which applies several match spies in turn.
     177 [ #  # ][ -  + ]:     175211 : class MultipleMatchSpy : public Xapian::MatchSpy {
     178                 :            :   private:
     179                 :            :     /// List of match spies to call, in order.
     180                 :            :     const std::vector<Xapian::MatchSpy *> & spies;
     181                 :            : 
     182                 :            :   public:
     183                 :     175211 :     MultipleMatchSpy(const std::vector<Xapian::MatchSpy *> & spies_)
     184                 :     175211 :             : spies(spies_) {}
     185                 :            : 
     186                 :            :     /** Implementation of virtual operator().
     187                 :            :      *
     188                 :            :      *  This implementation calls all the spies in turn.
     189                 :            :      */
     190                 :            :     void operator()(const Xapian::Document &doc, Xapian::weight wt);
     191                 :            : };
     192                 :            : 
     193                 :            : void 
     194                 :        828 : MultipleMatchSpy::operator()(const Xapian::Document &doc, Xapian::weight wt) {
     195                 :            :     LOGCALL_VOID(MATCH, "MultipleMatchSpy::operator()", doc | wt);
     196                 :        828 :     vector<Xapian::MatchSpy *>::const_iterator i;
     197         [ +  + ]:       3234 :     for (i = spies.begin(); i != spies.end(); ++i) {
     198                 :       2406 :         (**i)(doc, wt);
     199                 :            :     }
     200                 :        828 : }
     201                 :            : 
     202                 :            : ////////////////////////////////////
     203                 :            : // Initialisation and cleaning up //
     204                 :            : ////////////////////////////////////
     205                 :     179569 : MultiMatch::MultiMatch(const Xapian::Database &db_,
     206                 :            :                        const Xapian::Query::Internal * query_,
     207                 :            :                        Xapian::termcount qlen,
     208                 :            :                        const Xapian::RSet * omrset,
     209                 :            :                        Xapian::doccount collapse_max_,
     210                 :            :                        Xapian::valueno collapse_key_,
     211                 :            :                        int percent_cutoff_, Xapian::weight weight_cutoff_,
     212                 :            :                        Xapian::Enquire::docid_order order_,
     213                 :            :                        Xapian::valueno sort_key_,
     214                 :            :                        Xapian::Enquire::Internal::sort_setting sort_by_,
     215                 :            :                        bool sort_value_forward_,
     216                 :            :                        Xapian::ErrorHandler * errorhandler_,
     217                 :            :                        Xapian::Weight::Internal & stats,
     218                 :            :                        const Xapian::Weight * weight_,
     219                 :            :                        const vector<Xapian::MatchSpy *> & matchspies_,
     220                 :            :                        bool have_sorter, bool have_mdecider)
     221                 :            :         : db(db_), query(query_),
     222                 :            :           collapse_max(collapse_max_), collapse_key(collapse_key_),
     223                 :            :           percent_cutoff(percent_cutoff_), weight_cutoff(weight_cutoff_),
     224                 :            :           order(order_),
     225                 :            :           sort_key(sort_key_), sort_by(sort_by_),
     226                 :            :           sort_value_forward(sort_value_forward_),
     227                 :            :           errorhandler(errorhandler_), weight(weight_),
     228                 :            :           is_remote(db.internal.size()),
     229                 :     179569 :           matchspies(matchspies_)
     230                 :            : {
     231                 :            :     LOGCALL_CTOR(MATCH, "MultiMatch", db_ | query_ | qlen | omrset | collapse_max_ | collapse_key_ | percent_cutoff_ | weight_cutoff_ | int(order_) | sort_key_ | int(sort_by_) | sort_value_forward_ | errorhandler_ | stats | weight_ | matchspies_ | have_sorter | have_mdecider);
     232                 :            : 
     233   [ +  +  #  # ]:     179569 :     if (!query) return;
     234                 :     179504 :     query->validate_query();
     235                 :            : 
     236                 :     179504 :     Xapian::doccount number_of_subdbs = db.internal.size();
     237                 :     179504 :     vector<Xapian::RSet> subrsets;
     238                 :     179504 :     split_rset_by_db(omrset, number_of_subdbs, subrsets);
     239                 :            : 
     240 [ +  + ][ #  # ]:     430305 :     for (size_t i = 0; i != number_of_subdbs; ++i) {
     241                 :     250801 :         Xapian::Database::Internal *subdb = db.internal[i].get();
     242                 :            :         Assert(subdb);
     243                 :     250801 :         Xapian::Internal::RefCntPtr<SubMatch> smatch;
     244                 :            :         try {
     245                 :            :             // There is currently only one special case, for network databases.
     246                 :            : #ifdef XAPIAN_HAS_REMOTE_BACKEND
     247                 :     250801 :             RemoteDatabase *rem_db = subdb->as_remotedatabase();
     248   [ +  +  #  # ]:     250801 :             if (rem_db) {
     249 [ +  + ][ #  # ]:       4446 :                 if (have_sorter) {
     250                 :          6 :                     throw Xapian::UnimplementedError("Xapian::KeyMaker not supported for the remote backend");
     251                 :            :                 }
     252 [ +  + ][ #  # ]:       4440 :                 if (have_mdecider) {
     253                 :         18 :                     throw Xapian::UnimplementedError("Xapian::MatchDecider not supported for the remote backend");
     254                 :            :                 }
     255                 :            :                 rem_db->set_query(query, qlen, collapse_max, collapse_key,
     256                 :            :                                   order, sort_key, sort_by, sort_value_forward,
     257                 :            :                                   percent_cutoff, weight_cutoff, weight,
     258                 :       4422 :                                   subrsets[i], matchspies);
     259                 :            :                 bool decreasing_relevance =
     260   [ +  +  +  + ]:       4404 :                     (sort_by == REL || sort_by == REL_VAL);
         [ #  # ][ #  # ]
     261                 :       4404 :                 smatch = new RemoteSubMatch(rem_db, decreasing_relevance, matchspies);
     262                 :       4404 :                 is_remote[i] = true;
     263                 :            :             } else {
     264                 :     246355 :                 smatch = new LocalSubMatch(subdb, query, qlen, subrsets[i], weight);
     265                 :            :             }
     266                 :            : #else
     267                 :            :             // Avoid unused parameter warnings.
     268                 :            :             (void)have_sorter;
     269                 :            :             (void)have_mdecider;
     270                 :            :             smatch = new LocalSubMatch(subdb, query, qlen, subrsets[i], weight);
     271                 :            : #endif /* XAPIAN_HAS_REMOTE_BACKEND */
     272                 :         84 :         } catch (Xapian::Error & e) {
     273 [ +  - ][ #  # ]:         42 :             if (!errorhandler) throw;
     274                 :            :             LOGLINE(EXCEPTION, "Calling error handler for creation of a SubMatch from a database and query.");
     275                 :          0 :             (*errorhandler)(e);
     276                 :            :             // Continue match without this sub-postlist.
     277                 :          0 :             smatch = NULL;
     278                 :            :         }
     279                 :     250759 :         leaves.push_back(smatch);
     280                 :            :     }
     281                 :            : 
     282                 :     179462 :     stats.mark_wanted_terms(*query);
     283                 :     179462 :     prepare_sub_matches(leaves, errorhandler, stats);
     284                 :     179504 :     stats.set_bounds_from_db(db);
     285                 :     179659 : }
     286                 :            : 
     287                 :            : Xapian::weight
     288                 :   24559265 : MultiMatch::getorrecalc_maxweight(PostList *pl)
     289                 :            : {
     290                 :            :     LOGCALL(MATCH, Xapian::weight, "MultiMatch::getorrecalc_maxweight", pl);
     291                 :            :     Xapian::weight wt;
     292         [ +  + ]:   24559265 :     if (recalculate_w_max) {
     293                 :            :         LOGLINE(MATCH, "recalculating max weight");
     294                 :     223770 :         wt = pl->recalc_maxweight();
     295                 :     223770 :         recalculate_w_max = false;
     296                 :            :     } else {
     297                 :   24335495 :         wt = pl->get_maxweight();
     298                 :            :         LOGLINE(MATCH, "pl = (" << pl->get_description() << ")");
     299                 :            :         AssertEqDoubleParanoid(wt, pl->recalc_maxweight());
     300                 :            :     }
     301                 :            :     LOGLINE(MATCH, "max possible doc weight = " << wt);
     302                 :   24559265 :     RETURN(wt);
     303                 :            : }
     304                 :            : 
     305                 :            : void
     306                 :     179524 : MultiMatch::get_mset(Xapian::doccount first, Xapian::doccount maxitems,
     307                 :            :                      Xapian::doccount check_at_least,
     308                 :            :                      Xapian::MSet & mset,
     309                 :            :                      const Xapian::Weight::Internal & stats,
     310                 :            :                      const Xapian::MatchDecider *mdecider,
     311                 :            :                      const Xapian::MatchDecider *matchspy_legacy,
     312                 :            :                      const Xapian::KeyMaker *sorter)
     313                 :            : {
     314                 :            :     LOGCALL_VOID(MATCH, "MultiMatch::get_mset", first | maxitems | check_at_least | Literal("mset") | stats | Literal("mdecider") | Literal("matchspy_legacy") | Literal("sorter"));
     315                 :            :     AssertRel(check_at_least,>=,maxitems);
     316                 :            : 
     317         [ +  + ]:     179524 :     if (!query) {
     318                 :         65 :         mset = Xapian::MSet(new Xapian::MSet::Internal());
     319                 :         65 :         mset.internal->firstitem = first;
     320                 :         65 :         return;
     321                 :            :     }
     322                 :            : 
     323                 :            :     Assert(!leaves.empty());
     324                 :            : 
     325                 :            : #ifdef XAPIAN_HAS_REMOTE_BACKEND
     326                 :            :     // If there's only one database and it's remote, we can just unserialise
     327                 :            :     // its MSet and return that.
     328 [ +  + ][ +  + ]:     179459 :     if (leaves.size() == 1 && is_remote[0]) {
                 [ +  + ]
     329                 :            :         RemoteSubMatch * rem_match;
     330                 :       4248 :         rem_match = static_cast<RemoteSubMatch*>(leaves[0].get());
     331                 :       4248 :         rem_match->start_match(first, maxitems, check_at_least, stats);
     332                 :       4248 :         rem_match->get_mset(mset);
     333                 :       4248 :         return;
     334                 :            :     }
     335                 :            : #endif
     336                 :            : 
     337                 :            :     // Start matchers.
     338                 :            :     {
     339                 :     175211 :         vector<Xapian::Internal::RefCntPtr<SubMatch> >::iterator leaf;
     340         [ +  + ]:     421719 :         for (leaf = leaves.begin(); leaf != leaves.end(); ++leaf) {
     341         [ -  + ]:     246508 :             if (!(*leaf).get()) continue;
     342                 :            :             try {
     343                 :            :                 (*leaf)->start_match(0, first + maxitems,
     344                 :     246508 :                                      first + check_at_least, stats);
     345                 :          0 :             } catch (Xapian::Error & e) {
     346         [ #  # ]:          0 :                 if (!errorhandler) throw;
     347                 :            :                 LOGLINE(EXCEPTION, "Calling error handler for "
     348                 :            :                                    "start_match() on a SubMatch.");
     349                 :          0 :                 (*errorhandler)(e);
     350                 :            :                 // Continue match without this sub-match.
     351                 :          0 :                 *leaf = NULL;
     352                 :            :             }
     353                 :            :         }
     354                 :            :     }
     355                 :            : 
     356                 :            :     // Get postlists and term info
     357                 :     175211 :     vector<PostList *> postlists;
     358                 :     175211 :     map<string, Xapian::MSet::Internal::TermFreqAndWeight> termfreqandwts;
     359                 :            :     map<string, Xapian::MSet::Internal::TermFreqAndWeight> * termfreqandwts_ptr;
     360                 :     175211 :     termfreqandwts_ptr = &termfreqandwts;
     361                 :            : 
     362                 :     175211 :     Xapian::termcount total_subqs = 0;
     363                 :            :     // Keep a count of matches which we know exist, but we won't see.  This
     364                 :            :     // occurs when a submatch is remote, and returns a lower bound on the
     365                 :            :     // number of matching documents which is higher than the number of
     366                 :            :     // documents it returns (because it wasn't asked for more documents).
     367                 :     175211 :     Xapian::doccount definite_matches_not_seen = 0;
     368         [ +  + ]:     421719 :     for (size_t i = 0; i != leaves.size(); ++i) {
     369                 :            :         PostList *pl;
     370                 :            :         try {
     371                 :            :             pl = leaves[i]->get_postlist_and_term_info(this,
     372                 :            :                                                        termfreqandwts_ptr,
     373                 :     246508 :                                                        &total_subqs);
     374   [ +  +  +  + ]:     246508 :             if (termfreqandwts_ptr && !termfreqandwts.empty())
                 [ +  + ]
     375                 :     168636 :                 termfreqandwts_ptr = NULL;
     376         [ +  + ]:     246508 :             if (is_remote[i]) {
     377         [ +  + ]:        156 :                 if (pl->get_termfreq_min() > first + maxitems) {
     378                 :            :                     LOGLINE(MATCH, "Found " <<
     379                 :            :                                    pl->get_termfreq_min() - (first + maxitems)
     380                 :            :                                    << " definite matches in remote submatch "
     381                 :            :                                    "which aren't passed to local match");
     382                 :         12 :                     definite_matches_not_seen += pl->get_termfreq_min();
     383                 :         12 :                     definite_matches_not_seen -= first + maxitems;
     384                 :            :                 }
     385                 :            :             }
     386                 :          0 :         } catch (Xapian::Error & e) {
     387         [ #  # ]:          0 :             if (!errorhandler) throw;
     388                 :            :             LOGLINE(EXCEPTION, "Calling error handler for "
     389                 :            :                                "get_term_info() on a SubMatch.");
     390                 :          0 :             (*errorhandler)(e);
     391                 :            :             // FIXME: check if *ALL* the remote servers have failed!
     392                 :            :             // Continue match without this sub-match.
     393                 :          0 :             leaves[i] = NULL;
     394                 :          0 :             pl = new EmptyPostList;
     395                 :            :         }
     396                 :     246508 :         postlists.push_back(pl);
     397                 :            :     }
     398                 :            :     Assert(!postlists.empty());
     399                 :            : 
     400                 :     175211 :     ValueStreamDocument vsdoc(db);
     401                 :     175211 :     ++vsdoc.ref_count;
     402                 :     175211 :     Xapian::Document doc(&vsdoc);
     403                 :            : 
     404                 :            :     // Get a single combined postlist
     405                 :     175211 :     AutoPtr<PostList> pl;
     406         [ +  + ]:     175211 :     if (postlists.size() == 1) {
     407                 :     103936 :         pl.reset(postlists.front());
     408                 :            :     } else {
     409                 :      71275 :         pl.reset(new MergePostList(postlists, this, vsdoc, errorhandler));
     410                 :            :     }
     411                 :            : 
     412                 :            :     LOGLINE(MATCH, "pl = (" << pl->get_description() << ")");
     413                 :            : 
     414                 :            : #ifdef XAPIAN_DEBUG_LOG
     415                 :            :     {
     416                 :            :         map<string, Xapian::MSet::Internal::TermFreqAndWeight>::const_iterator tfwi;
     417                 :            :         for (tfwi = termfreqandwts.begin(); tfwi != termfreqandwts.end(); ++tfwi) {
     418                 :            :             LOGLINE(MATCH, "termfreqandwts[" << tfwi->first << "] = " << tfwi->second.termfreq << ", " << tfwi->second.termweight);
     419                 :            :         }
     420                 :            :     }
     421                 :            : #endif
     422                 :            : 
     423                 :            :     // Empty result set
     424                 :     175211 :     Xapian::doccount docs_matched = 0;
     425                 :     175211 :     Xapian::weight greatest_wt = 0;
     426                 :     175211 :     Xapian::termcount greatest_wt_subqs_matched = 0;
     427                 :            : #ifdef XAPIAN_HAS_REMOTE_BACKEND
     428                 :     175211 :     unsigned greatest_wt_subqs_db_num = UINT_MAX;
     429                 :            : #endif
     430                 :     175211 :     vector<Xapian::Internal::MSetItem> items;
     431                 :            : 
     432                 :            :     // maximum weight a document could possibly have
     433                 :     175211 :     const Xapian::weight max_possible = pl->recalc_maxweight();
     434                 :            : 
     435                 :            :     LOGLINE(MATCH, "pl = (" << pl->get_description() << ")");
     436                 :     175211 :     recalculate_w_max = false;
     437                 :            : 
     438                 :     175211 :     Xapian::doccount matches_upper_bound = pl->get_termfreq_max();
     439                 :     175211 :     Xapian::doccount matches_lower_bound = 0;
     440                 :     175211 :     Xapian::doccount matches_estimated   = pl->get_termfreq_est();
     441                 :            : 
     442   [ +  +  +  - ]:     175211 :     if (mdecider == NULL && matchspy_legacy == NULL) {
     443                 :            :         // If we have a matcher decider or match spy, the lower bound must be
     444                 :            :         // set to 0 as we could discard all hits.  Otherwise set it to the
     445                 :            :         // minimum number of entries which the postlist could return.
     446                 :     175134 :         matches_lower_bound = pl->get_termfreq_min();
     447                 :            :     }
     448                 :            : 
     449                 :            :     // Prepare the matchspy
     450                 :     175211 :     Xapian::MatchSpy *matchspy = NULL;
     451                 :     175211 :     MultipleMatchSpy multispy(matchspies);
     452         [ +  + ]:     175211 :     if (!matchspies.empty()) {
     453         [ +  + ]:         50 :         if (matchspies.size() == 1) {
     454                 :          7 :             matchspy = matchspies[0];
     455                 :            :         } else {
     456                 :         43 :             matchspy = &multispy;
     457                 :            :         }
     458                 :            :     }
     459                 :            : 
     460                 :            :     // Check if any results have been asked for (might just be wanting
     461                 :            :     // maxweight).
     462         [ +  + ]:     175211 :     if (check_at_least == 0) {
     463                 :       1731 :         pl.reset(NULL);
     464                 :       1731 :         Xapian::doccount uncollapsed_lower_bound = matches_lower_bound;
     465         [ +  + ]:       1731 :         if (collapse_max) {
     466                 :            :             // Lower bound must be set to no more than collapse_max, since it's
     467                 :            :             // possible that all matching documents have the same collapse_key
     468                 :            :             // value and so are collapsed together.
     469         [ +  - ]:         78 :             if (matches_lower_bound > collapse_max)
     470                 :         78 :                 matches_lower_bound = collapse_max;
     471                 :            :         }
     472                 :            : 
     473                 :            :         mset = Xapian::MSet(new Xapian::MSet::Internal(
     474                 :            :                                            first,
     475                 :            :                                            matches_upper_bound,
     476                 :            :                                            matches_lower_bound,
     477                 :            :                                            matches_estimated,
     478                 :            :                                            matches_upper_bound,
     479                 :            :                                            uncollapsed_lower_bound,
     480                 :            :                                            matches_estimated,
     481                 :            :                                            max_possible, greatest_wt, items,
     482                 :            :                                            termfreqandwts,
     483                 :       1731 :                                            0));
     484                 :            :         return;
     485                 :            :     }
     486                 :            : 
     487                 :            :     // Number of documents considered by a decider or matchspy_legacy.
     488                 :     173480 :     Xapian::doccount decider_considered = 0;
     489                 :            :     // Number of documents denied by the decider or matchspy_legacy.
     490                 :     173480 :     Xapian::doccount decider_denied = 0;
     491                 :            : 
     492                 :            :     // Set max number of results that we want - this is used to decide
     493                 :            :     // when to throw away unwanted items.
     494                 :     173480 :     Xapian::doccount max_msize = first + maxitems;
     495                 :     173480 :     items.reserve(max_msize + 1);
     496                 :            : 
     497                 :            :     // Tracks the minimum item currently eligible for the MSet - we compare
     498                 :            :     // candidate items against this.
     499                 :     173480 :     Xapian::Internal::MSetItem min_item(0.0, 0);
     500                 :            : 
     501                 :            :     // Minimum weight an item must have to be worth considering.
     502                 :     173480 :     Xapian::weight min_weight = weight_cutoff;
     503                 :            : 
     504                 :            :     // Factor to multiply maximum weight seen by to get the cutoff weight.
     505                 :     173480 :     Xapian::weight percent_cutoff_factor = percent_cutoff / 100.0;
     506                 :            :     // Corresponding correction to that in omenquire.cc to account for excess
     507                 :            :     // precision on x86.
     508                 :     173480 :     percent_cutoff_factor -= DBL_EPSILON;
     509                 :            : 
     510                 :            :     // Object to handle collapsing.
     511                 :     173480 :     Collapser collapser(collapse_key, collapse_max);
     512                 :            : 
     513                 :            :     /// Comparison functor for sorting MSet
     514                 :     173480 :     bool sort_forward = (order != Xapian::Enquire::DESCENDING);
     515                 :     173480 :     MSetCmp mcmp(get_msetcmp_function(sort_by, sort_forward, sort_value_forward));
     516                 :            : 
     517                 :            :     // Perform query
     518                 :            : 
     519                 :            :     // We form the mset in two stages.  In the first we fill up our working
     520                 :            :     // mset.  Adding a new document does not remove another.
     521                 :            :     //
     522                 :            :     // In the second, we consider documents which rank higher than the current
     523                 :            :     // lowest ranking document in the mset.  Each document added expels the
     524                 :            :     // current lowest ranking document.
     525                 :            :     //
     526                 :            :     // If a percentage cutoff is in effect, it can cause the matcher to return
     527                 :            :     // from the second stage from the first.
     528                 :            : 
     529                 :            :     // Is the mset a valid heap?
     530                 :     173480 :     bool is_heap = false;
     531                 :            : 
     532         [ +  + ]:   65743772 :     while (true) {
     533                 :            :         bool pushback;
     534                 :            : 
     535         [ +  + ]:   65917113 :         if (rare(recalculate_w_max)) {
     536         [ +  + ]:     593397 :             if (min_weight > 0.0) {
     537         [ -  + ]:      57588 :                 if (rare(getorrecalc_maxweight(pl.get()) < min_weight)) {
     538                 :            :                     LOGLINE(MATCH, "*** TERMINATING EARLY (1)");
     539                 :          0 :                     break;
     540                 :            :                 }
     541                 :            :             }
     542                 :            :         }
     543                 :            : 
     544                 :   65917113 :         PostList * pl_copy = pl.get();
     545         [ +  + ]:   65917113 :         if (rare(next_handling_prune(pl_copy, min_weight, this))) {
     546                 :      92596 :             (void)pl.release();
     547                 :      92596 :             pl.reset(pl_copy);
     548                 :            :             LOGLINE(MATCH, "*** REPLACING ROOT");
     549                 :            : 
     550         [ +  + ]:      92596 :             if (min_weight > 0.0) {
     551                 :            :                 // No need for a full recalc (unless we've got to do one
     552                 :            :                 // because of a prune elsewhere) - we're just switching to a
     553                 :            :                 // subtree.
     554         [ -  + ]:      91337 :                 if (rare(getorrecalc_maxweight(pl.get()) < min_weight)) {
     555                 :            :                     LOGLINE(MATCH, "*** TERMINATING EARLY (2)");
     556                 :          0 :                     break;
     557                 :            :                 }
     558                 :            :             }
     559                 :            :         }
     560                 :            : 
     561         [ +  + ]:   65917113 :         if (rare(pl->at_end())) {
     562                 :            :             LOGLINE(MATCH, "Reached end of potential matches");
     563                 :     173341 :             break;
     564                 :            :         }
     565                 :            : 
     566                 :            :         // Only calculate the weight if we need it for mcmp, or there's a
     567                 :            :         // percentage or weight cutoff in effect.  Otherwise we calculate it
     568                 :            :         // below if we haven't already rejected this candidate.
     569                 :   65743772 :         Xapian::weight wt = 0.0;
     570                 :   65743772 :         bool calculated_weight = false;
     571 [ +  + ][ -  + ]:   65743772 :         if (sort_by != VAL || min_weight > 0.0) {
     572                 :   65716211 :             wt = pl->get_weight();
     573         [ +  + ]:   65716211 :             if (wt < min_weight) {
     574                 :            :                 LOGLINE(MATCH, "Rejecting potential match due to insufficient weight");
     575                 :   18477742 :                 continue;
     576                 :            :             }
     577                 :   47238469 :             calculated_weight = true;
     578                 :            :         }
     579                 :            : 
     580                 :   47266030 :         Xapian::docid did = pl->get_docid();
     581                 :   47266030 :         vsdoc.set_document(did);
     582                 :            :         LOGLINE(MATCH, "Candidate document id " << did << " wt " << wt);
     583                 :   47266030 :         Xapian::Internal::MSetItem new_item(wt, did);
     584         [ +  + ]:   47266030 :         if (sort_by != REL) {
     585         [ +  + ]:      56481 :             if (sorter) {
     586                 :        870 :                 new_item.sort_key = (*sorter)(doc);
     587                 :            :             } else {
     588                 :      55611 :                 new_item.sort_key = vsdoc.get_value(sort_key);
     589                 :            :             }
     590                 :            : 
     591                 :            :             // We're sorting by value (in part at least), so compare the item
     592                 :            :             // against the lowest currently in the proto-mset.  If sort_by is
     593                 :            :             // VAL, then new_item.wt won't yet be set, but that doesn't
     594                 :            :             // matter since it's not used by the sort function.
     595         [ +  + ]:      56481 :             if (!mcmp(new_item, min_item)) {
     596 [ +  + ][ +  + ]:      10929 :                 if (mdecider == NULL && !collapser && matchspy_legacy == NULL) {
         [ +  - ][ +  + ]
     597                 :            :                     // Document was definitely suitable for mset - no more
     598                 :            :                     // processing needed.
     599                 :            :                     LOGLINE(MATCH, "Making note of match item which sorts lower than min_item");
     600                 :       7019 :                     ++docs_matched;
     601         [ +  + ]:       7019 :                     if (!calculated_weight) wt = pl->get_weight();
     602         [ +  + ]:       7019 :                     if (matchspy) {
     603                 :        100 :                         matchspy->operator()(doc, wt);
     604                 :            :                     }
     605         [ +  + ]:       7019 :                     if (wt > greatest_wt) goto new_greatest_weight;
     606                 :       6829 :                     continue;
     607                 :            :                 }
     608         [ +  + ]:       3910 :                 if (docs_matched >= check_at_least) {
     609                 :            :                     // We've seen enough items - we can drop this one.
     610                 :            :                     LOGLINE(MATCH, "Dropping candidate which sorts lower than min_item");
     611                 :            :                     // FIXME: hmm, match decider might have rejected this...
     612         [ +  - ]:       3808 :                     if (!calculated_weight) wt = pl->get_weight();
     613         [ -  + ]:       3808 :                     if (wt > greatest_wt) goto new_greatest_weight;
     614                 :       3808 :                     continue;
     615                 :            :                 }
     616                 :            :                 // We can't drop the item, because we need to show it
     617                 :            :                 // to the matchspy_legacy, test whether the mdecider would
     618                 :            :                 // accept it, and/or test whether it would be collapsed.
     619                 :            :                 LOGLINE(MATCH, "Keeping candidate which sorts lower than min_item for further investigation");
     620                 :            :             }
     621                 :            :         }
     622                 :            : 
     623                 :            :         // Use the match spy and/or decision functors (if specified).
     624 [ +  + ][ +  + ]:   47255203 :         if (matchspy != NULL || mdecider != NULL || matchspy_legacy != NULL) {
                 [ -  + ]
     625                 :       5236 :             const unsigned int multiplier = db.internal.size();
     626                 :            :             Assert(multiplier != 0);
     627                 :       5236 :             Xapian::doccount n = (did - 1) % multiplier; // which actual database
     628                 :            :             // If the results are from a remote database, then the functor will
     629                 :            :             // already have been applied there so we can skip this step.
     630         [ +  - ]:       5236 :             if (!is_remote[n]) {
     631                 :       5236 :                 ++decider_considered;
     632 [ -  + ][ #  # ]:       5236 :                 if (matchspy_legacy && !matchspy_legacy->operator()(doc)) {
                 [ -  + ]
     633                 :          0 :                     ++decider_denied;
     634                 :          0 :                     continue;
     635                 :            :                 }
     636 [ +  + ][ +  + ]:       5236 :                 if (mdecider && !mdecider->operator()(doc)) {
                 [ +  + ]
     637                 :        186 :                     ++decider_denied;
     638                 :        186 :                     continue;
     639                 :            :                 }
     640         [ +  + ]:       5050 :                 if (matchspy) {
     641         [ +  + ]:        770 :                     if (!calculated_weight) {
     642                 :        150 :                         wt = pl->get_weight();
     643                 :        150 :                         new_item.wt = wt;
     644                 :        150 :                         calculated_weight = true;
     645                 :            :                     }
     646                 :        770 :                     matchspy->operator()(doc, wt);
     647                 :            :                 }
     648                 :            :             }
     649                 :            :         }
     650                 :            : 
     651         [ +  + ]:   47255017 :         if (!calculated_weight) {
     652                 :            :             // we didn't calculate the weight above, but now we will need it
     653                 :      20762 :             wt = pl->get_weight();
     654                 :      20762 :             new_item.wt = wt;
     655                 :            :         }
     656                 :            : 
     657                 :   47255017 :         pushback = true;
     658                 :            : 
     659                 :            :         // Perform collapsing on key if requested.
     660         [ +  + ]:   47255017 :         if (collapser) {
     661                 :            :             collapse_result res;
     662                 :      11422 :             res = collapser.process(new_item, pl.get(), vsdoc, mcmp);
     663         [ +  + ]:      11422 :             if (res == REJECTED) {
     664                 :            :                 // If we're sorting by relevance primarily, then we throw away
     665                 :            :                 // the lower weighted document anyway.
     666 [ +  + ][ +  - ]:       4777 :                 if (sort_by != REL && sort_by != REL_VAL) {
     667         [ -  + ]:       3753 :                     if (wt > greatest_wt) goto new_greatest_weight;
     668                 :            :                 }
     669                 :       4777 :                 continue;
     670                 :            :             }
     671                 :            : 
     672         [ +  + ]:       6645 :             if (res == REPLACED) {
     673                 :            :                 // There was a previous item in the collapse tab so
     674                 :            :                 // the MSet can't be empty.
     675                 :            :                 Assert(!items.empty());
     676                 :            : 
     677                 :            :                 const Xapian::Internal::MSetItem & old_item =
     678                 :        756 :                     collapser.old_item;
     679                 :            :                 // This is one of the best collapse_max potential MSet entries
     680                 :            :                 // with this key which we've seen so far.  Check if the
     681                 :            :                 // entry with this key which it displaced might still be in the
     682                 :            :                 // proto-MSet.  If it might be, we need to check through for
     683                 :            :                 // it.
     684                 :        756 :                 Xapian::weight old_wt = old_item.wt;
     685 [ +  - ][ +  + ]:        756 :                 if (old_wt >= min_weight && mcmp(old_item, min_item)) {
                 [ +  + ]
     686                 :            :                     // Scan through (unsorted) MSet looking for entry.
     687                 :            :                     // FIXME: more efficient way than just scanning?
     688                 :        635 :                     Xapian::docid olddid = old_item.did;
     689                 :        635 :                     vector<Xapian::Internal::MSetItem>::iterator i;
     690         [ +  - ]:       3348 :                     for (i = items.begin(); i != items.end(); ++i) {
     691         [ +  + ]:       3348 :                         if (i->did == olddid) {
     692                 :            :                             LOGLINE(MATCH, "collapse: removing " <<
     693                 :            :                                            olddid << ": " <<
     694                 :            :                                            new_item.collapse_key);
     695                 :            :                             // We can replace an arbitrary element in O(log N)
     696                 :            :                             // but have to do it by hand (in this case the new
     697                 :            :                             // elt is bigger, so we just swap down the tree).
     698                 :            :                             // FIXME: implement this, and clean up is_heap
     699                 :            :                             // handling
     700                 :        635 :                             *i = new_item;
     701                 :        635 :                             pushback = false;
     702                 :        635 :                             is_heap = false;
     703                 :        635 :                             break;
     704                 :            :                         }
     705                 :            :                     }
     706                 :            :                 }
     707                 :            :             }
     708                 :            :         }
     709                 :            : 
     710                 :            :         // OK, actually add the item to the mset.
     711         [ +  + ]:   47250240 :         if (pushback) {
     712                 :   47249605 :             ++docs_matched;
     713         [ +  + ]:   47249605 :             if (items.size() >= max_msize) {
     714                 :   24410340 :                 items.push_back(new_item);
     715         [ +  + ]:   24410340 :                 if (!is_heap) {
     716                 :     161132 :                     is_heap = true;
     717                 :     161132 :                     make_heap(items.begin(), items.end(), mcmp);
     718                 :            :                 } else {
     719                 :            :                     push_heap<vector<Xapian::Internal::MSetItem>::iterator,
     720                 :   24249208 :                               MSetCmp>(items.begin(), items.end(), mcmp);
     721                 :            :                 }
     722                 :            :                 pop_heap<vector<Xapian::Internal::MSetItem>::iterator,
     723                 :   24410340 :                          MSetCmp>(items.begin(), items.end(), mcmp);
     724                 :   24410340 :                 items.pop_back();
     725                 :            : 
     726                 :   24410340 :                 min_item = items.front();
     727   [ +  +  +  + ]:   24410340 :                 if (sort_by == REL || sort_by == REL_VAL) {
     728         [ +  + ]:   24405337 :                     if (docs_matched >= check_at_least) {
     729         [ +  + ]:   24401023 :                         if (sort_by == REL) {
     730                 :            :                             // We're done if this is a forward boolean match
     731                 :            :                             // with only one database (bodgetastic, FIXME
     732                 :            :                             // better if we can!)
     733 [ +  + ][ +  + ]:   24400801 :                             if (rare(max_possible == 0 && sort_forward)) {
                 [ +  + ]
     734                 :            :                                 // In the multi database case, MergePostList
     735                 :            :                                 // currently processes each database
     736                 :            :                                 // sequentially (which actually may well be
     737                 :            :                                 // more efficient) so the docids in general
     738                 :            :                                 // won't arrive in order.
     739         [ -  + ]:         45 :                                 if (leaves.size() == 1) break;
     740                 :            :                             }
     741                 :            :                         }
     742         [ +  + ]:   24401023 :                         if (min_item.wt > min_weight) {
     743                 :            :                             LOGLINE(MATCH, "Setting min_weight to " <<
     744                 :            :                                     min_item.wt << " from " << min_weight);
     745                 :   18399322 :                             min_weight = min_item.wt;
     746                 :            :                         }
     747                 :            :                     }
     748                 :            :                 }
     749         [ +  + ]:   24410340 :                 if (rare(getorrecalc_maxweight(pl.get()) < min_weight)) {
     750                 :            :                     LOGLINE(MATCH, "*** TERMINATING EARLY (3)");
     751                 :            :                     break;
     752                 :            :                 }
     753                 :            :             } else {
     754                 :   22839265 :                 items.push_back(new_item);
     755                 :   22839265 :                 is_heap = false;
     756   [ +  +  +  + ]:   22839265 :                 if (sort_by == REL && items.size() == max_msize) {
                 [ +  + ]
     757         [ +  + ]:     161813 :                     if (docs_matched >= check_at_least) {
     758                 :            :                         // We're done if this is a forward boolean match
     759                 :            :                         // with only one database (bodgetastic, FIXME
     760                 :            :                         // better if we can!)
     761 [ +  + ][ +  + ]:     161709 :                         if (rare(max_possible == 0 && sort_forward)) {
                 [ +  + ]
     762                 :            :                             // In the multi database case, MergePostList
     763                 :            :                             // currently processes each database
     764                 :            :                             // sequentially (which actually may well be
     765                 :            :                             // more efficient) so the docids in general
     766                 :            :                             // won't arrive in order.
     767         [ +  + ]:        174 :                             if (leaves.size() == 1) break;
     768                 :            :                         }
     769                 :            :                     }
     770                 :            :                 }
     771                 :            :             }
     772                 :            :         }
     773                 :            : 
     774                 :            :         // Keep a track of the greatest weight we've seen.
     775         [ +  + ]:   47250101 :         if (wt > greatest_wt) {
     776                 :            : new_greatest_weight:
     777                 :    1088143 :             greatest_wt = wt;
     778                 :            : #ifdef XAPIAN_HAS_REMOTE_BACKEND
     779                 :    1088143 :             const unsigned int multiplier = db.internal.size();
     780                 :    1088143 :             unsigned int db_num = (did - 1) % multiplier;
     781         [ +  + ]:    1088143 :             if (is_remote[db_num]) {
     782                 :            :                 // Note that the greatest weighted document came from a remote
     783                 :            :                 // database, and which one.
     784                 :         84 :                 greatest_wt_subqs_db_num = db_num;
     785                 :            :             } else
     786                 :            : #endif
     787                 :            :             {
     788                 :    1088059 :                 greatest_wt_subqs_matched = pl->count_matching_subqs();
     789                 :            : #ifdef XAPIAN_HAS_REMOTE_BACKEND
     790                 :    1088059 :                 greatest_wt_subqs_db_num = UINT_MAX;
     791                 :            : #endif
     792                 :            :             }
     793         [ +  + ]:    1088143 :             if (percent_cutoff) {
     794                 :        156 :                 Xapian::weight w = wt * percent_cutoff_factor;
     795         [ +  + ]:        156 :                 if (w > min_weight) {
     796                 :        143 :                     min_weight = w;
     797         [ +  - ]:        143 :                     if (!is_heap) {
     798                 :        143 :                         is_heap = true;
     799                 :            :                         make_heap<vector<Xapian::Internal::MSetItem>::iterator,
     800                 :        143 :                                   MSetCmp>(items.begin(), items.end(), mcmp);
     801                 :            :                     }
     802 [ +  - ][ +  + ]:   47250304 :                     while (!items.empty() && items.front().wt < min_weight) {
                 [ +  + ]
     803                 :            :                         pop_heap<vector<Xapian::Internal::MSetItem>::iterator,
     804                 :         13 :                                  MSetCmp>(items.begin(), items.end(), mcmp);
     805                 :            :                         Assert(items.back().wt < min_weight);
     806                 :         13 :                         items.pop_back();
     807                 :            :                     }
     808                 :            : #ifdef XAPIAN_ASSERTIONS_PARANOID
     809                 :            :                     vector<Xapian::Internal::MSetItem>::const_iterator i;
     810                 :            :                     for (i = items.begin(); i != items.end(); ++i) {
     811                 :            :                         Assert(i->wt >= min_weight);
     812                 :            :                     }
     813                 :            : #endif
     814                 :            :                 }
     815                 :            :             }
     816                 :            :         }
     817                 :            :     }
     818                 :            : 
     819                 :            :     // done with posting list tree
     820                 :     173480 :     pl.reset(NULL);
     821                 :            : 
     822                 :     173480 :     double percent_scale = 0;
     823   [ +  +  +  + ]:     173480 :     if (!items.empty() && greatest_wt > 0) {
                 [ +  + ]
     824                 :            :         // Find the document with the highest weight, then total up the
     825                 :            :         // weights for the terms it contains
     826                 :     166289 :         vector<Xapian::Internal::MSetItem>::const_iterator best;
     827                 :     166289 :         best = min_element(items.begin(), items.end(), mcmp);
     828                 :            : 
     829                 :            : #ifdef XAPIAN_HAS_REMOTE_BACKEND
     830         [ +  + ]:     166289 :         if (greatest_wt_subqs_db_num != UINT_MAX) {
     831                 :         54 :             const unsigned int n = greatest_wt_subqs_db_num;
     832                 :            :             RemoteSubMatch * rem_match;
     833                 :         54 :             rem_match = static_cast<RemoteSubMatch*>(leaves[n].get());
     834                 :         54 :             percent_scale = rem_match->get_percent_factor() / 100.0;
     835                 :            :         } else
     836                 :            : #endif
     837                 :            :         {
     838                 :     166235 :             percent_scale = greatest_wt_subqs_matched / double(total_subqs);
     839                 :     166235 :             percent_scale /= greatest_wt;
     840                 :            :         }
     841                 :            :         Assert(percent_scale > 0);
     842         [ +  + ]:     166289 :         if (percent_cutoff) {
     843                 :            :             // FIXME: better to sort and binary chop maybe?
     844                 :            :             // Or we could just use a linear scan here instead.
     845                 :            : 
     846                 :            :             // trim the mset to the correct answer...
     847                 :         91 :             Xapian::weight min_wt = percent_cutoff_factor / percent_scale;
     848         [ +  + ]:         91 :             if (!is_heap) {
     849                 :         34 :                 is_heap = true;
     850                 :            :                 make_heap<vector<Xapian::Internal::MSetItem>::iterator,
     851                 :         34 :                           MSetCmp>(items.begin(), items.end(), mcmp);
     852                 :            :             }
     853 [ +  - ][ -  + ]:         91 :             while (!items.empty() && items.front().wt < min_wt) {
                 [ -  + ]
     854                 :            :                 pop_heap<vector<Xapian::Internal::MSetItem>::iterator,
     855                 :          0 :                          MSetCmp>(items.begin(), items.end(), mcmp);
     856                 :            :                 Assert(items.back().wt < min_wt);
     857                 :          0 :                 items.pop_back();
     858                 :            :             }
     859                 :            : #ifdef XAPIAN_ASSERTIONS_PARANOID
     860                 :            :             vector<Xapian::Internal::MSetItem>::const_iterator j;
     861                 :            :             for (j = items.begin(); j != items.end(); ++j) {
     862                 :            :                 Assert(j->wt >= min_wt);
     863                 :            :             }
     864                 :            : #endif
     865                 :            :         }
     866                 :     166289 :         percent_scale *= 100.0;
     867                 :            :     }
     868                 :            : 
     869                 :            :     LOGLINE(MATCH,
     870                 :            :             "docs_matched = " << docs_matched <<
     871                 :            :             ", definite_matches_not_seen = " << definite_matches_not_seen <<
     872                 :            :             ", matches_lower_bound = " << matches_lower_bound <<
     873                 :            :             ", matches_estimated = " << matches_estimated <<
     874                 :            :             ", matches_upper_bound = " << matches_upper_bound);
     875                 :            : 
     876                 :            :     // Adjust docs_matched to take account of documents which matched remotely
     877                 :            :     // but weren't sent across.
     878                 :     173480 :     docs_matched += definite_matches_not_seen;
     879                 :            : 
     880                 :     173480 :     Xapian::doccount uncollapsed_lower_bound = matches_lower_bound;
     881                 :     173480 :     Xapian::doccount uncollapsed_upper_bound = matches_upper_bound;
     882                 :     173480 :     Xapian::doccount uncollapsed_estimated = matches_estimated;
     883         [ +  + ]:     173480 :     if (items.size() < max_msize) {
     884                 :            :         // We have fewer items in the mset than we tried to get for it, so we
     885                 :            :         // must have all the matches in it.
     886                 :            :         LOGLINE(MATCH, "items.size() = " << items.size() <<
     887                 :            :                 ", max_msize = " << max_msize << ", setting bounds equal");
     888                 :            :         Assert(definite_matches_not_seen == 0);
     889                 :            :         Assert(percent_cutoff || docs_matched == items.size());
     890                 :            :         matches_lower_bound = matches_upper_bound = matches_estimated
     891                 :      10479 :             = items.size();
     892   [ +  +  +  + ]:      10479 :         if (collapser && matches_lower_bound > uncollapsed_lower_bound)
                 [ +  + ]
     893                 :          7 :             uncollapsed_lower_bound = matches_lower_bound;
     894 [ +  + ][ +  + ]:     163001 :     } else if (!collapser && docs_matched < check_at_least) {
                 [ +  + ]
     895                 :            :         // We have seen fewer matches than we checked for, so we must have seen
     896                 :            :         // all the matches.
     897                 :            :         LOGLINE(MATCH, "Setting bounds equal");
     898                 :            :         matches_lower_bound = matches_upper_bound = matches_estimated
     899                 :        286 :             = docs_matched;
     900 [ -  + ][ #  # ]:        286 :         if (collapser && matches_lower_bound > uncollapsed_lower_bound)
                 [ -  + ]
     901                 :          0 :            uncollapsed_lower_bound = matches_lower_bound;
     902                 :            :     } else {
     903                 :            :         AssertRel(matches_estimated,>=,matches_lower_bound);
     904                 :            :         AssertRel(matches_estimated,<=,matches_upper_bound);
     905                 :            : 
     906                 :            :         // We can end up scaling the estimate more than once, so collect
     907                 :            :         // the scale factors and apply them in one go to avoid rounding
     908                 :            :         // more than once.
     909                 :     162715 :         double estimate_scale = 1.0;
     910                 :     162715 :         double unique_rate = 1.0;
     911                 :            : 
     912         [ +  + ]:     162715 :         if (collapser) {
     913                 :            :             LOGLINE(MATCH, "Adjusting bounds due to collapse_key");
     914                 :        638 :             matches_lower_bound = collapser.get_matches_lower_bound();
     915         [ +  + ]:        638 :             if (matches_lower_bound > uncollapsed_lower_bound)
     916                 :         21 :                uncollapsed_lower_bound = matches_lower_bound;
     917                 :            : 
     918                 :            :             // The estimate for the number of hits can be modified by
     919                 :            :             // multiplying it by the rate at which we've been finding
     920                 :            :             // unique documents.
     921                 :        638 :             Xapian::doccount docs_considered = collapser.get_docs_considered();
     922                 :        638 :             Xapian::doccount dups_ignored = collapser.get_dups_ignored();
     923         [ +  - ]:        638 :             if (docs_considered > 0) {
     924                 :        638 :                 double unique = double(docs_considered - dups_ignored);
     925                 :        638 :                 unique_rate = unique / double(docs_considered);
     926                 :            :             }
     927                 :            : 
     928                 :            :             // We can safely reduce the upper bound by the number of duplicates
     929                 :            :             // we've ignored.
     930                 :        638 :             matches_upper_bound -= dups_ignored;
     931                 :            : 
     932                 :            :             LOGLINE(MATCH, "matches_lower_bound=" << matches_lower_bound <<
     933                 :            :                     ", matches_estimated=" << matches_estimated <<
     934                 :            :                     "*" << estimate_scale << "*" << unique_rate <<
     935                 :            :                     ", matches_upper_bound=" << matches_upper_bound);
     936                 :            :         }
     937                 :            : 
     938 [ +  + ][ -  + ]:     162715 :         if (mdecider || matchspy_legacy) {
     939         [ +  + ]:         35 :             if (!percent_cutoff) {
     940         [ +  + ]:         21 :                 if (!collapser) {
     941                 :            :                     // We're not collapsing or doing a percentage cutoff, so
     942                 :            :                     // docs_matched is a lower bound on the total number of
     943                 :            :                     // matches.
     944                 :          7 :                     matches_lower_bound = max(docs_matched, matches_lower_bound);
     945                 :            :                 }
     946                 :            :             }
     947                 :            : 
     948                 :            :             // The estimate for the number of hits can be modified by
     949                 :            :             // multiplying it by the rate at which the match decider has
     950                 :            :             // been accepting documents.
     951         [ +  - ]:         35 :             if (decider_considered > 0) {
     952                 :         35 :                 double accept = double(decider_considered - decider_denied);
     953                 :         35 :                 double accept_rate = accept / double(decider_considered);
     954                 :         35 :                 estimate_scale *= accept_rate;
     955                 :            :             }
     956                 :            : 
     957                 :            :             // If a document is denied by a match decider, it is not possible
     958                 :            :             // for it to found to be a duplicate, so it is safe to also reduce
     959                 :            :             // the upper bound by the number of documents denied by a match
     960                 :            :             // decider.
     961                 :         35 :             matches_upper_bound -= decider_denied;
     962         [ +  + ]:         35 :             if (collapser) uncollapsed_upper_bound -= decider_denied;
     963                 :            :         }
     964                 :            : 
     965         [ +  + ]:     162715 :         if (percent_cutoff) {
     966                 :         27 :             estimate_scale *= (1.0 - percent_cutoff_factor);
     967                 :            :             // another approach:
     968                 :            :             // Xapian::doccount new_est = items.size() * (1 - percent_cutoff_factor) / (1 - min_weight / greatest_wt);
     969                 :            :             // and another: items.size() + (1 - greatest_wt * percent_cutoff_factor / min_weight) * (matches_estimated - items.size());
     970                 :            : 
     971                 :            :             // Very likely an underestimate, but we can't really do better
     972                 :            :             // without checking further matches...  Only possibility would be
     973                 :            :             // to track how many docs made the min weight test but didn't make
     974                 :            :             // the candidate set since the last greatest_wt change, which we
     975                 :            :             // could use if the top documents matched all the prob terms.
     976                 :         27 :             matches_lower_bound = items.size();
     977         [ +  + ]:         27 :             if (collapser) uncollapsed_lower_bound = matches_lower_bound;
     978                 :            : 
     979                 :            :             // matches_upper_bound could be reduced by the number of documents
     980                 :            :             // which fail the min weight test (FIXME)
     981                 :            : 
     982                 :            :             LOGLINE(MATCH, "Adjusted bounds due to percent_cutoff (" <<
     983                 :            :                     percent_cutoff << "): now have matches_estimated=" <<
     984                 :            :                     matches_estimated << ", matches_lower_bound=" <<
     985                 :            :                     matches_lower_bound);
     986                 :            :         }
     987                 :            : 
     988 [ +  + ][ +  + ]:     162715 :         if (collapser && estimate_scale != 1.0) {
                 [ +  + ]
     989                 :            :             uncollapsed_estimated =
     990                 :         31 :                 Xapian::doccount(uncollapsed_estimated * estimate_scale + 0.5);
     991                 :            :         }
     992                 :            : 
     993                 :     162715 :         estimate_scale *= unique_rate;
     994                 :            : 
     995         [ +  + ]:     162715 :         if (estimate_scale != 1.0) {
     996                 :            :             matches_estimated =
     997                 :         80 :                 Xapian::doccount(matches_estimated * estimate_scale + 0.5);
     998         [ +  + ]:         80 :             if (matches_estimated < matches_lower_bound)
     999                 :         13 :                 matches_estimated = matches_lower_bound;
    1000                 :            :         }
    1001                 :            : 
    1002 [ +  + ][ +  + ]:     162715 :         if (collapser || mdecider || matchspy_legacy) {
         [ -  + ][ +  + ]
    1003                 :            :             LOGLINE(MATCH, "Clamping estimate between bounds: "
    1004                 :            :                     "matches_lower_bound = " << matches_lower_bound <<
    1005                 :            :                     ", matches_estimated = " << matches_estimated <<
    1006                 :            :                     ", matches_upper_bound = " << matches_upper_bound);
    1007                 :            :             AssertRel(matches_lower_bound,<=,matches_upper_bound);
    1008                 :        652 :             matches_estimated = max(matches_estimated, matches_lower_bound);
    1009                 :        652 :             matches_estimated = min(matches_estimated, matches_upper_bound);
    1010         [ +  - ]:     162063 :         } else if (!percent_cutoff) {
    1011                 :            :             AssertRel(docs_matched,<=,matches_upper_bound);
    1012         [ +  + ]:     162063 :             if (docs_matched > matches_lower_bound)
    1013                 :        170 :                 matches_lower_bound = docs_matched;
    1014         [ +  + ]:     162063 :             if (docs_matched > matches_estimated)
    1015                 :        141 :                 matches_estimated = docs_matched;
    1016                 :            :         }
    1017                 :            : 
    1018 [ +  + ][ +  + ]:     162715 :         if (collapser && !mdecider && !percent_cutoff && !matchspy_legacy) {
         [ +  + ][ +  - ]
                 [ +  + ]
    1019                 :            :             AssertRel(docs_matched,<=,uncollapsed_upper_bound);
    1020         [ -  + ]:        604 :             if (docs_matched > uncollapsed_lower_bound)
    1021                 :          0 :                 uncollapsed_lower_bound = docs_matched;
    1022                 :            :         }
    1023                 :            :     }
    1024                 :            : 
    1025         [ +  + ]:     173480 :     if (collapser) {
    1026                 :            :         AssertRel(uncollapsed_lower_bound,<=,uncollapsed_upper_bound);
    1027         [ +  + ]:       1138 :         if (uncollapsed_estimated < uncollapsed_lower_bound) {
    1028                 :         13 :             uncollapsed_estimated = uncollapsed_lower_bound;
    1029         [ -  + ]:       1125 :         } else if (uncollapsed_estimated > uncollapsed_upper_bound) {
    1030                 :          0 :             uncollapsed_estimated = uncollapsed_upper_bound;
    1031                 :            :         }
    1032                 :            :     } else {
    1033                 :            :         // We're not collapsing, so the bounds must be the same.
    1034                 :     172342 :         uncollapsed_lower_bound = matches_lower_bound;
    1035                 :     172342 :         uncollapsed_upper_bound = matches_upper_bound;
    1036                 :     172342 :         uncollapsed_estimated = matches_estimated;
    1037                 :            :     }
    1038                 :            : 
    1039                 :            :     LOGLINE(MATCH, items.size() << " items in potential mset");
    1040                 :            : 
    1041         [ +  + ]:     173480 :     if (first > 0) {
    1042                 :            :         // Remove unwanted leading entries
    1043         [ +  + ]:     158306 :         if (items.size() <= first) {
    1044                 :         32 :             items.clear();
    1045                 :            :         } else {
    1046                 :            :             LOGLINE(MATCH, "finding " << first << "th");
    1047                 :            :             // We perform nth_element() on reverse iterators so that the
    1048                 :            :             // unwanted elements end up at the end of items, which means
    1049                 :            :             // that the call to erase() to remove them doesn't have to copy
    1050                 :            :             // any elements.
    1051                 :     158274 :             vector<Xapian::Internal::MSetItem>::reverse_iterator nth;
    1052                 :     158274 :             nth = items.rbegin() + first;
    1053                 :     158274 :             nth_element(items.rbegin(), nth, items.rend(), mcmp);
    1054                 :            :             // Erase the trailing ``first'' elements
    1055                 :     158274 :             items.erase(items.begin() + items.size() - first, items.end());
    1056                 :            :         }
    1057                 :            :     }
    1058                 :            : 
    1059                 :            :     LOGLINE(MATCH, "sorting " << items.size() << " entries");
    1060                 :            : 
    1061                 :            :     // Need a stable sort, but this is provided by comparison operator
    1062                 :     173480 :     sort(items.begin(), items.end(), mcmp);
    1063                 :            : 
    1064                 :     173480 :     if (!items.empty()) {
    1065                 :            :         LOGLINE(MATCH, "min weight in mset = " << items.back().wt);
    1066                 :            :         LOGLINE(MATCH, "max weight in mset = " << items[0].wt);
    1067                 :            :     }
    1068                 :            : 
    1069                 :            :     AssertRel(matches_estimated,>=,matches_lower_bound);
    1070                 :            :     AssertRel(matches_estimated,<=,matches_upper_bound);
    1071                 :            : 
    1072                 :            :     AssertRel(uncollapsed_estimated,>=,uncollapsed_lower_bound);
    1073                 :            :     AssertRel(uncollapsed_estimated,<=,uncollapsed_upper_bound);
    1074                 :            : 
    1075                 :            :     // We may need to qualify any collapse_count to see if the highest weight
    1076                 :            :     // collapsed item would have qualified percent_cutoff
    1077                 :            :     // We WILL need to restore collapse_count to the mset by taking from
    1078                 :            :     // collapse_tab; this is what comes of copying around whole objects
    1079                 :            :     // instead of taking references, we find it hard to update collapse_count
    1080                 :            :     // of an item that has already been pushed-back as we don't know where it
    1081                 :            :     // is any more.  If we keep or find references we won't need to mess with
    1082                 :            :     // is_heap so much maybe?
    1083   [ +  +  +  + ]:     173480 :     if (!items.empty() && collapser && !collapser.empty()) {
         [ +  + ][ +  + ]
    1084                 :            :         // Nicked this formula from above, but for some reason percent_scale
    1085                 :            :         // has since been multiplied by 100 so we take that into account
    1086                 :       1092 :         Xapian::weight min_wt = percent_cutoff_factor / (percent_scale / 100);
    1087                 :       1092 :         Xapian::doccount entries = collapser.entries();
    1088                 :       1092 :         vector<Xapian::Internal::MSetItem>::iterator i;
    1089         [ +  + ]:       5416 :         for (i = items.begin(); i != items.end(); ++i) {
    1090                 :            :             // Skip entries without a collapse key.
    1091         [ -  + ]:       5377 :             if (i->collapse_key.empty()) continue;
    1092                 :            : 
    1093                 :            :             // Set collapse_count appropriately.
    1094                 :       5377 :             i->collapse_count = collapser.get_collapse_count(i->collapse_key, percent_cutoff, min_wt);
    1095         [ +  + ]:       5377 :             if (--entries == 0) {
    1096                 :            :                 // Stop once we've processed all items with collapse keys.
    1097                 :       1053 :                 break;
    1098                 :            :             }
    1099                 :            :         }
    1100                 :            :     }
    1101                 :            : 
    1102                 :            :     mset = Xapian::MSet(new Xapian::MSet::Internal(
    1103                 :            :                                        first,
    1104                 :            :                                        matches_upper_bound,
    1105                 :            :                                        matches_lower_bound,
    1106                 :            :                                        matches_estimated,
    1107                 :            :                                        uncollapsed_upper_bound,
    1108                 :            :                                        uncollapsed_lower_bound,
    1109                 :            :                                        uncollapsed_estimated,
    1110                 :            :                                        max_possible, greatest_wt, items,
    1111                 :            :                                        termfreqandwts,
    1112 [ +  + ][ +  + ]:     179524 :                                        percent_scale));
         [ +  + ][ +  + ]
         [ +  + ][ +  + ]
    1113                 :            : }

Generated by: LCOV version 1.8