LCOV - code coverage report
Current view: top level - matcher - phrasepostlist.cc (source / functions) Hit Total Coverage
Test: Test Coverage for xapian-core r Lines: 68 97 70.1 %
Date: 2011-08-21 Functions: 5 11 45.5 %
Branches: 46 58 79.3 %

           Branch data     Line data    Source code
       1                 :            : /* phrasepostlist.cc: Return only items where terms are near or form a phrase
       2                 :            :  *
       3                 :            :  * Copyright 1999,2000,2001 BrightStation PLC
       4                 :            :  * Copyright 2002 Ananova Ltd
       5                 :            :  * Copyright 2002,2003,2004,2007,2008,2009 Olly Betts
       6                 :            :  * Copyright 2009 Lemur Consulting Ltd
       7                 :            :  *
       8                 :            :  * This program is free software; you can redistribute it and/or
       9                 :            :  * modify it under the terms of the GNU General Public License as
      10                 :            :  * published by the Free Software Foundation; either version 2 of the
      11                 :            :  * License, or (at your option) any later version.
      12                 :            :  *
      13                 :            :  * This program is distributed in the hope that it will be useful,
      14                 :            :  * but WITHOUT ANY WARRANTY; without even the implied warranty of
      15                 :            :  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      16                 :            :  * GNU General Public License for more details.
      17                 :            :  *
      18                 :            :  * You should have received a copy of the GNU General Public License
      19                 :            :  * along with this program; if not, write to the Free Software
      20                 :            :  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
      21                 :            :  * USA
      22                 :            :  */
      23                 :            : 
      24                 :            : #include <config.h>
      25                 :            : #include "phrasepostlist.h"
      26                 :            : 
      27                 :            : #include "debuglog.h"
      28                 :            : #include "positionlist.h"
      29                 :            : #include "omassert.h"
      30                 :            : #include "str.h"
      31                 :            : 
      32                 :            : #include <algorithm>
      33                 :            : 
      34                 :            : /** Class providing an operator which returns true if a has a (strictly)
      35                 :            :  *  smaller number of postings than b.
      36                 :            :  */
      37                 :            : class PositionListCmpLt {
      38                 :            :     public:
      39                 :            :         /** Return true if and only if a is strictly shorter than b.
      40                 :            :          */
      41                 :       8307 :         bool operator()(const PositionList *a, const PositionList *b) {
      42                 :       8307 :             return a->get_size() < b->get_size();
      43                 :            :         }
      44                 :            : };
      45                 :            : 
      46                 :            : 
      47                 :            : /** Check if terms occur sufficiently close together in the current doc
      48                 :            :  */
      49                 :            : bool
      50                 :       1287 : NearPostList::test_doc()
      51                 :            : {
      52                 :            :     LOGCALL(MATCH, bool, "NearPostList::test_doc", NO_ARGS);
      53                 :       1287 :     std::vector<PositionList *> plists;
      54                 :            : 
      55                 :       1287 :     std::vector<PostList *>::const_iterator i;
      56         [ +  + ]:       4862 :     for (i = terms.begin(); i != terms.end(); i++) {
      57                 :       3575 :         PositionList * p = (*i)->read_position_list();
      58                 :            :         // If p is NULL, the backend doesn't support positionlists
      59         [ -  + ]:       3575 :         if (!p) return false;
      60                 :       3575 :         plists.push_back(p);
      61                 :            :     }
      62                 :            : 
      63                 :       1287 :     std::sort(plists.begin(), plists.end(), PositionListCmpLt());
      64                 :            : 
      65                 :            :     Xapian::termpos pos;
      66         [ +  + ]:       1287 :     do {
      67                 :       1586 :         plists[0]->next();
      68         [ +  + ]:       1586 :         if (plists[0]->at_end()) RETURN(false);
      69                 :       1287 :         pos = plists[0]->get_position();
      70                 :            :     } while (!do_test(plists, 1, pos, pos));
      71                 :            : 
      72                 :       1287 :     RETURN(true);
      73                 :            : }
      74                 :            : 
      75                 :            : bool
      76                 :       2093 : NearPostList::do_test(std::vector<PositionList *> &plists, Xapian::termcount i,
      77                 :            :                       Xapian::termcount min, Xapian::termcount max)
      78                 :            : {
      79                 :            :     LOGCALL(MATCH, bool, "NearPostList::do_test", plists | i | min | max);
      80                 :            :     LOGLINE(MATCH, "docid = " << get_docid() << ", window = " << window);
      81                 :       2093 :     Xapian::termcount tmp = max + 1;
      82                 :            :     // take care to avoid underflow
      83         [ +  + ]:       2093 :     if (window <= tmp) tmp -= window; else tmp = 0;
      84                 :       2093 :     plists[i]->skip_to(tmp);
      85         [ +  + ]:       2093 :     while (!plists[i]->at_end()) {
      86                 :       2080 :         Xapian::termpos pos = plists[i]->get_position();
      87                 :            :         LOGLINE(MATCH, "[" << i << "]: " << max - window + 1 << " " << min <<
      88                 :            :                        " " << pos << " " << max << " " << min + window - 1);
      89         [ +  + ]:       2080 :         if (pos > min + window - 1) RETURN(false);
      90         [ +  + ]:       1794 :         if (i + 1 == plists.size()) RETURN(true);
      91         [ +  + ]:        806 :         if (pos < min) min = pos;
      92         [ +  - ]:        533 :         else if (pos > max) max = pos;
      93         [ +  - ]:        806 :         if (do_test(plists, i + 1, min, max)) RETURN(true);
      94                 :          0 :         plists[i]->next();
      95                 :            :     }
      96                 :       2093 :     RETURN(false);
      97                 :            : }
      98                 :            : 
      99                 :            : Xapian::termcount
     100                 :          0 : NearPostList::get_wdf() const
     101                 :            : {
     102                 :            :     // Calculate an estimate for the wdf of a near postlist.
     103                 :            :     //
     104                 :            :     // The natural definition of the wdf for a near postlist is the number of
     105                 :            :     // times the terms occur in a group within the windowsize in the document -
     106                 :            :     // in other words, the number of times a routine like do_test() could find
     107                 :            :     // a match, if it kept going after finding the first one.  However,
     108                 :            :     // calculating this would be expensive (in CPU time at least - the position
     109                 :            :     // lists are probably already read from disk), and the benefit is dubious
     110                 :            :     // (given that the estimate here is probably only going to be used for
     111                 :            :     // calculating the weight for synonym postlists, and it's not clear that
     112                 :            :     // the above definition is exactly appropriate in this situation, anyway).
     113                 :            :     //
     114                 :            :     // Instead, we could do an estimate of this value, based on the lengths
     115                 :            :     // of the position lists.  Rather than having to open the position lists,
     116                 :            :     // we could use the wdfs, which will be the same value unless the wdfs have
     117                 :            :     // been artificially inflated - in which case we probably want to use the
     118                 :            :     // inflated value anyway (it will have been inflated to represent the fact
     119                 :            :     // that this term is more important than others, so we should make use of
     120                 :            :     // this hint).
     121                 :            :     //
     122                 :            :     // A reasonable way to calculate an estimate would be to consider the
     123                 :            :     // document to be split into W (overlapping) windows, each 1 term apart,
     124                 :            :     // and of the same length as the windowsize used for the phrase.  Then,
     125                 :            :     // calculate the probability that each term is found in this window (as
     126                 :            :     // Prob = wdf / doclen), multiply these probabilities to get the
     127                 :            :     // probability that they're all found in the window, and then multiply by
     128                 :            :     // the windowsize again to get an estimate.
     129                 :            :     //
     130                 :            :     // However, this requires the doclen, which won't always be available (ie,
     131                 :            :     // if the weighting scheme doesn't use it, we won't have read it from
     132                 :            :     // disk).
     133                 :            :     //
     134                 :            :     // Rather than force an access to disk to get the doclen, we use an even
     135                 :            :     // rougher estimate: the minimum wdf in the sub-lists.  This is actually
     136                 :            :     // the upper bound for the wdf (since the phrase can only occur the same
     137                 :            :     // number of times as the least frequent of its constituent terms).
     138                 :            :     //
     139                 :            :     // If this estimate proves to give bad results, we can revisit this and try
     140                 :            :     // a better approximation.
     141                 :            : 
     142                 :          0 :     std::vector<PostList *>::const_iterator i = terms.begin();
     143                 :          0 :     Xapian::termcount wdf = (*i)->get_wdf();
     144         [ #  # ]:          0 :     for (; i != terms.end(); i++) {
     145                 :          0 :         wdf = std::min(wdf, (*i)->get_wdf());
     146                 :            :     }
     147                 :            : 
     148                 :            :     // Ensure that we always return a wdf of at least 1, since we know there
     149                 :            :     // was at least one occurrence of the phrase.
     150                 :          0 :     return std::max(wdf, Xapian::termcount(1));
     151                 :            : }
     152                 :            : 
     153                 :            : TermFreqs
     154                 :          0 : NearPostList::get_termfreq_est_using_stats(
     155                 :            :         const Xapian::Weight::Internal & stats) const
     156                 :            : {
     157                 :            :     LOGCALL(MATCH, TermFreqs, "NearPostList::get_termfreq_est_using_stats", stats);
     158                 :            :     // No idea how to estimate this - FIXME
     159                 :          0 :     TermFreqs result(source->get_termfreq_est_using_stats(stats));
     160                 :          0 :     result.termfreq /= 2;
     161                 :          0 :     result.reltermfreq /= 2;
     162                 :          0 :     RETURN(result);
     163                 :            : }
     164                 :            : 
     165                 :            : std::string
     166                 :          0 : NearPostList::get_description() const
     167                 :            : {
     168                 :          0 :     return "(Near " + str(window) + " " + source->get_description() + ")";
     169                 :            : }
     170                 :            : 
     171                 :            : 
     172                 :            : 
     173                 :            : /** Check if terms form a phrase in the current doc
     174                 :            :  */
     175                 :            : bool
     176                 :       1001 : PhrasePostList::test_doc()
     177                 :            : {
     178                 :            :     LOGCALL(MATCH, bool, "PhrasePostList::test_doc", NO_ARGS);
     179                 :       1001 :     std::vector<PositionList *> plists;
     180                 :            : 
     181                 :       1001 :     std::vector<PostList *>::const_iterator i;
     182         [ +  + ]:       3887 :     for (i = terms.begin(); i != terms.end(); i++) {
     183                 :       2886 :         PositionList * p = (*i)->read_position_list();
     184                 :            :         // If p is NULL, the backend doesn't support positionlists
     185         [ -  + ]:       2886 :         if (!p) return false;
     186                 :       2886 :         p->index = i - terms.begin();
     187                 :       2886 :         plists.push_back(p);
     188                 :            :     }
     189                 :            : 
     190                 :       1001 :     std::sort(plists.begin(), plists.end(), PositionListCmpLt());
     191                 :            : 
     192                 :            :     Xapian::termpos pos;
     193                 :            :     Xapian::termpos idx, min;
     194         [ +  + ]:       1001 :     do {
     195                 :       1859 :         plists[0]->next();
     196         [ +  + ]:       1859 :         if (plists[0]->at_end()) {
     197                 :            :             LOGLINE(MATCH, "--MISS--");
     198                 :        858 :             RETURN(false);
     199                 :            :         }
     200                 :       1001 :         pos = plists[0]->get_position();
     201                 :       1001 :         idx = plists[0]->index;
     202                 :       1001 :         min = pos + plists.size() - idx;
     203         [ +  + ]:       1001 :         if (min > window) min -= window; else min = 0;
     204                 :            :     } while (!do_test(plists, 1, min, pos + window - idx));
     205                 :            :     LOGLINE(MATCH, "**HIT**");
     206                 :       1001 :     RETURN(true);
     207                 :            : }
     208                 :            : 
     209                 :            : bool
     210                 :       1469 : PhrasePostList::do_test(std::vector<PositionList *> &plists, Xapian::termcount i,
     211                 :            :                         Xapian::termcount min, Xapian::termcount max)
     212                 :            : {
     213                 :            :     LOGCALL(MATCH, bool, "PhrasePostList::do_test", plists | i | min | max);
     214                 :            :     LOGLINE(MATCH, "docid = " << get_docid() << ", window = " << window);
     215                 :       1469 :     Xapian::termpos idxi = plists[i]->index;
     216                 :            :     LOGLINE(MATCH, "my idx in phrase is " << idxi);
     217                 :            : 
     218                 :       1469 :     Xapian::termpos mymin = min + idxi;
     219                 :       1469 :     Xapian::termpos mymax = max - plists.size() + idxi;
     220                 :            :     LOGLINE(MATCH, "MIN = " << mymin << " MAX = " << mymax);
     221                 :            :     // FIXME: this is worst case O(n^2) where n = length of phrase
     222                 :            :     // Can we do better?
     223         [ +  + ]:       3406 :     for (Xapian::termcount j = 0; j < i; j++) {
     224                 :       1937 :         Xapian::termpos idxj = plists[j]->index;
     225         [ -  + ]:       1937 :         if (idxj > idxi) {
     226                 :          0 :             Xapian::termpos tmp = plists[j]->get_position() + idxj - idxi;
     227                 :            :             LOGLINE(MATCH, "ABOVE " << tmp);
     228         [ #  # ]:          0 :             if (tmp < mymax) mymax = tmp;
     229                 :            :         } else {
     230                 :            :             AssertRel(idxi, !=, idxj);
     231                 :       1937 :             Xapian::termpos tmp = plists[j]->get_position() + idxi - idxj;
     232                 :            :             LOGLINE(MATCH, "BELOW " << tmp);
     233         [ +  + ]:       1937 :             if (tmp > mymin) mymin = tmp;
     234                 :            :         }
     235                 :            :         LOGLINE(MATCH, "min = " << mymin << " max = " << mymax);
     236                 :            :     }
     237                 :       1469 :     plists[i]->skip_to(mymin);
     238                 :            : 
     239         [ +  + ]:       1846 :     while (!plists[i]->at_end()) {
     240                 :        858 :         Xapian::termpos pos = plists[i]->get_position();
     241                 :            :         LOGLINE(MATCH, " " << mymin << " " << pos << " " << mymax);
     242         [ +  + ]:        858 :         if (pos > mymax) RETURN(false);
     243         [ +  + ]:        611 :         if (i + 1 == plists.size()) RETURN(true);
     244                 :        468 :         Xapian::termpos tmp = pos + window - idxi;
     245         [ -  + ]:        468 :         if (tmp < max) max = tmp;
     246                 :        468 :         tmp = pos + plists.size() - idxi;
     247         [ +  + ]:        468 :         if (tmp > window) {
     248                 :        247 :             tmp -= window;
     249         [ +  + ]:        247 :             if (tmp > min) min = tmp;
     250                 :            :         }
     251         [ +  + ]:        468 :         if (do_test(plists, i + 1, min, max)) RETURN(true);
     252                 :        377 :         plists[i]->next();
     253                 :            :     }
     254                 :       1469 :     RETURN(false);
     255                 :            : }
     256                 :            : 
     257                 :            : Xapian::termcount
     258                 :          0 : PhrasePostList::get_wdf() const
     259                 :            : {
     260                 :            :     // Calculate an estimate for the wdf of a phrase postlist.
     261                 :            :     //
     262                 :            :     // We use the minimum wdf of a sub-postlist as our estimate.  See the
     263                 :            :     // comment in NearPostList::get_wdf for justification of this estimate.
     264                 :            :     //
     265                 :            :     // We divide the value calculated for a NearPostList by 2, as a very rough
     266                 :            :     // heuristic to represent the fact that the words must occur in order, and
     267                 :            :     // phrases are therefore rarer than near matches.
     268                 :            : 
     269                 :          0 :     std::vector<PostList *>::const_iterator i = terms.begin();
     270                 :          0 :     Xapian::termcount wdf = (*i)->get_wdf();
     271         [ #  # ]:          0 :     for (; i != terms.end(); i++) {
     272                 :          0 :         wdf = std::min(wdf, (*i)->get_wdf());
     273                 :            :     }
     274                 :            : 
     275                 :            :     // Ensure that we always return a wdf of at least 1, since we know there
     276                 :            :     // was at least one occurrence of the phrase.
     277                 :          0 :     return std::max(wdf / 2, Xapian::termcount(1));
     278                 :            : }
     279                 :            : 
     280                 :            : TermFreqs
     281                 :          0 : PhrasePostList::get_termfreq_est_using_stats(
     282                 :            :         const Xapian::Weight::Internal & stats) const
     283                 :            : {
     284                 :            :     LOGCALL(MATCH, TermFreqs, "PhrasePostList::get_termfreq_est_using_stats", stats);
     285                 :            :     // No idea how to estimate this - FIXME
     286                 :          0 :     TermFreqs result(source->get_termfreq_est_using_stats(stats));
     287                 :          0 :     result.termfreq /= 3;
     288                 :          0 :     result.reltermfreq /= 3;
     289                 :          0 :     RETURN(result);
     290                 :            : }
     291                 :            : 
     292                 :            : std::string
     293                 :          0 : PhrasePostList::get_description() const
     294                 :            : {
     295                 :            :     return "(Phrase " + str(window) + " "
     296                 :          0 :            + source->get_description() + ")";
     297                 :            : }

Generated by: LCOV version 1.8