LCOV - code coverage report
Current view: top level - expand - expandweight.cc (source / functions) Hit Total Coverage
Test: Test Coverage for xapian-core r Lines: 18 19 94.7 %
Date: 2011-08-21 Functions: 1 1 100.0 %
Branches: 7 8 87.5 %

           Branch data     Line data    Source code
       1                 :            : /** @file expandweight.cc
       2                 :            :  * @brief Calculate term weights for the ESet.
       3                 :            :  */
       4                 :            : /* Copyright (C) 2007,2008 Olly Betts
       5                 :            :  * Copyright (C) 2011 Action Without Borders
       6                 :            :  *
       7                 :            :  * This program is free software; you can redistribute it and/or
       8                 :            :  * modify it under the terms of the GNU General Public License as
       9                 :            :  * published by the Free Software Foundation; either version 2 of the
      10                 :            :  * License, or (at your option) any later version.
      11                 :            :  *
      12                 :            :  * This program is distributed in the hope that it will be useful,
      13                 :            :  * but WITHOUT ANY WARRANTY; without even the implied warranty of
      14                 :            :  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      15                 :            :  * GNU General Public License for more details.
      16                 :            :  *
      17                 :            :  * You should have received a copy of the GNU General Public License
      18                 :            :  * along with this program; if not, write to the Free Software
      19                 :            :  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
      20                 :            :  */
      21                 :            : 
      22                 :            : #include <config.h>
      23                 :            : 
      24                 :            : #include "expandweight.h"
      25                 :            : 
      26                 :            : #include "debuglog.h"
      27                 :            : #include "omassert.h"
      28                 :            : #include "termlist.h"
      29                 :            : 
      30                 :            : #include <cmath>
      31                 :            : 
      32                 :            : using namespace std;
      33                 :            : 
      34                 :            : namespace Xapian {
      35                 :            : namespace Internal {
      36                 :            : 
      37                 :            : Xapian::weight
      38                 :       8679 : ExpandWeight::get_weight(TermList * merger, const string & term) const
      39                 :            : {
      40                 :            :     LOGCALL(MATCH, Xapian::weight, "ExpandWeight::get_weight", merger | term);
      41                 :            : 
      42                 :            :     // Accumulate the stats for this term across all relevant documents.
      43                 :       8679 :     ExpandStats stats(avlen, expand_k);
      44                 :       8679 :     merger->accumulate_stats(stats);
      45                 :            : 
      46                 :       8679 :     double termfreq = stats.termfreq;
      47                 :       8679 :     double rtermfreq = stats.rtermfreq;
      48                 :            : 
      49                 :            :     LOGVALUE(EXPAND, rsize);
      50                 :            :     LOGVALUE(EXPAND, rtermfreq);
      51                 :            : 
      52                 :            :     LOGVALUE(EXPAND, dbsize);
      53                 :            :     LOGVALUE(EXPAND, stats.dbsize);
      54         [ +  + ]:       8679 :     if (stats.dbsize == dbsize) {
      55                 :            :         // Either we're expanding from just one database, or we got stats from
      56                 :            :         // all the sub-databases (because at least one relevant document from
      57                 :            :         // each sub-database contained this term), so termfreq should already
      58                 :            :         // be exact.
      59                 :            :         AssertEqParanoid(termfreq, db.get_termfreq(term));
      60                 :            :     } else {
      61                 :            :         AssertRel(stats.dbsize,<,dbsize);
      62                 :            :         // We're expanding from more than one database and the stats we've got
      63                 :            :         // only cover some of the sub-databases, so termfreq only includes
      64                 :            :         // those sub-databases.
      65         [ +  + ]:       2539 :         if (use_exact_termfreq) {
      66                 :            :             LOGLINE(EXPAND, "Had to request exact termfreq");
      67                 :        611 :             termfreq = db.get_termfreq(term);
      68                 :            :         } else {
      69                 :            :             // Approximate the termfreq by scaling it up from the databases we
      70                 :            :             // do have information from.
      71                 :       1928 :             termfreq *= double(dbsize) / double(stats.dbsize);
      72                 :            :             LOGLINE(EXPAND, "termfreq is approx " << stats.termfreq << " * " <<
      73                 :            :                             dbsize << " / " << stats.dbsize << " = " <<
      74                 :            :                             termfreq);
      75                 :            :             LOGVALUE(EXPAND, db.get_termfreq(term));
      76         [ -  + ]:       1928 :             if (termfreq < rtermfreq) {
      77                 :            :                 // termfreq must be at least rtermfreq, since there are at
      78                 :            :                 // least rtermfreq documents indexed by this term.
      79                 :            :                 LOGLINE(EXPAND, "termfreq must be at least rtermfreq");
      80                 :          0 :                 termfreq = rtermfreq;
      81                 :            :             } else {
      82                 :            :                 // termfreq can't be more than (dbsize - rsize + rtermfreq)
      83                 :            :                 // since the number of relevant documents not indexed by this
      84                 :            :                 // term can't be more than the number of documents not indexed
      85                 :            :                 // by this term, so:
      86                 :            :                 //
      87                 :            :                 //     rsize - rtermfreq <= dbsize - termfreq
      88                 :            :                 // <=> termfreq <= dbsize - (rsize - rtermfreq)
      89                 :       1928 :                 double termfreq_upper_bound = dbsize - (rsize - rtermfreq);
      90         [ +  + ]:       1928 :                 if (termfreq > termfreq_upper_bound) {
      91                 :            :                     LOGLINE(EXPAND, "termfreq can't be more than "
      92                 :            :                                     "dbsize - (rsize + rtermfreq)");
      93                 :         20 :                     termfreq = termfreq_upper_bound;
      94                 :            :                 }
      95                 :            :             }
      96                 :            :         }
      97                 :            :     }
      98                 :            :     LOGVALUE(EXPAND, termfreq);
      99                 :            : 
     100                 :       8679 :     double reldocs_without_term = rsize - rtermfreq;
     101                 :            :     double num, denom;
     102                 :       8679 :     num = (rtermfreq + 0.5) * (dbsize - termfreq - reldocs_without_term + 0.5);
     103                 :            :     AssertRel(num,>,0);
     104                 :       8679 :     denom = (termfreq - rtermfreq + 0.5) * (reldocs_without_term + 0.5);
     105                 :            :     AssertRel(denom,>,0);
     106                 :            : 
     107                 :       8679 :     Xapian::weight tw = log(num / denom);
     108                 :            :     LOGVALUE(EXPAND, tw);
     109                 :            :     LOGVALUE(EXPAND, stats.multiplier);
     110                 :       8679 :     RETURN(stats.multiplier * tw);
     111                 :            : }
     112                 :            : 
     113                 :            : }
     114                 :            : }

Generated by: LCOV version 1.8