Branch data Line data Source code
1 : : /** @file expandweight.cc
2 : : * @brief Calculate term weights for the ESet.
3 : : */
4 : : /* Copyright (C) 2007,2008 Olly Betts
5 : : * Copyright (C) 2011 Action Without Borders
6 : : *
7 : : * This program is free software; you can redistribute it and/or
8 : : * modify it under the terms of the GNU General Public License as
9 : : * published by the Free Software Foundation; either version 2 of the
10 : : * License, or (at your option) any later version.
11 : : *
12 : : * This program is distributed in the hope that it will be useful,
13 : : * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 : : * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 : : * GNU General Public License for more details.
16 : : *
17 : : * You should have received a copy of the GNU General Public License
18 : : * along with this program; if not, write to the Free Software
19 : : * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20 : : */
21 : :
22 : : #include <config.h>
23 : :
24 : : #include "expandweight.h"
25 : :
26 : : #include "debuglog.h"
27 : : #include "omassert.h"
28 : : #include "termlist.h"
29 : :
30 : : #include <cmath>
31 : :
32 : : using namespace std;
33 : :
34 : : namespace Xapian {
35 : : namespace Internal {
36 : :
37 : : Xapian::weight
38 : 8679 : ExpandWeight::get_weight(TermList * merger, const string & term) const
39 : : {
40 : : LOGCALL(MATCH, Xapian::weight, "ExpandWeight::get_weight", merger | term);
41 : :
42 : : // Accumulate the stats for this term across all relevant documents.
43 : 8679 : ExpandStats stats(avlen, expand_k);
44 : 8679 : merger->accumulate_stats(stats);
45 : :
46 : 8679 : double termfreq = stats.termfreq;
47 : 8679 : double rtermfreq = stats.rtermfreq;
48 : :
49 : : LOGVALUE(EXPAND, rsize);
50 : : LOGVALUE(EXPAND, rtermfreq);
51 : :
52 : : LOGVALUE(EXPAND, dbsize);
53 : : LOGVALUE(EXPAND, stats.dbsize);
54 [ + + ]: 8679 : if (stats.dbsize == dbsize) {
55 : : // Either we're expanding from just one database, or we got stats from
56 : : // all the sub-databases (because at least one relevant document from
57 : : // each sub-database contained this term), so termfreq should already
58 : : // be exact.
59 : : AssertEqParanoid(termfreq, db.get_termfreq(term));
60 : : } else {
61 : : AssertRel(stats.dbsize,<,dbsize);
62 : : // We're expanding from more than one database and the stats we've got
63 : : // only cover some of the sub-databases, so termfreq only includes
64 : : // those sub-databases.
65 [ + + ]: 2539 : if (use_exact_termfreq) {
66 : : LOGLINE(EXPAND, "Had to request exact termfreq");
67 : 611 : termfreq = db.get_termfreq(term);
68 : : } else {
69 : : // Approximate the termfreq by scaling it up from the databases we
70 : : // do have information from.
71 : 1928 : termfreq *= double(dbsize) / double(stats.dbsize);
72 : : LOGLINE(EXPAND, "termfreq is approx " << stats.termfreq << " * " <<
73 : : dbsize << " / " << stats.dbsize << " = " <<
74 : : termfreq);
75 : : LOGVALUE(EXPAND, db.get_termfreq(term));
76 [ - + ]: 1928 : if (termfreq < rtermfreq) {
77 : : // termfreq must be at least rtermfreq, since there are at
78 : : // least rtermfreq documents indexed by this term.
79 : : LOGLINE(EXPAND, "termfreq must be at least rtermfreq");
80 : 0 : termfreq = rtermfreq;
81 : : } else {
82 : : // termfreq can't be more than (dbsize - rsize + rtermfreq)
83 : : // since the number of relevant documents not indexed by this
84 : : // term can't be more than the number of documents not indexed
85 : : // by this term, so:
86 : : //
87 : : // rsize - rtermfreq <= dbsize - termfreq
88 : : // <=> termfreq <= dbsize - (rsize - rtermfreq)
89 : 1928 : double termfreq_upper_bound = dbsize - (rsize - rtermfreq);
90 [ + + ]: 1928 : if (termfreq > termfreq_upper_bound) {
91 : : LOGLINE(EXPAND, "termfreq can't be more than "
92 : : "dbsize - (rsize + rtermfreq)");
93 : 20 : termfreq = termfreq_upper_bound;
94 : : }
95 : : }
96 : : }
97 : : }
98 : : LOGVALUE(EXPAND, termfreq);
99 : :
100 : 8679 : double reldocs_without_term = rsize - rtermfreq;
101 : : double num, denom;
102 : 8679 : num = (rtermfreq + 0.5) * (dbsize - termfreq - reldocs_without_term + 0.5);
103 : : AssertRel(num,>,0);
104 : 8679 : denom = (termfreq - rtermfreq + 0.5) * (reldocs_without_term + 0.5);
105 : : AssertRel(denom,>,0);
106 : :
107 : 8679 : Xapian::weight tw = log(num / denom);
108 : : LOGVALUE(EXPAND, tw);
109 : : LOGVALUE(EXPAND, stats.multiplier);
110 : 8679 : RETURN(stats.multiplier * tw);
111 : : }
112 : :
113 : : }
114 : : }
|