Branch data Line data Source code
1 : : /** @file exactphrasepostlist.cc
2 : : * @brief Return docs containing terms forming a particular exact phrase.
3 : : */
4 : : /* Copyright (C) 2006,2007,2009,2010 Olly Betts
5 : : *
6 : : * This program is free software; you can redistribute it and/or modify
7 : : * it under the terms of the GNU General Public License as published by
8 : : * the Free Software Foundation; either version 2 of the License, or
9 : : * (at your option) any later version.
10 : : *
11 : : * This program is distributed in the hope that it will be useful,
12 : : * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 : : * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 : : * GNU General Public License for more details.
15 : : *
16 : : * You should have received a copy of the GNU General Public License
17 : : * along with this program; if not, write to the Free Software
18 : : * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 : : */
20 : :
21 : : #include <config.h>
22 : :
23 : : #include "exactphrasepostlist.h"
24 : :
25 : : #include "debuglog.h"
26 : : #include "positionlist.h"
27 : :
28 : : #include <algorithm>
29 : : #include <vector>
30 : :
31 : : using namespace std;
32 : :
33 : 560 : ExactPhrasePostList::ExactPhrasePostList(PostList *source_,
34 : : const vector<PostList*>::const_iterator &terms_begin,
35 : : const vector<PostList*>::const_iterator &terms_end)
36 : 560 : : SelectPostList(source_), terms(terms_begin, terms_end)
37 : : {
38 : 560 : size_t n = terms.size();
39 : 560 : poslists = new PositionList*[n];
40 : : try {
41 : 560 : order = new unsigned[n];
42 : 0 : } catch (...) {
43 [ # # ][ # # ]: 0 : delete [] poslists;
44 : 0 : throw;
45 : : }
46 [ + + ][ # # ]: 1856 : for (size_t i = 0; i < n; ++i) order[i] = unsigned(i);
47 : 560 : }
48 : :
49 : 560 : ExactPhrasePostList::~ExactPhrasePostList()
50 : : {
51 [ + - ][ # # ]: 560 : delete [] poslists;
[ # # ]
52 [ + - ][ # # ]: 560 : delete [] order;
[ # # ]
53 [ + - ][ # # ]: 560 : }
[ # # ]
54 : :
55 : : void
56 : 952 : ExactPhrasePostList::start_position_list(unsigned i)
57 : : {
58 : 952 : unsigned index = order[i];
59 : 952 : poslists[i] = terms[index]->read_position_list();
60 : 952 : poslists[i]->index = index;
61 : 952 : }
62 : :
63 : : class TermCompare {
64 : : vector<PostList *> & terms;
65 : :
66 : : public:
67 : 468 : TermCompare(vector<PostList *> & terms_) : terms(terms_) { }
68 : :
69 : 1206 : bool operator()(unsigned a, unsigned b) const {
70 : 1206 : return terms[a]->get_wdf() < terms[b]->get_wdf();
71 : : }
72 : : };
73 : :
74 : : bool
75 : 468 : ExactPhrasePostList::test_doc()
76 : : {
77 : : LOGCALL(MATCH, bool, "ExactPhrasePostList::test_doc", NO_ARGS);
78 : :
79 [ - + ]: 468 : if (terms.size() <= 1) RETURN(true);
80 : :
81 : : // We often don't need to read all the position lists, so rather than using
82 : : // the shortest position lists first, we approximate by using the terms
83 : : // with the lowest wdf first. This will typically give the same or a very
84 : : // similar order.
85 : 468 : sort(order, order + terms.size(), TermCompare(terms));
86 : :
87 : : // If the first term we check only occurs too close to the start of the
88 : : // document, we only need to read one term's positions. E.g. search for
89 : : // "ripe mango" when the only occurrence of 'mango' in the current document
90 : : // is at position 0.
91 : 468 : start_position_list(0);
92 : 468 : poslists[0]->skip_to(poslists[0]->index);
93 [ - + ]: 468 : if (poslists[0]->at_end()) RETURN(false);
94 : :
95 : : // If we get here, we'll need to read the positionlists for at least two
96 : : // terms, so check the true positionlist length for the two terms with the
97 : : // lowest wdf and if necessary swap them so the true shorter one is first.
98 : 468 : start_position_list(1);
99 [ + + ]: 468 : if (poslists[0]->get_size() < poslists[1]->get_size()) {
100 : 169 : poslists[1]->skip_to(poslists[1]->index);
101 [ - + ]: 169 : if (poslists[1]->at_end()) RETURN(false);
102 : 169 : swap(poslists[0], poslists[1]);
103 : : }
104 : :
105 : 468 : unsigned read_hwm = 1;
106 : 468 : Xapian::termpos idx0 = poslists[0]->index;
107 [ + + ]: 149 : do {
108 : 572 : Xapian::termpos base = poslists[0]->get_position() - idx0;
109 : 572 : unsigned i = 1;
110 : 16 : while (true) {
111 [ + + ]: 588 : if (i > read_hwm) {
112 : 16 : read_hwm = i;
113 : 16 : start_position_list(i);
114 : : // FIXME: consider comparing with poslist[0] and swapping
115 : : // if less common. Should we allow for the number of positions
116 : : // we've read from poslist[0] already?
117 : : }
118 : 588 : Xapian::termpos required = base + poslists[i]->index;
119 : 588 : poslists[i]->skip_to(required);
120 [ + + ]: 588 : if (poslists[i]->at_end()) RETURN(false);
121 [ + + ]: 360 : if (poslists[i]->get_position() != required) break;
122 [ + + ]: 211 : if (++i == terms.size()) RETURN(true);
123 : : }
124 : 149 : poslists[0]->next();
125 : 298 : } while (!poslists[0]->at_end());
126 : 468 : RETURN(false);
127 : : }
128 : :
129 : : Xapian::termcount
130 : 78 : ExactPhrasePostList::get_wdf() const
131 : : {
132 : : // Calculate an estimate for the wdf of an exact phrase postlist.
133 : : //
134 : : // We use the minimum wdf of a sub-postlist as our estimate. See the
135 : : // comment in NearPostList::get_wdf for justification of this estimate.
136 : : //
137 : : // We divide the value calculated for a NearPostList by 3, as a very rough
138 : : // heuristic to represent the fact that the words must occur exactly in
139 : : // order, and phrases are therefore rarer than near matches and (non-exact)
140 : : // phrase matches.
141 : :
142 : 78 : vector<PostList *>::const_iterator i = terms.begin();
143 : 78 : Xapian::termcount wdf = (*i)->get_wdf();
144 [ + + ]: 234 : for (; i != terms.end(); i++) {
145 : 156 : wdf = min(wdf, (*i)->get_wdf());
146 : : }
147 : 78 : return wdf;
148 : : }
149 : :
150 : : Xapian::doccount
151 : 928 : ExactPhrasePostList::get_termfreq_est() const
152 : : {
153 : : // It's hard to estimate how many times the exact phrase will occur as
154 : : // it depends a lot on the phrase, but usually the exact phrase will
155 : : // occur significantly less often than the individual terms.
156 : 928 : return source->get_termfreq_est() / 4;
157 : : }
158 : :
159 : : TermFreqs
160 : 32 : ExactPhrasePostList::get_termfreq_est_using_stats(
161 : : const Xapian::Weight::Internal & stats) const
162 : : {
163 : : LOGCALL(MATCH, TermFreqs, "ExactPhrasePostList::get_termfreq_est_using_stats", stats);
164 : : // No idea how to estimate this - do the same as get_termfreq_est() for
165 : : // now.
166 : 32 : TermFreqs result(source->get_termfreq_est_using_stats(stats));
167 : 32 : result.termfreq /= 4;
168 : 32 : result.reltermfreq /= 4;
169 : 32 : RETURN(result);
170 : : }
171 : :
172 : : string
173 : 0 : ExactPhrasePostList::get_description() const
174 : : {
175 : 0 : return "(ExactPhrase " + source->get_description() + ")";
176 : : }
|