Branch data Line data Source code
1 : : /** @file collapser.h
2 : : * @brief Collapse documents with the same collapse key during the match.
3 : : */
4 : : /* Copyright (C) 2009 Olly Betts
5 : : *
6 : : * This program is free software; you can redistribute it and/or
7 : : * modify it under the terms of the GNU General Public License as
8 : : * published by the Free Software Foundation; either version 2 of the
9 : : * License, or (at your option) any later version.
10 : : *
11 : : * This program is distributed in the hope that it will be useful,
12 : : * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 : : * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 : : * GNU General Public License for more details.
15 : : *
16 : : * You should have received a copy of the GNU General Public License
17 : : * along with this program; if not, write to the Free Software
18 : : * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 : : */
20 : :
21 : : #ifndef XAPIAN_INCLUDED_COLLAPSER_H
22 : : #define XAPIAN_INCLUDED_COLLAPSER_H
23 : :
24 : : #include "document.h"
25 : : #include "msetcmp.h"
26 : : #include "omenquireinternal.h"
27 : : #include "postlist.h"
28 : :
29 : : #include <map>
30 : :
31 : : /// Enumeration reporting how a document was handled by the Collapser.
32 : : typedef enum {
33 : : EMPTY,
34 : : ADDED,
35 : : REJECTED,
36 : : REPLACED
37 : : } collapse_result;
38 : :
39 : : /// Class tracking information for a given value of the collapse key.
40 : 27944 : class CollapseData {
41 : : /** Currently kept MSet entries for this value of the collapse key.
42 : : *
43 : : * If collapse_max > 1, then this is a min-heap once items.size()
44 : : * reaches collapse_max.
45 : : *
46 : : * FIXME: We expect collapse_max to be small, so perhaps we should
47 : : * preallocate space for that many entries and/or allocate space in
48 : : * larger blocks to divvy up?
49 : : */
50 : : vector<Xapian::Internal::MSetItem> items;
51 : :
52 : : /// The highest weight of a document we've rejected.
53 : : Xapian::weight next_best_weight;
54 : :
55 : : /// The number of documents we've rejected.
56 : : Xapian::doccount collapse_count;
57 : :
58 : : public:
59 : : /// Construct with the given MSetItem @a item.
60 : 3992 : CollapseData(const Xapian::Internal::MSetItem & item)
61 : 3992 : : items(1, item), next_best_weight(0), collapse_count(0) {
62 : 3992 : items[0].collapse_key = string();
63 : 3992 : }
64 : :
65 : : /** Handle a new MSetItem with this collapse key value.
66 : : *
67 : : * @param item The new item.
68 : : * @param collapse_max Max no. of items for each collapse key value.
69 : : * @param mcmp MSetItem comparison functor.
70 : : * @param[out] old_item Replaced item (when REPLACED is returned).
71 : : *
72 : : * @return How @a item was handled: ADDED, REJECTED or REPLACED.
73 : : */
74 : : collapse_result add_item(const Xapian::Internal::MSetItem & item,
75 : : Xapian::doccount collapse_max,
76 : : const MSetCmp & mcmp,
77 : : Xapian::Internal::MSetItem & old_item);
78 : :
79 : : /// The highest weight of a document we've rejected.
80 : 0 : Xapian::weight get_next_best_weight() const { return next_best_weight; }
81 : :
82 : : /// The number of documents we've rejected.
83 : 5377 : Xapian::doccount get_collapse_count() const { return collapse_count; }
84 : : };
85 : :
86 : : /// The Collapser class tracks collapse keys and the documents they match.
87 : 173480 : class Collapser {
88 : : /// Map from collapse key values to the items we're keeping for them.
89 : : std::map<std::string, CollapseData> table;
90 : :
91 : : /// How many items we're currently keeping in @a table.
92 : : Xapian::doccount entry_count;
93 : :
94 : : /** How many documents have we seen without a collapse key?
95 : : *
96 : : * We use this statistic to improve matches_lower_bound.
97 : : */
98 : : Xapian::doccount no_collapse_key;
99 : :
100 : : /** How many documents with duplicate collapse keys we have ignored.
101 : : *
102 : : * We use this statistic to improve matches_estimated (by considering
103 : : * the rate of collapsing) and matches_upper_bound.
104 : : */
105 : : Xapian::doccount dups_ignored;
106 : :
107 : : /** How many documents we've considered for collapsing.
108 : : *
109 : : * We use this statistic to improve matches_estimated (by considering
110 : : * the rate of collapsing).
111 : : */
112 : : Xapian::doccount docs_considered;
113 : :
114 : : /** The value slot we're getting collapse keys from. */
115 : : Xapian::valueno slot;
116 : :
117 : : /** The maximum number of items to keep for each collapse key value. */
118 : : Xapian::doccount collapse_max;
119 : :
120 : : public:
121 : : /// Replaced item when REPLACED is returned by @a collapse().
122 : : Xapian::Internal::MSetItem old_item;
123 : :
124 : 173480 : Collapser(Xapian::valueno slot_, Xapian::doccount collapse_max_)
125 : : : entry_count(0), no_collapse_key(0), dups_ignored(0),
126 : : docs_considered(0), slot(slot_), collapse_max(collapse_max_),
127 : 173480 : old_item(0, 0) { }
128 : :
129 : : /// Return true if collapsing is active for this match.
130 : 48430344 : operator bool() const { return collapse_max != 0; }
131 : :
132 : : /** Handle a new MSetItem.
133 : : *
134 : : * @param item The new item.
135 : : * @param postlist PostList to try to get collapse key from
136 : : * (this happens for a remote match).
137 : : * @param doc Document for getting values.
138 : : * @param mcmp MSetItem comparison functor.
139 : : *
140 : : * @return How @a item was handled: EMPTY, ADDED, REJECTED or REPLACED.
141 : : */
142 : : collapse_result process(Xapian::Internal::MSetItem & item,
143 : : PostList * postlist,
144 : : Xapian::Document::Internal & vsdoc,
145 : : const MSetCmp & mcmp);
146 : :
147 : : Xapian::doccount get_collapse_count(const std::string & collapse_key,
148 : : int percent_cutoff,
149 : : Xapian::weight min_weight) const;
150 : :
151 : 638 : Xapian::doccount get_docs_considered() const { return docs_considered; }
152 : :
153 : 638 : Xapian::doccount get_dups_ignored() const { return dups_ignored; }
154 : :
155 : 1092 : Xapian::doccount entries() const { return entry_count; }
156 : :
157 : : Xapian::doccount get_matches_lower_bound() const;
158 : :
159 : 1132 : bool empty() const { return table.empty(); }
160 : : };
161 : :
162 : : #endif // XAPIAN_INCLUDED_COLLAPSER_H
|