Branch data Line data Source code
1 : : /* flint_postlist.h: Postlists in flint databases
2 : : *
3 : : * Copyright 1999,2000,2001 BrightStation PLC
4 : : * Copyright 2002 Ananova Ltd
5 : : * Copyright 2002,2003,2004,2005,2007,2008,2009 Olly Betts
6 : : * Copyright 2007,2009 Lemur Consulting Ltd
7 : : *
8 : : * This program is free software; you can redistribute it and/or
9 : : * modify it under the terms of the GNU General Public License as
10 : : * published by the Free Software Foundation; either version 2 of the
11 : : * License, or (at your option) any later version.
12 : : *
13 : : * This program is distributed in the hope that it will be useful,
14 : : * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 : : * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 : : * GNU General Public License for more details.
17 : : *
18 : : * You should have received a copy of the GNU General Public License
19 : : * along with this program; if not, write to the Free Software
20 : : * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
21 : : * USA
22 : : */
23 : :
24 : : #ifndef OM_HGUARD_FLINT_POSTLIST_H
25 : : #define OM_HGUARD_FLINT_POSTLIST_H
26 : :
27 : : #include <xapian/database.h>
28 : :
29 : : #include "flint_types.h"
30 : : #include "flint_positionlist.h"
31 : : #include "leafpostlist.h"
32 : : #include "omassert.h"
33 : :
34 : : #include "autoptr.h"
35 : : #include <map>
36 : : #include <string>
37 : :
38 : : using namespace std;
39 : :
40 : : class FlintCursor;
41 : : class FlintDatabase;
42 : :
43 : : class FlintPostlistChunkReader;
44 : : class FlintPostlistChunkWriter;
45 : :
46 : 2193 : class FlintPostListTable : public FlintTable {
47 : : public:
48 : : /** Create a new table object.
49 : : *
50 : : * This does not create the table on disk - the create() method must
51 : : * be called before the table is created on disk
52 : : *
53 : : * This also does not open the table - the open() method must be
54 : : * called before use is made of the table.
55 : : *
56 : : * @param path_ - Path at which the table is stored.
57 : : * @param readonly_ - whether to open the table for read only
58 : : * access.
59 : : */
60 : 2193 : FlintPostListTable(const string & path_, bool readonly_)
61 : 2193 : : FlintTable("postlist", path_ + "/postlist.", readonly_) { }
62 : :
63 : : /// Merge added, removed, and changed entries.
64 : : void merge_changes(
65 : : const map<string, map<Xapian::docid, pair<char, Xapian::termcount> > > & mod_plists,
66 : : const map<Xapian::docid, Xapian::termcount> & doclens,
67 : : const map<string, pair<Xapian::termcount_diff, Xapian::termcount_diff> > & freq_deltas);
68 : :
69 : : Xapian::docid get_chunk(const string &tname,
70 : : Xapian::docid did, bool adding,
71 : : FlintPostlistChunkReader ** from, FlintPostlistChunkWriter **to);
72 : :
73 : : /// Compose a key from a termname and docid.
74 : 22447 : static string make_key(const string & term, Xapian::docid did) {
75 : 22447 : string key = F_pack_string_preserving_sort(term);
76 : 22447 : key += F_pack_uint_preserving_sort(did);
77 : 0 : return key;
78 : : }
79 : :
80 : : /// Compose a key from a termname.
81 : 619127 : static string make_key(const string & term) {
82 : 619127 : return F_pack_string_preserving_sort(term);
83 : : }
84 : :
85 : 657 : bool term_exists(const string & term) const {
86 : 657 : return key_exists(make_key(term));
87 : : }
88 : :
89 : : /** Returns number of docs indexed by @a term.
90 : : *
91 : : * This is the length of the postlist.
92 : : */
93 : : Xapian::doccount get_termfreq(const std::string & term) const;
94 : :
95 : : /** Returns the number of occurrences of @a term in the database.
96 : : *
97 : : * This is the sum of the wdfs in the postlist.
98 : : */
99 : : Xapian::termcount get_collection_freq(const std::string & term) const;
100 : : };
101 : :
102 : : /** A postlist in a flint database.
103 : : */
104 : : class FlintPostList : public LeafPostList {
105 : : protected: // FlintModifiedPostList needs to access these.
106 : : /** The database we are searching. This pointer is held so that the
107 : : * database doesn't get deleted before us, and also to give us access
108 : : * to the position_table.
109 : : */
110 : : Xapian::Internal::RefCntPtr<const FlintDatabase> this_db;
111 : :
112 : : /// Whether we've started reading the list yet.
113 : : bool have_started;
114 : :
115 : : /// The position list object for this posting list.
116 : : FlintPositionList positionlist;
117 : :
118 : : private:
119 : : /// Cursor pointing to current chunk of postlist.
120 : : AutoPtr<FlintCursor> cursor;
121 : :
122 : : /// True if this is the last chunk.
123 : : bool is_last_chunk;
124 : :
125 : : /// The first document id in this chunk.
126 : : Xapian::docid first_did_in_chunk;
127 : :
128 : : /// The last document id in this chunk.
129 : : Xapian::docid last_did_in_chunk;
130 : :
131 : : /// Position of iteration through current chunk.
132 : : const char * pos;
133 : :
134 : : /// Pointer to byte after end of current chunk.
135 : : const char * end;
136 : :
137 : : /// Document id we're currently at.
138 : : Xapian::docid did;
139 : :
140 : : /// The (absolute) length of the current document.
141 : : flint_doclen_t doclength;
142 : :
143 : : /// The wdf of the current document.
144 : : Xapian::termcount wdf;
145 : :
146 : : /// Whether we've run off the end of the list yet.
147 : : bool is_at_end;
148 : :
149 : : /// The number of entries in the posting list.
150 : : Xapian::doccount number_of_entries;
151 : :
152 : : /// Copying is not allowed.
153 : : FlintPostList(const FlintPostList &);
154 : :
155 : : /// Assignment is not allowed.
156 : : void operator=(const FlintPostList &);
157 : :
158 : : /** Move to the next item in the chunk, if possible.
159 : : * If already at the end of the chunk, returns false.
160 : : */
161 : : bool next_in_chunk();
162 : :
163 : : /** Move to the next chunk.
164 : : *
165 : : * If there are no more chunks in this postlist, this will set
166 : : * is_at_end to true.
167 : : */
168 : : void next_chunk();
169 : :
170 : : /** Return true if the given document ID lies in the range covered
171 : : * by the current chunk. This does not say whether the document ID
172 : : * is actually present. It will return false if the document ID
173 : : * is greater than the last document ID in the chunk, even if it is
174 : : * less than the first document ID in the next chunk: it is possible
175 : : * for no chunk to contain a particular document ID.
176 : : */
177 : : bool current_chunk_contains(Xapian::docid desired_did);
178 : :
179 : : /** Move to chunk containing the specified document ID.
180 : : *
181 : : * This moves to the chunk whose starting document ID is
182 : : * <= desired_did, but such that the next chunk's starting
183 : : * document ID is > desired_did.
184 : : *
185 : : * It is thus possible that current_chunk_contains(desired_did)
186 : : * will return false after this call, since the document ID
187 : : * might lie after the end of this chunk, but before the start
188 : : * of the next chunk.
189 : : */
190 : : void move_to_chunk_containing(Xapian::docid desired_did);
191 : :
192 : : /** Scan forward in the current chunk for the specified document ID.
193 : : *
194 : : * This is particularly efficient if the desired document ID is
195 : : * greater than the last in the chunk - it then skips straight
196 : : * to the end.
197 : : *
198 : : * @return true if we moved to a valid document,
199 : : * false if we reached the end of the chunk.
200 : : */
201 : : bool move_forward_in_chunk_to_at_least(Xapian::docid desired_did);
202 : :
203 : : public:
204 : : /// Default constructor.
205 : : FlintPostList(Xapian::Internal::RefCntPtr<const FlintDatabase> this_db_,
206 : : const string & term_);
207 : :
208 : : /// Destructor.
209 : : ~FlintPostList();
210 : :
211 : : /** Returns number of docs indexed by this term.
212 : : *
213 : : * This is the length of the postlist.
214 : : */
215 : 573673 : Xapian::doccount get_termfreq() const { return number_of_entries; }
216 : :
217 : : /// Returns the current docid.
218 : 15601146 : Xapian::docid get_docid() const { Assert(have_started); return did; }
219 : :
220 : : /// Returns the length of current document.
221 : : Xapian::termcount get_doclength() const;
222 : :
223 : : /** Returns the Within Document Frequency of the term in the current
224 : : * document.
225 : : */
226 : 18900328 : Xapian::termcount get_wdf() const { Assert(have_started); return wdf; }
227 : :
228 : : /** Get the list of positions of the term in the current document.
229 : : */
230 : : PositionList *read_position_list();
231 : :
232 : : /** Get the list of positions of the term in the current document.
233 : : */
234 : : PositionList * open_position_list() const;
235 : :
236 : : /// Move to the next document.
237 : : PostList * next(Xapian::weight w_min);
238 : :
239 : : /// Skip to next document with docid >= docid.
240 : : PostList * skip_to(Xapian::docid desired_did, Xapian::weight w_min);
241 : :
242 : : /// Return true if and only if we're off the end of the list.
243 : 19095099 : bool at_end() const { return is_at_end; }
244 : :
245 : : /// Get a description of the document.
246 : : std::string get_description() const;
247 : :
248 : : /// Read the number of entries and the collection frequency.
249 : : static void read_number_of_entries(const char ** posptr,
250 : : const char * end,
251 : : Xapian::doccount * number_of_entries_ptr,
252 : : Xapian::termcount * collection_freq_ptr);
253 : : };
254 : :
255 : : #endif /* OM_HGUARD_FLINT_POSTLIST_H */
|