Branch data Line data Source code
1 : : /* brass_postlist.h: Postlists in brass databases
2 : : *
3 : : * Copyright 1999,2000,2001 BrightStation PLC
4 : : * Copyright 2002 Ananova Ltd
5 : : * Copyright 2002,2003,2004,2005,2007,2008,2009,2011 Olly Betts
6 : : * Copyright 2007,2009 Lemur Consulting Ltd
7 : : *
8 : : * This program is free software; you can redistribute it and/or
9 : : * modify it under the terms of the GNU General Public License as
10 : : * published by the Free Software Foundation; either version 2 of the
11 : : * License, or (at your option) any later version.
12 : : *
13 : : * This program is distributed in the hope that it will be useful,
14 : : * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 : : * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 : : * GNU General Public License for more details.
17 : : *
18 : : * You should have received a copy of the GNU General Public License
19 : : * along with this program; if not, write to the Free Software
20 : : * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
21 : : * USA
22 : : */
23 : :
24 : : #ifndef OM_HGUARD_BRASS_POSTLIST_H
25 : : #define OM_HGUARD_BRASS_POSTLIST_H
26 : :
27 : : #include <xapian/database.h>
28 : :
29 : : #include "brass_inverter.h"
30 : : #include "brass_types.h"
31 : : #include "brass_positionlist.h"
32 : : #include "leafpostlist.h"
33 : : #include "omassert.h"
34 : :
35 : : #include "autoptr.h"
36 : : #include <map>
37 : : #include <string>
38 : :
39 : : using namespace std; // NOTE(review): using-directive at header scope is visible in every file that includes this header.
40 : :
41 : : class BrassCursor; // Forward declaration: only held via AutoPtr<BrassCursor> in this header.
42 : : class BrassDatabase; // Forward declaration: only held via RefCntPtr<const BrassDatabase> in this header.
43 : :
44 : : namespace Brass {
45 : : class PostlistChunkReader; // Only used as a pointer-to-pointer parameter of get_chunk().
46 : : class PostlistChunkWriter; // Only used as a pointer-to-pointer parameter of get_chunk().
47 : : }
48 : :
49 : : class BrassPostList; // Declared early so BrassPostListTable can hold an AutoPtr<BrassPostList>.
50 : :
51 : 2208 : class BrassPostListTable : public BrassTable {
52 : : /// PostList for looking up document lengths (mutable so const methods can use it).
53 : : mutable AutoPtr<BrassPostList> doclen_pl;
54 : :
55 : : public:
56 : : /** Create a new table object.
57 : : *
58 : : * This does not create the table on disk - the create() method must
59 : : * be called to do that.
60 : : *
61 : : * This also does not open the table - the open() method must be
62 : : * called before use is made of the table.
63 : : *
64 : : * @param path_ - Path to which "/postlist." is appended to form the path passed to BrassTable.
65 : : * @param readonly_ - whether to open the table for read only
66 : : * access.
67 : : */
68 : 2208 : BrassPostListTable(const string & path_, bool readonly_)
69 : : : BrassTable("postlist", path_ + "/postlist.", readonly_),
70 : 2208 : doclen_pl()
71 : 2208 : { }
72 : :
73 : 1924 : bool open(brass_revision_number_t revno) { // Open at revision @a revno; returns BrassTable::open()'s result.
74 : 1924 : doclen_pl.reset(0); // Drop any doclen postlist we hold before (re)opening the table.
75 : 1924 : return BrassTable::open(revno);
76 : : }
77 : :
78 : : /// Merge changes for a term.
79 : : void merge_changes(const string &term, const Inverter::PostingChanges & changes);
80 : :
81 : : /// Merge document length changes.
82 : : void merge_doclen_changes(const map<Xapian::docid, Xapian::termcount> & doclens);
83 : :
84 : : Xapian::docid get_chunk(const string &tname,
85 : : Xapian::docid did, bool adding,
86 : : Brass::PostlistChunkReader ** from,
87 : : Brass::PostlistChunkWriter **to);
88 : :
89 : : /// Compose a key from a termname and docid.
90 : 105422 : static string make_key(const string & term, Xapian::docid did) {
91 : 105422 : return pack_brass_postlist_key(term, did);
92 : : }
93 : :
94 : : /// Compose a key from a termname.
95 : 620430 : static string make_key(const string & term) {
96 : 620430 : return pack_brass_postlist_key(term);
97 : : }
98 : :
99 : 655 : bool term_exists(const string & term) const { // True if the key composed from @a term alone exists in the table.
100 : 655 : return key_exists(make_key(term));
101 : : }
102 : :
103 : : /** Returns number of docs indexed by @a term.
104 : : *
105 : : * This is the length of the postlist.
106 : : */
107 : : Xapian::doccount get_termfreq(const std::string & term) const;
108 : :
109 : : /** Returns the number of occurrences of @a term in the database.
110 : : *
111 : : * This is the sum of the wdfs in the postlist.
112 : : */
113 : : Xapian::termcount get_collection_freq(const std::string & term) const;
114 : :
115 : : /** Returns the length of document @a did. */
116 : : Xapian::termcount get_doclength(Xapian::docid did,
117 : : Xapian::Internal::RefCntPtr<const BrassDatabase> db) const;
118 : :
119 : : /** Check if document @a did exists. */
120 : : bool document_exists(Xapian::docid did,
121 : : Xapian::Internal::RefCntPtr<const BrassDatabase> db) const;
122 : : };
123 : :
124 : : /** A postlist in a brass database.
125 : : */
126 : : class BrassPostList : public LeafPostList {
127 : : protected: // BrassModifiedPostList needs to access these.
128 : : /** The database we are searching. This pointer is held so that the
129 : : * database doesn't get deleted before us, and also to give us access
130 : : * to the position_table.
131 : : */
132 : : Xapian::Internal::RefCntPtr<const BrassDatabase> this_db;
133 : :
134 : : /// Whether we've started reading the list yet.
135 : : bool have_started;
136 : :
137 : : /// The position list object for this posting list.
138 : : BrassPositionList positionlist;
139 : :
140 : : private:
141 : : /// Cursor pointing to current chunk of postlist.
142 : : AutoPtr<BrassCursor> cursor;
143 : :
144 : : /// True if this is the last chunk.
145 : : bool is_last_chunk;
146 : :
147 : : /// The first document id in this chunk.
148 : : Xapian::docid first_did_in_chunk;
149 : :
150 : : /// The last document id in this chunk.
151 : : Xapian::docid last_did_in_chunk;
152 : :
153 : : /// Position of iteration through current chunk.
154 : : const char * pos;
155 : :
156 : : /// Pointer to byte after end of current chunk.
157 : : const char * end;
158 : :
159 : : /// Document id we're currently at.
160 : : Xapian::docid did;
161 : :
162 : : /// The wdf of the current document.
163 : : Xapian::termcount wdf;
164 : :
165 : : /// Whether we've run off the end of the list yet.
166 : : bool is_at_end;
167 : :
168 : : /// The number of entries in the posting list.
169 : : Xapian::doccount number_of_entries;
170 : :
171 : : /// Copying is not allowed (declared private to prevent use).
172 : : BrassPostList(const BrassPostList &);
173 : :
174 : : /// Assignment is not allowed (declared private to prevent use).
175 : : void operator=(const BrassPostList &);
176 : :
177 : : /** Move to the next item in the chunk, if possible.
178 : : * If already at the end of the chunk, returns false.
179 : : */
180 : : bool next_in_chunk();
181 : :
182 : : /** Move to the next chunk.
183 : : *
184 : : * If there are no more chunks in this postlist, this will set
185 : : * is_at_end to true.
186 : : */
187 : : void next_chunk();
188 : :
189 : : /** Return true if the given document ID lies in the range covered
190 : : * by the current chunk. This does not say whether the document ID
191 : : * is actually present. It will return false if the document ID
192 : : * is greater than the last document ID in the chunk, even if it is
193 : : * less than the first document ID in the next chunk: it is possible
194 : : * for no chunk to contain a particular document ID.
195 : : */
196 : : bool current_chunk_contains(Xapian::docid desired_did);
197 : :
198 : : /** Move to chunk containing the specified document ID.
199 : : *
200 : : * This moves to the chunk whose starting document ID is
201 : : * <= desired_did, but such that the next chunk's starting
202 : : * document ID is > desired_did.
203 : : *
204 : : * It is thus possible that current_chunk_contains(desired_did)
205 : : * will return false after this call, since the document ID
206 : : * might lie after the end of this chunk, but before the start
207 : : * of the next chunk.
208 : : */
209 : : void move_to_chunk_containing(Xapian::docid desired_did);
210 : :
211 : : /** Scan forward in the current chunk for the specified document ID.
212 : : *
213 : : * This is particularly efficient if the desired document ID is
214 : : * greater than the last in the chunk - it then skips straight
215 : : * to the end.
216 : : *
217 : : * @return true if we moved to a valid document,
218 : : * false if we reached the end of the chunk.
219 : : */
220 : : bool move_forward_in_chunk_to_at_least(Xapian::docid desired_did);
221 : :
222 : : public:
223 : : /// Constructor: takes the database, the term, and a keep_reference flag (there is no default constructor).
224 : : BrassPostList(Xapian::Internal::RefCntPtr<const BrassDatabase> this_db_,
225 : : const string & term,
226 : : bool keep_reference);
227 : :
228 : : /// Destructor.
229 : : ~BrassPostList();
230 : :
231 : : /** Used for looking up doclens.
232 : : *
233 : : * @return true if docid @a desired_did has a document length.
234 : : */
235 : : bool jump_to(Xapian::docid desired_did);
236 : :
237 : : /** Returns number of docs indexed by this term.
238 : : *
239 : : * This is the length of the postlist.
240 : : */
241 : 573762 : Xapian::doccount get_termfreq() const { return number_of_entries; }
242 : :
243 : : /// Returns the current docid (asserts that reading has started).
244 : 15605542 : Xapian::docid get_docid() const { Assert(have_started); return did; }
245 : :
246 : : /// Returns the length of current document.
247 : : Xapian::termcount get_doclength() const;
248 : :
249 : : /** Returns the Within Document Frequency of the term in the current
250 : : * document (asserts that reading has started).
251 : : */
252 : 37996319 : Xapian::termcount get_wdf() const { Assert(have_started); return wdf; }
253 : :
254 : : /** Get the list of positions of the term in the current document.
255 : : * NOTE(review): non-const, unlike open_position_list() - presumably reuses the positionlist member; confirm. */
256 : : PositionList *read_position_list();
257 : :
258 : : /** Get the list of positions of the term in the current document.
259 : : * NOTE(review): const variant - presumably returns a separate object rather than the positionlist member; confirm. */
260 : : PositionList * open_position_list() const;
261 : :
262 : : /// Move to the next document.
263 : : PostList * next(Xapian::weight w_min);
264 : :
265 : : /// Skip to next document with docid >= desired_did.
266 : : PostList * skip_to(Xapian::docid desired_did, Xapian::weight w_min);
267 : :
268 : : /// Return true if and only if we're off the end of the list.
269 : 19097028 : bool at_end() const { return is_at_end; }
270 : :
271 : : /// Get a description of the document. NOTE(review): presumably describes this postlist rather than a document - confirm.
272 : : std::string get_description() const;
273 : :
274 : : /// Read the number of entries and the collection frequency.
275 : : static void read_number_of_entries(const char ** posptr,
276 : : const char * end,
277 : : Xapian::doccount * number_of_entries_ptr,
278 : : Xapian::termcount * collection_freq_ptr);
279 : : };
280 : :
281 : : #endif /* OM_HGUARD_BRASS_POSTLIST_H */
|