Branch data Line data Source code
1 : : /* chert_postlist.h: Postlists in chert databases
2 : : *
3 : : * Copyright 1999,2000,2001 BrightStation PLC
4 : : * Copyright 2002 Ananova Ltd
5 : : * Copyright 2002,2003,2004,2005,2007,2008,2009,2011 Olly Betts
6 : : * Copyright 2007,2009 Lemur Consulting Ltd
7 : : *
8 : : * This program is free software; you can redistribute it and/or
9 : : * modify it under the terms of the GNU General Public License as
10 : : * published by the Free Software Foundation; either version 2 of the
11 : : * License, or (at your option) any later version.
12 : : *
13 : : * This program is distributed in the hope that it will be useful,
14 : : * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 : : * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 : : * GNU General Public License for more details.
17 : : *
18 : : * You should have received a copy of the GNU General Public License
19 : : * along with this program; if not, write to the Free Software
20 : : * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
21 : : * USA
22 : : */
23 : :
24 : : #ifndef OM_HGUARD_CHERT_POSTLIST_H
25 : : #define OM_HGUARD_CHERT_POSTLIST_H
26 : :
27 : : #include <xapian/database.h>
28 : :
29 : : #include "chert_types.h"
30 : : #include "chert_positionlist.h"
31 : : #include "leafpostlist.h"
32 : : #include "omassert.h"
33 : :
34 : : #include "autoptr.h"
35 : : #include <map>
36 : : #include <string>
37 : : // NOTE(review): using-directive at header scope; the unqualified string, map and pair used below depend on it.
38 : : using namespace std;
39 : : // Forward declarations: this header only uses these types via pointers or smart-pointer template arguments.
40 : : class ChertCursor;
41 : : class ChertDatabase;
42 : :
43 : : namespace Chert {
44 : : class PostlistChunkReader;
45 : : class PostlistChunkWriter;
46 : : }
47 : : // Declared before ChertPostListTable because that class has an AutoPtr<ChertPostList> member.
48 : : class ChertPostList;
49 : : /// ChertTable subclass for the "postlist" table; also holds a postlist (doclen_pl) for looking up document lengths.
50 : 2289 : class ChertPostListTable : public ChertTable {
51 : : /// PostList for looking up document lengths (mutable, so const methods may modify it).
52 : : mutable AutoPtr<ChertPostList> doclen_pl;
53 : :
54 : : public:
55 : : /** Create a new table object.
56 : : *
57 : : * This does not create the table on disk - the create() method must
58 : : * be called to create it on disk.
59 : : *
60 : : * This also does not open the table - the open() method must be
61 : : * called before use is made of the table.
62 : : *
63 : : * @param path_ - Path at which the table is stored ("/postlist." is appended before passing it to ChertTable).
64 : : * @param readonly_ - whether to open the table for read only
65 : : * access.
66 : : */
67 : 2289 : ChertPostListTable(const string & path_, bool readonly_)
68 : : : ChertTable("postlist", path_ + "/postlist.", readonly_),
69 : 2289 : doclen_pl()
70 : 2289 : { }
71 : : /// Reset doclen_pl to null, then forward @a revno to ChertTable::open() and return its result.
72 : 1979 : bool open(chert_revision_number_t revno) {
73 : 1979 : doclen_pl.reset(0);
74 : 1979 : return ChertTable::open(revno);
75 : : }
76 : :
77 : : /// Merge added, removed, and changed entries.
78 : : void merge_changes(
79 : : const map<string, map<Xapian::docid, pair<char, Xapian::termcount> > > & mod_plists,
80 : : const map<Xapian::docid, Xapian::termcount> & doclens,
81 : : const map<string, pair<Xapian::termcount_diff, Xapian::termcount_diff> > & freq_deltas);
82 : : /// NOTE(review): undocumented; @a from and @a to look like out-parameters for a chunk reader/writer - confirm in the .cc.
83 : : Xapian::docid get_chunk(const string &tname,
84 : : Xapian::docid did, bool adding,
85 : : Chert::PostlistChunkReader ** from,
86 : : Chert::PostlistChunkWriter **to);
87 : :
88 : : /// Compose a key from a termname and docid.
89 : 105466 : static string make_key(const string & term, Xapian::docid did) {
90 : 105466 : return pack_chert_postlist_key(term, did);
91 : : }
92 : :
93 : : /// Compose a key from a termname.
94 : 621215 : static string make_key(const string & term) {
95 : 621215 : return pack_chert_postlist_key(term);
96 : : }
97 : : /// Check whether the key built from @a term alone (make_key(term)) exists, as reported by key_exists().
98 : 657 : bool term_exists(const string & term) const {
99 : 657 : return key_exists(make_key(term));
100 : : }
101 : :
102 : : /** Returns number of docs indexed by @a term.
103 : : *
104 : : * This is the length of the postlist.
105 : : */
106 : : Xapian::doccount get_termfreq(const std::string & term) const;
107 : :
108 : : /** Returns the number of occurrences of @a term in the database.
109 : : *
110 : : * This is the sum of the wdfs in the postlist.
111 : : */
112 : : Xapian::termcount get_collection_freq(const std::string & term) const;
113 : :
114 : : /** Returns the length of document @a did. */
115 : : Xapian::termcount get_doclength(Xapian::docid did,
116 : : Xapian::Internal::RefCntPtr<const ChertDatabase> db) const;
117 : :
118 : : /** Check if document @a did exists. */
119 : : bool document_exists(Xapian::docid did,
120 : : Xapian::Internal::RefCntPtr<const ChertDatabase> db) const;
121 : : };
122 : :
123 : : /** A postlist in a chert database.
124 : : * Derives from LeafPostList; the list is read chunk by chunk through a ChertCursor. */
125 : : class ChertPostList : public LeafPostList {
126 : : protected: // ChertModifiedPostList needs to access these.
127 : : /** The database we are searching. This pointer is held so that the
128 : : * database doesn't get deleted before us, and also to give us access
129 : : * to the position_table.
130 : : */
131 : : Xapian::Internal::RefCntPtr<const ChertDatabase> this_db;
132 : :
133 : : /// Whether we've started reading the list yet (get_docid() and get_wdf() assert this).
134 : : bool have_started;
135 : :
136 : : /// The position list object for this posting list.
137 : : ChertPositionList positionlist;
138 : :
139 : : private:
140 : : /// Cursor pointing to current chunk of postlist.
141 : : AutoPtr<ChertCursor> cursor;
142 : :
143 : : /// True if this is the last chunk.
144 : : bool is_last_chunk;
145 : :
146 : : /// The first document id in this chunk.
147 : : Xapian::docid first_did_in_chunk;
148 : :
149 : : /// The last document id in this chunk.
150 : : Xapian::docid last_did_in_chunk;
151 : :
152 : : /// Position of iteration through current chunk.
153 : : const char * pos;
154 : :
155 : : /// Pointer to byte after end of current chunk.
156 : : const char * end;
157 : :
158 : : /// Document id we're currently at.
159 : : Xapian::docid did;
160 : :
161 : : /// The wdf of the current document.
162 : : Xapian::termcount wdf;
163 : :
164 : : /// Whether we've run off the end of the list yet.
165 : : bool is_at_end;
166 : :
167 : : /// The number of entries in the posting list.
168 : : Xapian::doccount number_of_entries;
169 : :
170 : : /// Copying is not allowed (copy constructor is declared private).
171 : : ChertPostList(const ChertPostList &);
172 : :
173 : : /// Assignment is not allowed (assignment operator is declared private).
174 : : void operator=(const ChertPostList &);
175 : :
176 : : /** Move to the next item in the chunk, if possible.
177 : : * If already at the end of the chunk, returns false.
178 : : */
179 : : bool next_in_chunk();
180 : :
181 : : /** Move to the next chunk.
182 : : *
183 : : * If there are no more chunks in this postlist, this will set
184 : : * is_at_end to true.
185 : : */
186 : : void next_chunk();
187 : :
188 : : /** Return true if the given document ID lies in the range covered
189 : : * by the current chunk. This does not say whether the document ID
190 : : * is actually present. It will return false if the document ID
191 : : * is greater than the last document ID in the chunk, even if it is
192 : : * less than the first document ID in the next chunk: it is possible
193 : : * for no chunk to contain a particular document ID.
194 : : */
195 : : bool current_chunk_contains(Xapian::docid desired_did);
196 : :
197 : : /** Move to chunk containing the specified document ID.
198 : : *
199 : : * This moves to the chunk whose starting document ID is
200 : : * <= desired_did, but such that the next chunk's starting
201 : : * document ID is > desired_did.
202 : : *
203 : : * It is thus possible that current_chunk_contains(desired_did)
204 : : * will return false after this call, since the document ID
205 : : * might lie after the end of this chunk, but before the start
206 : : * of the next chunk.
207 : : */
208 : : void move_to_chunk_containing(Xapian::docid desired_did);
209 : :
210 : : /** Scan forward in the current chunk for the specified document ID.
211 : : *
212 : : * This is particularly efficient if the desired document ID is
213 : : * greater than the last in the chunk - it then skips straight
214 : : * to the end.
215 : : *
216 : : * @return true if we moved to a valid document,
217 : : * false if we reached the end of the chunk.
218 : : */
219 : : bool move_forward_in_chunk_to_at_least(Xapian::docid desired_did);
220 : :
221 : : public:
222 : : /// Constructor (not a default constructor). NOTE(review): keep_reference presumably controls whether this_db is retained - confirm.
223 : : ChertPostList(Xapian::Internal::RefCntPtr<const ChertDatabase> this_db_,
224 : : const string & term,
225 : : bool keep_reference);
226 : :
227 : : /// Destructor.
228 : : ~ChertPostList();
229 : :
230 : : /** Used for looking up doclens.
231 : : *
232 : : * @return true if docid @a desired_did has a document length.
233 : : */
234 : : bool jump_to(Xapian::docid desired_did);
235 : :
236 : : /** Returns number of docs indexed by this term.
237 : : *
238 : : * This is the length of the postlist.
239 : : */
240 : 574425 : Xapian::doccount get_termfreq() const { return number_of_entries; }
241 : :
242 : : /// Returns the current docid (asserts that reading has started).
243 : 15607802 : Xapian::docid get_docid() const { Assert(have_started); return did; }
244 : :
245 : : /// Returns the length of current document.
246 : : Xapian::termcount get_doclength() const;
247 : :
248 : : /** Returns the Within Document Frequency of the term in the current
249 : : * document (asserts that reading has started).
250 : : */
251 : 37997317 : Xapian::termcount get_wdf() const { Assert(have_started); return wdf; }
252 : :
253 : : /** Get the list of positions of the term in the current document.
254 : : * NOTE(review): non-const variant; presumably reuses the positionlist member - confirm in the .cc. */
255 : : PositionList *read_position_list();
256 : :
257 : : /** Get the list of positions of the term in the current document.
258 : : * NOTE(review): const variant; presumably returns a separately allocated list - confirm in the .cc. */
259 : : PositionList * open_position_list() const;
260 : :
261 : : /// Move to the next document.
262 : : PostList * next(Xapian::weight w_min);
263 : :
264 : : /// Skip to next document with docid >= desired_did.
265 : : PostList * skip_to(Xapian::docid desired_did, Xapian::weight w_min);
266 : :
267 : : /// Return true if and only if we're off the end of the list.
268 : 19106374 : bool at_end() const { return is_at_end; }
269 : :
270 : : /// Get a string description (original comment said "of the document"; presumably of this postlist - confirm).
271 : : std::string get_description() const;
272 : :
273 : : /// Read the number of entries and the collection frequency (presumably stored via the two output pointers - confirm).
274 : : static void read_number_of_entries(const char ** posptr,
275 : : const char * end,
276 : : Xapian::doccount * number_of_entries_ptr,
277 : : Xapian::termcount * collection_freq_ptr);
278 : : };
279 : :
280 : : #endif /* OM_HGUARD_CHERT_POSTLIST_H */
|