Branch data Line data Source code
1 : : /* chert_database.h: C++ class definition for chert database
2 : : *
3 : : * Copyright 1999,2000,2001 BrightStation PLC
4 : : * Copyright 2002 Ananova Ltd
5 : : * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010 Olly Betts
6 : : * Copyright 2008 Lemur Consulting Ltd
7 : : *
8 : : * This program is free software; you can redistribute it and/or
9 : : * modify it under the terms of the GNU General Public License as
10 : : * published by the Free Software Foundation; either version 2 of the
11 : : * License, or (at your option) any later version.
12 : : *
13 : : * This program is distributed in the hope that it will be useful,
14 : : * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 : : * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 : : * GNU General Public License for more details.
17 : : *
18 : : * You should have received a copy of the GNU General Public License
19 : : * along with this program; if not, write to the Free Software
20 : : * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
21 : : * USA
22 : : */
23 : :
24 : : #ifndef OM_HGUARD_CHERT_DATABASE_H
25 : : #define OM_HGUARD_CHERT_DATABASE_H
26 : :
27 : : #include "database.h"
28 : : #include "chert_dbstats.h"
29 : : #include "chert_positionlist.h"
30 : : #include "chert_postlist.h"
31 : : #include "chert_record.h"
32 : : #include "chert_spelling.h"
33 : : #include "chert_synonym.h"
34 : : #include "chert_termlisttable.h"
35 : : #include "chert_values.h"
36 : : #include "chert_version.h"
37 : : #include "../flint_lock.h"
38 : : #include "chert_types.h"
39 : : #include "valuestats.h"
40 : :
41 : : #include <map>
42 : :
43 : : class ChertTermList; // declared a friend of ChertDatabase in this header
44 : : class ChertAllDocsPostList; // declared a friend of ChertDatabase in this header
45 : : class RemoteConnection; // taken by reference in ChertDatabase::send_whole_database()
46 : :
47 : : /** A backend designed for efficient indexing and retrieval, using
48 : : * compressed posting lists and a btree storage scheme.
49 : : */
50 : : class ChertDatabase : public Xapian::Database::Internal {
51 : : friend class ChertWritableDatabase;
52 : : friend class ChertTermList;
53 : : friend class ChertPostList;
54 : : friend class ChertAllTermsList;
55 : : friend class ChertAllDocsPostList;
56 : : private:
57 : : /** Directory to store databases in.
58 : : */
59 : : std::string db_dir;
60 : :
61 : : /** Whether the database is readonly.
62 : : */
63 : : bool readonly;
64 : :
65 : : /** The file describing the Chert database.
66 : : * This file has information about the format of the database
67 : : * which can't easily be stored in any of the individual tables.
68 : : */
69 : : ChertVersion version_file;
70 : :
71 : : /** Table storing posting lists.
72 : : *
73 : : * Whenever an update is performed, this table is the first to be
74 : : * updated: therefore, its most recent revision number is the most
75 : : * recent anywhere in the database.
76 : : */
77 : : mutable ChertPostListTable postlist_table;
78 : :
79 : : /** Table storing position lists.
80 : : */
81 : : ChertPositionListTable position_table;
82 : :
83 : : /** Table storing term lists.
84 : : */
85 : : ChertTermListTable termlist_table;
86 : :
87 : : /** Value manager. */
88 : : mutable ChertValueManager value_manager;
89 : :
90 : : /** Table storing synonym data.
91 : : */
92 : : mutable ChertSynonymTable synonym_table;
93 : :
94 : : /** Table storing spelling correction data.
95 : : */
96 : : mutable ChertSpellingTable spelling_table;
97 : :
98 : : /** Table storing records.
99 : : *
100 : : * Whenever an update is performed, this table is the last to be
101 : : * updated: therefore, its most recent revision number is the most
102 : : * recent consistent revision available. If this table's most
103 : : * recent revision number is not available for all tables, there
104 : : * is no consistent revision available, and the database is corrupt.
105 : : */
106 : : ChertRecordTable record_table;
107 : :
108 : : /// Lock object.
109 : : FlintLock lock;
110 : :
111 : : /** The maximum number of changesets which should be kept in the
112 : : * database. */
113 : : unsigned int max_changesets;
114 : :
115 : : /// Database statistics.
116 : : ChertDatabaseStats stats;
117 : :
118 : : /** Return true if a database exists at the path specified for this
119 : : * database.
120 : : */
121 : : bool database_exists();
122 : :
123 : : /** Create new tables, and open them.
124 : : * Any existing tables will be removed first.
125 : : */
126 : : void create_and_open_tables(unsigned int blocksize);
127 : :
128 : : /** Open all tables at most recent consistent revision.
129 : : *
130 : : * @exception Xapian::DatabaseCorruptError is thrown if there is no
131 : : * consistent revision available.
132 : : */
133 : : void open_tables_consistent();
134 : :
135 : : /** Get a write lock on the database, or throw an
136 : : * Xapian::DatabaseLockError if failure.
137 : : *
138 : : * @param creating true if the database is in the process of being
139 : : * created - if false, will throw a Xapian::DatabaseOpeningError if the lock
140 : : * can't be acquired and the database doesn't exist.
141 : : */
142 : : void get_database_write_lock(bool creating);
143 : :
144 : : /** Open tables at specified revision number.
145 : : *
146 : : * @exception Xapian::InvalidArgumentError is thrown if the specified
147 : : * revision is not available.
148 : : */
149 : : void open_tables(chert_revision_number_t revision);
150 : :
151 : : /** Get the revision number which the tables are
152 : : * opened at.
153 : : *
154 : : * @return the current revision number.
155 : : */
156 : : chert_revision_number_t get_revision_number() const;
157 : :
158 : : /** Get the next revision number which should be
159 : : * used in the tables.
160 : : *
161 : : * @return the next revision number.
162 : : */
163 : : chert_revision_number_t get_next_revision_number() const;
164 : :
165 : : /** Set the revision number in the tables.
166 : : *
167 : : * This updates the disk tables so that the currently open revision
168 : : * becomes the specified revision number.
169 : : *
170 : : * @param new_revision The new revision number to store. This must
171 : : * be greater than the latest revision number (see
172 : : * get_latest_revision_number()), or undefined behaviour will
173 : : * result. NOTE(review): get_latest_revision_number() is not declared in this header - confirm the reference.
174 : : */
175 : : void set_revision_number(chert_revision_number_t new_revision);
176 : :
177 : : /** Re-open tables to recover from an overwritten condition,
178 : : * or just get most up-to-date version.
179 : : */
180 : : void reopen();
181 : :
182 : : /** Close all the tables permanently.
183 : : */
184 : : void close();
185 : :
186 : : /** Called if modifications fail.
187 : : * NOTE(review): old_revision and new_revision are undocumented here - presumably the revisions before and after the failed update; confirm.
188 : : * @param msg is a string description of the exception that was
189 : : * raised when the modifications failed.
190 : : */
191 : : void modifications_failed(chert_revision_number_t old_revision,
192 : : chert_revision_number_t new_revision,
193 : : const std::string & msg);
194 : :
195 : : /** Apply any outstanding changes to the tables.
196 : : *
197 : : * If an error occurs during this operation, this will be signalled
198 : : * by an exception being thrown. In this case the contents of the
199 : : * tables on disk will be left in an unmodified state (though possibly
200 : : * with increased revision numbers), and the outstanding changes will
201 : : * be lost.
202 : : */
203 : : void apply();
204 : :
205 : : /** Cancel any outstanding changes to the tables.
206 : : */
207 : : void cancel();
208 : :
209 : : /** Send a set of messages which transfer the whole database.
210 : : */
211 : : void send_whole_database(RemoteConnection & conn, double end_time);
212 : :
213 : : /** Get the start and end revisions stored in the changeset at path (presumably written to *startrev and *endrev - confirm).
214 : : */
215 : : void get_changeset_revisions(const string & path,
216 : : chert_revision_number_t * startrev,
217 : : chert_revision_number_t * endrev) const;
218 : : public:
219 : : /** Create and open a chert database.
220 : : *
221 : : * @exception Xapian::DatabaseCorruptError is thrown if there is no
222 : : * consistent revision available.
223 : : *
224 : : * @exception Xapian::DatabaseOpeningError thrown if database can't
225 : : * be opened.
226 : : *
227 : : * @exception Xapian::DatabaseVersionError thrown if database is in an
228 : : * unsupported format. This implies that the database was
229 : : * created by an older or newer version of Xapian.
230 : : *
231 : : * @param db_dir_ directory holding chert tables
232 : : * @param action presumably the open mode; defaults to XAPIAN_DB_READONLY (other accepted values are not visible in this header)
233 : : * @param block_size Block size, in bytes, to use when creating
234 : : * tables. This is only important, and has the
235 : : * correct value, when the database is being
236 : : * created.
237 : : */
238 : : ChertDatabase(const string &db_dir_, int action = XAPIAN_DB_READONLY,
239 : : unsigned int block_size = 0u);
240 : :
241 : : ~ChertDatabase();
242 : :
243 : : /// Get a postlist table cursor (used by ChertValueList).
244 : 2688 : ChertCursor * get_postlist_cursor() const {
245 : 2688 : return postlist_table.cursor_get();
246 : : }
247 : :
248 : : /** Virtual methods of Database::Internal. */
249 : : //@{
250 : : Xapian::doccount get_doccount() const;
251 : : Xapian::docid get_lastdocid() const;
252 : : totlen_t get_total_length() const;
253 : : Xapian::doclength get_avlength() const;
254 : : Xapian::termcount get_doclength(Xapian::docid did) const;
255 : : Xapian::doccount get_termfreq(const string & tname) const;
256 : : Xapian::termcount get_collection_freq(const string & tname) const;
257 : : Xapian::doccount get_value_freq(Xapian::valueno valno) const;
258 : : std::string get_value_lower_bound(Xapian::valueno valno) const;
259 : : std::string get_value_upper_bound(Xapian::valueno valno) const;
260 : : Xapian::termcount get_doclength_lower_bound() const;
261 : : Xapian::termcount get_doclength_upper_bound() const;
262 : : Xapian::termcount get_wdf_upper_bound(const string & term) const;
263 : : bool term_exists(const string & tname) const;
264 : : bool has_positions() const;
265 : :
266 : : LeafPostList * open_post_list(const string & tname) const;
267 : : ValueList * open_value_list(Xapian::valueno slot) const;
268 : : Xapian::Document::Internal * open_document(Xapian::docid did, bool lazy) const;
269 : :
270 : : PositionList * open_position_list(Xapian::docid did, const string & term) const;
271 : : TermList * open_term_list(Xapian::docid did) const;
272 : : TermList * open_allterms(const string & prefix) const;
273 : :
274 : : TermList * open_spelling_termlist(const string & word) const;
275 : : TermList * open_spelling_wordlist() const;
276 : : Xapian::doccount get_spelling_frequency(const string & word) const;
277 : :
278 : : TermList * open_synonym_termlist(const string & term) const;
279 : : TermList * open_synonym_keylist(const string & prefix) const;
280 : :
281 : : string get_metadata(const string & key) const;
282 : : TermList * open_metadata_keylist(const std::string &prefix) const;
283 : : void write_changesets_to_fd(int fd,
284 : : const string & start_revision,
285 : : bool need_whole_db,
286 : : Xapian::ReplicationInfo * info);
287 : : string get_revision_info() const;
288 : : string get_uuid() const;
289 : : //@}
290 : :
291 : : };
292 : :
293 : : /** A writable chert database.
294 : : */
295 : : class ChertWritableDatabase : public ChertDatabase {
296 : : /** Unflushed changes to term frequencies and collection frequencies. */
297 : : mutable map<string, pair<Xapian::termcount_diff, Xapian::termcount_diff> >
298 : : freq_deltas;
299 : :
300 : : /** Document lengths of new and modified documents which haven't been flushed yet. */
301 : : mutable map<Xapian::docid, Xapian::termcount> doclens;
302 : :
303 : : /// Modifications to posting lists.
304 : : mutable map<string, map<Xapian::docid,
305 : : pair<char, Xapian::termcount> > > mod_plists;
306 : :
307 : : mutable map<Xapian::valueno, ValueStats> value_stats; ///< ValueStats keyed by value slot number (presumably changes not yet flushed - confirm).
308 : :
309 : : /** The number of documents added, deleted, or replaced since the last
310 : : * flush.
311 : : */
312 : : mutable Xapian::doccount change_count;
313 : :
314 : : /// If change_count reaches this threshold we automatically flush.
315 : : Xapian::doccount flush_threshold;
316 : :
317 : : /** A pointer to the last document which was returned by
318 : : * open_document(), or NULL if there is no such valid document. This
319 : : * is used purely for comparing with a supplied document to help with
320 : : * optimising replace_document. When the document internals are
321 : : * deleted, this pointer gets set to NULL.
322 : : */
323 : : mutable Xapian::Document::Internal * modify_shortcut_document;
324 : :
325 : : /** The document ID for the last document returned by open_document().
326 : : */
327 : : mutable Xapian::docid modify_shortcut_docid;
328 : :
329 : : /// Flush any unflushed postlist changes, but don't commit them.
330 : : void flush_postlist_changes() const;
331 : :
332 : : /// Close all the tables permanently.
333 : : void close();
334 : :
335 : : /// Apply changes.
336 : : void apply();
337 : :
338 : : /** Add or modify an entry in freq_deltas.
339 : : *
340 : : * @param tname The term to modify the entry for.
341 : : * @param tf_delta The change in the term frequency delta.
342 : : * @param cf_delta The change in the collection frequency delta.
343 : : */
344 : : void add_freq_delta(const string & tname,
345 : : Xapian::termcount_diff tf_delta,
346 : : Xapian::termcount_diff cf_delta);
347 : :
348 : : /** Insert modifications for a new document to the postlists.
349 : : *
350 : : * @param did The document ID to insert the entry for.
351 : : * @param tname The term to insert the entry for.
352 : : * @param wdf The new wdf value to store.
353 : : */
354 : : void insert_mod_plist(Xapian::docid did,
355 : : const string & tname,
356 : : Xapian::termcount wdf);
357 : :
358 : : /** Update the stored modifications to the postlists.
359 : : *
360 : : * @param did The document ID to modify the entry for.
361 : : * @param tname The term to modify the entry for.
362 : : * @param type The type of change to the postlist.
363 : : * @param wdf The new wdf value to store.
364 : : *
365 : : * If type is 'A', and an existing entry is in the stored
366 : : * modifications, the stored type will be set to 'M'. In all other
367 : : * cases, the stored type is simply the value supplied.
368 : : */
369 : : void update_mod_plist(Xapian::docid did,
370 : : const string & tname,
371 : : char type,
372 : : Xapian::termcount wdf);
373 : :
374 : : //@{
375 : : /** Implementation of virtual methods: see Database::Internal for
376 : : * details.
377 : : */
378 : : void commit();
379 : :
380 : : /** Cancel pending modifications to the database. */
381 : : void cancel();
382 : :
383 : : Xapian::docid add_document(const Xapian::Document & document);
384 : : Xapian::docid add_document_(Xapian::docid did, const Xapian::Document & document);
385 : : // Stop the default implementation of delete_document(term) and
386 : : // replace_document(term) from being hidden. This isn't really
387 : : // a problem as we only try to call them through the base class
388 : : // (where they aren't hidden) but some compilers generate a warning
389 : : // about the hiding.
390 : : #ifndef _MSC_VER
391 : : using Xapian::Database::Internal::delete_document;
392 : : using Xapian::Database::Internal::replace_document;
393 : : #endif
394 : : void delete_document(Xapian::docid did);
395 : : void replace_document(Xapian::docid did, const Xapian::Document & document);
396 : :
397 : : Xapian::Document::Internal * open_document(Xapian::docid did,
398 : : bool lazy) const;
399 : :
400 : : //@}
401 : :
402 : : public:
403 : : /** Create and open a writable chert database.
404 : : *
405 : : * @exception Xapian::DatabaseOpeningError thrown if database can't
406 : : * be opened.
407 : : *
408 : : * @exception Xapian::DatabaseVersionError thrown if database is in an
409 : : * unsupported format. This implies that the database was
410 : : * created by an older or newer version of Xapian.
411 : : * NOTE(review): action and block_size are undocumented here; block_size is int but ChertDatabase's constructor takes unsigned int - confirm this is intended.
412 : : * @param dir directory holding chert tables
413 : : */
414 : : ChertWritableDatabase(const string &dir, int action, int block_size);
415 : :
416 : : ~ChertWritableDatabase();
417 : :
418 : : /** Virtual methods of Database::Internal. */
419 : : //@{
420 : : Xapian::termcount get_doclength(Xapian::docid did) const;
421 : : Xapian::doccount get_termfreq(const string & tname) const;
422 : : Xapian::termcount get_collection_freq(const string & tname) const;
423 : : Xapian::doccount get_value_freq(Xapian::valueno valno) const;
424 : : std::string get_value_lower_bound(Xapian::valueno valno) const;
425 : : std::string get_value_upper_bound(Xapian::valueno valno) const;
426 : : bool term_exists(const string & tname) const;
427 : :
428 : : LeafPostList * open_post_list(const string & tname) const;
429 : : ValueList * open_value_list(Xapian::valueno slot) const;
430 : : TermList * open_allterms(const string & prefix) const;
431 : :
432 : : void add_spelling(const string & word, Xapian::termcount freqinc) const;
433 : : void remove_spelling(const string & word, Xapian::termcount freqdec) const;
434 : : TermList * open_spelling_wordlist() const;
435 : :
436 : : TermList * open_synonym_keylist(const string & prefix) const;
437 : : void add_synonym(const string & word, const string & synonym) const;
438 : : void remove_synonym(const string & word, const string & synonym) const;
439 : : void clear_synonyms(const string & word) const;
440 : :
441 : : void set_metadata(const string & key, const string & value);
442 : : void invalidate_doc_object(Xapian::Document::Internal * obj) const;
443 : : //@}
444 : : };
445 : :
446 : : #endif /* OM_HGUARD_CHERT_DATABASE_H */
|