Branch data Line data Source code
1 : : /** @file flint_termlisttable.cc
2 : : * @brief Subclass of FlintTable which holds termlists.
3 : : */
4 : : /* Copyright (C) 2007,2008 Olly Betts
5 : : *
6 : : * This program is free software; you can redistribute it and/or modify
7 : : * it under the terms of the GNU General Public License as published by
8 : : * the Free Software Foundation; either version 2 of the License, or
9 : : * (at your option) any later version.
10 : : *
11 : : * This program is distributed in the hope that it will be useful,
12 : : * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 : : * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 : : * GNU General Public License for more details.
15 : : *
16 : : * You should have received a copy of the GNU General Public License
17 : : * along with this program; if not, write to the Free Software
18 : : * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 : : */
20 : :
21 : : #include <config.h>
22 : :
23 : : #include <xapian/document.h>
24 : : #include <xapian/error.h>
25 : : #include <xapian/termiterator.h>
26 : :
27 : : #include "flint_termlisttable.h"
28 : : #include "flint_utils.h"
29 : : #include "debuglog.h"
30 : : #include "omassert.h"
31 : : #include "str.h"
32 : : #include "stringutils.h"
33 : :
34 : : #include <string>
35 : :
36 : : using namespace std;
37 : :
38 : : void
39 : 47367 : FlintTermListTable::set_termlist(Xapian::docid did,
40 : : const Xapian::Document & doc,
41 : : flint_doclen_t doclen)
42 : : {
43 : : LOGCALL_VOID(DB, "FlintTermListTable::set_termlist", did | doc | doclen);
44 : :
45 : 47367 : Xapian::doccount termlist_size = doc.termlist_count();
46 [ + + ]: 47367 : if (termlist_size == 0) {
47 : : // doclen is sum(wdf) so should be zero if there are no terms.
48 : : Assert(doclen == 0);
49 : : Assert(doc.termlist_begin() == doc.termlist_end());
50 : 12275 : add(flint_docid_to_key(did), string());
51 : 12275 : return;
52 : : }
53 : :
54 : 35092 : string tag = F_pack_uint(doclen);
55 : :
56 : 35092 : Xapian::TermIterator t = doc.termlist_begin();
57 [ + - ]: 35092 : if (t != doc.termlist_end()) {
58 : 35092 : tag += F_pack_uint(termlist_size);
59 : 35092 : string prev_term = *t;
60 : :
61 : : // Previous database versions encoded a boolean here, which was
62 : : // always false (and F_pack_bool() encodes false as a '0'). We can
63 : : // just omit this and successfully read old and new termlists
64 : : // except in the case where the next byte is a '0' - in this case
65 : : // we need keep the '0' so that the decoder can just skip any '0'
66 : : // it sees in this position (this shouldn't be a common case - 48
67 : : // character terms aren't very common, and the first term
68 : : // alphabetically is likely to be shorter than average).
69 [ + + ]: 35092 : if (prev_term.size() == '0') tag += '0';
70 : :
71 : 35092 : tag += char(prev_term.size());
72 : 35092 : tag += prev_term;
73 : 35092 : tag += F_pack_uint(t.get_wdf());
74 : 35092 : --termlist_size;
75 : :
76 [ + + ]: 777086 : while (++t != doc.termlist_end()) {
77 : 741994 : const string & term = *t;
78 : : // If there's a shared prefix with the previous term, we don't
79 : : // store it explicitly, but just store the length of the shared
80 : : // prefix. In general, this is a big win.
81 : 741994 : size_t reuse = common_prefix_length(prev_term, term);
82 : :
83 : : // reuse must be <= prev_term.size(), and we know that value while
84 : : // decoding. So if the wdf is small enough that we can multiply it
85 : : // by (prev_term.size() + 1), add reuse and fit the result in a
86 : : // byte, then we can pack reuse and the wdf into a single byte and
87 : : // save ourselves a byte. We actually need to add one to the wdf
88 : : // before multiplying so that a wdf of 0 can be detected by the
89 : : // decoder.
90 : 741994 : size_t packed = 0;
91 : 741994 : Xapian::termcount wdf = t.get_wdf();
92 : : // If wdf >= 128, then we aren't going to be able to pack it in so
93 : : // don't even try to avoid the calculation overflowing and making
94 : : // us think we can.
95 [ + + ]: 741994 : if (wdf < 127)
96 : 741988 : packed = (wdf + 1) * (prev_term.size() + 1) + reuse;
97 : :
98 [ + + ][ + + ]: 1483979 : if (packed && packed < 256) {
99 : : // We can pack the wdf into the same byte.
100 : 741985 : tag += char(packed);
101 : 741985 : tag += char(term.size() - reuse);
102 : 741985 : tag.append(term.data() + reuse, term.size() - reuse);
103 : : } else {
104 : 9 : tag += char(reuse);
105 : 9 : tag += char(term.size() - reuse);
106 : 9 : tag.append(term.data() + reuse, term.size() - reuse);
107 : : // FIXME: pack wdf after reuse next time we rejig the format
108 : : // incompatibly.
109 : 9 : tag += F_pack_uint(wdf);
110 : : }
111 : :
112 : 741994 : prev_term = *t;
113 : 741994 : --termlist_size;
114 : 35092 : }
115 : : }
116 : : Assert(termlist_size == 0);
117 : 47367 : add(flint_docid_to_key(did), tag);
118 : : }
119 : :
120 : : flint_doclen_t
121 : 26004 : FlintTermListTable::get_doclength(Xapian::docid did) const
122 : : {
123 : : LOGCALL(DB, flint_doclen_t, "FlintTermListTable::get_doclength", did);
124 : :
125 : 26004 : string tag;
126 [ + + ]: 26004 : if (!get_exact_entry(flint_docid_to_key(did), tag))
127 : : throw Xapian::DocNotFoundError("No termlist found for document " +
128 : 15039 : str(did));
129 : :
130 [ + + ]: 10965 : if (tag.empty()) RETURN(0);
131 : :
132 : 10946 : const char * pos = tag.data();
133 : 10946 : const char * end = pos + tag.size();
134 : :
135 : : flint_doclen_t doclen;
136 [ - + ]: 10946 : if (!F_unpack_uint(&pos, end, &doclen)) {
137 : : const char *msg;
138 [ # # ]: 0 : if (pos == 0) {
139 : 0 : msg = "Too little data for doclen in termlist";
140 : : } else {
141 : 0 : msg = "Overflowed value for doclen in termlist";
142 : : }
143 : 0 : throw Xapian::DatabaseCorruptError(msg);
144 : : }
145 : :
146 : 26004 : RETURN(doclen);
147 : : }
|