Branch data Line data Source code
1 : : /** @file chert_termlisttable.cc
2 : : * @brief Subclass of ChertTable which holds termlists.
3 : : */
4 : : /* Copyright (C) 2007,2008 Olly Betts
5 : : *
6 : : * This program is free software; you can redistribute it and/or modify
7 : : * it under the terms of the GNU General Public License as published by
8 : : * the Free Software Foundation; either version 2 of the License, or
9 : : * (at your option) any later version.
10 : : *
11 : : * This program is distributed in the hope that it will be useful,
12 : : * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 : : * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 : : * GNU General Public License for more details.
15 : : *
16 : : * You should have received a copy of the GNU General Public License
17 : : * along with this program; if not, write to the Free Software
18 : : * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 : : */
20 : :
21 : : #include <config.h>
22 : :
23 : : #include "chert_termlisttable.h"
24 : :
25 : : #include <xapian/document.h>
26 : : #include <xapian/error.h>
27 : : #include <xapian/termiterator.h>
28 : :
29 : : #include "debuglog.h"
30 : : #include "omassert.h"
31 : : #include "pack.h"
32 : : #include "stringutils.h"
33 : : #include "utils.h"
34 : :
35 : : #include <string>
36 : :
37 : : using namespace std;
38 : :
39 : : void
40 : 77389 : ChertTermListTable::set_termlist(Xapian::docid did,
41 : : const Xapian::Document & doc,
42 : : chert_doclen_t doclen)
43 : : {
44 : : LOGCALL_VOID(DB, "ChertTermListTable::set_termlist", did | doc | doclen);
45 : :
46 : 77389 : string tag;
47 : 77389 : pack_uint(tag, doclen);
48 : :
49 : 77389 : Xapian::doccount termlist_size = doc.termlist_count();
50 [ + + ]: 77389 : if (termlist_size == 0) {
51 : : // doclen is sum(wdf) so should be zero if there are no terms.
52 : : Assert(doclen == 0);
53 : : Assert(doc.termlist_begin() == doc.termlist_end());
54 : 42295 : add(make_key(did), string());
55 : : return;
56 : : }
57 : :
58 : 35094 : Xapian::TermIterator t = doc.termlist_begin();
59 [ + - ]: 35094 : if (t != doc.termlist_end()) {
60 : 35094 : pack_uint(tag, termlist_size);
61 : 35094 : string prev_term = *t;
62 : :
63 : 35094 : tag += prev_term.size();
64 : 35094 : tag += prev_term;
65 : 35094 : pack_uint(tag, t.get_wdf());
66 : 35094 : --termlist_size;
67 : :
68 [ + + ]: 777096 : while (++t != doc.termlist_end()) {
69 : 742002 : const string & term = *t;
70 : : // If there's a shared prefix with the previous term, we don't
71 : : // store it explicitly, but just store the length of the shared
72 : : // prefix. In general, this is a big win.
73 : 742002 : size_t reuse = common_prefix_length(prev_term, term);
74 : :
75 : : // reuse must be <= prev_term.size(), and we know that value while
76 : : // decoding. So if the wdf is small enough that we can multiply it
77 : : // by (prev_term.size() + 1), add reuse and fit the result in a
78 : : // byte, then we can pack reuse and the wdf into a single byte and
79 : : // save ourselves a byte. We actually need to add one to the wdf
80 : : // before multiplying so that a wdf of 0 can be detected by the
81 : : // decoder.
82 : 742002 : size_t packed = 0;
83 : 742002 : Xapian::termcount wdf = t.get_wdf();
84 : : // If wdf >= 128, then we aren't going to be able to pack it in so
85 : : // don't even try to avoid the calculation overflowing and making
86 : : // us think we can.
87 [ + + ]: 742002 : if (wdf < 127)
88 : 741996 : packed = (wdf + 1) * (prev_term.size() + 1) + reuse;
89 : :
90 [ + + ][ + + ]: 1483995 : if (packed && packed < 256) {
91 : : // We can pack the wdf into the same byte.
92 : 741993 : tag += char(packed);
93 : 741993 : tag += char(term.size() - reuse);
94 : 741993 : tag.append(term.data() + reuse, term.size() - reuse);
95 : : } else {
96 : 9 : tag += char(reuse);
97 : 9 : tag += char(term.size() - reuse);
98 : 9 : tag.append(term.data() + reuse, term.size() - reuse);
99 : : // FIXME: pack wdf after reuse next time we rejig the format
100 : : // incompatibly.
101 : 9 : pack_uint(tag, wdf);
102 : : }
103 : :
104 : 742002 : prev_term = *t;
105 : 742002 : --termlist_size;
106 : 35094 : }
107 : : }
108 : : Assert(termlist_size == 0);
109 : 77389 : add(make_key(did), tag);
110 : : }
|