Branch data Line data Source code
1 : : /** @file api_opsynonym.cc
2 : : * @brief tests of OP_SYNONYM.
3 : : */
4 : : /* Copyright 2009 Olly Betts
5 : : * Copyright 2007,2008,2009 Lemur Consulting Ltd
6 : : *
7 : : * This program is free software; you can redistribute it and/or
8 : : * modify it under the terms of the GNU General Public License as
9 : : * published by the Free Software Foundation; either version 2 of the
10 : : * License, or (at your option) any later version.
11 : : *
12 : : * This program is distributed in the hope that it will be useful,
13 : : * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 : : * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 : : * GNU General Public License for more details.
16 : : *
17 : : * You should have received a copy of the GNU General Public License
18 : : * along with this program; if not, write to the Free Software
19 : : * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
20 : : * USA
21 : : */
22 : :
23 : : #include <config.h>
24 : :
25 : : #include "api_opsynonym.h"
26 : :
27 : : #include <map>
28 : : #include <set>
29 : : #include <vector>
30 : :
31 : : #include <xapian.h>
32 : :
33 : : #include "backendmanager.h"
34 : : #include "testsuite.h"
35 : : #include "testutils.h"
36 : :
37 : : #include "apitest.h"
38 : :
39 : : using namespace std;
40 : :
41 : : // #######################################################################
42 : : // # Tests start here
43 : :
44 : : // Check a synonym search: each group of subqueries is run joined with OP_OR and joined with OP_SYNONYM, and the two msets are compared.
45 : 13 : DEFINE_TESTCASE(synonym1, backend) {
46 : 13 : Xapian::Database db(get_database("etext"));
47 : :
48 [ - + # # ]: 13 : TEST_REL(db.get_doclength_upper_bound(), >, 0);
49 : :
50 : 13 : Xapian::doccount lots = 214; // Maximum number of results requested by the full searches below.
51 : :
52 : : // Make a list of lists of subqueries, which are going to be joined
53 : : // together as a synonym.
54 : 13 : vector<vector<Xapian::Query> > subqueries_list;
55 : :
56 : : // For each set of subqueries, keep the number of results for which the
57 : : // weight should be the same when combined with OP_SYNONYM as when
58 : : // combined with OP_OR, and the number for which it should be different.
59 : 13 : vector<int> subqueries_sameweight_count;
60 : 13 : vector<int> subqueries_diffweight_count;
61 : :
62 : 13 : vector<Xapian::Query> subqueries;
63 : 13 : subqueries.push_back(Xapian::Query("date"));
64 : 13 : subqueries_list.push_back(subqueries);
65 : : // Single term - all 33 results should be same weight.
66 : 13 : subqueries_sameweight_count.push_back(33);
67 : 13 : subqueries_diffweight_count.push_back(0);
68 : :
69 : : // Two terms, which co-occur in some documents.
70 : 13 : subqueries.clear();
71 : 13 : subqueries.push_back(Xapian::Query("sky"));
72 : 13 : subqueries.push_back(Xapian::Query("date"));
73 : 13 : subqueries_list.push_back(subqueries);
74 : : // All 34 results should be different.
75 : 13 : subqueries_sameweight_count.push_back(0);
76 : 13 : subqueries_diffweight_count.push_back(34);
77 : :
78 : : // Two terms which are entirely disjoint, and where the maximum weight
79 : : // doesn't occur in the first or second match.
80 : 13 : subqueries.clear();
81 : 13 : subqueries.push_back(Xapian::Query("gutenberg"));
82 : 13 : subqueries.push_back(Xapian::Query("blockhead"));
83 : 13 : subqueries_list.push_back(subqueries);
84 : : // All 18 results should be different.
85 : 13 : subqueries_sameweight_count.push_back(0);
86 : 13 : subqueries_diffweight_count.push_back(18);
87 : :
88 : 13 : subqueries.clear();
89 : 13 : subqueries.push_back(Xapian::Query("date"));
90 : : subqueries.push_back(Xapian::Query(Xapian::Query::OP_OR,
91 : : Xapian::Query("sky"),
92 : 13 : Xapian::Query("glove")));
93 : 13 : subqueries_list.push_back(subqueries);
94 : : // All 34 results should be different.
95 : 13 : subqueries_sameweight_count.push_back(0);
96 : 13 : subqueries_diffweight_count.push_back(34);
97 : :
98 : 13 : subqueries.clear();
99 : 13 : subqueries.push_back(Xapian::Query("date"));
100 : : subqueries.push_back(Xapian::Query(Xapian::Query::OP_OR,
101 : : Xapian::Query("sky"),
102 : 13 : Xapian::Query("date")));
103 : 13 : subqueries_list.push_back(subqueries);
104 : : // All 34 results should be different.
105 : 13 : subqueries_sameweight_count.push_back(0);
106 : 13 : subqueries_diffweight_count.push_back(34);
107 : :
108 : 13 : subqueries.clear();
109 : 13 : subqueries.push_back(Xapian::Query("date"));
110 : : subqueries.push_back(Xapian::Query(Xapian::Query::OP_AND_MAYBE,
111 : : Xapian::Query("sky"),
112 : 13 : Xapian::Query("date")));
113 : 13 : subqueries_list.push_back(subqueries);
114 : : // All 34 results should be different.
115 : 13 : subqueries_sameweight_count.push_back(0);
116 : 13 : subqueries_diffweight_count.push_back(34);
117 : :
118 : 13 : subqueries.clear();
119 : 13 : subqueries.push_back(Xapian::Query("date"));
120 : : subqueries.push_back(Xapian::Query(Xapian::Query::OP_AND_NOT,
121 : : Xapian::Query("sky"),
122 : 13 : Xapian::Query("date")));
123 : 13 : subqueries_list.push_back(subqueries);
124 : : // All 34 results should be different.
125 : 13 : subqueries_sameweight_count.push_back(0);
126 : 13 : subqueries_diffweight_count.push_back(34);
127 : :
128 : 13 : subqueries.clear();
129 : 13 : subqueries.push_back(Xapian::Query("date"));
130 : : subqueries.push_back(Xapian::Query(Xapian::Query::OP_AND,
131 : : Xapian::Query("sky"),
132 : 13 : Xapian::Query("date")));
133 : 13 : subqueries_list.push_back(subqueries);
134 : : // The AND only matches 1 document, so the estimated termfreq for the whole
135 : : // synonym works out as 33 (due to rounding), which is the same as the
136 : : // termfreq for "date". Therefore most of the weights are the same as just
137 : : // for the pure "date" search, and the only document which gets a different
138 : : // weight is the one also matched by "sky" (because it has a wdf boost).
139 : 13 : subqueries_sameweight_count.push_back(32);
140 : 13 : subqueries_diffweight_count.push_back(1);
141 : :
142 : 13 : subqueries.clear();
143 : 13 : subqueries.push_back(Xapian::Query("date"));
144 : : subqueries.push_back(Xapian::Query(Xapian::Query::OP_XOR,
145 : : Xapian::Query("sky"),
146 : 13 : Xapian::Query("date")));
147 : 13 : subqueries_list.push_back(subqueries);
148 : : // All 34 results should be different.
149 : 13 : subqueries_sameweight_count.push_back(0);
150 : 13 : subqueries_diffweight_count.push_back(34);
151 : :
152 : 13 : subqueries.clear();
153 : 13 : subqueries.push_back(Xapian::Query("date"));
154 : : subqueries.push_back(Xapian::Query(Xapian::Query::OP_SYNONYM,
155 : : Xapian::Query("sky"),
156 : 13 : Xapian::Query("date")));
157 : 13 : subqueries_list.push_back(subqueries);
158 : : // When the top-level operator is OR, the synonym part has an estimated
159 : : // termfreq of 35. When the top-level operator is SYNONYM, the whole query
160 : : // has an estimated termfreq of 35, and is in fact the same as the synonym
161 : : // part in the OR query, except that the wqf of "date" is 2. We're
162 : : // currently not using the wqfs of components of synonyms, so this
163 : : // difference has no effect on the weightings. Therefore, for the 1
164 : : // document which does not contain "date", we get the same result with
165 : : // SYNONYM as with OR.
166 : 13 : subqueries_sameweight_count.push_back(1);
167 : 13 : subqueries_diffweight_count.push_back(33);
168 : :
169 : 13 : subqueries.clear();
170 : 13 : subqueries.push_back(Xapian::Query("sky"));
171 : 13 : subqueries.push_back(Xapian::Query("date"));
172 : 13 : subqueries.push_back(Xapian::Query("stein"));
173 : 13 : subqueries.push_back(Xapian::Query("ally"));
174 : 13 : subqueries_list.push_back(subqueries);
175 : : // All 35 results should be different.
176 : 13 : subqueries_sameweight_count.push_back(0);
177 : 13 : subqueries_diffweight_count.push_back(35);
178 : :
179 : 13 : subqueries.clear();
180 : 13 : subqueries.push_back(Xapian::Query("attitud"));
181 : : subqueries.push_back(Xapian::Query(Xapian::Query::OP_PHRASE,
182 : : Xapian::Query("german"),
183 : 13 : Xapian::Query("adventur")));
184 : 13 : subqueries_list.push_back(subqueries);
185 : : // The estimated term frequency for the synonym is 2 (because the estimate
186 : : // for the phrase is 0), which is the same as the term frequency of
187 : : // "attitud". Thus, the synonym gets the same weight as "attitud", so
188 : : // documents with only "attitud" (but not the phrase) in them get the same
189 : : // wdf, and have the same total weight. There turns out to be exactly one
190 : : // such document.
191 : 13 : subqueries_sameweight_count.push_back(1);
192 : 13 : subqueries_diffweight_count.push_back(3);
193 : :
194 : 13 : subqueries.clear();
195 : 13 : subqueries.push_back(Xapian::Query("attitud"));
196 : : subqueries.push_back(Xapian::Query(Xapian::Query::OP_OR,
197 : : Xapian::Query("german"),
198 : : Xapian::Query(Xapian::Query::OP_SYNONYM,
199 : : Xapian::Query("sky"),
200 : 13 : Xapian::Query("date"))));
201 : 13 : subqueries_list.push_back(subqueries);
202 : : // All 54 results are different.
203 : 13 : subqueries_sameweight_count.push_back(0);
204 : 13 : subqueries_diffweight_count.push_back(54);
205 : :
206 [ + + ]: 182 : for (vector<vector<Xapian::Query> >::size_type subqgroup = 0;
207 : : subqgroup != subqueries_list.size(); ++subqgroup)
208 : : {
209 : 169 : vector<Xapian::Query> * qlist = &(subqueries_list[subqgroup]);
210 : : // Run two queries, one joining the subqueries with OR and one joining
211 : : // them with SYNONYM.
212 : 169 : Xapian::Enquire enquire(db);
213 : :
214 : : // Do the search with OR
215 : 169 : Xapian::Query orquery(Xapian::Query::OP_OR, qlist->begin(), qlist->end());
216 : 169 : enquire.set_query(orquery);
217 : 169 : Xapian::MSet ormset = enquire.get_mset(0, lots);
218 : :
219 : : // Do the search with synonym, getting all the results.
220 : 169 : Xapian::Query synquery(Xapian::Query::OP_SYNONYM, qlist->begin(), qlist->end());
221 : 169 : enquire.set_query(synquery);
222 : 169 : Xapian::MSet synmset = enquire.get_mset(0, lots);
223 : :
224 : 169 : tout << "Comparing " << orquery << " with " << synquery << '\n';
225 : :
226 : : // Check that the queries return some results.
227 [ - + # # ]: 169 : TEST_NOT_EQUAL(synmset.size(), 0);
228 : : // Check that the queries return the same number of results.
229 [ - + ][ # # ]: 169 : TEST_EQUAL(synmset.size(), ormset.size());
230 : 169 : map<Xapian::docid, Xapian::weight> values_or;
231 : 169 : map<Xapian::docid, Xapian::weight> values_synonym;
232 [ + + ]: 5564 : for (Xapian::doccount i = 0; i < synmset.size(); ++i) {
233 : 5395 : values_or[*ormset[i]] = ormset[i].get_weight();
234 : 5395 : values_synonym[*synmset[i]] = synmset[i].get_weight();
235 : : }
236 [ - + ][ # # ]: 169 : TEST_EQUAL(values_or.size(), values_synonym.size());
237 : :
238 : : /* Count how many documents have the same weight in the "or" mset and
239 : : * in the "synonym" mset, and how many have a different weight. */
240 : 169 : int same_weight = 0;
241 : 169 : int different_weight = 0;
242 [ + + ]: 5564 : for (map<Xapian::docid, Xapian::weight>::const_iterator
243 : 169 : j = values_or.begin(); j != values_or.end(); ++j) {
244 : 5395 : Xapian::docid did = j->first;
245 : : // Check that all the results in the or tree make it to the synonym
246 : : // tree.
247 [ - + # # ]: 5395 : TEST(values_synonym.find(did) != values_synonym.end());
248 [ + + ]: 5395 : if (values_or[did] == values_synonym[did]) {
249 : 871 : ++same_weight;
250 : : } else {
251 : 4524 : ++different_weight;
252 : : }
253 : : }
254 : :
255 : 169 : int expected_same = subqueries_sameweight_count[subqgroup];
256 : 169 : int expected_diff = subqueries_diffweight_count[subqgroup];
257 : :
258 [ - + # # ]: 169 : TEST_EQUAL(different_weight, expected_diff);
259 [ - + ][ # # ]: 169 : TEST_EQUAL(same_weight, expected_same);
260 : :
261 : : // Do the search with synonym, but just get the top result.
262 : : // (Regression test - the OR subquery in the synonym postlist tree used
263 : : // to shortcut incorrectly, and return the wrong result here).
264 : 169 : Xapian::MSet mset_top = enquire.get_mset(0, 1);
265 [ - + # # ]: 169 : TEST_EQUAL(mset_top.size(), 1);
266 [ - + ][ # # ]: 169 : TEST(mset_range_is_same(mset_top, 0, synmset, 0, 1));
267 : : }
268 : 13 : return true;
269 : : }
270 : :
271 : : // Regression test - test a synonym search with a MultiAndPostlist (an OP_AND of four terms used as one branch of an OP_SYNONYM).
272 : 13 : DEFINE_TESTCASE(synonym2, backend) {
273 : 13 : Xapian::Query query;
274 : 13 : vector<Xapian::Query> subqueries;
275 : 13 : subqueries.push_back(Xapian::Query("file"));
276 : 13 : subqueries.push_back(Xapian::Query("the"));
277 : 13 : subqueries.push_back(Xapian::Query("next"));
278 : 13 : subqueries.push_back(Xapian::Query("reader"));
279 : 13 : query = Xapian::Query(Xapian::Query::OP_AND, subqueries.begin(), subqueries.end());
280 : 13 : subqueries.clear();
281 : 13 : subqueries.push_back(query);
282 : 13 : subqueries.push_back(Xapian::Query("gutenberg"));
283 : 13 : query = Xapian::Query(Xapian::Query::OP_SYNONYM, subqueries.begin(), subqueries.end());
284 : :
285 : 13 : tout << query << '\n';
286 : :
287 : 13 : Xapian::Database db(get_database("etext"));
288 : 13 : Xapian::Enquire enquire(db);
289 : 13 : enquire.set_query(query);
290 : 13 : Xapian::MSet mset = enquire.get_mset(0, 10);
291 : 13 : tout << mset << '\n';
292 : :
293 : : // Regression test that OP_SCALE_WEIGHT works with OP_SYNONYM: scaling the query by 10 should scale the maximum possible weight by 10.
294 : 13 : double maxposs = mset.get_max_possible();
295 : 13 : query = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, query, 10.0);
296 : 13 : enquire.set_query(query);
297 : 13 : mset = enquire.get_mset(0, 10);
298 : 13 : double maxposs2 = mset.get_max_possible();
299 : :
300 [ - + # # ]: 13 : TEST_EQUAL_DOUBLE(maxposs * 10.0, maxposs2);
301 : :
302 : 13 : return true;
303 : : }
304 : :
// Check that mset1 and mset2 have the same size and contain exactly the same
// set of document IDs (the order of the results is not compared).
305 : : static void
306 : 91 : check_msets_contain_same_docs(const Xapian::MSet & mset1,
307 : : const Xapian::MSet & mset2)
308 : : {
309 [ - + ][ # # ]: 91 : TEST_EQUAL(mset1.size(), mset2.size());
310 : :
311 : 91 : set<Xapian::docid> docids;
312 [ + + ]: 2886 : for (Xapian::doccount i = 0; i < mset1.size(); ++i) {
313 : 2795 : docids.insert(*mset1[i]);
314 : : }
315 : :
316 : : // Check that every result in mset2 is also in mset1.
317 [ + + ]: 2886 : for (Xapian::doccount j = 0; j < mset2.size(); ++j) {
318 : : // Check that we can erase the docid of each mset2 entry from the set
319 : : // built from mset1. Since mset1 and mset2 are the same size this means we can be sure that there
320 : : // were no repeated docids in either (it would be a bug if there were).
321 [ - + ][ # # ]: 2795 : TEST(docids.erase(*mset2[j]));
322 : 91 : }
323 : 91 : }
324 : :
325 : : // Test a synonym search which has had its weight scaled to 0.
326 : 13 : DEFINE_TESTCASE(synonym3, backend) {
327 : : Xapian::Query query = Xapian::Query(Xapian::Query::OP_SYNONYM,
328 : : Xapian::Query("sky"),
329 : 13 : Xapian::Query("date"));
330 : :
331 : 13 : Xapian::Database db(get_database("etext"));
332 : 13 : Xapian::Enquire enquire(db);
333 : 13 : enquire.set_query(query);
334 : 13 : Xapian::MSet mset_orig = enquire.get_mset(0, db.get_doccount());
335 : :
336 : 13 : tout << query << '\n';
337 : 13 : tout << mset_orig << '\n';
338 : :
339 : : // Test that OP_SCALE_WEIGHT with a factor of 0.0 works with OP_SYNONYM
340 : : // (this has a special codepath to avoid doing the synonym calculation).
341 : 13 : query = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, query, 0.0);
342 : 13 : enquire.set_query(query);
343 : 13 : Xapian::MSet mset_zero = enquire.get_mset(0, db.get_doccount());
344 : :
345 : 13 : tout << query << '\n';
346 : 13 : tout << mset_zero << '\n';
347 : :
348 : : // Check that the queries return some results.
349 [ - + # # ]: 13 : TEST_NOT_EQUAL(mset_zero.size(), 0);
350 : : // Check that the queries return the same document IDs, that every match
351 : : // of the original query has a non-zero weight, and that the zero one has zero weight.
352 : 13 : check_msets_contain_same_docs(mset_orig, mset_zero);
353 [ + + ]: 455 : for (Xapian::doccount i = 0; i < mset_orig.size(); ++i) {
354 [ - + ][ # # ]: 442 : TEST_NOT_EQUAL(mset_orig[i].get_weight(), 0.0);
355 [ - + ][ # # ]: 442 : TEST_EQUAL(mset_zero[i].get_weight(), 0.0);
356 : : }
357 : :
358 : 13 : return true;
359 : : }
360 : :
361 : : // Test synonym searches combined with various operators: an OP_SYNONYM subquery should match the same documents as the equivalent OP_OR subquery.
362 : 13 : DEFINE_TESTCASE(synonym4, backend) {
363 : 13 : Xapian::Database db(get_database("etext"));
364 : 13 : Xapian::Enquire enquire(db);
365 : : Xapian::Query syn_query = Xapian::Query(Xapian::Query::OP_SYNONYM,
366 : : Xapian::Query("gutenberg"),
367 : 13 : Xapian::Query("blockhead"));
368 : : Xapian::Query or_query = Xapian::Query(Xapian::Query::OP_OR,
369 : : Xapian::Query("gutenberg"),
370 : 13 : Xapian::Query("blockhead"));
371 : 13 : Xapian::Query date_query = Xapian::Query("date");
372 : :
373 : : // Operators used to combine syn_query (and or_query) with date_query.
374 : : static const Xapian::Query::op operators[] = {
375 : : Xapian::Query::OP_AND_MAYBE,
376 : : Xapian::Query::OP_AND_NOT,
377 : : Xapian::Query::OP_AND,
378 : : Xapian::Query::OP_XOR,
379 : : Xapian::Query::OP_OR,
380 : : Xapian::Query::OP_SYNONYM
381 : : };
382 : : const Xapian::Query::op * end;
383 : 13 : end = operators + sizeof(operators) / sizeof(operators[0]);
384 [ + + ]: 91 : for (const Xapian::Query::op * i = operators; i != end; ++i) {
385 : 78 : tout.str(string());
386 : 78 : Xapian::Query query1(*i, syn_query, date_query);
387 : 78 : Xapian::Query query2(*i, or_query, date_query);
388 : :
389 : 78 : enquire.set_query(query1);
390 : 78 : tout << "query1:" << query1 << '\n';
391 : 78 : Xapian::MSet mset1 = enquire.get_mset(0, db.get_doccount());
392 : 78 : tout << "mset1:" << mset1 << '\n';
393 : 78 : enquire.set_query(query2);
394 : 78 : tout << "query2:" << query2 << '\n';
395 : 78 : Xapian::MSet mset2 = enquire.get_mset(0, db.get_doccount());
396 : 78 : tout << "mset2:" << mset2 << '\n';
397 : :
398 [ - + # # ]: 78 : TEST_NOT_EQUAL(mset1.size(), 0);
399 [ + + ]: 78 : if (*i != Xapian::Query::OP_XOR) { // Every operator except OP_XOR should give the top match 100%.
400 [ - + ][ # # ]: 65 : TEST_EQUAL(mset1[0].get_percent(), 100);
401 : : } else {
402 [ - + ][ # # ]: 13 : TEST(mset1[0].get_percent() != 100);
403 : : }
404 : 78 : check_msets_contain_same_docs(mset1, mset2);
405 : : }
406 : :
407 : 13 : return true;
408 : : }
|