Branch data Line data Source code
1 : : /** @file api_unicode.cc
2 : : * @brief Test the Unicode and UTF-8 classes and functions.
3 : : */
4 : : /* Copyright (C) 2006,2007,2008,2009,2010 Olly Betts
5 : : *
6 : : * This program is free software; you can redistribute it and/or modify
7 : : * it under the terms of the GNU General Public License as published by
8 : : * the Free Software Foundation; either version 2 of the License, or
9 : : * (at your option) any later version.
10 : : *
11 : : * This program is distributed in the hope that it will be useful,
12 : : * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 : : * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 : : * GNU General Public License for more details.
15 : : *
16 : : * You should have received a copy of the GNU General Public License
17 : : * along with this program; if not, write to the Free Software
18 : : * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 : : */
20 : :
21 : : #include <config.h>
22 : :
23 : : #include "api_unicode.h"
24 : :
25 : : #include <xapian.h>
26 : :
27 : : #include "apitest.h"
28 : : #include "testutils.h"
29 : :
30 : : #include <cctype>
31 : :
32 : : using namespace std;
33 : :
/// A pair of byte strings which are expected to decode to the same sequence
/// of code points (see utf8iterator1).
struct testcase {
    /// Input to decode; may contain bytes which aren't well-formed UTF-8.
    const char * a;
    /// Reference string which must decode identically to @a a.
    const char * b;
};
37 : :
// Fixtures for utf8iterator1: decoding `a` must give the same sequence of
// code points as decoding `b`.  In each pair, `b` spells every byte of `a`
// with the top bit set as the two-byte UTF-8 encoding of the code point with
// that byte's value (e.g. "\x80" <-> "\xc2\x80"); ASCII bytes are unchanged.
// The list is terminated by a { 0, 0 } sentinel.
38 : : static const testcase testcases[] = {
39 : : { "abcd", "abcd" }, // Sanity check!
// The literals below are split ("..." "...") so that the following 'b' isn't
// parsed as part of the hex escape.
40 : : { "a\x80""bcd", "a\xc2\x80""bcd" },
41 : : { "a\xa0", "a\xc2\xa0" },
42 : : { "a\xa0z", "a\xc2\xa0z" },
43 : : { "x\xc1yz", "x\xc3\x81yz" },
44 : : { "\xc2z", "\xc3\x82z" },
45 : : { "\xc2", "\xc3\x82" },
46 : : { "xy\xc3z", "xy\xc3\x83z" },
47 : : { "xy\xc3\xc3z", "xy\xc3\x83\xc3\x83z" },
48 : : { "xy\xc3\xc3", "xy\xc3\x83\xc3\x83" },
// Inputs starting with 0xe0:
49 : : { "\xe0", "\xc3\xa0" },
50 : : { "\xe0\x80", "\xc3\xa0\xc2\x80" },
51 : : { "\xe0\xc0", "\xc3\xa0\xc3\x80" },
52 : : { "\xe0\xc0z", "\xc3\xa0\xc3\x80z" },
53 : : { "\xe0\xc0zz", "\xc3\xa0\xc3\x80zz" },
54 : : { "\xe0\xc0\x81", "\xc3\xa0\xc3\x80\xc2\x81" },
55 : : { "\xe0\x82\xc1", "\xc3\xa0\xc2\x82\xc3\x81" },
56 : : { "\xe0\xc5\xc7", "\xc3\xa0\xc3\x85\xc3\x87" },
// Inputs starting with 0xf0:
57 : : { "\xf0", "\xc3\xb0" },
58 : : { "\xf0\x80", "\xc3\xb0\xc2\x80" },
59 : : { "\xf0\xc0", "\xc3\xb0\xc3\x80" },
60 : : { "\xf0\xc0z", "\xc3\xb0\xc3\x80z" },
61 : : { "\xf0\xc0zz", "\xc3\xb0\xc3\x80zz" },
62 : : { "\xf0\xc0\x81", "\xc3\xb0\xc3\x80\xc2\x81" },
63 : : { "\xf0\x82\xc1", "\xc3\xb0\xc2\x82\xc3\x81" },
64 : : { "\xf0\xc5\xc7", "\xc3\xb0\xc3\x85\xc3\x87" },
65 : : { "\xf0\xc0\x81\xc9", "\xc3\xb0\xc3\x80\xc2\x81\xc3\x89" },
66 : : { "\xf0\x82\xc1\xc8", "\xc3\xb0\xc2\x82\xc3\x81\xc3\x88" },
67 : : { "\xf0\xc5\xc7\xc6", "\xc3\xb0\xc3\x85\xc3\x87\xc3\x86" },
68 : : { "\xf0\xc0\x81\x89", "\xc3\xb0\xc3\x80\xc2\x81\xc2\x89" },
69 : : { "\xf0\x82\xc1\x88", "\xc3\xb0\xc2\x82\xc3\x81\xc2\x88" },
// NOTE(review): the next entry is identical to the one three entries above,
// so it adds no coverage - possibly a copy-and-paste slip; confirm which
// input was intended before changing it.
70 : : { "\xf0\xc5\xc7\xc6", "\xc3\xb0\xc3\x85\xc3\x87\xc3\x86" },
// 0xf4 followed by three bytes, one of which is the ASCII character 'P':
71 : : { "\xf4P\x80\x80", "\xc3\xb4P\xc2\x80\xc2\x80" },
72 : : { "\xf4\x80P\x80", "\xc3\xb4\xc2\x80P\xc2\x80" },
73 : : { "\xf4\x80\x80P", "\xc3\xb4\xc2\x80\xc2\x80P" },
74 : : { "\xfe\xffxyzzy", "\xc3\xbe\xc3\xbfxyzzy" },
75 : : // Overlong encodings:
76 : : { "\xc0\x80", "\xc3\x80\xc2\x80" },
77 : : { "\xc0\xbf", "\xc3\x80\xc2\xbf" },
78 : : { "\xc1\x80", "\xc3\x81\xc2\x80" },
79 : : { "\xc1\xbf", "\xc3\x81\xc2\xbf" },
80 : : { "\xe0\x80\x80", "\xc3\xa0\xc2\x80\xc2\x80" },
81 : : { "\xe0\x9f\xbf", "\xc3\xa0\xc2\x9f\xc2\xbf" },
82 : : { "\xf0\x80\x80\x80", "\xc3\xb0\xc2\x80\xc2\x80\xc2\x80" },
83 : : { "\xf0\x8f\xbf\xbf", "\xc3\xb0\xc2\x8f\xc2\xbf\xc2\xbf" },
84 : : // Above Unicode:
85 : : { "\xf4\x90\x80\x80", "\xc3\xb4\xc2\x90\xc2\x80\xc2\x80" },
// Sentinel: utf8iterator1 stops at the first entry whose `a` is null.
86 : : { 0, 0 }
87 : : };
88 : :
89 : : // Test handling of invalid UTF-8 is as desired.
90 : 1 : DEFINE_TESTCASE(utf8iterator1,!backend) {
91 : : const testcase * p;
92 [ + + ]: 46 : for (p = testcases; p->a; ++p) {
93 : 45 : tout.str(string());
94 : 45 : tout << '"' << p->a << "\" and \"" << p->b << '"' << endl;
95 : 45 : size_t a_len = strlen(p->a);
96 : 45 : Xapian::Utf8Iterator a(p->a, a_len);
97 : :
98 : 45 : size_t b_len = strlen(p->b);
99 : 45 : Xapian::Utf8Iterator b(p->b, b_len);
100 : :
101 [ + + ][ + - ]: 190 : while (a != Xapian::Utf8Iterator() && b != Xapian::Utf8Iterator()) {
[ + + ]
102 [ - + ][ # # ]: 145 : TEST_EQUAL(*a, *b);
103 : 145 : ++a;
104 : 145 : ++b;
105 : : }
106 : :
107 : : // Test that we don't reach the end of one before the other.
108 [ - + ][ # # ]: 45 : TEST(a == Xapian::Utf8Iterator());
109 [ - + ][ # # ]: 45 : TEST(b == Xapian::Utf8Iterator());
110 : : }
111 : 1 : return true;
112 : : }
113 : :
/// A byte string paired with the single code point it is expected to decode
/// to (see utf8iterator2).
struct testcase2 {
    /// Byte string to decode.
    const char * a;
    /// Code point expected from the first (and only) position in @a a.
    unsigned long n;
};
118 : :
119 : : static const testcase2 testcases2[] = {
120 : : { "a", 97 },
121 : : { "\x80", 128 },
122 : : { "\xa0", 160 },
123 : : { "\xc2\x80", 128 },
124 : : { "\xc2\xa0", 160 },
125 : : { "\xe0\xa0\x80", 0x0800 },
126 : : { "\xe1\x80\x80", 0x1000 },
127 : : { "\xf0\xa8\xa8\x8f", 166415 },
128 : : { "\xf3\x80\x80\x80", 0x0c0000 },
129 : : { "\xf4\x80\x80\x80", 0x100000 },
130 : : { 0, 0 }
131 : : };
132 : :
133 : : // Test decoding of UTF-8.
134 : 1 : DEFINE_TESTCASE(utf8iterator2,!backend) {
135 : : const testcase2 * p;
136 [ + + ]: 11 : for (p = testcases2; p->a; ++p) {
137 : 10 : Xapian::Utf8Iterator a(p->a);
138 : :
139 [ - + # # ]: 10 : TEST(a != Xapian::Utf8Iterator());
140 [ - + ][ # # ]: 10 : TEST_EQUAL(*a, p->n);
141 [ - + ][ # # ]: 10 : TEST(++a == Xapian::Utf8Iterator());
142 : : }
143 : 1 : return true;
144 : : }
145 : :
146 : : // Test Unicode categorisation.
147 : 1 : DEFINE_TESTCASE(unicode1,!backend) {
148 : : using namespace Xapian;
149 [ - + ][ # # ]: 1 : TEST_EQUAL(Unicode::get_category('a'), Unicode::LOWERCASE_LETTER);
150 [ - + ][ # # ]: 1 : TEST_EQUAL(Unicode::get_category('0'), Unicode::DECIMAL_DIGIT_NUMBER);
151 [ - + ][ # # ]: 1 : TEST_EQUAL(Unicode::get_category('$'), Unicode::CURRENCY_SYMBOL);
152 [ - + ][ # # ]: 1 : TEST_EQUAL(Unicode::get_category(0xa3), Unicode::CURRENCY_SYMBOL);
153 : : // U+0242 was added in Unicode 5.0.0.
154 [ - + ][ # # ]: 1 : TEST_EQUAL(Unicode::get_category(0x242), Unicode::LOWERCASE_LETTER);
155 : : // U+11A7 was added in Unicode 5.2.0.
156 [ - + ][ # # ]: 1 : TEST_EQUAL(Unicode::get_category(0x11A7), Unicode::OTHER_LETTER);
157 : : // U+9FCB was added in Unicode 5.2.0.
158 [ - + ][ # # ]: 1 : TEST_EQUAL(Unicode::get_category(0x9FCB), Unicode::OTHER_LETTER);
159 : : // U+FA6C was added in Unicode 5.2.0.
160 [ - + ][ # # ]: 1 : TEST_EQUAL(Unicode::get_category(0xFA6C), Unicode::OTHER_LETTER);
161 [ - + ][ # # ]: 1 : TEST_EQUAL(Unicode::get_category(0xFFFF), Unicode::UNASSIGNED);
162 : : // Test characters outside BMP.
163 [ - + ][ # # ]: 1 : TEST_EQUAL(Unicode::get_category(0x10345), Unicode::OTHER_LETTER);
164 [ - + ][ # # ]: 1 : TEST_EQUAL(Unicode::get_category(0x10FFFD), Unicode::PRIVATE_USE);
165 [ - + ][ # # ]: 1 : TEST_EQUAL(Unicode::get_category(0x10FFFF), Unicode::UNASSIGNED);
166 : : // U+1109A was added in Unicode 5.2.0.
167 [ - + ][ # # ]: 1 : TEST_EQUAL(Unicode::get_category(0x1109a), Unicode::OTHER_LETTER);
168 : : // Test some invalid Unicode values.
169 [ - + ][ # # ]: 1 : TEST_EQUAL(Unicode::get_category(0x110000), Unicode::UNASSIGNED);
170 [ - + ][ # # ]: 1 : TEST_EQUAL(Unicode::get_category(0xFFFFFFFF), Unicode::UNASSIGNED);
171 : 1 : return true;
172 : : }
173 : :
174 : 1 : DEFINE_TESTCASE(caseconvert1,!backend) {
175 : : using namespace Xapian;
176 [ + + ]: 129 : for (unsigned ch = 0; ch < 128; ++ch) {
177 [ + + ]: 128 : if (isupper((char)ch)) {
178 [ - + ][ # # ]: 26 : TEST_EQUAL(Unicode::tolower(ch), unsigned(tolower((char)ch)));
179 : : } else {
180 [ - + ][ # # ]: 102 : TEST_EQUAL(Unicode::tolower(ch), ch);
181 : : }
182 [ + + ]: 128 : if (islower((char)ch)) {
183 [ - + ][ # # ]: 26 : TEST_EQUAL(Unicode::toupper(ch), unsigned(toupper((char)ch)));
184 : : } else {
185 [ - + ][ # # ]: 102 : TEST_EQUAL(Unicode::toupper(ch), ch);
186 : : }
187 : : }
188 : :
189 : : // U+0242 was added in Unicode 5.0.0 as a lowercase form of U+0241.
190 [ - + ][ # # ]: 1 : TEST_EQUAL(Unicode::tolower(0x242), 0x242);
191 [ - + ][ # # ]: 1 : TEST_EQUAL(Unicode::toupper(0x242), 0x241);
192 [ - + ][ # # ]: 1 : TEST_EQUAL(Unicode::toupper(0x241), 0x241);
193 [ - + ][ # # ]: 1 : TEST_EQUAL(Unicode::tolower(0x241), 0x242);
194 : :
195 : : // Pound currency symbol:
196 [ - + ][ # # ]: 1 : TEST_EQUAL(Unicode::tolower(0xa3), 0xa3);
197 [ - + ][ # # ]: 1 : TEST_EQUAL(Unicode::toupper(0xa3), 0xa3);
198 : : // Unassigned:
199 [ - + ][ # # ]: 1 : TEST_EQUAL(Unicode::tolower(0xFFFF), 0xFFFF);
200 [ - + ][ # # ]: 1 : TEST_EQUAL(Unicode::toupper(0xFFFF), 0xFFFF);
201 : : // Test characters outside BMP.
202 [ - + ][ # # ]: 1 : TEST_EQUAL(Unicode::tolower(0x10345), 0x10345);
203 [ - + ][ # # ]: 1 : TEST_EQUAL(Unicode::toupper(0x10345), 0x10345);
204 [ - + ][ # # ]: 1 : TEST_EQUAL(Unicode::tolower(0x10FFFD), 0x10FFFD);
205 [ - + ][ # # ]: 1 : TEST_EQUAL(Unicode::toupper(0x10FFFD), 0x10FFFD);
206 [ - + ][ # # ]: 1 : TEST_EQUAL(Unicode::tolower(0x10FFFF), 0x10FFFF);
207 [ - + ][ # # ]: 1 : TEST_EQUAL(Unicode::toupper(0x10FFFF), 0x10FFFF);
208 : : // Test some invalid Unicode values.
209 [ - + ][ # # ]: 1 : TEST_EQUAL(Unicode::tolower(0x110000), 0x110000);
210 [ - + ][ # # ]: 1 : TEST_EQUAL(Unicode::toupper(0x110000), 0x110000);
211 [ - + ][ # # ]: 1 : TEST_EQUAL(Unicode::tolower(0xFFFFFFFF), 0xFFFFFFFF);
212 [ - + ][ # # ]: 1 : TEST_EQUAL(Unicode::toupper(0xFFFFFFFF), 0xFFFFFFFF);
213 : :
214 : 1 : return true;
215 : : }
216 : :
217 : : /// Test Unicode 5.1 support.
218 : 1 : DEFINE_TESTCASE(caseconvert2,!backend) {
219 : : using namespace Xapian;
220 : :
221 [ - + ][ # # ]: 1 : TEST_EQUAL(Unicode::toupper(0x250), 0x2c6f);
222 [ - + ][ # # ]: 1 : TEST_EQUAL(Unicode::toupper(0x251), 0x2c6d);
223 [ - + ][ # # ]: 1 : TEST_EQUAL(Unicode::toupper(0x271), 0x2c6e);
224 : :
225 [ - + ][ # # ]: 1 : TEST_EQUAL(Unicode::get_category(0x2ec), Unicode::MODIFIER_LETTER);
226 [ - + ][ # # ]: 1 : TEST_EQUAL(Unicode::get_category(0x374), Unicode::MODIFIER_LETTER);
227 [ - + ][ # # ]: 1 : TEST_EQUAL(Unicode::get_category(0x487), Unicode::NON_SPACING_MARK);
228 [ - + ][ # # ]: 1 : TEST_EQUAL(Unicode::get_category(0x5be), Unicode::DASH_PUNCTUATION);
229 [ - + ][ # # ]: 1 : TEST_EQUAL(Unicode::get_category(0x1f093), Unicode::OTHER_SYMBOL);
230 : :
231 [ - + ][ # # ]: 1 : TEST_EQUAL(Unicode::tolower(0x370), 0x371);
232 [ - + ][ # # ]: 1 : TEST_EQUAL(Unicode::toupper(0x371), 0x370);
233 [ - + ][ # # ]: 1 : TEST_EQUAL(Unicode::tolower(0x372), 0x373);
234 [ - + ][ # # ]: 1 : TEST_EQUAL(Unicode::toupper(0x373), 0x372);
235 [ - + ][ # # ]: 1 : TEST_EQUAL(Unicode::tolower(0x376), 0x377);
236 [ - + ][ # # ]: 1 : TEST_EQUAL(Unicode::toupper(0x377), 0x376);
237 [ - + ][ # # ]: 1 : TEST_EQUAL(Unicode::tolower(0x3cf), 0x3d7);
238 [ - + ][ # # ]: 1 : TEST_EQUAL(Unicode::toupper(0x3d7), 0x3cf);
239 : :
240 : : unsigned u;
241 [ + + ]: 9 : for (u = 0x514; u < 0x524; u += 2) {
242 [ - + ][ # # ]: 8 : TEST_EQUAL(Unicode::get_category(u), Unicode::UPPERCASE_LETTER);
243 [ - + ][ # # ]: 8 : TEST_EQUAL(Unicode::get_category(u + 1), Unicode::LOWERCASE_LETTER);
244 [ - + ][ # # ]: 8 : TEST_EQUAL(Unicode::tolower(u), u + 1);
245 [ - + ][ # # ]: 8 : TEST_EQUAL(Unicode::toupper(u + 1), u);
246 : : }
247 : :
248 : 1 : return true;
249 : : }
250 : :
251 : 1 : DEFINE_TESTCASE(utf8convert1,!backend) {
252 : 1 : string s;
253 : 1 : Xapian::Unicode::append_utf8(s, 'a');
254 : 1 : Xapian::Unicode::append_utf8(s, 128);
255 : 1 : Xapian::Unicode::append_utf8(s, 160);
256 : 1 : Xapian::Unicode::append_utf8(s, 0xFFFF);
257 : 1 : Xapian::Unicode::append_utf8(s, 166415);
258 : 1 : Xapian::Unicode::append_utf8(s, 0x10345);
259 : 1 : Xapian::Unicode::append_utf8(s, 0x10FFFD);
260 : 1 : Xapian::Unicode::append_utf8(s, 0xFFFFFFFF);
261 : 1 : Xapian::Unicode::append_utf8(s, 'z');
262 [ - + # # ]: 1 : TEST_STRINGS_EQUAL(s, "a"
263 : : "\xc2\x80"
264 : : "\xc2\xa0"
265 : : "\xef\xbf\xbf"
266 : : "\xf0\xa8\xa8\x8f"
267 : : "\xf0\x90\x8d\x85"
268 : : "\xf4\x8f\xbf\xbd"
269 : : ""
270 : : "z"
271 : : );
272 : :
273 : 1 : return true;
274 : : }
275 : :
276 : 1 : DEFINE_TESTCASE(unicodepredicates1,!backend) {
277 : : const unsigned wordchars[] = {
278 : : // DECIMAL_DIGIT_NUMER
279 : : '0', '7', '9',
280 : : // LOWERCASE_LETTER
281 : : 'a', 'z', 0x250, 0x251, 0x271, 0x3d7,
282 : : 0x242, // (added in Unicode 5.0.0)
283 : : // LOWERCASE_LETTER (added in Unicode 5.1.0)
284 : : 0x371, 0x373, 0x377, 0x514, 0x516, 0x518, 0x51a, 0x51c, 0x51e,
285 : : 0x520, 0x522,
286 : : // UPPERCASE_LETTER
287 : : 'A', 'Z', 0x241,
288 : : // UPPERCASE_LETTER (added in Unicode 5.1.0)
289 : : 0x370, 0x372, 0x376, 0x3cf, 0x515, 0x517, 0x519, 0x51b, 0x51d, 0x51f,
290 : : 0x521, 0x523, 0x2c6d, 0x2c6e, 0x2c6f,
291 : : // OTHER_LETTER
292 : : 0x10345,
293 : : // MODIFIER_LETTER (added in Unicode 5.1.0)
294 : : 0x2ec, 0x374,
295 : : // NON_SPACING_MARK (added to is_wordchar() in 1.1.0)
296 : : 0x651,
297 : : 0x487, // Added in Unicode 5.1.0
298 : : 0
299 : 1 : };
300 : : const unsigned currency[] = {
301 : : // CURRENCY_SYMBOL
302 : : '$', 0xa3,
303 : : 0
304 : 1 : };
305 : : const unsigned whitespace[] = {
306 : : // CONTROL
307 : : '\t', '\n', '\f', '\r',
308 : : // SPACE_SEPARATOR
309 : : ' ',
310 : : 0
311 : 1 : };
312 : : const unsigned other[] = {
313 : : // DASH_PUNCTUATION (added in Unicode 5.1.0)
314 : : 0x5be,
315 : : // OTHER_SYMBOL (added in Unicode 5.1.0)
316 : : 0x1f093,
317 : : // UNASSIGNED
318 : : 0xffff, 0x10ffff, 0x110000, 0xFFFFFFFF,
319 : : // PRIVATE_USE
320 : : 0x10fffd,
321 : : 0
322 : 1 : };
323 : :
324 [ + + ]: 45 : for (const unsigned * p = wordchars; *p; ++p) {
325 [ - + ][ # # ]: 44 : TEST(Xapian::Unicode::is_wordchar(*p));
326 [ - + ][ # # ]: 44 : TEST(!Xapian::Unicode::is_currency(*p));
327 [ - + ][ # # ]: 44 : TEST(!Xapian::Unicode::is_whitespace(*p));
328 : : }
329 : :
330 [ + + ]: 3 : for (const unsigned * p = currency; *p; ++p) {
331 [ - + ][ # # ]: 2 : TEST(!Xapian::Unicode::is_wordchar(*p));
332 [ - + ][ # # ]: 2 : TEST(Xapian::Unicode::is_currency(*p));
333 [ - + ][ # # ]: 2 : TEST(!Xapian::Unicode::is_whitespace(*p));
334 : : }
335 : :
336 [ + + ]: 6 : for (const unsigned * p = whitespace; *p; ++p) {
337 [ - + ][ # # ]: 5 : TEST(!Xapian::Unicode::is_wordchar(*p));
338 [ - + ][ # # ]: 5 : TEST(!Xapian::Unicode::is_currency(*p));
339 [ - + ][ # # ]: 5 : TEST(Xapian::Unicode::is_whitespace(*p));
340 : : }
341 : :
342 [ + + ]: 8 : for (const unsigned * p = other; *p; ++p) {
343 [ - + ][ # # ]: 7 : TEST(!Xapian::Unicode::is_wordchar(*p));
344 [ - + ][ # # ]: 7 : TEST(!Xapian::Unicode::is_currency(*p));
345 [ - + ][ # # ]: 7 : TEST(!Xapian::Unicode::is_whitespace(*p));
346 : : }
347 : :
348 : 1 : return true;
349 : : }
|