Branch data Line data Source code
1 : : /* utf8itor.cc: iterate over a utf8 string.
2 : : *
3 : : * Copyright (C) 2006,2007,2010 Olly Betts
4 : : *
5 : : * This program is free software; you can redistribute it and/or modify
6 : : * it under the terms of the GNU General Public License as published by
7 : : * the Free Software Foundation; either version 2 of the License, or
8 : : * (at your option) any later version.
9 : : *
10 : : * This program is distributed in the hope that it will be useful,
11 : : * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 : : * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 : : * GNU General Public License for more details.
14 : : *
15 : : * You should have received a copy of the GNU General Public License
16 : : * along with this program; if not, write to the Free Software
17 : : * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 : : */
19 : :
20 : : #include <config.h>
21 : :
22 : : #include <xapian/unicode.h>
23 : :
24 : : #include <cstring>
25 : :
26 : : using namespace std;
27 : :
// Return true if ch is NOT a UTF-8 continuation byte.
//
// Continuation bytes have the bit pattern 10xxxxxx, i.e. they lie in the
// range 0x80-0xbf.
inline bool bad_cont(unsigned char ch)
{
    // Keep only the top two bits and compare them against binary 10.
    return (ch >> 6) != 0x2;
}
29 : :
30 : : namespace Xapian {
31 : :
32 : : namespace Unicode {
33 : :
// Encode the non-ASCII Unicode code point ch as UTF-8 into buf.
//
// ch is expected to be >= 0x80: smaller values are not special-cased here
// and would be written as an (overlong) two byte sequence.
//
// buf should be at least 4 bytes.  No terminating nul is written.
//
// Returns the number of bytes stored in buf (2, 3 or 4), or 0 if ch is
// above 0x10ffff, in which case buf is left untouched.
unsigned
nonascii_to_utf8(unsigned ch, char * buf)
{
    // The explicit casts make the intentional narrowing from unsigned to
    // char visible (the values stored are all in the range 0x80-0xf4).
    if (ch < 0x800) {
        buf[0] = static_cast<char>(0xc0 | (ch >> 6));
        buf[1] = static_cast<char>(0x80 | (ch & 0x3f));
        return 2;
    }
    if (ch < 0x10000) {
        buf[0] = static_cast<char>(0xe0 | (ch >> 12));
        buf[1] = static_cast<char>(0x80 | ((ch >> 6) & 0x3f));
        buf[2] = static_cast<char>(0x80 | (ch & 0x3f));
        return 3;
    }
    // Four byte sequences can physically hold values up to 0x1fffff, but
    // anything above 0x10ffff is not valid UTF-8, so stop at 0x10ffff.
    if (ch < 0x110000) {
        buf[0] = static_cast<char>(0xf0 | (ch >> 18));
        buf[1] = static_cast<char>(0x80 | ((ch >> 12) & 0x3f));
        buf[2] = static_cast<char>(0x80 | ((ch >> 6) & 0x3f));
        buf[3] = static_cast<char>(0x80 | (ch & 0x3f));
        return 4;
    }
    // Unicode doesn't specify any characters above 0x10ffff.
    // Should we be presented with such a numeric character
    // entity or similar, we just replace it with nothing.
    return 0;
}
61 : :
62 : : }
63 : :
64 : 10 : Utf8Iterator::Utf8Iterator(const char *p_)
65 : : {
66 : 10 : assign(p_, strlen(p_));
67 : 10 : }
68 : :
69 : : void
70 : 417856 : Utf8Iterator::calculate_sequence_length() const
71 : : {
72 : : // Handle invalid UTF-8, overlong sequences, and truncated sequences as
73 : : // if the text was actually in ISO-8859-1 since we need to do something
74 : : // with it, and this seems the most likely reason why we'd have invalid
75 : : // UTF-8.
76 : :
77 : 417856 : unsigned char ch = *p;
78 : :
79 : 417856 : seqlen = 1;
80 : : // Single byte encoding (0x00-0x7f) or invalid (0x80-0xbf) or overlong
81 : : // sequence (0xc0-0xc1).
82 : : //
83 : : // (0xc0 and 0xc1 would start 2 byte sequences for characters which are
84 : : // representable in a single byte, and we should not decode these.)
85 [ + + ]: 417856 : if (ch < 0xc2) return;
86 : :
87 [ + + ]: 742 : if (ch < 0xe0) {
88 [ + + ][ + + ]: 244 : if (p + 1 == end || // Not enough bytes
[ + + ]
89 : 236 : bad_cont(p[1])) // Invalid
90 : 19 : return;
91 : 225 : seqlen = 2;
92 : 225 : return;
93 : : }
94 [ + + ]: 498 : if (ch < 0xf0) {
95 [ + + ][ + + ]: 467 : if (end - p < 3 || // Not enough bytes
[ + + ][ + + ]
[ + + ][ + + ]
96 : 924 : bad_cont(p[1]) || bad_cont(p[2]) || // Invalid
97 : 462 : (p[0] == 0xe0 && p[1] < 0xa0)) // Overlong encoding
98 : 10 : return;
99 : 457 : seqlen = 3;
100 : 457 : return;
101 : : }
102 [ + + ][ + + ]: 31 : if (ch >= 0xf5 || // Code value above Unicode
[ + + ][ + + ]
[ + + ][ + + ]
[ + + ][ + + ]
[ + + ][ + + ]
103 : : end - p < 4 || // Not enough bytes
104 : 51 : bad_cont(p[1]) || bad_cont(p[2]) || bad_cont(p[3]) || // Invalid
105 : 21 : (p[0] == 0xf0 && p[1] < 0x90) || // Overlong encoding
106 : 12 : (p[0] == 0xf4 && p[1] >= 0x90)) // Code value above Unicode
107 : 22 : return;
108 : 9 : seqlen = 4;
109 : 417856 : return;
110 : : }
111 : :
112 : 1006398 : unsigned Utf8Iterator::operator*() const {
113 [ + + ]: 1006398 : if (p == NULL) return unsigned(-1);
114 [ + + ]: 1006213 : if (seqlen == 0) calculate_sequence_length();
115 : 1006213 : unsigned char ch = *p;
116 [ + + ]: 1006213 : if (seqlen == 1) return ch;
117 [ + + ]: 993 : if (seqlen == 2) return ((ch & 0x1f) << 6) | (p[1] & 0x3f);
118 [ + + ]: 713 : if (seqlen == 3)
119 : 704 : return ((ch & 0x0f) << 12) | ((p[1] & 0x3f) << 6) | (p[2] & 0x3f);
120 : 9 : return ((ch & 0x07) << 18) | ((p[1] & 0x3f) << 12) |
121 : 1006398 : ((p[2] & 0x3f) << 6) | (p[3] & 0x3f);
122 : : }
123 : :
124 : : }
|