LCOV - code coverage report
Current view: top level - unicode - utf8itor.cc (source / functions) Hit Total Coverage
Test: Test Coverage for xapian-core r Lines: 55 55 100.0 %
Date: 2011-08-21 Functions: 5 6 83.3 %
Branches: 60 60 100.0 %

           Branch data     Line data    Source code
       1                 :            : /* utf8itor.cc: iterate over a utf8 string.
       2                 :            :  *
       3                 :            :  * Copyright (C) 2006,2007,2010 Olly Betts
       4                 :            :  *
       5                 :            :  * This program is free software; you can redistribute it and/or modify
       6                 :            :  * it under the terms of the GNU General Public License as published by
       7                 :            :  * the Free Software Foundation; either version 2 of the License, or
       8                 :            :  * (at your option) any later version.
       9                 :            :  *
      10                 :            :  * This program is distributed in the hope that it will be useful,
      11                 :            :  * but WITHOUT ANY WARRANTY; without even the implied warranty of
      12                 :            :  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      13                 :            :  * GNU General Public License for more details.
      14                 :            :  *
      15                 :            :  * You should have received a copy of the GNU General Public License
      16                 :            :  * along with this program; if not, write to the Free Software
      17                 :            :  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
      18                 :            :  */
      19                 :            : 
      20                 :            : #include <config.h>
      21                 :            : 
      22                 :            : #include <xapian/unicode.h>
      23                 :            : 
      24                 :            : #include <cstring>
      25                 :            : 
      26                 :            : using namespace std;
      27                 :            : 
      28                 :       1211 : inline bool bad_cont(unsigned char ch) { return (ch & 0xc0) != 0x80; }
      29                 :            : 
      30                 :            : namespace Xapian {
      31                 :            : 
      32                 :            : namespace Unicode {
      33                 :            : 
      34                 :            : // buf should be at least 4 bytes.
      35                 :            : unsigned
      36                 :        327 : nonascii_to_utf8(unsigned ch, char * buf)
      37                 :            : {
      38         [ +  + ]:        327 :     if (ch < 0x800) {
      39                 :         33 :         buf[0] = 0xc0 | (ch >> 6);
      40                 :         33 :         buf[1] = 0x80 | (ch & 0x3f);
      41                 :         33 :         return 2;
      42                 :            :     }
      43         [ +  + ]:        294 :     if (ch < 0x10000) {
      44                 :        290 :         buf[0] = 0xe0 | (ch >> 12);
      45                 :        290 :         buf[1] = 0x80 | ((ch >> 6) & 0x3f);
      46                 :        290 :         buf[2] = 0x80 | (ch & 0x3f);
      47                 :        290 :         return 3;
      48                 :            :     }
      49         [ +  + ]:          4 :     if (ch < 0x200000) {
      50                 :          3 :         buf[0] = 0xf0 | (ch >> 18);
      51                 :          3 :         buf[1] = 0x80 | ((ch >> 12) & 0x3f);
      52                 :          3 :         buf[2] = 0x80 | ((ch >> 6) & 0x3f);
      53                 :          3 :         buf[3] = 0x80 | (ch & 0x3f);
      54                 :          3 :         return 4;
      55                 :            :     }
      56                 :            :     // Unicode doesn't specify any characters above 0x10ffff.
      57                 :            :     // Should we be presented with such a numeric character
      58                 :            :     // entity or similar, we just replace it with nothing.
      59                 :        327 :     return 0;
      60                 :            : }
      61                 :            : 
      62                 :            : }
      63                 :            : 
      64                 :         10 : Utf8Iterator::Utf8Iterator(const char *p_)
      65                 :            : {
      66                 :         10 :     assign(p_, strlen(p_));
      67                 :         10 : }
      68                 :            : 
      69                 :            : void
      70                 :     417856 : Utf8Iterator::calculate_sequence_length() const
      71                 :            : {
      72                 :            :     // Handle invalid UTF-8, overlong sequences, and truncated sequences as
      73                 :            :     // if the text was actually in ISO-8859-1 since we need to do something
      74                 :            :     // with it, and this seems the most likely reason why we'd have invalid
      75                 :            :     // UTF-8.
      76                 :            : 
      77                 :     417856 :     unsigned char ch = *p;
      78                 :            : 
      79                 :     417856 :     seqlen = 1;
      80                 :            :     // Single byte encoding (0x00-0x7f) or invalid (0x80-0xbf) or overlong
      81                 :            :     // sequence (0xc0-0xc1).
      82                 :            :     //
      83                 :            :     // (0xc0 and 0xc1 would start 2 byte sequences for characters which are
      84                 :            :     // representable in a single byte, and we should not decode these.)
      85         [ +  + ]:     417856 :     if (ch < 0xc2) return;
      86                 :            : 
      87         [ +  + ]:        742 :     if (ch < 0xe0) {
      88 [ +  + ][ +  + ]:        244 :         if (p + 1 == end || // Not enough bytes
                 [ +  + ]
      89                 :        236 :             bad_cont(p[1])) // Invalid
      90                 :         19 :             return;
      91                 :        225 :         seqlen = 2;
      92                 :        225 :         return;
      93                 :            :     }
      94         [ +  + ]:        498 :     if (ch < 0xf0) {
      95 [ +  + ][ +  + ]:        467 :         if (end - p < 3 || // Not enough bytes
         [ +  + ][ +  + ]
         [ +  + ][ +  + ]
      96                 :        924 :             bad_cont(p[1]) || bad_cont(p[2]) || // Invalid
      97                 :        462 :             (p[0] == 0xe0 && p[1] < 0xa0)) // Overlong encoding
      98                 :         10 :             return;
      99                 :        457 :         seqlen = 3;
     100                 :        457 :         return;
     101                 :            :     }
     102 [ +  + ][ +  + ]:         31 :     if (ch >= 0xf5 || // Code value above Unicode
         [ +  + ][ +  + ]
         [ +  + ][ +  + ]
         [ +  + ][ +  + ]
         [ +  + ][ +  + ]
     103                 :            :         end - p < 4 || // Not enough bytes
     104                 :         51 :         bad_cont(p[1]) || bad_cont(p[2]) || bad_cont(p[3]) || // Invalid
     105                 :         21 :         (p[0] == 0xf0 && p[1] < 0x90) || // Overlong encoding
     106                 :         12 :         (p[0] == 0xf4 && p[1] >= 0x90)) // Code value above Unicode
     107                 :         22 :         return;
     108                 :          9 :     seqlen = 4;
     109                 :     417856 :     return;
     110                 :            : }
     111                 :            : 
     112                 :    1006398 : unsigned Utf8Iterator::operator*() const {
     113         [ +  + ]:    1006398 :     if (p == NULL) return unsigned(-1);
     114         [ +  + ]:    1006213 :     if (seqlen == 0) calculate_sequence_length();
     115                 :    1006213 :     unsigned char ch = *p;
     116         [ +  + ]:    1006213 :     if (seqlen == 1) return ch;
     117         [ +  + ]:        993 :     if (seqlen == 2) return ((ch & 0x1f) << 6) | (p[1] & 0x3f);
     118         [ +  + ]:        713 :     if (seqlen == 3)
     119                 :        704 :         return ((ch & 0x0f) << 12) | ((p[1] & 0x3f) << 6) | (p[2] & 0x3f);
     120                 :          9 :     return ((ch & 0x07) << 18) | ((p[1] & 0x3f) << 12) |
     121                 :    1006398 :             ((p[2] & 0x3f) << 6) | (p[3] & 0x3f);
     122                 :            : }
     123                 :            : 
     124                 :            : }

Generated by: LCOV version 1.8