LCOV - code coverage report
Current view: top level - queryparser/cjk - cjk-tokenizer.cc (source / functions) Hit Total Coverage
Test: Test Coverage for xapian-core r Lines: 23 24 95.8 %
Date: 2011-08-21 Functions: 4 4 100.0 %
Branches: 26 54 48.1 %

           Branch data     Line data    Source code
       1                 :            : /** @file cjk-tokenizer.cc
       2                 :            :  * @brief Tokenise CJK text as n-grams
       3                 :            :  */
       4                 :            : /* Copyright (c) 2007, 2008 Yung-chung Lin (henearkrxern@gmail.com)
       5                 :            :  * Copyright (c) 2011 Richard Boulton (richard@tartarus.org)
       6                 :            :  * Copyright (c) 2011 Brandon Schaefer (brandontschaefer@gmail.com)
       7                 :            :  * Copyright (c) 2011 Olly Betts
       8                 :            :  *
       9                 :            :  * Permission is hereby granted, free of charge, to any person obtaining a copy
      10                 :            :  * of this software and associated documentation files (the "Software"), to deal
      11                 :            :  * in the Software without restriction, including without limitation the
      12                 :            :  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
      13                 :            :  * sell copies of the Software, and to permit persons to whom the Software is
      14                 :            :  * furnished to do so, subject to the following conditions:
      15                 :            :  *
      16                 :            :  * The above copyright notice and this permission notice shall be included in
      17                 :            :  * all copies or substantial portions of the Software.
      18                 :            :  *
      19                 :            :  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
      20                 :            :  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
      21                 :            :  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
      22                 :            :  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
      23                 :            :  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
      24                 :            :  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
      25                 :            :  * IN THE SOFTWARE.
      26                 :            :  */
      27                 :            : 
      28                 :            : #include <config.h>
      29                 :            : 
      30                 :            : #include "cjk-tokenizer.h"
      31                 :            : 
      32                 :            : #include "omassert.h"
      33                 :            : #include "xapian/unicode.h"
      34                 :            : 
      35                 :            : #include <string>
      36                 :            : 
      37                 :            : using namespace std;
      38                 :            : 
      /// Maximum number of characters combined into a single n-gram token.
      ///
      /// Never modified after initialisation, so it is declared const: this
      /// prevents accidental writes and lets the compiler treat it as a
      /// compile-time constant.
      static const unsigned NGRAM_SIZE = 2;
      40                 :            : 
      41                 :            : // 2E80..2EFF; CJK Radicals Supplement
      42                 :            : // 3000..303F; CJK Symbols and Punctuation
      43                 :            : // 3040..309F; Hiragana
      44                 :            : // 30A0..30FF; Katakana
      45                 :            : // 3100..312F; Bopomofo
      46                 :            : // 3130..318F; Hangul Compatibility Jamo
      47                 :            : // 3190..319F; Kanbun
      48                 :            : // 31A0..31BF; Bopomofo Extended
      49                 :            : // 31C0..31EF; CJK Strokes
      50                 :            : // 31F0..31FF; Katakana Phonetic Extensions
      51                 :            : // 3200..32FF; Enclosed CJK Letters and Months
      52                 :            : // 3300..33FF; CJK Compatibility
      53                 :            : // 3400..4DBF; CJK Unified Ideographs Extension A
      54                 :            : // 4DC0..4DFF; Yijing Hexagram Symbols
      55                 :            : // 4E00..9FFF; CJK Unified Ideographs
      56                 :            : // A700..A71F; Modifier Tone Letters
      57                 :            : // AC00..D7AF; Hangul Syllables
      58                 :            : // F900..FAFF; CJK Compatibility Ideographs
      59                 :            : // FE30..FE4F; CJK Compatibility Forms
      60                 :            : // FF00..FFEF; Halfwidth and Fullwidth Forms
      61                 :            : // 20000..2A6DF; CJK Unified Ideographs Extension B
      62                 :            : // 2F800..2FA1F; CJK Compatibility Ideographs Supplement
      63                 :            : bool
      64                 :     186461 : CJK::codepoint_is_cjk(unsigned p)
      65                 :            : {
      66         [ +  + ]:     186461 :     if (p < 0x2E80) return false;
      67                 :            :     return ((p >= 0x2E80 && p <= 0x2EFF) ||
      68                 :            :             (p >= 0x3000 && p <= 0x9FFF) ||
      69                 :            :             (p >= 0xA700 && p <= 0xA71F) ||
      70                 :            :             (p >= 0xAC00 && p <= 0xD7AF) ||
      71                 :            :             (p >= 0xF900 && p <= 0xFAFF) ||
      72                 :            :             (p >= 0xFE30 && p <= 0xFE4F) ||
      73                 :            :             (p >= 0xFF00 && p <= 0xFFEF) ||
      74                 :            :             (p >= 0x20000 && p <= 0x2A6DF) ||
      75 [ +  - ][ +  - ]:     186461 :             (p >= 0x2F800 && p <= 0x2FA1F));
         [ +  - ][ +  + ]
         [ +  - ][ +  - ]
         [ +  - ][ -  + ]
         [ #  # ][ #  # ]
         [ #  # ][ #  # ]
         [ #  # ][ #  # ]
         [ #  # ][ #  # ]
         [ #  # ][ #  # ]
      76                 :            : }
      77                 :            : 
      78                 :            : string
      79                 :         34 : CJK::get_cjk(Xapian::Utf8Iterator &it)
      80                 :            : {
      81                 :         34 :     string str;
      82 [ +  + ][ +  + ]:        107 :     while (it != Xapian::Utf8Iterator() && codepoint_is_cjk(*it)) {
                 [ +  + ]
      83                 :         73 :         Xapian::Unicode::append_utf8(str, *it);
      84                 :         73 :         ++it;
      85                 :            :     }
      86                 :          0 :     return str;
      87                 :            : }
      88                 :            : 
      89                 :            : const string &
      90                 :        110 : CJKTokenIterator::operator*() const
      91                 :            : {
      92         [ +  + ]:        110 :     if (current_token.empty()) {
      93                 :            :         Assert(it != Xapian::Utf8Iterator());
      94         [ -  + ]:         70 :         p = it;
      95                 :         70 :         Xapian::Unicode::append_utf8(current_token, *p);
      96                 :         70 :         ++p;
      97                 :         70 :         len = 1;
      98                 :            :     }
      99                 :        110 :     return current_token;
     100                 :            : }
     101                 :            : 
     102                 :            : CJKTokenIterator &
     103                 :        107 : CJKTokenIterator::operator++()
     104                 :            : {
     105 [ +  + ][ +  + ]:        107 :     if (len < NGRAM_SIZE && p != Xapian::Utf8Iterator()) {
                 [ +  + ]
     106                 :         37 :         Xapian::Unicode::append_utf8(current_token, *p);
     107                 :         37 :         ++p;
     108                 :         37 :         ++len;
     109                 :            :     } else {
     110                 :            :         Assert(it != Xapian::Utf8Iterator());
     111                 :         70 :         ++it;
     112                 :         70 :         current_token.resize(0);
     113                 :            :     }
     114                 :        107 :     return *this;
     115                 :            : }

Generated by: LCOV version 1.8