LCOV - code coverage report
Current view: top level - queryparser - termgenerator_internal.cc (source / functions) Hit Total Coverage
Test: Test Coverage for xapian-core r Lines: 106 119 89.1 %
Date: 2011-08-21 Functions: 8 8 100.0 %
Branches: 113 168 67.3 %

           Branch data     Line data    Source code
       1                 :            : /** @file termgenerator_internal.cc
       2                 :            :  * @brief TermGenerator class internals
       3                 :            :  */
       4                 :            : /* Copyright (C) 2007,2010,2011 Olly Betts
       5                 :            :  *
       6                 :            :  * This program is free software; you can redistribute it and/or modify
       7                 :            :  * it under the terms of the GNU General Public License as published by
       8                 :            :  * the Free Software Foundation; either version 2 of the License, or
       9                 :            :  * (at your option) any later version.
      10                 :            :  *
      11                 :            :  * This program is distributed in the hope that it will be useful,
      12                 :            :  * but WITHOUT ANY WARRANTY; without even the implied warranty of
      13                 :            :  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      14                 :            :  * GNU General Public License for more details.
      15                 :            :  *
      16                 :            :  * You should have received a copy of the GNU General Public License
      17                 :            :  * along with this program; if not, write to the Free Software
      18                 :            :  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
      19                 :            :  */
      20                 :            : 
      21                 :            : #include <config.h>
      22                 :            : 
      23                 :            : #include "termgenerator_internal.h"
      24                 :            : 
      25                 :            : #include <xapian/document.h>
      26                 :            : #include <xapian/queryparser.h>
      27                 :            : #include <xapian/unicode.h>
      28                 :            : 
      29                 :            : #include "stringutils.h"
      30                 :            : 
      31                 :            : #include <string>
      32                 :            : 
      33                 :            : #include "cjk/cjk-tokenizer.h"
      34                 :            : 
      35                 :            : using namespace std;
      36                 :            : 
      37                 :            : namespace Xapian {
      38                 :            : 
      39                 :            : // Put a limit on the size of terms to help prevent the index being bloated
      40                 :            : // by useless junk terms.
      41                 :            : static const unsigned int MAX_PROB_TERM_LENGTH = 64;
      42                 :            : // FIXME: threshold is currently in bytes of UTF-8 representation, not unicode
      43                 :            : // characters - what actually makes most sense here?
      44                 :            : 
      45                 :            : // FIXME: Add API to allow control of how stemming is used?
      46                 :            : 
      47                 :            : inline bool
      48                 :         97 : U_isupper(unsigned ch) {
      49 [ +  + ][ +  + ]:         97 :     return (ch < 128 && C_isupper((unsigned char)ch));
      50                 :            : }
      51                 :            : 
      52                 :        423 : inline unsigned check_wordchar(unsigned ch) {
      53         [ +  + ]:        423 :     if (Unicode::is_wordchar(ch)) return Unicode::tolower(ch);
      54                 :        423 :     return 0;
      55                 :            : }
      56                 :            : 
      57                 :            : inline bool
      58                 :         42 : should_stem(const std::string & term)
      59                 :            : {
      60                 :            :     const unsigned int SHOULD_STEM_MASK =
      61                 :            :         (1 << Unicode::LOWERCASE_LETTER) |
      62                 :            :         (1 << Unicode::TITLECASE_LETTER) |
      63                 :            :         (1 << Unicode::MODIFIER_LETTER) |
      64                 :         42 :         (1 << Unicode::OTHER_LETTER);
      65                 :         42 :     Utf8Iterator u(term);
      66                 :         42 :     return ((SHOULD_STEM_MASK >> Unicode::get_category(*u)) & 1);
      67                 :            : }
      68                 :            : 
      69                 :            : /** Value representing "ignore this" when returned by check_infix() or
      70                 :            :  *  check_infix_digit().
      71                 :            :  */
      72                 :            : const unsigned UNICODE_IGNORE(-1);
      73                 :            : 
      74                 :         28 : inline unsigned check_infix(unsigned ch) {
      75 [ +  + ][ +  - ]:         28 :     if (ch == '\'' || ch == '&' || ch == 0xb7 || ch == 0x5f4 || ch == 0x2027) {
         [ +  - ][ +  - ]
                 [ -  + ]
      76                 :            :         // Unicode includes all these except '&' in its word boundary rules,
      77                 :            :         // as well as 0x2019 (which we handle below) and ':' (for Swedish
      78                 :            :         // apparently, but we ignore this for now as it's problematic in
      79                 :            :         // real world cases).
      80                 :          1 :         return ch;
      81                 :            :     }
      82                 :            :     // 0x2019 is Unicode apostrophe and single closing quote.
      83                 :            :     // 0x201b is Unicode single opening quote with the tail rising.
      84 [ +  - ][ -  + ]:         27 :     if (ch == 0x2019 || ch == 0x201b) return '\'';
      85 [ +  + ][ -  + ]:         27 :     if (ch >= 0x200b && (ch <= 0x200d || ch == 0x2060 || ch == 0xfeff))
         [ #  # ][ #  # ]
      86                 :          6 :         return UNICODE_IGNORE;
      87                 :         28 :     return 0;
      88                 :            : }
      89                 :            : 
      90                 :         10 : inline unsigned check_infix_digit(unsigned ch) {
      91                 :            :     // This list of characters comes from Unicode's word identifying algorithm.
      92         [ +  + ]:         10 :     switch (ch) {
      93                 :            :         case ',':
      94                 :            :         case '.':
      95                 :            :         case ';':
      96                 :            :         case 0x037e: // GREEK QUESTION MARK
      97                 :            :         case 0x0589: // ARMENIAN FULL STOP
      98                 :            :         case 0x060D: // ARABIC DATE SEPARATOR
      99                 :            :         case 0x07F8: // NKO COMMA
     100                 :            :         case 0x2044: // FRACTION SLASH
     101                 :            :         case 0xFE10: // PRESENTATION FORM FOR VERTICAL COMMA
     102                 :            :         case 0xFE13: // PRESENTATION FORM FOR VERTICAL COLON
     103                 :            :         case 0xFE14: // PRESENTATION FORM FOR VERTICAL SEMICOLON
     104                 :          6 :             return ch;
     105                 :            :     }
     106 [ -  + ][ #  # ]:          4 :     if (ch >= 0x200b && (ch <= 0x200d || ch == 0x2060 || ch == 0xfeff))
         [ #  # ][ #  # ]
     107                 :          0 :         return UNICODE_IGNORE;
     108                 :         10 :     return 0;
     109                 :            : }
     110                 :            : 
     111                 :            : inline bool
     112                 :         49 : is_digit(unsigned ch) {
     113                 :         49 :     return (Unicode::get_category(ch) == Unicode::DECIMAL_DIGIT_NUMBER);
     114                 :            : }
     115                 :            : 
     116                 :         38 : inline unsigned check_suffix(unsigned ch) {
     117 [ +  + ][ -  + ]:         38 :     if (ch == '+' || ch == '#') return ch;
     118                 :            :     // FIXME: what about '-'?
     119                 :         38 :     return 0;
     120                 :            : }
     121                 :            : 
     122                 :            : // FIXME: add API for this:
     123                 :            : #define STOPWORDS_NONE 0
     124                 :            : #define STOPWORDS_IGNORE 1
     125                 :            : #define STOPWORDS_INDEX_UNSTEMMED_ONLY 2
     126                 :            : 
     127                 :            : void
     128                 :         43 : TermGenerator::Internal::index_text(Utf8Iterator itor, termcount weight,
     129                 :            :                                     const string & prefix, bool with_positions)
     130                 :            : {
     131                 :         43 :     bool cjk_ngram = true; // FIXME: set from flag or env var or something.
     132                 :            : 
     133                 :         43 :     int stop_mode = STOPWORDS_INDEX_UNSTEMMED_ONLY;
     134                 :            : 
     135         [ +  - ]:         43 :     if (!stopper) stop_mode = STOPWORDS_NONE;
     136                 :            : 
     137         [ +  + ]:        114 :     while (true) {
     138                 :            :         // Advance to the start of the next term.
     139                 :            :         unsigned ch;
     140                 :         40 :         while (true) {
     141         [ +  + ]:        154 :             if (itor == Utf8Iterator()) return;
     142                 :        118 :             ch = check_wordchar(*itor);
     143         [ +  + ]:        118 :             if (ch) break;
     144                 :         40 :             ++itor;
     145                 :            :         }
     146                 :            : 
     147                 :         78 :         string term;
     148                 :            :         // Look for initials separated by '.' (e.g. P.T.O., U.N.C.L.E).
     149                 :            :         // Don't worry if there's a trailing '.' or not.
     150         [ +  + ]:         78 :         if (U_isupper(*itor)) {
     151                 :         19 :             const Utf8Iterator end;
     152                 :         19 :             Utf8Iterator p = itor;
     153   [ +  +  +  + ]:         37 :             do {
         [ +  + ][ +  + ]
                 [ +  + ]
     154                 :         37 :                 Unicode::append_utf8(term, Unicode::tolower(*p++));
     155                 :            :             } while (p != end && *p == '.' && ++p != end && U_isupper(*p));
     156                 :            :             // One letter does not make an acronym!  If we handled a single
     157                 :            :             // uppercase letter here, we wouldn't catch M&S below.
     158         [ +  + ]:         19 :             if (term.size() > 1) {
     159                 :            :                 // Check there's not a (lower case) letter or digit
     160                 :            :                 // immediately after it.
     161 [ +  + ][ +  - ]:         12 :                 if (p == end || !Unicode::is_wordchar(*p)) {
                 [ +  - ]
     162                 :         12 :                     itor = p;
     163                 :         12 :                     goto endofterm;
     164                 :            :                 }
     165                 :            :             }
     166                 :          7 :             term.resize(0);
     167                 :            :         }
     168                 :            : 
     169                 :         13 :         while (true) {
     170 [ +  - ][ +  + ]:         79 :             if (cjk_ngram && CJK::codepoint_is_cjk(*itor)) {
                 [ +  + ]
     171                 :         11 :                 const string & cjk = CJK::get_cjk(itor);
     172         [ +  + ]:         50 :                 for (CJKTokenIterator tk(cjk); tk != CJKTokenIterator(); ++tk) {
     173                 :         39 :                     const string & cjk_token = *tk;
     174         [ -  + ]:         39 :                     if (cjk_token.size() > MAX_PROB_TERM_LENGTH) continue;
     175                 :            : 
     176 [ -  + ][ #  # ]:         39 :                     if (stop_mode == STOPWORDS_IGNORE && (*stopper)(cjk_token)) continue;
                 [ -  + ]
     177                 :            : 
     178         [ +  - ]:         39 :                     if (with_positions) {
     179                 :         39 :                         doc.add_posting(prefix + cjk_token, ++termpos, weight);
     180                 :            :                     } else {
     181                 :          0 :                         doc.add_term(prefix + cjk_token, weight);
     182                 :            :                     }
     183 [ -  + ][ #  # ]:         39 :                     if ((flags & FLAG_SPELLING) && prefix.empty()) db.add_spelling(cjk_token);
                 [ -  + ]
     184                 :            : 
     185         [ +  - ]:         39 :                     if (!stemmer.internal.get()) continue;
     186                 :            : 
     187 [ #  # ][ #  # ]:          0 :                     if (stop_mode == STOPWORDS_INDEX_UNSTEMMED_ONLY && (*stopper)(cjk_token))
                 [ #  # ]
     188                 :          0 :                         continue;
     189                 :            : 
     190                 :            :                     // Note, this uses the lowercased term, but that's OK as we only
     191                 :            :                     // want to avoid stemming terms starting with a digit.
     192         [ #  # ]:          0 :                     if (!should_stem(cjk_token)) continue;
     193                 :            : 
     194                 :            :                     // Add stemmed form without positional information.
     195                 :          0 :                     string stem("Z");
     196                 :          0 :                     stem += prefix;
     197                 :          0 :                     stem += stemmer(cjk_token);
     198                 :          0 :                     doc.add_term(stem, weight);
     199                 :         11 :                 }
     200                 :          2 :                 while (true) {
     201         [ +  + ]:         13 :                     if (itor == Utf8Iterator()) return;
     202                 :          7 :                     ch = check_wordchar(*itor);
     203         [ +  + ]:          7 :                     if (ch) break;
     204                 :          2 :                     ++itor;
     205         [ +  + ]:         11 :                 }
     206                 :            :             }
     207                 :            :             unsigned prevch;
     208         [ +  + ]:        255 :             do {
     209                 :        281 :                 Unicode::append_utf8(term, ch);
     210                 :        281 :                 prevch = ch;
     211   [ +  +  +  - ]:        281 :                 if (++itor == Utf8Iterator() ||
         [ +  + ][ +  + ]
     212                 :            :                     (cjk_ngram && CJK::codepoint_is_cjk(*itor)))
     213                 :         26 :                     goto endofterm;
     214                 :        255 :                 ch = check_wordchar(*itor);
     215                 :            :             } while (ch);
     216                 :            : 
     217                 :         47 :             Utf8Iterator next(itor);
     218                 :         47 :             ++next;
     219         [ +  + ]:         47 :             if (next == Utf8Iterator()) break;
     220                 :         43 :             unsigned nextch = check_wordchar(*next);
     221         [ +  + ]:         43 :             if (!nextch) break;
     222                 :         38 :             unsigned infix_ch = *itor;
     223   [ +  +  +  + ]:         38 :             if (is_digit(prevch) && is_digit(*next)) {
                 [ +  + ]
     224                 :         10 :                 infix_ch = check_infix_digit(infix_ch);
     225                 :            :             } else {
     226                 :            :                 // Handle things like '&' in AT&T, apostrophes, etc.
     227                 :         28 :                 infix_ch = check_infix(infix_ch);
     228                 :            :             }
     229         [ +  + ]:         38 :             if (!infix_ch) break;
     230         [ +  + ]:         13 :             if (infix_ch != UNICODE_IGNORE)
     231                 :          7 :                 Unicode::append_utf8(term, infix_ch);
     232                 :         13 :             ch = nextch;
     233                 :         13 :             itor = next;
     234                 :            :         }
     235                 :            : 
     236                 :            :         {
     237                 :         34 :             size_t len = term.size();
     238                 :         34 :             unsigned count = 0;
     239         [ +  + ]:         38 :             while ((ch = check_suffix(*itor))) {
     240         [ -  + ]:          4 :                 if (++count > 3) {
     241                 :          0 :                     term.resize(len);
     242                 :          0 :                     break;
     243                 :            :                 }
     244                 :          4 :                 Unicode::append_utf8(term, ch);
     245         [ -  + ]:          4 :                 if (++itor == Utf8Iterator()) goto endofterm;
     246                 :            :             }
     247                 :            :             // Don't index fish+chips as fish+ chips.
     248         [ +  + ]:         34 :             if (Unicode::is_wordchar(*itor))
     249                 :          1 :                 term.resize(len);
     250                 :            :         }
     251                 :            : 
     252                 :            : endofterm:
     253         [ -  + ]:         72 :         if (term.size() > MAX_PROB_TERM_LENGTH) continue;
     254                 :            : 
     255 [ -  + ][ #  # ]:         72 :         if (stop_mode == STOPWORDS_IGNORE && (*stopper)(term)) continue;
                 [ -  + ]
     256                 :            : 
     257         [ +  - ]:         72 :         if (with_positions) {
     258                 :         72 :             doc.add_posting(prefix + term, ++termpos, weight);
     259                 :            :         } else {
     260                 :          0 :             doc.add_term(prefix + term, weight);
     261                 :            :         }
     262 [ +  + ][ +  + ]:         72 :         if ((flags & FLAG_SPELLING) && prefix.empty()) db.add_spelling(term);
                 [ +  + ]
     263                 :            : 
     264         [ +  + ]:         71 :         if (!stemmer.internal.get()) continue;
     265                 :            : 
     266 [ -  + ][ #  # ]:         42 :         if (stop_mode == STOPWORDS_INDEX_UNSTEMMED_ONLY && (*stopper)(term))
                 [ -  + ]
     267                 :          0 :             continue;
     268                 :            : 
     269                 :            :         // Note, this uses the lowercased term, but that's OK as we only
     270                 :            :         // want to avoid stemming terms starting with a digit.
     271         [ +  + ]:         42 :         if (!should_stem(term)) continue;
     272                 :            : 
     273                 :            :         // Add stemmed form without positional information.
     274                 :         34 :         string stem("Z");
     275                 :         34 :         stem += prefix;
     276                 :         34 :         stem += stemmer(term);
     277                 :         34 :         doc.add_term(stem, weight);
     278                 :         34 :     }
     279                 :            : }
     280                 :            : 
     281                 :            : }

Generated by: LCOV version 1.8