LCOV - Test Coverage for xapian-core r - queryparser/queryparser.lemony

LCOV - code coverage report

Current view:	top level - queryparser - queryparser.lemony (source / functions)		Hit	Total	Coverage
Test:	Test Coverage for xapian-core r	Lines:	882	913	96.6 %
Date:	2011-08-21	Functions:	74	74	100.0 %
		Branches:	824	975	84.5 %

           Branch data     Line data    Source code

       1                 :            : %include {
       2                 :            : /* queryparser.lemony: build a Xapian::Query object from a user query string.
       3                 :            :  *
       4                 :            :  * Copyright (C) 2004,2005,2006,2007,2008,2009,2010,2011 Olly Betts
       5                 :            :  * Copyright (C) 2007,2008,2009 Lemur Consulting Ltd
       6                 :            :  *
       7                 :            :  * This program is free software; you can redistribute it and/or
       8                 :            :  * modify it under the terms of the GNU General Public License as
       9                 :            :  * published by the Free Software Foundation; either version 2 of the
      10                 :            :  * License, or (at your option) any later version.
      11                 :            :  *
      12                 :            :  * This program is distributed in the hope that it will be useful,
      13                 :            :  * but WITHOUT ANY WARRANTY; without even the implied warranty of
      14                 :            :  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      15                 :            :  * GNU General Public License for more details.
      16                 :            :  *
      17                 :            :  * You should have received a copy of the GNU General Public License
      18                 :            :  * along with this program; if not, write to the Free Software
      19                 :            :  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
      20                 :            :  * USA
      21                 :            :  */
      22                 :            : 
      23                 :            : #include <config.h>
      24                 :            : 
      25                 :            : #include "omassert.h"
      26                 :            : #include "queryparser_internal.h"
      27                 :            : #include <xapian/error.h>
      28                 :            : #include <xapian/unicode.h>
      29                 :            : #include "stringutils.h"
      30                 :            : 
      31                 :            : // Include the list of token values lemon generates.
      32                 :            : #include "queryparser_token.h"
      33                 :            : 
      34                 :            : #include "cjk/cjk-tokenizer.h"
      35                 :            : 
      36                 :            : #include <algorithm>
      37                 :            : #include <list>
      38                 :            : #include <string>
      39                 :            : 
      40                 :            : #include <string.h>
      41                 :            : 
      42                 :            : using namespace std;
      43                 :            : 
      44                 :            : using namespace Xapian;
      45                 :            : 
      46                 :            : inline bool
      47                 :      44831 : U_isupper(unsigned ch) {
      48 [ +  + ][ +  + ]:      44831 :     return (ch < 128 && C_isupper((unsigned char)ch));
      49                 :            : }
      50                 :            : 
      51                 :            : inline bool
      52                 :         14 : U_isdigit(unsigned ch) {
      53 [ +  + ][ +  + ]:         14 :     return (ch < 128 && C_isdigit((unsigned char)ch));
      54                 :            : }
      55                 :            : 
      56                 :            : inline bool
      57                 :      41257 : U_isalpha(unsigned ch) {
      58 [ +  + ][ +  + ]:      41257 :     return (ch < 128 && C_isalpha((unsigned char)ch));
      59                 :            : }
      60                 :            : 
      61                 :            : using Xapian::Unicode::is_whitespace;
      62                 :            : 
      63                 :            : inline bool
      64                 :       1827 : is_not_whitespace(unsigned ch) {
      65                 :       1827 :     return !is_whitespace(ch);
      66                 :            : }
      67                 :            : 
      68                 :            : using Xapian::Unicode::is_wordchar;
      69                 :            : 
      70                 :            : inline bool
      71                 :      17599 : is_not_wordchar(unsigned ch) {
      72                 :      17599 :     return !is_wordchar(ch);
      73                 :            : }
      74                 :            : 
      75                 :            : inline bool
      76                 :      22787 : is_digit(unsigned ch) {
      77                 :      22787 :     return (Unicode::get_category(ch) == Unicode::DECIMAL_DIGIT_NUMBER);
      78                 :            : }
      79                 :            : 
      80                 :            : // FIXME: we used to keep trailing "-" (e.g. Cl-) but it's of dubious utility
      81                 :            : // and there's the risk of hyphens getting stuck onto the end of terms...
      82                 :            : inline bool
      83                 :      44091 : is_suffix(unsigned ch) {
      84 [ +  + ][ +  + ]:      44091 :     return ch == '+' || ch == '#';
      85                 :            : }
      86                 :            : 
      87                 :            : inline bool
      88                 :        185 : prefix_needs_colon(const string & prefix, unsigned ch)
      89                 :            : {
      90         [ +  + ]:        185 :     if (!U_isupper(ch)) return false;
      91                 :          2 :     string::size_type len = prefix.length();
      92   [ +  -  +  - ]:        185 :     return (len > 1 && prefix[len - 1] != ':');
      93                 :            : }
      94                 :            : 
      95                 :            : using Unicode::is_currency;
      96                 :            : 
      97                 :            : inline bool
      98                 :       6196 : is_positional(Xapian::Query::op op)
      99                 :            : {
     100 [ +  + ][ +  + ]:       6196 :     return (op == Xapian::Query::OP_PHRASE || op == Xapian::Query::OP_NEAR);
     101                 :            : }
     102                 :            : 
     103                 :            : /// A structure identifying a group of filter terms or a value range.
     104                 :            : struct filter_group_id {
     105                 :            :     /** The prefix info for boolean filter terms.
     106                 :            :      *
     107                 :            :      *  This is NULL for a value range.
     108                 :            :      */
     109                 :            :     const PrefixInfo *prefix_info;
     110                 :            : 
     111                 :            :     /** The value number for a value range.
     112                 :            :      *
     113                 :            :      *  This is used for value range terms.
     114                 :            :      */
     115                 :            :     Xapian::valueno slot;
     116                 :            : 
     117                 :            :     /// Make a new filter_group_id for boolean filter terms.
     118                 :         69 :     explicit filter_group_id(const PrefixInfo * prefix_info_)
     119                 :         69 :         : prefix_info(prefix_info_), slot(Xapian::BAD_VALUENO) {}
     120                 :            : 
     121                 :            :     /// Make a new filter_group_id for value range terms.
     122                 :       3777 :     explicit filter_group_id(Xapian::valueno slot_)
     123                 :       3777 :         : prefix_info(NULL), slot(slot_) {}
     124                 :            : 
     125                 :            :     /// Ordering needed to allow storage in a map.
     126                 :         54 :     bool operator<(const filter_group_id & other) const {
     127                 :            :         // Check slot first since comparison is cheap.
     128         [ +  + ]:         54 :         if (slot != other.slot)
     129                 :         17 :             return slot < other.slot;
     130 [ +  + ][ +  + ]:         37 :         if (!prefix_info || prefix_info == other.prefix_info)
     131                 :         22 :             return false;
     132         [ -  + ]:         15 :         if (!other.prefix_info)
     133                 :          0 :             return true;
     134                 :         54 :         return prefix_info->prefixes < other.prefix_info->prefixes;
     135                 :            :     }
     136                 :            : };
     137                 :            : 
     138                 :            : /** Class used to pass information about a token from lexer to parser.
     139                 :            :  *
     140                 :            :  *  Generally an instance of this class carries term information, but it can be
     141                 :            :  *  used for the start or end of a value range, with some operators (e.g. the
     142                 :            :  *  distance in NEAR/3 or ADJ/3, etc).
     143                 :            :  */
     144                 :      48307 : class Term {
     145                 :            :     State * state;
     146                 :            : 
     147                 :            :   public:
     148                 :            :     string name;
     149                 :            :     const PrefixInfo * prefix_info;
     150                 :            :     string unstemmed;
     151                 :            :     QueryParser::stem_strategy stem;
     152                 :            :     termpos pos;
     153                 :            : 
     154                 :            :     Term(const string &name_, termpos pos_) : name(name_), stem(QueryParser::STEM_NONE), pos(pos_) { }
     155                 :            :     Term(const string &name_) : name(name_), stem(QueryParser::STEM_NONE), pos(0) { }
     156                 :            :     Term(const string &name_, const PrefixInfo * prefix_info_)
     157                 :            :         : name(name_), prefix_info(prefix_info_),
     158                 :            :           stem(QueryParser::STEM_NONE), pos(0) { }
     159                 :          4 :     Term(termpos pos_) : stem(QueryParser::STEM_NONE), pos(pos_) { }
     160                 :      44526 :     Term(State * state_, const string &name_, const PrefixInfo * prefix_info_,
     161                 :            :          const string &unstemmed_,
     162                 :            :          QueryParser::stem_strategy stem_ = QueryParser::STEM_NONE,
     163                 :            :          termpos pos_ = 0)
     164                 :            :         : state(state_), name(name_), prefix_info(prefix_info_),
     165                 :      44526 :           unstemmed(unstemmed_), stem(stem_), pos(pos_) { }
     166                 :            :     // For RANGE tokens.
     167                 :       3777 :     Term(valueno slot, const string &a, const string &b)
     168                 :       3777 :         : name(a), unstemmed(b), pos(slot) { }
     169                 :            : 
     170                 :            :     string make_term(const string & prefix) const;
     171                 :            : 
     172                 :       1210 :     void need_positions() {
     173         [ +  + ]:       1210 :         if (stem == QueryParser::STEM_SOME) stem = QueryParser::STEM_NONE;
     174                 :       1210 :     }
     175                 :            : 
     176                 :          4 :     termpos get_termpos() const { return pos; }
     177                 :            : 
     178                 :         69 :     filter_group_id get_filter_group_id() const {
     179                 :         69 :         return filter_group_id(prefix_info);
     180                 :            :     }
     181                 :            : 
     182                 :            :     Query * as_wildcarded_query(State * state) const;
     183                 :            : 
     184                 :            :     /** Build a query for a term at the very end of the query string when
     185                 :            :      *  FLAG_PARTIAL is in use.
     186                 :            :      *
     187                 :            :      *  This query should match documents containing any terms which start with
     188                 :            :      *  the characters specified, but should give a higher score to exact
     189                 :            :      *  matches (since the user might have finished typing - we simply don't
     190                 :            :      *  know).
     191                 :            :      */
     192                 :            :     Query * as_partial_query(State * state_) const;
     193                 :            : 
     194                 :            :     /** Build a query for a string of CJK characters. */
     195                 :            :     Query * as_cjk_query() const;
     196                 :            : 
     197                 :            :     /// Value range query.
     198                 :            :     Query as_value_range_query() const;
     199                 :            : 
     200                 :            :     Query get_query() const;
     201                 :            : 
     202                 :            :     Query get_query_with_synonyms() const;
     203                 :            : 
     204                 :            :     Query get_query_with_auto_synonyms() const;
     205                 :            : };
     206                 :            : 
     207                 :            : /// Parser State shared between the lexer and the parser.
     208                 :      24980 : class State {
     209                 :            :     QueryParser::Internal * qpi;
     210                 :            : 
     211                 :            :   public:
     212                 :            :     Query query;
     213                 :            :     const char * error;
     214                 :            :     unsigned flags;
     215                 :            : 
     216                 :      24980 :     State(QueryParser::Internal * qpi_, unsigned flags_)
     217                 :      24980 :         : qpi(qpi_), error(NULL), flags(flags_) { }
     218                 :            : 
     219                 :       1660 :     string stem_term(const string &term) {
     220                 :       1660 :         return qpi->stemmer(term);
     221                 :            :     }
     222                 :            : 
     223                 :         38 :     void add_to_stoplist(const Term * term) {
     224                 :         38 :         qpi->stoplist.push_back(term->name);
     225                 :         38 :     }
     226                 :            : 
     227                 :      44370 :     void add_to_unstem(const string & term, const string & unstemmed) {
     228                 :      44370 :         qpi->unstem.insert(make_pair(term, unstemmed));
     229                 :      44370 :     }
     230                 :            : 
     231                 :       3786 :     Term * value_range(const string &a, const string &b) {
     232                 :       3786 :         list<ValueRangeProcessor *>::const_iterator i;
     233 [ +  + ][ +  + ]:       7642 :         for (i = qpi->valrangeprocs.begin(); i != qpi->valrangeprocs.end(); ++i) {
                 [ +  + ]
     234                 :       3856 :             string start = a;
     235                 :       3856 :             string end = b;
     236                 :       3856 :             Xapian::valueno slot = (**i)(start, end);
     237         [ +  + ]:       3856 :             if (slot != Xapian::BAD_VALUENO) {
     238                 :       3856 :                 return new Term(slot, start, end);
     239                 :            :             }
     240                 :            :         }
     241                 :       3786 :         return NULL;
     242                 :            :     }
     243                 :            : 
     244                 :       1551 :     Query::op default_op() const { return qpi->default_op; }
     245                 :            : 
     246                 :        720 :     bool is_stopword(const Term *term) const {
     247 [ +  + ][ +  - ]:        720 :         return qpi->stopper && (*qpi->stopper)(term->name);
     248                 :            :     }
     249                 :            : 
     250                 :      20258 :     Database get_database() const {
     251                 :      20258 :         return qpi->db;
     252                 :            :     }
     253                 :            : 
     254                 :        556 :     const Stopper * get_stopper() const {
     255                 :        556 :         return qpi->stopper;
     256                 :            :     }
     257                 :            : 
     258                 :        558 :     size_t stoplist_size() const {
     259                 :        558 :         return qpi->stoplist.size();
     260                 :            :     }
     261                 :            : 
     262                 :          2 :     void stoplist_resize(size_t s) {
     263                 :          2 :         qpi->stoplist.resize(s);
     264                 :          2 :     }
     265                 :            : };
     266                 :            : 
     267                 :            : string
     268                 :      44370 : Term::make_term(const string & prefix) const
     269                 :            : {
     270                 :      44370 :     string term;
     271         [ +  + ]:      44370 :     if (stem == QueryParser::STEM_SOME) term += 'Z';
     272         [ +  + ]:      44370 :     if (!prefix.empty()) {
     273                 :        185 :         term += prefix;
     274         [ +  + ]:        185 :         if (prefix_needs_colon(prefix, name[0])) term += ':';
     275                 :            :     }
     276         [ +  + ]:      44370 :     if (stem != QueryParser::STEM_NONE) {
     277                 :       1644 :         term += state->stem_term(name);
     278                 :            :     } else {
     279                 :      42726 :         term += name;
     280                 :            :     }
     281                 :            : 
     282         [ +  - ]:      44370 :     if (!unstemmed.empty())
     283                 :      44370 :         state->add_to_unstem(term, unstemmed);
     284                 :          0 :     return term;
     285                 :            : }
     286                 :            : 
     287                 :            : Query
     288                 :      20070 : Term::get_query_with_synonyms() const
     289                 :            : {
     290                 :      20070 :     Query q = get_query();
     291                 :            : 
     292                 :            :     // Handle single-word synonyms with each prefix.
     293                 :      20070 :     const list<string> & prefixes = prefix_info->prefixes;
     294                 :      20070 :     list<string>::const_iterator piter;
     295         [ +  + ]:      40140 :     for (piter = prefixes.begin(); piter != prefixes.end(); ++piter) {
     296                 :            :         // First try the unstemmed term:
     297                 :      20070 :         string term;
     298         [ -  + ]:      20070 :         if (!piter->empty()) {
     299                 :          0 :             term += *piter;
     300         [ #  # ]:          0 :             if (prefix_needs_colon(*piter, name[0])) term += ':';
     301                 :            :         }
     302                 :      20070 :         term += name;
     303                 :            : 
     304                 :      20070 :         Xapian::Database db = state->get_database();
     305                 :      20070 :         Xapian::TermIterator syn = db.synonyms_begin(term);
     306                 :      20070 :         Xapian::TermIterator end = db.synonyms_end(term);
     307   [ +  +  +  + ]:      20070 :         if (syn == end && stem != QueryParser::STEM_NONE) {
                 [ +  + ]
     308                 :            :             // If that has no synonyms, try the stemmed form:
     309                 :         16 :             term = 'Z';
     310         [ -  + ]:         16 :             if (!piter->empty()) {
     311                 :          0 :                 term += *piter;
     312         [ #  # ]:          0 :                 if (prefix_needs_colon(*piter, name[0])) term += ':';
     313                 :            :             }
     314                 :         16 :             term += state->stem_term(name);
     315                 :         16 :             syn = db.synonyms_begin(term);
     316                 :         16 :             end = db.synonyms_end(term);
     317                 :            :         }
     318         [ +  + ]:      30142 :         while (syn != end) {
     319                 :      10072 :             q = Query(Query::OP_SYNONYM, q, Query(*syn, 1, pos));
     320                 :      10072 :             ++syn;
     321                 :            :         }
     322                 :            :     }
     323                 :          0 :     return q;
     324                 :            : }
     325                 :            : 
     326                 :            : Query
     327                 :      32962 : Term::get_query_with_auto_synonyms() const
     328                 :            : {
     329         [ +  + ]:      32962 :     if (state->flags & QueryParser::FLAG_AUTO_SYNONYMS)
     330                 :      20050 :         return get_query_with_synonyms();
     331                 :            : 
     332                 :      32962 :     return get_query();
     333                 :            : }
     334                 :            : 
     335                 :            : static void
     336                 :       1007 : add_to_query(Query *& q, Query::op op, Query * term)
     337                 :            : {
     338                 :            :     Assert(term);
     339         [ +  + ]:       1007 :     if (q) {
     340                 :        927 :         *q = Query(op, *q, *term);
     341         [ +  - ]:        927 :         delete term;
     342                 :            :     } else {
     343                 :         80 :         q = term;
     344                 :            :     }
     345                 :       1007 : }
     346                 :            : 
     347                 :            : static void
     348                 :        476 : add_to_query(Query *& q, Query::op op, const Query & term)
     349                 :            : {
     350         [ +  + ]:        476 :     if (q) {
     351                 :         29 :         *q = Query(op, *q, term);
     352                 :            :     } else {
     353                 :        447 :         q = new Query(term);
     354                 :            :     }
     355                 :        476 : }
     356                 :            : 
     357                 :            : Query
     358                 :      43079 : Term::get_query() const
     359                 :            : {
     360                 :      43079 :     const list<string> & prefixes = prefix_info->prefixes;
     361                 :            :     Assert(prefixes.size() >= 1);
     362                 :      43079 :     list<string>::const_iterator piter = prefixes.begin();
     363                 :      43079 :     Query q(make_term(*piter), 1, pos);
     364         [ +  + ]:      43085 :     while (++piter != prefixes.end()) {
     365                 :          6 :         q = Query(Query::OP_OR, q, Query(make_term(*piter), 1, pos));
     366                 :            :     }
     367                 :          0 :     return q;
     368                 :            : }
     369                 :            : 
     370                 :            : Query *
     371                 :         93 : Term::as_wildcarded_query(State * state_) const
     372                 :            : {
     373                 :         93 :     const Database & db = state_->get_database();
     374                 :         93 :     vector<Query> subqs;
     375                 :            : 
     376                 :         93 :     const list<string> & prefixes = prefix_info->prefixes;
     377                 :         93 :     list<string>::const_iterator piter;
     378         [ +  + ]:        186 :     for (piter = prefixes.begin(); piter != prefixes.end(); ++piter) {
     379                 :         93 :         string root = *piter;
     380                 :         93 :         root += name;
     381                 :         93 :         TermIterator t = db.allterms_begin(root);
     382         [ +  + ]:        238 :         while (t != db.allterms_end(root)) {
     383                 :        145 :             subqs.push_back(Query(*t, 1, pos));
     384                 :        145 :             ++t;
     385                 :            :         }
     386                 :            :     }
     387                 :         93 :     Query * q = new Query(Query::OP_SYNONYM, subqs.begin(), subqs.end());
     388         [ +  - ]:         93 :     delete this;
     389                 :         93 :     return q;
     390                 :            : }
     391                 :            : 
     392                 :            : Query *
     393                 :         81 : Term::as_partial_query(State * state_) const
     394                 :            : {
     395                 :         81 :     const Database & db = state_->get_database();
     396                 :         81 :     vector<Query> subqs_partial; // A synonym of all the partial terms.
     397                 :         81 :     vector<Query> subqs_full; // A synonym of all the full terms.
     398                 :            : 
     399                 :         81 :     const list<string> & prefixes = prefix_info->prefixes;
     400                 :         81 :     list<string>::const_iterator piter;
     401         [ +  + ]:        166 :     for (piter = prefixes.begin(); piter != prefixes.end(); ++piter) {
     402                 :         85 :         string root = *piter;
     403                 :         85 :         root += name;
     404                 :         85 :         TermIterator t = db.allterms_begin(root);
     405         [ +  + ]:        426 :         while (t != db.allterms_end(root)) {
     406                 :        350 :             subqs_partial.push_back(Query(*t, 1, pos));
     407                 :        350 :             ++t;
     408                 :            :         }
     409                 :            :         // Add the term, as it would normally be handled, as an alternative.
     410                 :         76 :         subqs_full.push_back(Query(make_term(*piter), 1, pos));
     411                 :            :     }
     412                 :            :     Query * q = new Query(Query::OP_OR,
     413                 :            :                           Query(Query::OP_SYNONYM,
     414                 :            :                                 subqs_partial.begin(), subqs_partial.end()),
     415                 :            :                           Query(Query::OP_SYNONYM,
     416                 :         72 :                                 subqs_full.begin(), subqs_full.end()));
     417         [ +  - ]:         72 :     delete this;
     418                 :         99 :     return q;
     419                 :            : }
     420                 :            : 
     421                 :            : Query *
     422                 :         22 : Term::as_cjk_query() const
     423                 :            : {
     424                 :         22 :     vector<Query> prefix_cjk;
     425                 :         22 :     const list<string> & prefixes = prefix_info->prefixes;
     426                 :         22 :     list<string>::const_iterator piter;
     427         [ +  + ]:         90 :     for (CJKTokenIterator tk(name); tk != CJKTokenIterator(); ++tk) {
     428         [ +  + ]:        139 :         for (piter = prefixes.begin(); piter != prefixes.end(); ++piter) {
     429                 :         71 :             string cjk = *piter;
     430                 :         71 :             cjk += *tk;
     431                 :         71 :             prefix_cjk.push_back(Query(cjk, 1, pos));
     432                 :            :         }
     433                 :         22 :     }
     434                 :         22 :     Query * q = new Query(Query::OP_AND, prefix_cjk.begin(), prefix_cjk.end());
     435         [ +  - ]:         22 :     delete this;
     436                 :         22 :     return q;
     437                 :            : }
     438                 :            : 
     439                 :            : Query
     440                 :       3777 : Term::as_value_range_query() const
     441                 :            : {
     442                 :       3777 :     Query q;
     443         [ +  + ]:       3777 :     if (unstemmed.empty())
     444                 :          3 :         q = Query(Query::OP_VALUE_GE, pos, name);
     445                 :            :     else
     446                 :       3774 :         q = Query(Query::OP_VALUE_RANGE, pos, name, unstemmed);
     447         [ +  - ]:       3777 :     delete this;
     448                 :          0 :     return q;
     449                 :            : }
     450                 :            : 
     451                 :            : inline bool
     452                 :      65264 : is_phrase_generator(unsigned ch)
     453                 :            : {
     454                 :            :     // These characters generate a phrase search.
     455                 :            :     // Ordered mostly by frequency of calls to this function done when
     456                 :            :     // running queryparsertest.
     457 [ +  + ][ +  + ]:      65264 :     return (ch && ch < 128 && strchr(".-/:\\@", ch) != NULL);
                 [ +  + ]
     458                 :            : }
     459                 :            : 
     460                 :            : inline bool
     461                 :       2230 : is_stem_preventer(unsigned ch)
     462                 :            : {
     463 [ +  - ][ +  + ]:       2230 :     return (ch && ch < 128 && strchr("(/\\@<>=*[{\"", ch) != NULL);
                 [ +  + ]
     464                 :            : }
     465                 :            : 
     466                 :            : inline bool
     467                 :       3664 : should_stem(const string & term)
     468                 :            : {
     469                 :            :     const unsigned int SHOULD_STEM_MASK =
     470                 :            :         (1 << Unicode::LOWERCASE_LETTER) |
     471                 :            :         (1 << Unicode::TITLECASE_LETTER) |
     472                 :            :         (1 << Unicode::MODIFIER_LETTER) |
     473                 :       3664 :         (1 << Unicode::OTHER_LETTER);
     474                 :       3664 :     Utf8Iterator u(term);
     475                 :       3664 :     return ((SHOULD_STEM_MASK >> Unicode::get_category(*u)) & 1);
     476                 :            : }
     477                 :            : 
     478                 :            : /** Value representing "ignore this" when returned by check_infix() or
     479                 :            :  *  check_infix_digit().
     480                 :            :  */
     481                 :            : const unsigned UNICODE_IGNORE(-1);
     482                 :            : 
     483                 :      22526 : inline unsigned check_infix(unsigned ch) {
     484 [ +  + ][ +  + ]:      22526 :     if (ch == '\'' || ch == '&' || ch == 0xb7 || ch == 0x5f4 || ch == 0x2027) {
         [ +  + ][ +  - ]
                 [ -  + ]
     485                 :            :         // Unicode includes all these except '&' in its word boundary rules,
     486                 :            :         // as well as 0x2019 (which we handle below) and ':' (for Swedish
     487                 :            :         // apparently, but we ignore this for now as it's problematic in
     488                 :            :         // real world cases).
     489                 :         11 :         return ch;
     490                 :            :     }
     491                 :            :     // 0x2019 is Unicode apostrophe and single closing quote.
     492                 :            :     // 0x201b is Unicode single opening quote with the tail rising.
     493 [ +  - ][ -  + ]:      22515 :     if (ch == 0x2019 || ch == 0x201b) return '\'';
     494 [ +  + ][ -  + ]:      22515 :     if (ch >= 0x200b && (ch <= 0x200d || ch == 0x2060 || ch == 0xfeff))
         [ #  # ][ #  # ]
     495                 :          6 :         return UNICODE_IGNORE;
     496                 :      22526 :     return 0;
     497                 :            : }
     498                 :            : 
     499                 :         91 : inline unsigned check_infix_digit(unsigned ch) {
     500                 :            :     // This list of characters comes from Unicode's word identifying algorithm.
     501         [ +  + ]:         91 :     switch (ch) {
     502                 :            :         case ',':
     503                 :            :         case '.':
     504                 :            :         case ';':
     505                 :            :         case 0x037e: // GREEK QUESTION MARK
     506                 :            :         case 0x0589: // ARMENIAN FULL STOP
     507                 :            :         case 0x060D: // ARABIC DATE SEPARATOR
     508                 :            :         case 0x07F8: // NKO COMMA
     509                 :            :         case 0x2044: // FRACTION SLASH
     510                 :            :         case 0xFE10: // PRESENTATION FORM FOR VERTICAL COMMA
     511                 :            :         case 0xFE13: // PRESENTATION FORM FOR VERTICAL COLON
     512                 :            :         case 0xFE14: // PRESENTATION FORM FOR VERTICAL SEMICOLON
     513                 :         57 :             return ch;
     514                 :            :     }
     515 [ -  + ][ #  # ]:         34 :     if (ch >= 0x200b && (ch <= 0x200d || ch == 0x2060 || ch == 0xfeff))
         [ #  # ][ #  # ]
     516                 :          0 :         return UNICODE_IGNORE;
     517                 :         91 :     return 0;
     518                 :            : }
     519                 :            : 
     520                 :            : struct yyParser;
     521                 :            : 
     522                 :            : // Prototype the functions lemon generates.
     523                 :            : static yyParser *ParseAlloc();
     524                 :            : static void ParseFree(yyParser *);
     525                 :            : static void Parse(yyParser *, int, Term *, State *);
     526                 :            : static void yy_parse_failed(yyParser *);
     527                 :            : 
     528                 :            : void
     529                 :         55 : QueryParser::Internal::add_prefix(const string &field, const string &prefix,
     530                 :            :                                   filter_type type)
     531                 :            : {
     532                 :         55 :     map<string, PrefixInfo>::iterator p = prefixmap.find(field);
     533         [ +  + ]:         55 :     if (p == prefixmap.end()) {
     534                 :         45 :         prefixmap.insert(make_pair(field, PrefixInfo(type, prefix)));
     535                 :            :     } else {
     536                 :            :         // Check that this is the same type of filter as the existing one(s).
     537         [ +  + ]:         10 :         if (p->second.type != type) {
     538                 :          2 :             throw Xapian::InvalidOperationError("Can't use add_prefix() and add_boolean_prefix() on the same field name, or add_boolean_prefix() with different values of the 'exclusive' parameter");
     539                 :            :         }
     540                 :          8 :         p->second.prefixes.push_back(prefix);
     541                 :            :    }
     542                 :         53 : }
     543                 :            : 
     544                 :            : string
     545                 :      44620 : QueryParser::Internal::parse_term(Utf8Iterator &it, const Utf8Iterator &end,
     546                 :            :                                   bool cjk_ngram, bool & is_cjk_term,
     547                 :            :                                   bool &was_acronym)
     548                 :            : {
     549                 :      44620 :     string term;
     550                 :            :     // Look for initials separated by '.' (e.g. P.T.O., U.N.C.L.E).
     551                 :            :     // Don't worry if there's a trailing '.' or not.
     552         [ +  + ]:      44620 :     if (U_isupper(*it)) {
     553                 :       1041 :         string t;
     554                 :       1041 :         Utf8Iterator p = it;
     555   [ +  +  +  + ]:       1064 :         do {
         [ +  + ][ +  + ]
                 [ +  + ]
     556                 :       1064 :             Unicode::append_utf8(t, *p++);
     557                 :            :         } while (p != end && *p == '.' && ++p != end && U_isupper(*p));
     558                 :            :         // One letter does not make an acronym!  If we handled a single
     559                 :            :         // uppercase letter here, we wouldn't catch M&S below.
     560         [ +  + ]:       1041 :         if (t.length() > 1) {
     561                 :            :             // Check there's not a (lower case) letter or digit
     562                 :            :             // immediately after it.
     563                 :            :             // FIXME: should I.B.M..P.T.O be a range search?
     564 [ +  + ][ +  - ]:          7 :             if (p == end || !is_wordchar(*p)) {
                 [ +  - ]
     565         [ -  + ]:          7 :                 it = p;
     566                 :          7 :                 swap(term, t);
     567                 :            :             }
     568                 :       1041 :         }
     569                 :            :     }
     570                 :      44620 :     was_acronym = !term.empty();
     571                 :            : 
     572   [ +  -  +  + ]:      44620 :     if (cjk_ngram && term.empty() && CJK::codepoint_is_cjk(*it)) {
         [ +  + ][ +  + ]
     573                 :         23 :         term = CJK::get_cjk(it);
     574                 :         23 :         is_cjk_term = true;
     575                 :            :     }
     576                 :            : 
     577         [ +  + ]:      44620 :     if (term.empty()) {
     578                 :      44590 :         unsigned prevch = *it;
     579                 :      44590 :         Unicode::append_utf8(term, prevch);
     580         [ +  + ]:     142000 :         while (++it != end) {
     581 [ +  - ][ +  + ]:     141414 :             if (cjk_ngram && CJK::codepoint_is_cjk(*it)) break;
                 [ +  + ]
     582                 :     141410 :             unsigned ch = *it;
     583         [ +  + ]:     141410 :             if (!is_wordchar(ch)) {
     584                 :            :                 // Treat a single embedded '&' or "'" or similar as a word
     585                 :            :                 // character (e.g. AT&T, Fred's).  Also, normalise
     586                 :            :                 // apostrophes to ASCII apostrophe.
     587                 :      44074 :                 Utf8Iterator p = it;
     588                 :      44074 :                 ++p;
     589   [ +  +  +  + ]:      44074 :                 if (p == end || !is_wordchar(*p)) break;
                 [ +  + ]
     590                 :      22617 :                 unsigned nextch = *p;
     591   [ +  +  +  + ]:      22617 :                 if (is_digit(prevch) && is_digit(nextch)) {
                 [ +  + ]
     592                 :         91 :                     ch = check_infix_digit(ch);
     593                 :            :                 } else {
     594                 :      22526 :                     ch = check_infix(ch);
     595                 :            :                 }
     596         [ +  + ]:      22617 :                 if (!ch) break;
     597         [ +  + ]:         74 :                 if (ch == UNICODE_IGNORE)
     598                 :          6 :                     continue;
     599                 :            :             }
     600                 :      97404 :             Unicode::append_utf8(term, ch);
     601                 :      97404 :             prevch = ch;
     602                 :            :         }
     603 [ +  + ][ +  + ]:      44590 :         if (it != end && is_suffix(*it)) {
                 [ +  + ]
     604                 :         70 :             string suff_term = term;
     605                 :         70 :             Utf8Iterator p = it;
     606                 :            :             // Keep trailing + (e.g. C++, Na+) or # (e.g. C#).
     607         [ +  + ]:         87 :             do {
     608         [ -  + ]:         87 :                 if (suff_term.size() - term.size() == 3) {
     609                 :          0 :                     suff_term.resize(0);
     610                 :          0 :                     break;
     611                 :            :                 }
     612                 :         87 :                 suff_term += *p;
     613                 :            :             } while (is_suffix(*++p));
     614 [ +  - ][ +  + ]:         70 :             if (!suff_term.empty() && (p == end || !is_wordchar(*p))) {
         [ +  + ][ +  + ]
     615                 :            :                 // If the suffixed term doesn't exist, check that the
     616                 :            :                 // non-suffixed term does.  This also takes care of
     617                 :            :                 // the case when QueryParser::set_database() hasn't
     618                 :            :                 // been called.
     619                 :         38 :                 bool use_suff_term = false;
     620                 :         38 :                 string lc = Unicode::tolower(suff_term);
     621         [ -  + ]:         38 :                 if (db.term_exists(lc)) {
     622                 :          0 :                     use_suff_term = true;
     623                 :            :                 } else {
     624                 :         38 :                     lc = Unicode::tolower(term);
     625         [ +  + ]:         38 :                     if (!db.term_exists(lc)) use_suff_term = true;
     626                 :            :                 }
     627         [ +  + ]:         38 :                 if (use_suff_term) {
     628                 :         37 :                     term = suff_term;
     629         [ -  + ]:         37 :                     it = p;
     630                 :         38 :                 }
     631                 :      44620 :             }
     632                 :            :         }
     633                 :            :     }
     634                 :          0 :     return term;
     635                 :            : }
     636                 :            : 
     637                 :            : class ParserHandler {
     638                 :            :     yyParser * parser;
     639                 :            : 
     640                 :            :   public:
     641                 :      24980 :     explicit ParserHandler(yyParser * parser_) : parser(parser_) { }
     642                 :      74369 :     operator yyParser*() { return parser; }
     643                 :      24980 :     ~ParserHandler() { ParseFree(parser); }
     644                 :            : };
     645                 :            : 
     646                 :            : Query
     647                 :      24980 : QueryParser::Internal::parse_query(const string &qs, unsigned flags,
     648                 :            :                                    const string &default_prefix)
     649                 :            : {
     650                 :      24980 :     bool cjk_ngram = true; // FIXME: set from flag or env var or something.
     651                 :            : 
     652                 :            :     // Set value_ranges if we may have to handle value ranges in the query.
     653                 :            :     bool value_ranges;
     654 [ +  + ][ +  + ]:      24980 :     value_ranges = !valrangeprocs.empty() && (qs.find("..") != string::npos);
     655                 :            : 
     656                 :      24980 :     termpos term_pos = 1;
     657                 :      24980 :     Utf8Iterator it(qs), end;
     658                 :            : 
     659                 :      24980 :     State state(this, flags);
     660                 :            : 
     661                 :            :     // To successfully apply more than one spelling correction to a query
     662                 :            :     // string, we must keep track of the offset due to previous corrections.
     663                 :      24980 :     int correction_offset = 0;
     664                 :      24980 :     corrected_query.resize(0);
     665                 :            : 
     666                 :            :     // Stack of prefixes, used for phrases and subexpressions.
     667                 :      24980 :     list<const PrefixInfo *> prefix_stack;
     668                 :            : 
     669                 :            :     // If default_prefix is specified, use it.  Otherwise, use any list
     670                 :            :     // that has been set for the empty prefix.
     671                 :      24980 :     const PrefixInfo def_pfx(NON_BOOLEAN, default_prefix);
     672                 :            :     {
     673                 :      24980 :         const PrefixInfo * default_prefix_info = &def_pfx;
     674         [ +  + ]:      24980 :         if (default_prefix.empty()) {
     675                 :      24972 :             map<string, PrefixInfo>::const_iterator f = prefixmap.find("");
     676         [ +  + ]:      24972 :             if (f != prefixmap.end()) default_prefix_info = &(f->second);
     677                 :            :         }
     678                 :            : 
     679                 :            :         // We always have the current prefix on the top of the stack.
     680                 :      24980 :         prefix_stack.push_back(default_prefix_info);
     681                 :            :     }
     682                 :            : 
     683                 :      24980 :     ParserHandler pParser(ParseAlloc());
     684                 :            : 
     685                 :      24980 :     unsigned newprev = ' ';
     686                 :            : main_lex_loop:
     687                 :            :     enum {
     688                 :            :         DEFAULT, IN_QUOTES, IN_PREFIXED_QUOTES, IN_PHRASED_TERM, IN_GROUP,
     689                 :            :         IN_GROUP2, EXPLICIT_SYNONYM
     690                 :      28757 :     } mode = DEFAULT;
     691 [ +  + ][ +  + ]:      74645 :     while (it != end && !state.error) {
                 [ +  + ]
     692                 :      50239 :         bool last_was_operator = false;
     693                 :      50239 :         bool last_was_operator_needing_term = false;
     694         [ +  + ]:      50239 :         if (mode == EXPLICIT_SYNONYM) mode = DEFAULT;
     695                 :      50239 :         if (false) {
     696                 :            : just_had_operator:
     697         [ +  + ]:        168 :             if (it == end) break;
     698                 :        157 :             mode = DEFAULT;
     699                 :        157 :             last_was_operator_needing_term = false;
     700                 :        157 :             last_was_operator = true;
     701                 :            :         }
     702                 :      50396 :         if (false) {
     703                 :            : just_had_operator_needing_term:
     704                 :        132 :             last_was_operator_needing_term = true;
     705                 :        132 :             last_was_operator = true;
     706                 :            :         }
     707         [ +  + ]:      50528 :         if (mode == IN_PHRASED_TERM) mode = DEFAULT;
     708         [ +  + ]:      50528 :         if (is_whitespace(*it)) {
     709                 :       1040 :             newprev = ' ';
     710                 :       1040 :             ++it;
     711         [ -  + ]:       1040 :             it = find_if(it, end, is_not_whitespace);
     712         [ +  + ]:       1040 :             if (it == end) break;
     713                 :            :         }
     714                 :            : 
     715 [ +  + ][ +  + ]:      50527 :         if (value_ranges &&
         [ +  + ][ -  + ]
     716                 :            :             (mode == DEFAULT || mode == IN_GROUP || mode == IN_GROUP2)) {
     717                 :            :             // Scan forward to see if this could be the "start of range"
     718                 :            :             // token.  Sadly this has O(n^2) tendencies, though at least
     719                 :            :             // "n" is the number of words in a query which is likely to
     720                 :            :             // remain fairly small.  FIXME: can we tokenise more elegantly?
     721                 :       3807 :             Utf8Iterator it_initial = it;
     722                 :       3807 :             Utf8Iterator p = it;
     723                 :       3807 :             unsigned ch = 0;
     724         [ +  + ]:      17997 :             while (p != end) {
     725 [ +  + ][ +  + ]:      17990 :                 if (ch == '.' && *p == '.') {
                 [ +  + ]
     726                 :       3786 :                     string a;
     727         [ +  + ]:      17869 :                     while (it != p) {
     728                 :      14083 :                         Unicode::append_utf8(a, *it++);
     729                 :            :                     }
     730                 :            :                     // Trim off the trailing ".".
     731                 :       3786 :                     a.resize(a.size() - 1);
     732                 :       3786 :                     ++p;
     733                 :            :                     // Either end of the range can be empty (for an open-ended
     734                 :            :                     // range) but both can't be empty.
     735   [ +  +  +  - ]:       3786 :                     if (!a.empty() || (p != end && *p > ' ' && *p != ')')) {
         [ +  - ][ +  - ]
                 [ +  - ]
     736                 :       3786 :                         string b;
     737                 :            :                         // Allow any character except whitespace and ')' in the
     738                 :            :                         // upper bound.  Or should we be consistent with the
     739                 :            :                         // lower bound?
     740 [ +  + ][ +  + ]:      14031 :                         while (p != end && *p > ' ' && *p != ')') {
         [ +  - ][ +  + ]
     741                 :      10245 :                             Unicode::append_utf8(b, *p++);
     742                 :            :                         }
     743                 :       3786 :                         Term * range = state.value_range(a, b);
     744         [ +  + ]:       3786 :                         if (!range) {
     745                 :          9 :                             state.error = "Unknown range operation";
     746         [ +  + ]:          9 :                             if (a.find(':', 1) == string::npos) {
     747                 :            :                                 goto done;
     748                 :            :                             }
     749                 :            :                             // Might be a boolean filter with ".." in.  Leave
     750                 :            :                             // state.error in case it isn't.
     751                 :          2 :                             it = it_initial;
     752                 :            :                             break;
     753                 :            :                         }
     754      [ +  +  + ]:       3786 :                         Parse(pParser, RANGE, range, &state);
     755                 :            :                     }
     756                 :       3777 :                     it = p;
     757      [ +  +  + ]:       3786 :                     goto main_lex_loop;
     758                 :            :                 }
     759                 :      14204 :                 ch = *p;
     760   [ +  +  +  + ]:      14204 :                 if (!(is_wordchar(ch) || is_currency(ch) ||
         [ +  - ][ +  + ]
                 [ +  + ]
     761                 :         14 :                       (ch < 128 && strchr("%,-./:@", ch)))) break;
     762                 :      14190 :                 ++p;
     763                 :            :             }
     764                 :            :         }
     765                 :            : 
     766         [ +  + ]:      46743 :         if (!is_wordchar(*it)) {
     767                 :       2371 :             unsigned prev = newprev;
     768                 :       2371 :             unsigned ch = *it++;
     769                 :       2371 :             newprev = ch;
     770                 :            :             // Drop out of IN_GROUP mode.
     771   [ +  -  +  + ]:       2371 :             if (mode == IN_GROUP || mode == IN_GROUP2)
     772                 :         18 :                 mode = DEFAULT;
     773 [ +  +  +  +  + :       2371 :             switch (ch) {
                      + ]
     774                 :            :               case '"': // Quoted phrase.
     775         [ +  + ]:        573 :                 if (mode == DEFAULT) {
     776                 :            :                     // Skip whitespace.
     777         [ -  + ]:        401 :                     it = find_if(it, end, is_not_whitespace);
     778         [ +  + ]:        401 :                     if (it == end) {
     779                 :            :                         // Ignore an unmatched " at the end of the query to
     780                 :            :                         // avoid generating an empty pair of QUOTEs which will
     781                 :            :                         // cause a parse error.
     782                 :         75 :                         goto done;
     783                 :            :                     }
     784         [ +  + ]:        326 :                     if (*it == '"') {
     785                 :            :                         // Ignore empty "" (but only if we're not already
     786                 :            :                         // IN_QUOTES as we don't merge two adjacent quoted
     787                 :            :                         // phrases!)
     788                 :          6 :                         newprev = *it++;
     789                 :          6 :                         break;
     790                 :            :                     }
     791                 :            :                 }
     792         [ +  + ]:        492 :                 if (flags & QueryParser::FLAG_PHRASE) {
     793                 :        369 :                     Parse(pParser, QUOTE, NULL, &state);
     794         [ +  + ]:        369 :                     if (mode == DEFAULT) {
     795                 :        197 :                         mode = IN_QUOTES;
     796                 :            :                     } else {
     797                 :            :                         // Remove the prefix we pushed for this phrase.
     798         [ +  + ]:        172 :                         if (mode == IN_PREFIXED_QUOTES)
     799                 :          8 :                             prefix_stack.pop_back();
     800                 :        172 :                         mode = DEFAULT;
     801                 :            :                     }
     802                 :            :                 }
     803                 :        492 :                 break;
     804                 :            : 
     805                 :            :               case '+': case '-': // Loved or hated term/phrase/subexpression.
     806                 :            :                 // Ignore + or - at the end of the query string.
     807         [ +  + ]:        284 :                 if (it == end) goto done;
     808 [ +  + ][ +  + ]:        278 :                 if (prev > ' ' && prev != '(') {
     809                 :            :                     // Or if not after whitespace or an open bracket.
     810                 :        101 :                     break;
     811                 :            :                 }
     812 [ +  + ][ +  - ]:        177 :                 if (is_whitespace(*it) || *it == '+' || *it == '-') {
         [ +  + ][ +  + ]
     813                 :            :                     // Ignore + or - followed by a space, or further + or -.
     814                 :            :                     // Postfix + (such as in C++ and H+) is handled as part of
     815                 :            :                     // the term lexing code in parse_term().
     816                 :         34 :                     newprev = *it++;
     817                 :         34 :                     break;
     818                 :            :                 }
     819 [ +  + ][ +  + ]:        143 :                 if (mode == DEFAULT && (flags & FLAG_LOVEHATE)) {
     820                 :            :                     int token;
     821         [ +  + ]:        112 :                     if (ch == '+') {
     822                 :         59 :                         token = LOVE;
     823         [ +  + ]:         53 :                     } else if (last_was_operator) {
     824                 :          6 :                         token = HATE_AFTER_AND;
     825                 :            :                     } else {
     826                 :         47 :                         token = HATE;
     827                 :            :                     }
     828                 :        112 :                     Parse(pParser, token, NULL, &state);
     829                 :        112 :                     goto just_had_operator_needing_term;
     830                 :            :                 }
     831                 :            :                 // Need to prevent the term after a LOVE or HATE starting a
     832                 :            :                 // term group...
     833                 :         31 :                 break;
     834                 :            : 
     835                 :            :               case '(': // Bracketed subexpression.
     836                 :            :                 // Skip whitespace.
     837         [ -  + ]:        417 :                 it = find_if(it, end, is_not_whitespace);
     838                 :            :                 // Ignore ( at the end of the query string.
     839         [ -  + ]:        417 :                 if (it == end) goto done;
     840 [ +  + ][ +  + ]:        417 :                 if (prev > ' ' && strchr("()+-", prev) == NULL) {
     841                 :            :                     // Or if not after whitespace or a bracket or '+' or '-'.
     842                 :        180 :                     break;
     843                 :            :                 }
     844         [ -  + ]:        237 :                 if (*it == ')') {
     845                 :            :                     // Ignore empty ().
     846                 :          0 :                     newprev = *it++;
     847                 :          0 :                     break;
     848                 :            :                 }
     849 [ +  + ][ +  + ]:        237 :                 if (mode == DEFAULT && (flags & FLAG_BOOLEAN)) {
     850                 :        205 :                     prefix_stack.push_back(prefix_stack.back());
     851                 :        205 :                     Parse(pParser, BRA, NULL, &state);
     852                 :            :                 }
     853                 :        237 :                 break;
     854                 :            : 
     855                 :            :               case ')': // End of bracketed subexpression.
     856 [ +  + ][ +  + ]:        408 :                 if (mode == DEFAULT && (flags & FLAG_BOOLEAN)) {
     857                 :            :                     // Remove the prefix we pushed for the corresponding BRA.
     858                 :            :                     // If brackets are unmatched, it's a syntax error, but
     859                 :            :                     // that's no excuse to SEGV!
     860         [ +  + ]:        283 :                     if (prefix_stack.size() > 1) prefix_stack.pop_back();
     861                 :        283 :                     Parse(pParser, KET, NULL, &state);
     862                 :            :                 }
     863                 :        408 :                 break;
     864                 :            : 
     865                 :            :               case '~': // Synonym expansion.
     866                 :            :                 // Ignore at the end of the query string.
     867         [ -  + ]:         50 :                 if (it == end) goto done;
     868 [ +  + ][ +  + ]:         50 :                 if (mode == DEFAULT && (flags & FLAG_SYNONYM)) {
     869 [ +  + ][ -  + ]:         21 :                     if (prev > ' ' && strchr("+-(", prev) == NULL) {
     870                 :            :                         // Or if not after whitespace, +, -, or an open bracket.
     871                 :          0 :                         break;
     872                 :            :                     }
     873         [ +  + ]:         21 :                     if (!is_wordchar(*it)) {
     874                 :            :                         // Ignore if not followed by a word character.
     875                 :          1 :                         break;
     876                 :            :                     }
     877                 :         20 :                     Parse(pParser, SYNONYM, NULL, &state);
     878                 :         20 :                     mode = EXPLICIT_SYNONYM;
     879                 :         20 :                     goto just_had_operator_needing_term;
     880                 :            :                 }
     881                 :            :                 break;
     882                 :            :             }
     883                 :            :             // Skip any other characters.
     884                 :       2158 :             continue;
     885                 :            :         }
     886                 :            : 
     887                 :            :         Assert(is_wordchar(*it));
     888                 :            : 
     889                 :      44372 :         size_t term_start_index = it.raw() - qs.data();
     890                 :            : 
     891                 :      44372 :         newprev = 'A'; // Any letter will do...
     892                 :            : 
     893                 :            :         // A term, a prefix, or a boolean operator.
     894                 :      44372 :         const PrefixInfo * prefix_info = NULL;
     895   [ +  +  +  + ]:      44372 :         if ((mode == DEFAULT || mode == IN_GROUP || mode == IN_GROUP2 || mode == EXPLICIT_SYNONYM) &&
         [ +  + ][ +  + ]
         [ +  + ][ +  + ]
     896                 :            :             !prefixmap.empty()) {
     897                 :            :             // Check for a fieldname prefix (e.g. title:historical).
     898                 :       3082 :             Utf8Iterator p = find_if(it, end, is_not_wordchar);
     899   [ +  +  +  + ]:       3082 :             if (p != end && *p == ':' && ++p != end && *p > ' ' && *p != ')') {
         [ +  - ][ +  + ]
         [ +  - ][ +  + ]
     900                 :        172 :                 string field;
     901                 :        172 :                 p = it;
     902         [ +  + ]:       1124 :                 while (*p != ':')
     903                 :        952 :                     Unicode::append_utf8(field, *p++);
     904                 :        172 :                 map<string, PrefixInfo>::const_iterator f;
     905                 :        172 :                 f = prefixmap.find(field);
     906         [ +  + ]:        172 :                 if (f != prefixmap.end()) {
     907                 :            :                     // Special handling for prefixed fields, depending on the
     908                 :            :                     // type of the prefix.
     909                 :        148 :                     unsigned ch = *++p;
     910                 :        148 :                     prefix_info = &(f->second);
     911                 :            : 
     912         [ +  + ]:        148 :                     if (prefix_info->type != NON_BOOLEAN) {
     913                 :            :                         // Drop out of IN_GROUP if we're in it.
     914 [ +  + ][ +  + ]:         74 :                         if (mode == IN_GROUP || mode == IN_GROUP2)
     915                 :          7 :                             mode = DEFAULT;
     916                 :         74 :                         it = p;
     917                 :         74 :                         string name;
     918   [ +  -  +  + ]:         74 :                         if (it != end && *it == '"') {
                 [ +  + ]
     919                 :            :                             // Quoted boolean term (can contain any character).
     920                 :          3 :                             ++it;
     921         [ +  + ]:         39 :                             while (it != end) {
     922         [ +  + ]:         36 :                                 if (*it == '"') {
     923                 :            :                                     // Interpret "" as an escaped ".
     924 [ +  + ][ -  + ]:          3 :                                     if (++it == end || *it != '"')
                 [ +  + ]
     925                 :          2 :                                         break;
     926                 :            :                                 }
     927                 :         34 :                                 Unicode::append_utf8(name, *it++);
     928                 :            :                             }
     929                 :            :                         } else {
     930                 :            :                             // Can't boolean filter prefix a subexpression, so
     931                 :            :                             // just use anything following the prefix until the
     932                 :            :                             // next space or ')' as part of the boolean filter
     933                 :            :                             // term.
     934 [ +  + ][ +  + ]:        527 :                             while (it != end && *it > ' ' && *it != ')')
         [ +  + ][ +  + ]
     935                 :        456 :                                 Unicode::append_utf8(name, *it++);
     936                 :            :                         }
     937                 :            :                         // Build the unstemmed form in field.
     938                 :         74 :                         field += ':';
     939                 :         74 :                         field += name;
     940                 :            :                         // Clear any pending value range error.
     941                 :         74 :                         state.error = NULL;
     942                 :         74 :                         Term * token = new Term(&state, name, prefix_info, field);
     943                 :         74 :                         Parse(pParser, BOOLEAN_FILTER, token, &state);
     944                 :         74 :                         continue;
     945                 :            :                     }
     946                 :            : 
     947 [ +  + ][ +  - ]:         74 :                     if (ch == '"' && (flags & FLAG_PHRASE)) {
     948                 :            :                         // Prefixed phrase, e.g.: subject:"space flight"
     949                 :          8 :                         mode = IN_PREFIXED_QUOTES;
     950                 :          8 :                         Parse(pParser, QUOTE, NULL, &state);
     951                 :          8 :                         it = p;
     952                 :          8 :                         newprev = ch;
     953                 :          8 :                         ++it;
     954                 :          8 :                         prefix_stack.push_back(prefix_info);
     955                 :          8 :                         continue;
     956                 :            :                     }
     957                 :            : 
     958 [ +  + ][ +  - ]:         66 :                     if (ch == '(' && (flags & FLAG_BOOLEAN)) {
     959                 :            :                         // Prefixed subexpression, e.g.: title:(fast NEAR food)
     960                 :         11 :                         mode = DEFAULT;
     961                 :         11 :                         Parse(pParser, BRA, NULL, &state);
     962                 :         11 :                         it = p;
     963                 :         11 :                         newprev = ch;
     964                 :         11 :                         ++it;
     965                 :         11 :                         prefix_stack.push_back(prefix_info);
     966                 :         11 :                         continue;
     967                 :            :                     }
     968                 :            : 
     969         [ +  + ]:         55 :                     if (ch != ':') {
     970                 :            :                         // Allow 'path:/usr/local' but not 'foo::bar::baz'.
     971         [ +  + ]:         54 :                         while (is_phrase_generator(ch)) {
     972         [ +  + ]:          7 :                             if (++p == end)
     973                 :          2 :                                 goto not_prefix;
     974                 :          5 :                             ch = *p;
     975                 :            :                         }
     976                 :            :                     }
     977                 :            : 
     978         [ +  + ]:         53 :                     if (is_wordchar(ch)) {
     979                 :            :                         // Prefixed term.
     980                 :         45 :                         it = p;
     981                 :            :                     } else {
     982                 :            : not_prefix:
     983                 :            :                         // It looks like a prefix but isn't, so parse it as
     984                 :            :                         // text instead.
     985                 :         79 :                         prefix_info = NULL;
     986                 :            :                     }
     987         [ +  + ]:        172 :                 }
     988                 :            :             }
     989                 :            :         }
     990                 :            : 
     991                 :            : phrased_term:
     992                 :            :         bool was_acronym;
     993                 :      44620 :         bool is_cjk_term = false;
     994                 :      44620 :         string term = parse_term(it, end, cjk_ngram, is_cjk_term, was_acronym);
     995                 :            : 
     996                 :            :         // Boolean operators.
     997   [ +  +  +  + ]:      44620 :         if ((mode == DEFAULT || mode == IN_GROUP || mode == IN_GROUP2) &&
         [ +  + ][ +  + ]
         [ +  + ][ +  + ]
         [ +  + ][ +  + ]
         [ +  + ][ +  + ]
     998                 :            :             (flags & FLAG_BOOLEAN) &&
     999                 :            :             // Don't want to interpret A.N.D. as an AND operator.
    1000                 :            :             !was_acronym &&
    1001                 :            :             !prefix_info &&
    1002                 :            :             term.size() >= 2 && term.size() <= 4 && U_isalpha(term[0])) {
    1003                 :            : 
    1004                 :      41164 :             string op = term;
    1005         [ +  + ]:      41164 :             if (flags & FLAG_BOOLEAN_ANY_CASE) {
    1006         [ +  + ]:         22 :                 for (string::iterator i = op.begin(); i != op.end(); ++i) {
    1007                 :         16 :                     *i = C_toupper(*i);
    1008                 :            :                 }
    1009                 :            :             }
    1010         [ +  + ]:      41164 :             if (op.size() == 3) {
    1011         [ +  + ]:      40522 :                 if (op == "AND") {
    1012                 :         61 :                     Parse(pParser, AND, NULL, &state);
    1013                 :            :                     goto just_had_operator;
    1014                 :            :                 }
    1015         [ +  + ]:      40461 :                 if (op == "NOT") {
    1016                 :         35 :                     Parse(pParser, NOT, NULL, &state);
    1017                 :            :                     goto just_had_operator;
    1018                 :            :                 }
    1019         [ +  + ]:      40426 :                 if (op == "XOR") {
    1020                 :          9 :                     Parse(pParser, XOR, NULL, &state);
    1021                 :            :                     goto just_had_operator;
    1022                 :            :                 }
    1023         [ +  + ]:      40417 :                 if (op == "ADJ") {
    1024 [ +  - ][ +  + ]:          7 :                     if (it != end && *it == '/') {
                 [ +  + ]
    1025                 :          5 :                         size_t width = 0;
    1026                 :          5 :                         Utf8Iterator p = it;
    1027 [ +  - ][ +  + ]:          7 :                         while (++p != end && U_isdigit(*p)) {
                 [ +  + ]
    1028                 :          2 :                             width = (width * 10) + (*p - '0');
    1029                 :            :                         }
    1030 [ +  + ][ +  - ]:          5 :                         if (width && (p == end || is_whitespace(*p))) {
         [ +  - ][ +  + ]
    1031                 :          2 :                             it = p;
    1032                 :          2 :                             Parse(pParser, ADJ, new Term(width), &state);
    1033                 :            :                             goto just_had_operator;
    1034                 :            :                         }
    1035                 :            :                     } else {
    1036                 :          2 :                         Parse(pParser, ADJ, NULL, &state);
    1037                 :            :                         goto just_had_operator;
    1038                 :            :                     }
    1039                 :            :                 }
    1040         [ +  + ]:        642 :             } else if (op.size() == 2) {
    1041         [ +  + ]:        276 :                 if (op == "OR") {
    1042                 :         39 :                     Parse(pParser, OR, NULL, &state);
    1043                 :            :                     goto just_had_operator;
    1044                 :            :                 }
    1045         [ +  - ]:        366 :             } else if (op.size() == 4) {
    1046         [ +  + ]:        366 :                 if (op == "NEAR") {
    1047 [ +  - ][ +  + ]:         23 :                     if (it != end && *it == '/') {
                 [ +  + ]
    1048                 :          5 :                         size_t width = 0;
    1049                 :          5 :                         Utf8Iterator p = it;
    1050 [ +  - ][ +  + ]:          7 :                         while (++p != end && U_isdigit(*p)) {
                 [ +  + ]
    1051                 :          2 :                             width = (width * 10) + (*p - '0');
    1052                 :            :                         }
    1053 [ +  + ][ +  - ]:          5 :                         if (width && (p == end || is_whitespace(*p))) {
         [ +  - ][ +  + ]
    1054                 :          2 :                             it = p;
    1055                 :          2 :                             Parse(pParser, NEAR, new Term(width), &state);
    1056                 :            :                             goto just_had_operator;
    1057                 :            :                         }
    1058                 :            :                     } else {
    1059                 :      41014 :                         Parse(pParser, NEAR, NULL, &state);
    1060                 :            :                         goto just_had_operator;
    1061                 :            :                     }
    1062                 :            :                 }
    1063         [ +  + ]:      41164 :             }
    1064                 :            :         }
    1065                 :            : 
    1066                 :            :         // If no prefix is set, use the default one.
    1067         [ +  + ]:      44452 :         if (!prefix_info) prefix_info = prefix_stack.back();
    1068                 :            : 
    1069                 :            :         Assert(prefix_info->type == NON_BOOLEAN);
    1070                 :            : 
    1071                 :            :         {
    1072                 :      44452 :             string unstemmed_term(term);
    1073                 :      44452 :             term = Unicode::tolower(term);
    1074                 :            : 
    1075                 :            :             // Reuse stem_strategy - STEM_SOME here means "stem terms except
    1076                 :            :             // when used with positional operators".
    1077                 :      44452 :             stem_strategy stem_term = stem_action;
    1078         [ +  + ]:      44452 :             if (stem_term != STEM_NONE) {
    1079         [ +  + ]:       3810 :                 if (!stemmer.internal.get()) {
    1080                 :            :                     // No stemmer is set.
    1081                 :         72 :                     stem_term = STEM_NONE;
    1082         [ +  + ]:       3738 :                 } else if (stem_term == STEM_SOME) {
    1083 [ +  + ][ +  + ]:       3664 :                     if (!should_stem(unstemmed_term) ||
         [ +  + ][ +  + ]
    1084                 :            :                         (it != end && is_stem_preventer(*it))) {
    1085                 :            :                         // Don't stem this particular term.
    1086                 :       1503 :                         stem_term = STEM_NONE;
    1087                 :            :                     }
    1088                 :            :                 }
    1089                 :            :             }
    1090                 :            : 
    1091                 :            :             Term * term_obj = new Term(&state, term, prefix_info,
    1092                 :      44452 :                                        unstemmed_term, stem_term, term_pos++);
    1093                 :            : 
    1094         [ +  + ]:      44452 :             if (is_cjk_term) {
    1095                 :         23 :                 Parse(pParser, CJKTERM, term_obj, &state);
    1096         [ +  + ]:         23 :                 if (it == end) break;
    1097                 :         18 :                 continue;
    1098                 :            :             }
    1099                 :            : 
    1100 [ +  + ][ +  + ]:      44429 :             if (mode == DEFAULT || mode == IN_GROUP || mode == IN_GROUP2) {
                 [ +  + ]
    1101         [ +  + ]:      43471 :                 if (it != end) {
    1102 [ +  + ][ +  + ]:      42961 :                     if ((flags & FLAG_WILDCARD) && *it == '*') {
                 [ +  + ]
    1103                 :         93 :                         Utf8Iterator p(it);
    1104                 :         93 :                         ++p;
    1105   [ +  +  +  - ]:         93 :                         if (p == end || !is_wordchar(*p)) {
                 [ +  - ]
    1106                 :         93 :                             it = p;
    1107 [ +  + ][ +  + ]:         93 :                             if (mode == IN_GROUP || mode == IN_GROUP2) {
    1108                 :            :                                 // Drop out of IN_GROUP and flag that the group
    1109                 :            :                                 // can be empty if all members are stopwords.
    1110         [ +  + ]:         14 :                                 if (mode == IN_GROUP2)
    1111                 :          8 :                                     Parse(pParser, EMPTY_GROUP_OK, NULL, &state);
    1112                 :         14 :                                 mode = DEFAULT;
    1113                 :            :                             }
    1114                 :            :                             // Wildcard at end of term (also known as
    1115                 :            :                             // "right truncation").
    1116                 :         93 :                             Parse(pParser, WILD_TERM, term_obj, &state);
    1117                 :         93 :                             continue;
    1118                 :            :                         }
    1119                 :            :                     }
    1120                 :            :                 } else {
    1121         [ +  + ]:        510 :                     if (flags & FLAG_PARTIAL) {
    1122 [ +  + ][ -  + ]:         81 :                         if (mode == IN_GROUP || mode == IN_GROUP2) {
    1123                 :            :                             // Drop out of IN_GROUP and flag that the group
    1124                 :            :                             // can be empty if all members are stopwords.
    1125         [ -  + ]:         12 :                             if (mode == IN_GROUP2)
    1126                 :          0 :                                 Parse(pParser, EMPTY_GROUP_OK, NULL, &state);
    1127                 :         12 :                             mode = DEFAULT;
    1128                 :            :                         }
    1129                 :            :                         // Final term of a partial match query, with no
    1130                 :            :                         // following characters - treat as a wildcard.
    1131                 :         81 :                         Parse(pParser, PARTIAL_TERM, term_obj, &state);
    1132                 :         81 :                         continue;
    1133                 :            :                     }
    1134                 :            :                 }
    1135                 :            :             }
    1136                 :            : 
    1137                 :            :             // Check spelling, if we're a normal term, and any of the prefixes
    1138                 :            :             // are empty.
    1139 [ +  + ][ +  + ]:      44255 :             if ((flags & FLAG_SPELLING_CORRECTION) && !was_acronym) {
    1140                 :         99 :                 const list<string> & pfxes = prefix_info->prefixes;
    1141                 :         99 :                 list<string>::const_iterator pfx_it;
    1142         [ +  - ]:        198 :                 for (pfx_it = pfxes.begin(); pfx_it != pfxes.end(); ++pfx_it) {
    1143         [ -  + ]:         99 :                     if (!pfx_it->empty())
    1144                 :          0 :                         continue;
    1145                 :         99 :                     const string & suggest = db.get_spelling_suggestion(term);
    1146         [ +  + ]:         99 :                     if (!suggest.empty()) {
    1147         [ +  + ]:         61 :                         if (corrected_query.empty()) corrected_query = qs;
    1148                 :         61 :                         size_t term_end_index = it.raw() - qs.data();
    1149                 :         61 :                         size_t n = term_end_index - term_start_index;
    1150                 :         61 :                         size_t pos = term_start_index + correction_offset;
    1151                 :         61 :                         corrected_query.replace(pos, n, suggest);
    1152                 :         61 :                         correction_offset += suggest.size();
    1153                 :         61 :                         correction_offset -= n;
    1154                 :            :                     }
    1155                 :            :                     break;
    1156                 :            :                 }
    1157                 :            :             }
    1158                 :            : 
    1159         [ +  + ]:      44255 :             if (mode == IN_PHRASED_TERM) {
    1160                 :        341 :                 Parse(pParser, PHR_TERM, term_obj, &state);
    1161                 :            :             } else {
    1162                 :            :                 // See if the next token will be PHR_TERM - if so, this one
    1163                 :            :                 // needs to be TERM not GROUP_TERM.
    1164 [ +  + ][ +  + ]:      43914 :                 if ((mode == IN_GROUP || mode == IN_GROUP2) &&
         [ +  + ][ +  + ]
    1165                 :            :                     is_phrase_generator(*it)) {
    1166                 :            :                     // FIXME: can we clean this up?
    1167                 :        104 :                     Utf8Iterator p = it;
    1168   [ +  +  +  + ]:        108 :                     do {
                 [ +  + ]
    1169                 :        108 :                         ++p;
    1170                 :            :                     } while (p != end && is_phrase_generator(*p));
    1171                 :            :                     // Don't generate a phrase unless the phrase generators are
    1172                 :            :                     // immediately followed by another term.
    1173 [ +  + ][ +  + ]:        104 :                     if (p != end && is_wordchar(*p)) {
                 [ +  + ]
    1174                 :         48 :                         mode = DEFAULT;
    1175                 :            :                     }
    1176                 :            :                 }
    1177                 :            : 
    1178                 :      43914 :                 int token = TERM;
    1179 [ +  + ][ +  + ]:      43914 :                 if (mode == IN_GROUP || mode == IN_GROUP2) {
    1180                 :      21415 :                     mode = IN_GROUP2;
    1181                 :      21415 :                     token = GROUP_TERM;
    1182                 :            :                 }
    1183                 :      43914 :                 Parse(pParser, token, term_obj, &state);
    1184   [ +  +  +  + ]:      43914 :                 if (token == TERM && mode != DEFAULT)
    1185                 :      44255 :                     continue;
    1186      [ +  +  + ]:      44452 :             }
    1187                 :            :         }
    1188                 :            : 
    1189         [ +  + ]:      43638 :         if (it == end) break;
    1190                 :            : 
    1191         [ +  + ]:      43169 :         if (is_phrase_generator(*it)) {
    1192                 :            :             // Skip multiple phrase generators.
    1193   [ +  +  +  + ]:        488 :             do {
                 [ +  + ]
    1194                 :        488 :                 ++it;
    1195                 :            :             } while (it != end && is_phrase_generator(*it));
    1196                 :            :             // Don't generate a phrase unless the phrase generators are
    1197                 :            :             // immediately followed by another term.
    1198 [ +  + ][ +  + ]:        452 :             if (it != end && is_wordchar(*it)) {
                 [ +  + ]
    1199                 :        341 :                 mode = IN_PHRASED_TERM;
    1200                 :        341 :                 term_start_index = it.raw() - qs.data();
    1201                 :            :                 goto phrased_term;
    1202                 :            :             }
    1203 [ +  + ][ +  - ]:      42717 :         } else if (mode == DEFAULT || mode == IN_GROUP || mode == IN_GROUP2) {
                 [ +  + ]
    1204                 :      42522 :             int old_mode = mode;
    1205                 :      42522 :             mode = DEFAULT;
    1206 [ +  + ][ +  + ]:      42522 :             if (!last_was_operator_needing_term && is_whitespace(*it)) {
                 [ +  + ]
    1207                 :      41973 :                 newprev = ' ';
    1208                 :            :                 // Skip multiple whitespace.
    1209   [ +  +  +  + ]:      41988 :                 do {
                 [ +  + ]
    1210                 :      41988 :                     ++it;
    1211                 :            :                 } while (it != end && is_whitespace(*it));
    1212                 :            :                 // Don't generate a group unless the terms are only separated
    1213                 :            :                 // by whitespace.
    1214 [ +  + ][ +  + ]:      41973 :                 if (it != end && is_wordchar(*it)) {
                 [ +  + ]
    1215 [ +  - ][ +  + ]:      21585 :                     if (old_mode == IN_GROUP || old_mode == IN_GROUP2) {
    1216                 :      20850 :                         mode = IN_GROUP2;
    1217                 :            :                     } else {
    1218                 :      42828 :                         mode = IN_GROUP;
    1219                 :            :                     }
    1220                 :            :                 }
    1221                 :            :             }
    1222                 :            :         }
    1223   [ +  +  +  + ]:      44620 :     }
    1224                 :            : done:
    1225         [ +  + ]:      24980 :     if (!state.error) {
    1226                 :            :         // Implicitly close any unclosed quotes...
    1227 [ +  + ][ -  + ]:      24850 :         if (mode == IN_QUOTES || mode == IN_PREFIXED_QUOTES)
    1228                 :         32 :             Parse(pParser, QUOTE, NULL, &state);
    1229                 :      24850 :         Parse(pParser, 0, NULL, &state);
    1230                 :            :     }
    1231                 :            : 
    1232                 :      24964 :     errmsg = state.error;
    1233                 :      25028 :     return state.query;
    1234                 :            : }
    1235                 :            : 
    1236                 :            : struct ProbQuery {
    1237                 :            :     Query * query;
    1238                 :            :     Query * love;
    1239                 :            :     Query * hate;
    1240                 :            :     // filter is a map from prefix to a query for that prefix.  Queries with
    1241                 :            :     // the same prefix are combined with OR, and the results of this are
    1242                 :            :     // combined with AND to get the full filter.
    1243                 :            :     map<filter_group_id, Query> filter;
    1244                 :            : 
    1245                 :       4458 :     ProbQuery() : query(0), love(0), hate(0) { }
    1246                 :       4458 :     ~ProbQuery() {
    1247         [ +  + ]:       4458 :         delete query;
    1248         [ +  + ]:       4458 :         delete love;
    1249         [ +  + ]:       4458 :         delete hate;
    1250                 :       4458 :     }
    1251                 :            : 
    1252                 :         40 :     void add_filter(const filter_group_id & id, const Query & q) {
    1253                 :         40 :         filter[id] = q;
    1254                 :         40 :     }
    1255                 :            : 
    1256                 :         25 :     void append_filter(const filter_group_id & id, const Query & qnew) {
    1257                 :         25 :         Query & q = filter[id];
    1258                 :            :         // We OR filters with the same prefix if they're exclusive, otherwise
    1259                 :            :         // we AND them.
    1260                 :         25 :         bool exclusive = (id.prefix_info->type == BOOLEAN_EXCLUSIVE);
    1261         [ +  + ]:         25 :         Query::op op = exclusive ? Query::OP_OR : Query::OP_AND;
    1262                 :         25 :         q = Query(op, q, qnew);
    1263                 :         25 :     }
    1264                 :            : 
    1265                 :       3764 :     void add_filter_range(Xapian::valueno slot, const Query & range) {
    1266                 :       3764 :         filter[filter_group_id(slot)] = range;
    1267                 :       3764 :     }
    1268                 :            : 
    1269                 :         13 :     void append_filter_range(Xapian::valueno slot, const Query & range) {
    1270                 :         13 :         Query & q = filter[filter_group_id(slot)];
    1271                 :         13 :         q = Query(Query::OP_OR, q, range);
    1272                 :         13 :     }
    1273                 :            : 
    1274                 :       3825 :     Query merge_filters() const {
    1275                 :       3825 :         map<filter_group_id, Query>::const_iterator i = filter.begin();
    1276                 :            :         Assert(i != filter.end());
    1277                 :       3825 :         Query q = i->second;
    1278         [ +  + ]:       3835 :         while (++i != filter.end()) {
    1279                 :         10 :             q = Query(Query::OP_AND, q, i->second);
    1280                 :            :         }
    1281                 :          0 :         return q;
    1282                 :            :     }
    1283                 :            : };
    1284                 :            : 
    1285                 :            : /// A group of terms separated only by whitespace.
    1286                 :            : class TermGroup {
    1287                 :            :     vector<Term *> terms;
    1288                 :            : 
    1289                 :            :     /** Controls how to handle a group where all terms are stopwords.
    1290                 :            :      *
    1291                 :            :      *  If true, then as_group() returns NULL.  If false, then the
    1292                 :            :      *  stopword status of the terms is ignored.
    1293                 :            :      */
    1294                 :            :     bool empty_ok;
    1295                 :            : 
    1296                 :            :   public:
    1297                 :        556 :     TermGroup() : empty_ok(false) { }
    1298                 :            : 
    1299                 :            :     /// Add a Term object to this TermGroup object.
    1300                 :      21971 :     void add_term(Term * term) {
    1301                 :      21971 :         terms.push_back(term);
    1302                 :      21971 :     }
    1303                 :            : 
    1304                 :            :     /// Set the empty_ok flag.
    1305                 :          8 :     void set_empty_ok() { empty_ok = true; }
    1306                 :            : 
    1307                 :            :     /// Convert to a Xapian::Query * using default_op.
    1308                 :            :     Query * as_group(State *state) const;
    1309                 :            : 
    1310                 :            :     /** Provide a way to explicitly delete an object of this class.  The
    1311                 :            :      *  destructor is protected to prevent auto-variables of this type.
    1312                 :            :      */
    1313         [ +  - ]:          7 :     void destroy() { delete this; }
    1314                 :            : 
    1315                 :            :   protected:
    1316                 :            :     /** Protected destructor, so an auto-variable of this type is a
    1317                 :            :      *  compile-time error - you must allocate this object with new.
    1318                 :            :      */
    1319                 :        556 :     ~TermGroup() {
    1320                 :        556 :         vector<Term*>::const_iterator i;
    1321         [ +  + ]:      22527 :         for (i = terms.begin(); i != terms.end(); ++i) {
    1322         [ +  - ]:      21971 :             delete *i;
    1323                 :            :         }
    1324                 :        556 :     }
    1325                 :            : };
    1326                 :            : 
    1327                 :            : Query *
    1328                 :        556 : TermGroup::as_group(State *state) const
    1329                 :            : {
    1330                 :        556 :     const Xapian::Stopper * stopper = state->get_stopper();
    1331                 :        556 :     size_t stoplist_size = state->stoplist_size();
    1332                 :            : reprocess:
    1333                 :        558 :     Query::op default_op = state->default_op();
    1334                 :        558 :     vector<Query> subqs;
    1335                 :        558 :     subqs.reserve(terms.size());
    1336         [ +  + ]:        558 :     if (state->flags & QueryParser::FLAG_AUTO_MULTIWORD_SYNONYMS) {
    1337                 :            :         // Check for multi-word synonyms.
    1338                 :         14 :         Database db = state->get_database();
    1339                 :            : 
    1340                 :         14 :         string key;
    1341                 :         14 :         vector<Term*>::const_iterator begin = terms.begin();
    1342                 :         14 :         vector<Term*>::const_iterator i = begin;
    1343 [ -  + ][ -  + ]:      10138 :         while (i != terms.end()) {
         [ +  + ][ +  + ]
    1344                 :      10124 :             TermIterator synkey(db.synonym_keys_begin((*i)->name));
    1345                 :      10124 :             TermIterator synend(db.synonym_keys_end((*i)->name));
    1346         [ +  + ]:      10124 :             if (synkey == synend) {
    1347                 :            :                 // No multi-synonym matches.
    1348 [ -  + ][ #  # ]:       5009 :                 if (stopper && (*stopper)((*i)->name)) {
                 [ -  + ]
    1349                 :          0 :                     state->add_to_stoplist(*i);
    1350                 :            :                 } else {
    1351                 :       5009 :                     subqs.push_back((*i)->get_query_with_auto_synonyms());
    1352                 :            :                 }
    1353                 :       5009 :                 begin = ++i;
    1354                 :       5009 :                 continue;
    1355                 :            :             }
    1356                 :       5115 :             key.resize(0);
    1357         [ +  + ]:      15324 :             while (i != terms.end()) {
    1358         [ +  + ]:      15300 :                 if (!key.empty()) key += ' ';
    1359                 :      15300 :                 key += (*i)->name;
    1360                 :      15300 :                 ++i;
    1361                 :      15300 :                 synkey.skip_to(key);
    1362   [ +  +  -  + ]:      15300 :                 if (synkey == synend || !startswith(*synkey, key)) break;
         [ +  + ][ #  # ]
                 [ +  + ]
    1363                 :            :             }
    1364                 :            :             // Greedily try to match as many consecutive words as possible.
    1365                 :       5115 :             TermIterator syn, end;
    1366                 :       5281 :             while (true) {
    1367                 :      10396 :                 syn = db.synonyms_begin(key);
    1368                 :      10396 :                 end = db.synonyms_end(key);
    1369         [ +  + ]:      10396 :                 if (syn != end) break;
    1370         [ -  + ]:       5281 :                 if (--i == begin) break;
    1371                 :       5281 :                 key.resize(key.size() - (*i)->name.size() - 1);
    1372                 :            :             }
    1373         [ -  + ]:       5115 :             if (i == begin) {
    1374                 :            :                 // No multi-synonym matches.
    1375 [ #  # ][ #  # ]:          0 :                 if (stopper && (*stopper)((*i)->name)) {
                 [ #  # ]
    1376                 :          0 :                     state->add_to_stoplist(*i);
    1377                 :            :                 } else {
    1378                 :          0 :                     subqs.push_back((*i)->get_query_with_auto_synonyms());
    1379                 :            :                 }
    1380                 :          0 :                 begin = ++i;
    1381                 :          0 :                 continue;
    1382                 :            :             }
    1383                 :            : 
    1384                 :       5115 :             vector<Query> subqs2;
    1385                 :       5115 :             vector<Term*>::const_iterator j;
    1386         [ +  + ]:      15134 :             for (j = begin; j != i; ++j) {
    1387 [ -  + ][ #  # ]:      10019 :                 if (stopper && (*stopper)((*j)->name)) {
                 [ -  + ]
    1388                 :          0 :                     state->add_to_stoplist(*j);
    1389                 :            :                 } else {
    1390                 :      10019 :                     subqs2.push_back((*j)->get_query());
    1391                 :            :                 }
    1392                 :            :             }
    1393                 :       5115 :             Query q_original_terms;
    1394         [ -  + ]:       5115 :             if (is_positional(default_op)) {
    1395                 :            :                 q_original_terms = Query(default_op,
    1396                 :            :                                          subqs2.begin(), subqs2.end(),
    1397                 :          0 :                                          subqs2.size() + 9);
    1398                 :            :             } else {
    1399                 :            :                 q_original_terms = Query(default_op,
    1400                 :       5115 :                                          subqs2.begin(), subqs2.end());
    1401                 :            :             }
    1402                 :       5115 :             subqs2.clear();
    1403                 :            : 
    1404                 :            :             // Use the position of the first term for the synonyms.
    1405                 :       5115 :             Xapian::termpos pos = (*begin)->pos;
    1406                 :       5115 :             begin = i;
    1407         [ +  + ]:      10236 :             while (syn != end) {
    1408                 :       5121 :                 subqs2.push_back(Query(*syn, 1, pos));
    1409                 :       5121 :                 ++syn;
    1410                 :            :             }
    1411                 :       5115 :             Query q_synonym_terms(Query::OP_SYNONYM, subqs2.begin(), subqs2.end());
    1412                 :       5115 :             subqs2.clear();
    1413                 :            :             subqs.push_back(Query(Query::OP_SYNONYM,
    1414                 :       5115 :                                   q_original_terms, q_synonym_terms));
    1415                 :         14 :         }
    1416                 :            :     } else {
    1417                 :        544 :         vector<Term*>::const_iterator i;
    1418         [ +  + ]:       7492 :         for (i = terms.begin(); i != terms.end(); ++i) {
    1419 [ +  + ][ +  + ]:       6948 :             if (stopper && (*stopper)((*i)->name)) {
                 [ +  + ]
    1420                 :         36 :                 state->add_to_stoplist(*i);
    1421                 :            :             } else {
    1422                 :       6912 :                 subqs.push_back((*i)->get_query_with_auto_synonyms());
    1423                 :            :             }
    1424                 :            :         }
    1425                 :            :     }
    1426                 :            : 
    1427 [ +  + ][ +  + ]:        558 :     if (!empty_ok && stopper && subqs.empty() &&
         [ +  + ][ +  - ]
                 [ +  + ]
    1428                 :            :         stoplist_size < state->stoplist_size()) {
    1429                 :            :         // This group is all stopwords, so roll-back, disable stopper
    1430                 :            :         // temporarily, and reprocess this group.
    1431                 :          2 :         state->stoplist_resize(stoplist_size);
    1432                 :          2 :         stopper = NULL;
    1433                 :            :         goto reprocess;
    1434                 :            :     }
    1435                 :            : 
    1436                 :        556 :     Query * q = NULL;
    1437         [ +  + ]:        556 :     if (!subqs.empty()) {
    1438         [ +  + ]:        550 :         if (is_positional(default_op)) {
    1439                 :            :             q = new Query(default_op, subqs.begin(), subqs.end(),
    1440                 :         10 :                              subqs.size() + 9);
    1441                 :            :         } else {
    1442                 :        547 :             q = new Query(default_op, subqs.begin(), subqs.end());
    1443                 :            :         }
    1444                 :            :     }
    1445         [ +  - ]:        549 :     delete this;
    1446         [ +  + ]:        558 :     return q;
    1447                 :            : }
    1448                 :            : 
    1449                 :            : /// Some terms which form a positional sub-query.
    1450                 :            : class Terms {
    1451                 :            :     vector<Term *> terms;
    1452                 :            :     size_t window;
    1453                 :            : 
    1454                 :            :     /** Keep track of whether the terms added all have the same list of
    1455                 :            :      *  prefixes.  If so, we'll build a set of phrases, one using each prefix.
    1456                 :            :      *  This works around the limitation that a phrase cannot have multiple
    1457                 :            :      *  components which are "OR" combinations of terms, but is also probably
    1458                 :            :      *  what users expect: i.e., if a user specifies a phrase in a field, and
    1459                 :            :      *  that field maps to multiple prefixes, the user probably wants a phrase
    1460                 :            :      *  returned with all terms having one of those prefixes, rather than a
    1461                 :            :      *  phrase comprised of terms with differing prefixes.
    1462                 :            :      */
    1463                 :            :     bool uniform_prefixes;
    1464                 :            : 
    1465                 :            :     /** The list of prefixes of the terms added.
    1466                 :            :      *  This will be NULL if the terms have different prefixes.
    1467                 :            :      */
    1468                 :            :     const list<string> * prefixes;
    1469                 :            : 
    1470                 :            :     /// Convert to a query using the given operator and window size.
    1471                 :        445 :     Query * as_opwindow_query(Query::op op, Xapian::termcount w_delta) const {
    1472                 :        445 :         Query * q = NULL;
    1473                 :        445 :         size_t n_terms = terms.size();
    1474                 :        445 :         Xapian::termcount w = w_delta + terms.size();
    1475         [ +  + ]:        445 :         if (uniform_prefixes) {
    1476         [ +  - ]:        443 :             if (prefixes) {
    1477                 :        443 :                 list<string>::const_iterator piter;
    1478         [ +  + ]:        890 :                 for (piter = prefixes->begin(); piter != prefixes->end(); ++piter) {
    1479                 :        447 :                     vector<Query> subqs;
    1480                 :        447 :                     subqs.reserve(n_terms);
    1481                 :        447 :                     vector<Term *>::const_iterator titer;
    1482         [ +  + ]:       1656 :                     for (titer = terms.begin(); titer != terms.end(); ++titer) {
    1483                 :       1209 :                         Term * t = *titer;
    1484                 :       1209 :                         subqs.push_back(Query(t->make_term(*piter), 1, t->pos));
    1485                 :            :                     }
    1486                 :            :                     add_to_query(q, Query::OP_OR,
    1487                 :        447 :                                  Query(op, subqs.begin(), subqs.end(), w));
    1488                 :            :                 }
    1489                 :            :             }
    1490                 :            :         } else {
    1491                 :          2 :             vector<Query> subqs;
    1492                 :          2 :             subqs.reserve(n_terms);
    1493                 :          2 :             vector<Term *>::const_iterator titer;
    1494         [ +  + ]:          6 :             for (titer = terms.begin(); titer != terms.end(); ++titer) {
    1495                 :          4 :                 subqs.push_back((*titer)->get_query());
    1496                 :            :             }
    1497                 :          2 :             q = new Query(op, subqs.begin(), subqs.end(), w);
    1498                 :            :         }
    1499                 :            : 
    1500         [ +  - ]:        445 :         delete this;
    1501                 :        445 :         return q;
    1502                 :            :     }
    1503                 :            : 
    1504                 :            :   public:
    1505                 :        446 :     Terms() : window(0), uniform_prefixes(true), prefixes(NULL) { }
    1506                 :            : 
    1507                 :            :     /// Add an unstemmed Term object to this Terms object.
    1508                 :       1210 :     void add_positional_term(Term * term) {
    1509                 :       1210 :         const list<string> & term_prefixes = term->prefix_info->prefixes;
    1510         [ +  + ]:       1210 :         if (terms.empty()) {
    1511                 :        446 :             prefixes = &term_prefixes;
    1512 [ +  - ][ +  + ]:        764 :         } else if (uniform_prefixes && prefixes != &term_prefixes) {
    1513         [ +  - ]:          2 :             if (*prefixes != term_prefixes)  {
    1514                 :          2 :                 prefixes = NULL;
    1515                 :          2 :                 uniform_prefixes = false;
    1516                 :            :             }
    1517                 :            :         }
    1518                 :       1210 :         term->need_positions();
    1519                 :       1210 :         terms.push_back(term);
    1520                 :       1210 :     }
    1521                 :            : 
    1522                 :          4 :     void adjust_window(size_t alternative_window) {
    1523         [ +  - ]:          4 :         if (alternative_window > window) window = alternative_window;
    1524                 :          4 :     }
    1525                 :            : 
    1526                 :            :     /// Convert to a Xapian::Query * using adjacent OP_PHRASE.
    1527                 :        434 :     Query * as_phrase_query() const {
    1528                 :        434 :         return as_opwindow_query(Query::OP_PHRASE, 0);
    1529                 :            :     }
    1530                 :            : 
    1531                 :            :     /// Convert to a Xapian::Query * using OP_NEAR.
    1532                 :          8 :     Query * as_near_query() const {
    1533                 :            :         // The common meaning of 'a NEAR b' is "a within 10 terms of b", which
    1534                 :            :         // means a window size of 11.  For more than 2 terms, we just add one
    1535                 :            :         // to the window size for each extra term.
    1536                 :          8 :         size_t w = window;
    1537         [ +  + ]:          8 :         if (w == 0) w = 10;
    1538                 :          8 :         return as_opwindow_query(Query::OP_NEAR, w - 1);
    1539                 :            :     }
    1540                 :            : 
    1541                 :            :     /// Convert to a Xapian::Query * using OP_PHRASE to implement ADJ.
    1542                 :          3 :     Query * as_adj_query() const {
    1543                 :            :         // The common meaning of 'a ADJ b' is "a at most 10 terms before b",
    1544                 :            :         // which means a window size of 11.  For more than 2 terms, we just add
    1545                 :            :         // one to the window size for each extra term.
    1546                 :          3 :         size_t w = window;
    1547         [ +  + ]:          3 :         if (w == 0) w = 10;
    1548                 :          3 :         return as_opwindow_query(Query::OP_PHRASE, w - 1);
    1549                 :            :     }
    1550                 :            : 
    1551                 :            :     /** Provide a way to explicitly delete an object of this class.  The
    1552                 :            :      *  destructor is protected to prevent auto-variables of this type.
    1553                 :            :      */
    1554         [ +  - ]:          1 :     void destroy() { delete this; }
    1555                 :            : 
    1556                 :            :   protected:
    1557                 :            :     /** Protected destructor, so an auto-variable of this type is a
    1558                 :            :      *  compile-time error - you must allocate this object with new.
    1559                 :            :      */
    1560                 :        446 :     ~Terms() {
    1561                 :        446 :         vector<Term *>::const_iterator t;
    1562         [ +  + ]:       1656 :         for (t = terms.begin(); t != terms.end(); ++t) {
    1563         [ +  - ]:       1210 :             delete *t;
    1564                 :            :         }
    1565                 :        446 :     }
    1566                 :            : };
    1567                 :            : 
    1568                 :            : // Helper macro for converting a boolean operation into a Xapian::Query.
    1569                 :            : #define BOOL_OP_TO_QUERY(E, A, OP, B, OP_TXT) \
    1570                 :            :     do {\
    1571                 :            :         if (!A || !B) {\
    1572                 :            :             state->error = "Syntax: <expression> "OP_TXT" <expression>";\
    1573                 :            :             yy_parse_failed(yypParser);\
    1574                 :            :             return;\
    1575                 :            :         }\
    1576                 :            :         E = new Query(OP, *A, *B);\
    1577                 :            :         delete A;\
    1578                 :            :         delete B;\
    1579                 :            :     } while (0)
    1580                 :            : 
    1581                 :            : }
    1582                 :            : 
// Every token handed to the parser carries a Term * as its value.
%token_type {Term *}
// Default clean-up for a token's Term when the parser discards the token
// (see the Lemon documentation for exactly when destructors are run).
%token_destructor {delete $$;}

// Parse() takes an extra "State * state" argument, which is what the
// actions and handlers below refer to as `state`.
%extra_argument {State * state}

%parse_failure {
    // If we've not already set an error message, set a default one.
    if (!state->error) state->error = "parse error";
}

// Treat any syntax error as a failed parse, so the %parse_failure handler
// above supplies the error message.
%syntax_error {
    yy_parse_failed(yypParser);
}

// Operators, grouped in order of increasing precedence:
// (ERROR is only used as the precedence of the empty bool_arg rule.)
%nonassoc ERROR.
%left OR.
%left XOR.
%left AND NOT.
%left NEAR ADJ.
%left LOVE HATE HATE_AFTER_AND SYNONYM.
    1604                 :            : 
// Destructors for terminal symbols.  Each one deletes the Term object
// carried by the token.

// TERM is a query term, including prefix (if any).
%destructor TERM {delete $$;}

// GROUP_TERM is a query term which follows a TERM or another GROUP_TERM and
// is only separated by whitespace characters.
%destructor GROUP_TERM {delete $$;}

// PHR_TERM is a query term which follows a TERM or another PHR_TERM and is
// separated only by one or more phrase generator characters (hyphen and
// apostrophe are common examples - see is_phrase_generator() for the list
// of all punctuation which does this).
%destructor PHR_TERM {delete $$;}

// WILD_TERM is like a TERM, but has a trailing wildcard which needs to be
// expanded.
%destructor WILD_TERM {delete $$;}

// PARTIAL_TERM is like a TERM, but it's at the end of the query string and
// we're doing "search as you type".  It expands to something like WILD_TERM
// OR stemmed_form.
%destructor PARTIAL_TERM {delete $$;}

// BOOLEAN_FILTER is a query term with a prefix registered using
// add_bool_prefix().  It's added to the query using an OP_FILTER operator
// (or OP_AND_NOT if it's negated), e.g. site:xapian.org or -site:xapian.org
%destructor BOOLEAN_FILTER {delete $$;}
    1633                 :            : 
// Grammar rules:

// query - The whole query - just an expr or nothing.

// query non-terminal doesn't need a type, so just give a dummy one.
%type query {int}

// A query consisting of an expression: copy the result into state->query
// and free the heap-allocated Query built by the expr rules.
query ::= expr(E). {
    // Save the parsed query in the State structure so we can return it.
    if (E) {
        state->query = *E;
        delete E;
    } else {
        // A NULL expr yields an empty Query.
        state->query = Query();
    }
}

query ::= . {
    // Handle a query string with no terms in.
    state->query = Query();
}
    1655                 :            : 
// expr - A query expression.
//
// The value is a heap-allocated Query (or NULL); the destructor deletes it.
// The boolean operator rules all go through BOOL_OP_TO_QUERY, which reports
// a syntax error if either operand is NULL and otherwise deletes both
// operands after combining them.

%type expr {Query *}
%destructor expr {delete $$;}

// A prob_expr on its own is passed through unchanged.
expr(E) ::= prob_expr(P).
        { E = P; }

expr(E) ::= bool_arg(A) AND bool_arg(B).
        { BOOL_OP_TO_QUERY(E, A, Query::OP_AND, B, "AND"); }

expr(E) ::= bool_arg(A) NOT bool_arg(B). {
    // 'NOT foo' -> '<alldocuments> NOT foo'
    // (only when FLAG_PURE_NOT is set; otherwise a missing left operand is
    // reported as a syntax error by BOOL_OP_TO_QUERY).
    if (!A && (state->flags & QueryParser::FLAG_PURE_NOT)) {
        A = new Query("", 1, 0);
    }
    BOOL_OP_TO_QUERY(E, A, Query::OP_AND_NOT, B, "NOT");
}

// 'a AND NOT b' is handled as OP_AND_NOT, using the precedence of NOT.
expr(E) ::= bool_arg(A) AND NOT bool_arg(B). [NOT]
        { BOOL_OP_TO_QUERY(E, A, Query::OP_AND_NOT, B, "AND NOT"); }

// AND followed by a HATE_AFTER_AND token is also handled as OP_AND_NOT,
// using the precedence of AND; errors are reported as for plain AND.
expr(E) ::= bool_arg(A) AND HATE_AFTER_AND bool_arg(B). [AND]
        { BOOL_OP_TO_QUERY(E, A, Query::OP_AND_NOT, B, "AND"); }

expr(E) ::= bool_arg(A) OR bool_arg(B).
        { BOOL_OP_TO_QUERY(E, A, Query::OP_OR, B, "OR"); }

expr(E) ::= bool_arg(A) XOR bool_arg(B).
        { BOOL_OP_TO_QUERY(E, A, Query::OP_XOR, B, "XOR"); }
    1686                 :            : 
    1687                 :            : // bool_arg - an argument to a boolean operator such as AND or OR.
    1688                 :            : 
    1689                 :            : %type bool_arg {Query *}
    1690                 :            : %destructor bool_arg {delete $$;}
    1691                 :            : 
    1692                 :            : bool_arg(A) ::= expr(E). { A = E; }
    1693                 :            : 
    1694                 :            : bool_arg(A) ::= . [ERROR] {
    1695                 :            :     // Set the argument to NULL, which enables the bool_arg-using rules in
    1696                 :            :     // expr above to report uses of AND, OR, etc which don't have two
    1697                 :            :     // arguments.
    1698                 :         42 :     A = NULL;
    1699                 :            : }
    1700                 :            : 
    1701                 :            : // prob_expr - a single compound term, or a prob.
    1702                 :            : 
    1703                 :            : %type prob_expr {Query *}
    1704                 :            : %destructor prob_expr {delete $$;}
    1705                 :            : 
    1706                 :            : prob_expr(E) ::= prob(P). {
    1707                 :       4439 :     E = P->query;
    1708                 :       4439 :     P->query = NULL;
    1709                 :            :     // Handle any "+ terms".
    1710         [ +  + ]:       4439 :     if (P->love) {
    1711         [ +  + ]:         40 :         if (P->love->empty()) {
    1712                 :            :             // +<nothing>.
    1713         [ +  + ]:         21 :             delete E;
    1714                 :         21 :             E = P->love;
    1715         [ +  + ]:         19 :         } else if (E) {
    1716                 :         16 :             swap(E, P->love);
    1717                 :         16 :             add_to_query(E, Query::OP_AND_MAYBE, P->love);
    1718                 :            :         } else {
    1719                 :          3 :             E = P->love;
    1720                 :            :         }
    1721                 :         40 :         P->love = NULL;
    1722                 :            :     }
    1723                 :            :     // Handle any boolean filters.
    1724         [ +  + ]:       4439 :     if (!P->filter.empty()) {
    1725         [ +  + ]:       3825 :         if (E) {
    1726                 :         25 :             add_to_query(E, Query::OP_FILTER, P->merge_filters());
    1727                 :            :         } else {
    1728                 :            :             // Make the query a boolean one.
    1729                 :       3800 :             E = new Query(Query::OP_SCALE_WEIGHT, P->merge_filters(), 0.0);
    1730                 :            :         }
    1731                 :            :     }
    1732                 :            :     // Handle any "- terms".
    1733 [ +  + ][ +  + ]:       4439 :     if (P->hate && !P->hate->empty()) {
                 [ +  + ]
    1734         [ +  + ]:         38 :         if (!E) {
    1735                 :            :             // Can't just hate!
    1736                 :          4 :             yy_parse_failed(yypParser);
    1737                 :          4 :             return;
    1738                 :            :         }
    1739                 :         34 :         *E = Query(Query::OP_AND_NOT, *E, *P->hate);
    1740                 :            :     }
    1741         [ +  - ]:       4435 :     delete P;
    1742                 :            : }
    1743                 :            : 
    1744                 :            : prob_expr(E) ::= term(T). {
    1745                 :      22143 :     E = T;
    1746                 :            : }
    1747                 :            : 
    1748                 :            : // prob - a probabilistic sub-expression consisting of stop_terms, "+" terms,
    1749                 :            : // "-" terms, boolean filters, and/or value ranges.
    1750                 :            : //
    1751                 :            : // Note: stop_term can also be several other things other than a simple term!
    1752                 :            : 
    1753                 :            : %type prob {ProbQuery *}
    1754         [ +  - ]:         23 : %destructor prob {delete $$;}
    1755                 :            : 
    1756                 :            : prob(P) ::= RANGE(R). {
    1757                 :       3764 :     valueno slot = R->pos;
    1758                 :       3764 :     const Query & range = R->as_value_range_query();
    1759                 :       3764 :     P = new ProbQuery;
    1760                 :       3764 :     P->add_filter_range(slot, range);
    1761                 :            : }
    1762                 :            : 
    1763                 :            : prob(P) ::= stop_prob(Q) RANGE(R). {
    1764                 :         13 :     valueno slot = R->pos;
    1765                 :         13 :     const Query & range = R->as_value_range_query();
    1766                 :         13 :     P = Q;
    1767                 :         13 :     P->append_filter_range(slot, range);
    1768                 :            : }
    1769                 :            : 
    1770                 :            : prob(P) ::= stop_term(T) stop_term(U). {
    1771                 :        539 :     P = new ProbQuery;
    1772                 :        539 :     P->query = T;
    1773         [ +  - ]:        539 :     if (U) {
    1774                 :        539 :         Query::op op = state->default_op();
    1775   [ +  +  +  + ]:        539 :         if (P->query && is_positional(op)) {
                 [ +  + ]
    1776                 :            :             // If default_op is OP_NEAR or OP_PHRASE, set the window size to
    1777                 :            :             // 11 for the first pair of terms and it will automatically grow
    1778                 :            :             // by one for each subsequent term.
    1779                 :          1 :             Query * subqs[2] = { P->query, U };
    1780                 :          1 :             *(P->query) = Query(op, subqs, subqs + 2, 11);
    1781         [ +  - ]:          1 :             delete U;
    1782                 :            :         } else {
    1783                 :        538 :             add_to_query(P->query, op, U);
    1784                 :            :         }
    1785                 :            :     }
    1786                 :            : }
    1787                 :            : 
    1788                 :            : prob(P) ::= prob(Q) stop_term(T). {
    1789                 :        404 :     P = Q;
    1790                 :            :     // If T is a stopword, there's nothing to do here.
    1791         [ +  - ]:        404 :     if (T) add_to_query(P->query, state->default_op(), T);
    1792                 :            : }
    1793                 :            : 
    1794                 :            : prob(P) ::= LOVE term(T). {
    1795                 :         31 :     P = new ProbQuery;
    1796         [ +  + ]:         31 :     if (state->default_op() == Query::OP_AND) {
    1797                 :          1 :         P->query = T;
    1798                 :            :     } else {
    1799                 :         30 :         P->love = T;
    1800                 :            :     }
    1801                 :         31 : }
    1802                 :            : 
    1803                 :            : prob(P) ::= stop_prob(Q) LOVE term(T). {
    1804                 :         19 :     P = Q;
    1805         [ +  + ]:         19 :     if (state->default_op() == Query::OP_AND) {
    1806                 :            :         /* The default op is AND, so we just put loved terms into the query
    1807                 :            :          * (in this case the only effect of love is to ignore the stopword
    1808                 :            :          * list). */
    1809                 :          2 :         add_to_query(P->query, Query::OP_AND, T);
    1810                 :            :     } else {
    1811                 :         17 :         add_to_query(P->love, Query::OP_AND, T);
    1812                 :            :     }
    1813                 :         19 : }
    1814                 :            : 
    1815                 :            : prob(P) ::= HATE term(T). {
    1816                 :          9 :     P = new ProbQuery;
    1817                 :          9 :     P->hate = T;
    1818                 :          9 : }
    1819                 :            : 
    1820                 :            : prob(P) ::= stop_prob(Q) HATE term(T). {
    1821                 :         30 :     P = Q;
    1822                 :         30 :     add_to_query(P->hate, Query::OP_OR, T);
    1823                 :         30 : }
    1824                 :            : 
    1825                 :            : prob(P) ::= HATE BOOLEAN_FILTER(T). {
    1826                 :          1 :     P = new ProbQuery;
    1827                 :          1 :     P->hate = new Query(T->get_query());
    1828         [ +  - ]:          1 :     delete T;
    1829                 :          1 : }
    1830                 :            : 
    1831                 :            : prob(P) ::= stop_prob(Q) HATE BOOLEAN_FILTER(T). {
    1832                 :          4 :     P = Q;
    1833                 :          4 :     add_to_query(P->hate, Query::OP_OR, T->get_query());
    1834         [ +  - ]:          4 :     delete T;
    1835                 :          4 : }
    1836                 :            : 
    1837                 :            : prob(P) ::= BOOLEAN_FILTER(T). {
    1838                 :         40 :     P = new ProbQuery;
    1839                 :         40 :     P->add_filter(T->get_filter_group_id(), T->get_query());
    1840         [ +  - ]:         40 :     delete T;
    1841                 :            : }
    1842                 :            : 
    1843                 :            : prob(P) ::= stop_prob(Q) BOOLEAN_FILTER(T). {
    1844                 :         25 :     P = Q;
    1845                 :         25 :     P->append_filter(T->get_filter_group_id(), T->get_query());
    1846         [ +  - ]:         25 :     delete T;
    1847                 :            : }
    1848                 :            : 
    1849                 :            : prob(P) ::= LOVE BOOLEAN_FILTER(T). {
    1850                 :            :     // LOVE BOOLEAN_FILTER(T) is just the same as BOOLEAN_FILTER
    1851                 :          1 :     P = new ProbQuery;
    1852                 :          1 :     P->filter[T->get_filter_group_id()] = T->get_query();
    1853         [ +  - ]:          1 :     delete T;
    1854                 :          1 : }
    1855                 :            : 
    1856                 :            : prob(P) ::= stop_prob(Q) LOVE BOOLEAN_FILTER(T). {
    1857                 :            :     // LOVE BOOLEAN_FILTER(T) is just the same as BOOLEAN_FILTER
    1858                 :          3 :     P = Q;
    1859                 :            :     // We OR filters with the same prefix...
    1860                 :          3 :     Query & q = P->filter[T->get_filter_group_id()];
    1861                 :          3 :     q = Query(Query::OP_OR, q, T->get_query());
    1862         [ +  - ]:          3 :     delete T;
    1863                 :          3 : }
    1864                 :            : 
    1865                 :            : // stop_prob - A prob or a stop_term.
    1866                 :            : 
    1867                 :            : %type stop_prob {ProbQuery *}
    1868                 :            : %destructor stop_prob {delete $$;}
    1869                 :            : 
    1870                 :            : stop_prob(P) ::= prob(Q).
    1871                 :         34 :     { P = Q; }
    1872                 :            : 
    1873                 :            : stop_prob(P) ::= stop_term(T). {
    1874                 :         73 :     P = new ProbQuery;
    1875                 :         73 :     P->query = T;
    1876                 :            : }
    1877                 :            : 
    1878                 :            : // stop_term - A term which should be checked against the stopword list,
    1879                 :            : // or a compound_term.
    1880                 :            : //
    1881                 :            : // If a term is loved, hated, or in a phrase, we don't want to consult the
    1882                 :            : // stopword list, so stop_term isn't used there (instead term is).
    1883                 :            : 
    1884                 :            : %type stop_term {Query *}
    1885                 :            : %destructor stop_term {delete $$;}
    1886                 :            : 
    1887                 :            : stop_term(T) ::= TERM(U). {
    1888         [ +  + ]:        720 :     if (state->is_stopword(U)) {
    1889                 :          2 :         T = NULL;
    1890                 :          2 :         state->add_to_stoplist(U);
    1891                 :            :     } else {
    1892                 :        718 :         T = new Query(U->get_query_with_auto_synonyms());
    1893                 :            :     }
    1894         [ +  - ]:        720 :     delete U;
    1895                 :            : }
    1896                 :            : 
    1897                 :            : stop_term(T) ::= compound_term(U). {
    1898                 :            :     T = U;
    1899                 :            : }
    1900                 :            : 
    1901                 :            : // term - A term or a compound_term.
    1902                 :            : 
    1903                 :            : %type term {Query *}
    1904                 :            : %destructor term {delete $$;}
    1905                 :            : 
    1906                 :            : term(T) ::= TERM(U). {
    1907                 :      20323 :     T = new Query(U->get_query_with_auto_synonyms());
    1908         [ +  - ]:      20323 :     delete U;
    1909                 :            : }
    1910                 :            : 
    1911                 :            : term(T) ::= compound_term(U). {
    1912                 :            :     T = U;
    1913                 :            : }
    1914                 :            : 
    1915                 :            : // compound_term - A WILD_TERM, a quoted phrase (with or without prefix), a
    1916                 :            : // phrased_term, group, near_expr, adj_expr, or a bracketed subexpression (with
    1917                 :            : // or without prefix).
    1918                 :            : 
    1919                 :            : %type compound_term {Query *}
    1920                 :            : %destructor compound_term {delete $$;}
    1921                 :            : 
    1922                 :            : compound_term(T) ::= WILD_TERM(U).
    1923                 :         93 :         { T = U->as_wildcarded_query(state); }
    1924                 :            : 
    1925                 :            : compound_term(T) ::= PARTIAL_TERM(U).
    1926                 :         81 :         { T = U->as_partial_query(state); }
    1927                 :            : 
    1928                 :            : compound_term(T) ::= QUOTE phrase(P) QUOTE.
    1929                 :        187 :         { T = P->as_phrase_query(); }
    1930                 :        187 : 
    1931                 :            : compound_term(T) ::= phrased_term(P).
    1932                 :        247 :         { T = P->as_phrase_query(); }
    1933                 :            : 
    1934                 :            : compound_term(T) ::= group(P).
    1935                 :        556 :         { T = P->as_group(state); }
    1936                 :            : 
    1937                 :            : compound_term(T) ::= near_expr(P).
    1938                 :          8 :         { T = P->as_near_query(); }
    1939                 :            : 
    1940                 :            : compound_term(T) ::= adj_expr(P).
    1941                 :          3 :         { T = P->as_adj_query(); }
    1942                 :            : 
    1943                 :            : compound_term(T) ::= BRA expr(E) KET.
    1944                 :        187 :         { T = E; }
    1945                 :        187 : 
    1946                 :            : compound_term(T) ::= SYNONYM TERM(U). {
    1947                 :         20 :     T = new Query(U->get_query_with_synonyms());
    1948         [ +  - ]:         20 :     delete U;
    1949                 :         20 : }
    1950                 :            : 
    1951                 :            : compound_term(T) ::= CJKTERM(U). {
    1952                 :         22 :     { T = U->as_cjk_query(); }
    1953                 :            : }
    1954                 :            : 
    1955                 :            : // phrase - The "inside the quotes" part of a double-quoted phrase.
    1956                 :            : 
    1957                 :            : %type phrase {Terms *}
    1958                 :            : 
    1959                 :          1 : %destructor phrase {$$->destroy();}
    1960                 :            : 
    1961                 :            : phrase(P) ::= TERM(T). {
    1962                 :        187 :     P = new Terms;
    1963                 :        187 :     P->add_positional_term(T);
    1964                 :            : }
    1965                 :            : 
    1966                 :            : phrase(P) ::= phrase(Q) TERM(T). {
    1967                 :        502 :     P = Q;
    1968                 :        502 :     P->add_positional_term(T);
    1969                 :            : }
    1970                 :            : 
    1971                 :            : // phrased_term - A phrased term works like a single term, but is actually
    1972                 :            : // 2 or more terms linked together into a phrase by punctuation.  There must be
    1973                 :            : // at least 2 terms in order to be able to have punctuation between the terms!
    1974                 :            : 
    1975                 :            : %type phrased_term {Terms *}
    1976                 :            : %destructor phrased_term {$$->destroy();}
    1977                 :            : 
    1978                 :            : phrased_term(P) ::= TERM(T) PHR_TERM(U). {
    1979                 :        248 :     P = new Terms;
    1980                 :        248 :     P->add_positional_term(T);
    1981                 :        248 :     P->add_positional_term(U);
    1982                 :            : }
    1983                 :            : 
    1984                 :            : phrased_term(P) ::= phrased_term(Q) PHR_TERM(T). {
    1985                 :            :     P = Q;
    1986                 :            :     P->add_positional_term(T);
    1987                 :            : }
    1988                 :            : 
    1989                 :            : // group - A group of terms separated only by whitespace - candidates for
    1990                 :            : // multi-term synonyms.
    1991                 :            : 
    1992                 :            : %type group {TermGroup *}
    1993                 :          7 : %destructor group {$$->destroy();}
    1994                 :            : 
    1995                 :            : group(P) ::= TERM(T) GROUP_TERM(U). {
    1996                 :        556 :     P = new TermGroup;
    1997                 :        556 :     P->add_term(T);
    1998                 :        556 :     P->add_term(U);
    1999                 :            : }
    2000                 :            : 
    2001                 :            : group(P) ::= group(Q) GROUP_TERM(T). {
    2002                 :      20859 :     P = Q;
    2003                 :      20859 :     P->add_term(T);
    2004                 :            : }
    2005                 :            : 
    2006                 :            : group(P) ::= group(Q) EMPTY_GROUP_OK. {
    2007                 :          8 :     P = Q;
    2008                 :          8 :     P->set_empty_ok();
    2009                 :          8 : }
    2010                 :            : 
    2011                 :            : // near_expr - 2 or more terms with NEAR in between.  There must be at least 2
    2012                 :            : // terms in order for there to be any NEAR operators!
    2013                 :            : 
    2014                 :            : %type near_expr {Terms *}
    2015                 :            : %destructor near_expr {$$->destroy();}
    2016                 :            : 
    2017                 :            : near_expr(P) ::= TERM(T) NEAR(N) TERM(U). {
    2018                 :         11 :     P = new Terms;
    2019                 :         11 :     P->add_positional_term(T);
    2020                 :         11 :     P->add_positional_term(U);
    2021         [ +  + ]:         11 :     if (N) {
    2022                 :          4 :         P->adjust_window(N->get_termpos());
    2023         [ +  - ]:          4 :         delete N;
    2024                 :            :     }
    2025                 :            : }
    2026                 :            : 
    2027                 :            : near_expr(P) ::= near_expr(Q) NEAR(N) TERM(T). {
    2028                 :          3 :     P = Q;
    2029                 :          3 :     P->add_positional_term(T);
    2030         [ -  + ]:          3 :     if (N) {
    2031                 :          0 :         P->adjust_window(N->get_termpos());
    2032         [ #  # ]:          0 :         delete N;
    2033                 :            :     }
    2034                 :            : }
    2035                 :            : 
    2036                 :            : // adj_expr - 2 or more terms with ADJ in between.  There must be at least 2
    2037                 :            : // terms in order for there to be any ADJ operators!
    2038                 :            : 
    2039                 :            : %type adj_expr {Terms *}
    2040                 :            : %destructor adj_expr {$$->destroy();}
    2041                 :            : 
    2042                 :            : adj_expr(P) ::= TERM(T) ADJ(N) TERM(U). {
    2043                 :            :     P = new Terms;
    2044                 :            :     P->add_positional_term(T);
    2045                 :            :     P->add_positional_term(U);
    2046                 :            :     if (N) {
    2047                 :            :         P->adjust_window(N->get_termpos());
    2048                 :            :         delete N;
    2049                 :            :     }
    2050                 :            : }
    2051                 :            : 
    2052                 :            : adj_expr(P) ::= adj_expr(Q) ADJ(N) TERM(T). {
    2053                 :            :     P = Q;
    2054                 :            :     P->add_positional_term(T);
    2055                 :            :     if (N) {
    2056                 :            :         P->adjust_window(N->get_termpos());
    2057                 :            :         delete N;
    2058                 :            :     }
    2059                 :            : }
    2060                 :            : 
    2061                 :            : // Select yacc syntax highlighting in vim editor: vim: syntax=yacc
    2062                 :            : // (lemon syntax colouring isn't supplied by default; yacc does an OK job).

Generated by: LCOV version 1.8