Main Page   Namespace List   Class Hierarchy   Alphabetical List   Compound List   File List   Namespace Members   Compound Members   File Members  

include/xapian/enquire.h

Go to the documentation of this file.
00001 
00004 /* ----START-LICENCE----
00005  * Copyright 1999,2000,2001 BrightStation PLC
00006  * Copyright 2001,2002 Ananova Ltd
00007  * Copyright 2002,2003 Olly Betts
00008  *
00009  * This program is free software; you can redistribute it and/or
00010  * modify it under the terms of the GNU General Public License as
00011  * published by the Free Software Foundation; either version 2 of the
00012  * License, or (at your option) any later version.
00013  *
00014  * This program is distributed in the hope that it will be useful,
00015  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00016  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00017  * GNU General Public License for more details.
00018  *
00019  * You should have received a copy of the GNU General Public License
00020  * along with this program; if not, write to the Free Software
00021  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
00022  * USA
00023  * -----END-LICENCE-----
00024  */
00025 
00026 #ifndef XAPIAN_INCLUDED_ENQUIRE_H
00027 #define XAPIAN_INCLUDED_ENQUIRE_H
00028 
00029 #include <string>
00030 #include <time.h> // for time_t
00031 
00032 #include <xapian/base.h>
00033 #include <xapian/types.h>
00034 
00035 namespace Xapian {
00036 
00037 class Database;
00038 class Document;
00039 class ErrorHandler;
00040 class MSetIterator;
00041 class Query;
00042 class TermIterator;
00043 class Weight;
00044 
00048 class MSet {
00049     public:
00050         class Internal;
00052         Xapian::Internal::RefCntPtr<Internal> internal;
00053 
00054     public:
00055         // FIXME: public for now, private would be better
00057         MSet(MSet::Internal * internal_);
00058 
00060         MSet();
00061 
00063         ~MSet();
00064 
00066         MSet(const MSet & other);
00067 
00069         void operator=(const MSet &other);
00070 
00086         void fetch(const MSetIterator &begin, const MSetIterator &end) const;
00087 
00090         void fetch(const MSetIterator &item) const;
00091 
00094         void fetch() const;
00095 
00100         Xapian::percent convert_to_percent(Xapian::weight wt) const;
00101 
00103         Xapian::percent convert_to_percent(const MSetIterator &it) const;
00104 
00112         Xapian::doccount get_termfreq(const std::string &tname) const;
00113 
00121         Xapian::weight get_termweight(const std::string &tname) const;
00122 
00130         Xapian::doccount get_firstitem() const;
00131 
00138         Xapian::doccount get_matches_lower_bound() const;
00139 
00149         Xapian::doccount get_matches_estimated() const;
00150 
00157         Xapian::doccount get_matches_upper_bound() const;
00158 
00164         Xapian::weight get_max_possible() const;
00165 
00179         Xapian::weight get_max_attained() const;
00180 
00181         Xapian::doccount size() const;
00182 
00183         Xapian::doccount max_size() const;
00184 
00185         bool empty() const;
00186 
00187         void swap(MSet & other);
00188 
00189         MSetIterator begin() const;
00190 
00191         MSetIterator end() const;
00192 
00193         MSetIterator back() const;
00194         
00204         MSetIterator operator[](Xapian::doccount i) const;
00205 
00207 
00208         typedef MSetIterator value_type; // FIXME: not assignable...
00209         typedef MSetIterator iterator;
00210         typedef MSetIterator const_iterator;
00211         typedef MSetIterator & reference; // Hmm
00212         typedef MSetIterator & const_reference;
00213         typedef MSetIterator * pointer; // Hmm
00214         typedef Xapian::doccount_diff difference_type;
00215         typedef Xapian::doccount size_type;
00217         
00221         std::string get_description() const;
00222 };
00223 
00227 class MSetIterator {
00228     private:
00229         friend class MSet;
00230         friend bool operator==(const MSetIterator &a, const MSetIterator &b);
00231         friend bool operator!=(const MSetIterator &a, const MSetIterator &b);
00232 
00233         MSetIterator(Xapian::doccount index_, const MSet & mset_)
00234             : index(index_), mset(mset_) { }
00235 
00236         Xapian::doccount index;
00237         MSet mset;
00238 
00239     public:
00243         MSetIterator() : index(0), mset() { }
00244 
00245         ~MSetIterator() { }
00246 
00248         MSetIterator(const MSetIterator &other) {
00249             index = other.index;
00250             mset = other.mset;
00251         }
00252 
00254         void operator=(const MSetIterator &other) {
00255             index = other.index;
00256             mset = other.mset;
00257         }
00258 
00260         MSetIterator & operator++() {
00261             ++index;
00262             return *this;
00263         }
00264 
00265         void operator++(int) {
00266             ++index;
00267         }
00268 
00270         Xapian::docid operator*() const;
00271 
00290         Xapian::Document get_document() const;
00291 
00298         Xapian::doccount get_rank() const {
00299             return mset.get_firstitem() + index;
00300         }
00301 
00303         Xapian::weight get_weight() const;
00304 
00321         Xapian::doccount get_collapse_count() const;
00322 
00328         Xapian::percent get_percent() const;
00329 
00333         std::string get_description() const;
00334 
00336 
00337         typedef std::input_iterator_tag iterator_category; // FIXME: better than input_iterator!
00338         typedef Xapian::docid value_type;
00339         typedef Xapian::doccount_diff difference_type;
00340         typedef Xapian::docid * pointer;
00341         typedef Xapian::docid & reference;
00343 };
00344 
00345 inline bool operator==(const MSetIterator &a, const MSetIterator &b)
00346 {
00347     return (a.index == b.index);
00348 }
00349 
00350 inline bool operator!=(const MSetIterator &a, const MSetIterator &b)
00351 {
00352     return (a.index != b.index);
00353 }
00354 
00355 class ESetIterator;
00356 
00361 class ESet {
00362     public:
00363         class Internal;
00365         Internal *internal;
00366 
00368         ESet();
00369 
00371         ~ESet();
00372 
00374         ESet(const ESet & other);
00375 
00377         void operator=(const ESet &other);
00378 
00383         Xapian::termcount get_ebound() const;
00384 
00386         Xapian::termcount size() const;
00387 
00389         bool empty() const;
00390 
00392         ESetIterator begin() const;
00393 
00395         ESetIterator end() const;
00396 
00401         std::string get_description() const;
00402 };
00403 
00405 class ESetIterator {
00406     private:
00407         friend class ESet;
00408         friend bool operator==(const ESetIterator &a, const ESetIterator &b);
00409         friend bool operator!=(const ESetIterator &a, const ESetIterator &b);
00410 
00411         ESetIterator(Xapian::termcount index_, const ESet & eset_)
00412             : index(index_), eset(eset_) { }
00413 
00414         Xapian::termcount index;
00415         ESet eset;
00416 
00417     public:
00421         ESetIterator() : index(0), eset() { }
00422 
00423         ~ESetIterator() { }
00424 
00426         ESetIterator(const ESetIterator &other) {
00427             index = other.index;
00428             eset = other.eset;
00429         }
00430 
00432         void operator=(const ESetIterator &other) {
00433             index = other.index;
00434             eset = other.eset;
00435         }
00436 
00438         ESetIterator & operator++() {
00439             ++index;
00440             return *this;
00441         }
00442 
00443         void operator++(int) {
00444             ++index;
00445         }
00446 
00448         const std::string & operator *() const;
00449 
00451         Xapian::weight get_weight() const;
00452 
00456         std::string get_description() const;
00457 
00459 
00460         typedef std::input_iterator_tag iterator_category; // FIXME: better than input_iterator!
00461         typedef std::string value_type;
00462         typedef Xapian::termcount_diff difference_type;
00463         typedef std::string * pointer;
00464         typedef std::string & reference;
00466 };
00467 
00468 inline bool operator==(const ESetIterator &a, const ESetIterator &b)
00469 {
00470     return (a.index == b.index);
00471 }
00472 
00473 inline bool operator!=(const ESetIterator &a, const ESetIterator &b)
00474 {
00475     return (a.index != b.index);
00476 }
00477 
00482 class RSet {
00483     public:
00485         class Internal;
00486 
00488         Internal *internal;
00489 
00491         RSet(const RSet &rset);
00492 
00494         void operator=(const RSet &rset);
00495 
00497         RSet();
00498 
00500         ~RSet();
00501 
00503         Xapian::doccount size() const;
00504 
00506         bool empty() const;
00507 
00509         void add_document(Xapian::docid did);
00510         
00512         void add_document(const Xapian::MSetIterator & i) { add_document(*i); }
00513 
00515         void remove_document(Xapian::docid did);
00516 
00518         void remove_document(const Xapian::MSetIterator & i) { remove_document(*i); }
00519 
00521         bool contains(Xapian::docid did) const;
00522 
00524         bool contains(const Xapian::MSetIterator & i) { return contains(*i); }
00525 
00530         std::string get_description() const;
00531 };
00532 
00535 class MatchDecider {
00536     public:
00539         virtual int operator()(const Xapian::Document &doc) const = 0;
00540 
00542         virtual ~MatchDecider() {}
00543 };
00544 
/**
 * Abstract functor interface taking a term name.
 *
 * Enquire::get_eset() accepts a pointer to one of these.
 */
class ExpandDecider {
public:
    /// Called with a term name; returns an int.
    /// NOTE(review): presumably non-zero means "accept this term" -
    /// confirm against the caller.
    virtual int operator()(const std::string& tname) const = 0;

    /// Virtual so that subclasses are destroyed correctly through an
    /// ExpandDecider pointer.
    virtual ~ExpandDecider() { }
};
00556 
00570 class Enquire {
00571     private:
00573         Enquire(const Enquire &);
00574 
00576         void operator=(const Enquire &);
00577 
00578     public:
00579         class Internal;
00581         Xapian::Internal::RefCntPtr<Internal> internal;
00582 
00598         Enquire(const Database &databases, ErrorHandler * errorhandler_ = 0);
00599 
00602         ~Enquire();
00603 
00611         void set_query(const Xapian::Query & query_);
00612 
00619         const Xapian::Query & get_query();
00620 
00627         void set_weighting_scheme(const Weight &weight_);
00628 
00655         void set_collapse_key(Xapian::valueno collapse_key);
00656 
00663         void set_sort_forward(bool sort_forward);
00664 
00682         void set_cutoff(Xapian::percent percent_cutoff, Xapian::weight weight_cutoff = 0);
00683 
00697         void set_sorting(Xapian::valueno sort_key, int sort_bands);
00698 
00710         void set_bias(Xapian::weight bias_weight, time_t bias_halflife);
00711 
00732         MSet get_mset(Xapian::doccount first, Xapian::doccount maxitems,
00733                       const RSet * omrset = 0,
00734                       const MatchDecider * mdecider = 0) const;
00735 
00736         static const int include_query_terms = 1;
00737         static const int use_exact_termfreq = 2;
00761         ESet get_eset(Xapian::termcount maxitems,
00762                         const RSet & omrset,
00763                         int flags = 0,
00764                         double k = 1.0,
00765                         const Xapian::ExpandDecider * edecider = 0) const;
00766 
00781         inline ESet get_eset(Xapian::termcount maxitems, const RSet & omrset,
00782                                const Xapian::ExpandDecider * edecider) const {
00783             return get_eset(maxitems, omrset, 0, 1.0, edecider);
00784         }
00785 
00815         TermIterator get_matching_terms_begin(Xapian::docid did) const;
00816 
00818         TermIterator get_matching_terms_end(Xapian::docid did) const;
00819 
00843         TermIterator get_matching_terms_begin(const MSetIterator &it) const;
00844 
00846         TermIterator get_matching_terms_end(const MSetIterator &it) const;
00847 
00850         void register_match_decider(const std::string &name,
00851                                     const MatchDecider *mdecider = NULL);
00852 
00856         std::string get_description() const;
00857 };
00858 
00859 }
00860 
00861 class SocketServer;
00862 
00863 namespace Xapian {
00864 
00866 class Weight {
00867     friend class Enquire; // So Enquire can clone us
00868     friend class SocketServer; // So SocketServer can clone us - FIXME
00869     public:
00870         class Internal;
00871     private:
00872         Weight(const Weight &);
00873         void operator=(Weight &);
00874 
00876         //
00877         // Each subclass should implement this as:
00878         // virtual OmFooWeight * clone() const {
00879         //     return new OmFooWeight(param1, param2);
00880         // }
00881         virtual Weight * clone() const = 0;
00882 
00883     protected:
00884         const Internal * internal; // Weight::Internal == StatsSource
00885         Xapian::doclength querysize;
00886         Xapian::termcount wqf;
00887         std::string tname;
00888 
00889     public:
00890         Weight() { }
00891         virtual ~Weight() { }
00892 
00905         Weight * create(const Internal * internal_, Xapian::doclength querysize_,
00906                           Xapian::termcount wqf_, std::string tname_) const {
00907             Weight * wt = clone();
00908             wt->internal = internal_;
00909             wt->querysize = querysize_;
00910             wt->wqf = wqf_;
00911             wt->tname = tname_;
00912             return wt;
00913         }
00914 
00916         //
00917         //  If the subclass is called FooWeight, this should return "Foo".
00918         virtual std::string name() const = 0;
00919 
00921         virtual std::string serialise() const = 0;
00922 
00924         virtual Weight * Weight::unserialise(const std::string &s) const = 0;
00925 
00933         virtual Xapian::weight get_sumpart(Xapian::termcount wdf,
00934                                       Xapian::doclength len) const = 0;
00935 
00941         virtual Xapian::weight get_maxpart() const = 0;
00942 
00951         virtual Xapian::weight get_sumextra(Xapian::doclength len) const = 0;
00952 
00956         virtual Xapian::weight get_maxextra() const = 0;
00957 
00959         virtual bool get_sumpart_needs_doclength() const { return true; }
00960 };
00961 
00963 class BoolWeight : public Weight {
00964     public:
00965         Weight * clone() const {
00966             return new BoolWeight;
00967         }
00968         BoolWeight() { }
00969         ~BoolWeight() { }
00970         std::string name() const { return "Bool"; }
00971         std::string serialise() const { return ""; }
00972         Weight * unserialise(const std::string & /*s*/) const {
00973             return new BoolWeight;
00974         }
00975         Xapian::weight get_sumpart(Xapian::termcount /*wdf*/, Xapian::doclength /*len*/) const { return 0; }
00976         Xapian::weight get_maxpart() const { return 0; }
00977 
00978         Xapian::weight get_sumextra(Xapian::doclength /*len*/) const { return 0; }
00979         Xapian::weight get_maxextra() const { return 0; }
00980 
00981         bool get_sumpart_needs_doclength() const { return false; }      
00982 };
00983 
00985 //
00986 // BM25 weighting options : The BM25 formula is \f[
00987 //      \frac{C.s_{q}}{1+L_{d}}+\sum_{t}\frac{(A+1)q_{t}}{A+q_{t}}.\frac{(B+1)f_{t,d}}{B((1-D)+DL_{d})+f_{t,d}}.w_{t}
00988 // \f] where
00989 //   - \f$w_{t}\f$ is the termweight of term t
00990 //   - \f$f_{t,d}\f$ is the within document frequency of term t in document d
00991 //   - \f$q_{t}\f$ is the within query frequency of term t
00992 //   - \f$L_{d}\f$ is the normalised length of document d
00993 //   - \f$s_{q}\f$ is the size of the query
00994 //   - \f$A\f$, \f$B\f$, \f$C\f$ and \f$D\f$ are user specified parameters
00995 class BM25Weight : public Weight {
00996     private:
00997         mutable Xapian::weight termweight;
00998         mutable Xapian::doclength lenpart;
00999         mutable double BD;
01000 
01001         double A, B, C, D;
01002         Xapian::doclength min_normlen;
01003 
01004         mutable bool weight_calculated;
01005 
01006         void calc_termweight() const;
01007 
01008     public:
01027         BM25Weight(double A_, double B_, double C_, double D_,
01028                    double min_normlen_)
01029                 : A(A_), B(B_), C(C_), D(D_), min_normlen(min_normlen_),
01030                   weight_calculated(false)
01031         {
01032             if (A < 0) A = 0;
01033             if (B < 0) B = 0;
01034             if (C < 0) C = 0;
01035             if (D < 0) D = 0; else if (D > 1) D = 1;
01036         }
01037         BM25Weight() : A(1), B(1), C(0), D(0.5), min_normlen(0.5),
01038                        weight_calculated(false) { }
01039 
01040         Weight * clone() const {
01041             return new BM25Weight(A, B, C, D, min_normlen);
01042         }
01043         ~BM25Weight() { }
01044         std::string name() const { return "BM25"; }
01045         std::string serialise() const;
01046         Weight * unserialise(const std::string & s) const;
01047         Xapian::weight get_sumpart(Xapian::termcount wdf, Xapian::doclength len) const;
01048         Xapian::weight get_maxpart() const;
01049 
01050         Xapian::weight get_sumextra(Xapian::doclength len) const;
01051         Xapian::weight get_maxextra() const;
01052 
01053         bool get_sumpart_needs_doclength() const { return (lenpart != 0); }
01054 };
01055 
01057 //
01058 // The Traditional weighting scheme formula is \f[
01059 //      \sum_{t}\frac{f_{t,d}}{k.L_{d}+f_{t,d}}.w_{t}
01060 // \f] where
01061 //   - \f$w_{t}\f$ is the termweight of term t
01062 //   - \f$f_{t,d}\f$ is the within document frequency of term t in document d
01063 //   - \f$L_{d}\f$ is the normalised length of document d
01064 //   - \f$k\f$ is a user specifiable parameter
01065 //
01066 // TradWeight is equivalent to BM25Weight(1, 1, 0, k, 0)
01067 class TradWeight : public Weight {
01068     private:
01069         mutable Xapian::weight termweight;
01070         mutable Xapian::doclength lenpart;
01071 
01072         double param_k;
01073 
01074         mutable bool weight_calculated;
01075 
01076         void calc_termweight() const;
01077 
01078     public:
01080         //
01081         // @param k  parameter governing the importance of within
01082         //           document frequency and document length - any positive
01083         //           number, 0 being wdf and doc length not used.  Default
01084         //           is 1.
01085         TradWeight(double k = 1) : param_k(k), weight_calculated(false) {
01086             if (param_k < 0) param_k = 0;
01087         }
01088         Weight * clone() const {
01089             return new TradWeight(param_k);
01090         }
01091         ~TradWeight() { }
01092         std::string name() const { return "Trad"; }
01093         std::string serialise() const;
01094         Weight * unserialise(const std::string & s) const;
01095         
01096         Xapian::weight get_sumpart(Xapian::termcount wdf, Xapian::doclength len) const;
01097         Xapian::weight get_maxpart() const;
01098 
01099         Xapian::weight get_sumextra(Xapian::doclength len) const;
01100         Xapian::weight get_maxextra() const;
01101 
01102         bool get_sumpart_needs_doclength() const { return (lenpart != 0); }
01103 };
01104 
01105 }
01106 
01107 #endif /* XAPIAN_INCLUDED_ENQUIRE_H */

Documentation for Xapian (version 0.6.5).
Generated on 4 Jul 2003 by Doxygen 1.2.15.