Main Page   Class Hierarchy   Alphabetical List   Compound List   File List   Compound Members   File Members  

include/xapian/enquire.h

Go to the documentation of this file.
00001 
00025 #ifndef XAPIAN_INCLUDED_ENQUIRE_H
00026 #define XAPIAN_INCLUDED_ENQUIRE_H
00027 
00028 #include <string>
00029 #include <xapian/types.h>
00030 
00031 namespace Xapian {
00032 
00033 class Document;
00034 class Database;
00035 class ErrorHandler;
00036 
00037 class Query;
00038 class Weight;
00039 
00044 class MSetIterator {
          // Iterator over the items of an MSet.  Dereferencing yields the docid
          // of the current item; the get_*() accessors return further values
          // for that item.
00045   public:
          // Opaque implementation class.  It is declared ahead of the private
          // constructor because that constructor's parameter list names it,
          // and a parameter type has to be declared before it is used.
00046     class Internal;
00047 
00048   private:
          // Wraps an existing implementation object.  Private, so only this
          // class and its friends (MSet) can call it.
00049     MSetIterator(Internal *internal_);
00050 
00051   public:
00052     friend class MSet;
00053     Internal *internal;
00054 
          // Equality comparison; operator!= is defined in terms of it.
00055     friend bool operator==(const MSetIterator &a, const MSetIterator &b);
00056 
          // Default constructor.
00060     MSetIterator();
00061 
00062     ~MSetIterator();
00063 
          // Copy constructor.
00065     MSetIterator(const MSetIterator &other);
00066 
          // Copy assignment.  Returns void, so assignments cannot be chained.
00068     void operator=(const MSetIterator &other);
00069 
          // Pre-increment: advances the iterator and returns a reference to it.
00071     MSetIterator & operator++();
00072 
          // Post-increment.  Returns void, so the previous position is not
          // available (`*it++` does not compile).
00073     void operator++(int);
00074 
          // Returns the docid of the current item.
00076     docid operator *() const;
00077 
          // Returns the Document for the current item.
00094     Document get_document() const;
00095 
          // Returns the rank of the current item as a doccount.
00102     doccount get_rank() const;
00103 
          // Returns the weight of the current item.
00105     weight get_weight() const;
00106 
          // Returns the current item's score as a percent value.
00111     percent get_percent() const;
00112 
          // Returns a string describing this object.
00116     std::string get_description() const;
00117 
00119 
          // Iterator traits: this class presents itself as an input iterator
          // whose value type is docid.
00120     typedef std::input_iterator_tag iterator_category;
00121     typedef docid value_type;
00122     typedef doccount_diff difference_type;
00123     typedef docid * pointer;
00124     typedef docid & reference;
00126 };
00127 
00128 inline bool operator!=(const MSetIterator &a, const MSetIterator &b)
00129 {
          // Two iterators differ exactly when operator== says they are not equal.
00130     return (a == b) ? false : true;
00131 }
00132 
00136 class MSet {
          // Container-like class whose items are visited with MSetIterator.
          // It offers begin()/end() iteration, size queries and a set of
          // match-count and weight accessors.
00137   public:
          // Opaque implementation class; only a pointer to it is held here.
00138     class Internal;
00140     Internal *internal;
00141 
00142     // FIXME: public for now, private would be better
          // Constructs an MSet from an implementation object.
00144     MSet(MSet::Internal * internal_);
00145 
          // Default constructor.
00147     MSet();
00148 
00150     ~MSet();
00151 
          // Copy constructor.
00153     MSet(const MSet & other);
00154 
          // Copy assignment.  Returns void, so assignments cannot be chained.
00156     void operator=(const MSet &other);
00157 
          // fetch() overloads: for the range [begin, end), for a single item,
          // or (no arguments) presumably for every item - confirm against the
          // implementation.  All three are const.
00173     void fetch(const MSetIterator &begin, const MSetIterator &end) const;
00174 
00177     void fetch(const MSetIterator &item) const;
00178 
00181     void fetch() const;
00182 
          // Converts a weight to a percent value.
00187     percent convert_to_percent(weight wt) const;
00188 
          // As above, taking the item an iterator refers to.
00190     percent convert_to_percent(const MSetIterator &it) const;
00191 
          // Returns the term frequency recorded for term `tname`.
00199     doccount get_termfreq(const termname &tname) const;
00200 
          // Returns the term weight recorded for term `tname`.
00208     weight get_termweight(const termname &tname) const;
00209 
          // Returns the index of the first item, as a doccount.
00216     doccount get_firstitem() const;
00217 
          // Lower bound, estimate and upper bound for the number of matches.
00224     doccount get_matches_lower_bound() const;
00225 
00235     doccount get_matches_estimated() const;
00236 
00243     doccount get_matches_upper_bound() const;
00244 
          // Maximum possible weight, and maximum weight actually attained.
00250     weight get_max_possible() const;
00251 
00265     weight get_max_attained() const;
00266 
          // Standard-container style interface.
00267     doccount size() const;
00268 
00269     doccount max_size() const;
00270 
00271     bool empty() const;
00272 
00273     void swap(MSet & other);
00274 
00275     MSetIterator begin() const;
00276 
00277     MSetIterator end() const;
00278 
          // NOTE(review): unlike standard containers, back() returns an
          // iterator rather than a reference to the last element.
00279     MSetIterator back() const;
00280     
          // Returns an iterator for index `i`.
00290     MSetIterator operator[](doccount i) const;
00291 
00293 
          // Container typedefs.  Note that const_reference is declared as a
          // non-const reference, identical to reference.
00294     typedef std::input_iterator_tag iterator_category;
00295     typedef MSetIterator value_type; // FIXME: not assignable...
00296     typedef MSetIterator iterator;
00297     typedef MSetIterator const_iterator;
00298     typedef MSetIterator & reference; // Hmm
00299     typedef MSetIterator & const_reference;
00300     typedef MSetIterator * pointer; // Hmm
00301     typedef doccount_diff difference_type;
00302     typedef doccount size_type;
00304     
          // Returns a string describing this object.
00308     std::string get_description() const;
00309 };
00310 
00312 class ESetIterator {
          // Iterator over the items of an ESet.  Dereferencing yields the term
          // name of the current item.
00313   public:
          // Opaque implementation class.  It is declared ahead of the private
          // constructor because that constructor's parameter list names it,
          // and a parameter type has to be declared before it is used.
00314     class Internal;
00315 
00316   private:
          // Wraps an existing implementation object.  Private, so only this
          // class and its friends (ESet) can call it.
00317     ESetIterator(Internal *internal_);
00318 
00319   public:
00320     friend class ESet;
00321     Internal *internal;
00322 
          // Equality comparison; operator!= is defined in terms of it.
00323     friend bool operator==(const ESetIterator &a, const ESetIterator &b);
00324 
          // Default constructor.
00328     ESetIterator();
00329 
00331     ~ESetIterator();
00332 
          // Copy constructor.
00334     ESetIterator(const ESetIterator &other);
00335 
          // Copy assignment.  Returns void, so assignments cannot be chained.
00337     void operator=(const ESetIterator &other);
00338 
          // Pre-increment: advances the iterator and returns a reference to it.
00339     ESetIterator & operator++();
00340 
          // Post-increment.  Returns void, so the previous position is not
          // available (`*it++` does not compile).
00341     void operator++(int);
00342 
          // Returns the term name of the current item, by const reference.
00344     const termname & operator *() const;
00345 
          // Returns the weight of the current item.
00347     weight get_weight() const;
00348 
          // Returns a string describing this object.
00352     std::string get_description() const;
00353 
00355 
          // Iterator traits: this class presents itself as an input iterator
          // whose value type is termname.
00356     typedef std::input_iterator_tag iterator_category;
00357     typedef termname value_type;
00358     typedef termcount_diff difference_type;
00359     typedef termname * pointer;
00360     typedef termname & reference;
00362 };
00363 
00364 inline bool operator!=(const ESetIterator &a, const ESetIterator &b)
00365 {
          // Inequality is the negation of the friend operator==.
00366     const bool same = (a == b);
00367     return !same;
00368 }
00369 
00375 class ESet {
          // Container-like class whose items are visited with ESetIterator
          // (each item exposes a term name and a weight).
00376   public:
          // Opaque implementation class; only a pointer to it is held here.
00377     class Internal;
00379     Internal *internal;
00380 
          // Default constructor.
00382     ESet();
00383 
00385     ~ESet();
00386 
          // Copy constructor.
00388     ESet(const ESet & other);
00389 
          // Copy assignment.  Returns void, so assignments cannot be chained.
00391     void operator=(const ESet &other);
00392 
          // Returns a termcount bound for this set.
          // NOTE(review): the exact meaning of "ebound" is not visible in this
          // header - confirm against the implementation.
00397     termcount get_ebound() const;
00398 
          // Number of items in the set.
00400     termcount size() const;
00401 
          // Whether the set has no items.
00403     bool empty() const;
00404 
          // Iterator to the first item.
00406     ESetIterator begin() const;
00407 
          // Iterator marking the end of the items.
00409     ESetIterator end() const;
00410 
          // Returns a string describing this object.
00415     std::string get_description() const;
00416 };
00417 
00422 class RSet {
          // A set of documents identified by docid.  Documents can be added,
          // removed and tested for membership, either by docid or through an
          // MSetIterator (which is dereferenced to obtain the docid).
00423   public:
          // Opaque implementation class; only a pointer to it is held here.
00425     class Internal;
00427     Internal *internal;
00428 
          // Copy constructor.
00430     RSet(const RSet &rset);
00431 
          // Copy assignment.  Returns void, so assignments cannot be chained.
00433     void operator=(const RSet &rset);
00434 
          // Default constructor.
00436     RSet();
00437 
00439     ~RSet();
00440 
          // Number of documents in the set.
00442     doccount size() const;
00443 
          // Whether the set has no documents.
00445     bool empty() const;
00446 
          // Adds document `did` to the set.
00448     void add_document(docid did);
00449     
          // Adds the document that `i` refers to.
00451     void add_document(const MSetIterator & i) { add_document(*i); }
00452 
          // Removes document `did` from the set.
00454     void remove_document(docid did);
00455 
          // Removes the document that `i` refers to.
00457     void remove_document(const MSetIterator & i) { remove_document(*i); }
00458 
          // Tests whether document `did` is in the set.
00460     bool contains(docid did) const;
00461 
          // Tests whether the document that `i` refers to is in the set.
          // Declared const like the docid overload it forwards to, so that it
          // can be called on a const RSet.
00463     bool contains(const MSetIterator & i) const { return contains(*i); }
00464 
          // Returns a string describing this object.
00469     std::string get_description() const;
00470 };
00471 
00474 class MatchDecider {
          // Abstract functor interface taking a Document.  Subclasses
          // implement operator(); Enquire::get_mset() accepts an optional
          // pointer to one.
00475   public:
          // Decision function for `doc`.
          // NOTE(review): the result is an int; presumably non-zero means
          // "accept" - confirm against the callers.
00478     virtual int operator()(const Document &doc) const = 0;
00479 
          // Virtual so that subclasses are destroyed correctly when deleted
          // through a MatchDecider pointer.
00481     virtual ~MatchDecider() {}
00482 };
00483 
00486 class ExpandDecider {
          // Abstract functor interface taking a term name.  Subclasses
          // implement operator(); Enquire::get_eset() accepts an optional
          // pointer to one.
00487   public:
          // Decision function for `tname`.
          // NOTE(review): the result is an int; presumably non-zero means
          // "accept" - confirm against the callers.
00490     virtual int operator()(const termname & tname) const = 0;
00491 
          // Virtual so that subclasses are destroyed correctly when deleted
          // through an ExpandDecider pointer.
00493     virtual ~ExpandDecider() {}
00494 };
00495 
00509 class Enquire {
          // Entry point for running queries: it is constructed from a
          // Database, configured through the set_*() methods, and then
          // produces an MSet (get_mset) or an ESet (get_eset).
00510   private:
          // Copying is disabled: both operations are private and no
          // definition is visible in this header.
00512     Enquire(const Enquire &);
00513 
00515     void operator=(const Enquire &);
00516 
00517   public:
          // Opaque implementation class; only a pointer to it is held here.
00518     class Internal;
00520     Internal *internal;
00521 
          // Constructs an Enquire for `databases`.  The error handler is
          // optional and defaults to a null pointer.
00537     Enquire(const Database &databases, ErrorHandler * errorhandler_ = 0);
00538 
00547     ~Enquire();
00548 
          // Sets the query.
00556     void set_query(const Query & query_);
00557 
          // Returns the query, by const reference.
          // NOTE(review): this getter is not declared const, so it cannot be
          // called on a const Enquire.
00564     const Query & get_query();
00565 
          // Sets the weighting scheme.
00572     void set_weighting_scheme(const Weight &weight_);
00573 
          // Sets the value number used as the collapse key.
00580     void set_collapse_key(valueno collapse_key);
00581 
          // Selects forward (true) or reverse (false) sort direction.
00588     void set_sort_forward(bool sort_forward);
00589 
          // Sets a percentage cutoff and, optionally, a weight cutoff
          // (default 0).
00607     void set_cutoff(int percent_cutoff, weight weight_cutoff = 0);
00608 
          // Sets the value number to sort on and the number of sort bands.
00622     void set_sorting(valueno sort_key, int sort_bands);
00623 
          // Sets a bias weight and its half-life.
          // NOTE(review): time_t is used here but this header includes no
          // header that declares it directly - confirm it arrives via
          // <xapian/types.h>.
00635     void set_bias(weight bias_weight, time_t bias_halflife);
00636 
          // Returns an MSet of up to `maxitems` items starting at index
          // `first`.  `rset` and `mdecider` are optional and default to null
          // pointers.
00657     MSet get_mset(doccount first, doccount maxitems, const RSet * rset = 0,
00658                   const MatchDecider * mdecider = 0) const;
00659 
          // Bit values, presumably for the `flags` argument of get_eset() -
          // confirm against the implementation.
00660     static const int include_query_terms = 1;
00661     static const int use_exact_termfreq = 2;
          // Returns an ESet of up to `maxitems` items for `rset`.  `flags`
          // defaults to 0, `k` to 1.0 and `edecider` to a null pointer.
00685     ESet get_eset(termcount maxitems, const RSet & rset, int flags = 0,
00686                   double k = 1.0, const ExpandDecider * edecider = 0) const;
00687 
          // Convenience overload: forwards to the full get_eset() with
          // flags 0 and k 1.0.
00702     ESet get_eset(termcount maxitems, const RSet & rset,
00703                   const ExpandDecider * edecider) const {
00704         return get_eset(maxitems, rset, 0, 1.0, edecider);
00705     }
00706 
          // Begin/end iterators over the matching terms of document `did`.
00736     TermIterator get_matching_terms_begin(docid did) const;
00737 
00739     TermIterator get_matching_terms_end(docid did) const;
00740 
          // As above, for the document that `it` refers to.
00764     TermIterator get_matching_terms_begin(const MSetIterator &it) const;
00765 
00767     TermIterator get_matching_terms_end(const MSetIterator &it) const;
00768 
          // Registers `mdecider` under `name`.  The default is written as 0,
          // matching every other null-pointer default in this class, so the
          // declaration does not depend on the NULL macro being defined.
00771     void register_match_decider(const std::string &name,
00772                                 const MatchDecider *mdecider = 0);
00773 
          // Returns a string describing this object.
00777     std::string get_description() const;
00778 };
00779 
00780 class SocketServer;
00781 
00783 class Weight {
          // Abstract base class for weighting schemes.  A scheme object acts
          // as a prototype: create() clones it and fills in the per-term
          // state (internal, querysize, wqf, tname) on the clone.
00784   public:
00785     class Internal;
00786     friend class Enquire; // So Enquire can clone us
00787     friend class SocketServer; // So SocketServer can clone us - FIXME
00788   private:
          // Copying is disabled: both operations are private and no
          // definition is visible in this header.  The assignment operator
          // takes a const reference, like the other disabled assignment
          // operator in this file.
00789     Weight(const Weight &);
00790     void operator=(const Weight &);
00791 
00793     //
00794     // Each subclass should implement this as:
00795     // virtual FooWeight * clone() const {
00796     //     return new FooWeight(param1, param2);
00797     // }
00798     virtual Weight * clone() const = 0;
00799 
00800   protected:
          // Per-term state, assigned by create().
00801     const Internal * internal; // Weight::Internal used to be called StatsSource...
00802     doclength querysize;
00803     termcount wqf;
00804     termname tname;
00805 
00806   public:
          // Starts with a null `internal` pointer and value-initialised
          // counters, so a prototype that has not been through create() never
          // holds indeterminate values.
00807     Weight() : internal(0), querysize(), wqf() { }
00808     virtual ~Weight() { }
00809 
          // Clones this object and stores the given values in the clone's
          // protected members.  The clone comes from clone(), which the
          // subclasses in this file implement with `new`.
          // NOTE(review): presumably the caller owns and must delete the
          // result - confirm against the callers.
00821     Weight * create(const Internal * internal_, doclength querysize_,
00822                     termcount wqf_, termname tname_) const {
00823         Weight * wt = clone();
00824         wt->internal = internal_;
00825         wt->querysize = querysize_;
00826         wt->wqf = wqf_;
00827         wt->tname = tname_;
00828         return wt;
00829     }
00830 
00832     //
00833     //  If the subclass is called FooWeight, this should return "Foo".
00834     virtual std::string name() const = 0;
00835 
          // Returns a string form of this object; unserialise() takes such a
          // string.
00837     virtual std::string serialise() const = 0;
00838 
          // Creates a Weight object from the string `s`.  Declared without a
          // `Weight::` qualifier: a qualified name is not permitted on a
          // member declaration inside its own class.
00840     virtual Weight * unserialise(const std::string &s) const = 0;
00841 
          // Weight contribution computed from `wdf` and `len`.
00849     virtual weight get_sumpart(termcount wdf, doclength len) const = 0;
00850 
          // NOTE(review): presumably an upper bound on get_sumpart() - confirm.
00856     virtual weight get_maxpart() const = 0;
00857 
          // Extra weight contribution computed from `len` alone.
00866     virtual weight get_sumextra(doclength len) const = 0;
00867 
          // NOTE(review): presumably an upper bound on get_sumextra() - confirm.
00871     virtual weight get_maxextra() const = 0;
00872 
          // Returns true by default; subclasses in this file override it.
00874     virtual bool get_sumpart_needs_doclength() const { return true; }
00875 };
00876 
00878 class BoolWeight : public Weight {
          // Weighting scheme in which every weight component is 0.  It has no
          // parameters, so its serialised form is the empty string.
00879   public:
00880     BoolWeight() { }
00881     ~BoolWeight() { }
00882 
          // Returns a freshly allocated BoolWeight.
00883     Weight * clone() const { return new BoolWeight(); }
00884 
00885     std::string name() const { return std::string("Bool"); }
00886     std::string serialise() const { return std::string(); }
          // The serialised string is ignored.
00887     Weight * unserialise(const std::string &) const {
00888         return new BoolWeight();
00889     }
00890 
          // All four weight components are constant zero.
00891     weight get_sumpart(termcount, doclength) const { return 0; }
00892     weight get_maxpart() const { return 0; }
00893 
00894     weight get_sumextra(doclength) const { return 0; }
00895     weight get_maxextra() const { return 0; }
00896 
          // get_sumpart() ignores its length argument.
00897     bool get_sumpart_needs_doclength() const { return false; }
00898 
00899 };
00900 
00902 //
00903 // BM25 weighting options : The BM25 formula is \f[
00904 //      \frac{C.s_{q}}{1+L_{d}}+\sum_{t}\frac{(A+1)q_{t}}{A+q_{t}}.\frac{(B+1)f_{t,d}}{B((1-D)+DL_{d})+f_{t,d}}.w_{t}
00905 // \f] where
00906 //   - \f$w_{t}\f$ is the termweight of term t
00907 //   - \f$f_{t,d}\f$ is the within document frequency of term t in document d
00908 //   - \f$q_{t}\f$ is the within query frequency of term t
00909 //   - \f$L_{d}\f$ is the normalised length of document d
00910 //   - \f$s_{q}\f$ is the size of the query
00911 //   - \f$A\f$, \f$B\f$, \f$C\f$ and \f$D\f$ are user specified parameters
00912 class BM25Weight : public Weight {
00913   private:
          // Mutable so that const member functions can assign them.  None of
          // the constructors in this header initialises them.
          // NOTE(review): presumably calc_termweight() fills them in and sets
          // weight_calculated - confirm against the implementation file.
00914     mutable weight termweight;
00915     mutable doclength lenpart;
00916     mutable double BD;
00917 
          // User-specified parameters of the formula, plus the minimum
          // normalised length.
00918     double A, B, C, D;
00919     doclength min_normlen;
00920 
          // False until the values above have been calculated.
00921     mutable bool weight_calculated;
00922 
00923     void calc_termweight() const;
00924 
00925   public:
          // Stores the parameters, clamping negative A, B and C to 0 and
          // D to the range [0, 1].
00944     BM25Weight(double A_, double B_, double C_, double D_,
00945                double min_normlen_)
00946             : A(A_), B(B_), C(C_), D(D_), min_normlen(min_normlen_),
00947               weight_calculated(false)
00948     {
00949         if (A < 0) A = 0;
00950         if (B < 0) B = 0;
00951         if (C < 0) C = 0;
00952         if (D < 0) D = 0; else if (D > 1) D = 1;
00953     }
          // Defaults: A = 1, B = 1, C = 0, D = 0.5, min_normlen = 0.5.
00954     BM25Weight() : A(1), B(1), C(0), D(0.5), min_normlen(0.5),
00955                    weight_calculated(false) { }
00956 
          // Returns a new BM25Weight with the same parameters.
00957     Weight * clone() const {
00958         return new BM25Weight(A, B, C, D, min_normlen);
00959     }
00960     ~BM25Weight() { }
00961     std::string name() const { return "BM25"; }
00962     std::string serialise() const;
00963     Weight * unserialise(const std::string & s) const;
00964     weight get_sumpart(termcount wdf, doclength len) const;
00965     weight get_maxpart() const;
00966 
00967     weight get_sumextra(doclength len) const;
00968     weight get_maxextra() const;
00969 
          // lenpart is not initialised by any constructor, so run the lazy
          // calculation first if it has not happened yet rather than reading
          // an indeterminate value.
00970     bool get_sumpart_needs_doclength() const {
              if (!weight_calculated) calc_termweight();
              return (lenpart != 0);
          }
00971 };
00972 
00974 //
00975 // The Traditional weighting scheme formula is \f[
00976 //      \sum_{t}\frac{f_{t,d}}{k.L_{d}+f_{t,d}}.w_{t}
00977 // \f] where
00978 //   - \f$w_{t}\f$ is the termweight of term t
00979 //   - \f$f_{t,d}\f$ is the within document frequency of term t in document d
00980 //   - \f$L_{d}\f$ is the normalised length of document d
00981 //   - \f$k\f$ is a user specifiable parameter
00982 //
00983 // TradWeight is equivalent to BM25Weight(1, 1, 0, k, 0)
00984 class TradWeight : public Weight {
00985   private:
          // Mutable so that const member functions can assign them.  The
          // constructor in this header does not initialise them.
          // NOTE(review): presumably calc_termweight() fills them in and sets
          // weight_calculated - confirm against the implementation file.
00986     mutable weight termweight;
00987     mutable doclength lenpart;
00988 
          // The user-specified parameter k of the formula.
00989     double param_k;
00990 
          // False until the values above have been calculated.
00991     mutable bool weight_calculated;
00992 
00993     void calc_termweight() const;
00994 
00995   public:
00997     //
00998     // @param k  parameter governing the importance of within
00999     //           document frequency and document length - any positive
01000     //           number, 0 being wdf and doc length not used.  Default
01001     //           is 1.
          // A negative k is clamped to 0.
01002     TradWeight(double k = 1) : param_k(k), weight_calculated(false) {
01003         if (param_k < 0) param_k = 0;
01004     }
          // Returns a new TradWeight with the same k.
01005     Weight * clone() const {
01006         return new TradWeight(param_k);
01007     }
01008     ~TradWeight() { }
01009     std::string name() const { return "Trad"; }
01010     std::string serialise() const;
01011     Weight * unserialise(const std::string & s) const;
01012     
01013     weight get_sumpart(termcount wdf, doclength len) const;
01014     weight get_maxpart() const;
01015 
01016     weight get_sumextra(doclength len) const;
01017     weight get_maxextra() const;
01018 
          // lenpart is not initialised by the constructor, so run the lazy
          // calculation first if it has not happened yet rather than reading
          // an indeterminate value.
01019     bool get_sumpart_needs_doclength() const {
              if (!weight_calculated) calc_termweight();
              return (lenpart != 0);
          }
01020 };
01021 
01022 };
01023 
01024 #endif

Documentation for Xapian (version 0.6.3).
Generated on 24 Dec 2002 by Doxygen 1.2.15.