include/xapian/weight.h

Go to the documentation of this file.
00001 
00004 /* Copyright (C) 2007,2008,2009 Olly Betts
00005  * Copyright (C) 2009 Lemur Consulting Ltd
00006  *
00007  * This program is free software; you can redistribute it and/or
00008  * modify it under the terms of the GNU General Public License as
00009  * published by the Free Software Foundation; either version 2 of the
00010  * License, or (at your option) any later version.
00011  *
00012  * This program is distributed in the hope that it will be useful,
00013  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00014  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00015  * GNU General Public License for more details.
00016  *
00017  * You should have received a copy of the GNU General Public License
00018  * along with this program; if not, write to the Free Software
00019  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
00020  */
00021 
00022 #ifndef XAPIAN_INCLUDED_WEIGHT_H
00023 #define XAPIAN_INCLUDED_WEIGHT_H
00024 
00025 #include <string>
00026 
00027 #include <xapian/types.h>
00028 #include <xapian/visibility.h>
00029 
00030 namespace Xapian {
00031 
00033 class XAPIAN_VISIBILITY_DEFAULT Weight {
00034   protected:
00036     typedef enum {
00037         COLLECTION_SIZE = 1,
00038         RSET_SIZE = 2,
00039         AVERAGE_LENGTH = 4,
00040         TERMFREQ = 8,
00041         RELTERMFREQ = 16,
00042         QUERY_LENGTH = 32,
00043         WQF = 64,
00044         WDF = 128,
00045         DOC_LENGTH = 256,
00046         DOC_LENGTH_MIN = 512,
00047         DOC_LENGTH_MAX = 1024,
00048         WDF_MAX = 2048
00049     } stat_flags;
00050 
00060     void need_stat(stat_flags flag) {
00061         stats_needed = stat_flags(stats_needed | flag);
00062     }
00063 
00068     virtual void init(double factor) = 0;
00069 
00070   private:
00072     void operator=(const Weight &);
00073 
00086     virtual Weight * clone() const = 0;
00087 
00089     stat_flags stats_needed;
00090 
00092     Xapian::doccount collection_size_;
00093 
00095     Xapian::doccount rset_size_;
00096 
00098     Xapian::doclength average_length_;
00099 
00101     Xapian::doccount termfreq_;
00102 
00104     Xapian::doccount reltermfreq_;
00105 
00107     Xapian::termcount query_length_;
00108 
00110     Xapian::termcount wqf_;
00111 
00113     Xapian::termcount doclength_lower_bound_;
00114 
00116     Xapian::termcount doclength_upper_bound_;
00117 
00119     Xapian::termcount wdf_upper_bound_;
00120 
00121   public:
00122     class Internal;
00123 
00125     virtual ~Weight();
00126 
00141     virtual std::string name() const = 0;
00142 
00149     virtual std::string serialise() const = 0;
00150 
00163     virtual Weight * unserialise(const std::string & s) const = 0;
00164 
00173     virtual Xapian::weight get_sumpart(Xapian::termcount wdf,
00174                                        Xapian::termcount doclen) const = 0;
00175 
00181     virtual Xapian::weight get_maxpart() const = 0;
00182 
00190     virtual Xapian::weight get_sumextra(Xapian::termcount doclen) const = 0;
00191 
00198     virtual Xapian::weight get_maxextra() const = 0;
00199 
00207     Weight * clone_() const { return clone(); }
00208 
00218     void init_(const Internal & stats, Xapian::termcount query_len_,
00219                const std::string & term, Xapian::termcount wqf_,
00220                double factor);
00221 
00231     void init_(const Internal & stats, Xapian::termcount query_len_,
00232                double factor, Xapian::doccount termfreq,
00233                Xapian::doccount reltermfreq);
00234 
00241     void init_(const Internal & stats, Xapian::termcount query_len_);
00242 
00249     bool get_sumpart_needs_doclength_() const {
00250         return stats_needed & DOC_LENGTH;
00251     }
00252 
00258     bool get_sumpart_needs_wdf_() const {
00259         return stats_needed & WDF;
00260     }
00261 
00262   protected:
00264     Weight(const Weight &);
00265 
00267     Weight() : stats_needed() { }
00268 
00270     Xapian::doccount get_collection_size() const { return collection_size_; }
00271 
00273     Xapian::doccount get_rset_size() const { return rset_size_; }
00274 
00276     Xapian::doclength get_average_length() const { return average_length_; }
00277 
00279     Xapian::doccount get_termfreq() const { return termfreq_; }
00280 
00282     Xapian::doccount get_reltermfreq() const { return reltermfreq_; }
00283 
00285     Xapian::termcount get_query_length() const { return query_length_; }
00286 
00288     Xapian::termcount get_wqf() const { return wqf_; }
00289 
00294     Xapian::termcount get_doclength_upper_bound() const {
00295         return doclength_upper_bound_;
00296     }
00297 
00302     Xapian::termcount get_doclength_lower_bound() const {
00303         return doclength_lower_bound_;
00304     }
00305 
00310     Xapian::termcount get_wdf_upper_bound() const {
00311         return wdf_upper_bound_;
00312     }
00313 };
00314 
00319 class XAPIAN_VISIBILITY_DEFAULT BoolWeight : public Weight {
00320     BoolWeight * clone() const;
00321 
00322     void init(double factor);
00323 
00324   public:
00326     BoolWeight() { }
00327 
00328     std::string name() const;
00329 
00330     std::string serialise() const;
00331     BoolWeight * unserialise(const std::string & s) const;
00332 
00333     Xapian::weight get_sumpart(Xapian::termcount wdf,
00334                                Xapian::termcount doclen) const;
00335     Xapian::weight get_maxpart() const;
00336 
00337     Xapian::weight get_sumextra(Xapian::termcount doclen) const;
00338     Xapian::weight get_maxextra() const;
00339 };
00340 
00342 class XAPIAN_VISIBILITY_DEFAULT BM25Weight : public Weight {
00344     mutable Xapian::doclength len_factor;
00345 
00347     mutable Xapian::weight termweight;
00348 
00350     double param_k1, param_k2, param_k3, param_b;
00351 
00353     Xapian::doclength param_min_normlen;
00354 
00355     BM25Weight * clone() const;
00356 
00357     void init(double factor);
00358 
00359   public:
00387     BM25Weight(double k1, double k2, double k3, double b, double min_normlen)
00388         : param_k1(k1), param_k2(k2), param_k3(k3), param_b(b),
00389           param_min_normlen(min_normlen)
00390     {
00391         if (param_k1 < 0) param_k1 = 0;
00392         if (param_k2 < 0) param_k2 = 0;
00393         if (param_k3 < 0) param_k3 = 0;
00394         if (param_b < 0) {
00395             param_b = 0;
00396         } else if (param_b > 1) {
00397             param_b = 1;
00398         }
00399         need_stat(COLLECTION_SIZE);
00400         need_stat(RSET_SIZE);
00401         need_stat(TERMFREQ);
00402         need_stat(RELTERMFREQ);
00403         need_stat(WDF);
00404         need_stat(WDF_MAX);
00405         need_stat(WDF);
00406         if (param_k2 != 0 || (param_k1 != 0 && param_b != 0)) {
00407             need_stat(DOC_LENGTH_MIN);
00408             need_stat(AVERAGE_LENGTH);
00409         }
00410         if (param_k1 != 0 && param_b != 0) need_stat(DOC_LENGTH);
00411         if (param_k2 != 0) need_stat(QUERY_LENGTH);
00412         if (param_k3 != 0) need_stat(WQF);
00413     }
00414 
00415     BM25Weight()
00416         : param_k1(1), param_k2(0), param_k3(1), param_b(0.5),
00417           param_min_normlen(0.5)
00418     {
00419         need_stat(COLLECTION_SIZE);
00420         need_stat(RSET_SIZE);
00421         need_stat(TERMFREQ);
00422         need_stat(RELTERMFREQ);
00423         need_stat(WDF);
00424         need_stat(WDF_MAX);
00425         need_stat(WDF);
00426         need_stat(DOC_LENGTH_MIN);
00427         need_stat(AVERAGE_LENGTH);
00428         need_stat(DOC_LENGTH);
00429         need_stat(WQF);
00430     }
00431 
00432     std::string name() const;
00433 
00434     std::string serialise() const;
00435     BM25Weight * unserialise(const std::string & s) const;
00436 
00437     Xapian::weight get_sumpart(Xapian::termcount wdf,
00438                                Xapian::termcount doclen) const;
00439     Xapian::weight get_maxpart() const;
00440 
00441     Xapian::weight get_sumextra(Xapian::termcount doclen) const;
00442     Xapian::weight get_maxextra() const;
00443 };
00444 
00454 class XAPIAN_VISIBILITY_DEFAULT TradWeight : public Weight {
00456     mutable Xapian::doclength len_factor;
00457 
00459     mutable Xapian::weight termweight;
00460 
00462     double param_k;
00463 
00464     TradWeight * clone() const;
00465 
00466     void init(double factor);
00467 
00468   public:
00476     explicit TradWeight(double k = 1.0) : param_k(k) {
00477         if (param_k < 0) param_k = 0;
00478         if (param_k != 0.0) {
00479             need_stat(AVERAGE_LENGTH);
00480             need_stat(DOC_LENGTH);
00481         }
00482         need_stat(COLLECTION_SIZE);
00483         need_stat(RSET_SIZE);
00484         need_stat(TERMFREQ);
00485         need_stat(RELTERMFREQ);
00486         need_stat(DOC_LENGTH_MIN);
00487         need_stat(WDF);
00488         need_stat(WDF_MAX);
00489         need_stat(WDF);
00490     }
00491 
00492     std::string name() const;
00493 
00494     std::string serialise() const;
00495     TradWeight * unserialise(const std::string & s) const;
00496 
00497     Xapian::weight get_sumpart(Xapian::termcount wdf,
00498                                Xapian::termcount doclen) const;
00499     Xapian::weight get_maxpart() const;
00500 
00501     Xapian::weight get_sumextra(Xapian::termcount doclen) const;
00502     Xapian::weight get_maxextra() const;
00503 };
00504 
00505 }
00506 
00507 #endif // XAPIAN_INCLUDED_WEIGHT_H

Documentation for Xapian (version 1.1.2).
Generated on 23 Jul 2009 by Doxygen 1.5.2.