Add a rank/(rank+1) normalization option to ts_rank(). While the usefulness [...] (commit message truncated in this view)

author Tom Lane <tgl@sss.pgh.pa.us>

Wed, 14 Nov 2007 23:43:27 +0000 (23:43 +0000)

committer Tom Lane <tgl@sss.pgh.pa.us>

Wed, 14 Nov 2007 23:43:27 +0000 (23:43 +0000)
author Tom Lane <tgl@sss.pgh.pa.us>
Wed, 14 Nov 2007 23:43:27 +0000 (23:43 +0000)
committer Tom Lane <tgl@sss.pgh.pa.us>
Wed, 14 Nov 2007 23:43:27 +0000 (23:43 +0000)
diff --git a/doc/src/sgml/textsearch.sgml b/doc/src/sgml/textsearch.sgml

index 31753791cda03182ea02ef0f832d2333607dffa5..9366fdd240737cf99a41e530b42da4ce27d395bf 100644 (file)
--- a/doc/src/sgml/textsearch.sgml
+++ b/doc/src/sgml/textsearch.sgml
@@ -1,4 +1,4 @@
-<!-- $PostgreSQL: pgsql/doc/src/sgml/textsearch.sgml,v 1.33 2007/11/14 18:36:37 tgl Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/textsearch.sgml,v 1.34 2007/11/14 23:43:27 tgl Exp $ -->
  
  <chapter id="textsearch">
   <title id="textsearch-title">Full Text Search</title>
@@ -940,6 +940,7 @@ SELECT plainto_tsquery('english', 'The Fat &amp; Rats:C');
       <listitem>
        <para>
         4 divides the rank by the mean harmonic distance between extents
+       (this is implemented only by <function>ts_rank_cd</>)
        </para>
       </listitem>
       <listitem>
@@ -953,17 +954,24 @@ SELECT plainto_tsquery('english', 'The Fat &amp; Rats:C');
         of unique words in document
        </para>
       </listitem>
+     <listitem>
+      <para>
+       32 divides the rank by itself + 1
+      </para>
+     </listitem>
      </itemizedlist>
  
+    If more than one flag bit is specified, the transformations are
+    applied in the order listed.
     </para>
  
     <para>
      It is important to note that the ranking functions do not use any global
-    information so it is impossible to produce a fair normalization to 1% or
-    100%, as sometimes desired.  However, a simple technique like
-    <literal>rank/(rank+1)</literal> can be applied.  Of course, this is just
-    a cosmetic change, i.e., the ordering of the search results will not
-    change.
+    information, so it is impossible to produce a fair normalization to 1% or
+    100% as sometimes desired.  Normalization option 32
+    (<literal>rank/(rank+1)</literal>) can be applied to scale all ranks
+    into the range zero to one, but of course this is just a cosmetic change;
+    it will not affect the ordering of the search results.
     </para>
  
     <para>
@@ -991,7 +999,7 @@ ORDER BY rank DESC LIMIT 10;
      This is the same example using normalized ranking:
  
  <programlisting>
-SELECT title, ts_rank_cd(textsearch, query)/(ts_rank_cd(textsearch, query) + 1) AS rank
+SELECT title, ts_rank_cd(textsearch, query, 32 /* rank/(rank+1) */ ) AS rank
  FROM apod, to_tsquery('neutrino|(dark &amp; matter)') query
  WHERE  query @@ textsearch
  ORDER BY rank DESC LIMIT 10;
diff --git a/src/backend/utils/adt/tsrank.c b/src/backend/utils/adt/tsrank.c

index bf0016d76bb0c4487ca1615e91c8653e5175db4f..297724710fffbcdb250de945219579a9d67cf7c4 100644 (file)
--- a/src/backend/utils/adt/tsrank.c
+++ b/src/backend/utils/adt/tsrank.c
@@ -7,7 +7,7 @@
   *
   *
   * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/utils/adt/tsrank.c,v 1.8 2007/09/20 18:10:57 teodor Exp $
+ *       $PostgreSQL: pgsql/src/backend/utils/adt/tsrank.c,v 1.9 2007/11/14 23:43:27 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -25,13 +25,14 @@ static float weights[] = {0.1f, 0.2f, 0.4f, 1.0f};
  
  #define wpos(wep)      ( w[ WEP_GETWEIGHT(wep) ] )
  
-#define RANK_NO_NORM           0x00
+#define RANK_NO_NORM                   0x00
  #define RANK_NORM_LOGLENGTH            0x01
-#define RANK_NORM_LENGTH       0x02
-#define RANK_NORM_EXTDIST      0x04
-#define RANK_NORM_UNIQ         0x08
-#define RANK_NORM_LOGUNIQ      0x10
-#define DEF_NORM_METHOD                RANK_NO_NORM
+#define RANK_NORM_LENGTH               0x02
+#define RANK_NORM_EXTDIST              0x04
+#define RANK_NORM_UNIQ                 0x08
+#define RANK_NORM_LOGUNIQ              0x10
+#define RANK_NORM_RDIVRPLUS1   0x20
+#define DEF_NORM_METHOD                        RANK_NO_NORM
  
  static float calc_rank_or(float *w, TSVector t, TSQuery q);
  static float calc_rank_and(float *w, TSVector t, TSQuery q);
@@ -348,12 +349,17 @@ calc_rank(float *w, TSVector t, TSQuery q, int4 method)
                         res /= (float) len;
         }
  
+       /* RANK_NORM_EXTDIST not applicable */
+
         if ((method & RANK_NORM_UNIQ) && t->size > 0)
                 res /= (float) (t->size);
  
         if ((method & RANK_NORM_LOGUNIQ) && t->size > 0)
                 res /= log((double) (t->size + 1)) / log(2.0);
  
+       if (method & RANK_NORM_RDIVRPLUS1)
+               res /= (res + 1);
+
         return res;
  }
  
@@ -762,7 +768,7 @@ calc_rank_cd(float4 *arrdata, TSVector txt, TSQuery query, int method)
                         Wdoc /= (double) len;
         }
  
-       if ((method & RANK_NORM_EXTDIST) && SumDist > 0)
+       if ((method & RANK_NORM_EXTDIST) && NExtent > 0 && SumDist > 0)
                 Wdoc /= ((double) NExtent) / SumDist;
  
         if ((method & RANK_NORM_UNIQ) && txt->size > 0)
@@ -771,6 +777,9 @@ calc_rank_cd(float4 *arrdata, TSVector txt, TSQuery query, int method)
         if ((method & RANK_NORM_LOGUNIQ) && txt->size > 0)
                 Wdoc /= log((double) (txt->size + 1)) / log(2.0);
  
+       if (method & RANK_NORM_RDIVRPLUS1)
+               Wdoc /= (Wdoc + 1);
+
         pfree(doc);
  
         pfree( qr.operandexist );
author	Tom Lane <tgl@sss.pgh.pa.us>
	Wed, 14 Nov 2007 23:43:27 +0000 (23:43 +0000)
committer	Tom Lane <tgl@sss.pgh.pa.us>
	Wed, 14 Nov 2007 23:43:27 +0000 (23:43 +0000)
doc/src/sgml/textsearch.sgml		patch | blob | history
src/backend/utils/adt/tsrank.c		patch | blob | history