Improve GiST range-contained-by searches by adding a flag for empty ranges.

author Tom Lane <tgl@sss.pgh.pa.us>

Sun, 27 Nov 2011 21:50:37 +0000 (16:50 -0500)

committer Tom Lane <tgl@sss.pgh.pa.us>

Sun, 27 Nov 2011 21:51:29 +0000 (16:51 -0500)
author Tom Lane <tgl@sss.pgh.pa.us>
Sun, 27 Nov 2011 21:50:37 +0000 (16:50 -0500)
committer Tom Lane <tgl@sss.pgh.pa.us>
Sun, 27 Nov 2011 21:51:29 +0000 (16:51 -0500)
diff --git a/src/backend/utils/adt/rangetypes.c b/src/backend/utils/adt/rangetypes.c

index d0f890c28aafc46813bfdfd7b5c6b3d04d0739dd..2b9e7b320abb81f9a1db2cfdb92a37f05f0288e4 100644 (file)
--- a/src/backend/utils/adt/rangetypes.c
+++ b/src/backend/utils/adt/rangetypes.c
@@ -1622,6 +1622,24 @@ range_get_flags(RangeType *range)
         return *((char *) range + VARSIZE(range) - 1);
  }
  
+/*
+ * range_set_contain_empty: set the RANGE_CONTAIN_EMPTY bit in the value.
+ *
+ * This is only needed in GiST operations, so we don't include a provision
+ * for setting it in range_serialize; rather, this function must be applied
+ * afterwards.
+ */
+void
+range_set_contain_empty(RangeType *range)
+{
+       char       *flagsp;
+
+       /* flag byte is datum's last byte */
+       flagsp = (char *) range + VARSIZE(range) - 1;
+
+       *flagsp |= RANGE_CONTAIN_EMPTY;
+}
+
  /*
   * This both serializes and canonicalizes (if applicable) the range.
   * This should be used by most callers.
diff --git a/src/backend/utils/adt/rangetypes_gist.c b/src/backend/utils/adt/rangetypes_gist.c

index be59a5c4a3858a38040a97ed827144e971ed575c..b68395882892b837b98b603e07527ca745a8e779 100644 (file)
--- a/src/backend/utils/adt/rangetypes_gist.c
+++ b/src/backend/utils/adt/rangetypes_gist.c
@@ -17,6 +17,7 @@
  #include "access/gist.h"
  #include "access/skey.h"
  #include "utils/builtins.h"
+#include "utils/datum.h"
  #include "utils/rangetypes.h"
  
  
@@ -32,7 +33,11 @@
  #define RANGESTRAT_CONTAINED_BY                        8
  #define RANGESTRAT_CONTAINS_ELEM               16
  #define RANGESTRAT_EQ                                  18
-#define RANGESTRAT_NE                                  19
+
+/* Copy a RangeType datum (hardwires typbyval and typlen for ranges...) */
+#define rangeCopy(r) \
+       ((RangeType *) DatumGetPointer(datumCopy(PointerGetDatum(r), \
+                                                                                        false, -1)))
  
  /*
   * Auxiliary structure for picksplit method.
@@ -145,6 +150,16 @@ range_gist_penalty(PG_FUNCTION_ARGS)
  
         subtype_diff = &typcache->rng_subdiff_finfo;
  
+       /*
+        * If new is or contains empty, and orig doesn't, apply infinite penalty.
+        * We really don't want to pollute an empty-free subtree with empties.
+        */
+       if (RangeIsOrContainsEmpty(new) && !RangeIsOrContainsEmpty(orig))
+       {
+               *penalty = get_float4_infinity();
+               PG_RETURN_POINTER(penalty);
+       }
+
         /*
          * We want to compare the size of "orig" to size of "orig union new".
          * The penalty will be the sum of the reduction in the lower bound plus
@@ -163,30 +178,9 @@ range_gist_penalty(PG_FUNCTION_ARGS)
         }
         else if (empty1)
         {
-               if (lower2.infinite || upper2.infinite)
-               {
-                       /* from empty to infinite */
-                       *penalty = get_float4_infinity();
-                       PG_RETURN_POINTER(penalty);
-               }
-               else if (OidIsValid(subtype_diff->fn_oid))
-               {
-                       /* from empty to upper2-lower2 */
-                       *penalty = DatumGetFloat8(FunctionCall2Coll(subtype_diff,
-                                                                                                               typcache->rng_collation,
-                                                                                                               upper2.val,
-                                                                                                               lower2.val));
-                       /* upper2 must be >= lower2 */
-                       if (*penalty < 0)
-                               *penalty = 0;           /* subtype_diff is broken */
-                       PG_RETURN_POINTER(penalty);
-               }
-               else
-               {
-                       /* wild guess */
-                       *penalty = 1.0;
-                       PG_RETURN_POINTER(penalty);
-               }
+               /* infinite penalty for pushing non-empty into all-empty subtree */
+               *penalty = get_float4_infinity();
+               PG_RETURN_POINTER(penalty);
         }
  
         /* if orig isn't empty, s_union can't be either */
@@ -334,15 +328,27 @@ range_gist_picksplit(PG_FUNCTION_ARGS)
  Datum
  range_gist_same(PG_FUNCTION_ARGS)
  {
-       /* Datum r1 = PG_GETARG_DATUM(0); */
-       /* Datum r2 = PG_GETARG_DATUM(1); */
+       RangeType  *r1 = PG_GETARG_RANGE(0);
+       RangeType  *r2 = PG_GETARG_RANGE(1);
         bool       *result = (bool *) PG_GETARG_POINTER(2);
  
         /*
-        * We can safely call range_eq using our fcinfo directly; it won't notice
-        * the third argument.  This allows it to use fn_extra for caching.
+        * range_eq will ignore the RANGE_CONTAIN_EMPTY flag, so we have to
+        * check that for ourselves.  More generally, if the entries have been
+        * properly normalized, then unequal flags bytes must mean unequal ranges
+        * ... so let's just test all the flag bits at once.
          */
-       *result = DatumGetBool(range_eq(fcinfo));
+       if (range_get_flags(r1) != range_get_flags(r2))
+               *result = false;
+       else
+       {
+               /*
+                * We can safely call range_eq using our fcinfo directly; it won't
+                * notice the third argument.  This allows it to use fn_extra for
+                * caching.
+                */
+               *result = DatumGetBool(range_eq(fcinfo));
+       }
  
         PG_RETURN_POINTER(result);
  }
@@ -356,27 +362,53 @@ range_gist_same(PG_FUNCTION_ARGS)
  /*
   * Return the smallest range that contains r1 and r2
   *
- * XXX would it be better to redefine range_union as working this way?
+ * This differs from regular range_union in two critical ways:
+ * 1. It won't throw an error for non-adjacent r1 and r2, but just absorb
+ * the intervening values into the result range.
+ * 2. We track whether any empty range has been union'd into the result,
+ * so that contained_by searches can be indexed.  Note that this means
+ * that *all* unions formed within the GiST index must go through here.
   */
  static RangeType *
  range_super_union(TypeCacheEntry *typcache, RangeType * r1, RangeType * r2)
  {
+       RangeType  *result;
         RangeBound      lower1,
                                 lower2;
         RangeBound      upper1,
                                 upper2;
         bool            empty1,
                                 empty2;
+       char            flags1,
+                               flags2;
         RangeBound *result_lower;
         RangeBound *result_upper;
  
         range_deserialize(typcache, r1, &lower1, &upper1, &empty1);
         range_deserialize(typcache, r2, &lower2, &upper2, &empty2);
+       flags1 = range_get_flags(r1);
+       flags2 = range_get_flags(r2);
  
         if (empty1)
+       {
+               /* We can return r2 as-is if it already is or contains empty */
+               if (flags2 & (RANGE_EMPTY | RANGE_CONTAIN_EMPTY))
+                       return r2;
+               /* Else we'd better copy it (modify-in-place isn't safe) */
+               r2 = rangeCopy(r2);
+               range_set_contain_empty(r2);
                 return r2;
+       }
         if (empty2)
+       {
+               /* We can return r1 as-is if it already is or contains empty */
+               if (flags1 & (RANGE_EMPTY | RANGE_CONTAIN_EMPTY))
+                       return r1;
+               /* Else we'd better copy it (modify-in-place isn't safe) */
+               r1 = rangeCopy(r1);
+               range_set_contain_empty(r1);
                 return r1;
+       }
  
         if (range_cmp_bounds(typcache, &lower1, &lower2) <= 0)
                 result_lower = &lower1;
@@ -389,12 +421,19 @@ range_super_union(TypeCacheEntry *typcache, RangeType * r1, RangeType * r2)
                 result_upper = &upper2;
  
         /* optimization to avoid constructing a new range */
-       if (result_lower == &lower1 && result_upper == &upper1)
+       if (result_lower == &lower1 && result_upper == &upper1 &&
+               ((flags1 & RANGE_CONTAIN_EMPTY) || !(flags2 & RANGE_CONTAIN_EMPTY)))
                 return r1;
-       if (result_lower == &lower2 && result_upper == &upper2)
+       if (result_lower == &lower2 && result_upper == &upper2 &&
+               ((flags2 & RANGE_CONTAIN_EMPTY) || !(flags1 & RANGE_CONTAIN_EMPTY)))
                 return r2;
  
-       return make_range(typcache, result_lower, result_upper, false);
+       result = make_range(typcache, result_lower, result_upper, false);
+
+       if ((flags1 & RANGE_CONTAIN_EMPTY) || (flags2 & RANGE_CONTAIN_EMPTY))
+               range_set_contain_empty(result);
+
+       return result;
  }
  
  /*
@@ -484,21 +523,26 @@ range_gist_consistent_int(FmgrInfo *flinfo, StrategyNumber strategy,
                         break;
                 case RANGESTRAT_CONTAINED_BY:
                         /*
-                        * Ideally we'd apply range_overlaps here, but at present it
-                        * might fail to find empty ranges in the index, which should
-                        * be reported as being contained by anything.  This needs work.
+                        * Empty ranges are contained by anything, so if key is or
+                        * contains any empty ranges, we must descend into it.  Otherwise,
+                        * descend only if key overlaps the query.
                          */
-                       return true;
+                       if (RangeIsOrContainsEmpty(key))
+                               return true;
+                       proc = range_overlaps;
                         break;
                 case RANGESTRAT_CONTAINS_ELEM:
                         proc = range_contains_elem;
                         break;
                 case RANGESTRAT_EQ:
+                       /*
+                        * If query is empty, descend only if the key is or contains any
+                        * empty ranges.  Otherwise, descend if key contains query.
+                        */
+                       if (RangeIsEmpty(DatumGetRangeType(query)))
+                               return RangeIsOrContainsEmpty(key);
                         proc = range_contains;
                         break;
-               case RANGESTRAT_NE:
-                       return true;
-                       break;
                 default:
                         elog(ERROR, "unrecognized range strategy: %d", strategy);
                         proc = NULL;            /* keep compiler quiet */
@@ -555,9 +599,6 @@ range_gist_consistent_leaf(FmgrInfo *flinfo, StrategyNumber strategy,
                 case RANGESTRAT_EQ:
                         proc = range_eq;
                         break;
-               case RANGESTRAT_NE:
-                       proc = range_ne;
-                       break;
                 default:
                         elog(ERROR, "unrecognized range strategy: %d", strategy);
                         proc = NULL;            /* keep compiler quiet */
diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h

index 088c307660798d3f66afc230562b0450301a9668..5a73726ba31ad6f8b361692d1fcc420e3291d27b 100644 (file)
--- a/src/include/catalog/catversion.h
+++ b/src/include/catalog/catversion.h
@@ -53,6 +53,6 @@
   */
  
  /*                                                     yyyymmddN */
-#define CATALOG_VERSION_NO     201111241
+#define CATALOG_VERSION_NO     201111271
  
  #endif
diff --git a/src/include/catalog/pg_amop.h b/src/include/catalog/pg_amop.h

index a95cf5fa50dddb75f53dd5df8e16e55c0d6a67b8..1e8c9a289f9182bfd35fa23ea2506f3cd0723799 100644 (file)
--- a/src/include/catalog/pg_amop.h
+++ b/src/include/catalog/pg_amop.h
@@ -736,6 +736,5 @@ DATA(insert (       3919   3831 3831 7 s    3890 783 0 ));
  DATA(insert (  3919   3831 3831 8 s    3892 783 0 ));
  DATA(insert (  3919   3831 2283 16 s   3889 783 0 ));
  DATA(insert (  3919   3831 3831 18 s   3882 783 0 ));
-DATA(insert (  3919   3831 3831 19 s   3883 783 0 ));
  
  #endif   /* PG_AMOP_H */
diff --git a/src/include/utils/rangetypes.h b/src/include/utils/rangetypes.h

index c9a6834e6db9e61558410f936302fce2739733f1..307476d97620986c39e401a1c74811713c002eee 100644 (file)
--- a/src/include/utils/rangetypes.h
+++ b/src/include/utils/rangetypes.h
@@ -33,13 +33,15 @@ typedef struct
  #define RangeTypeGetOid(r)  ((r)->rangetypid)
  
  /* A range's flags byte contains these bits: */
-#define RANGE_EMPTY            0x01    /* range is empty */
-#define RANGE_LB_INC   0x02    /* lower bound is inclusive (vs exclusive) */
-#define RANGE_LB_NULL  0x04    /* lower bound is null (NOT CURRENTLY USED) */
-#define RANGE_LB_INF   0x08    /* lower bound is +/- infinity */
-#define RANGE_UB_INC   0x10    /* upper bound is inclusive (vs exclusive) */
-#define RANGE_UB_NULL  0x20    /* upper bound is null (NOT CURRENTLY USED) */
-#define RANGE_UB_INF   0x40    /* upper bound is +/- infinity */
+#define RANGE_EMPTY                    0x01    /* range is empty */
+#define RANGE_LB_INC           0x02    /* lower bound is inclusive */
+#define RANGE_UB_INC           0x04    /* upper bound is inclusive */
+#define RANGE_LB_INF           0x08    /* lower bound is -infinity */
+#define RANGE_UB_INF           0x10    /* upper bound is +infinity */
+#define RANGE_LB_NULL          0x20    /* lower bound is null (NOT USED) */
+#define RANGE_UB_NULL          0x40    /* upper bound is null (NOT USED) */
+#define RANGE_CONTAIN_EMPTY    0x80    /* marks a GiST internal-page entry whose
+                                                                        * subtree contains some empty ranges */
  
  #define RANGE_HAS_LBOUND(flags) (!((flags) & (RANGE_EMPTY | \
                                                                                           RANGE_LB_NULL | \
@@ -49,7 +51,9 @@ typedef struct
                                                                                           RANGE_UB_NULL | \
                                                                                           RANGE_UB_INF)))
  
-#define RangeIsEmpty(r)  (range_get_flags(r) & RANGE_EMPTY)
+#define RangeIsEmpty(r)  ((range_get_flags(r) & RANGE_EMPTY) != 0)
+#define RangeIsOrContainsEmpty(r)  \
+       ((range_get_flags(r) & (RANGE_EMPTY | RANGE_CONTAIN_EMPTY)) != 0)
  
  
  /* Internal representation of either bound of a range (not what's on disk) */
@@ -152,6 +156,7 @@ extern void range_deserialize(TypeCacheEntry *typcache, RangeType *range,
                                                           RangeBound *lower, RangeBound *upper,
                                                           bool *empty);
  extern char range_get_flags(RangeType *range);
+extern void range_set_contain_empty(RangeType *range);
  extern RangeType *make_range(TypeCacheEntry *typcache, RangeBound *lower,
                                                 RangeBound *upper, bool empty);
  extern int range_cmp_bounds(TypeCacheEntry *typcache, RangeBound *b1,
diff --git a/src/test/regress/expected/opr_sanity.out b/src/test/regress/expected/opr_sanity.out

index 9bce504dd3d74ccc9a0d8481844e6c9087cd8a9b..b915134fff33d3154a24d56361d0427810b90af1 100644 (file)
--- a/src/test/regress/expected/opr_sanity.out
+++ b/src/test/regress/expected/opr_sanity.out
@@ -1041,7 +1041,6 @@ ORDER BY 1, 2, 3;
          783 |           15 | <->
          783 |           16 | @>
          783 |           18 | =
-        783 |           19 | <>
          783 |           27 | @>
          783 |           28 | <@
          783 |           47 | @>
@@ -1054,7 +1053,7 @@ ORDER BY 1, 2, 3;
         2742 |            2 | @@@
         2742 |            3 | <@
         2742 |            4 | =
-(44 rows)
+(43 rows)
  
  -- Check that all opclass search operators have selectivity estimators.
  -- This is not absolutely required, but it seems a reasonable thing
author	Tom Lane <tgl@sss.pgh.pa.us>
	Sun, 27 Nov 2011 21:50:37 +0000 (16:50 -0500)
committer	Tom Lane <tgl@sss.pgh.pa.us>
	Sun, 27 Nov 2011 21:51:29 +0000 (16:51 -0500)
src/backend/utils/adt/rangetypes.c		patch \| blob \| history
src/backend/utils/adt/rangetypes_gist.c		patch \| blob \| history
src/include/catalog/catversion.h		patch \| blob \| history
src/include/catalog/pg_amop.h		patch \| blob \| history
src/include/utils/rangetypes.h		patch \| blob \| history
src/test/regress/expected/opr_sanity.out		patch \| blob \| history