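Change hash index creation so that rather than always establishing exactly two buckets, the initial number of buckets is chosen from an estimate of the number of tuples to be loaded into the index.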

author Tom Lane <tgl@sss.pgh.pa.us>

Sat, 15 Mar 2008 20:46:31 +0000 (20:46 +0000)

committer Tom Lane <tgl@sss.pgh.pa.us>

Sat, 15 Mar 2008 20:46:31 +0000 (20:46 +0000)
author Tom Lane <tgl@sss.pgh.pa.us>
Sat, 15 Mar 2008 20:46:31 +0000 (20:46 +0000)
committer Tom Lane <tgl@sss.pgh.pa.us>
Sat, 15 Mar 2008 20:46:31 +0000 (20:46 +0000)
diff --git a/src/backend/access/hash/README b/src/backend/access/hash/README

index 014f742c252ce34921e2e2fe532e5bd0388774f5..9f34fd73c483d3cd738d0b239b7aa09f35dab913 100644 (file)
--- a/src/backend/access/hash/README
+++ b/src/backend/access/hash/README
@@ -1,4 +1,4 @@
-$PostgreSQL: pgsql/src/backend/access/hash/README,v 1.6 2007/04/19 20:24:04 tgl Exp $
+$PostgreSQL: pgsql/src/backend/access/hash/README,v 1.7 2008/03/15 20:46:31 tgl Exp $
  
  This directory contains an implementation of hash indexing for Postgres.  Most
  of the core ideas are taken from Margo Seltzer and Ozan Yigit, A New Hashing
@@ -65,6 +65,11 @@ hashm_spares[N] <= hashm_spares[N+1], since the latter count includes the
  former.  The difference between the two represents the number of overflow
  pages appearing between the bucket page groups of splitpoints N and N+1.
  
+(Note: the above describes what happens when filling an initially minimally
+sized hash index.  In practice, we try to estimate the required index size
+and allocate a suitable number of splitpoints immediately, to avoid
+expensive re-splitting during initial index build.)
+
  When S splitpoints exist altogether, the array entries hashm_spares[0]
  through hashm_spares[S] are valid; hashm_spares[S] records the current
  total number of overflow pages.  New overflow pages are created as needed
@@ -101,9 +106,9 @@ includes the bitmap pages, which is the reason for saying that bitmap
  pages are a subset of the overflow pages.  It turns out in fact that each
  bitmap page's first bit represents itself --- this is not an essential
  property, but falls out of the fact that we only allocate another bitmap
-page when we really need one.  Bit number zero always corresponds to block
-number 3, which is the first bitmap page and is allocated during index
-creation.
+page when we really need one.  Bit number zero always corresponds to the
+first bitmap page, which is allocated during index creation just after all
+the initially created buckets.
  
  
  Lock definitions
diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c

index f6c4d5705d4284a6393d3ebcb81ba09be88b66bd..b008c0aa4a7abde67c91cc158abc16ae34b5d888 100644 (file)
--- a/src/backend/access/hash/hash.c
+++ b/src/backend/access/hash/hash.c
@@ -8,7 +8,7 @@
   *
   *
   * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/access/hash/hash.c,v 1.98 2008/01/01 19:45:46 momjian Exp $
+ *       $PostgreSQL: pgsql/src/backend/access/hash/hash.c,v 1.99 2008/03/15 20:46:31 tgl Exp $
   *
   * NOTES
   *       This file contains only the public interface routines.
@@ -22,6 +22,7 @@
  #include "access/hash.h"
  #include "catalog/index.h"
  #include "commands/vacuum.h"
+#include "optimizer/plancat.h"
  
  
  /* Working state for hashbuild and its callback */
@@ -48,6 +49,7 @@ hashbuild(PG_FUNCTION_ARGS)
         Relation        index = (Relation) PG_GETARG_POINTER(1);
         IndexInfo  *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2);
         IndexBuildResult *result;
+       BlockNumber     relpages;
         double          reltuples;
         HashBuildState buildstate;
  
@@ -59,8 +61,11 @@ hashbuild(PG_FUNCTION_ARGS)
                 elog(ERROR, "index \"%s\" already contains data",
                          RelationGetRelationName(index));
  
-       /* initialize the hash index metadata page */
-       _hash_metapinit(index);
+       /* estimate the number of rows currently present in the table */
+       estimate_rel_size(heap, NULL, &relpages, &reltuples);
+
+       /* initialize the hash index metadata page and initial buckets */
+       _hash_metapinit(index, reltuples);
  
         /* build the index */
         buildstate.indtuples = 0;
diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c

index 68861087585e97c873fe90eff1286d477d9af9d8..ec6f4b390fd944305e908aaebb035e325b733860 100644 (file)
--- a/src/backend/access/hash/hashpage.c
+++ b/src/backend/access/hash/hashpage.c
@@ -8,7 +8,7 @@
   *
   *
   * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.72 2008/01/01 19:45:46 momjian Exp $
+ *       $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.73 2008/03/15 20:46:31 tgl Exp $
   *
   * NOTES
   *       Postgres hash pages look like ordinary relation pages.  The opaque
@@ -312,15 +312,17 @@ _hash_chgbufaccess(Relation rel,
  
  /*
   *     _hash_metapinit() -- Initialize the metadata page of a hash index,
- *                             the two buckets that we begin with and the initial
- *                             bitmap page.
+ *                             the initial buckets, and the initial bitmap page.
+ *
+ * The initial number of buckets is dependent on num_tuples, an estimate
+ * of the number of tuples to be loaded into the index initially.
   *
   * We are fairly cavalier about locking here, since we know that no one else
   * could be accessing this index.  In particular the rule about not holding
   * multiple buffer locks is ignored.
   */
  void
-_hash_metapinit(Relation rel)
+_hash_metapinit(Relation rel, double num_tuples)
  {
         HashMetaPage metap;
         HashPageOpaque pageopaque;
@@ -330,7 +332,10 @@ _hash_metapinit(Relation rel)
         int32           data_width;
         int32           item_width;
         int32           ffactor;
-       uint16          i;
+       double          dnumbuckets;
+       uint32          num_buckets;
+       uint32          log2_num_buckets;
+       uint32          i;
  
         /* safety check */
         if (RelationGetNumberOfBlocks(rel) != 0)
@@ -354,7 +359,26 @@ _hash_metapinit(Relation rel)
                 ffactor = 10;
  
         /*
-        * We initialize the metapage, the first two bucket pages, and the first
+        * Choose the number of initial bucket pages to match the fill factor
+        * given the estimated number of tuples.  We round up the result to the
+        * next power of 2, however, and always force at least 2 bucket pages.
+        * The upper limit is determined by considerations explained in
+        * _hash_expandtable().
+        */
+       dnumbuckets = num_tuples / ffactor;
+       if (dnumbuckets <= 2.0)
+               num_buckets = 2;
+       else if (dnumbuckets >= (double) 0x40000000)
+               num_buckets = 0x40000000;
+       else
+               num_buckets = ((uint32) 1) << _hash_log2((uint32) dnumbuckets);
+
+       log2_num_buckets = _hash_log2(num_buckets);
+       Assert(num_buckets == (((uint32) 1) << log2_num_buckets));
+       Assert(log2_num_buckets < HASH_MAX_SPLITPOINTS);
+
+       /*
+        * We initialize the metapage, the first N bucket pages, and the first
          * bitmap page in sequence, using _hash_getnewbuf to cause smgrextend()
          * calls to occur.      This ensures that the smgr level has the right idea of
          * the physical index length.
@@ -398,23 +422,25 @@ _hash_metapinit(Relation rel)
         metap->hashm_procid = index_getprocid(rel, 1, HASHPROC);
  
         /*
-        * We initialize the index with two buckets, 0 and 1, occupying physical
-        * blocks 1 and 2.      The first freespace bitmap page is in block 3.
+        * We initialize the index with N buckets, 0 .. N-1, occupying physical
+        * blocks 1 to N.  The first freespace bitmap page is in block N+1.
+        * Since N is a power of 2, we can set the masks this way:
          */
-       metap->hashm_maxbucket = metap->hashm_lowmask = 1;      /* nbuckets - 1 */
-       metap->hashm_highmask = 3;      /* (nbuckets << 1) - 1 */
+       metap->hashm_maxbucket = metap->hashm_lowmask = num_buckets - 1;
+       metap->hashm_highmask = (num_buckets << 1) - 1;
  
         MemSet(metap->hashm_spares, 0, sizeof(metap->hashm_spares));
         MemSet(metap->hashm_mapp, 0, sizeof(metap->hashm_mapp));
  
-       metap->hashm_spares[1] = 1; /* the first bitmap page is only spare */
-       metap->hashm_ovflpoint = 1;
+       /* Set up mapping for one spare page after the initial splitpoints */
+       metap->hashm_spares[log2_num_buckets] = 1;
+       metap->hashm_ovflpoint = log2_num_buckets;
         metap->hashm_firstfree = 0;
  
         /*
-        * Initialize the first two buckets
+        * Initialize the first N buckets
          */
-       for (i = 0; i <= 1; i++)
+       for (i = 0; i < num_buckets; i++)
         {
                 buf = _hash_getnewbuf(rel, BUCKET_TO_BLKNO(metap, i));
                 pg = BufferGetPage(buf);
@@ -430,7 +456,7 @@ _hash_metapinit(Relation rel)
         /*
          * Initialize first bitmap page
          */
-       _hash_initbitmap(rel, metap, 3);
+       _hash_initbitmap(rel, metap, num_buckets + 1);
  
         /* all done */
         _hash_wrtbuf(rel, metabuf);
@@ -511,6 +537,9 @@ _hash_expandtable(Relation rel, Buffer metabuf)
          * index with 2^32 buckets would certainly overflow BlockNumber and hence
          * _hash_alloc_buckets() would fail, but if we supported buckets smaller
          * than a disk block then this would be an independent constraint.
+        *
+        * If you change this, see also the maximum initial number of buckets
+        * in _hash_metapinit().
          */
         if (metap->hashm_maxbucket >= (uint32) 0x7FFFFFFE)
                 goto fail;
diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c

index 5f927095edcd9198f831652b32db52a64bf63346..a56dccd2ff5d9482736cd06e5c9769662948c9ba 100644 (file)
--- a/src/backend/optimizer/util/plancat.c
+++ b/src/backend/optimizer/util/plancat.c
@@ -9,7 +9,7 @@
   *
   *
   * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/optimizer/util/plancat.c,v 1.140 2008/01/12 00:11:39 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/optimizer/util/plancat.c,v 1.141 2008/03/15 20:46:31 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -45,8 +45,6 @@ bool          constraint_exclusion = false;
  get_relation_info_hook_type get_relation_info_hook = NULL;
  
  
-static void estimate_rel_size(Relation rel, int32 *attr_widths,
-                                 BlockNumber *pages, double *tuples);
  static List *get_relation_constraints(Oid relationObjectId, RelOptInfo *rel,
                                                  bool include_notnull);
  
@@ -319,7 +317,7 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
   * relation's attr_width[] cache; we fill this in if we have need to compute
   * the attribute widths for estimation purposes.
   */
-static void
+void
  estimate_rel_size(Relation rel, int32 *attr_widths,
                                   BlockNumber *pages, double *tuples)
  {
diff --git a/src/include/access/hash.h b/src/include/access/hash.h

index ac54c47f014ca704dd97e898ae8ad39d4c67d373..fd7b68e9aebeb18382e99628a041d66a78a0e3c2 100644 (file)
--- a/src/include/access/hash.h
+++ b/src/include/access/hash.h
@@ -7,7 +7,7 @@
   * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
- * $PostgreSQL: pgsql/src/include/access/hash.h,v 1.84 2008/01/01 19:45:56 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/access/hash.h,v 1.85 2008/03/15 20:46:31 tgl Exp $
   *
   * NOTES
   *             modeled after Margo Seltzer's hash implementation for unix.
@@ -298,7 +298,7 @@ extern void _hash_dropbuf(Relation rel, Buffer buf);
  extern void _hash_wrtbuf(Relation rel, Buffer buf);
  extern void _hash_chgbufaccess(Relation rel, Buffer buf, int from_access,
                                    int to_access);
-extern void _hash_metapinit(Relation rel);
+extern void _hash_metapinit(Relation rel, double num_tuples);
  extern void _hash_pageinit(Page page, Size size);
  extern void _hash_expandtable(Relation rel, Buffer metabuf);
  
diff --git a/src/include/optimizer/plancat.h b/src/include/optimizer/plancat.h

index 42e24832495bdee61fd2c3c8cfadf59a04f2bb17..82b4c2200ad9d0b3e397db1ee30fd3b63d08e83c 100644 (file)
--- a/src/include/optimizer/plancat.h
+++ b/src/include/optimizer/plancat.h
@@ -7,7 +7,7 @@
   * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
- * $PostgreSQL: pgsql/src/include/optimizer/plancat.h,v 1.47 2008/01/01 19:45:58 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/optimizer/plancat.h,v 1.48 2008/03/15 20:46:31 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -15,6 +15,7 @@
  #define PLANCAT_H
  
  #include "nodes/relation.h"
+#include "utils/rel.h"
  
  /* Hook for plugins to get control in get_relation_info() */
  typedef void (*get_relation_info_hook_type) (PlannerInfo *root,
@@ -27,6 +28,9 @@ extern PGDLLIMPORT get_relation_info_hook_type get_relation_info_hook;
  extern void get_relation_info(PlannerInfo *root, Oid relationObjectId,
                                   bool inhparent, RelOptInfo *rel);
  
+extern void estimate_rel_size(Relation rel, int32 *attr_widths,
+                                                         BlockNumber *pages, double *tuples);
+
  extern bool relation_excluded_by_constraints(RelOptInfo *rel,
                                                                  RangeTblEntry *rte);
author	Tom Lane <tgl@sss.pgh.pa.us>
	Sat, 15 Mar 2008 20:46:31 +0000 (20:46 +0000)
committer	Tom Lane <tgl@sss.pgh.pa.us>
	Sat, 15 Mar 2008 20:46:31 +0000 (20:46 +0000)
src/backend/access/hash/README		patch \| blob \| history
src/backend/access/hash/hash.c		patch \| blob \| history
src/backend/access/hash/hashpage.c		patch \| blob \| history
src/backend/optimizer/util/plancat.c		patch \| blob \| history
src/include/access/hash.h		patch \| blob \| history
src/include/optimizer/plancat.h		patch \| blob \| history