granicus.if.org Git - postgresql/commitdiff
Change hash indexes to store only the hash code rather than the whole indexed
authorTom Lane <tgl@sss.pgh.pa.us>
Mon, 15 Sep 2008 18:43:41 +0000 (18:43 +0000)
committerTom Lane <tgl@sss.pgh.pa.us>
Mon, 15 Sep 2008 18:43:41 +0000 (18:43 +0000)
value.  This means that hash index lookups are always lossy and have to be
rechecked when the heap is visited; however, the gain in index compactness
outweighs this when the indexed values are wide.  Also, we only need to
perform datatype comparisons when the hash codes match exactly, rather than
for every entry in the hash bucket; so it could also win for datatypes that
have expensive comparison functions.  A small additional win is gained by
keeping hash index pages sorted by hash code and using binary search to reduce
the number of index tuples we have to look at.

Xiao Meng

This commit also incorporates Zdenek Kotala's patch to isolate hash metapages
and hash bitmaps a bit better from the page header datastructures.

13 files changed:
doc/src/sgml/catalogs.sgml
src/backend/access/hash/hash.c
src/backend/access/hash/hashinsert.c
src/backend/access/hash/hashovfl.c
src/backend/access/hash/hashpage.c
src/backend/access/hash/hashsearch.c
src/backend/access/hash/hashutil.c
src/backend/catalog/index.c
src/backend/utils/sort/tuplesort.c
src/include/access/hash.h
src/include/catalog/catversion.h
src/include/catalog/pg_am.h
src/include/catalog/pg_opclass.h

index 97a624f453e0cf6bce465cb1e2a03da7959c0968..646e37d79fc49debfc259f05c5fe6ac5588d3e75 100644 (file)
@@ -1,4 +1,4 @@
-<!-- $PostgreSQL: pgsql/doc/src/sgml/catalogs.sgml,v 2.173 2008/09/10 18:09:19 alvherre Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/catalogs.sgml,v 2.174 2008/09/15 18:43:41 tgl Exp $ -->
 <!--
  Documentation of the system catalogs, directed toward PostgreSQL developers
  -->
       <entry>Can an index of this type be clustered on?</entry>
      </row>
 
+     <row>
+      <entry><structfield>amkeytype</structfield></entry>
+      <entry><type>oid</type></entry>
+      <entry><literal><link linkend="catalog-pg-type"><structname>pg_type</structname></link>.oid</literal></entry>
+      <entry>Type of data stored in index, or zero if not a fixed type</entry>
+     </row>
+
      <row>
       <entry><structfield>aminsert</structfield></entry>
       <entry><type>regproc</type></entry>
      <row>
       <entry><structfield>sourceline</structfield></entry>
       <entry><type>text</type></entry>
-      <entry>Line number within the sourcefile the current value was set 
+      <entry>Line number within the sourcefile the current value was set
       from (NULL for values set in sources other than configuration files)
       </entry>
      </row>
index 41607c54dc3b2199a2675d09235d5333f72a4137..af4c4c058fd7cda8690ff38417ad730a28a805e4 100644 (file)
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/access/hash/hash.c,v 1.104 2008/06/19 00:46:03 alvherre Exp $
+ *       $PostgreSQL: pgsql/src/backend/access/hash/hash.c,v 1.105 2008/09/15 18:43:41 tgl Exp $
  *
  * NOTES
  *       This file contains only the public interface routines.
@@ -79,12 +79,12 @@ hashbuild(PG_FUNCTION_ARGS)
         * then we'll thrash horribly.  To prevent that scenario, we can sort the
         * tuples by (expected) bucket number.  However, such a sort is useless
         * overhead when the index does fit in RAM.  We choose to sort if the
-        * initial index size exceeds effective_cache_size.
+        * initial index size exceeds NBuffers.
         *
         * NOTE: this test will need adjustment if a bucket is ever different
         * from one page.
         */
-       if (num_buckets >= (uint32) effective_cache_size)
+       if (num_buckets >= (uint32) NBuffers)
                buildstate.spool = _h_spoolinit(index, num_buckets);
        else
                buildstate.spool = NULL;
@@ -129,7 +129,7 @@ hashbuildCallback(Relation index,
        IndexTuple      itup;
 
        /* form an index tuple and point it at the heap tuple */
-       itup = index_form_tuple(RelationGetDescr(index), values, isnull);
+       itup = _hash_form_tuple(index, values, isnull);
        itup->t_tid = htup->t_self;
 
        /* Hash indexes don't index nulls, see notes in hashinsert */
@@ -153,8 +153,8 @@ hashbuildCallback(Relation index,
 /*
  *     hashinsert() -- insert an index tuple into a hash table.
  *
- *     Hash on the index tuple's key, find the appropriate location
- *     for the new tuple, and put it there.
+ *     Hash on the heap tuple's key, form an index tuple with hash code.
+ *     Find the appropriate location for the new tuple, and put it there.
  */
 Datum
 hashinsert(PG_FUNCTION_ARGS)
@@ -171,7 +171,7 @@ hashinsert(PG_FUNCTION_ARGS)
        IndexTuple      itup;
 
        /* generate an index tuple */
-       itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
+       itup = _hash_form_tuple(rel, values, isnull);
        itup->t_tid = *ht_ctid;
 
        /*
@@ -211,8 +211,8 @@ hashgettuple(PG_FUNCTION_ARGS)
        OffsetNumber offnum;
        bool            res;
 
-       /* Hash indexes are never lossy (at the moment anyway) */
-       scan->xs_recheck = false;
+       /* Hash indexes are always lossy since we store only the hash code */
+       scan->xs_recheck = true;
 
        /*
         * We hold pin but not lock on current buffer while outside the hash AM.
@@ -317,7 +317,8 @@ hashgetbitmap(PG_FUNCTION_ARGS)
                /* Save tuple ID, and continue scanning */
                if (add_tuple) 
                {
-                       tbm_add_tuples(tbm, &scan->xs_ctup.t_self, 1, false);
+                       /* Note we mark the tuple ID as requiring recheck */
+                       tbm_add_tuples(tbm, &scan->xs_ctup.t_self, 1, true);
                        ntids++;
                }
 
@@ -527,7 +528,7 @@ hashbulkdelete(PG_FUNCTION_ARGS)
         * each bucket.
         */
        metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
-       metap = (HashMetaPage) BufferGetPage(metabuf);
+       metap =  HashPageGetMeta(BufferGetPage(metabuf));
        orig_maxbucket = metap->hashm_maxbucket;
        orig_ntuples = metap->hashm_ntuples;
        memcpy(&local_metapage, metap, sizeof(local_metapage));
@@ -629,7 +630,7 @@ loop_top:
 
        /* Write-lock metapage and check for split since we started */
        metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE, LH_META_PAGE);
-       metap = (HashMetaPage) BufferGetPage(metabuf);
+       metap = HashPageGetMeta(BufferGetPage(metabuf));
 
        if (cur_maxbucket != metap->hashm_maxbucket)
        {
index 7f68318f1a691ca7073608062e4b7ffe12729338..6195c8a2ac27e382e4f3d00aa181aff7d48c76b9 100644 (file)
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/access/hash/hashinsert.c,v 1.50 2008/06/19 00:46:03 alvherre Exp $
+ *       $PostgreSQL: pgsql/src/backend/access/hash/hashinsert.c,v 1.51 2008/09/15 18:43:41 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -43,18 +43,11 @@ _hash_doinsert(Relation rel, IndexTuple itup)
        bool            do_expand;
        uint32          hashkey;
        Bucket          bucket;
-       Datum           datum;
-       bool            isnull;
 
        /*
-        * Compute the hash key for the item.  We do this first so as not to need
-        * to hold any locks while running the hash function.
+        * Get the hash key for the item (it's stored in the index tuple itself).
         */
-       if (rel->rd_rel->relnatts != 1)
-               elog(ERROR, "hash indexes support only one index key");
-       datum = index_getattr(itup, 1, RelationGetDescr(rel), &isnull);
-       Assert(!isnull);
-       hashkey = _hash_datum2hashkey(rel, datum);
+       hashkey = _hash_get_indextuple_hashkey(itup);
 
        /* compute item size too */
        itemsz = IndexTupleDSize(*itup);
@@ -69,12 +62,14 @@ _hash_doinsert(Relation rel, IndexTuple itup)
 
        /* Read the metapage */
        metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
-       metap = (HashMetaPage) BufferGetPage(metabuf);
+       metap = HashPageGetMeta(BufferGetPage(metabuf));
 
        /*
         * Check whether the item can fit on a hash page at all. (Eventually, we
         * ought to try to apply TOAST methods if not.)  Note that at this point,
         * itemsz doesn't include the ItemId.
+        *
+        * XXX this is useless code if we are only storing hash keys.
         */
        if (itemsz > HashMaxItemSize((Page) metap))
                ereport(ERROR,
@@ -197,11 +192,15 @@ _hash_pgaddtup(Relation rel,
 {
        OffsetNumber itup_off;
        Page            page;
+       uint32          hashkey;
 
        _hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
        page = BufferGetPage(buf);
 
-       itup_off = OffsetNumberNext(PageGetMaxOffsetNumber(page));
+       /* Find where to insert the tuple (preserving page's hashkey ordering) */
+       hashkey = _hash_get_indextuple_hashkey(itup);
+       itup_off = _hash_binsearch(page, hashkey);
+
        if (PageAddItem(page, (Item) itup, itemsize, itup_off, false, false)
                == InvalidOffsetNumber)
                elog(ERROR, "failed to add index item to \"%s\"",
index 06958ec86578372b287c93f7ccbf76f473357449..37315dbf3784f56563456eb190d5655c75e0ae26 100644 (file)
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/access/hash/hashovfl.c,v 1.64 2008/06/19 00:46:03 alvherre Exp $
+ *       $PostgreSQL: pgsql/src/backend/access/hash/hashovfl.c,v 1.65 2008/09/15 18:43:41 tgl Exp $
  *
  * NOTES
  *       Overflow pages look like ordinary relation pages.
@@ -187,7 +187,7 @@ _hash_getovflpage(Relation rel, Buffer metabuf)
        _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);
 
        _hash_checkpage(rel, metabuf, LH_META_PAGE);
-       metap = (HashMetaPage) BufferGetPage(metabuf);
+       metap = HashPageGetMeta(BufferGetPage(metabuf));
 
        /* start search at hashm_firstfree */
        orig_firstfree = metap->hashm_firstfree;
@@ -450,7 +450,7 @@ _hash_freeovflpage(Relation rel, Buffer ovflbuf,
 
        /* Read the metapage so we can determine which bitmap page to use */
        metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
-       metap = (HashMetaPage) BufferGetPage(metabuf);
+       metap = HashPageGetMeta(BufferGetPage(metabuf));
 
        /* Identify which bit to set */
        ovflbitno = blkno_to_bitno(metap, ovflblkno);
index 43ec69cab327d3968f4918e8c7f3b73dd3668962..c5edf6dcfb94d0fd3aa301cad949b18347db0b55 100644 (file)
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.76 2008/08/11 11:05:10 heikki Exp $
+ *       $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.77 2008/09/15 18:43:41 tgl Exp $
  *
  * NOTES
  *       Postgres hash pages look like ordinary relation pages.  The opaque
@@ -348,11 +348,9 @@ _hash_metapinit(Relation rel, double num_tuples)
         * Determine the target fill factor (in tuples per bucket) for this index.
         * The idea is to make the fill factor correspond to pages about as full
         * as the user-settable fillfactor parameter says.      We can compute it
-        * exactly if the index datatype is fixed-width, but for var-width there's
-        * some guessing involved.
+        * exactly since the index datatype (i.e. uint32 hash key) is fixed-width.
         */
-       data_width = get_typavgwidth(RelationGetDescr(rel)->attrs[0]->atttypid,
-                                                                RelationGetDescr(rel)->attrs[0]->atttypmod);
+       data_width = sizeof(uint32);
        item_width = MAXALIGN(sizeof(IndexTupleData)) + MAXALIGN(data_width) +
                sizeof(ItemIdData);             /* include the line pointer */
        ffactor = RelationGetTargetPageUsage(rel, HASH_DEFAULT_FILLFACTOR) / item_width;
@@ -395,20 +393,18 @@ _hash_metapinit(Relation rel, double num_tuples)
        pageopaque->hasho_flag = LH_META_PAGE;
        pageopaque->hasho_page_id = HASHO_PAGE_ID;
 
-       metap = (HashMetaPage) pg;
+       metap = HashPageGetMeta(pg);
 
        metap->hashm_magic = HASH_MAGIC;
        metap->hashm_version = HASH_VERSION;
        metap->hashm_ntuples = 0;
        metap->hashm_nmaps = 0;
        metap->hashm_ffactor = ffactor;
-       metap->hashm_bsize = BufferGetPageSize(metabuf);
+       metap->hashm_bsize = HashGetMaxBitmapSize(pg);
        /* find largest bitmap array size that will fit in page size */
        for (i = _hash_log2(metap->hashm_bsize); i > 0; --i)
        {
-               if ((1 << i) <= (metap->hashm_bsize -
-                                                (MAXALIGN(sizeof(PageHeaderData)) +
-                                                 MAXALIGN(sizeof(HashPageOpaqueData)))))
+               if ((1 << i) <= metap->hashm_bsize)
                        break;
        }
        Assert(i > 0);
@@ -532,7 +528,7 @@ _hash_expandtable(Relation rel, Buffer metabuf)
        _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);
 
        _hash_checkpage(rel, metabuf, LH_META_PAGE);
-       metap = (HashMetaPage) BufferGetPage(metabuf);
+       metap = HashPageGetMeta(BufferGetPage(metabuf));
 
        /*
         * Check to see if split is still needed; someone else might have already
@@ -774,8 +770,6 @@ _hash_splitbucket(Relation rel,
        Buffer          nbuf;
        BlockNumber oblkno;
        BlockNumber nblkno;
-       bool            null;
-       Datum           datum;
        HashPageOpaque oopaque;
        HashPageOpaque nopaque;
        IndexTuple      itup;
@@ -785,7 +779,6 @@ _hash_splitbucket(Relation rel,
        OffsetNumber omaxoffnum;
        Page            opage;
        Page            npage;
-       TupleDesc       itupdesc = RelationGetDescr(rel);
 
        /*
         * It should be okay to simultaneously write-lock pages from each bucket,
@@ -846,16 +839,11 @@ _hash_splitbucket(Relation rel,
                }
 
                /*
-                * Re-hash the tuple to determine which bucket it now belongs in.
-                *
-                * It is annoying to call the hash function while holding locks, but
-                * releasing and relocking the page for each tuple is unappealing too.
+                * Fetch the item's hash key (conveniently stored in the item)
+                * and determine which bucket it now belongs in.
                 */
                itup = (IndexTuple) PageGetItem(opage, PageGetItemId(opage, ooffnum));
-               datum = index_getattr(itup, 1, itupdesc, &null);
-               Assert(!null);
-
-               bucket = _hash_hashkey2bucket(_hash_datum2hashkey(rel, datum),
+               bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup),
                                                                          maxbucket, highmask, lowmask);
 
                if (bucket == nbucket)
index 1e05558523f93bea52bd01a0d822654135e764f2..853683934230e1965722f24665550843faff3b2a 100644 (file)
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/access/hash/hashsearch.c,v 1.53 2008/06/19 00:46:03 alvherre Exp $
+ *       $PostgreSQL: pgsql/src/backend/access/hash/hashsearch.c,v 1.54 2008/09/15 18:43:41 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -178,6 +178,8 @@ _hash_first(IndexScanDesc scan, ScanDirection dir)
                hashkey = _hash_datum2hashkey_type(rel, cur->sk_argument,
                                                                                   cur->sk_subtype);
 
+       so->hashso_sk_hash = hashkey;
+
        /*
         * Acquire shared split lock so we can compute the target bucket safely
         * (see README).
@@ -186,7 +188,7 @@ _hash_first(IndexScanDesc scan, ScanDirection dir)
 
        /* Read the metapage */
        metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
-       metap = (HashMetaPage) BufferGetPage(metabuf);
+       metap = HashPageGetMeta(BufferGetPage(metabuf));
 
        /*
         * Compute the target bucket number, and convert to block number.
@@ -284,7 +286,7 @@ _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
                offnum = InvalidOffsetNumber;
 
        /*
-        * 'offnum' now points to the last tuple we have seen (if any).
+        * 'offnum' now points to the last tuple we examined (if any).
         *
         * continue to step through tuples until: 1) we get to the end of the
         * bucket chain or 2) we find a valid tuple.
@@ -297,25 +299,39 @@ _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
                                if (offnum != InvalidOffsetNumber)
                                        offnum = OffsetNumberNext(offnum);      /* move forward */
                                else
-                                       offnum = FirstOffsetNumber; /* new page */
+                               {
+                                       /* new page, locate starting position by binary search */
+                                       offnum = _hash_binsearch(page, so->hashso_sk_hash);
+                               }
 
-                               while (offnum > maxoff)
+                               for (;;)
                                {
                                        /*
-                                        * either this page is empty (maxoff ==
-                                        * InvalidOffsetNumber) or we ran off the end.
+                                        * check if we're still in the range of items with
+                                        * the target hash key
+                                        */
+                                       if (offnum <= maxoff)
+                                       {
+                                               Assert(offnum >= FirstOffsetNumber);
+                                               itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
+                                               if (so->hashso_sk_hash == _hash_get_indextuple_hashkey(itup))
+                                                       break;                          /* yes, so exit for-loop */
+                                       }
+
+                                       /*
+                                        * ran off the end of this page, try the next
                                         */
                                        _hash_readnext(rel, &buf, &page, &opaque);
                                        if (BufferIsValid(buf))
                                        {
                                                maxoff = PageGetMaxOffsetNumber(page);
-                                               offnum = FirstOffsetNumber;
+                                               offnum = _hash_binsearch(page, so->hashso_sk_hash);
                                        }
                                        else
                                        {
                                                /* end of bucket */
-                                               maxoff = offnum = InvalidOffsetNumber;
-                                               break;  /* exit while */
+                                               itup = NULL;
+                                               break;  /* exit for-loop */
                                        }
                                }
                                break;
@@ -324,22 +340,39 @@ _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
                                if (offnum != InvalidOffsetNumber)
                                        offnum = OffsetNumberPrev(offnum);      /* move back */
                                else
-                                       offnum = maxoff;        /* new page */
+                               {
+                                       /* new page, locate starting position by binary search */
+                                       offnum = _hash_binsearch_last(page, so->hashso_sk_hash);
+                               }
 
-                               while (offnum < FirstOffsetNumber)
+                               for (;;)
                                {
                                        /*
-                                        * either this page is empty (offnum ==
-                                        * InvalidOffsetNumber) or we ran off the end.
+                                        * check if we're still in the range of items with
+                                        * the target hash key
+                                        */
+                                       if (offnum >= FirstOffsetNumber)
+                                       {
+                                               Assert(offnum <= maxoff);
+                                               itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
+                                               if (so->hashso_sk_hash == _hash_get_indextuple_hashkey(itup))
+                                                       break;                          /* yes, so exit for-loop */
+                                       }
+
+                                       /*
+                                        * ran off the end of this page, try the next
                                         */
                                        _hash_readprev(rel, &buf, &page, &opaque);
                                        if (BufferIsValid(buf))
-                                               maxoff = offnum = PageGetMaxOffsetNumber(page);
+                                       {
+                                               maxoff = PageGetMaxOffsetNumber(page);
+                                               offnum = _hash_binsearch_last(page, so->hashso_sk_hash);
+                                       }
                                        else
                                        {
                                                /* end of bucket */
-                                               maxoff = offnum = InvalidOffsetNumber;
-                                               break;  /* exit while */
+                                               itup = NULL;
+                                               break;  /* exit for-loop */
                                        }
                                }
                                break;
@@ -347,19 +380,19 @@ _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
                        default:
                                /* NoMovementScanDirection */
                                /* this should not be reached */
+                               itup = NULL;
                                break;
                }
 
-               /* we ran off the end of the world without finding a match */
-               if (offnum == InvalidOffsetNumber)
+               if (itup == NULL)
                {
+                       /* we ran off the end of the bucket without finding a match */
                        *bufP = so->hashso_curbuf = InvalidBuffer;
                        ItemPointerSetInvalid(current);
                        return false;
                }
 
-               /* get ready to check this tuple */
-               itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
+               /* check the tuple quals, loop around if not met */
        } while (!_hash_checkqual(scan, itup));
 
        /* if we made it to here, we've found a valid tuple */
index 29cdf24529a7ae86bd1a185e5e01b2f558ae748b..7a1e3a8ad0baf005e8e5d79490e76dfdd8cf945e 100644 (file)
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/access/hash/hashutil.c,v 1.56 2008/07/13 20:45:47 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/access/hash/hashutil.c,v 1.57 2008/09/15 18:43:41 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
 bool
 _hash_checkqual(IndexScanDesc scan, IndexTuple itup)
 {
+       /*
+        * Currently, we can't check any of the scan conditions since we do
+        * not have the original index entry value to supply to the sk_func.
+        * Always return true; we expect that hashgettuple already set the
+        * recheck flag to make the main indexscan code do it.
+        */
+#ifdef NOT_USED
        TupleDesc       tupdesc = RelationGetDescr(scan->indexRelation);
        ScanKey         key = scan->keyData;
        int                     scanKeySize = scan->numberOfKeys;
+#endif
 
        IncrIndexProcessed();
 
+#ifdef NOT_USED
        while (scanKeySize > 0)
        {
                Datum           datum;
@@ -59,6 +68,7 @@ _hash_checkqual(IndexScanDesc scan, IndexTuple itup)
                key++;
                scanKeySize--;
        }
+#endif
 
        return true;
 }
@@ -190,7 +200,7 @@ _hash_checkpage(Relation rel, Buffer buf, int flags)
         */
        if (flags == LH_META_PAGE)
        {
-               HashMetaPage metap = (HashMetaPage) page;
+               HashMetaPage metap = HashPageGetMeta(page);
 
                if (metap->hashm_magic != HASH_MAGIC)
                        ereport(ERROR,
@@ -221,3 +231,123 @@ hashoptions(PG_FUNCTION_ARGS)
                PG_RETURN_BYTEA_P(result);
        PG_RETURN_NULL();
 }
+
+/*
+ * _hash_get_indextuple_hashkey - get the hash index tuple's hash key value
+ */
+uint32
+_hash_get_indextuple_hashkey(IndexTuple itup)
+{
+       char       *attp;
+
+       /*
+        * We assume the hash key is the first attribute and can't be null,
+        * so this can be done crudely but very very cheaply ...
+        */
+       attp = (char *) itup + IndexInfoFindDataOffset(itup->t_info);
+       return *((uint32 *) attp);
+}
+
+/*
+ * _hash_form_tuple - form an index tuple containing hash code only
+ */
+IndexTuple
+_hash_form_tuple(Relation index, Datum *values, bool *isnull)
+{
+       IndexTuple              itup;
+       uint32                  hashkey;
+       Datum                   hashkeydatum;
+       TupleDesc               hashdesc;
+
+       if (isnull[0])
+               hashkeydatum = (Datum) 0;
+       else
+       {
+               hashkey = _hash_datum2hashkey(index, values[0]);
+               hashkeydatum = UInt32GetDatum(hashkey);
+       }
+       hashdesc = RelationGetDescr(index);
+       Assert(hashdesc->natts == 1);
+       itup = index_form_tuple(hashdesc, &hashkeydatum, isnull);
+       return itup;
+}
+
+/*
+ * _hash_binsearch - Return the offset number in the page where the
+ *                                      specified hash value should be sought or inserted.
+ *
+ * We use binary search, relying on the assumption that the existing entries
+ * are ordered by hash key.
+ *
+ * Returns the offset of the first index entry having hashkey >= hash_value,
+ * or the page's max offset plus one if hash_value is greater than all
+ * existing hash keys in the page.  This is the appropriate place to start
+ * a search, or to insert a new item.
+ */
+OffsetNumber
+_hash_binsearch(Page page, uint32 hash_value)
+{
+       OffsetNumber    upper;
+       OffsetNumber    lower;
+
+       /* Loop invariant: lower <= desired place <= upper */
+       upper = PageGetMaxOffsetNumber(page) + 1;
+       lower = FirstOffsetNumber;
+
+       while (upper > lower)
+       {
+               OffsetNumber    off;
+               IndexTuple              itup;
+               uint32                  hashkey;
+
+               off = (upper + lower) / 2;
+               Assert(OffsetNumberIsValid(off));
+
+               itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off));
+               hashkey = _hash_get_indextuple_hashkey(itup);
+               if (hashkey < hash_value)
+                       lower = off + 1;
+               else
+                       upper = off;
+       }
+
+       return lower;
+}
+
+/*
+ * _hash_binsearch_last
+ *
+ * Same as above, except that if there are multiple matching items in the
+ * page, we return the offset of the last one instead of the first one,
+ * and the possible range of outputs is 0..maxoffset not 1..maxoffset+1.
+ * This is handy for starting a new page in a backwards scan.
+ */
+OffsetNumber
+_hash_binsearch_last(Page page, uint32 hash_value)
+{
+       OffsetNumber    upper;
+       OffsetNumber    lower;
+
+       /* Loop invariant: lower <= desired place <= upper */
+       upper = PageGetMaxOffsetNumber(page);
+       lower = FirstOffsetNumber - 1;
+
+       while (upper > lower)
+       {
+               IndexTuple              itup;
+               OffsetNumber    off;
+               uint32                  hashkey;
+
+               off = (upper + lower + 1) / 2;
+               Assert(OffsetNumberIsValid(off));
+
+               itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off));
+               hashkey = _hash_get_indextuple_hashkey(itup);
+               if (hashkey > hash_value)
+                       upper = off - 1;
+               else
+                       lower = off;
+       }
+
+       return lower;
+}
index 1847f023e4a997e8d9025c17c624ce95689f5987..301e7d1f2d56877819089c927fc46549018ba2b4 100644 (file)
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/catalog/index.c,v 1.303 2008/08/25 22:42:32 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/catalog/index.c,v 1.304 2008/09/15 18:43:41 tgl Exp $
  *
  *
  * INTERFACE ROUTINES
@@ -76,6 +76,7 @@ typedef struct
 /* non-export function prototypes */
 static TupleDesc ConstructTupleDescriptor(Relation heapRelation,
                                                 IndexInfo *indexInfo,
+                                                Oid accessMethodObjectId,
                                                 Oid *classObjectId);
 static void InitializeAttributeOids(Relation indexRelation,
                                                int numatts, Oid indexoid);
@@ -105,15 +106,28 @@ static Oid        IndexGetRelation(Oid indexId);
 static TupleDesc
 ConstructTupleDescriptor(Relation heapRelation,
                                                 IndexInfo *indexInfo,
+                                                Oid accessMethodObjectId,
                                                 Oid *classObjectId)
 {
        int                     numatts = indexInfo->ii_NumIndexAttrs;
        ListCell   *indexpr_item = list_head(indexInfo->ii_Expressions);
+       HeapTuple       amtuple;
+       Form_pg_am      amform;
        TupleDesc       heapTupDesc;
        TupleDesc       indexTupDesc;
        int                     natts;                  /* #atts in heap rel --- for error checks */
        int                     i;
 
+       /* We need access to the index AM's pg_am tuple */
+       amtuple = SearchSysCache(AMOID,
+                                                        ObjectIdGetDatum(accessMethodObjectId),
+                                                        0, 0, 0);
+       if (!HeapTupleIsValid(amtuple))
+               elog(ERROR, "cache lookup failed for access method %u",
+                        accessMethodObjectId);
+       amform = (Form_pg_am) GETSTRUCT(amtuple);
+
+       /* ... and to the table's tuple descriptor */
        heapTupDesc = RelationGetDescr(heapRelation);
        natts = RelationGetForm(heapRelation)->relnatts;
 
@@ -133,6 +147,7 @@ ConstructTupleDescriptor(Relation heapRelation,
                Form_pg_attribute to = indexTupDesc->attrs[i];
                HeapTuple       tuple;
                Form_pg_type typeTup;
+               Form_pg_opclass opclassTup;
                Oid                     keyType;
 
                if (atnum != 0)
@@ -231,8 +246,8 @@ ConstructTupleDescriptor(Relation heapRelation,
                to->attrelid = InvalidOid;
 
                /*
-                * Check the opclass to see if it provides a keytype (overriding the
-                * attribute type).
+                * Check the opclass and index AM to see if either provides a keytype
+                * (overriding the attribute type).  Opclass takes precedence.
                 */
                tuple = SearchSysCache(CLAOID,
                                                           ObjectIdGetDatum(classObjectId[i]),
@@ -240,7 +255,11 @@ ConstructTupleDescriptor(Relation heapRelation,
                if (!HeapTupleIsValid(tuple))
                        elog(ERROR, "cache lookup failed for opclass %u",
                                 classObjectId[i]);
-               keyType = ((Form_pg_opclass) GETSTRUCT(tuple))->opckeytype;
+               opclassTup = (Form_pg_opclass) GETSTRUCT(tuple);
+               if (OidIsValid(opclassTup->opckeytype))
+                       keyType = opclassTup->opckeytype;
+               else
+                       keyType = amform->amkeytype;
                ReleaseSysCache(tuple);
 
                if (OidIsValid(keyType) && keyType != to->atttypid)
@@ -264,6 +283,8 @@ ConstructTupleDescriptor(Relation heapRelation,
                }
        }
 
+       ReleaseSysCache(amtuple);
+
        return indexTupDesc;
 }
 
@@ -577,6 +598,7 @@ index_create(Oid heapRelationId,
         */
        indexTupDesc = ConstructTupleDescriptor(heapRelation,
                                                                                        indexInfo,
+                                                                                       accessMethodObjectId,
                                                                                        classObjectId);
 
        /*
index 775840da18507616d356ffb4ac0633c2827595f4..29a076e1384624bf97664505d83b10996a264aa7 100644 (file)
@@ -91,7 +91,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/utils/sort/tuplesort.c,v 1.86 2008/08/01 13:16:09 alvherre Exp $
+ *       $PostgreSQL: pgsql/src/backend/utils/sort/tuplesort.c,v 1.87 2008/09/15 18:43:41 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
 #include <limits.h>
 
 #include "access/genam.h"
-#include "access/hash.h"
 #include "access/nbtree.h"
 #include "catalog/pg_amop.h"
 #include "catalog/pg_operator.h"
@@ -353,7 +352,6 @@ struct Tuplesortstate
        bool            enforceUnique;  /* complain if we find duplicate tuples */
 
        /* These are specific to the index_hash subcase: */
-       FmgrInfo   *hash_proc;          /* call info for the hash function */
        uint32          hash_mask;              /* mask for sortable part of hash code */
 
        /*
@@ -689,13 +687,6 @@ tuplesort_begin_index_hash(Relation indexRel,
 
        state->indexRel = indexRel;
 
-       /*
-        * We look up the index column's hash function just once, to avoid
-        * chewing lots of cycles in repeated index_getprocinfo calls.  This
-        * assumes that our caller holds the index relation open throughout the
-        * sort, else the pointer obtained here might cease to be valid.
-        */
-       state->hash_proc = index_getprocinfo(indexRel, 1, HASHPROC);
        state->hash_mask = hash_mask;
 
        MemoryContextSwitchTo(oldcontext);
@@ -2821,11 +2812,6 @@ static int
 comparetup_index_hash(const SortTuple *a, const SortTuple *b,
                                          Tuplesortstate *state)
 {
-       /*
-        * It's slightly annoying to redo the hash function each time, although
-        * most hash functions ought to be cheap.  Is it worth having a variant
-        * tuple storage format so we can store the hash code?
-        */
        uint32          hash1;
        uint32          hash2;
        IndexTuple      tuple1;
@@ -2834,13 +2820,14 @@ comparetup_index_hash(const SortTuple *a, const SortTuple *b,
        /* Allow interrupting long sorts */
        CHECK_FOR_INTERRUPTS();
 
-       /* Compute hash codes and mask off bits we don't want to sort by */
+       /*
+        * Fetch hash keys and mask off bits we don't want to sort by.
+        * We know that the first column of the index tuple is the hash key.
+        */
        Assert(!a->isnull1);
-       hash1 = DatumGetUInt32(FunctionCall1(state->hash_proc, a->datum1))
-               & state->hash_mask;
+       hash1 = DatumGetUInt32(a->datum1) & state->hash_mask;
        Assert(!b->isnull1);
-       hash2 = DatumGetUInt32(FunctionCall1(state->hash_proc, b->datum1))
-               & state->hash_mask;
+       hash2 = DatumGetUInt32(b->datum1) & state->hash_mask;
 
        if (hash1 > hash2)
                return 1;
index 0dab2b6ae91f017faf084714c6f304c0d8a85055..e00176d4519ae1f25b8f4edcd6703d2018f89475 100644 (file)
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/access/hash.h,v 1.89 2008/07/13 20:45:47 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/access/hash.h,v 1.90 2008/09/15 18:43:41 tgl Exp $
  *
  * NOTES
  *             modeled after Margo Seltzer's hash implementation for unix.
@@ -75,6 +75,9 @@ typedef HashPageOpaqueData *HashPageOpaque;
  */
 typedef struct HashScanOpaqueData
 {
+       /* Hash value of the scan key, ie, the hash key we seek */
+       uint32          hashso_sk_hash;
+
        /*
         * By definition, a hash scan should be examining only one bucket. We
         * record the bucket number here as soon as it is known.
@@ -111,7 +114,7 @@ typedef HashScanOpaqueData *HashScanOpaque;
 #define HASH_METAPAGE  0               /* metapage is always block 0 */
 
 #define HASH_MAGIC             0x6440640
-#define HASH_VERSION   1               /* new for Pg 7.4 */
+#define HASH_VERSION   2               /* 2 signifies only hash key value is stored */
 
 /*
  * Spares[] holds the number of overflow pages currently allocated at or
@@ -138,7 +141,6 @@ typedef HashScanOpaqueData *HashScanOpaque;
 
 typedef struct HashMetaPageData
 {
-       PageHeaderData hashm_phdr;      /* pad for page header (do not use) */
        uint32          hashm_magic;    /* magic no. for hash tables */
        uint32          hashm_version;  /* version ID */
        double          hashm_ntuples;  /* number of tuples stored in the table */
@@ -191,8 +193,16 @@ typedef HashMetaPageData *HashMetaPage;
 #define BMPGSZ_BIT(metap)              ((metap)->hashm_bmsize << BYTE_TO_BIT)
 #define BMPG_SHIFT(metap)              ((metap)->hashm_bmshift)
 #define BMPG_MASK(metap)               (BMPGSZ_BIT(metap) - 1)
-#define HashPageGetBitmap(pg) \
-       ((uint32 *) (((char *) (pg)) + MAXALIGN(sizeof(PageHeaderData))))
+
+#define HashPageGetBitmap(page) \
+       ((uint32 *) PageGetContents(page))
+
+#define HashGetMaxBitmapSize(page) \
+       (PageGetPageSize((Page) page) - \
+        (MAXALIGN(SizeOfPageHeaderData) + MAXALIGN(sizeof(HashPageOpaqueData))))
+
+#define HashPageGetMeta(page) \
+       ((HashMetaPage) PageGetContents(page))
 
 /*
  * The number of bits in an ovflpage bitmap word.
@@ -330,6 +340,11 @@ extern Bucket _hash_hashkey2bucket(uint32 hashkey, uint32 maxbucket,
                                         uint32 highmask, uint32 lowmask);
 extern uint32 _hash_log2(uint32 num);
 extern void _hash_checkpage(Relation rel, Buffer buf, int flags);
+extern uint32 _hash_get_indextuple_hashkey(IndexTuple itup);
+extern IndexTuple _hash_form_tuple(Relation index,
+                                                                  Datum *values, bool *isnull);
+extern OffsetNumber _hash_binsearch(Page page, uint32 hash_value);
+extern OffsetNumber _hash_binsearch_last(Page page, uint32 hash_value);
 
 /* hash.c */
 extern void hash_redo(XLogRecPtr lsn, XLogRecord *record);
index 6e4b4d40f9d1e19b4ecd72685b6ba9373bcd5a26..bd08779e713e219ba43e300852bc7640a7012e22 100644 (file)
@@ -37,7 +37,7 @@
  * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/catalog/catversion.h,v 1.485 2008/09/10 18:09:20 alvherre Exp $
+ * $PostgreSQL: pgsql/src/include/catalog/catversion.h,v 1.486 2008/09/15 18:43:41 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -53,6 +53,6 @@
  */
 
 /*                                                     yyyymmddN */
-#define CATALOG_VERSION_NO     200809101
+#define CATALOG_VERSION_NO     200809151
 
 #endif
index 712a409633d80fdbac8bd38fdd9ad98d19c6b0db..a7a638e083b70b8fab376d09c74ae158c0ace830 100644 (file)
@@ -8,7 +8,7 @@
  * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/catalog/pg_am.h,v 1.57 2008/07/11 21:06:29 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/catalog/pg_am.h,v 1.58 2008/09/15 18:43:41 tgl Exp $
  *
  * NOTES
  *             the genbki.sh script reads this file and generates .bki
@@ -48,6 +48,7 @@ CATALOG(pg_am,2601)
        bool            amsearchnulls;  /* can AM search for NULL index entries? */
        bool            amstorage;              /* can storage type differ from column type? */
        bool            amclusterable;  /* does AM support cluster command? */
+       Oid                     amkeytype;              /* type of data in index, or InvalidOid */
        regproc         aminsert;               /* "insert this tuple" function */
        regproc         ambeginscan;    /* "start new scan" function */
        regproc         amgettuple;             /* "next valid tuple" function */
@@ -74,7 +75,7 @@ typedef FormData_pg_am *Form_pg_am;
  *             compiler constants for pg_am
  * ----------------
  */
-#define Natts_pg_am                                            24
+#define Natts_pg_am                                            25
 #define Anum_pg_am_amname                              1
 #define Anum_pg_am_amstrategies                        2
 #define Anum_pg_am_amsupport                   3
@@ -86,35 +87,36 @@ typedef FormData_pg_am *Form_pg_am;
 #define Anum_pg_am_amsearchnulls               9
 #define Anum_pg_am_amstorage                   10
 #define Anum_pg_am_amclusterable               11
-#define Anum_pg_am_aminsert                            12
-#define Anum_pg_am_ambeginscan                 13
-#define Anum_pg_am_amgettuple                  14
-#define Anum_pg_am_amgetbitmap                 15
-#define Anum_pg_am_amrescan                            16
-#define Anum_pg_am_amendscan                   17
-#define Anum_pg_am_ammarkpos                   18
-#define Anum_pg_am_amrestrpos                  19
-#define Anum_pg_am_ambuild                             20
-#define Anum_pg_am_ambulkdelete                        21
-#define Anum_pg_am_amvacuumcleanup             22
-#define Anum_pg_am_amcostestimate              23
-#define Anum_pg_am_amoptions                   24
+#define Anum_pg_am_amkeytype                   12
+#define Anum_pg_am_aminsert                            13
+#define Anum_pg_am_ambeginscan                 14
+#define Anum_pg_am_amgettuple                  15
+#define Anum_pg_am_amgetbitmap                 16
+#define Anum_pg_am_amrescan                            17
+#define Anum_pg_am_amendscan                   18
+#define Anum_pg_am_ammarkpos                   19
+#define Anum_pg_am_amrestrpos                  20
+#define Anum_pg_am_ambuild                             21
+#define Anum_pg_am_ambulkdelete                        22
+#define Anum_pg_am_amvacuumcleanup             23
+#define Anum_pg_am_amcostestimate              24
+#define Anum_pg_am_amoptions                   25
 
 /* ----------------
  *             initial contents of pg_am
  * ----------------
  */
 
-DATA(insert OID = 403 (  btree 5 1 t t t t t t f t btinsert btbeginscan btgettuple btgetbitmap btrescan btendscan btmarkpos btrestrpos btbuild btbulkdelete btvacuumcleanup btcostestimate btoptions ));
+DATA(insert OID = 403 (  btree 5 1 t t t t t t f t btinsert btbeginscan btgettuple btgetbitmap btrescan btendscan btmarkpos btrestrpos btbuild btbulkdelete btvacuumcleanup btcostestimate btoptions ));
 DESCR("b-tree index access method");
 #define BTREE_AM_OID 403
-DATA(insert OID = 405 (  hash  1 1 f f f f f f f f hashinsert hashbeginscan hashgettuple hashgetbitmap hashrescan hashendscan hashmarkpos hashrestrpos hashbuild hashbulkdelete hashvacuumcleanup hashcostestimate hashoptions ));
+DATA(insert OID = 405 (  hash  1 1 f f f f f f f f 23 hashinsert hashbeginscan hashgettuple hashgetbitmap hashrescan hashendscan hashmarkpos hashrestrpos hashbuild hashbulkdelete hashvacuumcleanup hashcostestimate hashoptions ));
 DESCR("hash index access method");
 #define HASH_AM_OID 405
-DATA(insert OID = 783 (  gist  0 7 f f t t t t t t gistinsert gistbeginscan gistgettuple gistgetbitmap gistrescan gistendscan gistmarkpos gistrestrpos gistbuild gistbulkdelete gistvacuumcleanup gistcostestimate gistoptions ));
+DATA(insert OID = 783 (  gist  0 7 f f t t t t t t gistinsert gistbeginscan gistgettuple gistgetbitmap gistrescan gistendscan gistmarkpos gistrestrpos gistbuild gistbulkdelete gistvacuumcleanup gistcostestimate gistoptions ));
 DESCR("GiST index access method");
 #define GIST_AM_OID 783
-DATA(insert OID = 2742 (  gin  0 5 f f t t f f t f gininsert ginbeginscan gingettuple gingetbitmap ginrescan ginendscan ginmarkpos ginrestrpos ginbuild ginbulkdelete ginvacuumcleanup gincostestimate ginoptions ));
+DATA(insert OID = 2742 (  gin  0 5 f f t t f f t f gininsert ginbeginscan gingettuple gingetbitmap ginrescan ginendscan ginmarkpos ginrestrpos ginbuild ginbulkdelete ginvacuumcleanup gincostestimate ginoptions ));
 DESCR("GIN index access method");
 #define GIN_AM_OID 2742
 
index f0cb23e27085e9957087f7c511b77cdfe69d63a5..7c4d95003c8b2ea60380c36131109c4f5396ea15 100644 (file)
@@ -28,7 +28,7 @@
  * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/catalog/pg_opclass.h,v 1.82 2008/06/24 17:58:27 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/catalog/pg_opclass.h,v 1.83 2008/09/15 18:43:41 tgl Exp $
  *
  * NOTES
  *       the genbki.sh script reads this file and generates .bki
@@ -123,13 +123,13 @@ DATA(insert (     403             macaddr_ops                     PGNSP PGUID 1984  829 t 0 ));
 DATA(insert (  405             macaddr_ops                     PGNSP PGUID 1985  829 t 0 ));
 /*
  * Here's an ugly little hack to save space in the system catalog indexes.
- * btree and hash don't ordinarily allow a storage type different from input
- * type; but cstring and name are the same thing except for trailing padding,
+ * btree doesn't ordinarily allow a storage type different from input type;
+ * but cstring and name are the same thing except for trailing padding,
  * and we can safely omit that within an index entry.  So we declare the
- * opclasses for name as using cstring storage type.
+ * btree opclass for name as using cstring storage type.
  */
 DATA(insert (  403             name_ops                        PGNSP PGUID 1986   19 t 2275 ));
-DATA(insert (  405             name_ops                        PGNSP PGUID 1987   19 t 2275 ));
+DATA(insert (  405             name_ops                        PGNSP PGUID 1987   19 t 0 ));
 DATA(insert (  403             numeric_ops                     PGNSP PGUID 1988 1700 t 0 ));
 DATA(insert (  405             numeric_ops                     PGNSP PGUID 1998 1700 t 0 ));
 DATA(insert OID = 1981 ( 403   oid_ops         PGNSP PGUID 1989   26 t 0 ));