granicus.if.org Git - postgresql/blob - src/backend/access/hash/hashpage.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * hashpage.c
   4  *        Hash table page management code for the Postgres hash access method
   5  *
   6  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
   7  * Portions Copyright (c) 1994, Regents of the University of California
   8  *
   9  *
  10  * IDENTIFICATION
  11  *        $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.68 2007/05/30 20:11:51 tgl Exp $
  12  *
  13  * NOTES
  14  *        Postgres hash pages look like ordinary relation pages.  The opaque
  15  *        data at high addresses includes information about the page including
  16  *        whether a page is an overflow page or a true bucket, the bucket
  17  *        number, and the block numbers of the preceding and following pages
  18  *        in the same bucket.
  19  *
  20  *        The first page in a hash relation, page zero, is special -- it stores
  21  *        information describing the hash table; it is referred to as the
  22  *        "meta page." Pages one and higher store the actual data.
  23  *
  24  *        There are also bitmap pages, which are not manipulated here;
  25  *        see hashovfl.c.
  26  *
  27  *-------------------------------------------------------------------------
  28  */
  29 #include "postgres.h"
  30
  31 #include "access/genam.h"
  32 #include "access/hash.h"
  33 #include "miscadmin.h"
  34 #include "storage/lmgr.h"
  35 #include "storage/smgr.h"
  36 #include "utils/lsyscache.h"
  37
  38
  39 static bool _hash_alloc_buckets(Relation rel, BlockNumber firstblock,
  40                                                                 uint32 nblocks);
  41 static void _hash_splitbucket(Relation rel, Buffer metabuf,
  42                                   Bucket obucket, Bucket nbucket,
  43                                   BlockNumber start_oblkno,
  44                                   BlockNumber start_nblkno,
  45                                   uint32 maxbucket,
  46                                   uint32 highmask, uint32 lowmask);
  47
  48
  49 /*
  50  * We use high-concurrency locking on hash indexes (see README for an overview
  51  * of the locking rules).  However, we can skip taking lmgr locks when the
  52  * index is local to the current backend (ie, either temp or new in the
  53  * current transaction).  No one else can see it, so there's no reason to
  54  * take locks.  We still take buffer-level locks, but not lmgr locks.
  55  */
  56 #define USELOCKING(rel)         (!RELATION_IS_LOCAL(rel))
  57
  58
  59 /*
  60  * _hash_getlock() -- Acquire an lmgr lock.
  61  *
  62  * 'whichlock' should be zero to acquire the split-control lock, or the
  63  * block number of a bucket's primary bucket page to acquire the per-bucket
  64  * lock.  (See README for details of the use of these locks.)
  65  *
  66  * 'access' must be HASH_SHARE or HASH_EXCLUSIVE.
  67  */
  68 void
  69 _hash_getlock(Relation rel, BlockNumber whichlock, int access)
  70 {
  71         if (USELOCKING(rel))
  72                 LockPage(rel, whichlock, access);
  73 }
  74
  75 /*
  76  * _hash_try_getlock() -- Acquire an lmgr lock, but only if it's free.
  77  *
  78  * Same as above except we return FALSE without blocking if lock isn't free.
  79  */
  80 bool
  81 _hash_try_getlock(Relation rel, BlockNumber whichlock, int access)
  82 {
  83         if (USELOCKING(rel))
  84                 return ConditionalLockPage(rel, whichlock, access);
  85         else
  86                 return true;
  87 }
  88
  89 /*
  90  * _hash_droplock() -- Release an lmgr lock.
  91  */
  92 void
  93 _hash_droplock(Relation rel, BlockNumber whichlock, int access)
  94 {
  95         if (USELOCKING(rel))
  96                 UnlockPage(rel, whichlock, access);
  97 }
  98
  99 /*
 100  *      _hash_getbuf() -- Get a buffer by block number for read or write.
 101  *
 102  *              'access' must be HASH_READ, HASH_WRITE, or HASH_NOLOCK.
 103  *              'flags' is a bitwise OR of the allowed page types.
 104  *
 105  *              This must be used only to fetch pages that are expected to be valid
 106  *              already.  _hash_checkpage() is applied using the given flags.
 107  *
 108  *              When this routine returns, the appropriate lock is set on the
 109  *              requested buffer and its reference count has been incremented
 110  *              (ie, the buffer is "locked and pinned").
 111  *
 112  *              P_NEW is disallowed because this routine can only be used
 113  *              to access pages that are known to be before the filesystem EOF.
 114  *              Extending the index should be done with _hash_getnewbuf.
 115  */
 116 Buffer
 117 _hash_getbuf(Relation rel, BlockNumber blkno, int access, int flags)
 118 {
 119         Buffer          buf;
 120
 121         if (blkno == P_NEW)
 122                 elog(ERROR, "hash AM does not use P_NEW");
 123
 124         buf = ReadBuffer(rel, blkno);
 125
 126         if (access != HASH_NOLOCK)
 127                 LockBuffer(buf, access);
 128
 129         /* ref count and lock type are correct */
 130
 131         _hash_checkpage(rel, buf, flags);
 132
 133         return buf;
 134 }
 135
 136 /*
 137  *      _hash_getinitbuf() -- Get and initialize a buffer by block number.
 138  *
 139  *              This must be used only to fetch pages that are known to be before
 140  *              the index's filesystem EOF, but are to be filled from scratch.
 141  *              _hash_pageinit() is applied automatically.  Otherwise it has
 142  *              effects similar to _hash_getbuf() with access = HASH_WRITE.
 143  *
 144  *              When this routine returns, a write lock is set on the
 145  *              requested buffer and its reference count has been incremented
 146  *              (ie, the buffer is "locked and pinned").
 147  *
 148  *              P_NEW is disallowed because this routine can only be used
 149  *              to access pages that are known to be before the filesystem EOF.
 150  *              Extending the index should be done with _hash_getnewbuf.
 151  */
 152 Buffer
 153 _hash_getinitbuf(Relation rel, BlockNumber blkno)
 154 {
 155         Buffer          buf;
 156
 157         if (blkno == P_NEW)
 158                 elog(ERROR, "hash AM does not use P_NEW");
 159
 160         buf = ReadOrZeroBuffer(rel, blkno);
 161
 162         LockBuffer(buf, HASH_WRITE);
 163
 164         /* ref count and lock type are correct */
 165
 166         /* initialize the page */
 167         _hash_pageinit(BufferGetPage(buf), BufferGetPageSize(buf));
 168
 169         return buf;
 170 }
 171
 172 /*
 173  *      _hash_getnewbuf() -- Get a new page at the end of the index.
 174  *
 175  *              This has the same API as _hash_getinitbuf, except that we are adding
 176  *              a page to the index, and hence expect the page to be past the
 177  *              logical EOF.  (However, we have to support the case where it isn't,
 178  *              since a prior try might have crashed after extending the filesystem
 179  *              EOF but before updating the metapage to reflect the added page.)
 180  *
 181  *              It is caller's responsibility to ensure that only one process can
 182  *              extend the index at a time.
 183  */
 184 Buffer
 185 _hash_getnewbuf(Relation rel, BlockNumber blkno)
 186 {
 187         BlockNumber     nblocks = RelationGetNumberOfBlocks(rel);
 188         Buffer          buf;
 189
 190         if (blkno == P_NEW)
 191                 elog(ERROR, "hash AM does not use P_NEW");
 192         if (blkno > nblocks)
 193                 elog(ERROR, "access to noncontiguous page in hash index \"%s\"",
 194                          RelationGetRelationName(rel));
 195
 196         /* smgr insists we use P_NEW to extend the relation */
 197         if (blkno == nblocks)
 198         {
 199                 buf = ReadBuffer(rel, P_NEW);
 200                 if (BufferGetBlockNumber(buf) != blkno)
 201                         elog(ERROR, "unexpected hash relation size: %u, should be %u",
 202                                  BufferGetBlockNumber(buf), blkno);
 203         }
 204         else
 205                 buf = ReadOrZeroBuffer(rel, blkno);
 206
 207         LockBuffer(buf, HASH_WRITE);
 208
 209         /* ref count and lock type are correct */
 210
 211         /* initialize the page */
 212         _hash_pageinit(BufferGetPage(buf), BufferGetPageSize(buf));
 213
 214         return buf;
 215 }
 216
 217 /*
 218  *      _hash_getbuf_with_strategy() -- Get a buffer with nondefault strategy.
 219  *
 220  *              This is identical to _hash_getbuf() but also allows a buffer access
 221  *              strategy to be specified.  We use this for VACUUM operations.
 222  */
 223 Buffer
 224 _hash_getbuf_with_strategy(Relation rel, BlockNumber blkno,
 225                                                    int access, int flags,
 226                                                    BufferAccessStrategy bstrategy)
 227 {
 228         Buffer          buf;
 229
 230         if (blkno == P_NEW)
 231                 elog(ERROR, "hash AM does not use P_NEW");
 232
 233         buf = ReadBufferWithStrategy(rel, blkno, bstrategy);
 234
 235         if (access != HASH_NOLOCK)
 236                 LockBuffer(buf, access);
 237
 238         /* ref count and lock type are correct */
 239
 240         _hash_checkpage(rel, buf, flags);
 241
 242         return buf;
 243 }
 244
 245 /*
 246  *      _hash_relbuf() -- release a locked buffer.
 247  *
 248  * Lock and pin (refcount) are both dropped.
 249  */
 250 void
 251 _hash_relbuf(Relation rel, Buffer buf)
 252 {
 253         UnlockReleaseBuffer(buf);
 254 }
 255
 256 /*
 257  *      _hash_dropbuf() -- release an unlocked buffer.
 258  *
 259  * This is used to unpin a buffer on which we hold no lock.
 260  */
 261 void
 262 _hash_dropbuf(Relation rel, Buffer buf)
 263 {
 264         ReleaseBuffer(buf);
 265 }
 266
 267 /*
 268  *      _hash_wrtbuf() -- write a hash page to disk.
 269  *
 270  *              This routine releases the lock held on the buffer and our refcount
 271  *              for it.  It is an error to call _hash_wrtbuf() without a write lock
 272  *              and a pin on the buffer.
 273  *
 274  * NOTE: this routine should go away when/if hash indexes are WAL-ified.
 275  * The correct sequence of operations is to mark the buffer dirty, then
 276  * write the WAL record, then release the lock and pin; so marking dirty
 277  * can't be combined with releasing.
 278  */
 279 void
 280 _hash_wrtbuf(Relation rel, Buffer buf)
 281 {
 282         MarkBufferDirty(buf);
 283         UnlockReleaseBuffer(buf);
 284 }
 285
 286 /*
 287  * _hash_chgbufaccess() -- Change the lock type on a buffer, without
 288  *                      dropping our pin on it.
 289  *
 290  * from_access and to_access may be HASH_READ, HASH_WRITE, or HASH_NOLOCK,
 291  * the last indicating that no buffer-level lock is held or wanted.
 292  *
 293  * When from_access == HASH_WRITE, we assume the buffer is dirty and tell
 294  * bufmgr it must be written out.  If the caller wants to release a write
 295  * lock on a page that's not been modified, it's okay to pass from_access
 296  * as HASH_READ (a bit ugly, but handy in some places).
 297  */
 298 void
 299 _hash_chgbufaccess(Relation rel,
 300                                    Buffer buf,
 301                                    int from_access,
 302                                    int to_access)
 303 {
 304         if (from_access == HASH_WRITE)
 305                 MarkBufferDirty(buf);
 306         if (from_access != HASH_NOLOCK)
 307                 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
 308         if (to_access != HASH_NOLOCK)
 309                 LockBuffer(buf, to_access);
 310 }
 311
 312
 313 /*
 314  *      _hash_metapinit() -- Initialize the metadata page of a hash index,
 315  *                              the two buckets that we begin with and the initial
 316  *                              bitmap page.
 317  *
 318  * We are fairly cavalier about locking here, since we know that no one else
 319  * could be accessing this index.  In particular the rule about not holding
 320  * multiple buffer locks is ignored.
 321  */
 322 void
 323 _hash_metapinit(Relation rel)
 324 {
 325         HashMetaPage metap;
 326         HashPageOpaque pageopaque;
 327         Buffer          metabuf;
 328         Buffer          buf;
 329         Page            pg;
 330         int32           data_width;
 331         int32           item_width;
 332         int32           ffactor;
 333         uint16          i;
 334
 335         /* safety check */
 336         if (RelationGetNumberOfBlocks(rel) != 0)
 337                 elog(ERROR, "cannot initialize non-empty hash index \"%s\"",
 338                          RelationGetRelationName(rel));
 339
 340         /*
 341          * Determine the target fill factor (in tuples per bucket) for this index.
 342          * The idea is to make the fill factor correspond to pages about as full
 343          * as the user-settable fillfactor parameter says.      We can compute it
 344          * exactly if the index datatype is fixed-width, but for var-width there's
 345          * some guessing involved.
 346          */
 347         data_width = get_typavgwidth(RelationGetDescr(rel)->attrs[0]->atttypid,
 348                                                                  RelationGetDescr(rel)->attrs[0]->atttypmod);
 349         item_width = MAXALIGN(sizeof(IndexTupleData)) + MAXALIGN(data_width) +
 350                 sizeof(ItemIdData);             /* include the line pointer */
 351         ffactor = RelationGetTargetPageUsage(rel, HASH_DEFAULT_FILLFACTOR) / item_width;
 352         /* keep to a sane range */
 353         if (ffactor < 10)
 354                 ffactor = 10;
 355
 356         /*
 357          * We initialize the metapage, the first two bucket pages, and the
 358          * first bitmap page in sequence, using _hash_getnewbuf to cause
 359          * smgrextend() calls to occur.  This ensures that the smgr level
 360          * has the right idea of the physical index length.
 361          */
 362         metabuf = _hash_getnewbuf(rel, HASH_METAPAGE);
 363         pg = BufferGetPage(metabuf);
 364
 365         pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg);
 366         pageopaque->hasho_prevblkno = InvalidBlockNumber;
 367         pageopaque->hasho_nextblkno = InvalidBlockNumber;
 368         pageopaque->hasho_bucket = -1;
 369         pageopaque->hasho_flag = LH_META_PAGE;
 370         pageopaque->hasho_page_id = HASHO_PAGE_ID;
 371
 372         metap = (HashMetaPage) pg;
 373
 374         metap->hashm_magic = HASH_MAGIC;
 375         metap->hashm_version = HASH_VERSION;
 376         metap->hashm_ntuples = 0;
 377         metap->hashm_nmaps = 0;
 378         metap->hashm_ffactor = ffactor;
 379         metap->hashm_bsize = BufferGetPageSize(metabuf);
 380         /* find largest bitmap array size that will fit in page size */
 381         for (i = _hash_log2(metap->hashm_bsize); i > 0; --i)
 382         {
 383                 if ((1 << i) <= (metap->hashm_bsize -
 384                                                  (MAXALIGN(sizeof(PageHeaderData)) +
 385                                                   MAXALIGN(sizeof(HashPageOpaqueData)))))
 386                         break;
 387         }
 388         Assert(i > 0);
 389         metap->hashm_bmsize = 1 << i;
 390         metap->hashm_bmshift = i + BYTE_TO_BIT;
 391         Assert((1 << BMPG_SHIFT(metap)) == (BMPG_MASK(metap) + 1));
 392
 393         /*
 394          * Label the index with its primary hash support function's OID.  This is
 395          * pretty useless for normal operation (in fact, hashm_procid is not used
 396          * anywhere), but it might be handy for forensic purposes so we keep it.
 397          */
 398         metap->hashm_procid = index_getprocid(rel, 1, HASHPROC);
 399
 400         /*
 401          * We initialize the index with two buckets, 0 and 1, occupying physical
 402          * blocks 1 and 2.      The first freespace bitmap page is in block 3.
 403          */
 404         metap->hashm_maxbucket = metap->hashm_lowmask = 1;      /* nbuckets - 1 */
 405         metap->hashm_highmask = 3;      /* (nbuckets << 1) - 1 */
 406
 407         MemSet(metap->hashm_spares, 0, sizeof(metap->hashm_spares));
 408         MemSet(metap->hashm_mapp, 0, sizeof(metap->hashm_mapp));
 409
 410         metap->hashm_spares[1] = 1; /* the first bitmap page is only spare */
 411         metap->hashm_ovflpoint = 1;
 412         metap->hashm_firstfree = 0;
 413
 414         /*
 415          * Initialize the first two buckets
 416          */
 417         for (i = 0; i <= 1; i++)
 418         {
 419                 buf = _hash_getnewbuf(rel, BUCKET_TO_BLKNO(metap, i));
 420                 pg = BufferGetPage(buf);
 421                 pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg);
 422                 pageopaque->hasho_prevblkno = InvalidBlockNumber;
 423                 pageopaque->hasho_nextblkno = InvalidBlockNumber;
 424                 pageopaque->hasho_bucket = i;
 425                 pageopaque->hasho_flag = LH_BUCKET_PAGE;
 426                 pageopaque->hasho_page_id = HASHO_PAGE_ID;
 427                 _hash_wrtbuf(rel, buf);
 428         }
 429
 430         /*
 431          * Initialize first bitmap page
 432          */
 433         _hash_initbitmap(rel, metap, 3);
 434
 435         /* all done */
 436         _hash_wrtbuf(rel, metabuf);
 437 }
 438
 439 /*
 440  *      _hash_pageinit() -- Initialize a new hash index page.
 441  */
 442 void
 443 _hash_pageinit(Page page, Size size)
 444 {
 445         Assert(PageIsNew(page));
 446         PageInit(page, size, sizeof(HashPageOpaqueData));
 447 }
 448
 449 /*
 450  * Attempt to expand the hash table by creating one new bucket.
 451  *
 452  * This will silently do nothing if it cannot get the needed locks.
 453  *
 454  * The caller should hold no locks on the hash index.
 455  *
 456  * The caller must hold a pin, but no lock, on the metapage buffer.
 457  * The buffer is returned in the same state.
 458  */
 459 void
 460 _hash_expandtable(Relation rel, Buffer metabuf)
 461 {
 462         HashMetaPage metap;
 463         Bucket          old_bucket;
 464         Bucket          new_bucket;
 465         uint32          spare_ndx;
 466         BlockNumber start_oblkno;
 467         BlockNumber start_nblkno;
 468         uint32          maxbucket;
 469         uint32          highmask;
 470         uint32          lowmask;
 471
 472         /*
 473          * Obtain the page-zero lock to assert the right to begin a split (see
 474          * README).
 475          *
 476          * Note: deadlock should be impossible here. Our own backend could only be
 477          * holding bucket sharelocks due to stopped indexscans; those will not
 478          * block other holders of the page-zero lock, who are only interested in
 479          * acquiring bucket sharelocks themselves.      Exclusive bucket locks are
 480          * only taken here and in hashbulkdelete, and neither of these operations
 481          * needs any additional locks to complete.      (If, due to some flaw in this
 482          * reasoning, we manage to deadlock anyway, it's okay to error out; the
 483          * index will be left in a consistent state.)
 484          */
 485         _hash_getlock(rel, 0, HASH_EXCLUSIVE);
 486
 487         /* Write-lock the meta page */
 488         _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);
 489
 490         _hash_checkpage(rel, metabuf, LH_META_PAGE);
 491         metap = (HashMetaPage) BufferGetPage(metabuf);
 492
 493         /*
 494          * Check to see if split is still needed; someone else might have already
 495          * done one while we waited for the lock.
 496          *
 497          * Make sure this stays in sync with _hash_doinsert()
 498          */
 499         if (metap->hashm_ntuples <=
 500                 (double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1))
 501                 goto fail;
 502
 503         /*
 504          * Can't split anymore if maxbucket has reached its maximum possible value.
 505          *
 506          * Ideally we'd allow bucket numbers up to UINT_MAX-1 (no higher because
 507          * the calculation maxbucket+1 mustn't overflow).  Currently we restrict
 508          * to half that because of overflow looping in _hash_log2() and
 509          * insufficient space in hashm_spares[].  It's moot anyway because an
 510          * index with 2^32 buckets would certainly overflow BlockNumber and
 511          * hence _hash_alloc_buckets() would fail, but if we supported buckets
 512          * smaller than a disk block then this would be an independent constraint.
 513          */
 514         if (metap->hashm_maxbucket >= (uint32) 0x7FFFFFFE)
 515                 goto fail;
 516
 517         /*
 518          * Determine which bucket is to be split, and attempt to lock the old
 519          * bucket.      If we can't get the lock, give up.
 520          *
 521          * The lock protects us against other backends, but not against our own
 522          * backend.  Must check for active scans separately.
 523          */
 524         new_bucket = metap->hashm_maxbucket + 1;
 525
 526         old_bucket = (new_bucket & metap->hashm_lowmask);
 527
 528         start_oblkno = BUCKET_TO_BLKNO(metap, old_bucket);
 529
 530         if (_hash_has_active_scan(rel, old_bucket))
 531                 goto fail;
 532
 533         if (!_hash_try_getlock(rel, start_oblkno, HASH_EXCLUSIVE))
 534                 goto fail;
 535
 536         /*
 537          * Likewise lock the new bucket (should never fail).
 538          *
 539          * Note: it is safe to compute the new bucket's blkno here, even though
 540          * we may still need to update the BUCKET_TO_BLKNO mapping.  This is
 541          * because the current value of hashm_spares[hashm_ovflpoint] correctly
 542          * shows where we are going to put a new splitpoint's worth of buckets.
 543          */
 544         start_nblkno = BUCKET_TO_BLKNO(metap, new_bucket);
 545
 546         if (_hash_has_active_scan(rel, new_bucket))
 547                 elog(ERROR, "scan in progress on supposedly new bucket");
 548
 549         if (!_hash_try_getlock(rel, start_nblkno, HASH_EXCLUSIVE))
 550                 elog(ERROR, "could not get lock on supposedly new bucket");
 551
 552         /*
 553          * If the split point is increasing (hashm_maxbucket's log base 2
 554          * increases), we need to allocate a new batch of bucket pages.
 555          */
 556         spare_ndx = _hash_log2(new_bucket + 1);
 557         if (spare_ndx > metap->hashm_ovflpoint)
 558         {
 559                 Assert(spare_ndx == metap->hashm_ovflpoint + 1);
 560                 /*
 561                  * The number of buckets in the new splitpoint is equal to the
 562                  * total number already in existence, i.e. new_bucket.  Currently
 563                  * this maps one-to-one to blocks required, but someday we may need
 564                  * a more complicated calculation here.
 565                  */
 566                 if (!_hash_alloc_buckets(rel, start_nblkno, new_bucket))
 567                 {
 568                         /* can't split due to BlockNumber overflow */
 569                         _hash_droplock(rel, start_oblkno, HASH_EXCLUSIVE);
 570                         _hash_droplock(rel, start_nblkno, HASH_EXCLUSIVE);
 571                         goto fail;
 572                 }
 573         }
 574
 575         /*
 576          * Okay to proceed with split.  Update the metapage bucket mapping info.
 577          *
 578          * Since we are scribbling on the metapage data right in the shared
 579          * buffer, any failure in this next little bit leaves us with a big
 580          * problem: the metapage is effectively corrupt but could get written back
 581          * to disk.  We don't really expect any failure, but just to be sure,
 582          * establish a critical section.
 583          */
 584         START_CRIT_SECTION();
 585
 586         metap->hashm_maxbucket = new_bucket;
 587
 588         if (new_bucket > metap->hashm_highmask)
 589         {
 590                 /* Starting a new doubling */
 591                 metap->hashm_lowmask = metap->hashm_highmask;
 592                 metap->hashm_highmask = new_bucket | metap->hashm_lowmask;
 593         }
 594
 595         /*
 596          * If the split point is increasing (hashm_maxbucket's log base 2
 597          * increases), we need to adjust the hashm_spares[] array and
 598          * hashm_ovflpoint so that future overflow pages will be created beyond
 599          * this new batch of bucket pages.
 600          */
 601         if (spare_ndx > metap->hashm_ovflpoint)
 602         {
 603                 metap->hashm_spares[spare_ndx] = metap->hashm_spares[metap->hashm_ovflpoint];
 604                 metap->hashm_ovflpoint = spare_ndx;
 605         }
 606
 607         /* Done mucking with metapage */
 608         END_CRIT_SECTION();
 609
 610         /*
 611          * Copy bucket mapping info now; this saves re-accessing the meta page
 612          * inside _hash_splitbucket's inner loop.  Note that once we drop the
 613          * split lock, other splits could begin, so these values might be out of
 614          * date before _hash_splitbucket finishes.      That's okay, since all it
 615          * needs is to tell which of these two buckets to map hashkeys into.
 616          */
 617         maxbucket = metap->hashm_maxbucket;
 618         highmask = metap->hashm_highmask;
 619         lowmask = metap->hashm_lowmask;
 620
 621         /* Write out the metapage and drop lock, but keep pin */
 622         _hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK);
 623
 624         /* Release split lock; okay for other splits to occur now */
 625         _hash_droplock(rel, 0, HASH_EXCLUSIVE);
 626
 627         /* Relocate records to the new bucket */
 628         _hash_splitbucket(rel, metabuf, old_bucket, new_bucket,
 629                                           start_oblkno, start_nblkno,
 630                                           maxbucket, highmask, lowmask);
 631
 632         /* Release bucket locks, allowing others to access them */
 633         _hash_droplock(rel, start_oblkno, HASH_EXCLUSIVE);
 634         _hash_droplock(rel, start_nblkno, HASH_EXCLUSIVE);
 635
 636         return;
 637
 638         /* Here if decide not to split or fail to acquire old bucket lock */
 639 fail:
 640
 641         /* We didn't write the metapage, so just drop lock */
 642         _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);
 643
 644         /* Release split lock */
 645         _hash_droplock(rel, 0, HASH_EXCLUSIVE);
 646 }
 647
 648
 649 /*
 650  * _hash_alloc_buckets -- allocate a new splitpoint's worth of bucket pages
 651  *
 652  * This does not need to initialize the new bucket pages; we'll do that as
 653  * each one is used by _hash_expandtable().  But we have to extend the logical
 654  * EOF to the end of the splitpoint; this keeps smgr's idea of the EOF in
 655  * sync with ours, so that we don't get complaints from smgr.
 656  *
 657  * We do this by writing a page of zeroes at the end of the splitpoint range.
 658  * We expect that the filesystem will ensure that the intervening pages read
 659  * as zeroes too.  On many filesystems this "hole" will not be allocated
 660  * immediately, which means that the index file may end up more fragmented
 661  * than if we forced it all to be allocated now; but since we don't scan
 662  * hash indexes sequentially anyway, that probably doesn't matter.
 663  *
 664  * XXX It's annoying that this code is executed with the metapage lock held.
 665  * We need to interlock against _hash_getovflpage() adding a new overflow page
 666  * concurrently, but it'd likely be better to use LockRelationForExtension
 667  * for the purpose.  OTOH, adding a splitpoint is a very infrequent operation,
 668  * so it may not be worth worrying about.
 669  *
 670  * Returns TRUE if successful, or FALSE if allocation failed due to
 671  * BlockNumber overflow.
 672  */
 673 static bool
 674 _hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks)
 675 {
 676         BlockNumber     lastblock;
 677         char            zerobuf[BLCKSZ];
 678
 679         lastblock = firstblock + nblocks - 1;
 680
 681         /*
 682          * Check for overflow in block number calculation; if so, we cannot
 683          * extend the index anymore.
 684          */
 685         if (lastblock < firstblock || lastblock == InvalidBlockNumber)
 686                 return false;
 687
 688         MemSet(zerobuf, 0, sizeof(zerobuf));
 689
 690         RelationOpenSmgr(rel);
 691         smgrextend(rel->rd_smgr, lastblock, zerobuf, rel->rd_istemp);
 692
 693         return true;
 694 }
 695
 696
 697 /*
 698  * _hash_splitbucket -- split 'obucket' into 'obucket' and 'nbucket'
 699  *
 700  * We are splitting a bucket that consists of a base bucket page and zero
 701  * or more overflow (bucket chain) pages.  We must relocate tuples that
 702  * belong in the new bucket, and compress out any free space in the old
 703  * bucket.
 704  *
 705  * The caller must hold exclusive locks on both buckets to ensure that
 706  * no one else is trying to access them (see README).
 707  *
 708  * The caller must hold a pin, but no lock, on the metapage buffer.
 709  * The buffer is returned in the same state.  (The metapage is only
 710  * touched if it becomes necessary to add or remove overflow pages.)
 711  */
 712 static void
 713 _hash_splitbucket(Relation rel,
 714                                   Buffer metabuf,
 715                                   Bucket obucket,
 716                                   Bucket nbucket,
 717                                   BlockNumber start_oblkno,
 718                                   BlockNumber start_nblkno,
 719                                   uint32 maxbucket,
 720                                   uint32 highmask,
 721                                   uint32 lowmask)
 722 {
 723         Bucket          bucket;
 724         Buffer          obuf;
 725         Buffer          nbuf;
 726         BlockNumber oblkno;
 727         BlockNumber nblkno;
 728         bool            null;
 729         Datum           datum;
 730         HashPageOpaque oopaque;
 731         HashPageOpaque nopaque;
 732         IndexTuple      itup;
 733         Size            itemsz;
 734         OffsetNumber ooffnum;
 735         OffsetNumber noffnum;
 736         OffsetNumber omaxoffnum;
 737         Page            opage;
 738         Page            npage;
 739         TupleDesc       itupdesc = RelationGetDescr(rel);
 740
 741         /*
 742          * It should be okay to simultaneously write-lock pages from each bucket,
 743          * since no one else can be trying to acquire buffer lock on pages of
 744          * either bucket.
 745          */
 746         oblkno = start_oblkno;
 747         obuf = _hash_getbuf(rel, oblkno, HASH_WRITE, LH_BUCKET_PAGE);
 748         opage = BufferGetPage(obuf);
 749         oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
 750
 751         nblkno = start_nblkno;
 752         nbuf = _hash_getnewbuf(rel, nblkno);
 753         npage = BufferGetPage(nbuf);
 754
 755         /* initialize the new bucket's primary page */
 756         nopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
 757         nopaque->hasho_prevblkno = InvalidBlockNumber;
 758         nopaque->hasho_nextblkno = InvalidBlockNumber;
 759         nopaque->hasho_bucket = nbucket;
 760         nopaque->hasho_flag = LH_BUCKET_PAGE;
 761         nopaque->hasho_page_id = HASHO_PAGE_ID;
 762
 763         /*
 764          * Partition the tuples in the old bucket between the old bucket and the
 765          * new bucket, advancing along the old bucket's overflow bucket chain and
 766          * adding overflow pages to the new bucket as needed.
 767          */
 768         ooffnum = FirstOffsetNumber;
 769         omaxoffnum = PageGetMaxOffsetNumber(opage);
 770         for (;;)
 771         {
 772                 /*
 773                  * at each iteration through this loop, each of these variables should
 774                  * be up-to-date: obuf opage oopaque ooffnum omaxoffnum
 775                  */
 776
 777                 /* check if we're at the end of the page */
 778                 if (ooffnum > omaxoffnum)
 779                 {
 780                         /* at end of page, but check for an(other) overflow page */
 781                         oblkno = oopaque->hasho_nextblkno;
 782                         if (!BlockNumberIsValid(oblkno))
 783                                 break;
 784
 785                         /*
 786                          * we ran out of tuples on this particular page, but we have more
 787                          * overflow pages; advance to next page.
 788                          */
 789                         _hash_wrtbuf(rel, obuf);
 790
 791                         obuf = _hash_getbuf(rel, oblkno, HASH_WRITE, LH_OVERFLOW_PAGE);
 792                         opage = BufferGetPage(obuf);
 793                         oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
 794                         ooffnum = FirstOffsetNumber;
 795                         omaxoffnum = PageGetMaxOffsetNumber(opage);
 796                         continue;
 797                 }
 798
 799                 /*
 800                  * Re-hash the tuple to determine which bucket it now belongs in.
 801                  *
 802                  * It is annoying to call the hash function while holding locks, but
 803                  * releasing and relocking the page for each tuple is unappealing too.
 804                  */
 805                 itup = (IndexTuple) PageGetItem(opage, PageGetItemId(opage, ooffnum));
 806                 datum = index_getattr(itup, 1, itupdesc, &null);
 807                 Assert(!null);
 808
 809                 bucket = _hash_hashkey2bucket(_hash_datum2hashkey(rel, datum),
 810                                                                           maxbucket, highmask, lowmask);
 811
 812                 if (bucket == nbucket)
 813                 {
 814                         /*
 815                          * insert the tuple into the new bucket.  if it doesn't fit on the
 816                          * current page in the new bucket, we must allocate a new overflow
 817                          * page and place the tuple on that page instead.
 818                          */
 819                         itemsz = IndexTupleDSize(*itup);
 820                         itemsz = MAXALIGN(itemsz);
 821
 822                         if (PageGetFreeSpace(npage) < itemsz)
 823                         {
 824                                 /* write out nbuf and drop lock, but keep pin */
 825                                 _hash_chgbufaccess(rel, nbuf, HASH_WRITE, HASH_NOLOCK);
 826                                 /* chain to a new overflow page */
 827                                 nbuf = _hash_addovflpage(rel, metabuf, nbuf);
 828                                 npage = BufferGetPage(nbuf);
 829                                 /* we don't need nopaque within the loop */
 830                         }
 831
 832                         noffnum = OffsetNumberNext(PageGetMaxOffsetNumber(npage));
 833                         if (PageAddItem(npage, (Item) itup, itemsz, noffnum, LP_USED)
 834                                 == InvalidOffsetNumber)
 835                                 elog(ERROR, "failed to add index item to \"%s\"",
 836                                          RelationGetRelationName(rel));
 837
 838                         /*
 839                          * now delete the tuple from the old bucket.  after this section
 840                          * of code, 'ooffnum' will actually point to the ItemId to which
 841                          * we would point if we had advanced it before the deletion
 842                          * (PageIndexTupleDelete repacks the ItemId array).  this also
 843                          * means that 'omaxoffnum' is exactly one less than it used to be,
 844                          * so we really can just decrement it instead of calling
 845                          * PageGetMaxOffsetNumber.
 846                          */
 847                         PageIndexTupleDelete(opage, ooffnum);
 848                         omaxoffnum = OffsetNumberPrev(omaxoffnum);
 849                 }
 850                 else
 851                 {
 852                         /*
 853                          * the tuple stays on this page.  we didn't move anything, so we
 854                          * didn't delete anything and therefore we don't have to change
 855                          * 'omaxoffnum'.
 856                          */
 857                         Assert(bucket == obucket);
 858                         ooffnum = OffsetNumberNext(ooffnum);
 859                 }
 860         }
 861
 862         /*
 863          * We're at the end of the old bucket chain, so we're done partitioning
 864          * the tuples.  Before quitting, call _hash_squeezebucket to ensure the
 865          * tuples remaining in the old bucket (including the overflow pages) are
 866          * packed as tightly as possible.  The new bucket is already tight.
 867          */
 868         _hash_wrtbuf(rel, obuf);
 869         _hash_wrtbuf(rel, nbuf);
 870
 871         _hash_squeezebucket(rel, obucket, start_oblkno, NULL);
 872 }