/*-------------------------------------------------------------------------
 *
 * hashpage.c
 *	  Hash table page management code for the Postgres hash access method
 *
 * Portions Copyright (c) 1996-2004, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.45 2004/08/29 04:12:18 momjian Exp $
 *
 * NOTES
 *	  Postgres hash pages look like ordinary relation pages.  The opaque
 *	  data at high addresses includes information about the page including
 *	  whether a page is an overflow page or a true bucket, the bucket
 *	  number, and the block numbers of the preceding and following pages
 *	  in the same bucket.
 *
 *	  The first page in a hash relation, page zero, is special -- it stores
 *	  information describing the hash table; it is referred to as the
 *	  "meta page."  Pages one and higher store the actual data.
 *
 *	  There are also bitmap pages, which are not manipulated here;
 *	  see hashovfl.c.
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/genam.h"
#include "access/hash.h"
#include "storage/lmgr.h"
#include "utils/lsyscache.h"


static void _hash_splitbucket(Relation rel, Buffer metabuf,
                  Bucket obucket, Bucket nbucket,
                  BlockNumber start_oblkno,
                  BlockNumber start_nblkno,
                  uint32 maxbucket,
                  uint32 highmask, uint32 lowmask);
/*
 * We use high-concurrency locking on hash indexes (see README for an overview
 * of the locking rules).  However, we can skip taking lmgr locks when the
 * index is local to the current backend (ie, either temp or new in the
 * current transaction).  No one else can see it, so there's no reason to
 * take locks.  We still take buffer-level locks, but not lmgr locks.
 */
#define USELOCKING(rel)		(!RELATION_IS_LOCAL(rel))
/*
 * _hash_getlock() -- Acquire an lmgr lock.
 *
 * 'whichlock' should be zero to acquire the split-control lock, or the
 * block number of a bucket's primary bucket page to acquire the per-bucket
 * lock.  (See README for details of the use of these locks.)
 *
 * 'access' must be HASH_SHARE or HASH_EXCLUSIVE.
 */
void
_hash_getlock(Relation rel, BlockNumber whichlock, int access)
{
    if (USELOCKING(rel))
        LockPage(rel, whichlock, access);
}
/*
 * _hash_try_getlock() -- Acquire an lmgr lock, but only if it's free.
 *
 * Same as above except we return FALSE without blocking if lock isn't free.
 */
bool
_hash_try_getlock(Relation rel, BlockNumber whichlock, int access)
{
    if (USELOCKING(rel))
        return ConditionalLockPage(rel, whichlock, access);
    else
        return true;
}
/*
 * _hash_droplock() -- Release an lmgr lock.
 */
void
_hash_droplock(Relation rel, BlockNumber whichlock, int access)
{
    if (USELOCKING(rel))
        UnlockPage(rel, whichlock, access);
}
/*
 * _hash_getbuf() -- Get a buffer by block number for read or write.
 *
 * 'access' must be HASH_READ, HASH_WRITE, or HASH_NOLOCK.
 *
 * When this routine returns, the appropriate lock is set on the
 * requested buffer and its reference count has been incremented
 * (ie, the buffer is "locked and pinned").
 *
 * XXX P_NEW is not used because, unlike the tree structures, we
 * need the bucket blocks to be at certain block numbers.  We must
 * depend on the caller to call _hash_pageinit on the block if it
 * knows that this is a new block.
 */
Buffer
_hash_getbuf(Relation rel, BlockNumber blkno, int access)
{
    Buffer      buf;

    if (blkno == P_NEW)
        elog(ERROR, "hash AM does not use P_NEW");

    buf = ReadBuffer(rel, blkno);

    if (access != HASH_NOLOCK)
        LockBuffer(buf, access);

    /* ref count and lock type are correct */
    return buf;
}
/*
 * _hash_relbuf() -- release a locked buffer.
 *
 * Lock and pin (refcount) are both dropped.  Note that either read or
 * write lock can be dropped this way, but if we modified the buffer,
 * this is NOT the right way to release a write lock.
 */
void
_hash_relbuf(Relation rel, Buffer buf)
{
    LockBuffer(buf, BUFFER_LOCK_UNLOCK);
    ReleaseBuffer(buf);
}
/*
 * _hash_dropbuf() -- release an unlocked buffer.
 *
 * This is used to unpin a buffer on which we hold no lock.  It is assumed
 * that the buffer is not dirty.
 */
void
_hash_dropbuf(Relation rel, Buffer buf)
{
    ReleaseBuffer(buf);
}
/*
 * _hash_wrtbuf() -- write a hash page to disk.
 *
 * This routine releases the lock held on the buffer and our refcount
 * for it.  It is an error to call _hash_wrtbuf() without a write lock
 * and a pin on the buffer.
 *
 * NOTE: actually, the buffer manager just marks the shared buffer page
 * dirty here; the real I/O happens later.  This is okay since we are not
 * relying on write ordering anyway.  The WAL mechanism is responsible for
 * guaranteeing correctness after a crash.
 */
void
_hash_wrtbuf(Relation rel, Buffer buf)
{
    LockBuffer(buf, BUFFER_LOCK_UNLOCK);
    WriteBuffer(buf);
}
/*
 * _hash_wrtnorelbuf() -- write a hash page to disk, but do not release
 *                        our reference or lock.
 *
 * It is an error to call _hash_wrtnorelbuf() without a write lock
 * and a pin on the buffer.
 */
void
_hash_wrtnorelbuf(Relation rel, Buffer buf)
{
    WriteNoReleaseBuffer(buf);
}
/*
 * _hash_chgbufaccess() -- Change the lock type on a buffer, without
 *                         dropping our pin on it.
 *
 * from_access and to_access may be HASH_READ, HASH_WRITE, or HASH_NOLOCK,
 * the last indicating that no buffer-level lock is held or wanted.
 *
 * When from_access == HASH_WRITE, we assume the buffer is dirty and tell
 * bufmgr it must be written out.  If the caller wants to release a write
 * lock on a page that's not been modified, it's okay to pass from_access
 * as HASH_READ (a bit ugly, but handy in some places).
 */
void
_hash_chgbufaccess(Relation rel,
                   Buffer buf,
                   int from_access,
                   int to_access)
{
    if (from_access != HASH_NOLOCK)
        LockBuffer(buf, BUFFER_LOCK_UNLOCK);
    if (from_access == HASH_WRITE)
        WriteNoReleaseBuffer(buf);

    if (to_access != HASH_NOLOCK)
        LockBuffer(buf, to_access);
}
/*
 * _hash_metapinit() -- Initialize the metadata page of a hash index,
 *                      the two buckets that we begin with and the initial
 *                      bitmap page.
 *
 * We are fairly cavalier about locking here, since we know that no one else
 * could be accessing this index.  In particular the rule about not holding
 * multiple buffer locks is ignored.
 */
void
_hash_metapinit(Relation rel)
{
    HashMetaPage metap;
    HashPageOpaque pageopaque;
    Buffer      metabuf;
    Buffer      buf;
    Page        pg;
    int32       data_width;
    int32       item_width;
    int32       ffactor;
    uint16      i;

    /* safety check */
    if (RelationGetNumberOfBlocks(rel) != 0)
        elog(ERROR, "cannot initialize non-empty hash index \"%s\"",
             RelationGetRelationName(rel));
    /*
     * Determine the target fill factor (tuples per bucket) for this index.
     * The idea is to make the fill factor correspond to pages about 3/4ths
     * full.  We can compute it exactly if the index datatype is fixed-width,
     * but for var-width there's some guessing involved.
     */
    data_width = get_typavgwidth(RelationGetDescr(rel)->attrs[0]->atttypid,
                                 RelationGetDescr(rel)->attrs[0]->atttypmod);
    item_width = MAXALIGN(sizeof(HashItemData)) + MAXALIGN(data_width) +
        sizeof(ItemIdData);     /* include the line pointer */
    ffactor = (BLCKSZ * 3 / 4) / item_width;
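    /*
     * (For instance, with the default 8K block size and a fixed-width
     * 4-byte key, item_width works out to a few dozen bytes and ffactor
     * lands in the low hundreds of tuples per bucket; the exact value
     * depends on MAXALIGN for the platform.)
     */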
    /* keep to a sane range */
    if (ffactor < 10)
        ffactor = 10;
    metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE);
    pg = BufferGetPage(metabuf);
    _hash_pageinit(pg, BufferGetPageSize(metabuf));

    pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg);
    pageopaque->hasho_prevblkno = InvalidBlockNumber;
    pageopaque->hasho_nextblkno = InvalidBlockNumber;
    pageopaque->hasho_bucket = -1;
    pageopaque->hasho_flag = LH_META_PAGE;
    pageopaque->hasho_filler = HASHO_FILL;
    metap = (HashMetaPage) pg;

    metap->hashm_magic = HASH_MAGIC;
    metap->hashm_version = HASH_VERSION;
    metap->hashm_ntuples = 0;
    metap->hashm_nmaps = 0;
    metap->hashm_ffactor = ffactor;
    metap->hashm_bsize = BufferGetPageSize(metabuf);
    /* find largest bitmap array size that will fit in page size */
    for (i = _hash_log2(metap->hashm_bsize); i > 0; --i)
    {
        if ((1 << i) <= (metap->hashm_bsize -
                         (MAXALIGN(sizeof(PageHeaderData)) +
                          MAXALIGN(sizeof(HashPageOpaqueData)))))
            break;
    }
    Assert(i > 0);
    metap->hashm_bmsize = 1 << i;
    metap->hashm_bmshift = i + BYTE_TO_BIT;
    Assert((1 << BMPG_SHIFT(metap)) == (BMPG_MASK(metap) + 1));
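    /*
     * (With the default 8K block size this loop typically settles on a
     * 4K-byte bitmap array, i.e. roughly 32768 overflow-page bits per
     * bitmap page.)
     */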
    metap->hashm_procid = index_getprocid(rel, 1, HASHPROC);

    /*
     * We initialize the index with two buckets, 0 and 1, occupying physical
     * blocks 1 and 2.  The first freespace bitmap page is in block 3.
     */
    metap->hashm_maxbucket = metap->hashm_lowmask = 1; /* nbuckets - 1 */
    metap->hashm_highmask = 3;  /* (nbuckets << 1) - 1 */

    MemSet((char *) metap->hashm_spares, 0, sizeof(metap->hashm_spares));
    MemSet((char *) metap->hashm_mapp, 0, sizeof(metap->hashm_mapp));

    metap->hashm_spares[1] = 1; /* the first bitmap page is only spare */
    metap->hashm_ovflpoint = 1;
    metap->hashm_firstfree = 0;
    /*
     * Initialize the first two buckets
     */
    for (i = 0; i <= 1; i++)
    {
        buf = _hash_getbuf(rel, BUCKET_TO_BLKNO(metap, i), HASH_WRITE);
        pg = BufferGetPage(buf);
        _hash_pageinit(pg, BufferGetPageSize(buf));
        pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg);
        pageopaque->hasho_prevblkno = InvalidBlockNumber;
        pageopaque->hasho_nextblkno = InvalidBlockNumber;
        pageopaque->hasho_bucket = i;
        pageopaque->hasho_flag = LH_BUCKET_PAGE;
        pageopaque->hasho_filler = HASHO_FILL;
        _hash_wrtbuf(rel, buf);
    }
    /*
     * Initialize first bitmap page.  Can't do this until we
     * create the first two buckets, else smgr will complain.
     */
    _hash_initbitmap(rel, metap, 3);

    /* all done */
    _hash_wrtbuf(rel, metabuf);
}
/*
 * _hash_pageinit() -- Initialize a new hash index page.
 */
void
_hash_pageinit(Page page, Size size)
{
    Assert(PageIsNew(page));
    PageInit(page, size, sizeof(HashPageOpaqueData));
}
/*
 * Attempt to expand the hash table by creating one new bucket.
 *
 * This will silently do nothing if it cannot get the needed locks.
 *
 * The caller should hold no locks on the hash index.
 *
 * The caller must hold a pin, but no lock, on the metapage buffer.
 * The buffer is returned in the same state.
 */
void
_hash_expandtable(Relation rel, Buffer metabuf)
{
    HashMetaPage metap;
    Bucket      old_bucket;
    Bucket      new_bucket;
    uint32      spare_ndx;
    BlockNumber start_oblkno;
    BlockNumber start_nblkno;
    uint32      maxbucket;
    uint32      highmask;
    uint32      lowmask;
    /*
     * Obtain the page-zero lock to assert the right to begin a split (see
     * README).
     *
     * Note: deadlock should be impossible here.  Our own backend could only
     * be holding bucket sharelocks due to stopped indexscans; those will not
     * block other holders of the page-zero lock, who are only interested in
     * acquiring bucket sharelocks themselves.  Exclusive bucket locks are
     * only taken here and in hashbulkdelete, and neither of these operations
     * needs any additional locks to complete.  (If, due to some flaw in this
     * reasoning, we manage to deadlock anyway, it's okay to error out; the
     * index will be left in a consistent state.)
     */
    _hash_getlock(rel, 0, HASH_EXCLUSIVE);

    /* Write-lock the meta page */
    _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);

    metap = (HashMetaPage) BufferGetPage(metabuf);
    _hash_checkpage(rel, (Page) metap, LH_META_PAGE);

    /*
     * Check to see if split is still needed; someone else might have already
     * done one while we waited for the lock.
     *
     * Make sure this stays in sync with _hash_doinsert()
     */
    if (metap->hashm_ntuples <=
        (double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1))
        goto fail;
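    /*
     * (As an illustration of the check above: with ffactor = 300 and 5
     * buckets, a split is attempted only once the index is estimated to
     * hold more than 1500 tuples.)
     */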
    /*
     * Determine which bucket is to be split, and attempt to lock the old
     * bucket.  If we can't get the lock, give up.
     *
     * The lock protects us against other backends, but not against our own
     * backend.  Must check for active scans separately.
     *
     * Ideally we would lock the new bucket too before proceeding, but if
     * we are about to cross a splitpoint then the BUCKET_TO_BLKNO mapping
     * isn't correct yet.  For simplicity we update the metapage first and
     * then lock.  This should be okay because no one else should be trying
     * to lock the new bucket yet...
     */
    new_bucket = metap->hashm_maxbucket + 1;
    old_bucket = (new_bucket & metap->hashm_lowmask);
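    /*
     * (For example, if the table currently has 5 buckets (maxbucket = 4,
     * lowmask = 3), the new bucket is 5 and the bucket being split is
     * 5 & 3 = 1: tuples now in bucket 1 whose hash values map to 5 under
     * the enlarged mask will be moved to the new bucket.)
     */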
    start_oblkno = BUCKET_TO_BLKNO(metap, old_bucket);

    if (_hash_has_active_scan(rel, old_bucket))
        goto fail;

    if (!_hash_try_getlock(rel, start_oblkno, HASH_EXCLUSIVE))
        goto fail;
    /*
     * Okay to proceed with split.  Update the metapage bucket mapping info.
     */
    metap->hashm_maxbucket = new_bucket;

    if (new_bucket > metap->hashm_highmask)
    {
        /* Starting a new doubling */
        metap->hashm_lowmask = metap->hashm_highmask;
        metap->hashm_highmask = new_bucket | metap->hashm_lowmask;
    }
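    /*
     * (For example, adding bucket 4 to a 4-bucket table starts a new
     * doubling: lowmask goes from 1 to 3 and highmask from 3 to 7, so hash
     * values are now examined modulo 8 instead of modulo 4.)
     */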
    /*
     * If the split point is increasing (hashm_maxbucket's log base 2
     * increases), we need to adjust the hashm_spares[] array and
     * hashm_ovflpoint so that future overflow pages will be created beyond
     * this new batch of bucket pages.
     *
     * XXX should initialize new bucket pages to prevent out-of-order
     * page creation?  Don't wanna do it right here though.
     */
    spare_ndx = _hash_log2(metap->hashm_maxbucket + 1);
    if (spare_ndx > metap->hashm_ovflpoint)
    {
        Assert(spare_ndx == metap->hashm_ovflpoint + 1);
        metap->hashm_spares[spare_ndx] = metap->hashm_spares[metap->hashm_ovflpoint];
        metap->hashm_ovflpoint = spare_ndx;
    }
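    /*
     * (Carrying the previous splitpoint's spare count into the new slot
     * means BUCKET_TO_BLKNO places the new batch of bucket pages right
     * after everything allocated so far, while overflow pages allocated
     * from here on are counted against the new splitpoint.)
     */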
    /* now we can compute the new bucket's primary block number */
    start_nblkno = BUCKET_TO_BLKNO(metap, new_bucket);

    Assert(!_hash_has_active_scan(rel, new_bucket));

    if (!_hash_try_getlock(rel, start_nblkno, HASH_EXCLUSIVE))
        elog(PANIC, "could not get lock on supposedly new bucket");
    /*
     * Copy bucket mapping info now; this saves re-accessing the meta page
     * inside _hash_splitbucket's inner loop.  Note that once we drop the
     * split lock, other splits could begin, so these values might be out of
     * date before _hash_splitbucket finishes.  That's okay, since all it
     * needs is to tell which of these two buckets to map hashkeys into.
     */
    maxbucket = metap->hashm_maxbucket;
    highmask = metap->hashm_highmask;
    lowmask = metap->hashm_lowmask;

    /* Write out the metapage and drop lock, but keep pin */
    _hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK);

    /* Release split lock; okay for other splits to occur now */
    _hash_droplock(rel, 0, HASH_EXCLUSIVE);

    /* Relocate records to the new bucket */
    _hash_splitbucket(rel, metabuf, old_bucket, new_bucket,
                      start_oblkno, start_nblkno,
                      maxbucket, highmask, lowmask);

    /* Release bucket locks, allowing others to access them */
    _hash_droplock(rel, start_oblkno, HASH_EXCLUSIVE);
    _hash_droplock(rel, start_nblkno, HASH_EXCLUSIVE);

    return;
    /* Here if decide not to split or fail to acquire old bucket lock */
fail:

    /* We didn't write the metapage, so just drop lock */
    _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);

    /* Release split lock */
    _hash_droplock(rel, 0, HASH_EXCLUSIVE);
}
/*
 * _hash_splitbucket -- split 'obucket' into 'obucket' and 'nbucket'
 *
 * We are splitting a bucket that consists of a base bucket page and zero
 * or more overflow (bucket chain) pages.  We must relocate tuples that
 * belong in the new bucket, and compress out any free space in the old
 * bucket.
 *
 * The caller must hold exclusive locks on both buckets to ensure that
 * no one else is trying to access them (see README).
 *
 * The caller must hold a pin, but no lock, on the metapage buffer.
 * The buffer is returned in the same state.  (The metapage is only
 * touched if it becomes necessary to add or remove overflow pages.)
 */
static void
_hash_splitbucket(Relation rel,
                  Buffer metabuf,
                  Bucket obucket,
                  Bucket nbucket,
                  BlockNumber start_oblkno,
                  BlockNumber start_nblkno,
                  uint32 maxbucket,
                  uint32 highmask,
                  uint32 lowmask)
{
    Bucket      bucket;
    Buffer      obuf;
    Buffer      nbuf;
    BlockNumber oblkno;
    BlockNumber nblkno;
    bool        null;
    Datum       datum;
    HashItem    hitem;
    HashPageOpaque oopaque;
    HashPageOpaque nopaque;
    IndexTuple  itup;
    Size        itemsz;
    OffsetNumber ooffnum;
    OffsetNumber noffnum;
    OffsetNumber omaxoffnum;
    Page        opage;
    Page        npage;
    TupleDesc   itupdesc = RelationGetDescr(rel);
    /*
     * It should be okay to simultaneously write-lock pages from each
     * bucket, since no one else can be trying to acquire buffer lock
     * on pages of either bucket.
     */
    oblkno = start_oblkno;
    nblkno = start_nblkno;
    obuf = _hash_getbuf(rel, oblkno, HASH_WRITE);
    nbuf = _hash_getbuf(rel, nblkno, HASH_WRITE);
    opage = BufferGetPage(obuf);
    npage = BufferGetPage(nbuf);

    _hash_checkpage(rel, opage, LH_BUCKET_PAGE);
    oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);

    /* initialize the new bucket's primary page */
    _hash_pageinit(npage, BufferGetPageSize(nbuf));
    nopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
    nopaque->hasho_prevblkno = InvalidBlockNumber;
    nopaque->hasho_nextblkno = InvalidBlockNumber;
    nopaque->hasho_bucket = nbucket;
    nopaque->hasho_flag = LH_BUCKET_PAGE;
    nopaque->hasho_filler = HASHO_FILL;
    /*
     * Partition the tuples in the old bucket between the old bucket and the
     * new bucket, advancing along the old bucket's overflow bucket chain
     * and adding overflow pages to the new bucket as needed.
     */
    ooffnum = FirstOffsetNumber;
    omaxoffnum = PageGetMaxOffsetNumber(opage);
    for (;;)
    {
        /*
         * at each iteration through this loop, each of these variables
         * should be up-to-date: obuf opage oopaque ooffnum omaxoffnum
         */

        /* check if we're at the end of the page */
        if (ooffnum > omaxoffnum)
        {
            /* at end of page, but check for an(other) overflow page */
            oblkno = oopaque->hasho_nextblkno;
            if (!BlockNumberIsValid(oblkno))
                break;

            /*
             * we ran out of tuples on this particular page, but we
             * have more overflow pages; advance to next page.
             */
            _hash_wrtbuf(rel, obuf);

            obuf = _hash_getbuf(rel, oblkno, HASH_WRITE);
            opage = BufferGetPage(obuf);
            _hash_checkpage(rel, opage, LH_OVERFLOW_PAGE);
            oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
            ooffnum = FirstOffsetNumber;
            omaxoffnum = PageGetMaxOffsetNumber(opage);
            continue;
        }
        /*
         * Re-hash the tuple to determine which bucket it now belongs in.
         *
         * It is annoying to call the hash function while holding locks,
         * but releasing and relocking the page for each tuple is unappealing.
         */
        hitem = (HashItem) PageGetItem(opage, PageGetItemId(opage, ooffnum));
        itup = &(hitem->hash_itup);
        datum = index_getattr(itup, 1, itupdesc, &null);
        Assert(!null);

        bucket = _hash_hashkey2bucket(_hash_datum2hashkey(rel, datum),
                                      maxbucket, highmask, lowmask);
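        /*
         * (_hash_hashkey2bucket masks the hash value with highmask and, if
         * the result is a bucket that doesn't exist yet, falls back to
         * lowmask; so every tuple examined here re-maps to either obucket
         * or nbucket.)
         */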
        if (bucket == nbucket)
        {
            /*
             * insert the tuple into the new bucket.  if it doesn't fit on
             * the current page in the new bucket, we must allocate a new
             * overflow page and place the tuple on that page instead.
             */
            itemsz = IndexTupleDSize(hitem->hash_itup)
                + (sizeof(HashItemData) - sizeof(IndexTupleData));

            itemsz = MAXALIGN(itemsz);

            if (PageGetFreeSpace(npage) < itemsz)
            {
                /* write out nbuf and drop lock, but keep pin */
                _hash_chgbufaccess(rel, nbuf, HASH_WRITE, HASH_NOLOCK);
                /* chain to a new overflow page */
                nbuf = _hash_addovflpage(rel, metabuf, nbuf);
                npage = BufferGetPage(nbuf);
                _hash_checkpage(rel, npage, LH_OVERFLOW_PAGE);
                /* we don't need nopaque within the loop */
            }

            noffnum = OffsetNumberNext(PageGetMaxOffsetNumber(npage));
            if (PageAddItem(npage, (Item) hitem, itemsz, noffnum, LP_USED)
                == InvalidOffsetNumber)
                elog(ERROR, "failed to add index item to \"%s\"",
                     RelationGetRelationName(rel));
            /*
             * now delete the tuple from the old bucket.  after this
             * section of code, 'ooffnum' will actually point to the
             * ItemId to which we would point if we had advanced it before
             * the deletion (PageIndexTupleDelete repacks the ItemId
             * array).  this also means that 'omaxoffnum' is exactly one
             * less than it used to be, so we really can just decrement it
             * instead of calling PageGetMaxOffsetNumber.
             */
            PageIndexTupleDelete(opage, ooffnum);
            omaxoffnum = OffsetNumberPrev(omaxoffnum);
        }
        else
        {
            /*
             * the tuple stays on this page.  we didn't move anything, so
             * we didn't delete anything and therefore we don't have to
             * change 'omaxoffnum'.
             */
            Assert(bucket == obucket);
            ooffnum = OffsetNumberNext(ooffnum);
        }
    }
    /*
     * We're at the end of the old bucket chain, so we're done partitioning
     * the tuples.  Before quitting, call _hash_squeezebucket to ensure the
     * tuples remaining in the old bucket (including the overflow pages) are
     * packed as tightly as possible.  The new bucket is already tight.
     */
    _hash_wrtbuf(rel, obuf);
    _hash_wrtbuf(rel, nbuf);

    _hash_squeezebucket(rel, obucket, start_oblkno);
}