granicus.if.org Git - postgresql/blob - src/backend/access/hash/hashpage.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * hashpage.c--
   4  *        Hash table page management code for the Postgres hash access method
   5  *
   6  * Copyright (c) 1994, Regents of the University of California
   7  *
   8  *
   9  * IDENTIFICATION
  10  *        $Header: /cvsroot/pgsql/src/backend/access/hash/hashpage.c,v 1.10 1997/09/07 04:38:00 momjian Exp $
  11  *
  12  * NOTES
  13  *        Postgres hash pages look like ordinary relation pages.  The opaque
  14  *        data at high addresses includes information about the page including
  15  *        whether a page is an overflow page or a true bucket, the block
  16  *        numbers of the preceding and following pages, and the overflow
  17  *        address of the page if it is an overflow page.
  18  *
  19  *        The first page in a hash relation, page zero, is special -- it stores
  20  *        information describing the hash table; it is referred to as the
  21  *        "meta page." Pages one and higher store the actual data.
  22  *
  23  *-------------------------------------------------------------------------
  24  */
  25
  26 #include <postgres.h>
  27
  28 #include <access/hash.h>
  29 #include <storage/bufmgr.h>
  30 #include <miscadmin.h>
  31 #include <utils/memutils.h>
  32 #include <storage/lmgr.h>
  33 #include <access/genam.h>
  34
  35 #ifndef HAVE_MEMMOVE
  36 #include <regex/utils.h>
  37 #else
  38 #include <string.h>
  39 #endif
  40
/* Take the page-level read or write lock for block 'blkno' (does nothing unless USELOCKING is true). */
static void             _hash_setpagelock(Relation rel, BlockNumber blkno, int access);
/* Drop the page-level read or write lock for block 'blkno' (does nothing unless USELOCKING is true). */
static void             _hash_unsetpagelock(Relation rel, BlockNumber blkno, int access);
/* Split bucket 'obucket' (base page plus overflow chain) into 'obucket' and 'nbucket'. */
static void             _hash_splitpage(Relation rel, Buffer metabuf, Bucket obucket, Bucket nbucket);
  44
/*
 *      We use high-concurrency locking on hash indices.  There are two cases
 *      in which we don't do locking:
 *
 *      1. While we're building the index.  Since the creating transaction
 *         has not committed, no one can see the index, and there's no
 *         reason to share locks.
 *
 *      2. While we're just starting up the database system.  We use some
 *         special-purpose initialization code in the relation cache manager
 *         (see utils/cache/relcache.c) to allow us to do indexed scans on
 *         the system catalogs before we'd normally be able to.  This happens
 *         before the lock table is fully initialized, so we can't use it.
 *         Strictly speaking, this violates two-phase locking (2PL), but we
 *         don't do 2PL on the system catalogs anyway.
 *
 *      USELOCKING is true only when neither case applies.
 */


#define USELOCKING              (!BuildingHash && !IsInitProcessingMode())
  61
  62
  63 /*
  64  *      _hash_metapinit() -- Initialize the metadata page of a hash index,
  65  *                              the two buckets that we begin with and the initial
  66  *                              bitmap page.
  67  */
  68 void
  69 _hash_metapinit(Relation rel)
  70 {
  71         HashMetaPage    metap;
  72         HashPageOpaque  pageopaque;
  73         Buffer                  metabuf;
  74         Buffer                  buf;
  75         Page                    pg;
  76         int                             nbuckets;
  77         uint32                  nelem;          /* number elements */
  78         uint32                  lg2nelem;       /* _hash_log2(nelem)   */
  79         uint32                  nblocks;
  80         uint16                  i;
  81
  82         /* can't be sharing this with anyone, now... */
  83         if (USELOCKING)
  84                 RelationSetLockForWrite(rel);
  85
  86         if ((nblocks = RelationGetNumberOfBlocks(rel)) != 0)
  87         {
  88                 elog(WARN, "Cannot initialize non-empty hash table %s",
  89                          RelationGetRelationName(rel));
  90         }
  91
  92         metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE);
  93         pg = BufferGetPage(metabuf);
  94         metap = (HashMetaPage) pg;
  95         _hash_pageinit(pg, BufferGetPageSize(metabuf));
  96
  97         metap->hashm_magic = HASH_MAGIC;
  98         metap->hashm_version = HASH_VERSION;
  99         metap->hashm_nkeys = 0;
 100         metap->hashm_nmaps = 0;
 101         metap->hashm_ffactor = DEFAULT_FFACTOR;
 102         metap->hashm_bsize = BufferGetPageSize(metabuf);
 103         metap->hashm_bshift = _hash_log2(metap->hashm_bsize);
 104         for (i = metap->hashm_bshift; i > 0; --i)
 105         {
 106                 if ((1 << i) < (metap->hashm_bsize -
 107                                                 (DOUBLEALIGN(sizeof(PageHeaderData)) +
 108                                                  DOUBLEALIGN(sizeof(HashPageOpaqueData)))))
 109                 {
 110                         break;
 111                 }
 112         }
 113         Assert(i);
 114         metap->hashm_bmsize = 1 << i;
 115         metap->hashm_procid = index_getprocid(rel, 1, HASHPROC);
 116
 117         /*
 118          * Make nelem = 2 rather than 0 so that we end up allocating space for
 119          * the next greater power of two number of buckets.
 120          */
 121         nelem = 2;
 122         lg2nelem = 1;                           /* _hash_log2(MAX(nelem, 2)) */
 123         nbuckets = 2;                           /* 1 << lg2nelem */
 124
 125         memset((char *) metap->hashm_spares, 0, sizeof(metap->hashm_spares));
 126         memset((char *) metap->hashm_mapp, 0, sizeof(metap->hashm_mapp));
 127
 128         metap->hashm_spares[lg2nelem] = 2;      /* lg2nelem + 1 */
 129         metap->hashm_spares[lg2nelem + 1] = 2;          /* lg2nelem + 1 */
 130         metap->hashm_ovflpoint = 1; /* lg2nelem */
 131         metap->hashm_lastfreed = 2;
 132
 133         metap->hashm_maxbucket = metap->hashm_lowmask = 1;      /* nbuckets - 1 */
 134         metap->hashm_highmask = 3;      /* (nbuckets << 1) - 1 */
 135
 136         pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg);
 137         pageopaque->hasho_oaddr = InvalidOvflAddress;
 138         pageopaque->hasho_prevblkno = InvalidBlockNumber;
 139         pageopaque->hasho_nextblkno = InvalidBlockNumber;
 140         pageopaque->hasho_flag = LH_META_PAGE;
 141         pageopaque->hasho_bucket = -1;
 142
 143         /*
 144          * First bitmap page is at: splitpoint lg2nelem page offset 1 which
 145          * turns out to be page 3. Couldn't initialize page 3  until we
 146          * created the first two buckets above.
 147          */
 148         if (_hash_initbitmap(rel, metap, OADDR_OF(lg2nelem, 1), lg2nelem + 1, 0))
 149                 elog(WARN, "Problem with _hash_initbitmap.");
 150
 151         /* all done */
 152         _hash_wrtnorelbuf(rel, metabuf);
 153
 154         /*
 155          * initialize the first two buckets
 156          */
 157         for (i = 0; i <= 1; i++)
 158         {
 159                 buf = _hash_getbuf(rel, BUCKET_TO_BLKNO(i), HASH_WRITE);
 160                 pg = BufferGetPage(buf);
 161                 _hash_pageinit(pg, BufferGetPageSize(buf));
 162                 pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg);
 163                 pageopaque->hasho_oaddr = InvalidOvflAddress;
 164                 pageopaque->hasho_prevblkno = InvalidBlockNumber;
 165                 pageopaque->hasho_nextblkno = InvalidBlockNumber;
 166                 pageopaque->hasho_flag = LH_BUCKET_PAGE;
 167                 pageopaque->hasho_bucket = i;
 168                 _hash_wrtbuf(rel, buf);
 169         }
 170
 171         _hash_relbuf(rel, metabuf, HASH_WRITE);
 172
 173         if (USELOCKING)
 174                 RelationUnsetLockForWrite(rel);
 175 }
 176
 177 /*
 178  *      _hash_getbuf() -- Get a buffer by block number for read or write.
 179  *
 180  *              When this routine returns, the appropriate lock is set on the
 181  *              requested buffer its reference count is correct.
 182  *
 183  *              XXX P_NEW is not used because, unlike the tree structures, we
 184  *              need the bucket blocks to be at certain block numbers.  we must
 185  *              depend on the caller to call _hash_pageinit on the block if it
 186  *              knows that this is a new block.
 187  */
 188 Buffer
 189 _hash_getbuf(Relation rel, BlockNumber blkno, int access)
 190 {
 191         Buffer                  buf;
 192
 193         if (blkno == P_NEW)
 194         {
 195                 elog(WARN, "_hash_getbuf: internal error: hash AM does not use P_NEW");
 196         }
 197         switch (access)
 198         {
 199         case HASH_WRITE:
 200         case HASH_READ:
 201                 _hash_setpagelock(rel, blkno, access);
 202                 break;
 203         default:
 204                 elog(WARN, "_hash_getbuf: invalid access (%d) on new blk: %s",
 205                          access, RelationGetRelationName(rel));
 206                 break;
 207         }
 208         buf = ReadBuffer(rel, blkno);
 209
 210         /* ref count and lock type are correct */
 211         return (buf);
 212 }
 213
 214 /*
 215  *      _hash_relbuf() -- release a locked buffer.
 216  */
 217 void
 218 _hash_relbuf(Relation rel, Buffer buf, int access)
 219 {
 220         BlockNumber             blkno;
 221
 222         blkno = BufferGetBlockNumber(buf);
 223
 224         switch (access)
 225         {
 226         case HASH_WRITE:
 227         case HASH_READ:
 228                 _hash_unsetpagelock(rel, blkno, access);
 229                 break;
 230         default:
 231                 elog(WARN, "_hash_relbuf: invalid access (%d) on blk %x: %s",
 232                          access, blkno, RelationGetRelationName(rel));
 233         }
 234
 235         ReleaseBuffer(buf);
 236 }
 237
 238 /*
 239  *      _hash_wrtbuf() -- write a hash page to disk.
 240  *
 241  *              This routine releases the lock held on the buffer and our reference
 242  *              to it.  It is an error to call _hash_wrtbuf() without a write lock
 243  *              or a reference to the buffer.
 244  */
 245 void
 246 _hash_wrtbuf(Relation rel, Buffer buf)
 247 {
 248         BlockNumber             blkno;
 249
 250         blkno = BufferGetBlockNumber(buf);
 251         WriteBuffer(buf);
 252         _hash_unsetpagelock(rel, blkno, HASH_WRITE);
 253 }
 254
 255 /*
 256  *      _hash_wrtnorelbuf() -- write a hash page to disk, but do not release
 257  *                                               our reference or lock.
 258  *
 259  *              It is an error to call _hash_wrtnorelbuf() without a write lock
 260  *              or a reference to the buffer.
 261  */
 262 void
 263 _hash_wrtnorelbuf(Relation rel, Buffer buf)
 264 {
 265         BlockNumber             blkno;
 266
 267         blkno = BufferGetBlockNumber(buf);
 268         WriteNoReleaseBuffer(buf);
 269 }
 270
 271 Page
 272 _hash_chgbufaccess(Relation rel,
 273                                    Buffer * bufp,
 274                                    int from_access,
 275                                    int to_access)
 276 {
 277         BlockNumber             blkno;
 278
 279         blkno = BufferGetBlockNumber(*bufp);
 280
 281         switch (from_access)
 282         {
 283         case HASH_WRITE:
 284                 _hash_wrtbuf(rel, *bufp);
 285                 break;
 286         case HASH_READ:
 287                 _hash_relbuf(rel, *bufp, from_access);
 288                 break;
 289         default:
 290                 elog(WARN, "_hash_chgbufaccess: invalid access (%d) on blk %x: %s",
 291                          from_access, blkno, RelationGetRelationName(rel));
 292                 break;
 293         }
 294         *bufp = _hash_getbuf(rel, blkno, to_access);
 295         return (BufferGetPage(*bufp));
 296 }
 297
 298 /*
 299  *      _hash_pageinit() -- Initialize a new page.
 300  */
 301 void
 302 _hash_pageinit(Page page, Size size)
 303 {
 304         Assert(((PageHeader) page)->pd_lower == 0);
 305         Assert(((PageHeader) page)->pd_upper == 0);
 306         Assert(((PageHeader) page)->pd_special == 0);
 307
 308         /*
 309          * Cargo-cult programming -- don't really need this to be zero, but
 310          * creating new pages is an infrequent occurrence and it makes me feel
 311          * good when I know they're empty.
 312          */
 313         memset(page, 0, size);
 314
 315         PageInit(page, size, sizeof(HashPageOpaqueData));
 316 }
 317
 318 static void
 319 _hash_setpagelock(Relation rel,
 320                                   BlockNumber blkno,
 321                                   int access)
 322 {
 323         ItemPointerData iptr;
 324
 325         if (USELOCKING)
 326         {
 327                 ItemPointerSet(&iptr, blkno, 1);
 328
 329                 switch (access)
 330                 {
 331                 case HASH_WRITE:
 332                         RelationSetSingleWLockPage(rel, &iptr);
 333                         break;
 334                 case HASH_READ:
 335                         RelationSetSingleRLockPage(rel, &iptr);
 336                         break;
 337                 default:
 338                         elog(WARN, "_hash_setpagelock: invalid access (%d) on blk %x: %s",
 339                                  access, blkno, RelationGetRelationName(rel));
 340                         break;
 341                 }
 342         }
 343 }
 344
 345 static void
 346 _hash_unsetpagelock(Relation rel,
 347                                         BlockNumber blkno,
 348                                         int access)
 349 {
 350         ItemPointerData iptr;
 351
 352         if (USELOCKING)
 353         {
 354                 ItemPointerSet(&iptr, blkno, 1);
 355
 356                 switch (access)
 357                 {
 358                 case HASH_WRITE:
 359                         RelationUnsetSingleWLockPage(rel, &iptr);
 360                         break;
 361                 case HASH_READ:
 362                         RelationUnsetSingleRLockPage(rel, &iptr);
 363                         break;
 364                 default:
 365                         elog(WARN, "_hash_unsetpagelock: invalid access (%d) on blk %x: %s",
 366                                  access, blkno, RelationGetRelationName(rel));
 367                         break;
 368                 }
 369         }
 370 }
 371
 372 void
 373 _hash_pagedel(Relation rel, ItemPointer tid)
 374 {
 375         Buffer                  buf;
 376         Buffer                  metabuf;
 377         Page                    page;
 378         BlockNumber             blkno;
 379         OffsetNumber    offno;
 380         HashMetaPage    metap;
 381         HashPageOpaque  opaque;
 382
 383         blkno = ItemPointerGetBlockNumber(tid);
 384         offno = ItemPointerGetOffsetNumber(tid);
 385
 386         buf = _hash_getbuf(rel, blkno, HASH_WRITE);
 387         page = BufferGetPage(buf);
 388         _hash_checkpage(page, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
 389         opaque = (HashPageOpaque) PageGetSpecialPointer(page);
 390
 391         PageIndexTupleDelete(page, offno);
 392         _hash_wrtnorelbuf(rel, buf);
 393
 394         if (PageIsEmpty(page) && (opaque->hasho_flag & LH_OVERFLOW_PAGE))
 395         {
 396                 buf = _hash_freeovflpage(rel, buf);
 397                 if (BufferIsValid(buf))
 398                 {
 399                         _hash_relbuf(rel, buf, HASH_WRITE);
 400                 }
 401         }
 402         else
 403         {
 404                 _hash_relbuf(rel, buf, HASH_WRITE);
 405         }
 406
 407         metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE);
 408         metap = (HashMetaPage) BufferGetPage(metabuf);
 409         _hash_checkpage((Page) metap, LH_META_PAGE);
 410         ++metap->hashm_nkeys;
 411         _hash_wrtbuf(rel, metabuf);
 412 }
 413
 414 void
 415 _hash_expandtable(Relation rel, Buffer metabuf)
 416 {
 417         HashMetaPage    metap;
 418         Bucket                  old_bucket;
 419         Bucket                  new_bucket;
 420         uint32                  spare_ndx;
 421
 422 /*        elog(DEBUG, "_hash_expandtable: expanding..."); */
 423
 424         metap = (HashMetaPage) BufferGetPage(metabuf);
 425         _hash_checkpage((Page) metap, LH_META_PAGE);
 426
 427         metap = (HashMetaPage) _hash_chgbufaccess(rel, &metabuf, HASH_READ, HASH_WRITE);
 428         new_bucket = ++metap->MAX_BUCKET;
 429         metap = (HashMetaPage) _hash_chgbufaccess(rel, &metabuf, HASH_WRITE, HASH_READ);
 430         old_bucket = (metap->MAX_BUCKET & metap->LOW_MASK);
 431
 432         /*
 433          * If the split point is increasing (MAX_BUCKET's log base 2 *
 434          * increases), we need to copy the current contents of the spare split
 435          * bucket to the next bucket.
 436          */
 437         spare_ndx = _hash_log2(metap->MAX_BUCKET + 1);
 438         if (spare_ndx > metap->OVFL_POINT)
 439         {
 440
 441                 metap = (HashMetaPage) _hash_chgbufaccess(rel, &metabuf, HASH_READ, HASH_WRITE);
 442                 metap->SPARES[spare_ndx] = metap->SPARES[metap->OVFL_POINT];
 443                 metap->OVFL_POINT = spare_ndx;
 444                 metap = (HashMetaPage) _hash_chgbufaccess(rel, &metabuf, HASH_WRITE, HASH_READ);
 445         }
 446
 447         if (new_bucket > metap->HIGH_MASK)
 448         {
 449
 450                 /* Starting a new doubling */
 451                 metap = (HashMetaPage) _hash_chgbufaccess(rel, &metabuf, HASH_READ, HASH_WRITE);
 452                 metap->LOW_MASK = metap->HIGH_MASK;
 453                 metap->HIGH_MASK = new_bucket | metap->LOW_MASK;
 454                 metap = (HashMetaPage) _hash_chgbufaccess(rel, &metabuf, HASH_WRITE, HASH_READ);
 455
 456         }
 457         /* Relocate records to the new bucket */
 458         _hash_splitpage(rel, metabuf, old_bucket, new_bucket);
 459 }
 460
 461
/*
 * _hash_splitpage -- split 'obucket' into 'obucket' and 'nbucket'
 *
 * This routine is actually misnamed -- we are splitting a bucket that
 * consists of a base bucket page and zero or more overflow (bucket
 * chain) pages.
 *
 * Every tuple in the old bucket's chain is re-hashed with _hash_call().
 * Tuples that now hash to 'nbucket' are appended to the new bucket
 * (allocating overflow pages for it as needed) and deleted from the old
 * one; all others stay where they are.  Old overflow pages that become
 * empty are freed.  Unless the old bucket turns out to hold no tuples at
 * all, _hash_squeezebucket() is called on it before returning.
 *
 * 'metabuf' is the buffer holding the meta page.  All buffers acquired
 * here are written and/or released before returning.
 */
static void
_hash_splitpage(Relation rel,
                                Buffer metabuf,
                                Bucket obucket,
                                Bucket nbucket)
{
        Bucket                  bucket;
        Buffer                  obuf;
        Buffer                  nbuf;
        Buffer                  ovflbuf;
        BlockNumber             oblkno;
        BlockNumber             nblkno;
        bool                    null;
        Datum                   datum;
        HashItem                hitem;
        HashPageOpaque  oopaque;
        HashPageOpaque  nopaque;
        HashMetaPage    metap;
        IndexTuple              itup;
        int                             itemsz;
        OffsetNumber    ooffnum;
        OffsetNumber    noffnum;
        OffsetNumber    omaxoffnum;
        Page                    opage;
        Page                    npage;
        TupleDesc               itupdesc;

/*        elog(DEBUG, "_hash_splitpage: splitting %d into %d,%d",
                 obucket, obucket, nbucket);
*/
        metap = (HashMetaPage) BufferGetPage(metabuf);
        _hash_checkpage((Page) metap, LH_META_PAGE);

        /* get the buffers & pages of both base bucket pages, write-locked */
        oblkno = BUCKET_TO_BLKNO(obucket);
        nblkno = BUCKET_TO_BLKNO(nbucket);
        obuf = _hash_getbuf(rel, oblkno, HASH_WRITE);
        nbuf = _hash_getbuf(rel, nblkno, HASH_WRITE);
        opage = BufferGetPage(obuf);
        npage = BufferGetPage(nbuf);

        /* initialize the new bucket as an empty base bucket page */
        _hash_pageinit(npage, BufferGetPageSize(nbuf));
        nopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
        nopaque->hasho_prevblkno = InvalidBlockNumber;
        nopaque->hasho_nextblkno = InvalidBlockNumber;
        nopaque->hasho_flag = LH_BUCKET_PAGE;
        nopaque->hasho_oaddr = InvalidOvflAddress;
        nopaque->hasho_bucket = nbucket;
        _hash_wrtnorelbuf(rel, nbuf);

        /*
         * Make sure the old bucket isn't empty.  If the base page is empty,
         * step to the first overflow page in the chain, which must hold at
         * least one tuple.
         *
         * XXX we should only need this once, if we are careful to preserve the
         * invariant that overflow pages are never empty.
         */
        _hash_checkpage(opage, LH_BUCKET_PAGE);
        oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
        if (PageIsEmpty(opage))
        {
                oblkno = oopaque->hasho_nextblkno;
                _hash_relbuf(rel, obuf, HASH_WRITE);
                if (!BlockNumberIsValid(oblkno))
                {

                        /*
                         * The old bucket is completely empty; of course, the new
                         * bucket will be as well, but since it's a base bucket page
                         * we don't care.  Nothing to move, so we are done.
                         */
                        _hash_relbuf(rel, nbuf, HASH_WRITE);
                        return;
                }
                obuf = _hash_getbuf(rel, oblkno, HASH_WRITE);
                opage = BufferGetPage(obuf);
                _hash_checkpage(opage, LH_OVERFLOW_PAGE);
                if (PageIsEmpty(opage))
                {
                        /* an empty overflow page violates the invariant */
                        elog(WARN, "_hash_splitpage: empty overflow page %d", oblkno);
                }
                oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
        }

        /*
         * We are now guaranteed that 'opage' is not empty.  Partition the
         * tuples in the old bucket between the old bucket and the new bucket,
         * advancing along their respective overflow bucket chains and adding
         * overflow pages as needed.
         */
        ooffnum = FirstOffsetNumber;
        omaxoffnum = PageGetMaxOffsetNumber(opage);
        for (;;)
        {

                /*
                 * At each iteration through this loop, each of these variables
                 * should be up-to-date: obuf opage oopaque ooffnum omaxoffnum
                 */

                /* check if we're at the end of the page */
                if (ooffnum > omaxoffnum)
                {
                        /* at end of page, but check for overflow page */
                        oblkno = oopaque->hasho_nextblkno;
                        if (BlockNumberIsValid(oblkno))
                        {

                                /*
                                 * We ran out of tuples on this particular page, but we
                                 * have more overflow pages; write out the page we are
                                 * done with and re-init values for the next one.
                                 */
                                _hash_wrtbuf(rel, obuf);
                                obuf = _hash_getbuf(rel, oblkno, HASH_WRITE);
                                opage = BufferGetPage(obuf);
                                _hash_checkpage(opage, LH_OVERFLOW_PAGE);
                                oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);

                                /* we're guaranteed that an ovfl page has at least 1 tuple */
                                if (PageIsEmpty(opage))
                                {
                                        elog(WARN, "_hash_splitpage: empty ovfl page %d!",
                                                 oblkno);
                                }
                                ooffnum = FirstOffsetNumber;
                                omaxoffnum = PageGetMaxOffsetNumber(opage);
                        }
                        else
                        {

                                /*
                                 * We're at the end of the bucket chain, so now we're
                                 * really done with everything.  Before quitting, call
                                 * _hash_squeezebucket to ensure the tuples in the bucket
                                 * (including the overflow pages) are packed as tightly as
                                 * possible.
                                 */
                                _hash_wrtbuf(rel, obuf);
                                _hash_wrtbuf(rel, nbuf);
                                _hash_squeezebucket(rel, metap, obucket);
                                return;
                        }
                }

                /* hash on the tuple's first attribute to find its bucket now */
                hitem = (HashItem) PageGetItem(opage, PageGetItemId(opage, ooffnum));
                itup = &(hitem->hash_itup);
                itupdesc = RelationGetTupleDescriptor(rel);
                datum = index_getattr(itup, 1, itupdesc, &null);
                bucket = _hash_call(rel, metap, datum);

                if (bucket == nbucket)
                {

                        /*
                         * Insert the tuple into the new bucket.  If it doesn't fit on
                         * the current page in the new bucket, we must allocate a new
                         * overflow page and place the tuple on that page instead.
                         */
                        itemsz = IndexTupleDSize(hitem->hash_itup)
                                + (sizeof(HashItemData) - sizeof(IndexTupleData));

                        itemsz = DOUBLEALIGN(itemsz);

                        if (PageGetFreeSpace(npage) < itemsz)
                        {
                                /* chain a new overflow page and continue filling that */
                                ovflbuf = _hash_addovflpage(rel, &metabuf, nbuf);
                                _hash_wrtbuf(rel, nbuf);
                                nbuf = ovflbuf;
                                npage = BufferGetPage(nbuf);
                                _hash_checkpage(npage, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
                        }

                        /* append the tuple after the last item on the new page */
                        noffnum = OffsetNumberNext(PageGetMaxOffsetNumber(npage));
                        PageAddItem(npage, (Item) hitem, itemsz, noffnum, LP_USED);
                        _hash_wrtnorelbuf(rel, nbuf);

                        /*
                         * Now delete the tuple from the old bucket.  After this
                         * section of code, 'ooffnum' will actually point to the
                         * ItemId to which we would point if we had advanced it before
                         * the deletion (PageIndexTupleDelete repacks the ItemId
                         * array).  This also means that 'omaxoffnum' is exactly one
                         * less than it used to be, so we really can just decrement it
                         * instead of calling PageGetMaxOffsetNumber.
                         */
                        PageIndexTupleDelete(opage, ooffnum);
                        _hash_wrtnorelbuf(rel, obuf);
                        omaxoffnum = OffsetNumberPrev(omaxoffnum);

                        /*
                         * Tidy up.  If the old page was an overflow page and it is
                         * now empty, we must free it (we want to preserve the
                         * invariant that overflow pages cannot be empty).
                         */
                        if (PageIsEmpty(opage) &&
                                (oopaque->hasho_flag & LH_OVERFLOW_PAGE))
                        {
                                /*
                                 * NOTE(review): _hash_freeovflpage appears to hand back
                                 * the buffer of the page that followed the freed one, or
                                 * an invalid buffer at the end of the chain -- confirm
                                 * against its definition.
                                 */
                                obuf = _hash_freeovflpage(rel, obuf);

                                /* check that we're not through the bucket chain */
                                if (BufferIsInvalid(obuf))
                                {
                                        _hash_wrtbuf(rel, nbuf);
                                        _hash_squeezebucket(rel, metap, obucket);
                                        return;
                                }

                                /*
                                 * Re-init.  Again, we're guaranteed that an ovfl page has
                                 * at least one tuple.
                                 */
                                opage = BufferGetPage(obuf);
                                _hash_checkpage(opage, LH_OVERFLOW_PAGE);
                                oblkno = BufferGetBlockNumber(obuf);
                                oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
                                if (PageIsEmpty(opage))
                                {
                                        elog(WARN, "_hash_splitpage: empty overflow page %d",
                                                 oblkno);
                                }
                                ooffnum = FirstOffsetNumber;
                                omaxoffnum = PageGetMaxOffsetNumber(opage);
                        }
                }
                else
                {

                        /*
                         * The tuple stays on this page.  We didn't move anything, so
                         * we didn't delete anything and therefore we don't have to
                         * change 'omaxoffnum'; just step to the next item.
                         *
                         * XXX any hash value from [0, nbucket-1] will map to this
                         * bucket, which doesn't make sense to me.
                         */
                        ooffnum = OffsetNumberNext(ooffnum);
                }
        }
        /* NOTREACHED */
}