granicus.if.org Git - postgresql/blob - src/backend/access/hash/hashinsert.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * hashinsert.c
   4  *        Item insertion in hash tables for Postgres.
   5  *
   6  * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
   7  * Portions Copyright (c) 1994, Regents of the University of California
   8  *
   9  *
  10  * IDENTIFICATION
  11  *        src/backend/access/hash/hashinsert.c
  12  *
  13  *-------------------------------------------------------------------------
  14  */
  15
  16 #include "postgres.h"
  17
  18 #include "access/hash.h"
  19 #include "access/hash_xlog.h"
  20 #include "miscadmin.h"
  21 #include "utils/rel.h"
  22 #include "storage/lwlock.h"
  23 #include "storage/buf_internals.h"
  24 #include "storage/predicate.h"
  25
  26 static void _hash_vacuum_one_page(Relation rel, Buffer metabuf, Buffer buf,
  27                                           RelFileNode hnode);
  28
  29 /*
  30  *      _hash_doinsert() -- Handle insertion of a single index tuple.
  31  *
  32  *              This routine is called by the public interface routines, hashbuild
  33  *              and hashinsert.  By here, itup is completely filled in.
  34  */
  35 void
  36 _hash_doinsert(Relation rel, IndexTuple itup, Relation heapRel)
  37 {
  38         Buffer          buf = InvalidBuffer;
  39         Buffer          bucket_buf;
  40         Buffer          metabuf;
  41         HashMetaPage metap;
  42         HashMetaPage usedmetap = NULL;
  43         Page            metapage;
  44         Page            page;
  45         HashPageOpaque pageopaque;
  46         Size            itemsz;
  47         bool            do_expand;
  48         uint32          hashkey;
  49         Bucket          bucket;
  50         OffsetNumber itup_off;
  51
  52         /*
  53          * Get the hash key for the item (it's stored in the index tuple itself).
  54          */
  55         hashkey = _hash_get_indextuple_hashkey(itup);
  56
  57         /* compute item size too */
  58         itemsz = IndexTupleSize(itup);
  59         itemsz = MAXALIGN(itemsz);      /* be safe, PageAddItem will do this but we
  60                                                                  * need to be consistent */
  61
  62 restart_insert:
  63
  64         /*
  65          * Read the metapage.  We don't lock it yet; HashMaxItemSize() will
  66          * examine pd_pagesize_version, but that can't change so we can examine it
  67          * without a lock.
  68          */
  69         metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_NOLOCK, LH_META_PAGE);
  70         metapage = BufferGetPage(metabuf);
  71
  72         /*
  73          * Check whether the item can fit on a hash page at all. (Eventually, we
  74          * ought to try to apply TOAST methods if not.)  Note that at this point,
  75          * itemsz doesn't include the ItemId.
  76          *
  77          * XXX this is useless code if we are only storing hash keys.
  78          */
  79         if (itemsz > HashMaxItemSize(metapage))
  80                 ereport(ERROR,
  81                                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
  82                                  errmsg("index row size %zu exceeds hash maximum %zu",
  83                                                 itemsz, HashMaxItemSize(metapage)),
  84                                  errhint("Values larger than a buffer page cannot be indexed.")));
  85
  86         /* Lock the primary bucket page for the target bucket. */
  87         buf = _hash_getbucketbuf_from_hashkey(rel, hashkey, HASH_WRITE,
  88                                                                                   &usedmetap);
  89         Assert(usedmetap != NULL);
  90
  91         CheckForSerializableConflictIn(rel, NULL, buf);
  92
  93         /* remember the primary bucket buffer to release the pin on it at end. */
  94         bucket_buf = buf;
  95
  96         page = BufferGetPage(buf);
  97         pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
  98         bucket = pageopaque->hasho_bucket;
  99
 100         /*
 101          * If this bucket is in the process of being split, try to finish the
 102          * split before inserting, because that might create room for the
 103          * insertion to proceed without allocating an additional overflow page.
 104          * It's only interesting to finish the split if we're trying to insert
 105          * into the bucket from which we're removing tuples (the "old" bucket),
 106          * not if we're trying to insert into the bucket into which tuples are
 107          * being moved (the "new" bucket).
 108          */
 109         if (H_BUCKET_BEING_SPLIT(pageopaque) && IsBufferCleanupOK(buf))
 110         {
 111                 /* release the lock on bucket buffer, before completing the split. */
 112                 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
 113
 114                 _hash_finish_split(rel, metabuf, buf, bucket,
 115                                                    usedmetap->hashm_maxbucket,
 116                                                    usedmetap->hashm_highmask,
 117                                                    usedmetap->hashm_lowmask);
 118
 119                 /* release the pin on old and meta buffer.  retry for insert. */
 120                 _hash_dropbuf(rel, buf);
 121                 _hash_dropbuf(rel, metabuf);
 122                 goto restart_insert;
 123         }
 124
 125         /* Do the insertion */
 126         while (PageGetFreeSpace(page) < itemsz)
 127         {
 128                 BlockNumber nextblkno;
 129
 130                 /*
 131                  * Check if current page has any DEAD tuples. If yes, delete these
 132                  * tuples and see if we can get a space for the new item to be
 133                  * inserted before moving to the next page in the bucket chain.
 134                  */
 135                 if (H_HAS_DEAD_TUPLES(pageopaque))
 136                 {
 137
 138                         if (IsBufferCleanupOK(buf))
 139                         {
 140                                 _hash_vacuum_one_page(rel, metabuf, buf, heapRel->rd_node);
 141
 142                                 if (PageGetFreeSpace(page) >= itemsz)
 143                                         break;          /* OK, now we have enough space */
 144                         }
 145                 }
 146
 147                 /*
 148                  * no space on this page; check for an overflow page
 149                  */
 150                 nextblkno = pageopaque->hasho_nextblkno;
 151
 152                 if (BlockNumberIsValid(nextblkno))
 153                 {
 154                         /*
 155                          * ovfl page exists; go get it.  if it doesn't have room, we'll
 156                          * find out next pass through the loop test above.  we always
 157                          * release both the lock and pin if this is an overflow page, but
 158                          * only the lock if this is the primary bucket page, since the pin
 159                          * on the primary bucket must be retained throughout the scan.
 160                          */
 161                         if (buf != bucket_buf)
 162                                 _hash_relbuf(rel, buf);
 163                         else
 164                                 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
 165                         buf = _hash_getbuf(rel, nextblkno, HASH_WRITE, LH_OVERFLOW_PAGE);
 166                         page = BufferGetPage(buf);
 167                 }
 168                 else
 169                 {
 170                         /*
 171                          * we're at the end of the bucket chain and we haven't found a
 172                          * page with enough room.  allocate a new overflow page.
 173                          */
 174
 175                         /* release our write lock without modifying buffer */
 176                         LockBuffer(buf, BUFFER_LOCK_UNLOCK);
 177
 178                         /* chain to a new overflow page */
 179                         buf = _hash_addovflpage(rel, metabuf, buf, (buf == bucket_buf) ? true : false);
 180                         page = BufferGetPage(buf);
 181
 182                         /* should fit now, given test above */
 183                         Assert(PageGetFreeSpace(page) >= itemsz);
 184                 }
 185                 pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
 186                 Assert((pageopaque->hasho_flag & LH_PAGE_TYPE) == LH_OVERFLOW_PAGE);
 187                 Assert(pageopaque->hasho_bucket == bucket);
 188         }
 189
 190         /*
 191          * Write-lock the metapage so we can increment the tuple count. After
 192          * incrementing it, check to see if it's time for a split.
 193          */
 194         LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
 195
 196         /* Do the update.  No ereport(ERROR) until changes are logged */
 197         START_CRIT_SECTION();
 198
 199         /* found page with enough space, so add the item here */
 200         itup_off = _hash_pgaddtup(rel, buf, itemsz, itup);
 201         MarkBufferDirty(buf);
 202
 203         /* metapage operations */
 204         metap = HashPageGetMeta(metapage);
 205         metap->hashm_ntuples += 1;
 206
 207         /* Make sure this stays in sync with _hash_expandtable() */
 208         do_expand = metap->hashm_ntuples >
 209                 (double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1);
 210
 211         MarkBufferDirty(metabuf);
 212
 213         /* XLOG stuff */
 214         if (RelationNeedsWAL(rel))
 215         {
 216                 xl_hash_insert xlrec;
 217                 XLogRecPtr      recptr;
 218
 219                 xlrec.offnum = itup_off;
 220
 221                 XLogBeginInsert();
 222                 XLogRegisterData((char *) &xlrec, SizeOfHashInsert);
 223
 224                 XLogRegisterBuffer(1, metabuf, REGBUF_STANDARD);
 225
 226                 XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
 227                 XLogRegisterBufData(0, (char *) itup, IndexTupleSize(itup));
 228
 229                 recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_INSERT);
 230
 231                 PageSetLSN(BufferGetPage(buf), recptr);
 232                 PageSetLSN(BufferGetPage(metabuf), recptr);
 233         }
 234
 235         END_CRIT_SECTION();
 236
 237         /* drop lock on metapage, but keep pin */
 238         LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
 239
 240         /*
 241          * Release the modified page and ensure to release the pin on primary
 242          * page.
 243          */
 244         _hash_relbuf(rel, buf);
 245         if (buf != bucket_buf)
 246                 _hash_dropbuf(rel, bucket_buf);
 247
 248         /* Attempt to split if a split is needed */
 249         if (do_expand)
 250                 _hash_expandtable(rel, metabuf);
 251
 252         /* Finally drop our pin on the metapage */
 253         _hash_dropbuf(rel, metabuf);
 254 }
 255
 256 /*
 257  *      _hash_pgaddtup() -- add a tuple to a particular page in the index.
 258  *
 259  * This routine adds the tuple to the page as requested; it does not write out
 260  * the page.  It is an error to call pgaddtup() without pin and write lock on
 261  * the target buffer.
 262  *
 263  * Returns the offset number at which the tuple was inserted.  This function
 264  * is responsible for preserving the condition that tuples in a hash index
 265  * page are sorted by hashkey value.
 266  */
 267 OffsetNumber
 268 _hash_pgaddtup(Relation rel, Buffer buf, Size itemsize, IndexTuple itup)
 269 {
 270         OffsetNumber itup_off;
 271         Page            page;
 272         uint32          hashkey;
 273
 274         _hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
 275         page = BufferGetPage(buf);
 276
 277         /* Find where to insert the tuple (preserving page's hashkey ordering) */
 278         hashkey = _hash_get_indextuple_hashkey(itup);
 279         itup_off = _hash_binsearch(page, hashkey);
 280
 281         if (PageAddItem(page, (Item) itup, itemsize, itup_off, false, false)
 282                 == InvalidOffsetNumber)
 283                 elog(ERROR, "failed to add index item to \"%s\"",
 284                          RelationGetRelationName(rel));
 285
 286         return itup_off;
 287 }
 288
 289 /*
 290  *      _hash_pgaddmultitup() -- add a tuple vector to a particular page in the
 291  *                                                       index.
 292  *
 293  * This routine has same requirements for locking and tuple ordering as
 294  * _hash_pgaddtup().
 295  *
 296  * Returns the offset number array at which the tuples were inserted.
 297  */
 298 void
 299 _hash_pgaddmultitup(Relation rel, Buffer buf, IndexTuple *itups,
 300                                         OffsetNumber *itup_offsets, uint16 nitups)
 301 {
 302         OffsetNumber itup_off;
 303         Page            page;
 304         uint32          hashkey;
 305         int                     i;
 306
 307         _hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
 308         page = BufferGetPage(buf);
 309
 310         for (i = 0; i < nitups; i++)
 311         {
 312                 Size            itemsize;
 313
 314                 itemsize = IndexTupleSize(itups[i]);
 315                 itemsize = MAXALIGN(itemsize);
 316
 317                 /* Find where to insert the tuple (preserving page's hashkey ordering) */
 318                 hashkey = _hash_get_indextuple_hashkey(itups[i]);
 319                 itup_off = _hash_binsearch(page, hashkey);
 320
 321                 itup_offsets[i] = itup_off;
 322
 323                 if (PageAddItem(page, (Item) itups[i], itemsize, itup_off, false, false)
 324                         == InvalidOffsetNumber)
 325                         elog(ERROR, "failed to add index item to \"%s\"",
 326                                  RelationGetRelationName(rel));
 327         }
 328 }
 329
 330 /*
 331  * _hash_vacuum_one_page - vacuum just one index page.
 332  *
 333  * Try to remove LP_DEAD items from the given page. We must acquire cleanup
 334  * lock on the page being modified before calling this function.
 335  */
 336
 337 static void
 338 _hash_vacuum_one_page(Relation rel, Buffer metabuf, Buffer buf,
 339                                           RelFileNode hnode)
 340 {
 341         OffsetNumber deletable[MaxOffsetNumber];
 342         int                     ndeletable = 0;
 343         OffsetNumber offnum,
 344                                 maxoff;
 345         Page            page = BufferGetPage(buf);
 346         HashPageOpaque pageopaque;
 347         HashMetaPage metap;
 348
 349         /* Scan each tuple in page to see if it is marked as LP_DEAD */
 350         maxoff = PageGetMaxOffsetNumber(page);
 351         for (offnum = FirstOffsetNumber;
 352                  offnum <= maxoff;
 353                  offnum = OffsetNumberNext(offnum))
 354         {
 355                 ItemId          itemId = PageGetItemId(page, offnum);
 356
 357                 if (ItemIdIsDead(itemId))
 358                         deletable[ndeletable++] = offnum;
 359         }
 360
 361         if (ndeletable > 0)
 362         {
 363                 /*
 364                  * Write-lock the meta page so that we can decrement tuple count.
 365                  */
 366                 LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
 367
 368                 /* No ereport(ERROR) until changes are logged */
 369                 START_CRIT_SECTION();
 370
 371                 PageIndexMultiDelete(page, deletable, ndeletable);
 372
 373                 /*
 374                  * Mark the page as not containing any LP_DEAD items. This is not
 375                  * certainly true (there might be some that have recently been marked,
 376                  * but weren't included in our target-item list), but it will almost
 377                  * always be true and it doesn't seem worth an additional page scan to
 378                  * check it. Remember that LH_PAGE_HAS_DEAD_TUPLES is only a hint
 379                  * anyway.
 380                  */
 381                 pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
 382                 pageopaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES;
 383
 384                 metap = HashPageGetMeta(BufferGetPage(metabuf));
 385                 metap->hashm_ntuples -= ndeletable;
 386
 387                 MarkBufferDirty(buf);
 388                 MarkBufferDirty(metabuf);
 389
 390                 /* XLOG stuff */
 391                 if (RelationNeedsWAL(rel))
 392                 {
 393                         xl_hash_vacuum_one_page xlrec;
 394                         XLogRecPtr      recptr;
 395
 396                         xlrec.hnode = hnode;
 397                         xlrec.ntuples = ndeletable;
 398
 399                         XLogBeginInsert();
 400                         XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
 401                         XLogRegisterData((char *) &xlrec, SizeOfHashVacuumOnePage);
 402
 403                         /*
 404                          * We need the target-offsets array whether or not we store the
 405                          * whole buffer, to allow us to find the latestRemovedXid on a
 406                          * standby server.
 407                          */
 408                         XLogRegisterData((char *) deletable,
 409                                                          ndeletable * sizeof(OffsetNumber));
 410
 411                         XLogRegisterBuffer(1, metabuf, REGBUF_STANDARD);
 412
 413                         recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_VACUUM_ONE_PAGE);
 414
 415                         PageSetLSN(BufferGetPage(buf), recptr);
 416                         PageSetLSN(BufferGetPage(metabuf), recptr);
 417                 }
 418
 419                 END_CRIT_SECTION();
 420
 421                 /*
 422                  * Releasing write lock on meta page as we have updated the tuple
 423                  * count.
 424                  */
 425                 LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
 426         }
 427 }