granicus.if.org Git - postgresql/blob - src/backend/nodes/tidbitmap.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * tidbitmap.c
   4  *        PostgreSQL tuple-id (TID) bitmap package
   5  *
   6  * This module provides bitmap data structures that are spiritually
   7  * similar to Bitmapsets, but are specially adapted to store sets of
   8  * tuple identifiers (TIDs), or ItemPointers.  In particular, the division
   9  * of an ItemPointer into BlockNumber and OffsetNumber is catered for.
  10  * Also, since we wish to be able to store very large tuple sets in
  11  * memory with this data structure, we support "lossy" storage, in which
  12  * we no longer remember individual tuple offsets on a page but only the
  13  * fact that a particular page needs to be visited.
  14  *
  15  * The "lossy" storage uses one bit per disk page, so at the standard 8K
  16  * BLCKSZ, we can represent all pages in 64Gb of disk space in about 1Mb
  17  * of memory.  People pushing around tables of that size should have a
  18  * couple of Mb to spare, so we don't worry about providing a second level
  19  * of lossiness.  In theory we could fall back to page ranges at some
  20  * point, but for now that seems useless complexity.
  21  *
  22  * We also support the notion of candidate matches, or rechecking.      This
  23  * means we know that a search need visit only some tuples on a page,
  24  * but we are not certain that all of those tuples are real matches.
  25  * So the eventual heap scan must recheck the quals for these tuples only,
  26  * rather than rechecking the quals for all tuples on the page as in the
  27  * lossy-bitmap case.  Rechecking can be specified when TIDs are inserted
  28  * into a bitmap, and it can also happen internally when we AND a lossy
  29  * and a non-lossy page.
  30  *
  31  *
  32  * Copyright (c) 2003-2012, PostgreSQL Global Development Group
  33  *
  34  * IDENTIFICATION
  35  *        src/backend/nodes/tidbitmap.c
  36  *
  37  *-------------------------------------------------------------------------
  38  */
  39 #include "postgres.h"
  40
  41 #include <limits.h>
  42
  43 #include "access/htup_details.h"
  44 #include "nodes/bitmapset.h"
  45 #include "nodes/tidbitmap.h"
  46 #include "utils/hsearch.h"
  47
  48 /*
  49  * The maximum number of tuples per page is not large (typically 256 with
  50  * 8K pages, or 1024 with 32K pages).  So there's not much point in making
  51  * the per-page bitmaps variable size.  We just legislate that the size
  52  * is this:
  53  */
  54 #define MAX_TUPLES_PER_PAGE  MaxHeapTuplesPerPage
  55
  56 /*
  57  * When we have to switch over to lossy storage, we use a data structure
  58  * with one bit per page, where all pages having the same number DIV
  59  * PAGES_PER_CHUNK are aggregated into one chunk.  When a chunk is present
  60  * and has the bit set for a given page, there must not be a per-page entry
  61  * for that page in the page table.
  62  *
  63  * We actually store both exact pages and lossy chunks in the same hash
  64  * table, using identical data structures.      (This is because dynahash.c's
  65  * memory management doesn't allow space to be transferred easily from one
  66  * hashtable to another.)  Therefore it's best if PAGES_PER_CHUNK is the
  67  * same as MAX_TUPLES_PER_PAGE, or at least not too different.  But we
  68  * also want PAGES_PER_CHUNK to be a power of 2 to avoid expensive integer
  69  * remainder operations.  So, define it like this:
  70  */
  71 #define PAGES_PER_CHUNK  (BLCKSZ / 32)
  72
  73 /* We use BITS_PER_BITMAPWORD and typedef bitmapword from nodes/bitmapset.h */
  74
  75 #define WORDNUM(x)      ((x) / BITS_PER_BITMAPWORD)
  76 #define BITNUM(x)       ((x) % BITS_PER_BITMAPWORD)
  77
  78 /* number of active words for an exact page: */
  79 #define WORDS_PER_PAGE  ((MAX_TUPLES_PER_PAGE - 1) / BITS_PER_BITMAPWORD + 1)
  80 /* number of active words for a lossy chunk: */
  81 #define WORDS_PER_CHUNK  ((PAGES_PER_CHUNK - 1) / BITS_PER_BITMAPWORD + 1)
  82
  83 /*
  84  * The hashtable entries are represented by this data structure.  For
  85  * an exact page, blockno is the page number and bit k of the bitmap
  86  * represents tuple offset k+1.  For a lossy chunk, blockno is the first
  87  * page in the chunk (this must be a multiple of PAGES_PER_CHUNK) and
  88  * bit k represents page blockno+k.  Note that it is not possible to
  89  * have exact storage for the first page of a chunk if we are using
  90  * lossy storage for any page in the chunk's range, since the same
  91  * hashtable entry has to serve both purposes.
  92  *
  93  * recheck is used only on exact pages --- it indicates that although
  94  * only the stated tuples need be checked, the full index qual condition
  95  * must be checked for each (ie, these are candidate matches).
  96  */
  97 typedef struct PagetableEntry
  98 {
  99         BlockNumber blockno;            /* page number (hashtable key) */
 100         bool            ischunk;                /* T = lossy storage, F = exact */
 101         bool            recheck;                /* should the tuples be rechecked? */
 102         bitmapword      words[Max(WORDS_PER_PAGE, WORDS_PER_CHUNK)];
 103 } PagetableEntry;
 104
 105 /*
 106  * dynahash.c is optimized for relatively large, long-lived hash tables.
 107  * This is not ideal for TIDBitMap, particularly when we are using a bitmap
 108  * scan on the inside of a nestloop join: a bitmap may well live only long
 109  * enough to accumulate one entry in such cases.  We therefore avoid creating
 110  * an actual hashtable until we need two pagetable entries.  When just one
 111  * pagetable entry is needed, we store it in a fixed field of TIDBitMap.
 112  * (NOTE: we don't get rid of the hashtable if the bitmap later shrinks down
 113  * to zero or one page again.  So, status can be TBM_HASH even when nentries
 114  * is zero or one.)
 115  */
 116 typedef enum
 117 {
 118         TBM_EMPTY,                                      /* no hashtable, nentries == 0 */
 119         TBM_ONE_PAGE,                           /* entry1 contains the single entry */
 120         TBM_HASH                                        /* pagetable is valid, entry1 is not */
 121 } TBMStatus;
 122
 123 /*
 124  * Here is the representation for a whole TIDBitMap:
 125  */
 126 struct TIDBitmap
 127 {
 128         NodeTag         type;                   /* to make it a valid Node */
 129         MemoryContext mcxt;                     /* memory context containing me */
 130         TBMStatus       status;                 /* see codes above */
 131         HTAB       *pagetable;          /* hash table of PagetableEntry's */
 132         int                     nentries;               /* number of entries in pagetable */
 133         int                     maxentries;             /* limit on same to meet maxbytes */
 134         int                     npages;                 /* number of exact entries in pagetable */
 135         int                     nchunks;                /* number of lossy entries in pagetable */
 136         bool            iterating;              /* tbm_begin_iterate called? */
 137         PagetableEntry entry1;          /* used when status == TBM_ONE_PAGE */
 138         /* these are valid when iterating is true: */
 139         PagetableEntry **spages;        /* sorted exact-page list, or NULL */
 140         PagetableEntry **schunks;       /* sorted lossy-chunk list, or NULL */
 141 };
 142
 143 /*
 144  * When iterating over a bitmap in sorted order, a TBMIterator is used to
 145  * track our progress.  There can be several iterators scanning the same
 146  * bitmap concurrently.  Note that the bitmap becomes read-only as soon as
 147  * any iterator is created.
 148  */
 149 struct TBMIterator
 150 {
 151         TIDBitmap  *tbm;                        /* TIDBitmap we're iterating over */
 152         int                     spageptr;               /* next spages index */
 153         int                     schunkptr;              /* next schunks index */
 154         int                     schunkbit;              /* next bit to check in current schunk */
 155         TBMIterateResult output;        /* MUST BE LAST (because variable-size) */
 156 };
 157
 158
 159 /* Local function prototypes */
 160 static void tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage);
 161 static bool tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage,
 162                                    const TIDBitmap *b);
 163 static const PagetableEntry *tbm_find_pageentry(const TIDBitmap *tbm,
 164                                    BlockNumber pageno);
 165 static PagetableEntry *tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno);
 166 static bool tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno);
 167 static void tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno);
 168 static void tbm_lossify(TIDBitmap *tbm);
 169 static int      tbm_comparator(const void *left, const void *right);
 170
 171
 172 /*
 173  * tbm_create - create an initially-empty bitmap
 174  *
 175  * The bitmap will live in the memory context that is CurrentMemoryContext
 176  * at the time of this call.  It will be limited to (approximately) maxbytes
 177  * total memory consumption.
 178  */
 179 TIDBitmap *
 180 tbm_create(long maxbytes)
 181 {
 182         TIDBitmap  *tbm;
 183         long            nbuckets;
 184
 185         /* Create the TIDBitmap struct and zero all its fields */
 186         tbm = makeNode(TIDBitmap);
 187
 188         tbm->mcxt = CurrentMemoryContext;
 189         tbm->status = TBM_EMPTY;
 190
 191         /*
 192          * Estimate number of hashtable entries we can have within maxbytes. This
 193          * estimates the hash overhead at MAXALIGN(sizeof(HASHELEMENT)) plus a
 194          * pointer per hash entry, which is crude but good enough for our purpose.
 195          * Also count an extra Pointer per entry for the arrays created during
 196          * iteration readout.
 197          */
 198         nbuckets = maxbytes /
 199                 (MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(sizeof(PagetableEntry))
 200                  + sizeof(Pointer) + sizeof(Pointer));
 201         nbuckets = Min(nbuckets, INT_MAX - 1);          /* safety limit */
 202         nbuckets = Max(nbuckets, 16);           /* sanity limit */
 203         tbm->maxentries = (int) nbuckets;
 204
 205         return tbm;
 206 }
 207
 208 /*
 209  * Actually create the hashtable.  Since this is a moderately expensive
 210  * proposition, we don't do it until we have to.
 211  */
 212 static void
 213 tbm_create_pagetable(TIDBitmap *tbm)
 214 {
 215         HASHCTL         hash_ctl;
 216
 217         Assert(tbm->status != TBM_HASH);
 218         Assert(tbm->pagetable == NULL);
 219
 220         /* Create the hashtable proper */
 221         MemSet(&hash_ctl, 0, sizeof(hash_ctl));
 222         hash_ctl.keysize = sizeof(BlockNumber);
 223         hash_ctl.entrysize = sizeof(PagetableEntry);
 224         hash_ctl.hash = tag_hash;
 225         hash_ctl.hcxt = tbm->mcxt;
 226         tbm->pagetable = hash_create("TIDBitmap",
 227                                                                  128,   /* start small and extend */
 228                                                                  &hash_ctl,
 229                                                                  HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
 230
 231         /* If entry1 is valid, push it into the hashtable */
 232         if (tbm->status == TBM_ONE_PAGE)
 233         {
 234                 PagetableEntry *page;
 235                 bool            found;
 236
 237                 page = (PagetableEntry *) hash_search(tbm->pagetable,
 238                                                                                           (void *) &tbm->entry1.blockno,
 239                                                                                           HASH_ENTER, &found);
 240                 Assert(!found);
 241                 memcpy(page, &tbm->entry1, sizeof(PagetableEntry));
 242         }
 243
 244         tbm->status = TBM_HASH;
 245 }
 246
 247 /*
 248  * tbm_free - free a TIDBitmap
 249  */
 250 void
 251 tbm_free(TIDBitmap *tbm)
 252 {
 253         if (tbm->pagetable)
 254                 hash_destroy(tbm->pagetable);
 255         if (tbm->spages)
 256                 pfree(tbm->spages);
 257         if (tbm->schunks)
 258                 pfree(tbm->schunks);
 259         pfree(tbm);
 260 }
 261
 262 /*
 263  * tbm_add_tuples - add some tuple IDs to a TIDBitmap
 264  *
 265  * If recheck is true, then the recheck flag will be set in the
 266  * TBMIterateResult when any of these tuples are reported out.
 267  */
 268 void
 269 tbm_add_tuples(TIDBitmap *tbm, const ItemPointer tids, int ntids,
 270                            bool recheck)
 271 {
 272         int                     i;
 273
 274         Assert(!tbm->iterating);
 275         for (i = 0; i < ntids; i++)
 276         {
 277                 BlockNumber blk = ItemPointerGetBlockNumber(tids + i);
 278                 OffsetNumber off = ItemPointerGetOffsetNumber(tids + i);
 279                 PagetableEntry *page;
 280                 int                     wordnum,
 281                                         bitnum;
 282
 283                 /* safety check to ensure we don't overrun bit array bounds */
 284                 if (off < 1 || off > MAX_TUPLES_PER_PAGE)
 285                         elog(ERROR, "tuple offset out of range: %u", off);
 286
 287                 if (tbm_page_is_lossy(tbm, blk))
 288                         continue;                       /* whole page is already marked */
 289
 290                 page = tbm_get_pageentry(tbm, blk);
 291
 292                 if (page->ischunk)
 293                 {
 294                         /* The page is a lossy chunk header, set bit for itself */
 295                         wordnum = bitnum = 0;
 296                 }
 297                 else
 298                 {
 299                         /* Page is exact, so set bit for individual tuple */
 300                         wordnum = WORDNUM(off - 1);
 301                         bitnum = BITNUM(off - 1);
 302                 }
 303                 page->words[wordnum] |= ((bitmapword) 1 << bitnum);
 304                 page->recheck |= recheck;
 305
 306                 if (tbm->nentries > tbm->maxentries)
 307                         tbm_lossify(tbm);
 308         }
 309 }
 310
 311 /*
 312  * tbm_add_page - add a whole page to a TIDBitmap
 313  *
 314  * This causes the whole page to be reported (with the recheck flag)
 315  * when the TIDBitmap is scanned.
 316  */
 317 void
 318 tbm_add_page(TIDBitmap *tbm, BlockNumber pageno)
 319 {
 320         /* Enter the page in the bitmap, or mark it lossy if already present */
 321         tbm_mark_page_lossy(tbm, pageno);
 322         /* If we went over the memory limit, lossify some more pages */
 323         if (tbm->nentries > tbm->maxentries)
 324                 tbm_lossify(tbm);
 325 }
 326
 327 /*
 328  * tbm_union - set union
 329  *
 330  * a is modified in-place, b is not changed
 331  */
 332 void
 333 tbm_union(TIDBitmap *a, const TIDBitmap *b)
 334 {
 335         Assert(!a->iterating);
 336         /* Nothing to do if b is empty */
 337         if (b->nentries == 0)
 338                 return;
 339         /* Scan through chunks and pages in b, merge into a */
 340         if (b->status == TBM_ONE_PAGE)
 341                 tbm_union_page(a, &b->entry1);
 342         else
 343         {
 344                 HASH_SEQ_STATUS status;
 345                 PagetableEntry *bpage;
 346
 347                 Assert(b->status == TBM_HASH);
 348                 hash_seq_init(&status, b->pagetable);
 349                 while ((bpage = (PagetableEntry *) hash_seq_search(&status)) != NULL)
 350                         tbm_union_page(a, bpage);
 351         }
 352 }
 353
 354 /* Process one page of b during a union op */
 355 static void
 356 tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage)
 357 {
 358         PagetableEntry *apage;
 359         int                     wordnum;
 360
 361         if (bpage->ischunk)
 362         {
 363                 /* Scan b's chunk, mark each indicated page lossy in a */
 364                 for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
 365                 {
 366                         bitmapword      w = bpage->words[wordnum];
 367
 368                         if (w != 0)
 369                         {
 370                                 BlockNumber pg;
 371
 372                                 pg = bpage->blockno + (wordnum * BITS_PER_BITMAPWORD);
 373                                 while (w != 0)
 374                                 {
 375                                         if (w & 1)
 376                                                 tbm_mark_page_lossy(a, pg);
 377                                         pg++;
 378                                         w >>= 1;
 379                                 }
 380                         }
 381                 }
 382         }
 383         else if (tbm_page_is_lossy(a, bpage->blockno))
 384         {
 385                 /* page is already lossy in a, nothing to do */
 386                 return;
 387         }
 388         else
 389         {
 390                 apage = tbm_get_pageentry(a, bpage->blockno);
 391                 if (apage->ischunk)
 392                 {
 393                         /* The page is a lossy chunk header, set bit for itself */
 394                         apage->words[0] |= ((bitmapword) 1 << 0);
 395                 }
 396                 else
 397                 {
 398                         /* Both pages are exact, merge at the bit level */
 399                         for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
 400                                 apage->words[wordnum] |= bpage->words[wordnum];
 401                         apage->recheck |= bpage->recheck;
 402                 }
 403         }
 404
 405         if (a->nentries > a->maxentries)
 406                 tbm_lossify(a);
 407 }
 408
 409 /*
 410  * tbm_intersect - set intersection
 411  *
 412  * a is modified in-place, b is not changed
 413  */
 414 void
 415 tbm_intersect(TIDBitmap *a, const TIDBitmap *b)
 416 {
 417         Assert(!a->iterating);
 418         /* Nothing to do if a is empty */
 419         if (a->nentries == 0)
 420                 return;
 421         /* Scan through chunks and pages in a, try to match to b */
 422         if (a->status == TBM_ONE_PAGE)
 423         {
 424                 if (tbm_intersect_page(a, &a->entry1, b))
 425                 {
 426                         /* Page is now empty, remove it from a */
 427                         Assert(!a->entry1.ischunk);
 428                         a->npages--;
 429                         a->nentries--;
 430                         Assert(a->nentries == 0);
 431                         a->status = TBM_EMPTY;
 432                 }
 433         }
 434         else
 435         {
 436                 HASH_SEQ_STATUS status;
 437                 PagetableEntry *apage;
 438
 439                 Assert(a->status == TBM_HASH);
 440                 hash_seq_init(&status, a->pagetable);
 441                 while ((apage = (PagetableEntry *) hash_seq_search(&status)) != NULL)
 442                 {
 443                         if (tbm_intersect_page(a, apage, b))
 444                         {
 445                                 /* Page or chunk is now empty, remove it from a */
 446                                 if (apage->ischunk)
 447                                         a->nchunks--;
 448                                 else
 449                                         a->npages--;
 450                                 a->nentries--;
 451                                 if (hash_search(a->pagetable,
 452                                                                 (void *) &apage->blockno,
 453                                                                 HASH_REMOVE, NULL) == NULL)
 454                                         elog(ERROR, "hash table corrupted");
 455                         }
 456                 }
 457         }
 458 }
 459
 460 /*
 461  * Process one page of a during an intersection op
 462  *
 463  * Returns TRUE if apage is now empty and should be deleted from a
 464  */
 465 static bool
 466 tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage, const TIDBitmap *b)
 467 {
 468         const PagetableEntry *bpage;
 469         int                     wordnum;
 470
 471         if (apage->ischunk)
 472         {
 473                 /* Scan each bit in chunk, try to clear */
 474                 bool            candelete = true;
 475
 476                 for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
 477                 {
 478                         bitmapword      w = apage->words[wordnum];
 479
 480                         if (w != 0)
 481                         {
 482                                 bitmapword      neww = w;
 483                                 BlockNumber pg;
 484                                 int                     bitnum;
 485
 486                                 pg = apage->blockno + (wordnum * BITS_PER_BITMAPWORD);
 487                                 bitnum = 0;
 488                                 while (w != 0)
 489                                 {
 490                                         if (w & 1)
 491                                         {
 492                                                 if (!tbm_page_is_lossy(b, pg) &&
 493                                                         tbm_find_pageentry(b, pg) == NULL)
 494                                                 {
 495                                                         /* Page is not in b at all, lose lossy bit */
 496                                                         neww &= ~((bitmapword) 1 << bitnum);
 497                                                 }
 498                                         }
 499                                         pg++;
 500                                         bitnum++;
 501                                         w >>= 1;
 502                                 }
 503                                 apage->words[wordnum] = neww;
 504                                 if (neww != 0)
 505                                         candelete = false;
 506                         }
 507                 }
 508                 return candelete;
 509         }
 510         else if (tbm_page_is_lossy(b, apage->blockno))
 511         {
 512                 /*
 513                  * Some of the tuples in 'a' might not satisfy the quals for 'b', but
 514                  * because the page 'b' is lossy, we don't know which ones. Therefore
 515                  * we mark 'a' as requiring rechecks, to indicate that at most those
 516                  * tuples set in 'a' are matches.
 517                  */
 518                 apage->recheck = true;
 519                 return false;
 520         }
 521         else
 522         {
 523                 bool            candelete = true;
 524
 525                 bpage = tbm_find_pageentry(b, apage->blockno);
 526                 if (bpage != NULL)
 527                 {
 528                         /* Both pages are exact, merge at the bit level */
 529                         Assert(!bpage->ischunk);
 530                         for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
 531                         {
 532                                 apage->words[wordnum] &= bpage->words[wordnum];
 533                                 if (apage->words[wordnum] != 0)
 534                                         candelete = false;
 535                         }
 536                         apage->recheck |= bpage->recheck;
 537                 }
 538                 /* If there is no matching b page, we can just delete the a page */
 539                 return candelete;
 540         }
 541 }
 542
 543 /*
 544  * tbm_is_empty - is a TIDBitmap completely empty?
 545  */
 546 bool
 547 tbm_is_empty(const TIDBitmap *tbm)
 548 {
 549         return (tbm->nentries == 0);
 550 }
 551
 552 /*
 553  * tbm_begin_iterate - prepare to iterate through a TIDBitmap
 554  *
 555  * The TBMIterator struct is created in the caller's memory context.
 556  * For a clean shutdown of the iteration, call tbm_end_iterate; but it's
 557  * okay to just allow the memory context to be released, too.  It is caller's
 558  * responsibility not to touch the TBMIterator anymore once the TIDBitmap
 559  * is freed.
 560  *
 561  * NB: after this is called, it is no longer allowed to modify the contents
 562  * of the bitmap.  However, you can call this multiple times to scan the
 563  * contents repeatedly, including parallel scans.
 564  */
 565 TBMIterator *
 566 tbm_begin_iterate(TIDBitmap *tbm)
 567 {
 568         TBMIterator *iterator;
 569
 570         /*
 571          * Create the TBMIterator struct, with enough trailing space to serve the
 572          * needs of the TBMIterateResult sub-struct.
 573          */
 574         iterator = (TBMIterator *) palloc(sizeof(TBMIterator) +
 575                                                                  MAX_TUPLES_PER_PAGE * sizeof(OffsetNumber));
 576         iterator->tbm = tbm;
 577
 578         /*
 579          * Initialize iteration pointers.
 580          */
 581         iterator->spageptr = 0;
 582         iterator->schunkptr = 0;
 583         iterator->schunkbit = 0;
 584
 585         /*
 586          * If we have a hashtable, create and fill the sorted page lists, unless
 587          * we already did that for a previous iterator.  Note that the lists are
 588          * attached to the bitmap not the iterator, so they can be used by more
 589          * than one iterator.
 590          */
 591         if (tbm->status == TBM_HASH && !tbm->iterating)
 592         {
 593                 HASH_SEQ_STATUS status;
 594                 PagetableEntry *page;
 595                 int                     npages;
 596                 int                     nchunks;
 597
 598                 if (!tbm->spages && tbm->npages > 0)
 599                         tbm->spages = (PagetableEntry **)
 600                                 MemoryContextAlloc(tbm->mcxt,
 601                                                                    tbm->npages * sizeof(PagetableEntry *));
 602                 if (!tbm->schunks && tbm->nchunks > 0)
 603                         tbm->schunks = (PagetableEntry **)
 604                                 MemoryContextAlloc(tbm->mcxt,
 605                                                                    tbm->nchunks * sizeof(PagetableEntry *));
 606
 607                 hash_seq_init(&status, tbm->pagetable);
 608                 npages = nchunks = 0;
 609                 while ((page = (PagetableEntry *) hash_seq_search(&status)) != NULL)
 610                 {
 611                         if (page->ischunk)
 612                                 tbm->schunks[nchunks++] = page;
 613                         else
 614                                 tbm->spages[npages++] = page;
 615                 }
 616                 Assert(npages == tbm->npages);
 617                 Assert(nchunks == tbm->nchunks);
 618                 if (npages > 1)
 619                         qsort(tbm->spages, npages, sizeof(PagetableEntry *),
 620                                   tbm_comparator);
 621                 if (nchunks > 1)
 622                         qsort(tbm->schunks, nchunks, sizeof(PagetableEntry *),
 623                                   tbm_comparator);
 624         }
 625
 626         tbm->iterating = true;
 627
 628         return iterator;
 629 }
 630
 631 /*
 632  * tbm_iterate - scan through next page of a TIDBitmap
 633  *
 634  * Returns a TBMIterateResult representing one page, or NULL if there are
 635  * no more pages to scan.  Pages are guaranteed to be delivered in numerical
 636  * order.  If result->ntuples < 0, then the bitmap is "lossy" and failed to
 637  * remember the exact tuples to look at on this page --- the caller must
 638  * examine all tuples on the page and check if they meet the intended
 639  * condition.  If result->recheck is true, only the indicated tuples need
 640  * be examined, but the condition must be rechecked anyway.  (For ease of
 641  * testing, recheck is always set true when ntuples < 0.)
 642  */
 643 TBMIterateResult *
 644 tbm_iterate(TBMIterator *iterator)
 645 {
 646         TIDBitmap  *tbm = iterator->tbm;
 647         TBMIterateResult *output = &(iterator->output);
 648
 649         Assert(tbm->iterating);
 650
 651         /*
 652          * If lossy chunk pages remain, make sure we've advanced schunkptr/
 653          * schunkbit to the next set bit.
 654          */
 655         while (iterator->schunkptr < tbm->nchunks)
 656         {
 657                 PagetableEntry *chunk = tbm->schunks[iterator->schunkptr];
 658                 int                     schunkbit = iterator->schunkbit;
 659
 660                 while (schunkbit < PAGES_PER_CHUNK)
 661                 {
 662                         int                     wordnum = WORDNUM(schunkbit);
 663                         int                     bitnum = BITNUM(schunkbit);
 664
 665                         if ((chunk->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0)
 666                                 break;
 667                         schunkbit++;
 668                 }
 669                 if (schunkbit < PAGES_PER_CHUNK)
 670                 {
 671                         iterator->schunkbit = schunkbit;
 672                         break;
 673                 }
 674                 /* advance to next chunk */
 675                 iterator->schunkptr++;
 676                 iterator->schunkbit = 0;
 677         }
 678
 679         /*
 680          * If both chunk and per-page data remain, must output the numerically
 681          * earlier page.
 682          */
 683         if (iterator->schunkptr < tbm->nchunks)
 684         {
 685                 PagetableEntry *chunk = tbm->schunks[iterator->schunkptr];
 686                 BlockNumber chunk_blockno;
 687
 688                 chunk_blockno = chunk->blockno + iterator->schunkbit;
 689                 if (iterator->spageptr >= tbm->npages ||
 690                         chunk_blockno < tbm->spages[iterator->spageptr]->blockno)
 691                 {
 692                         /* Return a lossy page indicator from the chunk */
 693                         output->blockno = chunk_blockno;
 694                         output->ntuples = -1;
 695                         output->recheck = true;
 696                         iterator->schunkbit++;
 697                         return output;
 698                 }
 699         }
 700
 701         if (iterator->spageptr < tbm->npages)
 702         {
 703                 PagetableEntry *page;
 704                 int                     ntuples;
 705                 int                     wordnum;
 706
 707                 /* In ONE_PAGE state, we don't allocate an spages[] array */
 708                 if (tbm->status == TBM_ONE_PAGE)
 709                         page = &tbm->entry1;
 710                 else
 711                         page = tbm->spages[iterator->spageptr];
 712
 713                 /* scan bitmap to extract individual offset numbers */
 714                 ntuples = 0;
 715                 for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
 716                 {
 717                         bitmapword      w = page->words[wordnum];
 718
 719                         if (w != 0)
 720                         {
 721                                 int                     off = wordnum * BITS_PER_BITMAPWORD + 1;
 722
 723                                 while (w != 0)
 724                                 {
 725                                         if (w & 1)
 726                                                 output->offsets[ntuples++] = (OffsetNumber) off;
 727                                         off++;
 728                                         w >>= 1;
 729                                 }
 730                         }
 731                 }
 732                 output->blockno = page->blockno;
 733                 output->ntuples = ntuples;
 734                 output->recheck = page->recheck;
 735                 iterator->spageptr++;
 736                 return output;
 737         }
 738
 739         /* Nothing more in the bitmap */
 740         return NULL;
 741 }
 742
 743 /*
 744  * tbm_end_iterate - finish an iteration over a TIDBitmap
 745  *
 746  * Currently this is just a pfree, but it might do more someday.  (For
 747  * instance, it could be useful to count open iterators and allow the
 748  * bitmap to return to read/write status when there are no more iterators.)
 749  */
 750 void
 751 tbm_end_iterate(TBMIterator *iterator)
 752 {
 753         pfree(iterator);
 754 }
 755
 756 /*
 757  * tbm_find_pageentry - find a PagetableEntry for the pageno
 758  *
 759  * Returns NULL if there is no non-lossy entry for the pageno.
 760  */
 761 static const PagetableEntry *
 762 tbm_find_pageentry(const TIDBitmap *tbm, BlockNumber pageno)
 763 {
 764         const PagetableEntry *page;
 765
 766         if (tbm->nentries == 0)         /* in case pagetable doesn't exist */
 767                 return NULL;
 768
 769         if (tbm->status == TBM_ONE_PAGE)
 770         {
 771                 page = &tbm->entry1;
 772                 if (page->blockno != pageno)
 773                         return NULL;
 774                 Assert(!page->ischunk);
 775                 return page;
 776         }
 777
 778         page = (PagetableEntry *) hash_search(tbm->pagetable,
 779                                                                                   (void *) &pageno,
 780                                                                                   HASH_FIND, NULL);
 781         if (page == NULL)
 782                 return NULL;
 783         if (page->ischunk)
 784                 return NULL;                    /* don't want a lossy chunk header */
 785         return page;
 786 }
 787
 788 /*
 789  * tbm_get_pageentry - find or create a PagetableEntry for the pageno
 790  *
 791  * If new, the entry is marked as an exact (non-chunk) entry.
 792  *
 793  * This may cause the table to exceed the desired memory size.  It is
 794  * up to the caller to call tbm_lossify() at the next safe point if so.
 795  */
 796 static PagetableEntry *
 797 tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno)
 798 {
 799         PagetableEntry *page;
 800         bool            found;
 801
 802         if (tbm->status == TBM_EMPTY)
 803         {
 804                 /* Use the fixed slot */
 805                 page = &tbm->entry1;
 806                 found = false;
 807                 tbm->status = TBM_ONE_PAGE;
 808         }
 809         else
 810         {
 811                 if (tbm->status == TBM_ONE_PAGE)
 812                 {
 813                         page = &tbm->entry1;
 814                         if (page->blockno == pageno)
 815                                 return page;
 816                         /* Time to switch from one page to a hashtable */
 817                         tbm_create_pagetable(tbm);
 818                 }
 819
 820                 /* Look up or create an entry */
 821                 page = (PagetableEntry *) hash_search(tbm->pagetable,
 822                                                                                           (void *) &pageno,
 823                                                                                           HASH_ENTER, &found);
 824         }
 825
 826         /* Initialize it if not present before */
 827         if (!found)
 828         {
 829                 MemSet(page, 0, sizeof(PagetableEntry));
 830                 page->blockno = pageno;
 831                 /* must count it too */
 832                 tbm->nentries++;
 833                 tbm->npages++;
 834         }
 835
 836         return page;
 837 }
 838
 839 /*
 840  * tbm_page_is_lossy - is the page marked as lossily stored?
 841  */
 842 static bool
 843 tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno)
 844 {
 845         PagetableEntry *page;
 846         BlockNumber chunk_pageno;
 847         int                     bitno;
 848
 849         /* we can skip the lookup if there are no lossy chunks */
 850         if (tbm->nchunks == 0)
 851                 return false;
 852         Assert(tbm->status == TBM_HASH);
 853
 854         bitno = pageno % PAGES_PER_CHUNK;
 855         chunk_pageno = pageno - bitno;
 856         page = (PagetableEntry *) hash_search(tbm->pagetable,
 857                                                                                   (void *) &chunk_pageno,
 858                                                                                   HASH_FIND, NULL);
 859         if (page != NULL && page->ischunk)
 860         {
 861                 int                     wordnum = WORDNUM(bitno);
 862                 int                     bitnum = BITNUM(bitno);
 863
 864                 if ((page->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0)
 865                         return true;
 866         }
 867         return false;
 868 }
 869
 870 /*
 871  * tbm_mark_page_lossy - mark the page number as lossily stored
 872  *
 873  * This may cause the table to exceed the desired memory size.  It is
 874  * up to the caller to call tbm_lossify() at the next safe point if so.
 875  */
 876 static void
 877 tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno)
 878 {
 879         PagetableEntry *page;
 880         bool            found;
 881         BlockNumber chunk_pageno;
 882         int                     bitno;
 883         int                     wordnum;
 884         int                     bitnum;
 885
 886         /* We force the bitmap into hashtable mode whenever it's lossy */
 887         if (tbm->status != TBM_HASH)
 888                 tbm_create_pagetable(tbm);
 889
 890         bitno = pageno % PAGES_PER_CHUNK;
 891         chunk_pageno = pageno - bitno;
 892
 893         /*
 894          * Remove any extant non-lossy entry for the page.      If the page is its own
 895          * chunk header, however, we skip this and handle the case below.
 896          */
 897         if (bitno != 0)
 898         {
 899                 if (hash_search(tbm->pagetable,
 900                                                 (void *) &pageno,
 901                                                 HASH_REMOVE, NULL) != NULL)
 902                 {
 903                         /* It was present, so adjust counts */
 904                         tbm->nentries--;
 905                         tbm->npages--;          /* assume it must have been non-lossy */
 906                 }
 907         }
 908
 909         /* Look up or create entry for chunk-header page */
 910         page = (PagetableEntry *) hash_search(tbm->pagetable,
 911                                                                                   (void *) &chunk_pageno,
 912                                                                                   HASH_ENTER, &found);
 913
 914         /* Initialize it if not present before */
 915         if (!found)
 916         {
 917                 MemSet(page, 0, sizeof(PagetableEntry));
 918                 page->blockno = chunk_pageno;
 919                 page->ischunk = true;
 920                 /* must count it too */
 921                 tbm->nentries++;
 922                 tbm->nchunks++;
 923         }
 924         else if (!page->ischunk)
 925         {
 926                 /* chunk header page was formerly non-lossy, make it lossy */
 927                 MemSet(page, 0, sizeof(PagetableEntry));
 928                 page->blockno = chunk_pageno;
 929                 page->ischunk = true;
 930                 /* we assume it had some tuple bit(s) set, so mark it lossy */
 931                 page->words[0] = ((bitmapword) 1 << 0);
 932                 /* adjust counts */
 933                 tbm->nchunks++;
 934                 tbm->npages--;
 935         }
 936
 937         /* Now set the original target page's bit */
 938         wordnum = WORDNUM(bitno);
 939         bitnum = BITNUM(bitno);
 940         page->words[wordnum] |= ((bitmapword) 1 << bitnum);
 941 }
 942
 943 /*
 944  * tbm_lossify - lose some information to get back under the memory limit
 945  */
 946 static void
 947 tbm_lossify(TIDBitmap *tbm)
 948 {
 949         HASH_SEQ_STATUS status;
 950         PagetableEntry *page;
 951
 952         /*
 953          * XXX Really stupid implementation: this just lossifies pages in
 954          * essentially random order.  We should be paying some attention to the
 955          * number of bits set in each page, instead.
 956          *
 957          * Since we are called as soon as nentries exceeds maxentries, we should
 958          * push nentries down to significantly less than maxentries, or else we'll
 959          * just end up doing this again very soon.      We shoot for maxentries/2.
 960          */
 961         Assert(!tbm->iterating);
 962         Assert(tbm->status == TBM_HASH);
 963
 964         hash_seq_init(&status, tbm->pagetable);
 965         while ((page = (PagetableEntry *) hash_seq_search(&status)) != NULL)
 966         {
 967                 if (page->ischunk)
 968                         continue;                       /* already a chunk header */
 969
 970                 /*
 971                  * If the page would become a chunk header, we won't save anything by
 972                  * converting it to lossy, so skip it.
 973                  */
 974                 if ((page->blockno % PAGES_PER_CHUNK) == 0)
 975                         continue;
 976
 977                 /* This does the dirty work ... */
 978                 tbm_mark_page_lossy(tbm, page->blockno);
 979
 980                 if (tbm->nentries <= tbm->maxentries / 2)
 981                 {
 982                         /* we have done enough */
 983                         hash_seq_term(&status);
 984                         break;
 985                 }
 986
 987                 /*
 988                  * Note: tbm_mark_page_lossy may have inserted a lossy chunk into the
 989                  * hashtable.  We can continue the same seq_search scan since we do
 990                  * not care whether we visit lossy chunks or not.
 991                  */
 992         }
 993
 994         /*
 995          * With a big bitmap and small work_mem, it's possible that we cannot get
 996          * under maxentries.  Again, if that happens, we'd end up uselessly
 997          * calling tbm_lossify over and over.  To prevent this from becoming a
 998          * performance sink, force maxentries up to at least double the current
 999          * number of entries.  (In essence, we're admitting inability to fit
1000          * within work_mem when we do this.)  Note that this test will not fire if
1001          * we broke out of the loop early; and if we didn't, the current number of
1002          * entries is simply not reducible any further.
1003          */
1004         if (tbm->nentries > tbm->maxentries / 2)
1005                 tbm->maxentries = Min(tbm->nentries, (INT_MAX - 1) / 2) * 2;
1006 }
1007
1008 /*
1009  * qsort comparator to handle PagetableEntry pointers.
1010  */
1011 static int
1012 tbm_comparator(const void *left, const void *right)
1013 {
1014         BlockNumber l = (*((PagetableEntry *const *) left))->blockno;
1015         BlockNumber r = (*((PagetableEntry *const *) right))->blockno;
1016
1017         if (l < r)
1018                 return -1;
1019         else if (l > r)
1020                 return 1;
1021         return 0;
1022 }