1 /*-------------------------------------------------------------------------
4 * PostgreSQL tuple-id (TID) bitmap package
6 * This module provides bitmap data structures that are spiritually
7 * similar to Bitmapsets, but are specially adapted to store sets of
8 * tuple identifiers (TIDs), or ItemPointers. In particular, the division
9 * of an ItemPointer into BlockNumber and OffsetNumber is catered for.
10 * Also, since we wish to be able to store very large tuple sets in
11 * memory with this data structure, we support "lossy" storage, in which
12 * we no longer remember individual tuple offsets on a page but only the
13 * fact that a particular page needs to be visited.
15 * The "lossy" storage uses one bit per disk page, so at the standard 8K
16 * BLCKSZ, we can represent all pages in 64Gb of disk space in about 1Mb
17 * of memory. People pushing around tables of that size should have a
18 * couple of Mb to spare, so we don't worry about providing a second level
19 * of lossiness. In theory we could fall back to page ranges at some
20 * point, but for now that seems useless complexity.
22 * We also support the notion of candidate matches, or rechecking. This
23 * means we know that a search need visit only some tuples on a page,
24 * but we are not certain that all of those tuples are real matches.
25 * So the eventual heap scan must recheck the quals for these tuples only,
26 * rather than rechecking the quals for all tuples on the page as in the
27 * lossy-bitmap case. Rechecking can be specified when TIDs are inserted
28 * into a bitmap, and it can also happen internally when we AND a lossy
29 * and a non-lossy page.
32 * Copyright (c) 2003-2017, PostgreSQL Global Development Group
35 * src/backend/nodes/tidbitmap.c
37 *-------------------------------------------------------------------------
43 #include "access/htup_details.h"
44 #include "nodes/bitmapset.h"
45 #include "nodes/tidbitmap.h"
46 #include "storage/lwlock.h"
47 #include "utils/dsa.h"
/*
 * The maximum number of tuples per page is not large (typically 256 with
 * 8K pages, or 1024 with 32K pages).  So there's not much point in making
 * the per-page bitmaps variable size.  We just legislate that the size
 * is this:
 */
#define MAX_TUPLES_PER_PAGE  MaxHeapTuplesPerPage

/*
 * When we have to switch over to lossy storage, we use a data structure
 * with one bit per page, where all pages having the same number DIV
 * PAGES_PER_CHUNK are aggregated into one chunk.  When a chunk is present
 * and has the bit set for a given page, there must not be a per-page entry
 * for that page in the page table.
 *
 * We actually store both exact pages and lossy chunks in the same hash
 * table, using identical data structures.  (This is because the memory
 * management for hashtables doesn't easily/efficiently allow space to be
 * transferred easily from one hashtable to another.)  Therefore it's best
 * if PAGES_PER_CHUNK is the same as MAX_TUPLES_PER_PAGE, or at least not
 * too different.  But we also want PAGES_PER_CHUNK to be a power of 2 to
 * avoid expensive integer remainder operations.  So, define it like this:
 */
#define PAGES_PER_CHUNK  (BLCKSZ / 32)

/* We use BITS_PER_BITMAPWORD and typedef bitmapword from nodes/bitmapset.h */

/* word index / bit index of zero-based bit number x within a bitmap word array */
#define WORDNUM(x)	((x) / BITS_PER_BITMAPWORD)
#define BITNUM(x)	((x) % BITS_PER_BITMAPWORD)

/* number of active words for an exact page: */
#define WORDS_PER_PAGE	((MAX_TUPLES_PER_PAGE - 1) / BITS_PER_BITMAPWORD + 1)
/* number of active words for a lossy chunk: */
#define WORDS_PER_CHUNK  ((PAGES_PER_CHUNK - 1) / BITS_PER_BITMAPWORD + 1)
85 * The hashtable entries are represented by this data structure. For
86 * an exact page, blockno is the page number and bit k of the bitmap
87 * represents tuple offset k+1. For a lossy chunk, blockno is the first
88 * page in the chunk (this must be a multiple of PAGES_PER_CHUNK) and
89 * bit k represents page blockno+k. Note that it is not possible to
90 * have exact storage for the first page of a chunk if we are using
91 * lossy storage for any page in the chunk's range, since the same
92 * hashtable entry has to serve both purposes.
94 * recheck is used only on exact pages --- it indicates that although
95 * only the stated tuples need be checked, the full index qual condition
96 * must be checked for each (ie, these are candidate matches).
98 typedef struct PagetableEntry
100 BlockNumber blockno; /* page number (hashtable key) */
101 char status; /* hash entry status */
102 bool ischunk; /* T = lossy storage, F = exact */
103 bool recheck; /* should the tuples be rechecked? */
104 bitmapword words[Max(WORDS_PER_PAGE, WORDS_PER_CHUNK)];
108 * Holds array of pagetable entries.
110 typedef struct PTEntryArray
112 pg_atomic_uint32 refcount; /* no. of iterator attached */
113 PagetableEntry ptentry[FLEXIBLE_ARRAY_MEMBER];
/*
 * We want to avoid the overhead of creating the hashtable, which is
 * comparatively large, when not necessary.  Particularly when we are using a
 * bitmap scan on the inside of a nestloop join: a bitmap may well live only
 * long enough to accumulate one entry in such cases.  We therefore avoid
 * creating an actual hashtable until we need two pagetable entries.  When
 * just one pagetable entry is needed, we store it in a fixed field of
 * TIDBitMap.  (NOTE: we don't get rid of the hashtable if the bitmap later
 * shrinks down to zero or one page again.  So, status can be TBM_HASH even
 * when nentries is zero or one.)
 */
typedef enum
{
	TBM_EMPTY,					/* no hashtable, nentries == 0 */
	TBM_ONE_PAGE,				/* entry1 contains the single entry */
	TBM_HASH					/* pagetable is valid, entry1 is not */
} TBMStatus;
/*
 * Current iterating state of the TBM.
 */
typedef enum
{
	TBM_NOT_ITERATING,			/* not yet converted to page and chunk array */
	TBM_ITERATING_PRIVATE,		/* converted to local page and chunk array */
	TBM_ITERATING_SHARED		/* converted to shared page and chunk array */
} TBMIteratingState;
145 * Here is the representation for a whole TIDBitMap:
149 NodeTag type; /* to make it a valid Node */
150 MemoryContext mcxt; /* memory context containing me */
151 TBMStatus status; /* see codes above */
152 struct pagetable_hash *pagetable; /* hash table of PagetableEntry's */
153 int nentries; /* number of entries in pagetable */
154 int maxentries; /* limit on same to meet maxbytes */
155 int npages; /* number of exact entries in pagetable */
156 int nchunks; /* number of lossy entries in pagetable */
157 TBMIteratingState iterating; /* tbm_begin_iterate called? */
158 uint32 lossify_start; /* offset to start lossifying hashtable at */
159 PagetableEntry entry1; /* used when status == TBM_ONE_PAGE */
160 /* these are valid when iterating is true: */
161 PagetableEntry **spages; /* sorted exact-page list, or NULL */
162 PagetableEntry **schunks; /* sorted lossy-chunk list, or NULL */
163 dsa_pointer dsapagetable; /* dsa_pointer to the element array */
164 dsa_pointer dsapagetableold; /* dsa_pointer to the old element array */
165 dsa_pointer ptpages; /* dsa_pointer to the page array */
166 dsa_pointer ptchunks; /* dsa_pointer to the chunk array */
167 dsa_area *dsa; /* reference to per-query dsa area */
171 * When iterating over a bitmap in sorted order, a TBMIterator is used to
172 * track our progress. There can be several iterators scanning the same
173 * bitmap concurrently. Note that the bitmap becomes read-only as soon as
174 * any iterator is created.
178 TIDBitmap *tbm; /* TIDBitmap we're iterating over */
179 int spageptr; /* next spages index */
180 int schunkptr; /* next schunks index */
181 int schunkbit; /* next bit to check in current schunk */
182 TBMIterateResult output; /* MUST BE LAST (because variable-size) */
186 * Holds the shared members of the iterator so that multiple processes
187 * can jointly iterate.
189 typedef struct TBMSharedIteratorState
191 int nentries; /* number of entries in pagetable */
192 int maxentries; /* limit on same to meet maxbytes */
193 int npages; /* number of exact entries in pagetable */
194 int nchunks; /* number of lossy entries in pagetable */
195 dsa_pointer pagetable; /* dsa pointers to head of pagetable data */
196 dsa_pointer spages; /* dsa pointer to page array */
197 dsa_pointer schunks; /* dsa pointer to chunk array */
198 LWLock lock; /* lock to protect below members */
199 int spageptr; /* next spages index */
200 int schunkptr; /* next schunks index */
201 int schunkbit; /* next bit to check in current schunk */
202 } TBMSharedIteratorState;
205 * pagetable iteration array.
207 typedef struct PTIterationArray
209 pg_atomic_uint32 refcount; /* no. of iterator attached */
210 int index[FLEXIBLE_ARRAY_MEMBER]; /* index array */
214 * same as TBMIterator, but it is used for joint iteration, therefore this
215 * also holds a reference to the shared state.
217 struct TBMSharedIterator
219 TBMSharedIteratorState *state; /* shared state */
220 PTEntryArray *ptbase; /* pagetable element array */
221 PTIterationArray *ptpages; /* sorted exact page index list */
222 PTIterationArray *ptchunks; /* sorted lossy page index list */
223 TBMIterateResult output; /* MUST BE LAST (because variable-size) */
226 /* Local function prototypes */
227 static void tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage);
228 static bool tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage,
230 static const PagetableEntry *tbm_find_pageentry(const TIDBitmap *tbm,
232 static PagetableEntry *tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno);
233 static bool tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno);
234 static void tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno);
235 static void tbm_lossify(TIDBitmap *tbm);
236 static int tbm_comparator(const void *left, const void *right);
237 static int tbm_shared_comparator(const void *left, const void *right,
241 * Simple inline murmur hash implementation for the exact width required, for
245 hash_blockno(BlockNumber b)
257 /* define hashtable mapping block numbers to PagetableEntry's */
258 #define SH_USE_NONDEFAULT_ALLOCATOR
259 #define SH_PREFIX pagetable
260 #define SH_ELEMENT_TYPE PagetableEntry
261 #define SH_KEY_TYPE BlockNumber
262 #define SH_KEY blockno
263 #define SH_HASH_KEY(tb, key) hash_blockno(key)
264 #define SH_EQUAL(tb, a, b) a == b
265 #define SH_SCOPE static inline
268 #include "lib/simplehash.h"
272 * tbm_create - create an initially-empty bitmap
274 * The bitmap will live in the memory context that is CurrentMemoryContext
275 * at the time of this call. It will be limited to (approximately) maxbytes
276 * total memory consumption. If the DSA passed to this function is not NULL
277 * then the memory for storing elements of the underlying page table will
278 * be allocated from the DSA.
281 tbm_create(long maxbytes, dsa_area *dsa)
286 /* Create the TIDBitmap struct and zero all its fields */
287 tbm = makeNode(TIDBitmap);
289 tbm->mcxt = CurrentMemoryContext;
290 tbm->status = TBM_EMPTY;
293 * Estimate number of hashtable entries we can have within maxbytes. This
294 * estimates the hash cost as sizeof(PagetableEntry), which is good enough
295 * for our purpose. Also count an extra Pointer per entry for the arrays
296 * created during iteration readout.
298 nbuckets = maxbytes /
299 (sizeof(PagetableEntry) + sizeof(Pointer) + sizeof(Pointer));
300 nbuckets = Min(nbuckets, INT_MAX - 1); /* safety limit */
301 nbuckets = Max(nbuckets, 16); /* sanity limit */
302 tbm->maxentries = (int) nbuckets;
303 tbm->lossify_start = 0;
305 tbm->dsapagetable = InvalidDsaPointer;
306 tbm->dsapagetableold = InvalidDsaPointer;
307 tbm->ptpages = InvalidDsaPointer;
308 tbm->ptchunks = InvalidDsaPointer;
314 * Actually create the hashtable. Since this is a moderately expensive
315 * proposition, we don't do it until we have to.
318 tbm_create_pagetable(TIDBitmap *tbm)
320 Assert(tbm->status != TBM_HASH);
321 Assert(tbm->pagetable == NULL);
323 tbm->pagetable = pagetable_create(tbm->mcxt, 128, tbm);
325 /* If entry1 is valid, push it into the hashtable */
326 if (tbm->status == TBM_ONE_PAGE)
328 PagetableEntry *page;
332 page = pagetable_insert(tbm->pagetable,
336 oldstatus = page->status;
337 memcpy(page, &tbm->entry1, sizeof(PagetableEntry));
338 page->status = oldstatus;
341 tbm->status = TBM_HASH;
345 * tbm_free - free a TIDBitmap
348 tbm_free(TIDBitmap *tbm)
351 pagetable_destroy(tbm->pagetable);
360 * tbm_free_shared_area - free shared state
362 * Free shared iterator state, Also free shared pagetable and iterator arrays
363 * memory if they are not referred by any of the shared iterator i.e recount
367 tbm_free_shared_area(dsa_area *dsa, dsa_pointer dp)
369 TBMSharedIteratorState *istate = dsa_get_address(dsa, dp);
370 PTEntryArray *ptbase;
371 PTIterationArray *ptpages;
372 PTIterationArray *ptchunks;
374 if (DsaPointerIsValid(istate->pagetable))
376 ptbase = dsa_get_address(dsa, istate->pagetable);
377 if (pg_atomic_sub_fetch_u32(&ptbase->refcount, 1) == 0)
378 dsa_free(dsa, istate->pagetable);
380 if (DsaPointerIsValid(istate->spages))
382 ptpages = dsa_get_address(dsa, istate->spages);
383 if (pg_atomic_sub_fetch_u32(&ptpages->refcount, 1) == 0)
384 dsa_free(dsa, istate->spages);
386 if (DsaPointerIsValid(istate->schunks))
388 ptchunks = dsa_get_address(dsa, istate->schunks);
389 if (pg_atomic_sub_fetch_u32(&ptchunks->refcount, 1) == 0)
390 dsa_free(dsa, istate->schunks);
397 * tbm_add_tuples - add some tuple IDs to a TIDBitmap
399 * If recheck is true, then the recheck flag will be set in the
400 * TBMIterateResult when any of these tuples are reported out.
403 tbm_add_tuples(TIDBitmap *tbm, const ItemPointer tids, int ntids,
406 BlockNumber currblk = InvalidBlockNumber;
407 PagetableEntry *page = NULL; /* only valid when currblk is valid */
410 Assert(tbm->iterating == TBM_NOT_ITERATING);
411 for (i = 0; i < ntids; i++)
413 BlockNumber blk = ItemPointerGetBlockNumber(tids + i);
414 OffsetNumber off = ItemPointerGetOffsetNumber(tids + i);
418 /* safety check to ensure we don't overrun bit array bounds */
419 if (off < 1 || off > MAX_TUPLES_PER_PAGE)
420 elog(ERROR, "tuple offset out of range: %u", off);
423 * Look up target page unless we already did. This saves cycles when
424 * the input includes consecutive tuples on the same page, which is
425 * common enough to justify an extra test here.
429 if (tbm_page_is_lossy(tbm, blk))
430 page = NULL; /* remember page is lossy */
432 page = tbm_get_pageentry(tbm, blk);
437 continue; /* whole page is already marked */
441 /* The page is a lossy chunk header, set bit for itself */
442 wordnum = bitnum = 0;
446 /* Page is exact, so set bit for individual tuple */
447 wordnum = WORDNUM(off - 1);
448 bitnum = BITNUM(off - 1);
450 page->words[wordnum] |= ((bitmapword) 1 << bitnum);
451 page->recheck |= recheck;
453 if (tbm->nentries > tbm->maxentries)
456 /* Page could have been converted to lossy, so force new lookup */
457 currblk = InvalidBlockNumber;
463 * tbm_add_page - add a whole page to a TIDBitmap
465 * This causes the whole page to be reported (with the recheck flag)
466 * when the TIDBitmap is scanned.
469 tbm_add_page(TIDBitmap *tbm, BlockNumber pageno)
471 /* Enter the page in the bitmap, or mark it lossy if already present */
472 tbm_mark_page_lossy(tbm, pageno);
473 /* If we went over the memory limit, lossify some more pages */
474 if (tbm->nentries > tbm->maxentries)
479 * tbm_union - set union
481 * a is modified in-place, b is not changed
484 tbm_union(TIDBitmap *a, const TIDBitmap *b)
486 Assert(!a->iterating);
487 /* Nothing to do if b is empty */
488 if (b->nentries == 0)
490 /* Scan through chunks and pages in b, merge into a */
491 if (b->status == TBM_ONE_PAGE)
492 tbm_union_page(a, &b->entry1);
495 pagetable_iterator i;
496 PagetableEntry *bpage;
498 Assert(b->status == TBM_HASH);
499 pagetable_start_iterate(b->pagetable, &i);
500 while ((bpage = pagetable_iterate(b->pagetable, &i)) != NULL)
501 tbm_union_page(a, bpage);
505 /* Process one page of b during a union op */
507 tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage)
509 PagetableEntry *apage;
514 /* Scan b's chunk, mark each indicated page lossy in a */
515 for (wordnum = 0; wordnum < WORDS_PER_CHUNK; wordnum++)
517 bitmapword w = bpage->words[wordnum];
523 pg = bpage->blockno + (wordnum * BITS_PER_BITMAPWORD);
527 tbm_mark_page_lossy(a, pg);
534 else if (tbm_page_is_lossy(a, bpage->blockno))
536 /* page is already lossy in a, nothing to do */
541 apage = tbm_get_pageentry(a, bpage->blockno);
544 /* The page is a lossy chunk header, set bit for itself */
545 apage->words[0] |= ((bitmapword) 1 << 0);
549 /* Both pages are exact, merge at the bit level */
550 for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
551 apage->words[wordnum] |= bpage->words[wordnum];
552 apage->recheck |= bpage->recheck;
556 if (a->nentries > a->maxentries)
561 * tbm_intersect - set intersection
563 * a is modified in-place, b is not changed
566 tbm_intersect(TIDBitmap *a, const TIDBitmap *b)
568 Assert(!a->iterating);
569 /* Nothing to do if a is empty */
570 if (a->nentries == 0)
572 /* Scan through chunks and pages in a, try to match to b */
573 if (a->status == TBM_ONE_PAGE)
575 if (tbm_intersect_page(a, &a->entry1, b))
577 /* Page is now empty, remove it from a */
578 Assert(!a->entry1.ischunk);
581 Assert(a->nentries == 0);
582 a->status = TBM_EMPTY;
587 pagetable_iterator i;
588 PagetableEntry *apage;
590 Assert(a->status == TBM_HASH);
591 pagetable_start_iterate(a->pagetable, &i);
592 while ((apage = pagetable_iterate(a->pagetable, &i)) != NULL)
594 if (tbm_intersect_page(a, apage, b))
596 /* Page or chunk is now empty, remove it from a */
602 if (!pagetable_delete(a->pagetable, apage->blockno))
603 elog(ERROR, "hash table corrupted");
610 * Process one page of a during an intersection op
612 * Returns TRUE if apage is now empty and should be deleted from a
615 tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage, const TIDBitmap *b)
617 const PagetableEntry *bpage;
622 /* Scan each bit in chunk, try to clear */
623 bool candelete = true;
625 for (wordnum = 0; wordnum < WORDS_PER_CHUNK; wordnum++)
627 bitmapword w = apage->words[wordnum];
635 pg = apage->blockno + (wordnum * BITS_PER_BITMAPWORD);
641 if (!tbm_page_is_lossy(b, pg) &&
642 tbm_find_pageentry(b, pg) == NULL)
644 /* Page is not in b at all, lose lossy bit */
645 neww &= ~((bitmapword) 1 << bitnum);
652 apage->words[wordnum] = neww;
659 else if (tbm_page_is_lossy(b, apage->blockno))
662 * Some of the tuples in 'a' might not satisfy the quals for 'b', but
663 * because the page 'b' is lossy, we don't know which ones. Therefore
664 * we mark 'a' as requiring rechecks, to indicate that at most those
665 * tuples set in 'a' are matches.
667 apage->recheck = true;
672 bool candelete = true;
674 bpage = tbm_find_pageentry(b, apage->blockno);
677 /* Both pages are exact, merge at the bit level */
678 Assert(!bpage->ischunk);
679 for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
681 apage->words[wordnum] &= bpage->words[wordnum];
682 if (apage->words[wordnum] != 0)
685 apage->recheck |= bpage->recheck;
687 /* If there is no matching b page, we can just delete the a page */
693 * tbm_is_empty - is a TIDBitmap completely empty?
696 tbm_is_empty(const TIDBitmap *tbm)
698 return (tbm->nentries == 0);
702 * tbm_begin_iterate - prepare to iterate through a TIDBitmap
704 * The TBMIterator struct is created in the caller's memory context.
705 * For a clean shutdown of the iteration, call tbm_end_iterate; but it's
706 * okay to just allow the memory context to be released, too. It is caller's
707 * responsibility not to touch the TBMIterator anymore once the TIDBitmap
710 * NB: after this is called, it is no longer allowed to modify the contents
711 * of the bitmap. However, you can call this multiple times to scan the
712 * contents repeatedly, including parallel scans.
715 tbm_begin_iterate(TIDBitmap *tbm)
717 TBMIterator *iterator;
719 Assert(tbm->iterating != TBM_ITERATING_SHARED);
722 * Create the TBMIterator struct, with enough trailing space to serve the
723 * needs of the TBMIterateResult sub-struct.
725 iterator = (TBMIterator *) palloc(sizeof(TBMIterator) +
726 MAX_TUPLES_PER_PAGE * sizeof(OffsetNumber));
730 * Initialize iteration pointers.
732 iterator->spageptr = 0;
733 iterator->schunkptr = 0;
734 iterator->schunkbit = 0;
737 * If we have a hashtable, create and fill the sorted page lists, unless
738 * we already did that for a previous iterator. Note that the lists are
739 * attached to the bitmap not the iterator, so they can be used by more
742 if (tbm->status == TBM_HASH && tbm->iterating == TBM_NOT_ITERATING)
744 pagetable_iterator i;
745 PagetableEntry *page;
749 if (!tbm->spages && tbm->npages > 0)
750 tbm->spages = (PagetableEntry **)
751 MemoryContextAlloc(tbm->mcxt,
752 tbm->npages * sizeof(PagetableEntry *));
753 if (!tbm->schunks && tbm->nchunks > 0)
754 tbm->schunks = (PagetableEntry **)
755 MemoryContextAlloc(tbm->mcxt,
756 tbm->nchunks * sizeof(PagetableEntry *));
758 npages = nchunks = 0;
759 pagetable_start_iterate(tbm->pagetable, &i);
760 while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
763 tbm->schunks[nchunks++] = page;
765 tbm->spages[npages++] = page;
767 Assert(npages == tbm->npages);
768 Assert(nchunks == tbm->nchunks);
770 qsort(tbm->spages, npages, sizeof(PagetableEntry *),
773 qsort(tbm->schunks, nchunks, sizeof(PagetableEntry *),
777 tbm->iterating = TBM_ITERATING_PRIVATE;
783 * tbm_prepare_shared_iterate - prepare shared iteration state for a TIDBitmap.
785 * The necessary shared state will be allocated from the DSA passed to
786 * tbm_create, so that multiple processes can attach to it and iterate jointly.
788 * This will convert the pagetable hash into page and chunk array of the index
789 * into pagetable array.
792 tbm_prepare_shared_iterate(TIDBitmap *tbm)
795 TBMSharedIteratorState *istate;
796 PTEntryArray *ptbase = NULL;
797 PTIterationArray *ptpages = NULL;
798 PTIterationArray *ptchunks = NULL;
800 Assert(tbm->dsa != NULL);
801 Assert(tbm->iterating != TBM_ITERATING_PRIVATE);
804 * Allocate TBMSharedIteratorState from DSA to hold the shared members and
805 * lock, this will also be used by multiple worker for shared iterate.
807 dp = dsa_allocate0(tbm->dsa, sizeof(TBMSharedIteratorState));
808 istate = dsa_get_address(tbm->dsa, dp);
811 * If we're not already iterating, create and fill the sorted page lists.
812 * (If we are, the sorted page lists are already stored in the TIDBitmap,
813 * and we can just reuse them.)
815 if (tbm->iterating == TBM_NOT_ITERATING)
817 pagetable_iterator i;
818 PagetableEntry *page;
824 * Allocate the page and chunk array memory from the DSA to share
825 * across multiple processes.
829 tbm->ptpages = dsa_allocate(tbm->dsa, sizeof(PTIterationArray) +
830 tbm->npages * sizeof(int));
831 ptpages = dsa_get_address(tbm->dsa, tbm->ptpages);
832 pg_atomic_init_u32(&ptpages->refcount, 0);
836 tbm->ptchunks = dsa_allocate(tbm->dsa, sizeof(PTIterationArray) +
837 tbm->nchunks * sizeof(int));
838 ptchunks = dsa_get_address(tbm->dsa, tbm->ptchunks);
839 pg_atomic_init_u32(&ptchunks->refcount, 0);
843 * If TBM status is TBM_HASH then iterate over the pagetable and
844 * convert it to page and chunk arrays. But if it's in the
845 * TBM_ONE_PAGE mode then directly allocate the space for one entry
848 npages = nchunks = 0;
849 if (tbm->status == TBM_HASH)
851 ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
853 pagetable_start_iterate(tbm->pagetable, &i);
854 while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
856 idx = page - ptbase->ptentry;
858 ptchunks->index[nchunks++] = idx;
860 ptpages->index[npages++] = idx;
863 Assert(npages == tbm->npages);
864 Assert(nchunks == tbm->nchunks);
866 else if (tbm->status == TBM_ONE_PAGE)
869 * In one page mode allocate the space for one pagetable entry and
870 * directly store its index i.e. 0 in page array
872 tbm->dsapagetable = dsa_allocate(tbm->dsa, sizeof(PTEntryArray) +
873 sizeof(PagetableEntry));
874 ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
875 ptpages->index[0] = 0;
879 pg_atomic_init_u32(&ptbase->refcount, 0);
881 qsort_arg((void *) (ptpages->index), npages, sizeof(int),
882 tbm_shared_comparator, (void *) ptbase->ptentry);
884 qsort_arg((void *) (ptchunks->index), nchunks, sizeof(int),
885 tbm_shared_comparator, (void *) ptbase->ptentry);
889 * Store the TBM members in the shared state so that we can share them
890 * across multiple processes.
892 istate->nentries = tbm->nentries;
893 istate->maxentries = tbm->maxentries;
894 istate->npages = tbm->npages;
895 istate->nchunks = tbm->nchunks;
896 istate->pagetable = tbm->dsapagetable;
897 istate->spages = tbm->ptpages;
898 istate->schunks = tbm->ptchunks;
900 ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
901 ptpages = dsa_get_address(tbm->dsa, tbm->ptpages);
902 ptchunks = dsa_get_address(tbm->dsa, tbm->ptchunks);
905 * For every shared iterator, referring to pagetable and iterator array,
906 * increase the refcount by 1 so that while freeing the shared iterator
907 * we don't free pagetable and iterator array until its refcount becomes 0.
910 pg_atomic_add_fetch_u32(&ptbase->refcount, 1);
912 pg_atomic_add_fetch_u32(&ptpages->refcount, 1);
913 if (ptchunks != NULL)
914 pg_atomic_add_fetch_u32(&ptchunks->refcount, 1);
916 /* Initialize the iterator lock */
917 LWLockInitialize(&istate->lock, LWTRANCHE_TBM);
919 /* Initialize the shared iterator state */
920 istate->schunkbit = 0;
921 istate->schunkptr = 0;
922 istate->spageptr = 0;
924 tbm->iterating = TBM_ITERATING_SHARED;
930 * tbm_extract_page_tuple - extract the tuple offsets from a page
932 * The extracted offsets are stored into TBMIterateResult.
935 tbm_extract_page_tuple(PagetableEntry *page, TBMIterateResult *output)
940 for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
942 bitmapword w = page->words[wordnum];
946 int off = wordnum * BITS_PER_BITMAPWORD + 1;
951 output->offsets[ntuples++] = (OffsetNumber) off;
962 * tbm_advance_schunkbit - Advance the chunkbit
965 tbm_advance_schunkbit(PagetableEntry *chunk, int *schunkbitp)
967 int schunkbit = *schunkbitp;
969 while (schunkbit < PAGES_PER_CHUNK)
971 int wordnum = WORDNUM(schunkbit);
972 int bitnum = BITNUM(schunkbit);
974 if ((chunk->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0)
979 *schunkbitp = schunkbit;
983 * tbm_iterate - scan through next page of a TIDBitmap
985 * Returns a TBMIterateResult representing one page, or NULL if there are
986 * no more pages to scan. Pages are guaranteed to be delivered in numerical
987 * order. If result->ntuples < 0, then the bitmap is "lossy" and failed to
988 * remember the exact tuples to look at on this page --- the caller must
989 * examine all tuples on the page and check if they meet the intended
990 * condition. If result->recheck is true, only the indicated tuples need
991 * be examined, but the condition must be rechecked anyway. (For ease of
992 * testing, recheck is always set true when ntuples < 0.)
995 tbm_iterate(TBMIterator *iterator)
997 TIDBitmap *tbm = iterator->tbm;
998 TBMIterateResult *output = &(iterator->output);
1000 Assert(tbm->iterating == TBM_ITERATING_PRIVATE);
1003 * If lossy chunk pages remain, make sure we've advanced schunkptr/
1004 * schunkbit to the next set bit.
1006 while (iterator->schunkptr < tbm->nchunks)
1008 PagetableEntry *chunk = tbm->schunks[iterator->schunkptr];
1009 int schunkbit = iterator->schunkbit;
1011 tbm_advance_schunkbit(chunk, &schunkbit);
1012 if (schunkbit < PAGES_PER_CHUNK)
1014 iterator->schunkbit = schunkbit;
1017 /* advance to next chunk */
1018 iterator->schunkptr++;
1019 iterator->schunkbit = 0;
1023 * If both chunk and per-page data remain, must output the numerically
1026 if (iterator->schunkptr < tbm->nchunks)
1028 PagetableEntry *chunk = tbm->schunks[iterator->schunkptr];
1029 BlockNumber chunk_blockno;
1031 chunk_blockno = chunk->blockno + iterator->schunkbit;
1032 if (iterator->spageptr >= tbm->npages ||
1033 chunk_blockno < tbm->spages[iterator->spageptr]->blockno)
1035 /* Return a lossy page indicator from the chunk */
1036 output->blockno = chunk_blockno;
1037 output->ntuples = -1;
1038 output->recheck = true;
1039 iterator->schunkbit++;
1044 if (iterator->spageptr < tbm->npages)
1046 PagetableEntry *page;
1049 /* In ONE_PAGE state, we don't allocate an spages[] array */
1050 if (tbm->status == TBM_ONE_PAGE)
1051 page = &tbm->entry1;
1053 page = tbm->spages[iterator->spageptr];
1055 /* scan bitmap to extract individual offset numbers */
1056 ntuples = tbm_extract_page_tuple(page, output);
1057 output->blockno = page->blockno;
1058 output->ntuples = ntuples;
1059 output->recheck = page->recheck;
1060 iterator->spageptr++;
1064 /* Nothing more in the bitmap */
1069 * tbm_shared_iterate - scan through next page of a TIDBitmap
1071 * As above, but this will iterate using an iterator which is shared
1072 * across multiple processes. We need to acquire the iterator LWLock,
1073 * before accessing the shared members.
1076 tbm_shared_iterate(TBMSharedIterator *iterator)
1078 TBMIterateResult *output = &iterator->output;
1079 TBMSharedIteratorState *istate = iterator->state;
1080 PagetableEntry *ptbase = NULL;
1081 int *idxpages = NULL;
1082 int *idxchunks = NULL;
1084 if (iterator->ptbase != NULL)
1085 ptbase = iterator->ptbase->ptentry;
1086 if (iterator->ptpages != NULL)
1087 idxpages = iterator->ptpages->index;
1088 if (iterator->ptchunks != NULL)
1089 idxchunks = iterator->ptchunks->index;
1091 /* Acquire the LWLock before accessing the shared members */
1092 LWLockAcquire(&istate->lock, LW_EXCLUSIVE);
1095 * If lossy chunk pages remain, make sure we've advanced schunkptr/
1096 * schunkbit to the next set bit.
1098 while (istate->schunkptr < istate->nchunks)
1100 PagetableEntry *chunk = &ptbase[idxchunks[istate->schunkptr]];
1101 int schunkbit = istate->schunkbit;
1103 tbm_advance_schunkbit(chunk, &schunkbit);
1104 if (schunkbit < PAGES_PER_CHUNK)
1106 istate->schunkbit = schunkbit;
1109 /* advance to next chunk */
1110 istate->schunkptr++;
1111 istate->schunkbit = 0;
1115 * If both chunk and per-page data remain, must output the numerically
1118 if (istate->schunkptr < istate->nchunks)
1120 PagetableEntry *chunk = &ptbase[idxchunks[istate->schunkptr]];
1121 BlockNumber chunk_blockno;
1123 chunk_blockno = chunk->blockno + istate->schunkbit;
1125 if (istate->spageptr >= istate->npages ||
1126 chunk_blockno < ptbase[idxpages[istate->spageptr]].blockno)
1128 /* Return a lossy page indicator from the chunk */
1129 output->blockno = chunk_blockno;
1130 output->ntuples = -1;
1131 output->recheck = true;
1132 istate->schunkbit++;
1134 LWLockRelease(&istate->lock);
1139 if (istate->spageptr < istate->npages)
1141 PagetableEntry *page = &ptbase[idxpages[istate->spageptr]];
1144 /* scan bitmap to extract individual offset numbers */
1145 ntuples = tbm_extract_page_tuple(page, output);
1146 output->blockno = page->blockno;
1147 output->ntuples = ntuples;
1148 output->recheck = page->recheck;
1151 LWLockRelease(&istate->lock);
1156 LWLockRelease(&istate->lock);
1158 /* Nothing more in the bitmap */
1163 * tbm_end_iterate - finish an iteration over a TIDBitmap
1165 * Currently this is just a pfree, but it might do more someday. (For
1166 * instance, it could be useful to count open iterators and allow the
1167 * bitmap to return to read/write status when there are no more iterators.)
1170 tbm_end_iterate(TBMIterator *iterator)
1176 * tbm_end_shared_iterate - finish a shared iteration over a TIDBitmap
1178 * This doesn't free any of the shared state associated with the iterator,
1179 * just our backend-private state.
1182 tbm_end_shared_iterate(TBMSharedIterator *iterator)
1188 * tbm_find_pageentry - find a PagetableEntry for the pageno
1190 * Returns NULL if there is no non-lossy entry for the pageno.
1192 static const PagetableEntry *
1193 tbm_find_pageentry(const TIDBitmap *tbm, BlockNumber pageno)
1195 const PagetableEntry *page;
1197 if (tbm->nentries == 0) /* in case pagetable doesn't exist */
1200 if (tbm->status == TBM_ONE_PAGE)
1202 page = &tbm->entry1;
1203 if (page->blockno != pageno)
1205 Assert(!page->ischunk);
1209 page = pagetable_lookup(tbm->pagetable, pageno);
1213 return NULL; /* don't want a lossy chunk header */
1218 * tbm_get_pageentry - find or create a PagetableEntry for the pageno
1220 * If new, the entry is marked as an exact (non-chunk) entry.
1222 * This may cause the table to exceed the desired memory size. It is
1223 * up to the caller to call tbm_lossify() at the next safe point if so.
1225 static PagetableEntry *
1226 tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno)
1228 PagetableEntry *page;
1231 if (tbm->status == TBM_EMPTY)
1233 /* Use the fixed slot */
1234 page = &tbm->entry1;
1236 tbm->status = TBM_ONE_PAGE;
1240 if (tbm->status == TBM_ONE_PAGE)
1242 page = &tbm->entry1;
1243 if (page->blockno == pageno)
1245 /* Time to switch from one page to a hashtable */
1246 tbm_create_pagetable(tbm);
1249 /* Look up or create an entry */
1250 page = pagetable_insert(tbm->pagetable, pageno, &found);
1253 /* Initialize it if not present before */
1256 char oldstatus = page->status;
1258 MemSet(page, 0, sizeof(PagetableEntry));
1259 page->status = oldstatus;
1260 page->blockno = pageno;
1261 /* must count it too */
1270 * tbm_page_is_lossy - is the page marked as lossily stored?
1273 tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno)
1275 PagetableEntry *page;
1276 BlockNumber chunk_pageno;
1279 /* we can skip the lookup if there are no lossy chunks */
1280 if (tbm->nchunks == 0)
1282 Assert(tbm->status == TBM_HASH);
1284 bitno = pageno % PAGES_PER_CHUNK;
1285 chunk_pageno = pageno - bitno;
1287 page = pagetable_lookup(tbm->pagetable, chunk_pageno);
1289 if (page != NULL && page->ischunk)
1291 int wordnum = WORDNUM(bitno);
1292 int bitnum = BITNUM(bitno);
1294 if ((page->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0)
1301 * tbm_mark_page_lossy - mark the page number as lossily stored
1303 * This may cause the table to exceed the desired memory size. It is
1304 * up to the caller to call tbm_lossify() at the next safe point if so.
1307 tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno)
1309 PagetableEntry *page;
1311 BlockNumber chunk_pageno;
1316 /* We force the bitmap into hashtable mode whenever it's lossy */
1317 if (tbm->status != TBM_HASH)
1318 tbm_create_pagetable(tbm);
1320 bitno = pageno % PAGES_PER_CHUNK;
1321 chunk_pageno = pageno - bitno;
1324 * Remove any extant non-lossy entry for the page. If the page is its own
1325 * chunk header, however, we skip this and handle the case below.
1329 if (pagetable_delete(tbm->pagetable, pageno))
1331 /* It was present, so adjust counts */
1333 tbm->npages--; /* assume it must have been non-lossy */
1337 /* Look up or create entry for chunk-header page */
1338 page = pagetable_insert(tbm->pagetable, chunk_pageno, &found);
1340 /* Initialize it if not present before */
1343 char oldstatus = page->status;
1345 MemSet(page, 0, sizeof(PagetableEntry));
1346 page->status = oldstatus;
1347 page->blockno = chunk_pageno;
1348 page->ischunk = true;
1349 /* must count it too */
1353 else if (!page->ischunk)
1355 char oldstatus = page->status;
1357 /* chunk header page was formerly non-lossy, make it lossy */
1358 MemSet(page, 0, sizeof(PagetableEntry));
1359 page->status = oldstatus;
1360 page->blockno = chunk_pageno;
1361 page->ischunk = true;
1362 /* we assume it had some tuple bit(s) set, so mark it lossy */
1363 page->words[0] = ((bitmapword) 1 << 0);
1369 /* Now set the original target page's bit */
1370 wordnum = WORDNUM(bitno);
1371 bitnum = BITNUM(bitno);
1372 page->words[wordnum] |= ((bitmapword) 1 << bitnum);
1376 * tbm_lossify - lose some information to get back under the memory limit
1379 tbm_lossify(TIDBitmap *tbm)
1381 pagetable_iterator i;
1382 PagetableEntry *page;
1385 * XXX Really stupid implementation: this just lossifies pages in
1386 * essentially random order. We should be paying some attention to the
1387 * number of bits set in each page, instead.
1389 * Since we are called as soon as nentries exceeds maxentries, we should
1390 * push nentries down to significantly less than maxentries, or else we'll
1391 * just end up doing this again very soon. We shoot for maxentries/2.
1393 Assert(tbm->iterating == TBM_NOT_ITERATING);
1394 Assert(tbm->status == TBM_HASH);
1396 pagetable_start_iterate_at(tbm->pagetable, &i, tbm->lossify_start);
1397 while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
1400 continue; /* already a chunk header */
1403 * If the page would become a chunk header, we won't save anything by
1404 * converting it to lossy, so skip it.
1406 if ((page->blockno % PAGES_PER_CHUNK) == 0)
1409 /* This does the dirty work ... */
1410 tbm_mark_page_lossy(tbm, page->blockno);
1412 if (tbm->nentries <= tbm->maxentries / 2)
1415 * We have made enough room. Remember where to start lossifying
1416 * next round, so we evenly iterate over the hashtable.
1418 tbm->lossify_start = i.cur;
1423 * Note: tbm_mark_page_lossy may have inserted a lossy chunk into the
1424 * hashtable and may have deleted the non-lossy chunk. We can
1425 * continue the same hash table scan, since failure to visit one
1426 * element or visiting the newly inserted element, isn't fatal.
1431 * With a big bitmap and small work_mem, it's possible that we cannot get
1432 * under maxentries. Again, if that happens, we'd end up uselessly
1433 * calling tbm_lossify over and over. To prevent this from becoming a
1434 * performance sink, force maxentries up to at least double the current
1435 * number of entries. (In essence, we're admitting inability to fit
1436 * within work_mem when we do this.) Note that this test will not fire if
1437 * we broke out of the loop early; and if we didn't, the current number of
1438 * entries is simply not reducible any further.
1440 if (tbm->nentries > tbm->maxentries / 2)
1441 tbm->maxentries = Min(tbm->nentries, (INT_MAX - 1) / 2) * 2;
1445 * qsort comparator to handle PagetableEntry pointers.
1448 tbm_comparator(const void *left, const void *right)
1450 BlockNumber l = (*((PagetableEntry *const *) left))->blockno;
1451 BlockNumber r = (*((PagetableEntry *const *) right))->blockno;
1461 * As above, but this will get index into PagetableEntry array. Therefore,
1462 * it needs to get actual PagetableEntry using the index before comparing the
1466 tbm_shared_comparator(const void *left, const void *right, void *arg)
1468 PagetableEntry *base = (PagetableEntry *) arg;
1469 PagetableEntry *lpage = &base[*(int *) left];
1470 PagetableEntry *rpage = &base[*(int *) right];
1472 if (lpage->blockno < rpage->blockno)
1474 else if (lpage->blockno > rpage->blockno)
1480 * tbm_attach_shared_iterate
1482 * Allocate a backend-private iterator and attach the shared iterator state
1483 * to it so that multiple processed can iterate jointly.
1485 * We also converts the DSA pointers to local pointers and store them into
1486 * our private iterator.
1489 tbm_attach_shared_iterate(dsa_area *dsa, dsa_pointer dp)
1491 TBMSharedIterator *iterator;
1492 TBMSharedIteratorState *istate;
1495 * Create the TBMSharedIterator struct, with enough trailing space to
1496 * serve the needs of the TBMIterateResult sub-struct.
1498 iterator = (TBMSharedIterator *) palloc0(sizeof(TBMSharedIterator) +
1499 MAX_TUPLES_PER_PAGE * sizeof(OffsetNumber));
1501 istate = (TBMSharedIteratorState *) dsa_get_address(dsa, dp);
1503 iterator->state = istate;
1505 iterator->ptbase = dsa_get_address(dsa, istate->pagetable);
1508 iterator->ptpages = dsa_get_address(dsa, istate->spages);
1509 if (istate->nchunks)
1510 iterator->ptchunks = dsa_get_address(dsa, istate->schunks);
1516 * pagetable_allocate
1518 * Callback function for allocating the memory for hashtable elements.
1519 * Allocate memory for hashtable elements, using DSA if available.
1521 static inline void *
1522 pagetable_allocate(pagetable_hash *pagetable, Size size)
1524 TIDBitmap *tbm = (TIDBitmap *) pagetable->private_data;
1525 PTEntryArray *ptbase;
1527 if (tbm->dsa == NULL)
1528 return MemoryContextAllocExtended(pagetable->ctx, size,
1529 MCXT_ALLOC_HUGE | MCXT_ALLOC_ZERO);
1532 * Save the dsapagetable reference in dsapagetableold before allocating
1533 * new memory so that pagetable_free can free the old entry.
1535 tbm->dsapagetableold = tbm->dsapagetable;
1536 tbm->dsapagetable = dsa_allocate0(tbm->dsa, sizeof(PTEntryArray) + size);
1538 ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
1539 return ptbase->ptentry;
1545 * Callback function for freeing hash table elements.
1548 pagetable_free(pagetable_hash *pagetable, void *pointer)
1550 TIDBitmap *tbm = (TIDBitmap *) pagetable->private_data;
1552 /* pfree the input pointer if DSA is not available */
1553 if (tbm->dsa == NULL)
1555 else if (DsaPointerIsValid(tbm->dsapagetableold))
1557 dsa_free(tbm->dsa, tbm->dsapagetableold);
1558 tbm->dsapagetableold = InvalidDsaPointer;