1 /*-------------------------------------------------------------------------
4 * PostgreSQL tuple-id (TID) bitmap package
6 * This module provides bitmap data structures that are spiritually
7 * similar to Bitmapsets, but are specially adapted to store sets of
8 * tuple identifiers (TIDs), or ItemPointers. In particular, the division
9 * of an ItemPointer into BlockNumber and OffsetNumber is catered for.
10 * Also, since we wish to be able to store very large tuple sets in
11 * memory with this data structure, we support "lossy" storage, in which
12 * we no longer remember individual tuple offsets on a page but only the
13 * fact that a particular page needs to be visited.
15 * The "lossy" storage uses one bit per disk page, so at the standard 8K
16 * BLCKSZ, we can represent all pages in 64Gb of disk space in about 1Mb
17 * of memory. People pushing around tables of that size should have a
18 * couple of Mb to spare, so we don't worry about providing a second level
19 * of lossiness. In theory we could fall back to page ranges at some
20 * point, but for now that seems useless complexity.
22 * We also support the notion of candidate matches, or rechecking. This
23 * means we know that a search need visit only some tuples on a page,
24 * but we are not certain that all of those tuples are real matches.
25 * So the eventual heap scan must recheck the quals for these tuples only,
26 * rather than rechecking the quals for all tuples on the page as in the
27 * lossy-bitmap case. Rechecking can be specified when TIDs are inserted
28 * into a bitmap, and it can also happen internally when we AND a lossy
29 * and a non-lossy page.
32 * Copyright (c) 2003-2012, PostgreSQL Global Development Group
35 * src/backend/nodes/tidbitmap.c
37 *-------------------------------------------------------------------------
43 #include "access/htup_details.h"
44 #include "nodes/bitmapset.h"
45 #include "nodes/tidbitmap.h"
46 #include "utils/hsearch.h"
49 * The maximum number of tuples per page is not large (typically 256 with
50 * 8K pages, or 1024 with 32K pages). So there's not much point in making
51 * the per-page bitmaps variable size. We just legislate that the size
54 #define MAX_TUPLES_PER_PAGE MaxHeapTuplesPerPage
57 * When we have to switch over to lossy storage, we use a data structure
58 * with one bit per page, where all pages having the same number DIV
59 * PAGES_PER_CHUNK are aggregated into one chunk. When a chunk is present
60 * and has the bit set for a given page, there must not be a per-page entry
61 * for that page in the page table.
63 * We actually store both exact pages and lossy chunks in the same hash
64 * table, using identical data structures. (This is because dynahash.c's
65 * memory management doesn't allow space to be transferred easily from one
66 * hashtable to another.) Therefore it's best if PAGES_PER_CHUNK is the
67 * same as MAX_TUPLES_PER_PAGE, or at least not too different. But we
68 * also want PAGES_PER_CHUNK to be a power of 2 to avoid expensive integer
69 * remainder operations. So, define it like this:
71 #define PAGES_PER_CHUNK (BLCKSZ / 32)
73 /* We use BITS_PER_BITMAPWORD and typedef bitmapword from nodes/bitmapset.h */
/* WORDNUM/BITNUM decompose a 0-based bit index into a word index and a bit position within that word */
75 #define WORDNUM(x) ((x) / BITS_PER_BITMAPWORD)
76 #define BITNUM(x) ((x) % BITS_PER_BITMAPWORD)
78 /* number of active words for an exact page: */
79 #define WORDS_PER_PAGE ((MAX_TUPLES_PER_PAGE - 1) / BITS_PER_BITMAPWORD + 1)
80 /* number of active words for a lossy chunk: */
81 #define WORDS_PER_CHUNK ((PAGES_PER_CHUNK - 1) / BITS_PER_BITMAPWORD + 1)
84 * The hashtable entries are represented by this data structure. For
85 * an exact page, blockno is the page number and bit k of the bitmap
86 * represents tuple offset k+1. For a lossy chunk, blockno is the first
87 * page in the chunk (this must be a multiple of PAGES_PER_CHUNK) and
88 * bit k represents page blockno+k. Note that it is not possible to
89 * have exact storage for the first page of a chunk if we are using
90 * lossy storage for any page in the chunk's range, since the same
91 * hashtable entry has to serve both purposes.
93 * recheck is used only on exact pages --- it indicates that although
94 * only the stated tuples need be checked, the full index qual condition
95 * must be checked for each (ie, these are candidate matches).
97 typedef struct PagetableEntry
99 BlockNumber blockno; /* page number (hashtable key) */
100 bool ischunk; /* T = lossy storage, F = exact */
101 bool recheck; /* should the tuples be rechecked? */
/* sized for whichever interpretation (exact page or lossy chunk) needs more words */
102 bitmapword words[Max(WORDS_PER_PAGE, WORDS_PER_CHUNK)];
106 * dynahash.c is optimized for relatively large, long-lived hash tables.
107 * This is not ideal for TIDBitMap, particularly when we are using a bitmap
108 * scan on the inside of a nestloop join: a bitmap may well live only long
109 * enough to accumulate one entry in such cases. We therefore avoid creating
110 * an actual hashtable until we need two pagetable entries. When just one
111 * pagetable entry is needed, we store it in a fixed field of TIDBitMap.
112 * (NOTE: we don't get rid of the hashtable if the bitmap later shrinks down
113 * to zero or one page again. So, status can be TBM_HASH even when nentries
/* Status codes for a TIDBitmap's storage state; transitions only go EMPTY -> ONE_PAGE -> HASH */
118 TBM_EMPTY, /* no hashtable, nentries == 0 */
119 TBM_ONE_PAGE, /* entry1 contains the single entry */
120 TBM_HASH /* pagetable is valid, entry1 is not */
124 * Here is the representation for a whole TIDBitMap:
128 NodeTag type; /* to make it a valid Node */
129 MemoryContext mcxt; /* memory context containing me */
130 TBMStatus status; /* see codes above */
131 HTAB *pagetable; /* hash table of PagetableEntry's */
/* nentries == npages + nchunks; compared against maxentries to decide lossification */
132 int nentries; /* number of entries in pagetable */
133 int maxentries; /* limit on same to meet maxbytes */
134 int npages; /* number of exact entries in pagetable */
135 int nchunks; /* number of lossy entries in pagetable */
136 bool iterating; /* tbm_begin_iterate called? */
137 PagetableEntry entry1; /* used when status == TBM_ONE_PAGE */
138 /* these are valid when iterating is true: */
139 PagetableEntry **spages; /* sorted exact-page list, or NULL */
140 PagetableEntry **schunks; /* sorted lossy-chunk list, or NULL */
144 * When iterating over a bitmap in sorted order, a TBMIterator is used to
145 * track our progress. There can be several iterators scanning the same
146 * bitmap concurrently. Note that the bitmap becomes read-only as soon as
147 * any iterator is created.
151 TIDBitmap *tbm; /* TIDBitmap we're iterating over */
152 int spageptr; /* next spages index */
153 int schunkptr; /* next schunks index */
154 int schunkbit; /* next bit to check in current schunk */
/* output has a trailing variable-length offsets[] array, sized at palloc time */
155 TBMIterateResult output; /* MUST BE LAST (because variable-size) */
159 /* Local function prototypes */
160 static void tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage);
161 static bool tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage,
163 static const PagetableEntry *tbm_find_pageentry(const TIDBitmap *tbm,
165 static PagetableEntry *tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno);
166 static bool tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno);
167 static void tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno);
168 static void tbm_lossify(TIDBitmap *tbm);
/* qsort comparator ordering PagetableEntry pointers by blockno */
169 static int tbm_comparator(const void *left, const void *right);
173 * tbm_create - create an initially-empty bitmap
175 * The bitmap will live in the memory context that is CurrentMemoryContext
176 * at the time of this call. It will be limited to (approximately) maxbytes
177 * total memory consumption.
180 tbm_create(long maxbytes)
185 /* Create the TIDBitmap struct and zero all its fields */
186 tbm = makeNode(TIDBitmap);
188 tbm->mcxt = CurrentMemoryContext;
189 tbm->status = TBM_EMPTY;
192 * Estimate number of hashtable entries we can have within maxbytes. This
193 * estimates the hash overhead at MAXALIGN(sizeof(HASHELEMENT)) plus a
194 * pointer per hash entry, which is crude but good enough for our purpose.
195 * Also count an extra Pointer per entry for the arrays created during
198 nbuckets = maxbytes /
199 (MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(sizeof(PagetableEntry))
200 + sizeof(Pointer) + sizeof(Pointer));
/* clamp so the cast to int below cannot overflow, and never allow fewer than 16 entries */
201 nbuckets = Min(nbuckets, INT_MAX - 1); /* safety limit */
202 nbuckets = Max(nbuckets, 16); /* sanity limit */
/* the hashtable itself is created lazily; see tbm_create_pagetable */
203 tbm->maxentries = (int) nbuckets;
209 * Actually create the hashtable. Since this is a moderately expensive
210 * proposition, we don't do it until we have to.
213 tbm_create_pagetable(TIDBitmap *tbm)
217 Assert(tbm->status != TBM_HASH);
218 Assert(tbm->pagetable == NULL);
220 /* Create the hashtable proper */
221 MemSet(&hash_ctl, 0, sizeof(hash_ctl));
222 hash_ctl.keysize = sizeof(BlockNumber);
223 hash_ctl.entrysize = sizeof(PagetableEntry);
224 hash_ctl.hash = tag_hash;
/* allocate the table in the bitmap's own context so tbm_free reclaims everything */
225 hash_ctl.hcxt = tbm->mcxt;
226 tbm->pagetable = hash_create("TIDBitmap",
227 128, /* start small and extend */
229 HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
231 /* If entry1 is valid, push it into the hashtable */
232 if (tbm->status == TBM_ONE_PAGE)
234 PagetableEntry *page;
237 page = (PagetableEntry *) hash_search(tbm->pagetable,
238 (void *) &tbm->entry1.blockno,
/* copy the whole fixed-slot entry, bitmap words and all, into the new hash entry */
241 memcpy(page, &tbm->entry1, sizeof(PagetableEntry));
244 tbm->status = TBM_HASH;
248 * tbm_free - free a TIDBitmap
251 tbm_free(TIDBitmap *tbm)
/* NOTE(review): assumes hash_destroy tolerates a NULL pagetable (bitmap never grew past one page) — confirm in dynahash.c */
254 hash_destroy(tbm->pagetable);
263 * tbm_add_tuples - add some tuple IDs to a TIDBitmap
265 * If recheck is true, then the recheck flag will be set in the
266 * TBMIterateResult when any of these tuples are reported out.
269 tbm_add_tuples(TIDBitmap *tbm, const ItemPointer tids, int ntids,
274 Assert(!tbm->iterating);
275 for (i = 0; i < ntids; i++)
277 BlockNumber blk = ItemPointerGetBlockNumber(tids + i);
278 OffsetNumber off = ItemPointerGetOffsetNumber(tids + i);
279 PagetableEntry *page;
283 /* safety check to ensure we don't overrun bit array bounds */
284 if (off < 1 || off > MAX_TUPLES_PER_PAGE)
285 elog(ERROR, "tuple offset out of range: %u", off);
287 if (tbm_page_is_lossy(tbm, blk))
288 continue; /* whole page is already marked */
290 page = tbm_get_pageentry(tbm, blk);
294 /* The page is a lossy chunk header, set bit for itself */
295 wordnum = bitnum = 0;
299 /* Page is exact, so set bit for individual tuple */
/* offsets are 1-based, bit positions 0-based, hence off - 1 */
300 wordnum = WORDNUM(off - 1);
301 bitnum = BITNUM(off - 1);
303 page->words[wordnum] |= ((bitmapword) 1 << bitnum);
/* recheck is sticky: once any insertion requests it, the page keeps it */
304 page->recheck |= recheck;
/* adding an entry may have pushed us past the memory budget; lossify if so (see tbm_lossify) */
306 if (tbm->nentries > tbm->maxentries)
312 * tbm_add_page - add a whole page to a TIDBitmap
314 * This causes the whole page to be reported (with the recheck flag)
315 * when the TIDBitmap is scanned.
318 tbm_add_page(TIDBitmap *tbm, BlockNumber pageno)
320 /* Enter the page in the bitmap, or mark it lossy if already present */
321 tbm_mark_page_lossy(tbm, pageno);
322 /* If we went over the memory limit, lossify some more pages */
/* (marking lossy can add a chunk-header entry, so it may itself exceed the limit) */
323 if (tbm->nentries > tbm->maxentries)
328 * tbm_union - set union
330 * a is modified in-place, b is not changed
333 tbm_union(TIDBitmap *a, const TIDBitmap *b)
335 Assert(!a->iterating);
336 /* Nothing to do if b is empty */
337 if (b->nentries == 0)
339 /* Scan through chunks and pages in b, merge into a */
340 if (b->status == TBM_ONE_PAGE)
341 tbm_union_page(a, &b->entry1)
344 HASH_SEQ_STATUS status;
345 PagetableEntry *bpage;
347 Assert(b->status == TBM_HASH);
/* walk every entry (exact pages and lossy chunks alike) in b's hashtable */
348 hash_seq_init(&status, b->pagetable);
349 while ((bpage = (PagetableEntry *) hash_seq_search(&status)) != NULL)
350 tbm_union_page(a, bpage);
354 /* Process one page of b during a union op */
356 tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage)
358 PagetableEntry *apage;
363 /* Scan b's chunk, mark each indicated page lossy in a */
/*
 * NOTE(review): this loop bounds a *chunk* scan with WORDS_PER_PAGE rather
 * than WORDS_PER_CHUNK. That is harmless only because words[] is sized
 * Max(WORDS_PER_PAGE, WORDS_PER_CHUNK) and unused words stay zeroed —
 * confirm WORDS_PER_PAGE >= WORDS_PER_CHUNK holds for this build's BLCKSZ.
 */
364 for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
366 bitmapword w = bpage->words[wordnum];
/* convert the bit position back into an absolute page number */
372 pg = bpage->blockno + (wordnum * BITS_PER_BITMAPWORD);
376 tbm_mark_page_lossy(a, pg);
383 else if (tbm_page_is_lossy(a, bpage->blockno))
385 /* page is already lossy in a, nothing to do */
390 apage = tbm_get_pageentry(a, bpage->blockno);
393 /* The page is a lossy chunk header, set bit for itself */
394 apage->words[0] |= ((bitmapword) 1 << 0);
398 /* Both pages are exact, merge at the bit level */
399 for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
400 apage->words[wordnum] |= bpage->words[wordnum];
401 apage->recheck |= bpage->recheck;
/* merging may have created new entries in a; lossify if we went over budget */
405 if (a->nentries > a->maxentries)
410 * tbm_intersect - set intersection
412 * a is modified in-place, b is not changed
415 tbm_intersect(TIDBitmap *a, const TIDBitmap *b)
417 Assert(!a->iterating);
418 /* Nothing to do if a is empty */
419 if (a->nentries == 0)
421 /* Scan through chunks and pages in a, try to match to b */
422 if (a->status == TBM_ONE_PAGE)
424 if (tbm_intersect_page(a, &a->entry1, b))
426 /* Page is now empty, remove it from a */
427 Assert(!a->entry1.ischunk);
430 Assert(a->nentries == 0);
431 a->status = TBM_EMPTY;
436 HASH_SEQ_STATUS status;
437 PagetableEntry *apage;
439 Assert(a->status == TBM_HASH);
440 hash_seq_init(&status, a->pagetable);
441 while ((apage = (PagetableEntry *) hash_seq_search(&status)) != NULL)
443 if (tbm_intersect_page(a, apage, b))
445 /* Page or chunk is now empty, remove it from a */
/* deleting the entry we just visited is safe for dynahash seq scans */
451 if (hash_search(a->pagetable,
452 (void *) &apage->blockno,
453 HASH_REMOVE, NULL) == NULL)
454 elog(ERROR, "hash table corrupted");
461 * Process one page of a during an intersection op
463 * Returns TRUE if apage is now empty and should be deleted from a
466 tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage, const TIDBitmap *b)
468 const PagetableEntry *bpage;
473 /* Scan each bit in chunk, try to clear */
474 bool candelete = true;
/*
 * NOTE(review): as in tbm_union_page, the chunk scan is bounded by
 * WORDS_PER_PAGE instead of WORDS_PER_CHUNK; safe only while words[] is
 * sized to the max of the two and trailing words are zero — confirm.
 */
476 for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
478 bitmapword w = apage->words[wordnum];
/* absolute page number represented by this chunk bit */
486 pg = apage->blockno + (wordnum * BITS_PER_BITMAPWORD);
492 if (!tbm_page_is_lossy(b, pg) &&
493 tbm_find_pageentry(b, pg) == NULL)
495 /* Page is not in b at all, lose lossy bit */
496 neww &= ~((bitmapword) 1 << bitnum);
503 apage->words[wordnum] = neww;
510 else if (tbm_page_is_lossy(b, apage->blockno))
513 * Some of the tuples in 'a' might not satisfy the quals for 'b', but
514 * because the page 'b' is lossy, we don't know which ones. Therefore
515 * we mark 'a' as requiring rechecks, to indicate that at most those
516 * tuples set in 'a' are matches.
518 apage->recheck = true;
523 bool candelete = true;
525 bpage = tbm_find_pageentry(b, apage->blockno);
528 /* Both pages are exact, merge at the bit level */
529 Assert(!bpage->ischunk);
530 for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
/* AND the words; any surviving bit means the page is still nonempty */
532 apage->words[wordnum] &= bpage->words[wordnum];
533 if (apage->words[wordnum] != 0)
536 apage->recheck |= bpage->recheck;
538 /* If there is no matching b page, we can just delete the a page */
544 * tbm_is_empty - is a TIDBitmap completely empty?
547 tbm_is_empty(const TIDBitmap *tbm)
/* nentries counts exact pages and lossy chunks together, so zero means truly empty */
549 return (tbm->nentries == 0);
553 * tbm_begin_iterate - prepare to iterate through a TIDBitmap
555 * The TBMIterator struct is created in the caller's memory context.
556 * For a clean shutdown of the iteration, call tbm_end_iterate; but it's
557 * okay to just allow the memory context to be released, too. It is caller's
558 * responsibility not to touch the TBMIterator anymore once the TIDBitmap
561 * NB: after this is called, it is no longer allowed to modify the contents
562 * of the bitmap. However, you can call this multiple times to scan the
563 * contents repeatedly, including parallel scans.
566 tbm_begin_iterate(TIDBitmap *tbm)
568 TBMIterator *iterator;
571 * Create the TBMIterator struct, with enough trailing space to serve the
572 * needs of the TBMIterateResult sub-struct.
574 iterator = (TBMIterator *) palloc(sizeof(TBMIterator) +
575 MAX_TUPLES_PER_PAGE * sizeof(OffsetNumber));
579 * Initialize iteration pointers.
581 iterator->spageptr = 0;
582 iterator->schunkptr = 0;
583 iterator->schunkbit = 0;
586 * If we have a hashtable, create and fill the sorted page lists, unless
587 * we already did that for a previous iterator. Note that the lists are
588 * attached to the bitmap not the iterator, so they can be used by more
591 if (tbm->status == TBM_HASH && !tbm->iterating)
593 HASH_SEQ_STATUS status;
594 PagetableEntry *page;
/* allocate the sort arrays in the bitmap's context so they outlive this iterator */
598 if (!tbm->spages && tbm->npages > 0)
599 tbm->spages = (PagetableEntry **)
600 MemoryContextAlloc(tbm->mcxt,
601 tbm->npages * sizeof(PagetableEntry *));
602 if (!tbm->schunks && tbm->nchunks > 0)
603 tbm->schunks = (PagetableEntry **)
604 MemoryContextAlloc(tbm->mcxt,
605 tbm->nchunks * sizeof(PagetableEntry *));
/* partition hashtable entries into the exact-page and lossy-chunk arrays */
607 hash_seq_init(&status, tbm->pagetable);
608 npages = nchunks = 0;
609 while ((page = (PagetableEntry *) hash_seq_search(&status)) != NULL)
612 tbm->schunks[nchunks++] = page;
614 tbm->spages[npages++] = page;
616 Assert(npages == tbm->npages);
617 Assert(nchunks == tbm->nchunks);
/* sort both arrays by blockno so tbm_iterate can deliver pages in order */
619 qsort(tbm->spages, npages, sizeof(PagetableEntry *),
622 qsort(tbm->schunks, nchunks, sizeof(PagetableEntry *),
/* from here on the bitmap is read-only (see NB above) */
626 tbm->iterating = true;
632 * tbm_iterate - scan through next page of a TIDBitmap
634 * Returns a TBMIterateResult representing one page, or NULL if there are
635 * no more pages to scan. Pages are guaranteed to be delivered in numerical
636 * order. If result->ntuples < 0, then the bitmap is "lossy" and failed to
637 * remember the exact tuples to look at on this page --- the caller must
638 * examine all tuples on the page and check if they meet the intended
639 * condition. If result->recheck is true, only the indicated tuples need
640 * be examined, but the condition must be rechecked anyway. (For ease of
641 * testing, recheck is always set true when ntuples < 0.)
644 tbm_iterate(TBMIterator *iterator)
646 TIDBitmap *tbm = iterator->tbm;
647 TBMIterateResult *output = &(iterator->output);
649 Assert(tbm->iterating);
652 * If lossy chunk pages remain, make sure we've advanced schunkptr/
653 * schunkbit to the next set bit.
655 while (iterator->schunkptr < tbm->nchunks)
657 PagetableEntry *chunk = tbm->schunks[iterator->schunkptr];
658 int schunkbit = iterator->schunkbit;
/* scan forward within the current chunk for the next set page bit */
660 while (schunkbit < PAGES_PER_CHUNK)
662 int wordnum = WORDNUM(schunkbit);
663 int bitnum = BITNUM(schunkbit);
665 if ((chunk->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0)
669 if (schunkbit < PAGES_PER_CHUNK)
671 iterator->schunkbit = schunkbit;
674 /* advance to next chunk */
675 iterator->schunkptr++;
676 iterator->schunkbit = 0;
680 * If both chunk and per-page data remain, must output the numerically
683 if (iterator->schunkptr < tbm->nchunks)
685 PagetableEntry *chunk = tbm->schunks[iterator->schunkptr];
686 BlockNumber chunk_blockno;
688 chunk_blockno = chunk->blockno + iterator->schunkbit;
/* emit the lossy page only if it precedes the next exact page (or none remain) */
689 if (iterator->spageptr >= tbm->npages ||
690 chunk_blockno < tbm->spages[iterator->spageptr]->blockno)
692 /* Return a lossy page indicator from the chunk */
693 output->blockno = chunk_blockno;
694 output->ntuples = -1;
695 output->recheck = true;
696 iterator->schunkbit++;
701 if (iterator->spageptr < tbm->npages)
703 PagetableEntry *page;
707 /* In ONE_PAGE state, we don't allocate an spages[] array */
708 if (tbm->status == TBM_ONE_PAGE)
711 page = tbm->spages[iterator->spageptr];
713 /* scan bitmap to extract individual offset numbers */
715 for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
717 bitmapword w = page->words[wordnum];
/* bit 0 of word 0 corresponds to tuple offset 1 */
721 int off = wordnum * BITS_PER_BITMAPWORD + 1;
726 output->offsets[ntuples++] = (OffsetNumber) off;
732 output->blockno = page->blockno;
733 output->ntuples = ntuples;
734 output->recheck = page->recheck;
735 iterator->spageptr++;
739 /* Nothing more in the bitmap */
744 * tbm_end_iterate - finish an iteration over a TIDBitmap
746 * Currently this is just a pfree, but it might do more someday. (For
747 * instance, it could be useful to count open iterators and allow the
748 * bitmap to return to read/write status when there are no more iterators.)
/* Frees only the iterator; the underlying bitmap remains allocated. */
751 tbm_end_iterate(TBMIterator *iterator)
757 * tbm_find_pageentry - find a PagetableEntry for the pageno
759 * Returns NULL if there is no non-lossy entry for the pageno.
761 static const PagetableEntry *
762 tbm_find_pageentry(const TIDBitmap *tbm, BlockNumber pageno)
764 const PagetableEntry *page;
766 if (tbm->nentries == 0) /* in case pagetable doesn't exist */
/* single-entry case: check the fixed entry1 slot instead of a hashtable */
769 if (tbm->status == TBM_ONE_PAGE)
772 if (page->blockno != pageno)
774 Assert(!page->ischunk);
/* otherwise look the page up in the hashtable */
778 page = (PagetableEntry *) hash_search(tbm->pagetable,
784 return NULL; /* don't want a lossy chunk header */
789 * tbm_get_pageentry - find or create a PagetableEntry for the pageno
791 * If new, the entry is marked as an exact (non-chunk) entry.
793 * This may cause the table to exceed the desired memory size. It is
794 * up to the caller to call tbm_lossify() at the next safe point if so.
796 static PagetableEntry *
797 tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno)
799 PagetableEntry *page;
802 if (tbm->status == TBM_EMPTY)
804 /* Use the fixed slot */
807 tbm->status = TBM_ONE_PAGE;
811 if (tbm->status == TBM_ONE_PAGE)
/* the fixed slot matches only if it already holds this page */
814 if (page->blockno == pageno)
816 /* Time to switch from one page to a hashtable */
817 tbm_create_pagetable(tbm);
820 /* Look up or create an entry */
821 page = (PagetableEntry *) hash_search(tbm->pagetable,
826 /* Initialize it if not present before */
/* dynahash does not zero new entries, so clear the whole struct first */
829 MemSet(page, 0, sizeof(PagetableEntry));
830 page->blockno = pageno;
831 /* must count it too */
840 * tbm_page_is_lossy - is the page marked as lossily stored?
843 tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno)
845 PagetableEntry *page;
846 BlockNumber chunk_pageno;
849 /* we can skip the lookup if there are no lossy chunks */
850 if (tbm->nchunks == 0)
852 Assert(tbm->status == TBM_HASH);
/* locate the chunk header that would cover this page */
854 bitno = pageno % PAGES_PER_CHUNK;
855 chunk_pageno = pageno - bitno;
856 page = (PagetableEntry *) hash_search(tbm->pagetable,
857 (void *) &chunk_pageno,
859 if (page != NULL && page->ischunk)
861 int wordnum = WORDNUM(bitno);
862 int bitnum = BITNUM(bitno);
/* the page is lossy iff its bit within the chunk's bitmap is set */
864 if ((page->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0)
871 * tbm_mark_page_lossy - mark the page number as lossily stored
873 * This may cause the table to exceed the desired memory size. It is
874 * up to the caller to call tbm_lossify() at the next safe point if so.
877 tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno)
879 PagetableEntry *page;
881 BlockNumber chunk_pageno;
886 /* We force the bitmap into hashtable mode whenever it's lossy */
887 if (tbm->status != TBM_HASH)
888 tbm_create_pagetable(tbm);
/* compute the chunk header page covering pageno */
890 bitno = pageno % PAGES_PER_CHUNK;
891 chunk_pageno = pageno - bitno;
894 * Remove any extant non-lossy entry for the page. If the page is its own
895 * chunk header, however, we skip this and handle the case below.
899 if (hash_search(tbm->pagetable,
901 HASH_REMOVE, NULL) != NULL)
903 /* It was present, so adjust counts */
/* an entry removed here cannot be a chunk, else tbm_page_is_lossy would have found it */
905 tbm->npages--; /* assume it must have been non-lossy */
909 /* Look up or create entry for chunk-header page */
910 page = (PagetableEntry *) hash_search(tbm->pagetable,
911 (void *) &chunk_pageno,
914 /* Initialize it if not present before */
917 MemSet(page, 0, sizeof(PagetableEntry));
918 page->blockno = chunk_pageno;
919 page->ischunk = true;
920 /* must count it too */
924 else if (!page->ischunk)
926 /* chunk header page was formerly non-lossy, make it lossy */
927 MemSet(page, 0, sizeof(PagetableEntry));
928 page->blockno = chunk_pageno;
929 page->ischunk = true;
930 /* we assume it had some tuple bit(s) set, so mark it lossy */
/* bit 0 of a chunk represents the chunk-header page itself */
931 page->words[0] = ((bitmapword) 1 << 0);
937 /* Now set the original target page's bit */
938 wordnum = WORDNUM(bitno);
939 bitnum = BITNUM(bitno);
940 page->words[wordnum] |= ((bitmapword) 1 << bitnum);
944 * tbm_lossify - lose some information to get back under the memory limit
947 tbm_lossify(TIDBitmap *tbm)
949 HASH_SEQ_STATUS status;
950 PagetableEntry *page;
953 * XXX Really stupid implementation: this just lossifies pages in
954 * essentially random order. We should be paying some attention to the
955 * number of bits set in each page, instead.
957 * Since we are called as soon as nentries exceeds maxentries, we should
958 * push nentries down to significantly less than maxentries, or else we'll
959 * just end up doing this again very soon. We shoot for maxentries/2.
961 Assert(!tbm->iterating);
962 Assert(tbm->status == TBM_HASH);
964 hash_seq_init(&status, tbm->pagetable);
965 while ((page = (PagetableEntry *) hash_seq_search(&status)) != NULL)
968 continue; /* already a chunk header */
971 * If the page would become a chunk header, we won't save anything by
972 * converting it to lossy, so skip it.
974 if ((page->blockno % PAGES_PER_CHUNK) == 0)
977 /* This does the dirty work ... */
978 tbm_mark_page_lossy(tbm, page->blockno);
980 if (tbm->nentries <= tbm->maxentries / 2)
982 /* we have done enough */
/* terminate the seq scan cleanly before breaking out early */
983 hash_seq_term(&status);
988 * Note: tbm_mark_page_lossy may have inserted a lossy chunk into the
989 * hashtable. We can continue the same seq_search scan since we do
990 * not care whether we visit lossy chunks or not.
995 * With a big bitmap and small work_mem, it's possible that we cannot get
996 * under maxentries. Again, if that happens, we'd end up uselessly
997 * calling tbm_lossify over and over. To prevent this from becoming a
998 * performance sink, force maxentries up to at least double the current
999 * number of entries. (In essence, we're admitting inability to fit
1000 * within work_mem when we do this.) Note that this test will not fire if
1001 * we broke out of the loop early; and if we didn't, the current number of
1002 * entries is simply not reducible any further.
/* Min(..., (INT_MAX - 1) / 2) * 2 keeps the doubled value from overflowing int */
1004 if (tbm->nentries > tbm->maxentries / 2)
1005 tbm->maxentries = Min(tbm->nentries, (INT_MAX - 1) / 2) * 2;
1009 * qsort comparator to handle PagetableEntry pointers.
1012 tbm_comparator(const void *left, const void *right)
1014 BlockNumber l = (*((PagetableEntry *const *) left))->blockno;
1015 BlockNumber r = (*((PagetableEntry *const *) right))->blockno;