]> granicus.if.org Git - postgresql/blob - src/backend/nodes/tidbitmap.c
ae7a913c12c77c0e6ca1b13c10011fe32572c45c
[postgresql] / src / backend / nodes / tidbitmap.c
1 /*-------------------------------------------------------------------------
2  *
3  * tidbitmap.c
4  *        PostgreSQL tuple-id (TID) bitmap package
5  *
6  * This module provides bitmap data structures that are spiritually
7  * similar to Bitmapsets, but are specially adapted to store sets of
8  * tuple identifiers (TIDs), or ItemPointers.  In particular, the division
9  * of an ItemPointer into BlockNumber and OffsetNumber is catered for.
10  * Also, since we wish to be able to store very large tuple sets in
11  * memory with this data structure, we support "lossy" storage, in which
12  * we no longer remember individual tuple offsets on a page but only the
13  * fact that a particular page needs to be visited.
14  *
15  * The "lossy" storage uses one bit per disk page, so at the standard 8K
16  * BLCKSZ, we can represent all pages in 64Gb of disk space in about 1Mb
17  * of memory.  People pushing around tables of that size should have a
18  * couple of Mb to spare, so we don't worry about providing a second level
19  * of lossiness.  In theory we could fall back to page ranges at some
20  * point, but for now that seems useless complexity.
21  *
22  * We also support the notion of candidate matches, or rechecking.  This
23  * means we know that a search need visit only some tuples on a page,
24  * but we are not certain that all of those tuples are real matches.
25  * So the eventual heap scan must recheck the quals for these tuples only,
26  * rather than rechecking the quals for all tuples on the page as in the
27  * lossy-bitmap case.  Rechecking can be specified when TIDs are inserted
28  * into a bitmap, and it can also happen internally when we AND a lossy
29  * and a non-lossy page.
30  *
31  *
32  * Copyright (c) 2003-2017, PostgreSQL Global Development Group
33  *
34  * IDENTIFICATION
35  *        src/backend/nodes/tidbitmap.c
36  *
37  *-------------------------------------------------------------------------
38  */
39 #include "postgres.h"
40
41 #include <limits.h>
42
43 #include "access/htup_details.h"
44 #include "nodes/bitmapset.h"
45 #include "nodes/tidbitmap.h"
46 #include "storage/lwlock.h"
47 #include "utils/dsa.h"
48
49 /*
50  * The maximum number of tuples per page is not large (typically 256 with
51  * 8K pages, or 1024 with 32K pages).  So there's not much point in making
52  * the per-page bitmaps variable size.  We just legislate that the size
53  * is this:
54  */
55 #define MAX_TUPLES_PER_PAGE  MaxHeapTuplesPerPage
56
57 /*
58  * When we have to switch over to lossy storage, we use a data structure
59  * with one bit per page, where all pages having the same number DIV
60  * PAGES_PER_CHUNK are aggregated into one chunk.  When a chunk is present
61  * and has the bit set for a given page, there must not be a per-page entry
62  * for that page in the page table.
63  *
64  * We actually store both exact pages and lossy chunks in the same hash
65  * table, using identical data structures.  (This is because the memory
66  * management for hashtables doesn't easily/efficiently allow space to be
67  * transferred easily from one hashtable to another.)  Therefore it's best
68  * if PAGES_PER_CHUNK is the same as MAX_TUPLES_PER_PAGE, or at least not
69  * too different.  But we also want PAGES_PER_CHUNK to be a power of 2 to
70  * avoid expensive integer remainder operations.  So, define it like this:
71  */
72 #define PAGES_PER_CHUNK  (BLCKSZ / 32)
73
74 /* We use BITS_PER_BITMAPWORD and typedef bitmapword from nodes/bitmapset.h */
75
76 #define WORDNUM(x)      ((x) / BITS_PER_BITMAPWORD)
77 #define BITNUM(x)       ((x) % BITS_PER_BITMAPWORD)
78
79 /* number of active words for an exact page: */
80 #define WORDS_PER_PAGE  ((MAX_TUPLES_PER_PAGE - 1) / BITS_PER_BITMAPWORD + 1)
81 /* number of active words for a lossy chunk: */
82 #define WORDS_PER_CHUNK  ((PAGES_PER_CHUNK - 1) / BITS_PER_BITMAPWORD + 1)
83
84 /*
85  * The hashtable entries are represented by this data structure.  For
86  * an exact page, blockno is the page number and bit k of the bitmap
87  * represents tuple offset k+1.  For a lossy chunk, blockno is the first
88  * page in the chunk (this must be a multiple of PAGES_PER_CHUNK) and
89  * bit k represents page blockno+k.  Note that it is not possible to
90  * have exact storage for the first page of a chunk if we are using
91  * lossy storage for any page in the chunk's range, since the same
92  * hashtable entry has to serve both purposes.
93  *
94  * recheck is used only on exact pages --- it indicates that although
95  * only the stated tuples need be checked, the full index qual condition
96  * must be checked for each (ie, these are candidate matches).
97  */
98 typedef struct PagetableEntry
99 {
100         BlockNumber blockno;            /* page number (hashtable key) */
101         char            status;                 /* hash entry status */
102         bool            ischunk;                /* T = lossy storage, F = exact */
103         bool            recheck;                /* should the tuples be rechecked? */
104         bitmapword      words[Max(WORDS_PER_PAGE, WORDS_PER_CHUNK)];
105 } PagetableEntry;
106
/*
 * Holds the array of pagetable entries used during shared iteration.
 * The refcount is adjusted atomically as iterators attach/detach; the
 * storage is freed when it drops to zero (see tbm_free_shared_area).
 */
typedef struct PTEntryArray
{
	pg_atomic_uint32 refcount;	/* number of iterators attached */
	PagetableEntry ptentry[FLEXIBLE_ARRAY_MEMBER];	/* the entries proper */
} PTEntryArray;
115
116 /*
117  * We want to avoid the overhead of creating the hashtable, which is
118  * comparatively large, when not necessary. Particularly when we are using a
119  * bitmap scan on the inside of a nestloop join: a bitmap may well live only
120  * long enough to accumulate one entry in such cases.  We therefore avoid
121  * creating an actual hashtable until we need two pagetable entries.  When
122  * just one pagetable entry is needed, we store it in a fixed field of
123  * TIDBitMap.  (NOTE: we don't get rid of the hashtable if the bitmap later
124  * shrinks down to zero or one page again.  So, status can be TBM_HASH even
125  * when nentries is zero or one.)
126  */
127 typedef enum
128 {
129         TBM_EMPTY,                                      /* no hashtable, nentries == 0 */
130         TBM_ONE_PAGE,                           /* entry1 contains the single entry */
131         TBM_HASH                                        /* pagetable is valid, entry1 is not */
132 } TBMStatus;
133
134 /*
135  * Current iterating state of the TBM.
136  */
137 typedef enum
138 {
139         TBM_NOT_ITERATING,                      /* not yet converted to page and chunk array */
140         TBM_ITERATING_PRIVATE,          /* converted to local page and chunk array */
141         TBM_ITERATING_SHARED            /* converted to shared page and chunk array */
142 } TBMIteratingState;
143
/*
 * Here is the representation for a whole TIDBitMap:
 */
struct TIDBitmap
{
	NodeTag		type;			/* to make it a valid Node */
	MemoryContext mcxt;			/* memory context containing me */
	TBMStatus	status;			/* see codes above */
	struct pagetable_hash *pagetable;	/* hash table of PagetableEntry's */
	int			nentries;		/* number of entries in pagetable */
	int			maxentries;		/* limit on same to meet maxbytes */
	int			npages;			/* number of exact entries in pagetable */
	int			nchunks;		/* number of lossy entries in pagetable */
	TBMIteratingState iterating;	/* TBM_NOT_ITERATING, or which kind of
									 * iteration has begun (see above) */
	uint32		lossify_start;	/* offset to start lossifying hashtable at */
	PagetableEntry entry1;		/* used when status == TBM_ONE_PAGE */
	/* these are valid only when iterating != TBM_NOT_ITERATING: */
	PagetableEntry **spages;	/* sorted exact-page list, or NULL */
	PagetableEntry **schunks;	/* sorted lossy-chunk list, or NULL */
	dsa_pointer dsapagetable;	/* dsa_pointer to the element array */
	dsa_pointer dsapagetableold;	/* dsa_pointer to the old element array */
	dsa_pointer ptpages;		/* dsa_pointer to the page array */
	dsa_pointer ptchunks;		/* dsa_pointer to the chunk array */
	dsa_area   *dsa;			/* reference to per-query dsa area */
};
169
170 /*
171  * When iterating over a bitmap in sorted order, a TBMIterator is used to
172  * track our progress.  There can be several iterators scanning the same
173  * bitmap concurrently.  Note that the bitmap becomes read-only as soon as
174  * any iterator is created.
175  */
176 struct TBMIterator
177 {
178         TIDBitmap  *tbm;                        /* TIDBitmap we're iterating over */
179         int                     spageptr;               /* next spages index */
180         int                     schunkptr;              /* next schunks index */
181         int                     schunkbit;              /* next bit to check in current schunk */
182         TBMIterateResult output;        /* MUST BE LAST (because variable-size) */
183 };
184
185 /*
186  * Holds the shared members of the iterator so that multiple processes
187  * can jointly iterate.
188  */
189 typedef struct TBMSharedIteratorState
190 {
191         int                     nentries;               /* number of entries in pagetable */
192         int                     maxentries;             /* limit on same to meet maxbytes */
193         int                     npages;                 /* number of exact entries in pagetable */
194         int                     nchunks;                /* number of lossy entries in pagetable */
195         dsa_pointer pagetable;          /* dsa pointers to head of pagetable data */
196         dsa_pointer spages;                     /* dsa pointer to page array */
197         dsa_pointer schunks;            /* dsa pointer to chunk array */
198         LWLock          lock;                   /* lock to protect below members */
199         int                     spageptr;               /* next spages index */
200         int                     schunkptr;              /* next schunks index */
201         int                     schunkbit;              /* next bit to check in current schunk */
202 } TBMSharedIteratorState;
203
/*
 * Pagetable iteration array: indexes into the shared pagetable element
 * array, used to visit entries in sorted order during shared iteration.
 * Refcounted like PTEntryArray.
 */
typedef struct PTIterationArray
{
	pg_atomic_uint32 refcount;	/* number of iterators attached */
	int			index[FLEXIBLE_ARRAY_MEMBER];	/* index array */
} PTIterationArray;
212
213 /*
214  * same as TBMIterator, but it is used for joint iteration, therefore this
215  * also holds a reference to the shared state.
216  */
217 struct TBMSharedIterator
218 {
219         TBMSharedIteratorState *state;          /* shared state */
220         PTEntryArray *ptbase;           /* pagetable element array */
221         PTIterationArray *ptpages;      /* sorted exact page index list */
222         PTIterationArray *ptchunks; /* sorted lossy page index list */
223         TBMIterateResult output;        /* MUST BE LAST (because variable-size) */
224 };
225
226 /* Local function prototypes */
227 static void tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage);
228 static bool tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage,
229                                    const TIDBitmap *b);
230 static const PagetableEntry *tbm_find_pageentry(const TIDBitmap *tbm,
231                                    BlockNumber pageno);
232 static PagetableEntry *tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno);
233 static bool tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno);
234 static void tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno);
235 static void tbm_lossify(TIDBitmap *tbm);
236 static int      tbm_comparator(const void *left, const void *right);
237 static int tbm_shared_comparator(const void *left, const void *right,
238                                           void *arg);
239
240 /*
241  * Simple inline murmur hash implementation for the exact width required, for
242  * performance.
243  */
244 static inline uint32
245 hash_blockno(BlockNumber b)
246 {
247         uint32          h = b;
248
249         h ^= h >> 16;
250         h *= 0x85ebca6b;
251         h ^= h >> 13;
252         h *= 0xc2b2ae35;
253         h ^= h >> 16;
254         return h;
255 }
256
257 /* define hashtable mapping block numbers to PagetableEntry's */
258 #define SH_USE_NONDEFAULT_ALLOCATOR
259 #define SH_PREFIX pagetable
260 #define SH_ELEMENT_TYPE PagetableEntry
261 #define SH_KEY_TYPE BlockNumber
262 #define SH_KEY blockno
263 #define SH_HASH_KEY(tb, key) hash_blockno(key)
264 #define SH_EQUAL(tb, a, b) a == b
265 #define SH_SCOPE static inline
266 #define SH_DEFINE
267 #define SH_DECLARE
268 #include "lib/simplehash.h"
269
270
271 /*
272  * tbm_create - create an initially-empty bitmap
273  *
274  * The bitmap will live in the memory context that is CurrentMemoryContext
275  * at the time of this call.  It will be limited to (approximately) maxbytes
276  * total memory consumption.  If the DSA passed to this function is not NULL
277  * then the memory for storing elements of the underlying page table will
278  * be allocated from the DSA.
279  */
280 TIDBitmap *
281 tbm_create(long maxbytes, dsa_area *dsa)
282 {
283         TIDBitmap  *tbm;
284         long            nbuckets;
285
286         /* Create the TIDBitmap struct and zero all its fields */
287         tbm = makeNode(TIDBitmap);
288
289         tbm->mcxt = CurrentMemoryContext;
290         tbm->status = TBM_EMPTY;
291
292         /*
293          * Estimate number of hashtable entries we can have within maxbytes. This
294          * estimates the hash cost as sizeof(PagetableEntry), which is good enough
295          * for our purpose.  Also count an extra Pointer per entry for the arrays
296          * created during iteration readout.
297          */
298         nbuckets = maxbytes /
299                 (sizeof(PagetableEntry) + sizeof(Pointer) + sizeof(Pointer));
300         nbuckets = Min(nbuckets, INT_MAX - 1);          /* safety limit */
301         nbuckets = Max(nbuckets, 16);           /* sanity limit */
302         tbm->maxentries = (int) nbuckets;
303         tbm->lossify_start = 0;
304         tbm->dsa = dsa;
305         tbm->dsapagetable = InvalidDsaPointer;
306         tbm->dsapagetableold = InvalidDsaPointer;
307         tbm->ptpages = InvalidDsaPointer;
308         tbm->ptchunks = InvalidDsaPointer;
309
310         return tbm;
311 }
312
/*
 * Actually create the hashtable.  Since this is a moderately expensive
 * proposition, we don't do it until we have to.
 */
static void
tbm_create_pagetable(TIDBitmap *tbm)
{
	Assert(tbm->status != TBM_HASH);
	Assert(tbm->pagetable == NULL);

	tbm->pagetable = pagetable_create(tbm->mcxt, 128, tbm);

	/* If entry1 is valid, push it into the hashtable */
	if (tbm->status == TBM_ONE_PAGE)
	{
		PagetableEntry *page;
		bool		found;
		char		oldstatus;

		page = pagetable_insert(tbm->pagetable,
								tbm->entry1.blockno,
								&found);
		Assert(!found);

		/*
		 * Copy entry1 wholesale into the new hash entry, but preserve the
		 * hash-entry status byte that pagetable_insert just initialized:
		 * entry1's own status field is meaningless in the hashtable and
		 * must not clobber it.
		 */
		oldstatus = page->status;
		memcpy(page, &tbm->entry1, sizeof(PagetableEntry));
		page->status = oldstatus;
	}

	tbm->status = TBM_HASH;
}
343
344 /*
345  * tbm_free - free a TIDBitmap
346  */
347 void
348 tbm_free(TIDBitmap *tbm)
349 {
350         if (tbm->pagetable)
351                 pagetable_destroy(tbm->pagetable);
352         if (tbm->spages)
353                 pfree(tbm->spages);
354         if (tbm->schunks)
355                 pfree(tbm->schunks);
356         pfree(tbm);
357 }
358
359 /*
360  * tbm_free_shared_area - free shared state
361  *
362  * Free shared iterator state, Also free shared pagetable and iterator arrays
363  * memory if they are not referred by any of the shared iterator i.e recount
364  * is becomes 0.
365  */
366 void
367 tbm_free_shared_area(dsa_area *dsa, dsa_pointer dp)
368 {
369         TBMSharedIteratorState *istate = dsa_get_address(dsa, dp);
370         PTEntryArray *ptbase;
371         PTIterationArray *ptpages;
372         PTIterationArray *ptchunks;
373
374         if (DsaPointerIsValid(istate->pagetable))
375         {
376                 ptbase = dsa_get_address(dsa, istate->pagetable);
377                 if (pg_atomic_sub_fetch_u32(&ptbase->refcount, 1) == 0)
378                         dsa_free(dsa, istate->pagetable);
379         }
380         if (DsaPointerIsValid(istate->spages))
381         {
382                 ptpages = dsa_get_address(dsa, istate->spages);
383                 if (pg_atomic_sub_fetch_u32(&ptpages->refcount, 1) == 0)
384                         dsa_free(dsa, istate->spages);
385         }
386         if (DsaPointerIsValid(istate->schunks))
387         {
388                 ptchunks = dsa_get_address(dsa, istate->schunks);
389                 if (pg_atomic_sub_fetch_u32(&ptchunks->refcount, 1) == 0)
390                         dsa_free(dsa, istate->schunks);
391         }
392
393         dsa_free(dsa, dp);
394 }
395
/*
 * tbm_add_tuples - add some tuple IDs to a TIDBitmap
 *
 * If recheck is true, then the recheck flag will be set in the
 * TBMIterateResult when any of these tuples are reported out.
 */
void
tbm_add_tuples(TIDBitmap *tbm, const ItemPointer tids, int ntids,
			   bool recheck)
{
	BlockNumber currblk = InvalidBlockNumber;
	PagetableEntry *page = NULL;	/* only valid when currblk is valid */
	int			i;

	/* The bitmap becomes read-only once any iteration has begun */
	Assert(tbm->iterating == TBM_NOT_ITERATING);
	for (i = 0; i < ntids; i++)
	{
		BlockNumber blk = ItemPointerGetBlockNumber(tids + i);
		OffsetNumber off = ItemPointerGetOffsetNumber(tids + i);
		int			wordnum,
					bitnum;

		/* safety check to ensure we don't overrun bit array bounds */
		if (off < 1 || off > MAX_TUPLES_PER_PAGE)
			elog(ERROR, "tuple offset out of range: %u", off);

		/*
		 * Look up target page unless we already did.  This saves cycles when
		 * the input includes consecutive tuples on the same page, which is
		 * common enough to justify an extra test here.
		 */
		if (blk != currblk)
		{
			if (tbm_page_is_lossy(tbm, blk))
				page = NULL;	/* remember page is lossy */
			else
				page = tbm_get_pageentry(tbm, blk);
			currblk = blk;
		}

		if (page == NULL)
			continue;			/* whole page is already marked */

		if (page->ischunk)
		{
			/* The page is a lossy chunk header, set bit for itself */
			wordnum = bitnum = 0;
		}
		else
		{
			/* Page is exact, so set bit for individual tuple */
			wordnum = WORDNUM(off - 1);
			bitnum = BITNUM(off - 1);
		}
		page->words[wordnum] |= ((bitmapword) 1 << bitnum);
		page->recheck |= recheck;

		/* Enforce the memory budget; lossify if we created too many entries */
		if (tbm->nentries > tbm->maxentries)
		{
			tbm_lossify(tbm);
			/* Page could have been converted to lossy, so force new lookup */
			currblk = InvalidBlockNumber;
		}
	}
}
461
462 /*
463  * tbm_add_page - add a whole page to a TIDBitmap
464  *
465  * This causes the whole page to be reported (with the recheck flag)
466  * when the TIDBitmap is scanned.
467  */
468 void
469 tbm_add_page(TIDBitmap *tbm, BlockNumber pageno)
470 {
471         /* Enter the page in the bitmap, or mark it lossy if already present */
472         tbm_mark_page_lossy(tbm, pageno);
473         /* If we went over the memory limit, lossify some more pages */
474         if (tbm->nentries > tbm->maxentries)
475                 tbm_lossify(tbm);
476 }
477
478 /*
479  * tbm_union - set union
480  *
481  * a is modified in-place, b is not changed
482  */
483 void
484 tbm_union(TIDBitmap *a, const TIDBitmap *b)
485 {
486         Assert(!a->iterating);
487         /* Nothing to do if b is empty */
488         if (b->nentries == 0)
489                 return;
490         /* Scan through chunks and pages in b, merge into a */
491         if (b->status == TBM_ONE_PAGE)
492                 tbm_union_page(a, &b->entry1);
493         else
494         {
495                 pagetable_iterator i;
496                 PagetableEntry *bpage;
497
498                 Assert(b->status == TBM_HASH);
499                 pagetable_start_iterate(b->pagetable, &i);
500                 while ((bpage = pagetable_iterate(b->pagetable, &i)) != NULL)
501                         tbm_union_page(a, bpage);
502         }
503 }
504
505 /* Process one page of b during a union op */
506 static void
507 tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage)
508 {
509         PagetableEntry *apage;
510         int                     wordnum;
511
512         if (bpage->ischunk)
513         {
514                 /* Scan b's chunk, mark each indicated page lossy in a */
515                 for (wordnum = 0; wordnum < WORDS_PER_CHUNK; wordnum++)
516                 {
517                         bitmapword      w = bpage->words[wordnum];
518
519                         if (w != 0)
520                         {
521                                 BlockNumber pg;
522
523                                 pg = bpage->blockno + (wordnum * BITS_PER_BITMAPWORD);
524                                 while (w != 0)
525                                 {
526                                         if (w & 1)
527                                                 tbm_mark_page_lossy(a, pg);
528                                         pg++;
529                                         w >>= 1;
530                                 }
531                         }
532                 }
533         }
534         else if (tbm_page_is_lossy(a, bpage->blockno))
535         {
536                 /* page is already lossy in a, nothing to do */
537                 return;
538         }
539         else
540         {
541                 apage = tbm_get_pageentry(a, bpage->blockno);
542                 if (apage->ischunk)
543                 {
544                         /* The page is a lossy chunk header, set bit for itself */
545                         apage->words[0] |= ((bitmapword) 1 << 0);
546                 }
547                 else
548                 {
549                         /* Both pages are exact, merge at the bit level */
550                         for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
551                                 apage->words[wordnum] |= bpage->words[wordnum];
552                         apage->recheck |= bpage->recheck;
553                 }
554         }
555
556         if (a->nentries > a->maxentries)
557                 tbm_lossify(a);
558 }
559
560 /*
561  * tbm_intersect - set intersection
562  *
563  * a is modified in-place, b is not changed
564  */
565 void
566 tbm_intersect(TIDBitmap *a, const TIDBitmap *b)
567 {
568         Assert(!a->iterating);
569         /* Nothing to do if a is empty */
570         if (a->nentries == 0)
571                 return;
572         /* Scan through chunks and pages in a, try to match to b */
573         if (a->status == TBM_ONE_PAGE)
574         {
575                 if (tbm_intersect_page(a, &a->entry1, b))
576                 {
577                         /* Page is now empty, remove it from a */
578                         Assert(!a->entry1.ischunk);
579                         a->npages--;
580                         a->nentries--;
581                         Assert(a->nentries == 0);
582                         a->status = TBM_EMPTY;
583                 }
584         }
585         else
586         {
587                 pagetable_iterator i;
588                 PagetableEntry *apage;
589
590                 Assert(a->status == TBM_HASH);
591                 pagetable_start_iterate(a->pagetable, &i);
592                 while ((apage = pagetable_iterate(a->pagetable, &i)) != NULL)
593                 {
594                         if (tbm_intersect_page(a, apage, b))
595                         {
596                                 /* Page or chunk is now empty, remove it from a */
597                                 if (apage->ischunk)
598                                         a->nchunks--;
599                                 else
600                                         a->npages--;
601                                 a->nentries--;
602                                 if (!pagetable_delete(a->pagetable, apage->blockno))
603                                         elog(ERROR, "hash table corrupted");
604                         }
605                 }
606         }
607 }
608
/*
 * Process one page of a during an intersection op
 *
 * Returns TRUE if apage is now empty and should be deleted from a
 */
static bool
tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage, const TIDBitmap *b)
{
	const PagetableEntry *bpage;
	int			wordnum;

	if (apage->ischunk)
	{
		/* Scan each bit in chunk, try to clear */
		bool		candelete = true;

		for (wordnum = 0; wordnum < WORDS_PER_CHUNK; wordnum++)
		{
			bitmapword	w = apage->words[wordnum];

			if (w != 0)
			{
				bitmapword	neww = w;
				BlockNumber pg;
				int			bitnum;

				/* first page number represented by this word */
				pg = apage->blockno + (wordnum * BITS_PER_BITMAPWORD);
				bitnum = 0;
				while (w != 0)
				{
					if (w & 1)
					{
						/*
						 * The lossy bit survives only if page pg appears in
						 * b in some form (lossy or exact).
						 */
						if (!tbm_page_is_lossy(b, pg) &&
							tbm_find_pageentry(b, pg) == NULL)
						{
							/* Page is not in b at all, lose lossy bit */
							neww &= ~((bitmapword) 1 << bitnum);
						}
					}
					pg++;
					bitnum++;
					w >>= 1;
				}
				apage->words[wordnum] = neww;
				if (neww != 0)
					candelete = false;
			}
		}
		return candelete;
	}
	else if (tbm_page_is_lossy(b, apage->blockno))
	{
		/*
		 * Some of the tuples in 'a' might not satisfy the quals for 'b', but
		 * because the page 'b' is lossy, we don't know which ones. Therefore
		 * we mark 'a' as requiring rechecks, to indicate that at most those
		 * tuples set in 'a' are matches.
		 */
		apage->recheck = true;
		return false;
	}
	else
	{
		bool		candelete = true;

		bpage = tbm_find_pageentry(b, apage->blockno);
		if (bpage != NULL)
		{
			/* Both pages are exact, merge at the bit level */
			Assert(!bpage->ischunk);
			for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
			{
				apage->words[wordnum] &= bpage->words[wordnum];
				if (apage->words[wordnum] != 0)
					candelete = false;
			}
			apage->recheck |= bpage->recheck;
		}
		/* If there is no matching b page, we can just delete the a page */
		return candelete;
	}
}
691
692 /*
693  * tbm_is_empty - is a TIDBitmap completely empty?
694  */
695 bool
696 tbm_is_empty(const TIDBitmap *tbm)
697 {
698         return (tbm->nentries == 0);
699 }
700
/*
 * tbm_begin_iterate - prepare to iterate through a TIDBitmap
 *
 * The TBMIterator struct is created in the caller's memory context.
 * For a clean shutdown of the iteration, call tbm_end_iterate; but it's
 * okay to just allow the memory context to be released, too.  It is caller's
 * responsibility not to touch the TBMIterator anymore once the TIDBitmap
 * is freed.
 *
 * NB: after this is called, it is no longer allowed to modify the contents
 * of the bitmap.  However, you can call this multiple times to scan the
 * contents repeatedly, including parallel scans.
 */
TBMIterator *
tbm_begin_iterate(TIDBitmap *tbm)
{
	TBMIterator *iterator;

	/* A bitmap prepared for shared iteration can't be iterated privately */
	Assert(tbm->iterating != TBM_ITERATING_SHARED);

	/*
	 * Create the TBMIterator struct, with enough trailing space to serve the
	 * needs of the TBMIterateResult sub-struct.
	 */
	iterator = (TBMIterator *) palloc(sizeof(TBMIterator) +
								 MAX_TUPLES_PER_PAGE * sizeof(OffsetNumber));
	iterator->tbm = tbm;

	/*
	 * Initialize iteration pointers.
	 */
	iterator->spageptr = 0;
	iterator->schunkptr = 0;
	iterator->schunkbit = 0;

	/*
	 * If we have a hashtable, create and fill the sorted page lists, unless
	 * we already did that for a previous iterator.  Note that the lists are
	 * attached to the bitmap not the iterator, so they can be used by more
	 * than one iterator.
	 */
	if (tbm->status == TBM_HASH && tbm->iterating == TBM_NOT_ITERATING)
	{
		pagetable_iterator i;
		PagetableEntry *page;
		int			npages;
		int			nchunks;

		/* Arrays live in the bitmap's own context, so they outlive us */
		if (!tbm->spages && tbm->npages > 0)
			tbm->spages = (PagetableEntry **)
				MemoryContextAlloc(tbm->mcxt,
								   tbm->npages * sizeof(PagetableEntry *));
		if (!tbm->schunks && tbm->nchunks > 0)
			tbm->schunks = (PagetableEntry **)
				MemoryContextAlloc(tbm->mcxt,
								   tbm->nchunks * sizeof(PagetableEntry *));

		/* Partition hashtable entries into the page and chunk arrays */
		npages = nchunks = 0;
		pagetable_start_iterate(tbm->pagetable, &i);
		while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
		{
			if (page->ischunk)
				tbm->schunks[nchunks++] = page;
			else
				tbm->spages[npages++] = page;
		}
		Assert(npages == tbm->npages);
		Assert(nchunks == tbm->nchunks);
		/* Sort both arrays by block number so readout is in disk order */
		if (npages > 1)
			qsort(tbm->spages, npages, sizeof(PagetableEntry *),
				  tbm_comparator);
		if (nchunks > 1)
			qsort(tbm->schunks, nchunks, sizeof(PagetableEntry *),
				  tbm_comparator);
	}

	tbm->iterating = TBM_ITERATING_PRIVATE;

	return iterator;
}
781
782 /*
783  * tbm_prepare_shared_iterate - prepare shared iteration state for a TIDBitmap.
784  *
785  * The necessary shared state will be allocated from the DSA passed to
786  * tbm_create, so that multiple processes can attach to it and iterate jointly.
787  *
788  * This will convert the pagetable hash into page and chunk array of the index
789  * into pagetable array.
790  */
dsa_pointer
tbm_prepare_shared_iterate(TIDBitmap *tbm)
{
	dsa_pointer dp;
	TBMSharedIteratorState *istate;
	PTEntryArray *ptbase = NULL;
	PTIterationArray *ptpages = NULL;
	PTIterationArray *ptchunks = NULL;

	/* Shared iteration requires a DSA, and excludes private iteration */
	Assert(tbm->dsa != NULL);
	Assert(tbm->iterating != TBM_ITERATING_PRIVATE);

	/*
	 * Allocate TBMSharedIteratorState from DSA to hold the shared members
	 * and lock; it will also be used by multiple workers for shared
	 * iteration.
	 */
	dp = dsa_allocate0(tbm->dsa, sizeof(TBMSharedIteratorState));
	istate = dsa_get_address(tbm->dsa, dp);

	/*
	 * If we're not already iterating, create and fill the sorted page lists.
	 * (If we are, the sorted page lists are already stored in the TIDBitmap,
	 * and we can just reuse them.)
	 */
	if (tbm->iterating == TBM_NOT_ITERATING)
	{
		pagetable_iterator i;
		PagetableEntry *page;
		int			idx;
		int			npages;
		int			nchunks;

		/*
		 * Allocate the page and chunk array memory from the DSA to share
		 * across multiple processes.
		 */
		if (tbm->npages)
		{
			tbm->ptpages = dsa_allocate(tbm->dsa, sizeof(PTIterationArray) +
										tbm->npages * sizeof(int));
			ptpages = dsa_get_address(tbm->dsa, tbm->ptpages);
			pg_atomic_init_u32(&ptpages->refcount, 0);
		}
		if (tbm->nchunks)
		{
			tbm->ptchunks = dsa_allocate(tbm->dsa, sizeof(PTIterationArray) +
										 tbm->nchunks * sizeof(int));
			ptchunks = dsa_get_address(tbm->dsa, tbm->ptchunks);
			pg_atomic_init_u32(&ptchunks->refcount, 0);
		}

		/*
		 * If TBM status is TBM_HASH then iterate over the pagetable and
		 * convert it to page and chunk arrays.  But if it's in the
		 * TBM_ONE_PAGE mode then directly allocate the space for one entry
		 * from the DSA.
		 */
		npages = nchunks = 0;
		if (tbm->status == TBM_HASH)
		{
			/* pagetable entries already live in DSA; record their indexes */
			ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);

			pagetable_start_iterate(tbm->pagetable, &i);
			while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
			{
				idx = page - ptbase->ptentry;
				if (page->ischunk)
					ptchunks->index[nchunks++] = idx;
				else
					ptpages->index[npages++] = idx;
			}

			Assert(npages == tbm->npages);
			Assert(nchunks == tbm->nchunks);
		}
		else if (tbm->status == TBM_ONE_PAGE)
		{
			/*
			 * In one page mode allocate the space for one pagetable entry
			 * and directly store its index, i.e. 0, in the page array.
			 */
			tbm->dsapagetable = dsa_allocate(tbm->dsa, sizeof(PTEntryArray) +
											 sizeof(PagetableEntry));
			ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
			ptpages->index[0] = 0;
		}

		if (ptbase != NULL)
			pg_atomic_init_u32(&ptbase->refcount, 0);
		/* Sort indexes by blockno so pages are delivered in order */
		if (npages > 1)
			qsort_arg((void *) (ptpages->index), npages, sizeof(int),
					  tbm_shared_comparator, (void *) ptbase->ptentry);
		if (nchunks > 1)
			qsort_arg((void *) (ptchunks->index), nchunks, sizeof(int),
					  tbm_shared_comparator, (void *) ptbase->ptentry);
	}

	/*
	 * Store the TBM members in the shared state so that we can share them
	 * across multiple processes.
	 */
	istate->nentries = tbm->nentries;
	istate->maxentries = tbm->maxentries;
	istate->npages = tbm->npages;
	istate->nchunks = tbm->nchunks;
	istate->pagetable = tbm->dsapagetable;
	istate->spages = tbm->ptpages;
	istate->schunks = tbm->ptchunks;

	/* Re-fetch local addresses (these may be invalid DSA pointers; NULL then) */
	ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
	ptpages = dsa_get_address(tbm->dsa, tbm->ptpages);
	ptchunks = dsa_get_address(tbm->dsa, tbm->ptchunks);

	/*
	 * For every shared iterator referring to the pagetable and iteration
	 * arrays, increase the refcount by 1 so that while freeing the shared
	 * iterator we don't free the pagetable and iteration arrays until the
	 * refcount becomes 0.
	 */
	if (ptbase != NULL)
		pg_atomic_add_fetch_u32(&ptbase->refcount, 1);
	if (ptpages != NULL)
		pg_atomic_add_fetch_u32(&ptpages->refcount, 1);
	if (ptchunks != NULL)
		pg_atomic_add_fetch_u32(&ptchunks->refcount, 1);

	/* Initialize the iterator lock */
	LWLockInitialize(&istate->lock, LWTRANCHE_TBM);

	/* Initialize the shared iterator state */
	istate->schunkbit = 0;
	istate->schunkptr = 0;
	istate->spageptr = 0;

	tbm->iterating = TBM_ITERATING_SHARED;

	return dp;
}
928
929 /*
930  * tbm_extract_page_tuple - extract the tuple offsets from a page
931  *
932  * The extracted offsets are stored into TBMIterateResult.
933  */
934 static inline int
935 tbm_extract_page_tuple(PagetableEntry *page, TBMIterateResult *output)
936 {
937         int                     wordnum;
938         int                     ntuples = 0;
939
940         for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
941         {
942                 bitmapword      w = page->words[wordnum];
943
944                 if (w != 0)
945                 {
946                         int                     off = wordnum * BITS_PER_BITMAPWORD + 1;
947
948                         while (w != 0)
949                         {
950                                 if (w & 1)
951                                         output->offsets[ntuples++] = (OffsetNumber) off;
952                                 off++;
953                                 w >>= 1;
954                         }
955                 }
956         }
957
958         return ntuples;
959 }
960
961 /*
962  *      tbm_advance_schunkbit - Advance the chunkbit
963  */
964 static inline void
965 tbm_advance_schunkbit(PagetableEntry *chunk, int *schunkbitp)
966 {
967         int                     schunkbit = *schunkbitp;
968
969         while (schunkbit < PAGES_PER_CHUNK)
970         {
971                 int                     wordnum = WORDNUM(schunkbit);
972                 int                     bitnum = BITNUM(schunkbit);
973
974                 if ((chunk->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0)
975                         break;
976                 schunkbit++;
977         }
978
979         *schunkbitp = schunkbit;
980 }
981
982 /*
983  * tbm_iterate - scan through next page of a TIDBitmap
984  *
985  * Returns a TBMIterateResult representing one page, or NULL if there are
986  * no more pages to scan.  Pages are guaranteed to be delivered in numerical
987  * order.  If result->ntuples < 0, then the bitmap is "lossy" and failed to
988  * remember the exact tuples to look at on this page --- the caller must
989  * examine all tuples on the page and check if they meet the intended
990  * condition.  If result->recheck is true, only the indicated tuples need
991  * be examined, but the condition must be rechecked anyway.  (For ease of
992  * testing, recheck is always set true when ntuples < 0.)
993  */
TBMIterateResult *
tbm_iterate(TBMIterator *iterator)
{
	TIDBitmap  *tbm = iterator->tbm;
	TBMIterateResult *output = &(iterator->output);

	Assert(tbm->iterating == TBM_ITERATING_PRIVATE);

	/*
	 * If lossy chunk pages remain, make sure we've advanced schunkptr/
	 * schunkbit to the next set bit.
	 */
	while (iterator->schunkptr < tbm->nchunks)
	{
		PagetableEntry *chunk = tbm->schunks[iterator->schunkptr];
		int			schunkbit = iterator->schunkbit;

		tbm_advance_schunkbit(chunk, &schunkbit);
		if (schunkbit < PAGES_PER_CHUNK)
		{
			/* found a set bit in this chunk; remember the position */
			iterator->schunkbit = schunkbit;
			break;
		}
		/* advance to next chunk */
		iterator->schunkptr++;
		iterator->schunkbit = 0;
	}

	/*
	 * If both chunk and per-page data remain, must output the numerically
	 * earlier page.
	 */
	if (iterator->schunkptr < tbm->nchunks)
	{
		PagetableEntry *chunk = tbm->schunks[iterator->schunkptr];
		BlockNumber chunk_blockno;

		/* the chunk's blockno is its base; schunkbit is the offset within */
		chunk_blockno = chunk->blockno + iterator->schunkbit;
		if (iterator->spageptr >= tbm->npages ||
			chunk_blockno < tbm->spages[iterator->spageptr]->blockno)
		{
			/* Return a lossy page indicator from the chunk */
			output->blockno = chunk_blockno;
			output->ntuples = -1;
			output->recheck = true;
			iterator->schunkbit++;
			return output;
		}
	}

	if (iterator->spageptr < tbm->npages)
	{
		PagetableEntry *page;
		int			ntuples;

		/* In ONE_PAGE state, we don't allocate an spages[] array */
		if (tbm->status == TBM_ONE_PAGE)
			page = &tbm->entry1;
		else
			page = tbm->spages[iterator->spageptr];

		/* scan bitmap to extract individual offset numbers */
		ntuples = tbm_extract_page_tuple(page, output);
		output->blockno = page->blockno;
		output->ntuples = ntuples;
		output->recheck = page->recheck;
		iterator->spageptr++;
		return output;
	}

	/* Nothing more in the bitmap */
	return NULL;
}
1067
1068 /*
1069  *      tbm_shared_iterate - scan through next page of a TIDBitmap
1070  *
1071  *      As above, but this will iterate using an iterator which is shared
1072  *      across multiple processes.  We need to acquire the iterator LWLock,
1073  *      before accessing the shared members.
1074  */
TBMIterateResult *
tbm_shared_iterate(TBMSharedIterator *iterator)
{
	TBMIterateResult *output = &iterator->output;
	TBMSharedIteratorState *istate = iterator->state;
	PagetableEntry *ptbase = NULL;
	int		   *idxpages = NULL;
	int		   *idxchunks = NULL;

	/* Translate the attached DSA structures to local array pointers */
	if (iterator->ptbase != NULL)
		ptbase = iterator->ptbase->ptentry;
	if (iterator->ptpages != NULL)
		idxpages = iterator->ptpages->index;
	if (iterator->ptchunks != NULL)
		idxchunks = iterator->ptchunks->index;

	/* Acquire the LWLock before accessing the shared members */
	LWLockAcquire(&istate->lock, LW_EXCLUSIVE);

	/*
	 * If lossy chunk pages remain, make sure we've advanced schunkptr/
	 * schunkbit to the next set bit.
	 */
	while (istate->schunkptr < istate->nchunks)
	{
		PagetableEntry *chunk = &ptbase[idxchunks[istate->schunkptr]];
		int			schunkbit = istate->schunkbit;

		tbm_advance_schunkbit(chunk, &schunkbit);
		if (schunkbit < PAGES_PER_CHUNK)
		{
			/* found a set bit in this chunk; remember the position */
			istate->schunkbit = schunkbit;
			break;
		}
		/* advance to next chunk */
		istate->schunkptr++;
		istate->schunkbit = 0;
	}

	/*
	 * If both chunk and per-page data remain, must output the numerically
	 * earlier page.
	 */
	if (istate->schunkptr < istate->nchunks)
	{
		PagetableEntry *chunk = &ptbase[idxchunks[istate->schunkptr]];
		BlockNumber chunk_blockno;

		chunk_blockno = chunk->blockno + istate->schunkbit;

		if (istate->spageptr >= istate->npages ||
			chunk_blockno < ptbase[idxpages[istate->spageptr]].blockno)
		{
			/* Return a lossy page indicator from the chunk */
			output->blockno = chunk_blockno;
			output->ntuples = -1;
			output->recheck = true;
			istate->schunkbit++;

			/* release before returning; output is backend-private */
			LWLockRelease(&istate->lock);
			return output;
		}
	}

	if (istate->spageptr < istate->npages)
	{
		PagetableEntry *page = &ptbase[idxpages[istate->spageptr]];
		int			ntuples;

		/* scan bitmap to extract individual offset numbers */
		ntuples = tbm_extract_page_tuple(page, output);
		output->blockno = page->blockno;
		output->ntuples = ntuples;
		output->recheck = page->recheck;
		istate->spageptr++;

		LWLockRelease(&istate->lock);

		return output;
	}

	LWLockRelease(&istate->lock);

	/* Nothing more in the bitmap */
	return NULL;
}
1161
1162 /*
1163  * tbm_end_iterate - finish an iteration over a TIDBitmap
1164  *
1165  * Currently this is just a pfree, but it might do more someday.  (For
1166  * instance, it could be useful to count open iterators and allow the
1167  * bitmap to return to read/write status when there are no more iterators.)
1168  */
void
tbm_end_iterate(TBMIterator *iterator)
{
	/* iterator and its trailing TBMIterateResult are a single allocation */
	pfree(iterator);
}
1174
1175 /*
1176  * tbm_end_shared_iterate - finish a shared iteration over a TIDBitmap
1177  *
1178  * This doesn't free any of the shared state associated with the iterator,
1179  * just our backend-private state.
1180  */
void
tbm_end_shared_iterate(TBMSharedIterator *iterator)
{
	/* only the backend-private struct is released; shared state remains */
	pfree(iterator);
}
1186
1187 /*
1188  * tbm_find_pageentry - find a PagetableEntry for the pageno
1189  *
1190  * Returns NULL if there is no non-lossy entry for the pageno.
1191  */
1192 static const PagetableEntry *
1193 tbm_find_pageentry(const TIDBitmap *tbm, BlockNumber pageno)
1194 {
1195         const PagetableEntry *page;
1196
1197         if (tbm->nentries == 0)         /* in case pagetable doesn't exist */
1198                 return NULL;
1199
1200         if (tbm->status == TBM_ONE_PAGE)
1201         {
1202                 page = &tbm->entry1;
1203                 if (page->blockno != pageno)
1204                         return NULL;
1205                 Assert(!page->ischunk);
1206                 return page;
1207         }
1208
1209         page = pagetable_lookup(tbm->pagetable, pageno);
1210         if (page == NULL)
1211                 return NULL;
1212         if (page->ischunk)
1213                 return NULL;                    /* don't want a lossy chunk header */
1214         return page;
1215 }
1216
1217 /*
1218  * tbm_get_pageentry - find or create a PagetableEntry for the pageno
1219  *
1220  * If new, the entry is marked as an exact (non-chunk) entry.
1221  *
1222  * This may cause the table to exceed the desired memory size.  It is
1223  * up to the caller to call tbm_lossify() at the next safe point if so.
1224  */
1225 static PagetableEntry *
1226 tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno)
1227 {
1228         PagetableEntry *page;
1229         bool            found;
1230
1231         if (tbm->status == TBM_EMPTY)
1232         {
1233                 /* Use the fixed slot */
1234                 page = &tbm->entry1;
1235                 found = false;
1236                 tbm->status = TBM_ONE_PAGE;
1237         }
1238         else
1239         {
1240                 if (tbm->status == TBM_ONE_PAGE)
1241                 {
1242                         page = &tbm->entry1;
1243                         if (page->blockno == pageno)
1244                                 return page;
1245                         /* Time to switch from one page to a hashtable */
1246                         tbm_create_pagetable(tbm);
1247                 }
1248
1249                 /* Look up or create an entry */
1250                 page = pagetable_insert(tbm->pagetable, pageno, &found);
1251         }
1252
1253         /* Initialize it if not present before */
1254         if (!found)
1255         {
1256                 char            oldstatus = page->status;
1257
1258                 MemSet(page, 0, sizeof(PagetableEntry));
1259                 page->status = oldstatus;
1260                 page->blockno = pageno;
1261                 /* must count it too */
1262                 tbm->nentries++;
1263                 tbm->npages++;
1264         }
1265
1266         return page;
1267 }
1268
1269 /*
1270  * tbm_page_is_lossy - is the page marked as lossily stored?
1271  */
1272 static bool
1273 tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno)
1274 {
1275         PagetableEntry *page;
1276         BlockNumber chunk_pageno;
1277         int                     bitno;
1278
1279         /* we can skip the lookup if there are no lossy chunks */
1280         if (tbm->nchunks == 0)
1281                 return false;
1282         Assert(tbm->status == TBM_HASH);
1283
1284         bitno = pageno % PAGES_PER_CHUNK;
1285         chunk_pageno = pageno - bitno;
1286
1287         page = pagetable_lookup(tbm->pagetable, chunk_pageno);
1288
1289         if (page != NULL && page->ischunk)
1290         {
1291                 int                     wordnum = WORDNUM(bitno);
1292                 int                     bitnum = BITNUM(bitno);
1293
1294                 if ((page->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0)
1295                         return true;
1296         }
1297         return false;
1298 }
1299
1300 /*
1301  * tbm_mark_page_lossy - mark the page number as lossily stored
1302  *
1303  * This may cause the table to exceed the desired memory size.  It is
1304  * up to the caller to call tbm_lossify() at the next safe point if so.
1305  */
static void
tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno)
{
	PagetableEntry *page;
	bool		found;
	BlockNumber chunk_pageno;
	int			bitno;
	int			wordnum;
	int			bitnum;

	/* We force the bitmap into hashtable mode whenever it's lossy */
	if (tbm->status != TBM_HASH)
		tbm_create_pagetable(tbm);

	/* Compute the chunk-header page number and the bit position within it */
	bitno = pageno % PAGES_PER_CHUNK;
	chunk_pageno = pageno - bitno;

	/*
	 * Remove any extant non-lossy entry for the page.  If the page is its own
	 * chunk header, however, we skip this and handle the case below.
	 */
	if (bitno != 0)
	{
		if (pagetable_delete(tbm->pagetable, pageno))
		{
			/* It was present, so adjust counts */
			tbm->nentries--;
			tbm->npages--;		/* assume it must have been non-lossy */
		}
	}

	/* Look up or create entry for chunk-header page */
	page = pagetable_insert(tbm->pagetable, chunk_pageno, &found);

	/* Initialize it if not present before */
	if (!found)
	{
		/* zero the entry but preserve simplehash's status byte */
		char		oldstatus = page->status;

		MemSet(page, 0, sizeof(PagetableEntry));
		page->status = oldstatus;
		page->blockno = chunk_pageno;
		page->ischunk = true;
		/* must count it too */
		tbm->nentries++;
		tbm->nchunks++;
	}
	else if (!page->ischunk)
	{
		char		oldstatus = page->status;

		/* chunk header page was formerly non-lossy, make it lossy */
		MemSet(page, 0, sizeof(PagetableEntry));
		page->status = oldstatus;
		page->blockno = chunk_pageno;
		page->ischunk = true;
		/* we assume it had some tuple bit(s) set, so mark it lossy */
		page->words[0] = ((bitmapword) 1 << 0);
		/* adjust counts: nentries stays the same, page becomes a chunk */
		tbm->nchunks++;
		tbm->npages--;
	}

	/* Now set the original target page's bit */
	wordnum = WORDNUM(bitno);
	bitnum = BITNUM(bitno);
	page->words[wordnum] |= ((bitmapword) 1 << bitnum);
}
1374
1375 /*
1376  * tbm_lossify - lose some information to get back under the memory limit
1377  */
static void
tbm_lossify(TIDBitmap *tbm)
{
	pagetable_iterator i;
	PagetableEntry *page;

	/*
	 * XXX Really stupid implementation: this just lossifies pages in
	 * essentially random order.  We should be paying some attention to the
	 * number of bits set in each page, instead.
	 *
	 * Since we are called as soon as nentries exceeds maxentries, we should
	 * push nentries down to significantly less than maxentries, or else we'll
	 * just end up doing this again very soon.  We shoot for maxentries/2.
	 */
	Assert(tbm->iterating == TBM_NOT_ITERATING);
	Assert(tbm->status == TBM_HASH);

	/* Resume where the previous lossify pass left off, for even coverage */
	pagetable_start_iterate_at(tbm->pagetable, &i, tbm->lossify_start);
	while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
	{
		if (page->ischunk)
			continue;			/* already a chunk header */

		/*
		 * If the page would become a chunk header, we won't save anything by
		 * converting it to lossy, so skip it.
		 */
		if ((page->blockno % PAGES_PER_CHUNK) == 0)
			continue;

		/* This does the dirty work ... */
		tbm_mark_page_lossy(tbm, page->blockno);

		if (tbm->nentries <= tbm->maxentries / 2)
		{
			/*
			 * We have made enough room. Remember where to start lossifying
			 * next round, so we evenly iterate over the hashtable.
			 */
			tbm->lossify_start = i.cur;
			break;
		}

		/*
		 * Note: tbm_mark_page_lossy may have inserted a lossy chunk into the
		 * hashtable and may have deleted the non-lossy chunk.  We can
		 * continue the same hash table scan, since failure to visit one
		 * element or visiting the newly inserted element, isn't fatal.
		 */
	}

	/*
	 * With a big bitmap and small work_mem, it's possible that we cannot get
	 * under maxentries.  Again, if that happens, we'd end up uselessly
	 * calling tbm_lossify over and over.  To prevent this from becoming a
	 * performance sink, force maxentries up to at least double the current
	 * number of entries.  (In essence, we're admitting inability to fit
	 * within work_mem when we do this.)  Note that this test will not fire if
	 * we broke out of the loop early; and if we didn't, the current number of
	 * entries is simply not reducible any further.
	 */
	if (tbm->nentries > tbm->maxentries / 2)
		tbm->maxentries = Min(tbm->nentries, (INT_MAX - 1) / 2) * 2;
}
1443
1444 /*
1445  * qsort comparator to handle PagetableEntry pointers.
1446  */
1447 static int
1448 tbm_comparator(const void *left, const void *right)
1449 {
1450         BlockNumber l = (*((PagetableEntry *const *) left))->blockno;
1451         BlockNumber r = (*((PagetableEntry *const *) right))->blockno;
1452
1453         if (l < r)
1454                 return -1;
1455         else if (l > r)
1456                 return 1;
1457         return 0;
1458 }
1459
1460 /*
1461  * As above, but this will get index into PagetableEntry array.  Therefore,
1462  * it needs to get actual PagetableEntry using the index before comparing the
1463  * blockno.
1464  */
1465 static int
1466 tbm_shared_comparator(const void *left, const void *right, void *arg)
1467 {
1468         PagetableEntry *base = (PagetableEntry *) arg;
1469         PagetableEntry *lpage = &base[*(int *) left];
1470         PagetableEntry *rpage = &base[*(int *) right];
1471
1472         if (lpage->blockno < rpage->blockno)
1473                 return -1;
1474         else if (lpage->blockno > rpage->blockno)
1475                 return 1;
1476         return 0;
1477 }
1478
1479 /*
1480  *      tbm_attach_shared_iterate
1481  *
 *	Allocate a backend-private iterator and attach the shared iterator state
 *	to it so that multiple processes can iterate jointly.
 *
 *	We also convert the DSA pointers to local pointers and store them into
 *	our private iterator.
1487  */
TBMSharedIterator *
tbm_attach_shared_iterate(dsa_area *dsa, dsa_pointer dp)
{
	TBMSharedIterator *iterator;
	TBMSharedIteratorState *istate;

	/*
	 * Create the TBMSharedIterator struct, with enough trailing space to
	 * serve the needs of the TBMIterateResult sub-struct.
	 */
	iterator = (TBMSharedIterator *) palloc0(sizeof(TBMSharedIterator) +
							 MAX_TUPLES_PER_PAGE * sizeof(OffsetNumber));

	istate = (TBMSharedIteratorState *) dsa_get_address(dsa, dp);

	iterator->state = istate;

	/* Translate the shared DSA pointers into backend-local addresses */
	iterator->ptbase = dsa_get_address(dsa, istate->pagetable);

	/* page/chunk arrays exist only if the bitmap has such entries */
	if (istate->npages)
		iterator->ptpages = dsa_get_address(dsa, istate->spages);
	if (istate->nchunks)
		iterator->ptchunks = dsa_get_address(dsa, istate->schunks);

	return iterator;
}
1514
1515 /*
1516  * pagetable_allocate
1517  *
1518  * Callback function for allocating the memory for hashtable elements.
1519  * Allocate memory for hashtable elements, using DSA if available.
1520  */
static inline void *
pagetable_allocate(pagetable_hash *pagetable, Size size)
{
	TIDBitmap  *tbm = (TIDBitmap *) pagetable->private_data;
	PTEntryArray *ptbase;

	/* Without a DSA, allocate zeroed backend-local memory as usual */
	if (tbm->dsa == NULL)
		return MemoryContextAllocExtended(pagetable->ctx, size,
										  MCXT_ALLOC_HUGE | MCXT_ALLOC_ZERO);

	/*
	 * Save the dsapagetable reference in dsapagetableold before allocating
	 * new memory so that pagetable_free can free the old entry.
	 */
	tbm->dsapagetableold = tbm->dsapagetable;
	tbm->dsapagetable = dsa_allocate0(tbm->dsa, sizeof(PTEntryArray) + size);

	/* hand simplehash the entry array inside the PTEntryArray wrapper */
	ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
	return ptbase->ptentry;
}
1541
1542 /*
1543  * pagetable_free
1544  *
1545  * Callback function for freeing hash table elements.
1546  */
1547 static inline void
1548 pagetable_free(pagetable_hash *pagetable, void *pointer)
1549 {
1550         TIDBitmap  *tbm = (TIDBitmap *) pagetable->private_data;
1551
1552         /* pfree the input pointer if DSA is not available */
1553         if (tbm->dsa == NULL)
1554                 pfree(pointer);
1555         else if (DsaPointerIsValid(tbm->dsapagetableold))
1556         {
1557                 dsa_free(tbm->dsa, tbm->dsapagetableold);
1558                 tbm->dsapagetableold = InvalidDsaPointer;
1559         }
1560 }