]> granicus.if.org Git - postgresql/commitdiff
tidbitmap: Support shared iteration.
authorRobert Haas <rhaas@postgresql.org>
Wed, 8 Mar 2017 13:02:03 +0000 (08:02 -0500)
committerRobert Haas <rhaas@postgresql.org>
Wed, 8 Mar 2017 13:09:38 +0000 (08:09 -0500)
When a shared iterator is used, each call to tbm_shared_iterate()
returns a result that has not yet been returned to any process
attached to the shared iterator.  In other words, each cooperating
process gets a disjoint subset of the full result set, but all
results are returned exactly once.

This is infrastructure for parallel bitmap heap scan.

Dilip Kumar.  The larger patch set of which this is a part has been
reviewed and tested by (at least) Andres Freund, Amit Khandekar,
Tushar Ahuja, Rafia Sabih, Haribabu Kommi, and Thomas Munro.

Discussion: http://postgr.es/m/CAFiTN-uc4=0WxRGfCzs-xfkMYcSEWUC-Fon6thkJGjkh9i=13A@mail.gmail.com

doc/src/sgml/monitoring.sgml
src/backend/access/gin/ginget.c
src/backend/executor/nodeBitmapIndexscan.c
src/backend/executor/nodeBitmapOr.c
src/backend/nodes/tidbitmap.c
src/backend/storage/lmgr/lwlock.c
src/include/nodes/tidbitmap.h
src/include/storage/lwlock.h

index fad5cb05b95dac91a03af111af0b179ea690bbe4..27ed35f0a7b756632af4f730a9acfa8ea495c92b 100644 (file)
@@ -826,7 +826,7 @@ postgres   27093  0.0  0.0  30096  2752 ?        Ss   11:34   0:00 postgres: ser
 
       <tbody>
        <row>
-        <entry morerows="58"><literal>LWLock</></entry>
+        <entry morerows="59"><literal>LWLock</></entry>
         <entry><literal>ShmemIndexLock</></entry>
         <entry>Waiting to find or allocate space in shared memory.</entry>
        </row>
@@ -1081,6 +1081,10 @@ postgres   27093  0.0  0.0  30096  2752 ?        Ss   11:34   0:00 postgres: ser
          <entry><literal>parallel_query_dsa</></entry>
          <entry>Waiting for parallel query dynamic shared memory allocation lock.</entry>
         </row>
+        <row>
+         <entry><literal>tbm</></entry>
+         <entry>Waiting for TBM shared iterator lock.</entry>
+        </row>
         <row>
          <entry morerows="9"><literal>Lock</></entry>
          <entry><literal>relation</></entry>
index 60f005ce2bea6e0fdd02ac4323c804c8d8722f70..87cd9eaa9f55c9170a3201542d5218396637de1e 100644 (file)
@@ -120,7 +120,7 @@ collectMatchBitmap(GinBtreeData *btree, GinBtreeStack *stack,
        Form_pg_attribute attr;
 
        /* Initialize empty bitmap result */
-       scanEntry->matchBitmap = tbm_create(work_mem * 1024L);
+       scanEntry->matchBitmap = tbm_create(work_mem * 1024L, NULL);
 
        /* Null query cannot partial-match anything */
        if (scanEntry->isPartialMatch &&
index 4274e9abcb5211a11f2f1fe100905e7477eb9607..94bb289f1bb32d31d83446d9da97476f67330989 100644 (file)
@@ -78,7 +78,7 @@ MultiExecBitmapIndexScan(BitmapIndexScanState *node)
        else
        {
                /* XXX should we use less than work_mem for this? */
-               tbm = tbm_create(work_mem * 1024L);
+               tbm = tbm_create(work_mem * 1024L, NULL);
        }
 
        /*
index 3c6155b7c3ef5d90e9b737c4ea4f0e6d22cb1446..1d280beddc171e00290bd4cb68716059b1379cf7 100644 (file)
@@ -129,7 +129,7 @@ MultiExecBitmapOr(BitmapOrState *node)
                        if (result == NULL) /* first subplan */
                        {
                                /* XXX should we use less than work_mem for this? */
-                               result = tbm_create(work_mem * 1024L);
+                               result = tbm_create(work_mem * 1024L, NULL);
                        }
 
                        ((BitmapIndexScanState *) subnode)->biss_result = result;
index 0885812420906ec351d2e8062011eee9f14e38b8..9dcef327043958fd181672636e89b6de8e982131 100644 (file)
@@ -43,6 +43,8 @@
 #include "access/htup_details.h"
 #include "nodes/bitmapset.h"
 #include "nodes/tidbitmap.h"
+#include "storage/lwlock.h"
+#include "utils/dsa.h"
 
 /*
  * The maximum number of tuples per page is not large (typically 256 with
@@ -102,6 +104,15 @@ typedef struct PagetableEntry
        bitmapword      words[Max(WORDS_PER_PAGE, WORDS_PER_CHUNK)];
 } PagetableEntry;
 
+/*
+ * Holds array of pagetable entries.
+ */
+typedef struct PTEntryArray
+{
+       pg_atomic_uint32        refcount;               /* no. of iterators attached */
+       PagetableEntry ptentry[FLEXIBLE_ARRAY_MEMBER];
+} PTEntryArray;
+
 /*
  * We want to avoid the overhead of creating the hashtable, which is
  * comparatively large, when not necessary. Particularly when we are using a
@@ -120,6 +131,16 @@ typedef enum
        TBM_HASH                                        /* pagetable is valid, entry1 is not */
 } TBMStatus;
 
+/*
+ * Current iterating state of the TBM.
+ */
+typedef enum
+{
+       TBM_NOT_ITERATING,                      /* not yet converted to page and chunk array */
+       TBM_ITERATING_PRIVATE,          /* converted to local page and chunk array */
+       TBM_ITERATING_SHARED            /* converted to shared page and chunk array */
+} TBMIteratingState;
+
 /*
  * Here is the representation for a whole TIDBitMap:
  */
@@ -133,12 +154,17 @@ struct TIDBitmap
        int                     maxentries;             /* limit on same to meet maxbytes */
        int                     npages;                 /* number of exact entries in pagetable */
        int                     nchunks;                /* number of lossy entries in pagetable */
-       bool            iterating;              /* tbm_begin_iterate called? */
+       TBMIteratingState iterating;    /* tbm_begin_iterate called? */
        uint32          lossify_start;  /* offset to start lossifying hashtable at */
        PagetableEntry entry1;          /* used when status == TBM_ONE_PAGE */
        /* these are valid when iterating is true: */
        PagetableEntry **spages;        /* sorted exact-page list, or NULL */
        PagetableEntry **schunks;       /* sorted lossy-chunk list, or NULL */
+       dsa_pointer dsapagetable;       /* dsa_pointer to the element array */
+       dsa_pointer dsapagetableold;    /* dsa_pointer to the old element array */
+       dsa_pointer ptpages;            /* dsa_pointer to the page array */
+       dsa_pointer ptchunks;           /* dsa_pointer to the chunk array */
+       dsa_area   *dsa;                        /* reference to per-query dsa area */
 };
 
 /*
@@ -156,6 +182,46 @@ struct TBMIterator
        TBMIterateResult output;        /* MUST BE LAST (because variable-size) */
 };
 
+/*
+ * Holds the shared members of the iterator so that multiple processes
+ * can jointly iterate.
+ */
+typedef struct TBMSharedIteratorState
+{
+       int                     nentries;               /* number of entries in pagetable */
+       int                     maxentries;             /* limit on same to meet maxbytes */
+       int                     npages;                 /* number of exact entries in pagetable */
+       int                     nchunks;                /* number of lossy entries in pagetable */
+       dsa_pointer pagetable;          /* dsa pointers to head of pagetable data */
+       dsa_pointer spages;                     /* dsa pointer to page array */
+       dsa_pointer schunks;            /* dsa pointer to chunk array */
+       LWLock          lock;                   /* lock to protect below members */
+       int                     spageptr;               /* next spages index */
+       int                     schunkptr;              /* next schunks index */
+       int                     schunkbit;              /* next bit to check in current schunk */
+} TBMSharedIteratorState;
+
+/*
+ * pagetable iteration array.
+ */
+typedef struct PTIterationArray
+{
+       pg_atomic_uint32                        refcount;       /* no. of iterators attached */
+       int                     index[FLEXIBLE_ARRAY_MEMBER];   /* index array */
+} PTIterationArray;
+
+/*
+ * Same as TBMIterator, but this is used for joint iteration and therefore
+ * also holds a reference to the shared state.
+ */
+struct TBMSharedIterator
+{
+       TBMSharedIteratorState *state;          /* shared state */
+       PTEntryArray *ptbase;           /* pagetable element array */
+       PTIterationArray *ptpages;      /* sorted exact page index list */
+       PTIterationArray *ptchunks; /* sorted lossy page index list */
+       TBMIterateResult output;        /* MUST BE LAST (because variable-size) */
+};
 
 /* Local function prototypes */
 static void tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage);
@@ -168,6 +234,8 @@ static bool tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno);
 static void tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno);
 static void tbm_lossify(TIDBitmap *tbm);
 static int     tbm_comparator(const void *left, const void *right);
+static int tbm_shared_comparator(const void *left, const void *right,
+                                         void *arg);
 
 /*
  * Simple inline murmur hash implementation for the exact width required, for
@@ -187,6 +255,7 @@ hash_blockno(BlockNumber b)
 }
 
 /* define hashtable mapping block numbers to PagetableEntry's */
+#define SH_USE_NONDEFAULT_ALLOCATOR
 #define SH_PREFIX pagetable
 #define SH_ELEMENT_TYPE PagetableEntry
 #define SH_KEY_TYPE BlockNumber
@@ -204,10 +273,12 @@ hash_blockno(BlockNumber b)
  *
  * The bitmap will live in the memory context that is CurrentMemoryContext
  * at the time of this call.  It will be limited to (approximately) maxbytes
- * total memory consumption.
+ * total memory consumption.  If the DSA passed to this function is not NULL
+ * then the memory for storing elements of the underlying page table will
+ * be allocated from the DSA.
  */
 TIDBitmap *
-tbm_create(long maxbytes)
+tbm_create(long maxbytes, dsa_area *dsa)
 {
        TIDBitmap  *tbm;
        long            nbuckets;
@@ -230,6 +301,7 @@ tbm_create(long maxbytes)
        nbuckets = Max(nbuckets, 16);           /* sanity limit */
        tbm->maxentries = (int) nbuckets;
        tbm->lossify_start = 0;
+       tbm->dsa = dsa;
 
        return tbm;
 }
@@ -244,7 +316,7 @@ tbm_create_pagetable(TIDBitmap *tbm)
        Assert(tbm->status != TBM_HASH);
        Assert(tbm->pagetable == NULL);
 
-       tbm->pagetable = pagetable_create(tbm->mcxt, 128, NULL);
+       tbm->pagetable = pagetable_create(tbm->mcxt, 128, tbm);
 
        /* If entry1 is valid, push it into the hashtable */
        if (tbm->status == TBM_ONE_PAGE)
@@ -280,6 +352,40 @@ tbm_free(TIDBitmap *tbm)
        pfree(tbm);
 }
 
+/*
+ * tbm_free_shared_area - free shared state
+ *
+ * Free the shared iterator state.  Also free the shared pagetable and
+ * iterator arrays if they are no longer referenced by any shared iterator,
+ * i.e. their refcount drops to 0.
+ */
+ */
+void
+tbm_free_shared_area(dsa_area *dsa, dsa_pointer dp)
+{
+       TBMSharedIteratorState *istate = dsa_get_address(dsa, dp);
+       PTEntryArray *ptbase = dsa_get_address(dsa, istate->pagetable);
+       PTIterationArray *ptpages;
+       PTIterationArray *ptchunks;
+
+       if (pg_atomic_sub_fetch_u32(&ptbase->refcount, 1) == 0)
+               dsa_free(dsa, istate->pagetable);
+
+       if (istate->spages)
+       {
+               ptpages = dsa_get_address(dsa, istate->spages);
+               if (pg_atomic_sub_fetch_u32(&ptpages->refcount, 1) == 0)
+                       dsa_free(dsa, istate->spages);
+       }
+       if (istate->schunks)
+       {
+               ptchunks = dsa_get_address(dsa, istate->schunks);
+               if (pg_atomic_sub_fetch_u32(&ptchunks->refcount, 1) == 0)
+                       dsa_free(dsa, istate->schunks);
+       }
+
+       dsa_free(dsa, dp);
+}
+
 /*
  * tbm_add_tuples - add some tuple IDs to a TIDBitmap
  *
@@ -294,7 +400,7 @@ tbm_add_tuples(TIDBitmap *tbm, const ItemPointer tids, int ntids,
        PagetableEntry *page = NULL;    /* only valid when currblk is valid */
        int                     i;
 
-       Assert(!tbm->iterating);
+       Assert(tbm->iterating == TBM_NOT_ITERATING);
        for (i = 0; i < ntids; i++)
        {
                BlockNumber blk = ItemPointerGetBlockNumber(tids + i);
@@ -603,6 +709,8 @@ tbm_begin_iterate(TIDBitmap *tbm)
 {
        TBMIterator *iterator;
 
+       Assert(tbm->iterating != TBM_ITERATING_SHARED);
+
        /*
         * Create the TBMIterator struct, with enough trailing space to serve the
         * needs of the TBMIterateResult sub-struct.
@@ -624,7 +732,7 @@ tbm_begin_iterate(TIDBitmap *tbm)
         * attached to the bitmap not the iterator, so they can be used by more
         * than one iterator.
         */
-       if (tbm->status == TBM_HASH && !tbm->iterating)
+       if (tbm->status == TBM_HASH && tbm->iterating == TBM_NOT_ITERATING)
        {
                pagetable_iterator i;
                PagetableEntry *page;
@@ -659,11 +767,210 @@ tbm_begin_iterate(TIDBitmap *tbm)
                                  tbm_comparator);
        }
 
-       tbm->iterating = true;
+       tbm->iterating = TBM_ITERATING_PRIVATE;
 
        return iterator;
 }
 
+/*
+ * tbm_prepare_shared_iterate - prepare shared iteration state for a TIDBitmap.
+ *
+ * The necessary shared state will be allocated from the DSA passed to
+ * tbm_create, so that multiple processes can attach to it and iterate jointly.
+ *
+ * This will convert the pagetable hash into separate arrays of page and
+ * chunk indexes into the pagetable array.
+ */
+dsa_pointer
+tbm_prepare_shared_iterate(TIDBitmap *tbm)
+{
+       dsa_pointer dp;
+       TBMSharedIteratorState *istate;
+       PTEntryArray *ptbase;
+       PTIterationArray *ptpages;
+       PTIterationArray *ptchunks;
+
+       Assert(tbm->dsa != NULL);
+       Assert(tbm->iterating != TBM_ITERATING_PRIVATE);
+
+       /*
+        * Allocate TBMSharedIteratorState from DSA to hold the shared members and
+        * lock; this will also be used by multiple workers for shared iteration.
+        */
+       dp = dsa_allocate(tbm->dsa, sizeof(TBMSharedIteratorState));
+       istate = dsa_get_address(tbm->dsa, dp);
+
+       /*
+        * If we're not already iterating, create and fill the sorted page lists.
+        * (If we are, the sorted page lists are already stored in the TIDBitmap,
+        * and we can just reuse them.)
+        */
+       if (tbm->iterating == TBM_NOT_ITERATING)
+       {
+               pagetable_iterator i;
+               PagetableEntry *page;
+               int                     idx;
+               int                     npages;
+               int                     nchunks;
+
+               /*
+                * Allocate the page and chunk array memory from the DSA to share
+                * across multiple processes.
+                */
+               if (tbm->npages)
+               {
+                       tbm->ptpages = dsa_allocate(tbm->dsa, sizeof(PTIterationArray) +
+                                                                               tbm->npages * sizeof(int));
+                       ptpages = dsa_get_address(tbm->dsa, tbm->ptpages);
+                       pg_atomic_init_u32(&ptpages->refcount, 0);
+               }
+               if (tbm->nchunks)
+               {
+                       tbm->ptchunks = dsa_allocate(tbm->dsa, sizeof(PTIterationArray) +
+                                                                                tbm->nchunks * sizeof(int));
+                       ptchunks = dsa_get_address(tbm->dsa, tbm->ptchunks);
+                       pg_atomic_init_u32(&ptchunks->refcount, 0);
+               }
+
+               /*
+                * If TBM status is TBM_HASH then iterate over the pagetable and
+                * convert it to page and chunk arrays.  But if it's in the
+                * TBM_ONE_PAGE mode then directly allocate the space for one entry
+                * from the DSA.
+                */
+               npages = nchunks = 0;
+               if (tbm->status == TBM_HASH)
+               {
+                       ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
+
+                       pagetable_start_iterate(tbm->pagetable, &i);
+                       while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
+                       {
+                               idx = page - ptbase->ptentry;
+                               if (page->ischunk)
+                                       ptchunks->index[nchunks++] = idx;
+                               else
+                                       ptpages->index[npages++] = idx;
+                       }
+
+                       Assert(npages == tbm->npages);
+                       Assert(nchunks == tbm->nchunks);
+               }
+               else
+               {
+                       /*
+                        * In one page mode allocate the space for one pagetable entry and
+                        * directly store its index i.e. 0 in page array
+                        */
+                       tbm->dsapagetable = dsa_allocate(tbm->dsa, sizeof(PTEntryArray) +
+                                                                                        sizeof(PagetableEntry));
+                       ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
+                       ptpages->index[0] = 0;
+               }
+
+               pg_atomic_init_u32(&ptbase->refcount, 0);
+
+               if (npages > 1)
+                       qsort_arg((void *) (ptpages->index), npages, sizeof(int),
+                                         tbm_shared_comparator, (void *) ptbase->ptentry);
+               if (nchunks > 1)
+                       qsort_arg((void *) (ptchunks->index), nchunks, sizeof(int),
+                                         tbm_shared_comparator, (void *) ptbase->ptentry);
+       }
+
+       /*
+        * Store the TBM members in the shared state so that we can share them
+        * across multiple processes.
+        */
+       istate->nentries = tbm->nentries;
+       istate->maxentries = tbm->maxentries;
+       istate->npages = tbm->npages;
+       istate->nchunks = tbm->nchunks;
+       istate->pagetable = tbm->dsapagetable;
+       istate->spages = tbm->ptpages;
+       istate->schunks = tbm->ptchunks;
+
+       ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
+       ptpages = dsa_get_address(tbm->dsa, tbm->ptpages);
+       ptchunks = dsa_get_address(tbm->dsa, tbm->ptchunks);
+
+       /*
+        * For every shared iterator referring to the pagetable and iterator
+        * arrays, increase the refcount by 1 so that while freeing a shared
+        * iterator we don't free the pagetable and iterator arrays until their
+        * refcount becomes 0.
+        */
+       pg_atomic_add_fetch_u32(&ptbase->refcount, 1);
+       if (ptpages)
+               pg_atomic_add_fetch_u32(&ptpages->refcount, 1);
+       if (ptchunks)
+               pg_atomic_add_fetch_u32(&ptchunks->refcount, 1);
+
+       /* Initialize the iterator lock */
+       LWLockInitialize(&istate->lock, LWTRANCHE_TBM);
+
+       /* Initialize the shared iterator state */
+       istate->schunkbit = 0;
+       istate->schunkptr = 0;
+       istate->spageptr = 0;
+
+       tbm->iterating = TBM_ITERATING_SHARED;
+
+       return dp;
+}
+
+/*
+ * tbm_extract_page_tuple - extract the tuple offsets from a page
+ *
+ * The extracted offsets are stored into TBMIterateResult.
+ */
+static inline int
+tbm_extract_page_tuple(PagetableEntry *page, TBMIterateResult *output)
+{
+       int                     wordnum;
+       int                     ntuples = 0;
+
+       for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
+       {
+               bitmapword      w = page->words[wordnum];
+
+               if (w != 0)
+               {
+                       int                     off = wordnum * BITS_PER_BITMAPWORD + 1;
+
+                       while (w != 0)
+                       {
+                               if (w & 1)
+                                       output->offsets[ntuples++] = (OffsetNumber) off;
+                               off++;
+                               w >>= 1;
+                       }
+               }
+       }
+
+       return ntuples;
+}
+
+/*
+ *     tbm_advance_schunkbit - Advance the chunkbit
+ */
+static inline void
+tbm_advance_schunkbit(PagetableEntry *chunk, int *schunkbitp)
+{
+       int                     schunkbit = *schunkbitp;
+
+       while (schunkbit < PAGES_PER_CHUNK)
+       {
+               int                     wordnum = WORDNUM(schunkbit);
+               int                     bitnum = BITNUM(schunkbit);
+
+               if ((chunk->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0)
+                       break;
+               schunkbit++;
+       }
+
+       *schunkbitp = schunkbit;
+}
+
 /*
  * tbm_iterate - scan through next page of a TIDBitmap
  *
@@ -682,7 +989,7 @@ tbm_iterate(TBMIterator *iterator)
        TIDBitmap  *tbm = iterator->tbm;
        TBMIterateResult *output = &(iterator->output);
 
-       Assert(tbm->iterating);
+       Assert(tbm->iterating == TBM_ITERATING_PRIVATE);
 
        /*
         * If lossy chunk pages remain, make sure we've advanced schunkptr/
@@ -693,15 +1000,7 @@ tbm_iterate(TBMIterator *iterator)
                PagetableEntry *chunk = tbm->schunks[iterator->schunkptr];
                int                     schunkbit = iterator->schunkbit;
 
-               while (schunkbit < PAGES_PER_CHUNK)
-               {
-                       int                     wordnum = WORDNUM(schunkbit);
-                       int                     bitnum = BITNUM(schunkbit);
-
-                       if ((chunk->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0)
-                               break;
-                       schunkbit++;
-               }
+               tbm_advance_schunkbit(chunk, &schunkbit);
                if (schunkbit < PAGES_PER_CHUNK)
                {
                        iterator->schunkbit = schunkbit;
@@ -738,7 +1037,6 @@ tbm_iterate(TBMIterator *iterator)
        {
                PagetableEntry *page;
                int                     ntuples;
-               int                     wordnum;
 
                /* In ONE_PAGE state, we don't allocate an spages[] array */
                if (tbm->status == TBM_ONE_PAGE)
@@ -747,31 +1045,102 @@ tbm_iterate(TBMIterator *iterator)
                        page = tbm->spages[iterator->spageptr];
 
                /* scan bitmap to extract individual offset numbers */
-               ntuples = 0;
-               for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
+               ntuples = tbm_extract_page_tuple(page, output);
+               output->blockno = page->blockno;
+               output->ntuples = ntuples;
+               output->recheck = page->recheck;
+               iterator->spageptr++;
+               return output;
+       }
+
+       /* Nothing more in the bitmap */
+       return NULL;
+}
+
+/*
+ *     tbm_shared_iterate - scan through next page of a TIDBitmap
+ *
+ *     As above, but this will iterate using an iterator which is shared
+ *     across multiple processes.  We need to acquire the iterator LWLock,
+ *     before accessing the shared members.
+ */
+TBMIterateResult *
+tbm_shared_iterate(TBMSharedIterator *iterator)
+{
+       TBMIterateResult *output = &iterator->output;
+       TBMSharedIteratorState *istate = iterator->state;
+       PagetableEntry *ptbase = iterator->ptbase->ptentry;
+       int                *idxpages = iterator->ptpages->index;
+       int                *idxchunks = iterator->ptchunks->index;
+
+       /* Acquire the LWLock before accessing the shared members */
+       LWLockAcquire(&istate->lock, LW_EXCLUSIVE);
+
+       /*
+        * If lossy chunk pages remain, make sure we've advanced schunkptr/
+        * schunkbit to the next set bit.
+        */
+       while (istate->schunkptr < istate->nchunks)
+       {
+               PagetableEntry *chunk = &ptbase[idxchunks[istate->schunkptr]];
+               int                     schunkbit = istate->schunkbit;
+
+               tbm_advance_schunkbit(chunk, &schunkbit);
+               if (schunkbit < PAGES_PER_CHUNK)
                {
-                       bitmapword      w = page->words[wordnum];
+                       istate->schunkbit = schunkbit;
+                       break;
+               }
+               /* advance to next chunk */
+               istate->schunkptr++;
+               istate->schunkbit = 0;
+       }
 
-                       if (w != 0)
-                       {
-                               int                     off = wordnum * BITS_PER_BITMAPWORD + 1;
+       /*
+        * If both chunk and per-page data remain, must output the numerically
+        * earlier page.
+        */
+       if (istate->schunkptr < istate->nchunks)
+       {
+               PagetableEntry *chunk = &ptbase[idxchunks[istate->schunkptr]];
+               PagetableEntry *page = &ptbase[idxpages[istate->spageptr]];
+               BlockNumber chunk_blockno;
 
-                               while (w != 0)
-                               {
-                                       if (w & 1)
-                                               output->offsets[ntuples++] = (OffsetNumber) off;
-                                       off++;
-                                       w >>= 1;
-                               }
-                       }
+               chunk_blockno = chunk->blockno + istate->schunkbit;
+
+               if (istate->spageptr >= istate->npages ||
+                       chunk_blockno < page->blockno)
+               {
+                       /* Return a lossy page indicator from the chunk */
+                       output->blockno = chunk_blockno;
+                       output->ntuples = -1;
+                       output->recheck = true;
+                       istate->schunkbit++;
+
+                       LWLockRelease(&istate->lock);
+                       return output;
                }
+       }
+
+       if (istate->spageptr < istate->npages)
+       {
+               PagetableEntry *page = &ptbase[idxpages[istate->spageptr]];
+               int                     ntuples;
+
+               /* scan bitmap to extract individual offset numbers */
+               ntuples = tbm_extract_page_tuple(page, output);
                output->blockno = page->blockno;
                output->ntuples = ntuples;
                output->recheck = page->recheck;
-               iterator->spageptr++;
+               istate->spageptr++;
+
+               LWLockRelease(&istate->lock);
+
                return output;
        }
 
+       LWLockRelease(&istate->lock);
+
        /* Nothing more in the bitmap */
        return NULL;
 }
@@ -789,6 +1158,18 @@ tbm_end_iterate(TBMIterator *iterator)
        pfree(iterator);
 }
 
+/*
+ * tbm_end_shared_iterate - finish a shared iteration over a TIDBitmap
+ *
+ * This doesn't free any of the shared state associated with the iterator,
+ * just our backend-private state.
+ */
+void
+tbm_end_shared_iterate(TBMSharedIterator *iterator)
+{
+       pfree(iterator);
+}
+
 /*
  * tbm_find_pageentry - find a PagetableEntry for the pageno
  *
@@ -995,7 +1376,7 @@ tbm_lossify(TIDBitmap *tbm)
         * push nentries down to significantly less than maxentries, or else we'll
         * just end up doing this again very soon.  We shoot for maxentries/2.
         */
-       Assert(!tbm->iterating);
+       Assert(tbm->iterating == TBM_NOT_ITERATING);
        Assert(tbm->status == TBM_HASH);
 
        pagetable_start_iterate_at(tbm->pagetable, &i, tbm->lossify_start);
@@ -1061,3 +1442,105 @@ tbm_comparator(const void *left, const void *right)
                return 1;
        return 0;
 }
+
+/*
+ * As above, but this will get index into PagetableEntry array.  Therefore,
+ * it needs to get actual PagetableEntry using the index before comparing the
+ * blockno.
+ */
+static int
+tbm_shared_comparator(const void *left, const void *right, void *arg)
+{
+       PagetableEntry *base = (PagetableEntry *) arg;
+       PagetableEntry *lpage = &base[*(int *) left];
+       PagetableEntry *rpage = &base[*(int *) right];
+
+       if (lpage->blockno < rpage->blockno)
+               return -1;
+       else if (lpage->blockno > rpage->blockno)
+               return 1;
+       return 0;
+}
+
+/*
+ *     tbm_attach_shared_iterate
+ *
+ *     Allocate a backend-private iterator and attach the shared iterator state
+ *     to it so that multiple processes can iterate jointly.
+ *
+ *     We also convert the DSA pointers to local pointers and store them into
+ *     our private iterator.
+ */
+TBMSharedIterator *
+tbm_attach_shared_iterate(dsa_area *dsa, dsa_pointer dp)
+{
+       TBMSharedIterator *iterator;
+       TBMSharedIteratorState *istate;
+
+       /*
+        * Create the TBMSharedIterator struct, with enough trailing space to
+        * serve the needs of the TBMIterateResult sub-struct.
+        */
+       iterator = (TBMSharedIterator *) palloc(sizeof(TBMSharedIterator) +
+                                                                MAX_TUPLES_PER_PAGE * sizeof(OffsetNumber));
+
+       istate = (TBMSharedIteratorState *) dsa_get_address(dsa, dp);
+
+       iterator->state = istate;
+
+       iterator->ptbase = dsa_get_address(dsa, istate->pagetable);
+
+       if (istate->npages)
+               iterator->ptpages = dsa_get_address(dsa, istate->spages);
+       if (istate->nchunks)
+               iterator->ptchunks = dsa_get_address(dsa, istate->schunks);
+
+       return iterator;
+}
+
+/*
+ * pagetable_allocate
+ *
+ * Callback function for allocating the memory for hashtable elements.
+ * Allocate memory for hashtable elements, using DSA if available.
+ */
+static inline void *
+pagetable_allocate(pagetable_hash *pagetable, Size size)
+{
+       TIDBitmap  *tbm = (TIDBitmap *) pagetable->private_data;
+       PTEntryArray *ptbase;
+
+       if (tbm->dsa == NULL)
+               return MemoryContextAllocExtended(pagetable->ctx, size,
+                                                                                 MCXT_ALLOC_HUGE | MCXT_ALLOC_ZERO);
+
+       /*
+        * Save the dsapagetable reference in dsapagetableold before allocating
+        * new memory so that pagetable_free can free the old entry.
+        */
+       tbm->dsapagetableold = tbm->dsapagetable;
+       tbm->dsapagetable = dsa_allocate0(tbm->dsa, sizeof(PTEntryArray) + size);
+
+       ptbase = dsa_get_address(tbm->dsa, tbm->dsapagetable);
+       return ptbase->ptentry;
+}
+
+/*
+ * pagetable_free
+ *
+ * Callback function for freeing hash table elements.
+ */
+static inline void
+pagetable_free(pagetable_hash *pagetable, void *pointer)
+{
+       TIDBitmap  *tbm = (TIDBitmap *) pagetable->private_data;
+
+       /* pfree the input pointer if DSA is not available */
+       if (tbm->dsa == NULL)
+               pfree(pointer);
+       else if (DsaPointerIsValid(tbm->dsapagetableold))
+       {
+               dsa_free(tbm->dsa, tbm->dsapagetableold);
+               tbm->dsapagetableold = InvalidDsaPointer;
+       }
+}
index ab81d94b51043d7ce263717da644a464ff9fe817..3e133941f47e1734b0d5f2cc7c6c7421d7272cdd 100644 (file)
@@ -510,6 +510,7 @@ RegisterLWLockTranches(void)
                                                  "predicate_lock_manager");
        LWLockRegisterTranche(LWTRANCHE_PARALLEL_QUERY_DSA,
                                                  "parallel_query_dsa");
+       LWLockRegisterTranche(LWTRANCHE_TBM, "tbm");
 
        /* Register named tranches. */
        for (i = 0; i < NamedLWLockTrancheRequests; i++)
index 14992e09f840607cfe21c0a856d3b9198ce4ac79..87f4bb7c91154337c14d33fa28d3e4645651efc4 100644 (file)
@@ -23,6 +23,7 @@
 #define TIDBITMAP_H
 
 #include "storage/itemptr.h"
+#include "utils/dsa.h"
 
 
 /*
@@ -33,6 +34,7 @@ typedef struct TIDBitmap TIDBitmap;
 
 /* Likewise, TBMIterator is private */
 typedef struct TBMIterator TBMIterator;
+typedef struct TBMSharedIterator TBMSharedIterator;
 
 /* Result structure for tbm_iterate */
 typedef struct
@@ -46,8 +48,9 @@ typedef struct
 
 /* function prototypes in nodes/tidbitmap.c */
 
-extern TIDBitmap *tbm_create(long maxbytes);
+extern TIDBitmap *tbm_create(long maxbytes, dsa_area *dsa);
 extern void tbm_free(TIDBitmap *tbm);
+extern void tbm_free_shared_area(dsa_area *dsa, dsa_pointer dp);
 
 extern void tbm_add_tuples(TIDBitmap *tbm,
                           const ItemPointer tids, int ntids,
@@ -60,7 +63,12 @@ extern void tbm_intersect(TIDBitmap *a, const TIDBitmap *b);
 extern bool tbm_is_empty(const TIDBitmap *tbm);
 
 extern TBMIterator *tbm_begin_iterate(TIDBitmap *tbm);
+extern dsa_pointer tbm_prepare_shared_iterate(TIDBitmap *tbm);
 extern TBMIterateResult *tbm_iterate(TBMIterator *iterator);
+extern TBMIterateResult *tbm_shared_iterate(TBMSharedIterator *iterator);
 extern void tbm_end_iterate(TBMIterator *iterator);
+extern void tbm_end_shared_iterate(TBMSharedIterator *iterator);
+extern TBMSharedIterator *tbm_attach_shared_iterate(dsa_area *dsa,
+                                                 dsa_pointer dp);
 
 #endif   /* TIDBITMAP_H */
index 8bd93c3b819eed1a883661129900d12985105b83..0cd45bb6d8e959f81a8c8f80d71eb1542b40024e 100644 (file)
@@ -212,6 +212,7 @@ typedef enum BuiltinTrancheIds
        LWTRANCHE_LOCK_MANAGER,
        LWTRANCHE_PREDICATE_LOCK_MANAGER,
        LWTRANCHE_PARALLEL_QUERY_DSA,
+       LWTRANCHE_TBM,
        LWTRANCHE_FIRST_USER_DEFINED
 }      BuiltinTrancheIds;