From da56e57695b9ec4f623023c2abfa77cec6b2da09 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Tue, 17 May 2005 00:43:47 +0000 Subject: [PATCH] Modify tidbitmap.c to avoid creating a hash table until there is more than one heap page represented in the bitmap. This is a bit ugly but it cuts overhead fairly effectively in simple join cases. Per example from Sergey Koposov. --- src/backend/nodes/tidbitmap.c | 440 ++++++++++++++++++++++------------ 1 file changed, 291 insertions(+), 149 deletions(-) diff --git a/src/backend/nodes/tidbitmap.c b/src/backend/nodes/tidbitmap.c index 6982038042..d0ca978cb6 100644 --- a/src/backend/nodes/tidbitmap.c +++ b/src/backend/nodes/tidbitmap.c @@ -23,7 +23,7 @@ * Copyright (c) 2003-2005, PostgreSQL Global Development Group * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/nodes/tidbitmap.c,v 1.2 2005/04/19 22:35:15 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/nodes/tidbitmap.c,v 1.3 2005/05/17 00:43:47 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -90,6 +90,24 @@ typedef struct PagetableEntry bitmapword words[Max(WORDS_PER_PAGE, WORDS_PER_CHUNK)]; } PagetableEntry; +/* + * dynahash.c is optimized for relatively large, long-lived hash tables. + * This is not ideal for TIDBitMap, particularly when we are using a bitmap + * scan on the inside of a nestloop join: a bitmap may well live only long + * enough to accumulate one entry in such cases. We therefore avoid creating + * an actual hashtable until we need two pagetable entries. When just one + * pagetable entry is needed, we store it in a fixed field of TIDBitMap. + * (NOTE: we don't get rid of the hashtable if the bitmap later shrinks down + * to zero or one page again. So, status can be TBM_HASH even when nentries + * is zero or one.) + */ +typedef enum +{ + TBM_EMPTY, /* no hashtable, nentries == 0 */ + TBM_ONE_PAGE, /* entry1 contains the single entry */ + TBM_HASH /* pagetable is valid, entry1 is not */ +} TBMStatus; + /* * Here is the representation for a whole TIDBitMap: */ @@ -97,25 +115,29 @@ struct TIDBitmap { NodeTag type; /* to make it a valid Node */ MemoryContext mcxt; /* memory context containing me */ + TBMStatus status; /* see codes above */ HTAB *pagetable; /* hash table of PagetableEntry's */ int nentries; /* number of entries in pagetable */ int maxentries; /* limit on same to meet maxbytes */ int npages; /* number of exact entries in pagetable */ int nchunks; /* number of lossy entries in pagetable */ bool iterating; /* tbm_begin_iterate called? */ + PagetableEntry entry1; /* used when status == TBM_ONE_PAGE */ /* the remaining fields are used while producing sorted output: */ - TBMIterateResult *output; /* NULL if not yet created */ PagetableEntry **spages; /* sorted exact-page list, or NULL */ PagetableEntry **schunks; /* sorted lossy-chunk list, or NULL */ int spageptr; /* next spages index */ int schunkptr; /* next schunks index */ int schunkbit; /* next bit to check in current schunk */ + TBMIterateResult output; /* MUST BE LAST (because variable-size) */ }; /* Local function prototypes */ -static PagetableEntry *tbm_find_pageentry(const TIDBitmap *tbm, - BlockNumber pageno); +static void tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage); +static bool tbm_intersect_page(PagetableEntry *apage, const TIDBitmap *b); +static const PagetableEntry *tbm_find_pageentry(const TIDBitmap *tbm, + BlockNumber pageno); static PagetableEntry *tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno); static bool tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno); static void tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno); @@ -134,37 +156,79 @@ TIDBitmap * tbm_create(long maxbytes) { TIDBitmap *tbm; - HASHCTL hash_ctl; long nbuckets; - tbm = makeNode(TIDBitmap); - /* we rely on makeNode to have zeroed all the fields */ + /* + * Create the TIDBitmap struct, with enough trailing space to serve + * the needs of the TBMIterateResult sub-struct. + */ + tbm = (TIDBitmap *) palloc(sizeof(TIDBitmap) + + MAX_TUPLES_PER_PAGE * sizeof(OffsetNumber)); + /* Zero all the fixed fields */ + MemSetAligned(tbm, 0, sizeof(TIDBitmap)); + + tbm->type = T_TIDBitmap; /* Set NodeTag */ tbm->mcxt = CurrentMemoryContext; + tbm->status = TBM_EMPTY; /* * Estimate number of hashtable entries we can have within maxbytes. * This estimates the hash overhead at MAXALIGN(sizeof(HASHELEMENT)) * plus a pointer per hash entry, which is crude but good enough for - * our purpose. (NOTE: this does not count the space for data - * structures created during iteration readout.) + * our purpose. Also count an extra Pointer per entry for the arrays + * created during iteration readout. */ nbuckets = maxbytes / (MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(sizeof(PagetableEntry)) - + sizeof(Pointer)); + + sizeof(Pointer) + sizeof(Pointer)); nbuckets = Min(nbuckets, INT_MAX-1); /* safety limit */ + nbuckets = Max(nbuckets, 16); /* sanity limit */ tbm->maxentries = (int) nbuckets; + return tbm; +} + +/* + * Actually create the hashtable. Since this is a moderately expensive + * proposition, we don't do it until we have to. + */ +static void +tbm_create_pagetable(TIDBitmap *tbm) +{ + HASHCTL hash_ctl; + + Assert(tbm->status != TBM_HASH); + Assert(tbm->pagetable == NULL); + + /* Create the hashtable proper */ MemSet(&hash_ctl, 0, sizeof(hash_ctl)); hash_ctl.keysize = sizeof(BlockNumber); hash_ctl.entrysize = sizeof(PagetableEntry); hash_ctl.hash = tag_hash; - hash_ctl.hcxt = CurrentMemoryContext; + hash_ctl.hcxt = tbm->mcxt; tbm->pagetable = hash_create("TIDBitmap", 128, /* start small and extend */ &hash_ctl, HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT); - return tbm; + /* If entry1 is valid, push it into the hashtable */ + if (tbm->status == TBM_ONE_PAGE) + { + PagetableEntry *page; + bool found; + + page = (PagetableEntry *) hash_search(tbm->pagetable, + (void *) &tbm->entry1.blockno, + HASH_ENTER, &found); + if (page == NULL) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + Assert(!found); + memcpy(page, &tbm->entry1, sizeof(PagetableEntry)); + } + + tbm->status = TBM_HASH; } /* @@ -173,9 +237,8 @@ tbm_create(long maxbytes) void tbm_free(TIDBitmap *tbm) { - hash_destroy(tbm->pagetable); - if (tbm->output) - pfree(tbm->output); + if (tbm->pagetable) + hash_destroy(tbm->pagetable); if (tbm->spages) pfree(tbm->spages); if (tbm->schunks) @@ -235,62 +298,77 @@ tbm_add_tuples(TIDBitmap *tbm, const ItemPointer tids, int ntids) void tbm_union(TIDBitmap *a, const TIDBitmap *b) { - HASH_SEQ_STATUS status; + Assert(!a->iterating); + /* Nothing to do if b is empty */ + if (b->nentries == 0) + return; + /* Scan through chunks and pages in b, merge into a */ + if (b->status == TBM_ONE_PAGE) + tbm_union_page(a, &b->entry1); + else + { + HASH_SEQ_STATUS status; + PagetableEntry *bpage; + + Assert(b->status == TBM_HASH); + hash_seq_init(&status, b->pagetable); + while ((bpage = (PagetableEntry *) hash_seq_search(&status)) != NULL) + tbm_union_page(a, bpage); + } +} + +/* Process one page of b during a union op */ +static void +tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage) +{ PagetableEntry *apage; - PagetableEntry *bpage; int wordnum; - Assert(!a->iterating); - /* Scan through chunks and pages in b, merge into a */ - hash_seq_init(&status, b->pagetable); - while ((bpage = (PagetableEntry *) hash_seq_search(&status)) != NULL) + if (bpage->ischunk) { - if (bpage->ischunk) + /* Scan b's chunk, mark each indicated page lossy in a */ + for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++) { - /* Scan b's chunk, mark each indicated page lossy in a */ - for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++) + bitmapword w = bpage->words[wordnum]; + + if (w != 0) { - bitmapword w = bpage->words[wordnum]; + BlockNumber pg; - if (w != 0) + pg = bpage->blockno + (wordnum * BITS_PER_BITMAPWORD); + while (w != 0) { - BlockNumber pg; - - pg = bpage->blockno + (wordnum * BITS_PER_BITMAPWORD); - while (w != 0) - { - if (w & 1) - tbm_mark_page_lossy(a, pg); - pg++; - w >>= 1; - } + if (w & 1) + tbm_mark_page_lossy(a, pg); + pg++; + w >>= 1; } } } - else if (tbm_page_is_lossy(a, bpage->blockno)) + } + else if (tbm_page_is_lossy(a, bpage->blockno)) + { + /* page is already lossy in a, nothing to do */ + return; + } + else + { + apage = tbm_get_pageentry(a, bpage->blockno); + if (apage->ischunk) { - /* page is already lossy in a, nothing to do */ - continue; + /* The page is a lossy chunk header, set bit for itself */ + apage->words[0] |= ((bitmapword) 1 << 0); } else { - apage = tbm_get_pageentry(a, bpage->blockno); - if (apage->ischunk) - { - /* The page is a lossy chunk header, set bit for itself */ - apage->words[0] |= ((bitmapword) 1 << 0); - } - else - { - /* Both pages are exact, merge at the bit level */ - for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++) - apage->words[wordnum] |= bpage->words[wordnum]; - } + /* Both pages are exact, merge at the bit level */ + for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++) + apage->words[wordnum] |= bpage->words[wordnum]; } - - if (a->nentries > a->maxentries) - tbm_lossify(a); } + + if (a->nentries > a->maxentries) + tbm_lossify(a); } /* @@ -301,96 +379,121 @@ tbm_union(TIDBitmap *a, const TIDBitmap *b) void tbm_intersect(TIDBitmap *a, const TIDBitmap *b) { - HASH_SEQ_STATUS status; - PagetableEntry *apage; - PagetableEntry *bpage; - int wordnum; - Assert(!a->iterating); + /* Nothing to do if a is empty */ + if (a->nentries == 0) + return; /* Scan through chunks and pages in a, try to match to b */ - hash_seq_init(&status, a->pagetable); - while ((apage = (PagetableEntry *) hash_seq_search(&status)) != NULL) + if (a->status == TBM_ONE_PAGE) { - if (apage->ischunk) + if (tbm_intersect_page(&a->entry1, b)) { - /* Scan each bit in chunk, try to clear */ - bool candelete = true; - - for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++) - { - bitmapword w = apage->words[wordnum]; - - if (w != 0) - { - bitmapword neww = w; - BlockNumber pg; - int bitnum; + /* Page is now empty, remove it from a */ + Assert(!a->entry1.ischunk); + a->npages--; + a->nentries--; + Assert(a->nentries == 0); + a->status = TBM_EMPTY; + } + } + else + { + HASH_SEQ_STATUS status; + PagetableEntry *apage; - pg = apage->blockno + (wordnum * BITS_PER_BITMAPWORD); - bitnum = 0; - while (w != 0) - { - if (w & 1) - { - if (!tbm_page_is_lossy(b, pg) && - tbm_find_pageentry(b, pg) == NULL) - { - /* Page is not in b at all, lose lossy bit */ - neww &= ~((bitmapword) 1 << bitnum); - } - } - pg++; - bitnum++; - w >>= 1; - } - apage->words[wordnum] = neww; - if (neww != 0) - candelete = false; - } - } - if (candelete) + Assert(a->status == TBM_HASH); + hash_seq_init(&status, a->pagetable); + while ((apage = (PagetableEntry *) hash_seq_search(&status)) != NULL) + { + if (tbm_intersect_page(apage, b)) { - /* Chunk is now empty, remove it from a */ + /* Page or chunk is now empty, remove it from a */ + if (apage->ischunk) + a->nchunks--; + else + a->npages--; + a->nentries--; if (hash_search(a->pagetable, (void *) &apage->blockno, HASH_REMOVE, NULL) == NULL) elog(ERROR, "hash table corrupted"); - a->nentries--; - a->nchunks--; } } - else if (tbm_page_is_lossy(b, apage->blockno)) - { - /* page is lossy in b, cannot clear any bits */ - continue; - } - else + } +} + +/* + * Process one page of a during an intersection op + * + * Returns TRUE if apage is now empty and should be deleted from a + */ +static bool +tbm_intersect_page(PagetableEntry *apage, const TIDBitmap *b) +{ + const PagetableEntry *bpage; + int wordnum; + + if (apage->ischunk) + { + /* Scan each bit in chunk, try to clear */ + bool candelete = true; + + for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++) { - bool candelete = true; + bitmapword w = apage->words[wordnum]; - bpage = tbm_find_pageentry(b, apage->blockno); - if (bpage != NULL) + if (w != 0) { - /* Both pages are exact, merge at the bit level */ - Assert(!bpage->ischunk); - for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++) + bitmapword neww = w; + BlockNumber pg; + int bitnum; + + pg = apage->blockno + (wordnum * BITS_PER_BITMAPWORD); + bitnum = 0; + while (w != 0) { - apage->words[wordnum] &= bpage->words[wordnum]; - if (apage->words[wordnum] != 0) - candelete = false; + if (w & 1) + { + if (!tbm_page_is_lossy(b, pg) && + tbm_find_pageentry(b, pg) == NULL) + { + /* Page is not in b at all, lose lossy bit */ + neww &= ~((bitmapword) 1 << bitnum); + } + } + pg++; + bitnum++; + w >>= 1; } + apage->words[wordnum] = neww; + if (neww != 0) + candelete = false; } - if (candelete) + } + return candelete; + } + else if (tbm_page_is_lossy(b, apage->blockno)) + { + /* page is lossy in b, cannot clear any bits */ + return false; + } + else + { + bool candelete = true; + + bpage = tbm_find_pageentry(b, apage->blockno); + if (bpage != NULL) + { + /* Both pages are exact, merge at the bit level */ + Assert(!bpage->ischunk); + for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++) { - /* Page is now empty, remove it from a */ - if (hash_search(a->pagetable, - (void *) &apage->blockno, - HASH_REMOVE, NULL) == NULL) - elog(ERROR, "hash table corrupted"); - a->nentries--; - a->npages--; + apage->words[wordnum] &= bpage->words[wordnum]; + if (apage->words[wordnum] != 0) + candelete = false; } } + return candelete; } } @@ -411,15 +514,16 @@ tbm_begin_iterate(TIDBitmap *tbm) tbm->iterating = true; /* - * Allocate the output data structure if we didn't already. - * (We don't do this during tbm_create since it's entirely possible - * that a TIDBitmap will live and die without ever being iterated.) + * Reset iteration pointers. + */ + tbm->spageptr = 0; + tbm->schunkptr = 0; + tbm->schunkbit = 0; + /* + * Nothing else to do if no entries, nor if we don't have a hashtable. */ - if (!tbm->output) - tbm->output = (TBMIterateResult *) - MemoryContextAllocZero(tbm->mcxt, - sizeof(TBMIterateResult) + - MAX_TUPLES_PER_PAGE * sizeof(OffsetNumber)); + if (tbm->nentries == 0 || tbm->status != TBM_HASH) + return; /* * Create and fill the sorted page lists if we didn't already. */ @@ -447,12 +551,6 @@ tbm_begin_iterate(TIDBitmap *tbm) qsort(tbm->spages, npages, sizeof(PagetableEntry *), tbm_comparator); if (nchunks > 1) qsort(tbm->schunks, nchunks, sizeof(PagetableEntry *), tbm_comparator); - /* - * Reset iteration pointers. - */ - tbm->spageptr = 0; - tbm->schunkptr = 0; - tbm->schunkbit = 0; } /* @@ -468,7 +566,7 @@ tbm_begin_iterate(TIDBitmap *tbm) TBMIterateResult * tbm_iterate(TIDBitmap *tbm) { - TBMIterateResult *output = tbm->output; + TBMIterateResult *output = &(tbm->output); Assert(tbm->iterating); /* @@ -521,10 +619,16 @@ tbm_iterate(TIDBitmap *tbm) if (tbm->spageptr < tbm->npages) { - PagetableEntry *page = tbm->spages[tbm->spageptr]; + PagetableEntry *page; int ntuples; int wordnum; + /* In ONE_PAGE state, we don't allocate an spages[] array */ + if (tbm->status == TBM_ONE_PAGE) + page = &tbm->entry1; + else + page = tbm->spages[tbm->spageptr]; + /* scan bitmap to extract individual offset numbers */ ntuples = 0; for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++) @@ -559,10 +663,22 @@ tbm_iterate(TIDBitmap *tbm) * * Returns NULL if there is no non-lossy entry for the pageno. */ -static PagetableEntry * +static const PagetableEntry * tbm_find_pageentry(const TIDBitmap *tbm, BlockNumber pageno) { - PagetableEntry *page; + const PagetableEntry *page; + + if (tbm->nentries == 0) /* in case pagetable doesn't exist */ + return NULL; + + if (tbm->status == TBM_ONE_PAGE) + { + page = &tbm->entry1; + if (page->blockno != pageno) + return NULL; + Assert(!page->ischunk); + return page; + } page = (PagetableEntry *) hash_search(tbm->pagetable, (void *) &pageno, @@ -588,14 +704,33 @@ tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno) PagetableEntry *page; bool found; - /* Look up or create an entry */ - page = (PagetableEntry *) hash_search(tbm->pagetable, - (void *) &pageno, - HASH_ENTER, &found); - if (page == NULL) - ereport(ERROR, - (errcode(ERRCODE_OUT_OF_MEMORY), - errmsg("out of memory"))); + if (tbm->status == TBM_EMPTY) + { + /* Use the fixed slot */ + page = &tbm->entry1; + found = false; + tbm->status = TBM_ONE_PAGE; + } + else + { + if (tbm->status == TBM_ONE_PAGE) + { + page = &tbm->entry1; + if (page->blockno == pageno) + return page; + /* Time to switch from one page to a hashtable */ + tbm_create_pagetable(tbm); + } + + /* Look up or create an entry */ + page = (PagetableEntry *) hash_search(tbm->pagetable, + (void *) &pageno, + HASH_ENTER, &found); + if (page == NULL) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + } /* Initialize it if not present before */ if (!found) @@ -623,6 +758,7 @@ tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno) /* we can skip the lookup if there are no lossy chunks */ if (tbm->nchunks == 0) return false; + Assert(tbm->status == TBM_HASH); bitno = pageno % PAGES_PER_CHUNK; chunk_pageno = pageno - bitno; @@ -656,6 +792,10 @@ tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno) int wordnum; int bitnum; + /* We force the bitmap into hashtable mode whenever it's lossy */ + if (tbm->status != TBM_HASH) + tbm_create_pagetable(tbm); + bitno = pageno % PAGES_PER_CHUNK; chunk_pageno = pageno - bitno; @@ -731,6 +871,8 @@ tbm_lossify(TIDBitmap *tbm) * during each call. */ Assert(!tbm->iterating); + Assert(tbm->status == TBM_HASH); + hash_seq_init(&status, tbm->pagetable); while ((page = (PagetableEntry *) hash_seq_search(&status)) != NULL) { -- 2.40.0