From db306521351a072a56bf7ab526932c78beb0a4aa Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Sun, 17 Apr 2005 22:24:02 +0000 Subject: [PATCH] Initial implementation of lossy-tuple-bitmap data structures. Not connected to anything useful yet ... --- src/backend/nodes/Makefile | 4 +- src/backend/nodes/tidbitmap.c | 774 ++++++++++++++++++++++++++++++++++ src/include/nodes/nodes.h | 13 +- src/include/nodes/tidbitmap.h | 55 +++ 4 files changed, 840 insertions(+), 6 deletions(-) create mode 100644 src/backend/nodes/tidbitmap.c create mode 100644 src/include/nodes/tidbitmap.h diff --git a/src/backend/nodes/Makefile b/src/backend/nodes/Makefile index 86f533e7a0..3fdf3dfe43 100644 --- a/src/backend/nodes/Makefile +++ b/src/backend/nodes/Makefile @@ -4,7 +4,7 @@ # Makefile for backend/nodes # # IDENTIFICATION -# $PostgreSQL: pgsql/src/backend/nodes/Makefile,v 1.17 2004/08/02 01:30:42 tgl Exp $ +# $PostgreSQL: pgsql/src/backend/nodes/Makefile,v 1.18 2005/04/17 22:24:02 tgl Exp $ # #------------------------------------------------------------------------- @@ -12,7 +12,7 @@ subdir = src/backend/nodes top_builddir = ../../.. include $(top_builddir)/src/Makefile.global -OBJS = nodeFuncs.o nodes.o list.o bitmapset.o \ +OBJS = nodeFuncs.o nodes.o list.o bitmapset.o tidbitmap.o \ copyfuncs.o equalfuncs.o makefuncs.o \ outfuncs.o readfuncs.o print.o read.o params.o value.o diff --git a/src/backend/nodes/tidbitmap.c b/src/backend/nodes/tidbitmap.c new file mode 100644 index 0000000000..f250515921 --- /dev/null +++ b/src/backend/nodes/tidbitmap.c @@ -0,0 +1,774 @@ +/*------------------------------------------------------------------------- + * + * tidbitmap.c + * PostgreSQL tuple-id (TID) bitmap package + * + * This module provides bitmap data structures that are spiritually + * similar to Bitmapsets, but are specially adapted to store sets of + * tuple identifiers (TIDs), or ItemPointers. In particular, the division + * of an ItemPointer into BlockNumber and OffsetNumber is catered for. + * Also, since we wish to be able to store very large tuple sets in + * memory with this data structure, we support "lossy" storage, in which + * we no longer remember individual tuple offsets on a page but only the + * fact that a particular page needs to be visited. + * + * The "lossy" storage uses one bit per disk page, so at the standard 8K + * BLCKSZ, we can represent all pages in 64Gb of disk space in about 1Mb + * of memory. People pushing around tables of that size should have a + * couple of Mb to spare, so we don't worry about providing a second level + * of lossiness. In theory we could fall back to page ranges at some + * point, but for now that seems useless complexity. + * + * + * Copyright (c) 2003-2005, PostgreSQL Global Development Group + * + * IDENTIFICATION + * $PostgreSQL: pgsql/src/backend/nodes/tidbitmap.c,v 1.1 2005/04/17 22:24:02 tgl Exp $ + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include + +#include "access/htup.h" +#include "nodes/tidbitmap.h" +#include "utils/hsearch.h" + + +/* + * The maximum number of tuples per page is not large (typically 256 with + * 8K pages, or 1024 with 32K pages). So there's not much point in making + * the per-page bitmaps variable size. We just legislate that the size + * is this: + */ +#define MAX_TUPLES_PER_PAGE ((BLCKSZ - 1) / MAXALIGN(offsetof(HeapTupleHeaderData, t_bits) + sizeof(ItemIdData)) + 1) + +/* + * When we have to switch over to lossy storage, we use a data structure + * with one bit per page, where all pages having the same number DIV + * PAGES_PER_CHUNK are aggregated into one chunk. When a chunk is present + * and has the bit set for a given page, there must not be a per-page entry + * for that page in the page table. + * + * We actually store both exact pages and lossy chunks in the same hash + * table, using identical data structures. (This is because dynahash.c's + * memory management doesn't allow space to be transferred easily from one + * hashtable to another.) Therefore it's best if PAGES_PER_CHUNK is the + * same as MAX_TUPLES_PER_PAGE, or at least not too different. But we + * also want PAGES_PER_CHUNK to be a power of 2 to avoid expensive integer + * remainder operations. So, define it like this: + */ +#define PAGES_PER_CHUNK (BLCKSZ / 32) + +/* The bitmap unit size can be adjusted by changing these declarations: */ +#define BITS_PER_BITMAPWORD 32 +typedef uint32 bitmapword; /* must be an unsigned type */ + +#define WORDNUM(x) ((x) / BITS_PER_BITMAPWORD) +#define BITNUM(x) ((x) % BITS_PER_BITMAPWORD) + +/* number of active words for an exact page: */ +#define WORDS_PER_PAGE ((MAX_TUPLES_PER_PAGE - 1) / BITS_PER_BITMAPWORD + 1) +/* number of active words for a lossy chunk: */ +#define WORDS_PER_CHUNK ((PAGES_PER_CHUNK - 1) / BITS_PER_BITMAPWORD + 1) + +/* + * The hashtable entries are represented by this data structure. For + * an exact page, blockno is the page number and bit k of the bitmap + * represents tuple offset k+1. For a lossy chunk, blockno is the first + * page in the chunk (this must be a multiple of PAGES_PER_CHUNK) and + * bit k represents page blockno+k. Note that it is not possible to + * have exact storage for the first page of a chunk if we are using + * lossy storage for any page in the chunk's range, since the same + * hashtable entry has to serve both purposes. + */ +typedef struct PagetableEntry +{ + BlockNumber blockno; /* page number (hashtable key) */ + bool ischunk; /* T = lossy storage, F = exact */ + bitmapword words[Max(WORDS_PER_PAGE, WORDS_PER_CHUNK)]; +} PagetableEntry; + +/* + * Here is the representation for a whole TIDBitMap: + */ +struct TIDBitmap +{ + NodeTag type; /* to make it a valid Node */ + MemoryContext mcxt; /* memory context containing me */ + HTAB *pagetable; /* hash table of PagetableEntry's */ + int nentries; /* number of entries in pagetable */ + int maxentries; /* limit on same to meet maxbytes */ + int npages; /* number of exact entries in pagetable */ + int nchunks; /* number of lossy entries in pagetable */ + bool iterating; /* tbm_begin_iterate called? */ + /* the remaining fields are used while producing sorted output: */ + TBMIterateResult *output; /* NULL if not yet created */ + PagetableEntry **spages; /* sorted exact-page list, or NULL */ + PagetableEntry **schunks; /* sorted lossy-chunk list, or NULL */ + int spageptr; /* next spages index */ + int schunkptr; /* next schunks index */ + int schunkbit; /* next bit to check in current schunk */ +}; + + +/* Local function prototypes */ +static PagetableEntry *tbm_find_pageentry(const TIDBitmap *tbm, + BlockNumber pageno); +static PagetableEntry *tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno); +static bool tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno); +static void tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno); +static void tbm_lossify(TIDBitmap *tbm); +static int tbm_comparator(const void *left, const void *right); + + +/* + * tbm_create - create an initially-empty bitmap + * + * The bitmap will live in the memory context that is CurrentMemoryContext + * at the time of this call. It will be limited to (approximately) maxbytes + * total memory consumption. + */ +TIDBitmap * +tbm_create(long maxbytes) +{ + TIDBitmap *tbm; + HASHCTL hash_ctl; + long nbuckets; + + tbm = makeNode(TIDBitmap); + /* we rely on makeNode to have zeroed all the fields */ + tbm->mcxt = CurrentMemoryContext; + + /* + * Estimate number of hashtable entries we can have within maxbytes. + * This estimates the hash overhead at MAXALIGN(sizeof(HASHELEMENT)) + * plus a pointer per hash entry, which is crude but good enough for + * our purpose. (NOTE: this does not count the space for data + * structures created during iteration readout.) + */ + nbuckets = maxbytes / + (MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(sizeof(PagetableEntry)) + + sizeof(Pointer)); + nbuckets = Min(nbuckets, INT_MAX-1); /* safety limit */ + tbm->maxentries = (int) nbuckets; + + MemSet(&hash_ctl, 0, sizeof(hash_ctl)); + hash_ctl.keysize = sizeof(BlockNumber); + hash_ctl.entrysize = sizeof(PagetableEntry); + hash_ctl.hash = tag_hash; + hash_ctl.hcxt = CurrentMemoryContext; + tbm->pagetable = hash_create("TIDBitmap", + nbuckets, + &hash_ctl, + HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT); + + return tbm; +} + +/* + * tbm_free - free a TIDBitmap + */ +void +tbm_free(TIDBitmap *tbm) +{ + hash_destroy(tbm->pagetable); + if (tbm->output) + pfree(tbm->output); + if (tbm->spages) + pfree(tbm->spages); + if (tbm->schunks) + pfree(tbm->schunks); + pfree(tbm); +} + +/* + * tbm_add_tuples - add some tuple IDs to a TIDBitmap + */ +void +tbm_add_tuples(TIDBitmap *tbm, const ItemPointer tids, int ntids) +{ + int i; + + Assert(!tbm->iterating); + for (i = 0; i < ntids; i++) + { + BlockNumber blk = ItemPointerGetBlockNumber(tids + i); + OffsetNumber off = ItemPointerGetOffsetNumber(tids + i); + PagetableEntry *page; + int wordnum, + bitnum; + + /* safety check to ensure we don't overrun bit array bounds */ + if (off < 1 || off > MAX_TUPLES_PER_PAGE) + elog(ERROR, "tuple offset out of range: %u", off); + + if (tbm_page_is_lossy(tbm, blk)) + continue; /* whole page is already marked */ + + page = tbm_get_pageentry(tbm, blk); + + if (page->ischunk) + { + /* The page is a lossy chunk header, set bit for itself */ + wordnum = bitnum = 0; + } + else + { + /* Page is exact, so set bit for individual tuple */ + wordnum = WORDNUM(off - 1); + bitnum = BITNUM(off - 1); + } + page->words[wordnum] |= ((bitmapword) 1 << bitnum); + + if (tbm->nentries > tbm->maxentries) + tbm_lossify(tbm); + } +} + +/* + * tbm_union - set union + * + * a is modified in-place, b is not changed + */ +void +tbm_union(TIDBitmap *a, const TIDBitmap *b) +{ + HASH_SEQ_STATUS status; + PagetableEntry *apage; + PagetableEntry *bpage; + int wordnum; + + Assert(!a->iterating); + /* Scan through chunks and pages in b, merge into a */ + hash_seq_init(&status, b->pagetable); + while ((bpage = (PagetableEntry *) hash_seq_search(&status)) != NULL) + { + if (bpage->ischunk) + { + /* Scan b's chunk, mark each indicated page lossy in a */ + for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++) + { + bitmapword w = bpage->words[wordnum]; + + if (w != 0) + { + BlockNumber pg; + + pg = bpage->blockno + (wordnum * BITS_PER_BITMAPWORD); + while (w != 0) + { + if (w & 1) + tbm_mark_page_lossy(a, pg); + pg++; + w >>= 1; + } + } + } + } + else if (tbm_page_is_lossy(a, bpage->blockno)) + { + /* page is already lossy in a, nothing to do */ + continue; + } + else + { + apage = tbm_get_pageentry(a, bpage->blockno); + if (apage->ischunk) + { + /* The page is a lossy chunk header, set bit for itself */ + apage->words[0] |= ((bitmapword) 1 << 0); + } + else + { + /* Both pages are exact, merge at the bit level */ + for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++) + apage->words[wordnum] |= bpage->words[wordnum]; + } + } + + if (a->nentries > a->maxentries) + tbm_lossify(a); + } +} + +/* + * tbm_intersect - set intersection + * + * a is modified in-place, b is not changed + */ +void +tbm_intersect(TIDBitmap *a, const TIDBitmap *b) +{ + HASH_SEQ_STATUS status; + PagetableEntry *apage; + PagetableEntry *bpage; + int wordnum; + + Assert(!a->iterating); + /* Scan through chunks and pages in a, try to match to b */ + hash_seq_init(&status, a->pagetable); + while ((apage = (PagetableEntry *) hash_seq_search(&status)) != NULL) + { + if (apage->ischunk) + { + /* Scan each bit in chunk, try to clear */ + bool candelete = true; + + for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++) + { + bitmapword w = apage->words[wordnum]; + + if (w != 0) + { + bitmapword neww = w; + BlockNumber pg; + int bitnum; + + pg = apage->blockno + (wordnum * BITS_PER_BITMAPWORD); + bitnum = 0; + while (w != 0) + { + if (w & 1) + { + if (!tbm_page_is_lossy(b, pg) && + tbm_find_pageentry(b, pg) == NULL) + { + /* Page is not in b at all, lose lossy bit */ + neww &= ~((bitmapword) 1 << bitnum); + } + } + pg++; + bitnum++; + w >>= 1; + } + apage->words[wordnum] = neww; + if (neww != 0) + candelete = false; + } + } + if (candelete) + { + /* Chunk is now empty, remove it from a */ + if (hash_search(a->pagetable, + (void *) &apage->blockno, + HASH_REMOVE, NULL) == NULL) + elog(ERROR, "hash table corrupted"); + a->nentries--; + a->nchunks--; + } + } + else if (tbm_page_is_lossy(b, apage->blockno)) + { + /* page is lossy in b, cannot clear any bits */ + continue; + } + else + { + bool candelete = true; + + bpage = tbm_find_pageentry(b, apage->blockno); + if (bpage != NULL) + { + /* Both pages are exact, merge at the bit level */ + Assert(!bpage->ischunk); + for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++) + { + apage->words[wordnum] &= bpage->words[wordnum]; + if (apage->words[wordnum] != 0) + candelete = false; + } + } + if (candelete) + { + /* Page is now empty, remove it from a */ + if (hash_search(a->pagetable, + (void *) &apage->blockno, + HASH_REMOVE, NULL) == NULL) + elog(ERROR, "hash table corrupted"); + a->nentries--; + a->npages--; + } + } + } +} + +/* + * tbm_begin_iterate - prepare to iterate through a TIDBitmap + * + * NB: after this is called, it is no longer allowed to modify the contents + * of the bitmap. However, you can call this multiple times to scan the + * contents repeatedly. + */ +void +tbm_begin_iterate(TIDBitmap *tbm) +{ + HASH_SEQ_STATUS status; + PagetableEntry *page; + int npages; + int nchunks; + + tbm->iterating = true; + /* + * Allocate the output data structure if we didn't already. + * (We don't do this during tbm_create since it's entirely possible + * that a TIDBitmap will live and die without ever being iterated.) + */ + if (!tbm->output) + tbm->output = (TBMIterateResult *) + MemoryContextAllocZero(tbm->mcxt, + sizeof(TBMIterateResult) + + MAX_TUPLES_PER_PAGE * sizeof(OffsetNumber)); + /* + * Create and fill the sorted page lists if we didn't already. + */ + if (!tbm->spages && tbm->npages > 0) + tbm->spages = (PagetableEntry **) + MemoryContextAlloc(tbm->mcxt, + tbm->npages * sizeof(PagetableEntry *)); + if (!tbm->schunks && tbm->nchunks > 0) + tbm->schunks = (PagetableEntry **) + MemoryContextAlloc(tbm->mcxt, + tbm->nchunks * sizeof(PagetableEntry *)); + + hash_seq_init(&status, tbm->pagetable); + npages = nchunks = 0; + while ((page = (PagetableEntry *) hash_seq_search(&status)) != NULL) + { + if (page->ischunk) + tbm->schunks[nchunks++] = page; + else + tbm->spages[npages++] = page; + } + Assert(npages == tbm->npages); + Assert(nchunks == tbm->nchunks); + if (npages > 1) + qsort(tbm->spages, npages, sizeof(PagetableEntry *), tbm_comparator); + if (nchunks > 1) + qsort(tbm->schunks, nchunks, sizeof(PagetableEntry *), tbm_comparator); + /* + * Reset iteration pointers. + */ + tbm->spageptr = 0; + tbm->schunkptr = 0; + tbm->schunkbit = 0; +} + +/* + * tbm_iterate - scan through next page of a TIDBitmap + * + * Returns a TBMIterateResult representing one page, or NULL if there are + * no more pages to scan. Pages are guaranteed to be delivered in numerical + * order. If result->ntuples < 0, then the bitmap is "lossy" and failed to + * remember the exact tuples to look at on this page --- the caller must + * examine all tuples on the page and check if they meet the intended + * condition. + */ +TBMIterateResult * +tbm_iterate(TIDBitmap *tbm) +{ + TBMIterateResult *output = tbm->output; + + Assert(tbm->iterating); + /* + * If lossy chunk pages remain, make sure we've advanced schunkptr/ + * schunkbit to the next set bit. + */ + while (tbm->schunkptr < tbm->nchunks) + { + PagetableEntry *chunk = tbm->schunks[tbm->schunkptr]; + int schunkbit = tbm->schunkbit; + + while (schunkbit < PAGES_PER_CHUNK) + { + int wordnum = WORDNUM(schunkbit); + int bitnum = BITNUM(schunkbit); + + if ((chunk->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0) + break; + schunkbit++; + } + if (schunkbit < PAGES_PER_CHUNK) + { + tbm->schunkbit = schunkbit; + break; + } + /* advance to next chunk */ + tbm->schunkptr++; + tbm->schunkbit = 0; + } + /* + * If both chunk and per-page data remain, must output the numerically + * earlier page. + */ + if (tbm->schunkptr < tbm->nchunks) + { + PagetableEntry *chunk = tbm->schunks[tbm->schunkptr]; + BlockNumber chunk_blockno; + + chunk_blockno = chunk->blockno + tbm->schunkbit; + if (tbm->spageptr >= tbm->npages || + chunk_blockno < tbm->spages[tbm->spageptr]->blockno) + { + /* Return a lossy page indicator from the chunk */ + output->blockno = chunk_blockno; + output->ntuples = -1; + tbm->schunkbit++; + return output; + } + } + + if (tbm->spageptr < tbm->npages) + { + PagetableEntry *page = tbm->spages[tbm->spageptr]; + int ntuples; + int wordnum; + + /* scan bitmap to extract individual offset numbers */ + ntuples = 0; + for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++) + { + bitmapword w = page->words[wordnum]; + + if (w != 0) + { + int off = wordnum * BITS_PER_BITMAPWORD + 1; + + while (w != 0) + { + if (w & 1) + output->offsets[ntuples++] = (OffsetNumber) off; + off++; + w >>= 1; + } + } + } + output->blockno = page->blockno; + output->ntuples = ntuples; + tbm->spageptr++; + return output; + } + + /* Nothing more in the bitmap */ + return NULL; +} + +/* + * tbm_find_pageentry - find a PagetableEntry for the pageno + * + * Returns NULL if there is no non-lossy entry for the pageno. + */ +static PagetableEntry * +tbm_find_pageentry(const TIDBitmap *tbm, BlockNumber pageno) +{ + PagetableEntry *page; + + page = (PagetableEntry *) hash_search(tbm->pagetable, + (void *) &pageno, + HASH_FIND, NULL); + if (page == NULL) + return NULL; + if (page->ischunk) + return NULL; /* don't want a lossy chunk header */ + return page; +} + +/* + * tbm_get_pageentry - find or create a PagetableEntry for the pageno + * + * If new, the entry is marked as an exact (non-chunk) entry. + * + * This may cause the table to exceed the desired memory size. It is + * up to the caller to call tbm_lossify() at the next safe point if so. + */ +static PagetableEntry * +tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno) +{ + PagetableEntry *page; + bool found; + + /* Look up or create an entry */ + page = (PagetableEntry *) hash_search(tbm->pagetable, + (void *) &pageno, + HASH_ENTER, &found); + if (page == NULL) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + + /* Initialize it if not present before */ + if (!found) + { + MemSet(page, 0, sizeof(PagetableEntry)); + page->blockno = pageno; + /* must count it too */ + tbm->nentries++; + tbm->npages++; + } + + return page; +} + +/* + * tbm_page_is_lossy - is the page marked as lossily stored? + */ +static bool +tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno) +{ + PagetableEntry *page; + BlockNumber chunk_pageno; + int bitno; + + /* we can skip the lookup if there are no lossy chunks */ + if (tbm->nchunks == 0) + return false; + + bitno = pageno % PAGES_PER_CHUNK; + chunk_pageno = pageno - bitno; + page = (PagetableEntry *) hash_search(tbm->pagetable, + (void *) &chunk_pageno, + HASH_FIND, NULL); + if (page != NULL && page->ischunk) + { + int wordnum = WORDNUM(bitno); + int bitnum = BITNUM(bitno); + + if ((page->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0) + return true; + } + return false; +} + +/* + * tbm_mark_page_lossy - mark the page number as lossily stored + * + * This may cause the table to exceed the desired memory size. It is + * up to the caller to call tbm_lossify() at the next safe point if so. + */ +static void +tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno) +{ + PagetableEntry *page; + bool found; + BlockNumber chunk_pageno; + int bitno; + int wordnum; + int bitnum; + + bitno = pageno % PAGES_PER_CHUNK; + chunk_pageno = pageno - bitno; + + /* + * Remove any extant non-lossy entry for the page. If the page is + * its own chunk header, however, we skip this and handle the case + * below. + */ + if (bitno != 0) + { + if (hash_search(tbm->pagetable, + (void *) &pageno, + HASH_REMOVE, NULL) != NULL) + { + /* It was present, so adjust counts */ + tbm->nentries--; + tbm->npages--; /* assume it must have been non-lossy */ + } + } + + /* Look up or create entry for chunk-header page */ + page = (PagetableEntry *) hash_search(tbm->pagetable, + (void *) &chunk_pageno, + HASH_ENTER, &found); + if (page == NULL) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + + /* Initialize it if not present before */ + if (!found) + { + MemSet(page, 0, sizeof(PagetableEntry)); + page->blockno = chunk_pageno; + page->ischunk = true; + /* must count it too */ + tbm->nentries++; + tbm->nchunks++; + } + else if (!page->ischunk) + { + /* chunk header page was formerly non-lossy, make it lossy */ + MemSet(page, 0, sizeof(PagetableEntry)); + page->blockno = chunk_pageno; + page->ischunk = true; + /* we assume it had some tuple bit(s) set, so mark it lossy */ + page->words[0] = ((bitmapword) 1 << 0); + /* adjust counts */ + tbm->nchunks++; + tbm->npages--; + } + + /* Now set the original target page's bit */ + wordnum = WORDNUM(bitno); + bitnum = BITNUM(bitno); + page->words[wordnum] |= ((bitmapword) 1 << bitnum); +} + +/* + * tbm_lossify - lose some information to get back under the memory limit + */ +static void +tbm_lossify(TIDBitmap *tbm) +{ + HASH_SEQ_STATUS status; + PagetableEntry *page; + + /* + * XXX Really stupid implementation: this just lossifies pages in + * essentially random order. We should be paying some attention + * to the number of bits set in each page, instead. Also it might + * be a good idea to lossify more than the minimum number of pages + * during each call. + */ + Assert(!tbm->iterating); + hash_seq_init(&status, tbm->pagetable); + while ((page = (PagetableEntry *) hash_seq_search(&status)) != NULL) + { + if (page->ischunk) + continue; /* already a chunk header */ + /* + * If the page would become a chunk header, we won't save anything + * by converting it to lossy, so skip it. + */ + if ((page->blockno % PAGES_PER_CHUNK) == 0) + continue; + + /* This does the dirty work ... */ + tbm_mark_page_lossy(tbm, page->blockno); + + if (tbm->nentries <= tbm->maxentries) + return; /* we have done enough */ + + /* + * Note: tbm_mark_page_lossy may have inserted a lossy chunk into + * the hashtable. We can continue the same seq_search scan since + * we do not care whether we visit lossy chunks or not. + */ + } +} + +/* + * qsort comparator to handle PagetableEntry pointers. + */ +static int +tbm_comparator(const void *left, const void *right) +{ + BlockNumber l = (*((const PagetableEntry **) left))->blockno; + BlockNumber r = (*((const PagetableEntry **) right))->blockno; + + if (l < r) + return -1; + else if (l > r) + return 1; + return 0; +} diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h index 4822b5f6cc..3c5528546a 100644 --- a/src/include/nodes/nodes.h +++ b/src/include/nodes/nodes.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/nodes/nodes.h,v 1.165 2005/04/06 16:34:07 tgl Exp $ + * $PostgreSQL: pgsql/src/include/nodes/nodes.h,v 1.166 2005/04/17 22:24:02 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -304,11 +304,16 @@ typedef enum NodeTag T_FunctionParameter, /* - * TAGS FOR FUNCTION-CALL CONTEXT AND RESULTINFO NODES (see fmgr.h) + * TAGS FOR RANDOM OTHER STUFF + * + * These are objects that aren't part of parse/plan/execute node tree + * structures, but we give them NodeTags anyway for identification + * purposes (usually because they are involved in APIs where we want + * to pass multiple object types through the same pointer). */ T_TriggerData = 900, /* in commands/trigger.h */ - T_ReturnSetInfo /* in nodes/execnodes.h */ - + T_ReturnSetInfo, /* in nodes/execnodes.h */ + T_TIDBitmap /* in nodes/tidbitmap.h */ } NodeTag; /* diff --git a/src/include/nodes/tidbitmap.h b/src/include/nodes/tidbitmap.h new file mode 100644 index 0000000000..2c90c01d81 --- /dev/null +++ b/src/include/nodes/tidbitmap.h @@ -0,0 +1,55 @@ +/*------------------------------------------------------------------------- + * + * tidbitmap.h + * PostgreSQL tuple-id (TID) bitmap package + * + * This module provides bitmap data structures that are spiritually + * similar to Bitmapsets, but are specially adapted to store sets of + * tuple identifiers (TIDs), or ItemPointers. In particular, the division + * of an ItemPointer into BlockNumber and OffsetNumber is catered for. + * Also, since we wish to be able to store very large tuple sets in + * memory with this data structure, we support "lossy" storage, in which + * we no longer remember individual tuple offsets on a page but only the + * fact that a particular page needs to be visited. + * + * + * Copyright (c) 2003-2005, PostgreSQL Global Development Group + * + * $PostgreSQL: pgsql/src/include/nodes/tidbitmap.h,v 1.1 2005/04/17 22:24:02 tgl Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef TIDBITMAP_H +#define TIDBITMAP_H + +#include "storage/itemptr.h" + + +/* + * Actual bitmap representation is private to tidbitmap.c. Callers can + * do IsA(x, TIDBitmap) on it, but nothing else. + */ +typedef struct TIDBitmap TIDBitmap; + +/* Result structure for tbm_iterate */ +typedef struct +{ + BlockNumber blockno; /* page number containing tuples */ + int ntuples; /* -1 indicates lossy result */ + OffsetNumber offsets[1]; /* VARIABLE LENGTH ARRAY */ +} TBMIterateResult; /* VARIABLE LENGTH STRUCT */ + +/* function prototypes in nodes/tidbitmap.c */ + +extern TIDBitmap *tbm_create(long maxbytes); +extern void tbm_free(TIDBitmap *tbm); + +extern void tbm_add_tuples(TIDBitmap *tbm, const ItemPointer tids, int ntids); + +extern void tbm_union(TIDBitmap *a, const TIDBitmap *b); +extern void tbm_intersect(TIDBitmap *a, const TIDBitmap *b); + +extern void tbm_begin_iterate(TIDBitmap *tbm); +extern TBMIterateResult *tbm_iterate(TIDBitmap *tbm); + +#endif /* TIDBITMAP_H */ -- 2.40.0