Initial implementation of lossy-tuple-bitmap data structures.

author Tom Lane <tgl@sss.pgh.pa.us>

Sun, 17 Apr 2005 22:24:02 +0000 (22:24 +0000)

committer Tom Lane <tgl@sss.pgh.pa.us>

Sun, 17 Apr 2005 22:24:02 +0000 (22:24 +0000)
author Tom Lane <tgl@sss.pgh.pa.us>
Sun, 17 Apr 2005 22:24:02 +0000 (22:24 +0000)
committer Tom Lane <tgl@sss.pgh.pa.us>
Sun, 17 Apr 2005 22:24:02 +0000 (22:24 +0000)
diff --git a/src/backend/nodes/Makefile b/src/backend/nodes/Makefile

index 86f533e7a0e4c3c1a7d0b819f9f3b04867154023..3fdf3dfe43453c1a24de091fb39ec94ee2b04be4 100644 (file)
--- a/src/backend/nodes/Makefile
+++ b/src/backend/nodes/Makefile
@@ -4,7 +4,7 @@
  #    Makefile for backend/nodes
  #
  # IDENTIFICATION
-#    $PostgreSQL: pgsql/src/backend/nodes/Makefile,v 1.17 2004/08/02 01:30:42 tgl Exp $
+#    $PostgreSQL: pgsql/src/backend/nodes/Makefile,v 1.18 2005/04/17 22:24:02 tgl Exp $
  #
  #-------------------------------------------------------------------------
  
@@ -12,7 +12,7 @@ subdir = src/backend/nodes
  top_builddir = ../../..
  include $(top_builddir)/src/Makefile.global
  
-OBJS = nodeFuncs.o nodes.o list.o bitmapset.o \
+OBJS = nodeFuncs.o nodes.o list.o bitmapset.o tidbitmap.o \
         copyfuncs.o equalfuncs.o makefuncs.o \
         outfuncs.o readfuncs.o print.o read.o params.o value.o
  
diff --git a/src/backend/nodes/tidbitmap.c b/src/backend/nodes/tidbitmap.c

new file mode 100644 (file)

index 0000000..f250515
--- /dev/null
+++ b/src/backend/nodes/tidbitmap.c
@@ -0,0 +1,774 @@
+/*-------------------------------------------------------------------------
+ *
+ * tidbitmap.c
+ *       PostgreSQL tuple-id (TID) bitmap package
+ *
+ * This module provides bitmap data structures that are spiritually
+ * similar to Bitmapsets, but are specially adapted to store sets of
+ * tuple identifiers (TIDs), or ItemPointers.  In particular, the division
+ * of an ItemPointer into BlockNumber and OffsetNumber is catered for.
+ * Also, since we wish to be able to store very large tuple sets in
+ * memory with this data structure, we support "lossy" storage, in which
+ * we no longer remember individual tuple offsets on a page but only the
+ * fact that a particular page needs to be visited.
+ *
+ * The "lossy" storage uses one bit per disk page, so at the standard 8K
+ * BLCKSZ, we can represent all pages in 64Gb of disk space in about 1Mb
+ * of memory.  People pushing around tables of that size should have a
+ * couple of Mb to spare, so we don't worry about providing a second level
+ * of lossiness.  In theory we could fall back to page ranges at some
+ * point, but for now that seems useless complexity.
+ *
+ *
+ * Copyright (c) 2003-2005, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *       $PostgreSQL: pgsql/src/backend/nodes/tidbitmap.c,v 1.1 2005/04/17 22:24:02 tgl Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <limits.h>
+
+#include "access/htup.h"
+#include "nodes/tidbitmap.h"
+#include "utils/hsearch.h"
+
+
+/*
+ * The maximum number of tuples per page is not large (typically 256 with
+ * 8K pages, or 1024 with 32K pages).  So there's not much point in making
+ * the per-page bitmaps variable size.  We just legislate that the size
+ * is this:
+ */
+#define MAX_TUPLES_PER_PAGE  ((BLCKSZ - 1) / MAXALIGN(offsetof(HeapTupleHeaderData, t_bits) + sizeof(ItemIdData)) + 1)
+
+/*
+ * When we have to switch over to lossy storage, we use a data structure
+ * with one bit per page, where all pages having the same number DIV
+ * PAGES_PER_CHUNK are aggregated into one chunk.  When a chunk is present
+ * and has the bit set for a given page, there must not be a per-page entry
+ * for that page in the page table.
+ *
+ * We actually store both exact pages and lossy chunks in the same hash
+ * table, using identical data structures.  (This is because dynahash.c's
+ * memory management doesn't allow space to be transferred easily from one
+ * hashtable to another.)  Therefore it's best if PAGES_PER_CHUNK is the
+ * same as MAX_TUPLES_PER_PAGE, or at least not too different.  But we
+ * also want PAGES_PER_CHUNK to be a power of 2 to avoid expensive integer
+ * remainder operations.  So, define it like this:
+ */
+#define PAGES_PER_CHUNK  (BLCKSZ / 32)
+
+/* The bitmap unit size can be adjusted by changing these declarations: */
+#define BITS_PER_BITMAPWORD 32
+typedef uint32 bitmapword;             /* must be an unsigned type */
+
+#define WORDNUM(x)     ((x) / BITS_PER_BITMAPWORD)
+#define BITNUM(x)      ((x) % BITS_PER_BITMAPWORD)
+
+/* number of active words for an exact page: */
+#define WORDS_PER_PAGE  ((MAX_TUPLES_PER_PAGE - 1) / BITS_PER_BITMAPWORD + 1)
+/* number of active words for a lossy chunk: */
+#define WORDS_PER_CHUNK  ((PAGES_PER_CHUNK - 1) / BITS_PER_BITMAPWORD + 1)
+
+/*
+ * The hashtable entries are represented by this data structure.  For
+ * an exact page, blockno is the page number and bit k of the bitmap
+ * represents tuple offset k+1.  For a lossy chunk, blockno is the first
+ * page in the chunk (this must be a multiple of PAGES_PER_CHUNK) and
+ * bit k represents page blockno+k.  Note that it is not possible to
+ * have exact storage for the first page of a chunk if we are using
+ * lossy storage for any page in the chunk's range, since the same
+ * hashtable entry has to serve both purposes.
+ */
+typedef struct PagetableEntry
+{
+       BlockNumber     blockno;                /* page number (hashtable key) */
+       bool            ischunk;                /* T = lossy storage, F = exact */
+       bitmapword      words[Max(WORDS_PER_PAGE, WORDS_PER_CHUNK)];
+} PagetableEntry;
+
+/*
+ * Here is the representation for a whole TIDBitMap:
+ */
+struct TIDBitmap
+{
+       NodeTag         type;                   /* to make it a valid Node */
+       MemoryContext mcxt;                     /* memory context containing me */
+       HTAB       *pagetable;          /* hash table of PagetableEntry's */
+       int                     nentries;               /* number of entries in pagetable */
+       int                     maxentries;             /* limit on same to meet maxbytes */
+       int                     npages;                 /* number of exact entries in pagetable */
+       int                     nchunks;                /* number of lossy entries in pagetable */
+       bool            iterating;              /* tbm_begin_iterate called? */
+       /* the remaining fields are used while producing sorted output: */
+       TBMIterateResult *output;       /* NULL if not yet created */
+       PagetableEntry **spages;        /* sorted exact-page list, or NULL */
+       PagetableEntry **schunks;       /* sorted lossy-chunk list, or NULL */
+       int                     spageptr;               /* next spages index */
+       int                     schunkptr;              /* next schunks index */
+       int                     schunkbit;              /* next bit to check in current schunk */
+};
+
+
+/* Local function prototypes */
+static PagetableEntry *tbm_find_pageentry(const TIDBitmap *tbm,
+                                                                                 BlockNumber pageno);
+static PagetableEntry *tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno);
+static bool tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno);
+static void tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno);
+static void tbm_lossify(TIDBitmap *tbm);
+static int     tbm_comparator(const void *left, const void *right);
+
+
+/*
+ * tbm_create - create an initially-empty bitmap
+ *
+ * The bitmap will live in the memory context that is CurrentMemoryContext
+ * at the time of this call.  It will be limited to (approximately) maxbytes
+ * total memory consumption.
+ */
+TIDBitmap *
+tbm_create(long maxbytes)
+{
+       TIDBitmap  *tbm;
+       HASHCTL         hash_ctl;
+       long            nbuckets;
+
+       tbm = makeNode(TIDBitmap);
+       /* we rely on makeNode to have zeroed all the fields */
+       tbm->mcxt = CurrentMemoryContext;
+
+       /*
+        * Estimate number of hashtable entries we can have within maxbytes.
+        * This estimates the hash overhead at MAXALIGN(sizeof(HASHELEMENT))
+        * plus a pointer per hash entry, which is crude but good enough for
+        * our purpose.  (NOTE: this does not count the space for data
+        * structures created during iteration readout.)
+        */
+       nbuckets = maxbytes /
+               (MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(sizeof(PagetableEntry))
+                + sizeof(Pointer));
+       nbuckets = Min(nbuckets, INT_MAX-1);    /* safety limit */
+       tbm->maxentries = (int) nbuckets;
+
+       MemSet(&hash_ctl, 0, sizeof(hash_ctl));
+       hash_ctl.keysize = sizeof(BlockNumber);
+       hash_ctl.entrysize = sizeof(PagetableEntry);
+       hash_ctl.hash = tag_hash;
+       hash_ctl.hcxt = CurrentMemoryContext;
+       tbm->pagetable = hash_create("TIDBitmap",
+                                                                nbuckets,
+                                                                &hash_ctl,
+                                                                HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
+
+       return tbm;
+}
+
+/*
+ * tbm_free - free a TIDBitmap
+ */
+void
+tbm_free(TIDBitmap *tbm)
+{
+       hash_destroy(tbm->pagetable);
+       if (tbm->output)
+               pfree(tbm->output);
+       if (tbm->spages)
+               pfree(tbm->spages);
+       if (tbm->schunks)
+               pfree(tbm->schunks);
+       pfree(tbm);
+}
+
+/*
+ * tbm_add_tuples - add some tuple IDs to a TIDBitmap
+ */
+void
+tbm_add_tuples(TIDBitmap *tbm, const ItemPointer tids, int ntids)
+{
+       int                     i;
+
+       Assert(!tbm->iterating);
+       for (i = 0; i < ntids; i++)
+       {
+               BlockNumber blk = ItemPointerGetBlockNumber(tids + i);
+               OffsetNumber off = ItemPointerGetOffsetNumber(tids + i);
+               PagetableEntry *page;
+               int                     wordnum,
+                                       bitnum;
+
+               /* safety check to ensure we don't overrun bit array bounds */
+               if (off < 1 || off > MAX_TUPLES_PER_PAGE)
+                       elog(ERROR, "tuple offset out of range: %u", off);
+
+               if (tbm_page_is_lossy(tbm, blk))
+                       continue;                       /* whole page is already marked */
+
+               page = tbm_get_pageentry(tbm, blk);
+
+               if (page->ischunk)
+               {
+                       /* The page is a lossy chunk header, set bit for itself */
+                       wordnum = bitnum = 0;
+               }
+               else
+               {
+                       /* Page is exact, so set bit for individual tuple */
+                       wordnum = WORDNUM(off - 1);
+                       bitnum = BITNUM(off - 1);
+               }
+               page->words[wordnum] |= ((bitmapword) 1 << bitnum);
+
+               if (tbm->nentries > tbm->maxentries)
+                       tbm_lossify(tbm);
+       }
+}
+
+/*
+ * tbm_union - set union
+ *
+ * a is modified in-place, b is not changed
+ */
+void
+tbm_union(TIDBitmap *a, const TIDBitmap *b)
+{
+       HASH_SEQ_STATUS status;
+       PagetableEntry *apage;
+       PagetableEntry *bpage;
+       int             wordnum;
+
+       Assert(!a->iterating);
+       /* Scan through chunks and pages in b, merge into a */
+       hash_seq_init(&status, b->pagetable);
+       while ((bpage = (PagetableEntry *) hash_seq_search(&status)) != NULL)
+       {
+               if (bpage->ischunk)
+               {
+                       /* Scan b's chunk, mark each indicated page lossy in a */
+                       for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
+                       {
+                               bitmapword      w = bpage->words[wordnum];
+
+                               if (w != 0)
+                               {
+                                       BlockNumber     pg;
+
+                                       pg = bpage->blockno + (wordnum * BITS_PER_BITMAPWORD);
+                                       while (w != 0)
+                                       {
+                                               if (w & 1)
+                                                       tbm_mark_page_lossy(a, pg);
+                                               pg++;
+                                               w >>= 1;
+                                       }
+                               }
+                       }
+               }
+               else if (tbm_page_is_lossy(a, bpage->blockno))
+               {
+                       /* page is already lossy in a, nothing to do */
+                       continue;
+               }
+               else
+               {
+                       apage = tbm_get_pageentry(a, bpage->blockno);
+                       if (apage->ischunk)
+                       {
+                               /* The page is a lossy chunk header, set bit for itself */
+                               apage->words[0] |= ((bitmapword) 1 << 0);
+                       }
+                       else
+                       {
+                               /* Both pages are exact, merge at the bit level */
+                               for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
+                                       apage->words[wordnum] |= bpage->words[wordnum];
+                       }
+               }
+
+               if (a->nentries > a->maxentries)
+                       tbm_lossify(a);
+       }
+}
+
+/*
+ * tbm_intersect - set intersection
+ *
+ * a is modified in-place, b is not changed
+ */
+void
+tbm_intersect(TIDBitmap *a, const TIDBitmap *b)
+{
+       HASH_SEQ_STATUS status;
+       PagetableEntry *apage;
+       PagetableEntry *bpage;
+       int             wordnum;
+
+       Assert(!a->iterating);
+       /* Scan through chunks and pages in a, try to match to b */
+       hash_seq_init(&status, a->pagetable);
+       while ((apage = (PagetableEntry *) hash_seq_search(&status)) != NULL)
+       {
+               if (apage->ischunk)
+               {
+                       /* Scan each bit in chunk, try to clear */
+                       bool    candelete = true;
+
+                       for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
+                       {
+                               bitmapword      w = apage->words[wordnum];
+
+                               if (w != 0)
+                               {
+                                       bitmapword      neww = w;
+                                       BlockNumber     pg;
+                                       int             bitnum;
+
+                                       pg = apage->blockno + (wordnum * BITS_PER_BITMAPWORD);
+                                       bitnum = 0;
+                                       while (w != 0)
+                                       {
+                                               if (w & 1)
+                                               {
+                                                       if (!tbm_page_is_lossy(b, pg) &&
+                                                               tbm_find_pageentry(b, pg) == NULL)
+                                                       {
+                                                               /* Page is not in b at all, lose lossy bit */
+                                                               neww &= ~((bitmapword) 1 << bitnum);
+                                                       }
+                                               }
+                                               pg++;
+                                               bitnum++;
+                                               w >>= 1;
+                                       }
+                                       apage->words[wordnum] = neww;
+                                       if (neww != 0)
+                                               candelete = false;
+                               }
+                       }
+                       if (candelete)
+                       {
+                               /* Chunk is now empty, remove it from a */
+                               if (hash_search(a->pagetable,
+                                                               (void *) &apage->blockno,
+                                                               HASH_REMOVE, NULL) == NULL)
+                                       elog(ERROR, "hash table corrupted");
+                               a->nentries--;
+                               a->nchunks--;
+                       }
+               }
+               else if (tbm_page_is_lossy(b, apage->blockno))
+               {
+                       /* page is lossy in b, cannot clear any bits */
+                       continue;
+               }
+               else
+               {
+                       bool    candelete = true;
+
+                       bpage = tbm_find_pageentry(b, apage->blockno);
+                       if (bpage != NULL)
+                       {
+                               /* Both pages are exact, merge at the bit level */
+                               Assert(!bpage->ischunk);
+                               for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
+                               {
+                                       apage->words[wordnum] &= bpage->words[wordnum];
+                                       if (apage->words[wordnum] != 0)
+                                               candelete = false;
+                               }
+                       }
+                       if (candelete)
+                       {
+                               /* Page is now empty, remove it from a */
+                               if (hash_search(a->pagetable,
+                                                               (void *) &apage->blockno,
+                                                               HASH_REMOVE, NULL) == NULL)
+                                       elog(ERROR, "hash table corrupted");
+                               a->nentries--;
+                               a->npages--;
+                       }
+               }
+       }
+}
+
+/*
+ * tbm_begin_iterate - prepare to iterate through a TIDBitmap
+ *
+ * NB: after this is called, it is no longer allowed to modify the contents
+ * of the bitmap.  However, you can call this multiple times to scan the
+ * contents repeatedly.
+ */
+void
+tbm_begin_iterate(TIDBitmap *tbm)
+{
+       HASH_SEQ_STATUS status;
+       PagetableEntry *page;
+       int                     npages;
+       int                     nchunks;
+
+       tbm->iterating = true;
+       /*
+        * Allocate the output data structure if we didn't already.
+        * (We don't do this during tbm_create since it's entirely possible
+        * that a TIDBitmap will live and die without ever being iterated.)
+        */
+       if (!tbm->output)
+               tbm->output = (TBMIterateResult *)
+                       MemoryContextAllocZero(tbm->mcxt,
+                                                                  sizeof(TBMIterateResult) +
+                                                                  MAX_TUPLES_PER_PAGE * sizeof(OffsetNumber));
+       /*
+        * Create and fill the sorted page lists if we didn't already.
+        */
+       if (!tbm->spages && tbm->npages > 0)
+               tbm->spages = (PagetableEntry **)
+                       MemoryContextAlloc(tbm->mcxt,
+                                                          tbm->npages * sizeof(PagetableEntry *));
+       if (!tbm->schunks && tbm->nchunks > 0)
+               tbm->schunks = (PagetableEntry **)
+                       MemoryContextAlloc(tbm->mcxt,
+                                                          tbm->nchunks * sizeof(PagetableEntry *));
+
+       hash_seq_init(&status, tbm->pagetable);
+       npages = nchunks = 0;
+       while ((page = (PagetableEntry *) hash_seq_search(&status)) != NULL)
+       {
+               if (page->ischunk)
+                       tbm->schunks[nchunks++] = page;
+               else
+                       tbm->spages[npages++] = page;
+       }
+       Assert(npages == tbm->npages);
+       Assert(nchunks == tbm->nchunks);
+       if (npages > 1)
+               qsort(tbm->spages, npages, sizeof(PagetableEntry *), tbm_comparator);
+       if (nchunks > 1)
+               qsort(tbm->schunks, nchunks, sizeof(PagetableEntry *), tbm_comparator);
+       /*
+        * Reset iteration pointers.
+        */
+       tbm->spageptr = 0;
+       tbm->schunkptr = 0;
+       tbm->schunkbit = 0;
+}
+
+/*
+ * tbm_iterate - scan through next page of a TIDBitmap
+ *
+ * Returns a TBMIterateResult representing one page, or NULL if there are
+ * no more pages to scan.  Pages are guaranteed to be delivered in numerical
+ * order.  If result->ntuples < 0, then the bitmap is "lossy" and failed to
+ * remember the exact tuples to look at on this page --- the caller must
+ * examine all tuples on the page and check if they meet the intended
+ * condition.
+ */
+TBMIterateResult *
+tbm_iterate(TIDBitmap *tbm)
+{
+       TBMIterateResult *output = tbm->output;
+
+       Assert(tbm->iterating);
+       /*
+        * If lossy chunk pages remain, make sure we've advanced schunkptr/
+        * schunkbit to the next set bit.
+        */
+       while (tbm->schunkptr < tbm->nchunks)
+       {
+               PagetableEntry *chunk = tbm->schunks[tbm->schunkptr];
+               int             schunkbit = tbm->schunkbit;
+
+               while (schunkbit < PAGES_PER_CHUNK)
+               {
+                       int             wordnum = WORDNUM(schunkbit);
+                       int             bitnum = BITNUM(schunkbit);
+
+                       if ((chunk->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0)
+                               break;
+                       schunkbit++;
+               }
+               if (schunkbit < PAGES_PER_CHUNK)
+               {
+                       tbm->schunkbit = schunkbit;
+                       break;
+               }
+               /* advance to next chunk */
+               tbm->schunkptr++;
+               tbm->schunkbit = 0;
+       }
+       /*
+        * If both chunk and per-page data remain, must output the numerically
+        * earlier page.
+        */
+       if (tbm->schunkptr < tbm->nchunks)
+       {
+               PagetableEntry *chunk = tbm->schunks[tbm->schunkptr];
+               BlockNumber chunk_blockno;
+
+               chunk_blockno = chunk->blockno + tbm->schunkbit;
+               if (tbm->spageptr >= tbm->npages ||
+                       chunk_blockno < tbm->spages[tbm->spageptr]->blockno)
+               {
+                       /* Return a lossy page indicator from the chunk */
+                       output->blockno = chunk_blockno;
+                       output->ntuples = -1;
+                       tbm->schunkbit++;
+                       return output;
+               }
+       }
+
+       if (tbm->spageptr < tbm->npages)
+       {
+               PagetableEntry *page = tbm->spages[tbm->spageptr];
+               int                     ntuples;
+               int                     wordnum;
+
+               /* scan bitmap to extract individual offset numbers */
+               ntuples = 0;
+               for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
+               {
+                       bitmapword      w = page->words[wordnum];
+
+                       if (w != 0)
+                       {
+                               int                     off = wordnum * BITS_PER_BITMAPWORD + 1;
+
+                               while (w != 0)
+                               {
+                                       if (w & 1)
+                                               output->offsets[ntuples++] = (OffsetNumber) off;
+                                       off++;
+                                       w >>= 1;
+                               }
+                       }
+               }
+               output->blockno = page->blockno;
+               output->ntuples = ntuples;
+               tbm->spageptr++;
+               return output;
+       }
+
+       /* Nothing more in the bitmap */
+       return NULL;
+}
+
+/*
+ * tbm_find_pageentry - find a PagetableEntry for the pageno
+ *
+ * Returns NULL if there is no non-lossy entry for the pageno.
+ */
+static PagetableEntry *
+tbm_find_pageentry(const TIDBitmap *tbm, BlockNumber pageno)
+{
+       PagetableEntry *page;
+
+       page = (PagetableEntry *) hash_search(tbm->pagetable,
+                                                                                 (void *) &pageno,
+                                                                                 HASH_FIND, NULL);
+       if (page == NULL)
+               return NULL;
+       if (page->ischunk)
+               return NULL;                    /* don't want a lossy chunk header */
+       return page;
+}
+
+/*
+ * tbm_get_pageentry - find or create a PagetableEntry for the pageno
+ *
+ * If new, the entry is marked as an exact (non-chunk) entry.
+ *
+ * This may cause the table to exceed the desired memory size.  It is
+ * up to the caller to call tbm_lossify() at the next safe point if so.
+ */
+static PagetableEntry *
+tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno)
+{
+       PagetableEntry *page;
+       bool            found;
+
+       /* Look up or create an entry */
+       page = (PagetableEntry *) hash_search(tbm->pagetable,
+                                                                                 (void *) &pageno,
+                                                                                 HASH_ENTER, &found);
+       if (page == NULL)
+               ereport(ERROR,
+                               (errcode(ERRCODE_OUT_OF_MEMORY),
+                                errmsg("out of memory")));
+
+       /* Initialize it if not present before */
+       if (!found)
+       {
+               MemSet(page, 0, sizeof(PagetableEntry));
+               page->blockno = pageno;
+               /* must count it too */
+               tbm->nentries++;
+               tbm->npages++;
+       }
+
+       return page;
+}
+
+/*
+ * tbm_page_is_lossy - is the page marked as lossily stored?
+ */
+static bool
+tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno)
+{
+       PagetableEntry *page;
+       BlockNumber chunk_pageno;
+       int                     bitno;
+
+       /* we can skip the lookup if there are no lossy chunks */
+       if (tbm->nchunks == 0)
+               return false;
+
+       bitno = pageno % PAGES_PER_CHUNK;
+       chunk_pageno = pageno - bitno;
+       page = (PagetableEntry *) hash_search(tbm->pagetable,
+                                                                                 (void *) &chunk_pageno,
+                                                                                 HASH_FIND, NULL);
+       if (page != NULL && page->ischunk)
+       {
+               int             wordnum = WORDNUM(bitno);
+               int             bitnum = BITNUM(bitno);
+
+               if ((page->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0)
+                       return true;
+       }
+       return false;
+}
+
+/*
+ * tbm_mark_page_lossy - mark the page number as lossily stored
+ *
+ * This may cause the table to exceed the desired memory size.  It is
+ * up to the caller to call tbm_lossify() at the next safe point if so.
+ */
+static void
+tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno)
+{
+       PagetableEntry *page;
+       bool            found;
+       BlockNumber chunk_pageno;
+       int                     bitno;
+       int                     wordnum;
+       int                     bitnum;
+
+       bitno = pageno % PAGES_PER_CHUNK;
+       chunk_pageno = pageno - bitno;
+
+       /*
+        * Remove any extant non-lossy entry for the page.  If the page is
+        * its own chunk header, however, we skip this and handle the case
+        * below.
+        */
+       if (bitno != 0)
+       {
+               if (hash_search(tbm->pagetable,
+                                               (void *) &pageno,
+                                               HASH_REMOVE, NULL) != NULL)
+               {
+                       /* It was present, so adjust counts */
+                       tbm->nentries--;
+                       tbm->npages--;          /* assume it must have been non-lossy */
+               }
+       }
+
+       /* Look up or create entry for chunk-header page */
+       page = (PagetableEntry *) hash_search(tbm->pagetable,
+                                                                                 (void *) &chunk_pageno,
+                                                                                 HASH_ENTER, &found);
+       if (page == NULL)
+               ereport(ERROR,
+                               (errcode(ERRCODE_OUT_OF_MEMORY),
+                                errmsg("out of memory")));
+
+       /* Initialize it if not present before */
+       if (!found)
+       {
+               MemSet(page, 0, sizeof(PagetableEntry));
+               page->blockno = chunk_pageno;
+               page->ischunk = true;
+               /* must count it too */
+               tbm->nentries++;
+               tbm->nchunks++;
+       }
+       else if (!page->ischunk)
+       {
+               /* chunk header page was formerly non-lossy, make it lossy */
+               MemSet(page, 0, sizeof(PagetableEntry));
+               page->blockno = chunk_pageno;
+               page->ischunk = true;
+               /* we assume it had some tuple bit(s) set, so mark it lossy */
+               page->words[0] = ((bitmapword) 1 << 0);
+               /* adjust counts */
+               tbm->nchunks++;
+               tbm->npages--;
+       }
+
+       /* Now set the original target page's bit */
+       wordnum = WORDNUM(bitno);
+       bitnum = BITNUM(bitno);
+       page->words[wordnum] |= ((bitmapword) 1 << bitnum);
+}
+
+/*
+ * tbm_lossify - lose some information to get back under the memory limit
+ */
+static void
+tbm_lossify(TIDBitmap *tbm)
+{
+       HASH_SEQ_STATUS status;
+       PagetableEntry *page;
+
+       /*
+        * XXX Really stupid implementation: this just lossifies pages in
+        * essentially random order.  We should be paying some attention
+        * to the number of bits set in each page, instead.  Also it might
+        * be a good idea to lossify more than the minimum number of pages
+        * during each call.
+        */
+       Assert(!tbm->iterating);
+       hash_seq_init(&status, tbm->pagetable);
+       while ((page = (PagetableEntry *) hash_seq_search(&status)) != NULL)
+       {
+               if (page->ischunk)
+                       continue;                       /* already a chunk header */
+               /*
+                * If the page would become a chunk header, we won't save anything
+                * by converting it to lossy, so skip it.
+                */
+               if ((page->blockno % PAGES_PER_CHUNK) == 0)
+                       continue;
+
+               /* This does the dirty work ... */
+               tbm_mark_page_lossy(tbm, page->blockno);
+
+               if (tbm->nentries <= tbm->maxentries)
+                       return;                         /* we have done enough */
+
+               /*
+                * Note: tbm_mark_page_lossy may have inserted a lossy chunk into
+                * the hashtable.  We can continue the same seq_search scan since
+                * we do not care whether we visit lossy chunks or not.
+                */
+       }
+}
+
+/*
+ * qsort comparator to handle PagetableEntry pointers.
+ */
+static int
+tbm_comparator(const void *left, const void *right)
+{
+       BlockNumber l = (*((const PagetableEntry **) left))->blockno;
+       BlockNumber r = (*((const PagetableEntry **) right))->blockno;
+
+       if (l < r)
+               return -1;
+       else if (l > r)
+               return 1;
+       return 0;
+}
diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h

index 4822b5f6cc43bf9a99f86ee26935d6a878b82128..3c5528546af97a43642be3fad191b46ab074d72b 100644 (file)
--- a/src/include/nodes/nodes.h
+++ b/src/include/nodes/nodes.h
@@ -7,7 +7,7 @@
   * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
- * $PostgreSQL: pgsql/src/include/nodes/nodes.h,v 1.165 2005/04/06 16:34:07 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/nodes/nodes.h,v 1.166 2005/04/17 22:24:02 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -304,11 +304,16 @@ typedef enum NodeTag
         T_FunctionParameter,
  
         /*
-        * TAGS FOR FUNCTION-CALL CONTEXT AND RESULTINFO NODES (see fmgr.h)
+        * TAGS FOR RANDOM OTHER STUFF
+        *
+        * These are objects that aren't part of parse/plan/execute node tree
+        * structures, but we give them NodeTags anyway for identification
+        * purposes (usually because they are involved in APIs where we want
+        * to pass multiple object types through the same pointer).
          */
         T_TriggerData = 900,            /* in commands/trigger.h */
-       T_ReturnSetInfo                         /* in nodes/execnodes.h */
-
+       T_ReturnSetInfo,                        /* in nodes/execnodes.h */
+       T_TIDBitmap                                     /* in nodes/tidbitmap.h */
  } NodeTag;
  
  /*
diff --git a/src/include/nodes/tidbitmap.h b/src/include/nodes/tidbitmap.h

new file mode 100644 (file)

index 0000000..2c90c01
--- /dev/null
+++ b/src/include/nodes/tidbitmap.h
@@ -0,0 +1,55 @@
+/*-------------------------------------------------------------------------
+ *
+ * tidbitmap.h
+ *       PostgreSQL tuple-id (TID) bitmap package
+ *
+ * This module provides bitmap data structures that are spiritually
+ * similar to Bitmapsets, but are specially adapted to store sets of
+ * tuple identifiers (TIDs), or ItemPointers.  In particular, the division
+ * of an ItemPointer into BlockNumber and OffsetNumber is catered for.
+ * Also, since we wish to be able to store very large tuple sets in
+ * memory with this data structure, we support "lossy" storage, in which
+ * we no longer remember individual tuple offsets on a page but only the
+ * fact that a particular page needs to be visited.
+ *
+ *
+ * Copyright (c) 2003-2005, PostgreSQL Global Development Group
+ *
+ * $PostgreSQL: pgsql/src/include/nodes/tidbitmap.h,v 1.1 2005/04/17 22:24:02 tgl Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef TIDBITMAP_H
+#define TIDBITMAP_H
+
+#include "storage/itemptr.h"
+
+
+/*
+ * Actual bitmap representation is private to tidbitmap.c.  Callers can
+ * do IsA(x, TIDBitmap) on it, but nothing else.
+ */
+typedef struct TIDBitmap TIDBitmap;
+
+/* Result structure for tbm_iterate */
+typedef struct
+{
+       BlockNumber     blockno;                /* page number containing tuples */
+       int                     ntuples;                /* -1 indicates lossy result */
+       OffsetNumber offsets[1];        /* VARIABLE LENGTH ARRAY */
+} TBMIterateResult;                            /* VARIABLE LENGTH STRUCT */
+
+/* function prototypes in nodes/tidbitmap.c */
+
+extern TIDBitmap *tbm_create(long maxbytes);
+extern void tbm_free(TIDBitmap *tbm);
+
+extern void tbm_add_tuples(TIDBitmap *tbm, const ItemPointer tids, int ntids);
+
+extern void tbm_union(TIDBitmap *a, const TIDBitmap *b);
+extern void tbm_intersect(TIDBitmap *a, const TIDBitmap *b);
+
+extern void tbm_begin_iterate(TIDBitmap *tbm);
+extern TBMIterateResult *tbm_iterate(TIDBitmap *tbm);
+
+#endif   /* TIDBITMAP_H */
author	Tom Lane <tgl@sss.pgh.pa.us>
	Sun, 17 Apr 2005 22:24:02 +0000 (22:24 +0000)
committer	Tom Lane <tgl@sss.pgh.pa.us>
	Sun, 17 Apr 2005 22:24:02 +0000 (22:24 +0000)
src/backend/nodes/Makefile		patch \| blob \| history
src/backend/nodes/tidbitmap.c	[new file with mode: 0644]	patch \| blob
src/include/nodes/nodes.h		patch \| blob \| history
src/include/nodes/tidbitmap.h	[new file with mode: 0644]	patch \| blob