*
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/access/hash/hashovfl.c,v 1.41 2003/09/04 22:06:27 tgl Exp $
+ * $Header: /cvsroot/pgsql/src/backend/access/hash/hashovfl.c,v 1.41.2.1 2006/11/19 21:33:46 tgl Exp $
*
* NOTES
* Overflow pages look like ordinary relation pages.
#include "access/hash.h"
-static BlockNumber _hash_getovflpage(Relation rel, Buffer metabuf);
+static Buffer _hash_getovflpage(Relation rel, Buffer metabuf);
static uint32 _hash_firstfreebit(uint32 map);
Buffer
_hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf)
{
- BlockNumber ovflblkno;
Buffer ovflbuf;
Page page;
Page ovflpage;
HashPageOpaque pageopaque;
HashPageOpaque ovflopaque;
- /* allocate an empty overflow page */
- ovflblkno = _hash_getovflpage(rel, metabuf);
-
- /* lock the overflow page */
- ovflbuf = _hash_getbuf(rel, ovflblkno, HASH_WRITE);
+ /* allocate and lock an empty overflow page */
+ ovflbuf = _hash_getovflpage(rel, metabuf);
ovflpage = BufferGetPage(ovflbuf);
/*
_hash_wrtnorelbuf(rel, ovflbuf);
/* logically chain overflow page to previous page */
- pageopaque->hasho_nextblkno = ovflblkno;
+ pageopaque->hasho_nextblkno = BufferGetBlockNumber(ovflbuf);
_hash_wrtbuf(rel, buf);
return ovflbuf;
/*
* _hash_getovflpage()
*
- * Find an available overflow page and return its block number.
+ * Find an available overflow page and return it. The returned buffer
+ * is pinned and write-locked, but its contents are not initialized.
*
* The caller must hold a pin, but no lock, on the metapage buffer.
- * The buffer is returned in the same state.
+ * That buffer is left in the same state at exit.
*/
-static BlockNumber
+static Buffer
_hash_getovflpage(Relation rel, Buffer metabuf)
{
HashMetaPage metap;
Buffer mapbuf = 0;
+ Buffer newbuf;
BlockNumber blkno;
uint32 orig_firstfree;
uint32 splitnum;
_hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);
}
- /* No Free Page Found - have to allocate a new page */
- bit = metap->hashm_spares[splitnum];
- metap->hashm_spares[splitnum]++;
-
- /* Check if we need to allocate a new bitmap page */
+ /*
+ * No free pages --- have to extend the relation to add an overflow page.
+ * First, check to see if we have to add a new bitmap page too.
+ */
if (last_bit == (uint32) (BMPGSZ_BIT(metap) - 1))
{
/*
* correctly marked "in use". Subsequent pages do not exist yet,
* but it is convenient to pre-mark them as "in use" too.
*/
- _hash_initbitmap(rel, metap, bitno_to_blkno(metap, bit));
-
bit = metap->hashm_spares[splitnum];
+ _hash_initbitmap(rel, metap, bitno_to_blkno(metap, bit));
metap->hashm_spares[splitnum]++;
}
else
{
/*
- * Nothing to do here; since the page was past the last used page,
+ * Nothing to do here; since the page will be past the last used page,
* we know its bitmap bit was preinitialized to "in use".
*/
}
/* Calculate address of the new overflow page */
+ bit = metap->hashm_spares[splitnum];
blkno = bitno_to_blkno(metap, bit);
+ /*
+ * We have to fetch the page with P_NEW to ensure smgr's idea of the
+ * relation length stays in sync with ours. XXX It's annoying to do this
+ * with metapage write lock held; would be better to use a lock that
+ * doesn't block incoming searches. Best way to fix it would be to stop
+ * maintaining hashm_spares[hashm_ovflpoint] and rely entirely on the
+ * smgr relation length to track where new overflow pages come from;
+ * then we could release the metapage before we do the smgrextend.
+ * FIXME later (not in beta...)
+ */
+ newbuf = _hash_getbuf(rel, P_NEW, HASH_WRITE);
+ if (BufferGetBlockNumber(newbuf) != blkno)
+ elog(ERROR, "unexpected hash relation size: %u, should be %u",
+ BufferGetBlockNumber(newbuf), blkno);
+
+ metap->hashm_spares[splitnum]++;
+
/*
* Adjust hashm_firstfree to avoid redundant searches. But don't
* risk changing it if someone moved it while we were searching
/* Write updated metapage and release lock, but not pin */
_hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK);
- return blkno;
+ return newbuf;
found:
/* convert bit to bit number within page */
/* convert bit to absolute bit number */
bit += (i << BMPG_SHIFT(metap));
- /* Calculate address of the new overflow page */
+ /* Calculate address of the recycled overflow page */
blkno = bitno_to_blkno(metap, bit);
/*
_hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);
}
- return blkno;
+ /* Fetch and return the recycled page */
+ return _hash_getbuf(rel, blkno, HASH_WRITE);
}
/*
prevblkno = ovflopaque->hasho_prevblkno;
bucket = ovflopaque->hasho_bucket;
- /* Zero the page for debugging's sake; then write and release it */
+ /*
+ * Zero the page for debugging's sake; then write and release it.
+ * (Note: if we failed to zero the page here, we'd have problems
+ * with the Assert in _hash_pageinit() when the page is reused.)
+ */
MemSet(ovflpage, 0, BufferGetPageSize(ovflbuf));
_hash_wrtbuf(rel, ovflbuf);
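For context, the Assert mentioned in the comment above is the one in _hash_pageinit() (hashpage.c), which reads roughly as follows (a sketch; the exact assertion wording may differ on this branch):

void
_hash_pageinit(Page page, Size size)
{
    /* a page may only be (re)initialized while it is still all zeroes */
    Assert(PageIsNew(page));
    PageInit(page, size, sizeof(HashPageOpaqueData));
}

Zeroing the freed overflow page here guarantees that PageIsNew() (i.e. pd_upper == 0) holds when the page is handed out and reinitialized later.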
/*
* It is okay to write-lock the new bitmap page while holding metapage
* write lock, because no one else could be contending for the new page.
+ * Also, the metapage lock makes it safe to extend the index using P_NEW,
+ * which we want to do to ensure the smgr's idea of the relation size
+ * stays in step with ours.
*
* There is some loss of concurrency in possibly doing I/O for the new
* page while holding the metapage lock, but this path is taken so
* seldom that it's not worth worrying about.
*/
- buf = _hash_getbuf(rel, blkno, HASH_WRITE);
+ buf = _hash_getbuf(rel, P_NEW, HASH_WRITE);
+ if (BufferGetBlockNumber(buf) != blkno)
+ elog(ERROR, "unexpected hash relation size: %u, should be %u",
+ BufferGetBlockNumber(buf), blkno);
+
pg = BufferGetPage(buf);
/* initialize the page */
*
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/access/hash/hashpage.c,v 1.42 2003/09/04 22:06:27 tgl Exp $
+ * $Header: /cvsroot/pgsql/src/backend/access/hash/hashpage.c,v 1.42.2.1 2006/11/19 21:33:46 tgl Exp $
*
* NOTES
* Postgres hash pages look like ordinary relation pages. The opaque
#include "access/genam.h"
#include "access/hash.h"
#include "storage/lmgr.h"
+#include "storage/smgr.h"
#include "utils/lsyscache.h"
+static BlockNumber _hash_alloc_buckets(Relation rel, uint32 nblocks);
static void _hash_splitbucket(Relation rel, Buffer metabuf,
Bucket obucket, Bucket nbucket,
BlockNumber start_oblkno,
* requested buffer and its reference count has been incremented
* (ie, the buffer is "locked and pinned").
*
- * XXX P_NEW is not used because, unlike the tree structures, we
- * need the bucket blocks to be at certain block numbers. we must
- * depend on the caller to call _hash_pageinit on the block if it
- * knows that this is a new block.
+ * blkno == P_NEW is allowed, but it is caller's responsibility to
+ * ensure that only one process can extend the index at a time.
*/
Buffer
_hash_getbuf(Relation rel, BlockNumber blkno, int access)
{
Buffer buf;
- if (blkno == P_NEW)
- elog(ERROR, "hash AM does not use P_NEW");
-
buf = ReadBuffer(rel, blkno);
if (access != HASH_NOLOCK)
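The P_NEW contract stated above is what the callers patched in this change rely on: the caller holds a lock that serializes extension (the metapage write lock in these paths) and cross-checks the block number it gets back. A minimal sketch of that pattern, using a hypothetical helper name:

/* Illustrative sketch only; the helper name is hypothetical. */
static Buffer
hash_extend_one_page(Relation rel, BlockNumber expected_blkno)
{
    Buffer      newbuf;

    /* the caller must already hold a lock that serializes index extension */
    newbuf = _hash_getbuf(rel, P_NEW, HASH_WRITE);
    if (BufferGetBlockNumber(newbuf) != expected_blkno)
        elog(ERROR, "unexpected hash relation size: %u, should be %u",
             BufferGetBlockNumber(newbuf), expected_blkno);
    return newbuf;
}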
if (ffactor < 10)
ffactor = 10;
- metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE);
+ /*
+ * We initialize the metapage, the first two bucket pages, and the
+ * first bitmap page in sequence, using P_NEW to cause smgrextend()
+ * calls to occur. This ensures that the smgr level has the right
+ * idea of the physical index length.
+ */
+ metabuf = _hash_getbuf(rel, P_NEW, HASH_WRITE);
+ Assert(BufferGetBlockNumber(metabuf) == HASH_METAPAGE);
pg = BufferGetPage(metabuf);
_hash_pageinit(pg, BufferGetPageSize(metabuf));
*/
for (i = 0; i <= 1; i++)
{
- buf = _hash_getbuf(rel, BUCKET_TO_BLKNO(metap, i), HASH_WRITE);
+ buf = _hash_getbuf(rel, P_NEW, HASH_WRITE);
+ Assert(BufferGetBlockNumber(buf) == BUCKET_TO_BLKNO(metap, i));
pg = BufferGetPage(buf);
_hash_pageinit(pg, BufferGetPageSize(buf));
pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg);
}
/*
- * Initialize first bitmap page. Can't do this until we
- * create the first two buckets, else smgr will complain.
+ * Initialize first bitmap page
*/
_hash_initbitmap(rel, metap, 3);
Bucket old_bucket;
Bucket new_bucket;
uint32 spare_ndx;
+ BlockNumber firstblock = InvalidBlockNumber;
BlockNumber start_oblkno;
BlockNumber start_nblkno;
uint32 maxbucket;
(double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1))
goto fail;
+ /*
+ * Can't split anymore if maxbucket has reached its maximum possible value.
+ *
+ * Ideally we'd allow bucket numbers up to UINT_MAX-1 (no higher because
+ * the calculation maxbucket+1 mustn't overflow). Currently we restrict
+ * to half that because of overflow looping in _hash_log2() and
+ * insufficient space in hashm_spares[]. It's moot anyway because an
+ * index with 2^32 buckets would certainly overflow BlockNumber and
+ * hence _hash_alloc_buckets() would fail, but if we supported buckets
+ * smaller than a disk block then this would be an independent constraint.
+ */
+ if (metap->hashm_maxbucket >= (uint32) 0x7FFFFFFE)
+ goto fail;
+
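The "overflow looping in _hash_log2()" mentioned above refers to the function in hashutil.c, which is essentially the following (shown for reference; unchanged by this patch):

uint32
_hash_log2(uint32 num)
{
    uint32      i,
                limit;

    limit = 1;
    for (i = 0; limit < num; limit <<= 1, i++)
        ;
    return i;
}

For num greater than 2^31, limit wraps around to zero and the loop never terminates. Capping hashm_maxbucket at 0x7FFFFFFE keeps the largest argument ever passed, new_bucket + 1, at or below 2^31, which still terminates.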
+ /*
+ * If the split point is increasing (hashm_maxbucket's log base 2
+ * increases), we need to allocate a new batch of bucket pages.
+ */
+ new_bucket = metap->hashm_maxbucket + 1;
+ spare_ndx = _hash_log2(new_bucket + 1);
+ if (spare_ndx > metap->hashm_ovflpoint)
+ {
+ Assert(spare_ndx == metap->hashm_ovflpoint + 1);
+ /*
+ * The number of buckets in the new splitpoint is equal to the
+ * total number already in existence, i.e. new_bucket. Currently
+ * this maps one-to-one to blocks required, but someday we may need
+ * a more complicated calculation here.
+ */
+ firstblock = _hash_alloc_buckets(rel, new_bucket);
+ if (firstblock == InvalidBlockNumber)
+ goto fail; /* can't split due to BlockNumber overflow */
+ }
+
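As a concrete instance of the arithmetic above: with buckets 0..3 in existence, hashm_maxbucket is 3 and hashm_ovflpoint is 2. Then new_bucket = 4 and spare_ndx = _hash_log2(5) = 3, which exceeds hashm_ovflpoint, so a new splitpoint begins and _hash_alloc_buckets(rel, 4) must extend the index by four bucket pages, enough for buckets 4..7.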
/*
* Determine which bucket is to be split, and attempt to lock the old
* bucket. If we can't get the lock, give up.
* then lock. This should be okay because no one else should be trying
* to lock the new bucket yet...
*/
- new_bucket = metap->hashm_maxbucket + 1;
old_bucket = (new_bucket & metap->hashm_lowmask);
start_oblkno = BUCKET_TO_BLKNO(metap, old_bucket);
* increases), we need to adjust the hashm_spares[] array and
* hashm_ovflpoint so that future overflow pages will be created beyond
* this new batch of bucket pages.
- *
- * XXX should initialize new bucket pages to prevent out-of-order
- * page creation? Don't wanna do it right here though.
*/
- spare_ndx = _hash_log2(metap->hashm_maxbucket + 1);
if (spare_ndx > metap->hashm_ovflpoint)
{
- Assert(spare_ndx == metap->hashm_ovflpoint + 1);
metap->hashm_spares[spare_ndx] = metap->hashm_spares[metap->hashm_ovflpoint];
metap->hashm_ovflpoint = spare_ndx;
}
/* now we can compute the new bucket's primary block number */
start_nblkno = BUCKET_TO_BLKNO(metap, new_bucket);
+ /* if we added a splitpoint, should match result of _hash_alloc_buckets */
+ if (firstblock != InvalidBlockNumber &&
+ firstblock != start_nblkno)
+ elog(PANIC, "unexpected hash relation size: %u, should be %u",
+ firstblock, start_nblkno);
+
Assert(!_hash_has_active_scan(rel, new_bucket));
if (!_hash_try_getlock(rel, start_nblkno, HASH_EXCLUSIVE))
}
+/*
+ * _hash_alloc_buckets -- allocate a new splitpoint's worth of bucket pages
+ *
+ * This does not need to initialize the new bucket pages; we'll do that as
+ * each one is used by _hash_expandtable(). But we have to extend the logical
+ * EOF to the end of the splitpoint; otherwise the first overflow page
+ * allocated beyond the splitpoint will represent a noncontiguous access,
+ * which can confuse md.c (and will probably be forbidden by future changes
+ * to md.c).
+ *
+ * We do this by writing a page of zeroes at the end of the splitpoint range.
+ * We expect that the filesystem will ensure that the intervening pages read
+ * as zeroes too. On many filesystems this "hole" will not be allocated
+ * immediately, which means that the index file may end up more fragmented
+ * than if we forced it all to be allocated now; but since we don't scan
+ * hash indexes sequentially anyway, that probably doesn't matter.
+ *
+ * XXX It's annoying that this code is executed with the metapage lock held.
+ * We need to interlock against _hash_getovflpage() adding a new overflow page
+ * concurrently, but it'd likely be better to use LockRelationForExtension
+ * for the purpose. OTOH, adding a splitpoint is a very infrequent operation,
+ * so it may not be worth worrying about.
+ *
+ * Returns the first block number in the new splitpoint's range, or
+ * InvalidBlockNumber if allocation failed due to BlockNumber overflow.
+ */
+static BlockNumber
+_hash_alloc_buckets(Relation rel, uint32 nblocks)
+{
+ BlockNumber firstblock;
+ BlockNumber lastblock;
+ BlockNumber endblock;
+ char zerobuf[BLCKSZ];
+
+ /*
+ * Since we hold metapage lock, no one else is either splitting or
+ * allocating a new page in _hash_getovflpage(); hence it's safe to
+ * assume that the relation length isn't changing under us.
+ */
+ firstblock = RelationGetNumberOfBlocks(rel);
+ lastblock = firstblock + nblocks - 1;
+
+ /*
+ * Check for overflow in block number calculation; if so, we cannot
+ * extend the index anymore.
+ */
+ if (lastblock < firstblock || lastblock == InvalidBlockNumber)
+ return InvalidBlockNumber;
+
+ /* Note: we assume RelationGetNumberOfBlocks did RelationOpenSmgr for us */
+
+ MemSet(zerobuf, 0, sizeof(zerobuf));
+
+ /*
+ * XXX If the extension results in creation of new segment files,
+ * we have to make sure that each non-last file is correctly filled out to
+ * RELSEG_SIZE blocks. This ought to be done inside mdextend, but
+ * changing the smgr API seems best left for a development cycle, not
+ * late beta. Temporary fix for bug #2737.
+ */
+#ifndef LET_OS_MANAGE_FILESIZE
+ for (endblock = firstblock | (RELSEG_SIZE - 1);
+ endblock < lastblock;
+ endblock += RELSEG_SIZE)
+ smgrextend(DEFAULT_SMGR, rel, endblock, zerobuf);
+#endif
+
+ smgrextend(DEFAULT_SMGR, rel, lastblock, zerobuf);
+
+ rel->rd_nblocks = lastblock + 1;
+
+ return firstblock;
+}
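To make the segment arithmetic concrete: with the default RELSEG_SIZE of 131072 blocks (a power of two, which the masking expression relies on), firstblock = 131000 and nblocks = 200 give lastblock = 131199 and firstblock | (RELSEG_SIZE - 1) = 131071, the last block of the current segment file. The loop therefore extends the relation through block 131071, filling that segment out to exactly RELSEG_SIZE blocks, and the final smgrextend() of block 131199 then lands in a newly created second segment.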
+
+
/*
* _hash_splitbucket -- split 'obucket' into 'obucket' and 'nbucket'
*