}
/*
- * Read in a buffer, using bulk-insert strategy if bistate isn't NULL.
+ * Read in a buffer in the given mode, using bulk-insert strategy if
+ * bistate isn't NULL.
*/
static Buffer
ReadBufferBI(Relation relation, BlockNumber targetBlock,
- BulkInsertState bistate)
+ ReadBufferMode mode, BulkInsertState bistate)
{
Buffer buffer;
- /* If not bulk-insert, exactly like ReadBuffer */
+ /* If not bulk-insert, exactly like ReadBufferExtended */
if (!bistate)
- return ReadBuffer(relation, targetBlock);
+ return ReadBufferExtended(relation, MAIN_FORKNUM, targetBlock,
+ mode, NULL);
/* If we have the desired block already pinned, re-pin and return it */
if (bistate->current_buf != InvalidBuffer)
{
if (BufferGetBlockNumber(bistate->current_buf) == targetBlock)
{
+ /*
+ * Currently the LOCK variants are only used when extending the
+ * relation, which should never reach this branch.
+ */
+ Assert(mode != RBM_ZERO_AND_LOCK &&
+ mode != RBM_ZERO_AND_CLEANUP_LOCK);
+
IncrBufferRefCount(bistate->current_buf);
return bistate->current_buf;
}
/* Perform a read using the buffer strategy */
buffer = ReadBufferExtended(relation, MAIN_FORKNUM, targetBlock,
- RBM_NORMAL, bistate->strategy);
+ mode, bistate->strategy);
/* Save the selected block as target for future inserts */
IncrBufferRefCount(buffer);
/*
* Extend by one page. This should generally match the main-line
* extension code in RelationGetBufferForTuple, except that we hold
- * the relation extension lock throughout.
+ * the relation extension lock throughout, and we don't immediately
+ * initialize the page (see below).
*/
- buffer = ReadBufferBI(relation, P_NEW, bistate);
-
- LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+ buffer = ReadBufferBI(relation, P_NEW, RBM_ZERO_AND_LOCK, bistate);
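+
+ /*
+ * The buffer returned in RBM_ZERO_AND_LOCK mode is zero-filled and
+ * already exclusively locked, so no separate LockBuffer() call is
+ * needed before examining the page.
+ */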
page = BufferGetPage(buffer);
if (!PageIsNew(page))
elog(ERROR, "page %u of relation \"%s\" should be empty but is not",
BufferGetBlockNumber(buffer),
RelationGetRelationName(relation));
- PageInit(page, BufferGetPageSize(buffer), 0);
-
/*
- * We mark all the new buffers dirty, but do nothing to write them
- * out; they'll probably get used soon, and even if they are not, a
- * crash will leave an okay all-zeroes page on disk.
+ * Add the page to the FSM without initializing it. If we initialized
+ * it here, the page would potentially get flushed out to disk before
+ * we add any useful content. There's no guarantee that would happen
+ * before a potential crash, so we'd have to cope with uninitialized
+ * pages anyway; by not initializing here we also avoid potentially
+ * unnecessary writes.
*/
- MarkBufferDirty(buffer);
/* we'll need this info below */
blockNum = BufferGetBlockNumber(buffer);
- freespace = PageGetHeapFreeSpace(page);
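+ /*
+ * The page is intentionally left uninitialized (see above), so
+ * PageGetHeapFreeSpace() can't be used here; instead record the full
+ * usable size of an empty page, i.e. the page size minus the page
+ * header.
+ */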
+ freespace = BufferGetPageSize(buffer) - SizeOfPageHeaderData;
UnlockReleaseBuffer(buffer);
if (otherBuffer == InvalidBuffer)
{
/* easy case */
- buffer = ReadBufferBI(relation, targetBlock, bistate);
+ buffer = ReadBufferBI(relation, targetBlock, RBM_NORMAL, bistate);
if (PageIsAllVisible(BufferGetPage(buffer)))
visibilitymap_pin(relation, targetBlock, vmbuffer);
LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
* we're done.
*/
page = BufferGetPage(buffer);
+
+ /*
+ * If necessary, initialize the page; it'll be used soon. We could
+ * avoid dirtying the buffer here and rely on the caller to do so
+ * whenever it puts a tuple onto the page, but there seems to be
+ * little benefit in doing so.
+ */
+ if (PageIsNew(page))
+ {
+ PageInit(page, BufferGetPageSize(buffer), 0);
+ MarkBufferDirty(buffer);
+ }
+
pageFreeSpace = PageGetHeapFreeSpace(page);
if (len + saveFreeSpace <= pageFreeSpace)
{
* it worth keeping an accurate file length in shared memory someplace,
* rather than relying on the kernel to do it for us?
*/
- buffer = ReadBufferBI(relation, P_NEW, bistate);
+ buffer = ReadBufferBI(relation, P_NEW, RBM_ZERO_AND_LOCK, bistate);
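+
+ /*
+ * Note that the buffer comes back exclusively locked
+ * (RBM_ZERO_AND_LOCK); the lock-ordering dance with otherBuffer
+ * further down relies on that.
+ */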
/*
- * We can be certain that locking the otherBuffer first is OK, since it
- * must have a lower page number.
+ * We need to initialize the empty new page. Double-check that it really
+ * is empty (this should never happen, but if it does we don't want to
+ * risk wiping out valid data).
*/
- if (otherBuffer != InvalidBuffer)
- LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE);
+ page = BufferGetPage(buffer);
- /*
- * Now acquire lock on the new page.
- */
- LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+ if (!PageIsNew(page))
+ elog(ERROR, "page %u of relation \"%s\" should be empty but is not",
+ BufferGetBlockNumber(buffer),
+ RelationGetRelationName(relation));
+
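+ /*
+ * Unlike the bulk-extension path above, initialize and dirty the page
+ * right away; the caller is about to put a tuple onto it.
+ */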
+ PageInit(page, BufferGetPageSize(buffer), 0);
+ MarkBufferDirty(buffer);
/*
* Release the file-extension lock; it's now OK for someone else to extend
- * the relation some more. Note that we cannot release this lock before
- * we have buffer lock on the new page, or we risk a race condition
- * against vacuumlazy.c --- see comments therein.
+ * the relation some more.
*/
if (needLock)
UnlockRelationForExtension(relation, ExclusiveLock);
/*
- * We need to initialize the empty new page. Double-check that it really
- * is empty (this should never happen, but if it does we don't want to
- * risk wiping out valid data).
+ * Lock the other buffer. It's guaranteed to be of a lower page number
+ * than the new page. To conform with the deadlock prevention rules, we
+ * ought to lock otherBuffer first, but that would give other backends a
+ * chance to put tuples on our page. To reduce the likelihood of that,
+ * attempt to lock the other buffer conditionally, which is very likely
+ * to work. Otherwise we need to lock the buffers in the correct order,
+ * and retry if the space has been used in the meantime.
+ *
+ * Alternatively, we could acquire the lock on otherBuffer before
+ * extending the relation, but that'd require holding the lock while
+ * performing IO, which seems worse than an unlikely retry.
*/
- page = BufferGetPage(buffer);
+ if (otherBuffer != InvalidBuffer)
+ {
+ Assert(otherBuffer != buffer);
- if (!PageIsNew(page))
- elog(ERROR, "page %u of relation \"%s\" should be empty but is not",
- BufferGetBlockNumber(buffer),
- RelationGetRelationName(relation));
+ if (unlikely(!ConditionalLockBuffer(otherBuffer)))
+ {
+ LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+ LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE);
+ LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
- PageInit(page, BufferGetPageSize(buffer), 0);
+ /*
+ * Because the buffer was unlocked for a while, it's possible,
+ * although unlikely, that the page was filled. If so, just retry
+ * from the start.
+ */
+ if (len > PageGetHeapFreeSpace(page))
+ {
+ LockBuffer(otherBuffer, BUFFER_LOCK_UNLOCK);
+ UnlockReleaseBuffer(buffer);
+
+ goto loop;
+ }
+ }
+ }
if (len > PageGetHeapFreeSpace(page))
{
if (PageIsNew(page))
{
+ bool still_new;
+
/*
- * An all-zeroes page could be left over if a backend extends the
- * relation but crashes before initializing the page. Reclaim such
- * pages for use.
- *
- * We have to be careful here because we could be looking at a
- * page that someone has just added to the relation and not yet
- * been able to initialize (see RelationGetBufferForTuple). To
- * protect against that, release the buffer lock, grab the
- * relation extension lock momentarily, and re-lock the buffer. If
- * the page is still uninitialized by then, it must be left over
- * from a crashed backend, and we can initialize it.
+ * All-zeroes pages can be left over either when a backend extends
+ * the relation by a single page but crashes before the newly
+ * initialized page has been written out, or when bulk-extending
+ * the relation (which creates a number of empty pages at the tail
+ * end of the relation, but enters them into the FSM).
*
- * We don't really need the relation lock when this is a new or
- * temp relation, but it's probably not worth the code space to
- * check that, since this surely isn't a critical path.
+ * Make sure these pages are in the FSM, to ensure they can be
+ * reused. Do that by testing if there's any space recorded for
+ * the page. If not, enter it.
*
- * Note: the comparable code in vacuum.c need not worry because
- * it's got exclusive lock on the whole relation.
+ * Note we do not enter the page into the visibilitymap. That has
+ * the downside that we repeatedly visit this page in subsequent
+ * vacuums, but otherwise we'd never discover the space on a
+ * promoted standby. The harm of repeated checking normally ought
+ * not to be too bad - the space should usually get used at some
+ * point, otherwise there wouldn't be any regular vacuums.
*/
- LockBuffer(buf, BUFFER_LOCK_UNLOCK);
- LockRelationForExtension(onerel, ExclusiveLock);
- UnlockRelationForExtension(onerel, ExclusiveLock);
- LockBufferForCleanup(buf);
- if (PageIsNew(page))
+
+ /*
+ * Perform the FSM check after releasing the lock; the FSM is
+ * only approximate, after all.
+ */
+ still_new = PageIsNew(page);
+ UnlockReleaseBuffer(buf);
+
+ if (still_new)
{
- ereport(WARNING,
- (errmsg("relation \"%s\" page %u is uninitialized --- fixing",
- relname, blkno)));
- PageInit(page, BufferGetPageSize(buf), 0);
empty_pages++;
- }
- freespace = PageGetHeapFreeSpace(page);
- MarkBufferDirty(buf);
- UnlockReleaseBuffer(buf);
- RecordPageWithFreeSpace(onerel, blkno, freespace);
+ if (GetRecordedFreeSpace(onerel, blkno) == 0)
+ {
+ Size freespace;
+
+ freespace = BufferGetPageSize(buf) - SizeOfPageHeaderData;
+ RecordPageWithFreeSpace(onerel, blkno, freespace);
+ }
+ }
continue;
}
empty_pages++;
freespace = PageGetHeapFreeSpace(page);
- /* empty pages are always all-visible and all-frozen */
+ /*
+ * Empty pages are always all-visible and all-frozen (note that
+ * the same is currently not true for new pages, see above).
+ */
if (!PageIsAllVisible(page))
{
START_CRIT_SECTION();
*hastup = false;
- /* If we hit an uninitialized page, we want to force vacuuming it. */
- if (PageIsNew(page))
- return true;
-
- /* Quick out for ordinary empty page. */
- if (PageIsEmpty(page))
+ /*
+ * New and empty pages, obviously, don't contain tuples. We could make
+ * sure that the page is registered in the FSM, but it doesn't seem worth
+ * waiting for a cleanup lock just for that, especially because it's
+ * likely that the pin holder will do so.
+ */
+ if (PageIsNew(page) || PageIsEmpty(page))
return false;
maxoff = PageGetMaxOffsetNumber(page);
if (PageIsNew(page) || PageIsEmpty(page))
{
- /* PageIsNew probably shouldn't happen... */
UnlockReleaseBuffer(buf);
continue;
}