* Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $PostgreSQL: pgsql/src/backend/access/transam/clog.c,v 1.19 2003/11/29 19:51:40 pgsql Exp $
+ * $PostgreSQL: pgsql/src/backend/access/transam/clog.c,v 1.20 2004/05/31 03:47:54 tgl Exp $
*
*-------------------------------------------------------------------------
*/
Assert(status == TRANSACTION_STATUS_COMMITTED ||
status == TRANSACTION_STATUS_ABORTED);
- LWLockAcquire(ClogCtl->locks->ControlLock, LW_EXCLUSIVE);
+ LWLockAcquire(ClogCtl->ControlLock, LW_EXCLUSIVE);
byteptr = SimpleLruReadPage(ClogCtl, pageno, xid, true);
byteptr += byteno;
/* ...->page_status[slotno] = CLOG_PAGE_DIRTY; already done */
- LWLockRelease(ClogCtl->locks->ControlLock);
+ LWLockRelease(ClogCtl->ControlLock);
}
/*
char *byteptr;
XidStatus status;
- LWLockAcquire(ClogCtl->locks->ControlLock, LW_EXCLUSIVE);
+ LWLockAcquire(ClogCtl->ControlLock, LW_EXCLUSIVE);
byteptr = SimpleLruReadPage(ClogCtl, pageno, xid, false);
byteptr += byteno;
status = (*byteptr >> bshift) & CLOG_XACT_BITMASK;
- LWLockRelease(ClogCtl->locks->ControlLock);
+ LWLockRelease(ClogCtl->ControlLock);
return status;
}
{
int slotno;
- LWLockAcquire(ClogCtl->locks->ControlLock, LW_EXCLUSIVE);
+ LWLockAcquire(ClogCtl->ControlLock, LW_EXCLUSIVE);
/* Create and zero the first page of the commit log */
slotno = ZeroCLOGPage(0, false);
/* Make sure it's written out */
- SimpleLruWritePage(ClogCtl, slotno);
+ SimpleLruWritePage(ClogCtl, slotno, NULL);
/* Assert(ClogCtl->page_status[slotno] == CLOG_PAGE_CLEAN); */
- LWLockRelease(ClogCtl->locks->ControlLock);
+ LWLockRelease(ClogCtl->ControlLock);
}
/*
pageno = TransactionIdToPage(newestXact);
- LWLockAcquire(ClogCtl->locks->ControlLock, LW_EXCLUSIVE);
+ LWLockAcquire(ClogCtl->ControlLock, LW_EXCLUSIVE);
/* Zero the page and make an XLOG entry about it */
ZeroCLOGPage(pageno, true);
- LWLockRelease(ClogCtl->locks->ControlLock);
+ LWLockRelease(ClogCtl->ControlLock);
}
memcpy(&pageno, XLogRecGetData(record), sizeof(int));
- LWLockAcquire(ClogCtl->locks->ControlLock, LW_EXCLUSIVE);
+ LWLockAcquire(ClogCtl->ControlLock, LW_EXCLUSIVE);
slotno = ZeroCLOGPage(pageno, false);
- SimpleLruWritePage(ClogCtl, slotno);
+ SimpleLruWritePage(ClogCtl, slotno, NULL);
/* Assert(ClogCtl->page_status[slotno] == SLRU_PAGE_CLEAN); */
- LWLockRelease(ClogCtl->locks->ControlLock);
+ LWLockRelease(ClogCtl->ControlLock);
}
}
* Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $PostgreSQL: pgsql/src/backend/access/transam/slru.c,v 1.15 2004/05/29 22:48:18 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/access/transam/slru.c,v 1.16 2004/05/31 03:47:54 tgl Exp $
*
*-------------------------------------------------------------------------
*/
#include <unistd.h>
#include "access/slru.h"
+#include "access/clog.h" /* only for NUM_CLOG_BUFFERS */
#include "postmaster/bgwriter.h"
#include "storage/fd.h"
#include "storage/lwlock.h"
*/
typedef struct SlruSharedData
{
+ LWLockId ControlLock;
+
/*
* Info for each buffer slot. Page number is undefined when status is
* EMPTY. lru_count is essentially the number of page switches since
SlruPageStatus page_status[NUM_CLOG_BUFFERS];
int page_number[NUM_CLOG_BUFFERS];
unsigned int page_lru_count[NUM_CLOG_BUFFERS];
+ LWLockId BufferLocks[NUM_CLOG_BUFFERS]; /* Per-buffer I/O locks */
/*
* latest_page_number is the page number of the current end of the
*/
int latest_page_number;
} SlruSharedData;
-typedef SlruSharedData *SlruShared;
-
#define SlruFileName(ctl, path, seg) \
snprintf(path, MAXPGPATH, "%s/%04X", (ctl)->Dir, seg)
+/*
+ * During SimpleLruFlush(), we will usually not need to write/fsync more
+ * than one or two physical files, but we may need to write several pages
+ * per file. We can consolidate the I/O requests by leaving files open
+ * until control returns to SimpleLruFlush(). This data structure remembers
+ * which files are open.
+ *
+ * fd[] and segno[] are parallel arrays: only entries 0 .. num_files-1 are
+ * valid, and fd[i] is the descriptor opened for segment number segno[i].
+ * SlruPhysicalWritePage appends a new entry each time it has to open a
+ * segment that is not yet listed; it performs no bounds check, relying on
+ * SimpleLruFlush writing at most one page per buffer slot (hence at most
+ * NUM_CLOG_BUFFERS distinct files).
+ */
+typedef struct SlruFlushData
+{
+ int num_files; /* number of valid entries in fd[] and segno[] */
+ int fd[NUM_CLOG_BUFFERS]; /* open file descriptors, one per entry */
+ int segno[NUM_CLOG_BUFFERS]; /* segment number each descriptor belongs to */
+} SlruFlushData;
+
/*
* Macro to mark a buffer slot "most recently used".
*/
SLRU_SEEK_FAILED,
SLRU_READ_FAILED,
SLRU_WRITE_FAILED,
+ SLRU_FSYNC_FAILED,
SLRU_CLOSE_FAILED
} SlruErrorCause;
+
static SlruErrorCause slru_errcause;
static int slru_errno;
static bool SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno);
-static bool SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno);
+static bool SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno,
+ SlruFlush fdata);
static void SlruReportIOError(SlruCtl ctl, int pageno, TransactionId xid);
static int SlruSelectLRUPage(SlruCtl ctl, int pageno);
static bool SlruScanDirectory(SlruCtl ctl, int cutoffPage, bool doDeletions);
int
SimpleLruShmemSize(void)
{
- return MAXALIGN(sizeof(SlruSharedData))
- + BLCKSZ * NUM_CLOG_BUFFERS
- + MAXALIGN(sizeof(SlruLockData))
- ;
+ return MAXALIGN(sizeof(SlruSharedData)) + BLCKSZ * NUM_CLOG_BUFFERS; /* aligned control struct followed by one BLCKSZ page per buffer slot; the LWLock ids now live inside SlruSharedData, so no separate lock area is counted */
}
void
SimpleLruInit(SlruCtl ctl, const char *name, const char *subdir)
{
- bool found;
- char *ptr;
SlruShared shared;
- SlruLock locks;
+ bool found;
- ptr = ShmemInitStruct(name, SimpleLruShmemSize(), &found);
- shared = (SlruShared) ptr;
- locks = (SlruLock) (ptr + MAXALIGN(sizeof(SlruSharedData)) +
- BLCKSZ * NUM_CLOG_BUFFERS);
+ shared = (SlruShared) ShmemInitStruct(name, SimpleLruShmemSize(), &found);
if (!IsUnderPostmaster)
{
Assert(!found);
- locks->ControlLock = LWLockAssign();
-
memset(shared, 0, sizeof(SlruSharedData));
+ shared->ControlLock = LWLockAssign();
+
bufptr = (char *) shared + MAXALIGN(sizeof(SlruSharedData));
for (slotno = 0; slotno < NUM_CLOG_BUFFERS; slotno++)
{
- locks->BufferLocks[slotno] = LWLockAssign();
shared->page_buffer[slotno] = bufptr;
shared->page_status[slotno] = SLRU_PAGE_EMPTY;
shared->page_lru_count[slotno] = 1;
+ shared->BufferLocks[slotno] = LWLockAssign();
bufptr += BLCKSZ;
}
Assert(found);
/* Initialize the unshared control struct */
- ctl->locks = locks;
ctl->shared = shared;
+ ctl->ControlLock = shared->ControlLock;
- /* Init directory path */
+ /* Initialize unshared copy of directory path */
snprintf(ctl->Dir, MAXPGPATH, "%s/%s", DataDir, subdir);
}
SimpleLruZeroPage(SlruCtl ctl, int pageno)
{
int slotno;
- SlruShared shared = (SlruShared) ctl->shared;
+ SlruShared shared = ctl->shared;
/* Find a suitable buffer slot for the page */
slotno = SlruSelectLRUPage(ctl, pageno);
char *
SimpleLruReadPage(SlruCtl ctl, int pageno, TransactionId xid, bool forwrite)
{
- SlruShared shared = (SlruShared) ctl->shared;
+ SlruShared shared = ctl->shared;
/* Outer loop handles restart if we lose the buffer to someone else */
for (;;)
SlruRecentlyUsed(shared, slotno);
/* Release shared lock, grab per-buffer lock instead */
- LWLockRelease(ctl->locks->ControlLock);
- LWLockAcquire(ctl->locks->BufferLocks[slotno], LW_EXCLUSIVE);
+ LWLockRelease(shared->ControlLock);
+ LWLockAcquire(shared->BufferLocks[slotno], LW_EXCLUSIVE);
/*
* Check to see if someone else already did the read, or took the
if (shared->page_number[slotno] != pageno ||
shared->page_status[slotno] != SLRU_PAGE_READ_IN_PROGRESS)
{
- LWLockRelease(ctl->locks->BufferLocks[slotno]);
- LWLockAcquire(ctl->locks->ControlLock, LW_EXCLUSIVE);
+ LWLockRelease(shared->BufferLocks[slotno]);
+ LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);
continue;
}
ok = SlruPhysicalReadPage(ctl, pageno, slotno);
/* Re-acquire shared control lock and update page state */
- LWLockAcquire(ctl->locks->ControlLock, LW_EXCLUSIVE);
+ LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);
Assert(shared->page_number[slotno] == pageno &&
shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS);
shared->page_status[slotno] = ok ? SLRU_PAGE_CLEAN : SLRU_PAGE_EMPTY;
- LWLockRelease(ctl->locks->BufferLocks[slotno]);
+ LWLockRelease(shared->BufferLocks[slotno]);
/* Now it's okay to ereport if we failed */
if (!ok)
* Control lock must be held at entry, and will be held at exit.
*/
void
-SimpleLruWritePage(SlruCtl ctl, int slotno)
+SimpleLruWritePage(SlruCtl ctl, int slotno, SlruFlush fdata)
{
int pageno;
bool ok;
- SlruShared shared = (SlruShared) ctl->shared;
+ SlruShared shared = ctl->shared;
/* Do nothing if page does not need writing */
if (shared->page_status[slotno] != SLRU_PAGE_DIRTY &&
pageno = shared->page_number[slotno];
/* Release shared lock, grab per-buffer lock instead */
- LWLockRelease(ctl->locks->ControlLock);
- LWLockAcquire(ctl->locks->BufferLocks[slotno], LW_EXCLUSIVE);
+ LWLockRelease(shared->ControlLock);
+ LWLockAcquire(shared->BufferLocks[slotno], LW_EXCLUSIVE);
/*
* Check to see if someone else already did the write, or took the
(shared->page_status[slotno] != SLRU_PAGE_DIRTY &&
shared->page_status[slotno] != SLRU_PAGE_WRITE_IN_PROGRESS))
{
- LWLockRelease(ctl->locks->BufferLocks[slotno]);
- LWLockAcquire(ctl->locks->ControlLock, LW_EXCLUSIVE);
+ LWLockRelease(shared->BufferLocks[slotno]);
+ LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);
return;
}
shared->page_status[slotno] = SLRU_PAGE_WRITE_IN_PROGRESS;
/* Okay, do the write */
- ok = SlruPhysicalWritePage(ctl, pageno, slotno);
+ ok = SlruPhysicalWritePage(ctl, pageno, slotno, fdata);
+
+ /* If we failed, and we're in a flush, better close the files */
+ if (!ok && fdata)
+ {
+ int i;
+
+ for (i = 0; i < fdata->num_files; i++)
+ close(fdata->fd[i]);
+ }
/* Re-acquire shared control lock and update page state */
- LWLockAcquire(ctl->locks->ControlLock, LW_EXCLUSIVE);
+ LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);
Assert(shared->page_number[slotno] == pageno &&
(shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS ||
if (shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS)
shared->page_status[slotno] = ok ? SLRU_PAGE_CLEAN : SLRU_PAGE_DIRTY;
- LWLockRelease(ctl->locks->BufferLocks[slotno]);
+ LWLockRelease(shared->BufferLocks[slotno]);
/* Now it's okay to ereport if we failed */
if (!ok)
static bool
SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno)
{
- SlruShared shared = (SlruShared) ctl->shared;
+ SlruShared shared = ctl->shared;
int segno = pageno / SLRU_PAGES_PER_SEGMENT;
int rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
int offset = rpageno * BLCKSZ;
{
slru_errcause = SLRU_SEEK_FAILED;
slru_errno = errno;
+ close(fd);
return false;
}
{
slru_errcause = SLRU_READ_FAILED;
slru_errno = errno;
+ close(fd);
return false;
}
* info in static variables to let SlruReportIOError make the report.
*
* For now, assume it's not worth keeping a file pointer open across
- * read/write operations. We could cache one virtual file pointer ...
+ * independent read/write operations. We do batch operations during
+ * SimpleLruFlush, though.
+ *
+ * fdata is NULL for a standalone write, pointer to open-file info during
+ * SimpleLruFlush.
*/
static bool
-SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno)
+SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruFlush fdata)
{
- SlruShared shared = (SlruShared) ctl->shared;
+ SlruShared shared = ctl->shared;
int segno = pageno / SLRU_PAGES_PER_SEGMENT;
int rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
int offset = rpageno * BLCKSZ;
char path[MAXPGPATH];
- int fd;
-
- SlruFileName(ctl, path, segno);
+ int fd = -1;
/*
- * If the file doesn't already exist, we should create it. It is
- * possible for this to need to happen when writing a page that's not
- * first in its segment; we assume the OS can cope with that. (Note:
- * it might seem that it'd be okay to create files only when
- * SimpleLruZeroPage is called for the first page of a segment.
- * However, if after a crash and restart the REDO logic elects to
- * replay the log from a checkpoint before the latest one, then it's
- * possible that we will get commands to set transaction status of
- * transactions that have already been truncated from the commit log.
- * Easiest way to deal with that is to accept references to
- * nonexistent files here and in SlruPhysicalReadPage.)
+ * During a Flush, we may already have the desired file open.
*/
- fd = BasicOpenFile(path, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
- if (fd < 0)
+ if (fdata)
{
- if (errno != ENOENT)
+ int i;
+
+ for (i = 0; i < fdata->num_files; i++)
{
- slru_errcause = SLRU_OPEN_FAILED;
- slru_errno = errno;
- return false;
+ if (fdata->segno[i] == segno)
+ {
+ fd = fdata->fd[i];
+ break;
+ }
}
+ }
- fd = BasicOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
- S_IRUSR | S_IWUSR);
+ if (fd < 0)
+ {
+ /*
+ * If the file doesn't already exist, we should create it. It is
+ * possible for this to need to happen when writing a page that's not
+ * first in its segment; we assume the OS can cope with that.
+ * (Note: it might seem that it'd be okay to create files only when
+ * SimpleLruZeroPage is called for the first page of a segment.
+ * However, if after a crash and restart the REDO logic elects to
+ * replay the log from a checkpoint before the latest one, then it's
+ * possible that we will get commands to set transaction status of
+ * transactions that have already been truncated from the commit log.
+ * Easiest way to deal with that is to accept references to
+ * nonexistent files here and in SlruPhysicalReadPage.)
+ */
+ SlruFileName(ctl, path, segno);
+ fd = BasicOpenFile(path, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
if (fd < 0)
{
- slru_errcause = SLRU_CREATE_FAILED;
- slru_errno = errno;
- return false;
+ if (errno != ENOENT)
+ {
+ slru_errcause = SLRU_OPEN_FAILED;
+ slru_errno = errno;
+ return false;
+ }
+
+ fd = BasicOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
+ S_IRUSR | S_IWUSR);
+ if (fd < 0)
+ {
+ slru_errcause = SLRU_CREATE_FAILED;
+ slru_errno = errno;
+ return false;
+ }
+ }
+
+ if (fdata)
+ {
+ fdata->fd[fdata->num_files] = fd;
+ fdata->segno[fdata->num_files] = segno;
+ fdata->num_files++;
}
}
{
slru_errcause = SLRU_SEEK_FAILED;
slru_errno = errno;
+ if (!fdata)
+ close(fd);
return false;
}
errno = ENOSPC;
slru_errcause = SLRU_WRITE_FAILED;
slru_errno = errno;
+ if (!fdata)
+ close(fd);
return false;
}
- if (close(fd))
+ /*
+ * If not part of Flush, need to fsync now. We assume this happens
+ * infrequently enough that it's not a performance issue.
+ */
+ if (!fdata)
{
- slru_errcause = SLRU_CLOSE_FAILED;
- slru_errno = errno;
- return false;
+ if (pg_fsync(fd))
+ {
+ slru_errcause = SLRU_FSYNC_FAILED;
+ slru_errno = errno;
+ close(fd);
+ return false;
+ }
+
+ if (close(fd))
+ {
+ slru_errcause = SLRU_CLOSE_FAILED;
+ slru_errno = errno;
+ return false;
+ }
}
return true;
errdetail("could not write to file \"%s\" at offset %u: %m",
path, offset)));
break;
+ case SLRU_FSYNC_FAILED:
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not access status of transaction %u", xid),
+ errdetail("could not fsync file \"%s\": %m",
+ path)));
+ break;
case SLRU_CLOSE_FAILED:
ereport(ERROR,
(errcode_for_file_access(),
static int
SlruSelectLRUPage(SlruCtl ctl, int pageno)
{
- SlruShared shared = (SlruShared) ctl->shared;
+ SlruShared shared = ctl->shared;
/* Outer loop handles restart after I/O */
for (;;)
(void) SimpleLruReadPage(ctl, shared->page_number[bestslot],
InvalidTransactionId, false);
else
- SimpleLruWritePage(ctl, bestslot);
+ SimpleLruWritePage(ctl, bestslot, NULL);
/*
* Now loop back and try again. This is the easiest way of
void
SimpleLruSetLatestPage(SlruCtl ctl, int pageno)
{
- SlruShared shared = (SlruShared) ctl->shared;
+ SlruShared shared = ctl->shared;
shared->latest_page_number = pageno;
}
void
SimpleLruFlush(SlruCtl ctl, bool checkpoint)
{
-#ifdef USE_ASSERT_CHECKING /* only used in Assert() */
- SlruShared shared = (SlruShared) ctl->shared;
-#endif
+ SlruShared shared = ctl->shared;
+ SlruFlushData fdata;
int slotno;
+ int pageno = 0;
+ int i;
+ bool ok;
+
+ fdata.num_files = 0;
- LWLockAcquire(ctl->locks->ControlLock, LW_EXCLUSIVE);
+ LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);
for (slotno = 0; slotno < NUM_CLOG_BUFFERS; slotno++)
{
- SimpleLruWritePage(ctl, slotno);
+ SimpleLruWritePage(ctl, slotno, &fdata);
/*
* When called during a checkpoint, we cannot assert that the slot
shared->page_status[slotno] == SLRU_PAGE_CLEAN);
}
- LWLockRelease(ctl->locks->ControlLock);
+ LWLockRelease(shared->ControlLock);
+
+ /*
+ * Now fsync and close any files that were open
+ */
+ ok = true;
+ for (i = 0; i < fdata.num_files; i++)
+ {
+ if (pg_fsync(fdata.fd[i]))
+ {
+ slru_errcause = SLRU_FSYNC_FAILED;
+ slru_errno = errno;
+ pageno = fdata.segno[i] * SLRU_PAGES_PER_SEGMENT;
+ ok = false;
+ }
+
+ if (close(fdata.fd[i]))
+ {
+ slru_errcause = SLRU_CLOSE_FAILED;
+ slru_errno = errno;
+ pageno = fdata.segno[i] * SLRU_PAGES_PER_SEGMENT;
+ ok = false;
+ }
+ }
+ if (!ok)
+ SlruReportIOError(ctl, pageno, InvalidTransactionId);
}
/*
SimpleLruTruncate(SlruCtl ctl, int cutoffPage)
{
int slotno;
- SlruShared shared = (SlruShared) ctl->shared;
+ SlruShared shared = ctl->shared;
/*
* The cutoff point is the start of the segment containing cutoffPage.
* have been flushed already during the checkpoint, we're just being
* extra careful here.)
*/
- LWLockAcquire(ctl->locks->ControlLock, LW_EXCLUSIVE);
+ LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);
restart:;
*/
if (ctl->PagePrecedes(shared->latest_page_number, cutoffPage))
{
- LWLockRelease(ctl->locks->ControlLock);
+ LWLockRelease(shared->ControlLock);
ereport(LOG,
(errmsg("could not truncate directory \"%s\": apparent wraparound",
ctl->Dir)));
(void) SimpleLruReadPage(ctl, shared->page_number[slotno],
InvalidTransactionId, false);
else
- SimpleLruWritePage(ctl, slotno);
+ SimpleLruWritePage(ctl, slotno, NULL);
goto restart;
}
- LWLockRelease(ctl->locks->ControlLock);
+ LWLockRelease(shared->ControlLock);
/* Now we can remove the old segment(s) */
(void) SlruScanDirectory(ctl, cutoffPage, true);
if (cldir == NULL)
ereport(ERROR,
(errcode_for_file_access(),
- errmsg("could not open directory \"%s\": %m", ctl->Dir)));
+ errmsg("could not open directory \"%s\": %m",
+ ctl->Dir)));
errno = 0;
while ((clde = readdir(cldir)) != NULL)
*
* If the bgwriter exits unexpectedly, the postmaster treats that the same
* as a backend crash: shared memory may be corrupted, so remaining backends
- * should be killed by SIGQUIT and then a recovery cycle started.
+ * should be killed by SIGQUIT and then a recovery cycle started. (Even if
+ * shared memory isn't corrupted, we have lost information about which
+ * files need to be fsync'd for the next checkpoint, and so a system
+ * restart needs to be forced.)
*
*
* Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/postmaster/bgwriter.c,v 1.1 2004/05/29 22:48:19 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/postmaster/bgwriter.c,v 1.2 2004/05/31 03:47:59 tgl Exp $
*
*-------------------------------------------------------------------------
*/
#include "utils/guc.h"
-/*
+/*----------
* Shared memory area for communication between bgwriter and backends
+ *
+ * The ckpt counters allow backends to watch for completion of a checkpoint
+ * request they send. Here's how it works:
+ * * At start of a checkpoint, bgwriter increments ckpt_started.
+ * * On completion of a checkpoint, bgwriter sets ckpt_done to
+ * equal ckpt_started.
+ * * On failure of a checkpoint, bgwriter first increments ckpt_failed,
+ * then sets ckpt_done to equal ckpt_started.
+ * All three fields are declared sig_atomic_t to ensure they can be read
+ * and written without explicit locking. The algorithm for backends is:
+ * 1. Record current values of ckpt_failed and ckpt_started (in that
+ * order!).
+ * 2. Send signal to request checkpoint.
+ * 3. Sleep until ckpt_started changes. Now you know a checkpoint has
+ * begun since you started this algorithm (although *not* that it was
+ * specifically initiated by your signal).
+ * 4. Record new value of ckpt_started.
+ * 5. Sleep until ckpt_done >= saved value of ckpt_started. (Use modulo
+ * arithmetic here in case counters wrap around.) Now you know a
+ * checkpoint has started and completed, but not whether it was
+ * successful.
+ * 6. If ckpt_failed is different from the originally saved value,
+ * assume request failed; otherwise it was definitely successful.
+ *
+ * The requests array holds fsync requests sent by backends and not yet
+ * absorbed by the bgwriter.
+ *----------
*/
+typedef struct /* one queued fsync request, as stored in BgWriterShmemStruct.requests[] */
+{
+ RelFileNode rnode; /* relation the backend wrote to directly */
+ BlockNumber segno; /* copied from ForwardFsyncRequest's segno argument; presumably the file segment within the relation -- TODO confirm against the smgr caller */
+ /* might add a request-type field later */
+} BgWriterRequest;
+
typedef struct
{
pid_t bgwriter_pid; /* PID of bgwriter (0 if not started) */
- sig_atomic_t checkpoint_count; /* advances when checkpoint done */
+
+ sig_atomic_t ckpt_started; /* incremented by bgwriter just before each checkpoint */
+ sig_atomic_t ckpt_done; /* set equal to ckpt_started when a checkpoint ends, whether it succeeded or failed */
+ sig_atomic_t ckpt_failed; /* incremented (before ckpt_done is updated) when a checkpoint fails */
+
+ int num_requests; /* current # of queued requests; read and updated under BgWriterCommLock */
+ int max_requests; /* allocated array size; set to NBuffers at shmem init */
+ BgWriterRequest requests[1]; /* VARIABLE LENGTH ARRAY: really max_requests entries, space reserved by BgWriterShmemSize */
} BgWriterShmemStruct;
static BgWriterShmemStruct *BgWriterShmem;
/*
* Private state
*/
+static bool am_bg_writer = false;
+
+static bool ckpt_active = false;
+
static time_t last_checkpoint_time;
{
Assert(BgWriterShmem != NULL);
BgWriterShmem->bgwriter_pid = MyProcPid;
+ am_bg_writer = true;
/*
* Properly accept or ignore signals the postmaster might send us
*/
InError = false;
+ /* Warn any waiting backends that the checkpoint failed. */
+ if (ckpt_active)
+ {
+ /* use volatile pointer to prevent code rearrangement */
+ volatile BgWriterShmemStruct *bgs = BgWriterShmem;
+
+ bgs->ckpt_failed++;
+ bgs->ckpt_done = bgs->ckpt_started;
+ ckpt_active = false;
+ }
+
/*
* Exit interrupt holdoff section we implicitly established above.
*/
long udelay;
/*
- * Process any signals received recently.
+ * Emergency bailout if postmaster has died. This is to avoid the
+ * necessity for manual cleanup of all postmaster children.
*/
+ if (!PostmasterIsAlive(true))
+ exit(1);
+
+ /*
+ * Process any requests or signals received recently.
+ */
+ AbsorbFsyncRequests();
+
if (got_SIGHUP)
{
got_SIGHUP = false;
errhint("Consider increasing the configuration parameter \"checkpoint_segments\".")));
}
+ /*
+ * Indicate checkpoint start to any waiting backends.
+ */
+ ckpt_active = true;
+ BgWriterShmem->ckpt_started++;
+
CreateCheckPoint(false, force_checkpoint);
+ /*
+ * Indicate checkpoint completion to any waiting backends.
+ */
+ BgWriterShmem->ckpt_done = BgWriterShmem->ckpt_started;
+ ckpt_active = false;
+
/*
* Note we record the checkpoint start time not end time as
* last_checkpoint_time. This is so that time-driven checkpoints
*/
last_checkpoint_time = now;
- /*
- * Indicate checkpoint completion to any waiting backends.
- */
- BgWriterShmem->checkpoint_count++;
-
/*
* After any checkpoint, close all smgr files. This is so we
* won't hang onto smgr references to deleted files indefinitely.
+ * (It is safe to do this because this process does not have a
+ * relcache, and so no dangling references could remain.)
*/
smgrcloseall();
* we respond reasonably promptly when someone signals us,
* break down the sleep into 1-second increments, and check for
* interrupts after each nap.
+ *
+ * We absorb pending requests after each short sleep.
*/
udelay = ((n > 0) ? BgWriterDelay : 10000) * 1000L;
while (udelay > 1000000L)
if (got_SIGHUP || checkpoint_requested || shutdown_requested)
break;
pg_usleep(1000000L);
+ AbsorbFsyncRequests();
udelay -= 1000000L;
}
if (!(got_SIGHUP || checkpoint_requested || shutdown_requested))
pg_usleep(udelay);
-
- /*
- * Emergency bailout if postmaster has died. This is to avoid the
- * necessity for manual cleanup of all postmaster children.
- */
- if (!PostmasterIsAlive(true))
- exit(1);
}
}
BgWriterShmemSize(void)
{
/*
- * This is not worth measuring right now, but may become so after we
- * add fsync signaling ...
+ * Currently, the size of the requests[] array is arbitrarily set
+ * equal to NBuffers. This may prove too large or small ...
*/
- return MAXALIGN(sizeof(BgWriterShmemStruct));
+ return MAXALIGN(sizeof(BgWriterShmemStruct) +
+ (NBuffers - 1) * sizeof(BgWriterRequest));
}
/*
BgWriterShmem = (BgWriterShmemStruct *)
ShmemInitStruct("Background Writer Data",
- sizeof(BgWriterShmemStruct),
+ BgWriterShmemSize(),
&found);
if (BgWriterShmem == NULL)
ereport(FATAL,
return; /* already initialized */
MemSet(BgWriterShmem, 0, sizeof(BgWriterShmemStruct));
+ BgWriterShmem->max_requests = NBuffers;
}
/*
void
RequestCheckpoint(bool waitforit)
{
- volatile sig_atomic_t *count_ptr = &BgWriterShmem->checkpoint_count;
- sig_atomic_t old_count = *count_ptr;
+ /* use volatile pointer to prevent code rearrangement */
+ volatile BgWriterShmemStruct *bgs = BgWriterShmem;
+ sig_atomic_t old_failed = bgs->ckpt_failed;
+ sig_atomic_t old_started = bgs->ckpt_started;
/*
* Send signal to request checkpoint. When waitforit is false,
"could not signal for checkpoint: %m");
/*
- * If requested, wait for completion. We detect completion by
- * observing a change in checkpoint_count in shared memory.
+ * If requested, wait for completion. We detect completion according
+ * to the algorithm given above.
*/
if (waitforit)
{
- while (*count_ptr == old_count)
+ while (bgs->ckpt_started == old_started)
{
CHECK_FOR_INTERRUPTS();
- pg_usleep(1000000L);
+ pg_usleep(100000L);
+ }
+ old_started = bgs->ckpt_started;
+ /*
+ * We are waiting for ckpt_done >= old_started, in a modulo
+ * sense. This is a little tricky since we don't know the
+ * width or signedness of sig_atomic_t. We make the lowest
+ * common denominator assumption that it is only as wide
+ * as "char". This means that this algorithm will cope
+ * correctly as long as we don't sleep for more than 127
+ * completed checkpoints. (If we do, we will get another
+ * chance to exit after 128 more checkpoints...)
+ */
+ while (((signed char) (bgs->ckpt_done - old_started)) < 0)
+ {
+ CHECK_FOR_INTERRUPTS();
+ pg_usleep(100000L);
}
+ if (bgs->ckpt_failed != old_failed)
+ ereport(ERROR,
+ (errmsg("checkpoint request failed"),
+ errhint("Consult the postmaster log for details.")));
+ }
+}
+
+/*
+ * ForwardFsyncRequest
+ * Forward a file-fsync request from a backend to the bgwriter
+ *
+ * Whenever a backend is compelled to write directly to a relation
+ * (which should be seldom, if the bgwriter is getting its job done),
+ * the backend calls this routine to pass over knowledge that the relation
+ * is dirty and must be fsync'd before next checkpoint.
+ *
+ * Returns true if the request was appended to the shared requests[] queue.
+ * Returns false if we are unable to pass over the request, which at
+ * present happens when we are not running under the postmaster, when no
+ * bgwriter has registered its PID in shared memory, or when the shared
+ * memory queue is full. A false result forces the backend to do its own
+ * fsync. We hope that will be even more seldom.
+ *
+ * rnode and segno are stored verbatim in the queue entry; the queue is
+ * only touched while holding BgWriterCommLock exclusively.
+ *
+ * Note: we presently make no attempt to eliminate duplicate requests
+ * in the requests[] queue. The bgwriter will have to eliminate dups
+ * internally anyway, so we may as well avoid holding the lock longer
+ * than we have to here.
+ */
+bool
+ForwardFsyncRequest(RelFileNode rnode, BlockNumber segno)
+{
+ BgWriterRequest *request; /* queue slot being filled in */
+
+ if (!IsUnderPostmaster)
+ return false; /* probably shouldn't even get here */
+ Assert(BgWriterShmem != NULL);
+
+ LWLockAcquire(BgWriterCommLock, LW_EXCLUSIVE);
+ if (BgWriterShmem->bgwriter_pid == 0 || /* no bgwriter started: nobody would absorb the request */
+ BgWriterShmem->num_requests >= BgWriterShmem->max_requests) /* queue is full */
+ {
+ LWLockRelease(BgWriterCommLock);
+ return false;
+ }
+ request = &BgWriterShmem->requests[BgWriterShmem->num_requests++]; /* claim the next free slot */
+ request->rnode = rnode;
+ request->segno = segno;
+ LWLockRelease(BgWriterCommLock);
+ return true;
+}
+
+/*
+ * AbsorbFsyncRequests
+ * Retrieve queued fsync requests and pass them to local smgr.
+ *
+ * This is exported because it must be called during CreateCheckpoint;
+ * we have to be sure we have accepted all pending requests *after* we
+ * establish the checkpoint redo pointer. Since CreateCheckpoint
+ * sometimes runs in non-bgwriter processes, do nothing if not bgwriter.
+ *
+ * The shared queue is copied into a palloc-ed local array and reset to
+ * empty while BgWriterCommLock is held; each copied request is then
+ * handed to RememberFsyncRequest after the lock has been released.
+ * When the queue is empty nothing is allocated and the loop is skipped.
+ */
+void
+AbsorbFsyncRequests(void)
+{
+ BgWriterRequest *requests = NULL; /* local copy of the queue; stays NULL if it was empty */
+ BgWriterRequest *request; /* cursor over the local copy */
+ int n; /* number of requests copied and still to process */
+
+ if (!am_bg_writer)
+ return;
+
+ /*
+ * We try to avoid holding the lock for a long time by copying the
+ * request array.
+ */
+ LWLockAcquire(BgWriterCommLock, LW_EXCLUSIVE);
+
+ n = BgWriterShmem->num_requests;
+ if (n > 0)
+ {
+ requests = (BgWriterRequest *) palloc(n * sizeof(BgWriterRequest));
+ memcpy(requests, BgWriterShmem->requests, n * sizeof(BgWriterRequest));
+ }
+ BgWriterShmem->num_requests = 0; /* queue is now empty for the backends */
+
+ LWLockRelease(BgWriterCommLock);
+
+ for (request = requests; n > 0; request++, n--) /* n counts down, so an empty queue never dereferences the NULL pointer */
+ {
+ RememberFsyncRequest(request->rnode, request->segno);
}
+ if (requests)
+ pfree(requests);
}
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/storage/buffer/bufmgr.c,v 1.166 2004/05/29 22:48:19 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/storage/buffer/bufmgr.c,v 1.167 2004/05/31 03:48:02 tgl Exp $
*
*-------------------------------------------------------------------------
*/
* bothering to write them out first. This is NOT rollback-able,
* and so should be used only with extreme caution!
*
+ * There is no particularly good reason why this doesn't have a
+ * firstDelBlock parameter, except that current callers don't need it.
+ *
* We assume that the caller holds an exclusive lock on the relation,
* which should assure that no new buffers will be acquired for the rel
* meanwhile.
void
DropRelationBuffers(Relation rel)
{
- DropRelFileNodeBuffers(rel->rd_node, rel->rd_istemp);
+ DropRelFileNodeBuffers(rel->rd_node, rel->rd_istemp, 0); /* firstDelBlock = 0: every block number is >= 0, so all of the buffers for the relation are dropped */
}
/* ---------------------------------------------------------------------
* DropRelFileNodeBuffers
*
* This is the same as DropRelationBuffers, except that the target
- * relation is specified by RelFileNode and temp status.
+ * relation is specified by RelFileNode and temp status, and one
+ * may specify the first block to drop.
*
* This is NOT rollback-able. One legitimate use is to clear the
* buffer cache of buffers for a relation that is being deleted
* --------------------------------------------------------------------
*/
void
-DropRelFileNodeBuffers(RelFileNode rnode, bool istemp)
+DropRelFileNodeBuffers(RelFileNode rnode, bool istemp,
+ BlockNumber firstDelBlock)
{
int i;
BufferDesc *bufHdr;
for (i = 0; i < NLocBuffer; i++)
{
bufHdr = &LocalBufferDescriptors[i];
- if (RelFileNodeEquals(bufHdr->tag.rnode, rnode))
+ if (RelFileNodeEquals(bufHdr->tag.rnode, rnode) &&
+ bufHdr->tag.blockNum >= firstDelBlock)
{
bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED);
bufHdr->cntxDirty = false;
{
bufHdr = &BufferDescriptors[i - 1];
recheck:
- if (RelFileNodeEquals(bufHdr->tag.rnode, rnode))
+ if (RelFileNodeEquals(bufHdr->tag.rnode, rnode) &&
+ bufHdr->tag.blockNum >= firstDelBlock)
{
/*
* If there is I/O in progress, better wait till it's done;
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/storage/file/fd.c,v 1.108 2004/02/23 23:03:10 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/storage/file/fd.c,v 1.109 2004/05/31 03:48:04 tgl Exp $
*
* NOTES:
*
DO_DB(_dump_lru());
}
+/* returns 0 on success, -1 on re-open failure (with errno set) */
static int
LruInsert(File file)
{
return buf;
}
+/* returns 0 on success, -1 on re-open failure (with errno set) */
static int
FileAccess(File file)
{
file, VfdCache[file].fileName,
VfdCache[file].seekPos, amount, buffer));
- FileAccess(file);
+ returnCode = FileAccess(file);
+ if (returnCode < 0)
+ return returnCode;
+
returnCode = read(VfdCache[file].fd, buffer, amount);
if (returnCode > 0)
VfdCache[file].seekPos += returnCode;
file, VfdCache[file].fileName,
VfdCache[file].seekPos, amount, buffer));
- FileAccess(file);
+ returnCode = FileAccess(file);
+ if (returnCode < 0)
+ return returnCode;
errno = 0;
returnCode = write(VfdCache[file].fd, buffer, amount);
return returnCode;
}
+int
+FileSync(File file) /* run pg_fsync on the real fd behind virtual file "file" */
+{
+ int returnCode;
+
+ Assert(FileIsValid(file));
+
+ DO_DB(elog(LOG, "FileSync: %d (%s)",
+ file, VfdCache[file].fileName));
+
+ returnCode = FileAccess(file); /* negative (errno set) if the file could not be re-opened */
+ if (returnCode < 0)
+ return returnCode; /* pass FileAccess's failure straight back to the caller */
+
+ return pg_fsync(VfdCache[file].fd); /* otherwise the result is whatever pg_fsync returns */
+}
+
long
FileSeek(File file, long offset, int whence)
{
+ int returnCode;
+
Assert(FileIsValid(file));
DO_DB(elog(LOG, "FileSeek: %d (%s) %ld %ld %d",
VfdCache[file].seekPos += offset;
break;
case SEEK_END:
- FileAccess(file);
- VfdCache[file].seekPos = lseek(VfdCache[file].fd, offset, whence);
+ returnCode = FileAccess(file);
+ if (returnCode < 0)
+ return returnCode;
+ VfdCache[file].seekPos = lseek(VfdCache[file].fd,
+ offset, whence);
break;
default:
elog(ERROR, "invalid whence: %d", whence);
if (offset < 0)
elog(ERROR, "invalid seek offset: %ld", offset);
if (VfdCache[file].seekPos != offset)
- VfdCache[file].seekPos = lseek(VfdCache[file].fd, offset, whence);
+ VfdCache[file].seekPos = lseek(VfdCache[file].fd,
+ offset, whence);
break;
case SEEK_CUR:
if (offset != 0 || VfdCache[file].seekPos == FileUnknownPos)
- VfdCache[file].seekPos = lseek(VfdCache[file].fd, offset, whence);
+ VfdCache[file].seekPos = lseek(VfdCache[file].fd,
+ offset, whence);
break;
case SEEK_END:
- VfdCache[file].seekPos = lseek(VfdCache[file].fd, offset, whence);
+ VfdCache[file].seekPos = lseek(VfdCache[file].fd,
+ offset, whence);
break;
default:
elog(ERROR, "invalid whence: %d", whence);
DO_DB(elog(LOG, "FileTruncate %d (%s)",
file, VfdCache[file].fileName));
- FileAccess(file);
+ returnCode = FileAccess(file);
+ if (returnCode < 0)
+ return returnCode;
+
returnCode = ftruncate(VfdCache[file].fd, (size_t) offset);
return returnCode;
}
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.104 2004/04/19 17:42:58 momjian Exp $
+ * $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.105 2004/05/31 03:48:06 tgl Exp $
*
*-------------------------------------------------------------------------
*/
#include "catalog/catalog.h"
#include "miscadmin.h"
+#include "postmaster/bgwriter.h"
#include "storage/fd.h"
#include "storage/smgr.h"
+#include "utils/hsearch.h"
#include "utils/memutils.h"
* system's file size limit (often 2GBytes). In order to do that,
* we break relations up into chunks of < 2GBytes and store one chunk
* in each of several files that represent the relation. See the
- * BLCKSZ and RELSEG_SIZE configuration constants in
- * include/pg_config.h. All chunks except the last MUST have size exactly
- * equal to RELSEG_SIZE blocks --- see mdnblocks() and mdtruncate().
+ * BLCKSZ and RELSEG_SIZE configuration constants in pg_config_manual.h.
+ * All chunks except the last MUST have size exactly equal to RELSEG_SIZE
+ * blocks --- see mdnblocks() and mdtruncate().
*
* The file descriptor pointer (md_fd field) stored in the SMgrRelation
* cache is, therefore, just the head of a list of MdfdVec objects.
* But note the md_fd pointer can be NULL, indicating relation not open.
*
+ * Note that mdfd_chain == NULL does not necessarily mean the relation
+ * doesn't have another segment after this one; we may just not have
+ * opened the next segment yet. (We could not have "all segments are
+ * in the chain" as an invariant anyway, since another backend could
+ * extend the relation when we weren't looking.)
+ *
* All MdfdVec objects are palloc'd in the MdCxt memory context.
*/
typedef struct _MdfdVec
{
File mdfd_vfd; /* fd number in fd.c's pool */
-
-#ifndef LET_OS_MANAGE_FILESIZE
- struct _MdfdVec *mdfd_chain; /* for large relations */
+ BlockNumber mdfd_segno; /* number of this segment within the relation, counting from 0 */
+#ifndef LET_OS_MANAGE_FILESIZE /* for large relations */
+ struct _MdfdVec *mdfd_chain; /* next segment if we have opened it, else NULL (NULL does not prove there is none) */
#endif
} MdfdVec;
static MemoryContext MdCxt; /* context for all md.c allocations */
-/* routines declared here */
-static MdfdVec *mdopen(SMgrRelation reln);
+/*
+ * In some contexts (currently, standalone backends and the bgwriter process)
+ * we keep track of pending fsync operations: we need to remember all relation
+ * segments that have been written since the last checkpoint, so that we can
+ * fsync them down to disk before completing the next checkpoint. This hash
+ * table remembers the pending operations. We use a hash table not because
+ * we want to look up individual operations, but simply as a convenient way
+ * of eliminating duplicate requests.
+ *
+ * (Regular backends do not track pending operations locally, but forward
+ * them to the bgwriter.)
+ *
+ * XXX for WIN32, may want to expand this to track pending deletes, too.
+ */
+typedef struct
+{
+ RelFileNode rnode; /* the targeted relation */
+ BlockNumber segno; /* which segment of that relation needs fsync */
+} PendingOperationEntry; /* the whole struct is used as the hash key, so it must be zeroed (pad bytes included) before being filled in */
+
+static HTAB *pendingOpsTable = NULL;
+
+
+/* local routines */
+static MdfdVec *mdopen(SMgrRelation reln, bool allowNotFound);
+static bool register_dirty_segment(SMgrRelation reln, MdfdVec *seg);
static MdfdVec *_fdvec_alloc(void);
#ifndef LET_OS_MANAGE_FILESIZE
static MdfdVec *_mdfd_openseg(SMgrRelation reln, BlockNumber segno,
int oflags);
#endif
-static MdfdVec *_mdfd_getseg(SMgrRelation reln, BlockNumber blkno);
+static MdfdVec *_mdfd_getseg(SMgrRelation reln, BlockNumber blkno,
+ bool allowNotFound);
static BlockNumber _mdnblocks(File file, Size blcksz);
ALLOCSET_DEFAULT_INITSIZE,
ALLOCSET_DEFAULT_MAXSIZE);
+ /*
+ * Create pending-operations hashtable if we need it. Currently,
+ * we need it if we are standalone (not under a postmaster) OR
+ * if we are a bootstrap-mode subprocess of a postmaster (that is,
+ * a startup or bgwriter process).
+ */
+ if (!IsUnderPostmaster || IsBootstrapProcessingMode())
+ {
+ HASHCTL hash_ctl;
+
+ MemSet(&hash_ctl, 0, sizeof(hash_ctl));
+ hash_ctl.keysize = sizeof(PendingOperationEntry);
+ hash_ctl.entrysize = sizeof(PendingOperationEntry);
+ hash_ctl.hash = tag_hash;
+ hash_ctl.hcxt = MdCxt;
+ pendingOpsTable = hash_create("Pending Ops Table",
+ 100L,
+ &hash_ctl,
+ HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
+ if (pendingOpsTable == NULL)
+ ereport(FATAL,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory")));
+ }
+
return true;
}
reln->md_fd = _fdvec_alloc();
reln->md_fd->mdfd_vfd = fd;
+ reln->md_fd->mdfd_segno = 0;
#ifndef LET_OS_MANAGE_FILESIZE
reln->md_fd->mdfd_chain = NULL;
#endif
int nbytes;
MdfdVec *v;
- v = _mdfd_getseg(reln, blocknum);
+ v = _mdfd_getseg(reln, blocknum, false);
#ifndef LET_OS_MANAGE_FILESIZE
seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
return false;
}
+ if (!register_dirty_segment(reln, v))
+ return false;
+
#ifndef LET_OS_MANAGE_FILESIZE
Assert(_mdnblocks(v->mdfd_vfd, BLCKSZ) <= ((BlockNumber) RELSEG_SIZE));
#endif
/*
* mdopen() -- Open the specified relation. ereport's on failure.
+ * (Optionally, can return NULL instead of ereport for ENOENT.)
*
* Note we only open the first segment, when there are multiple segments.
*/
static MdfdVec *
-mdopen(SMgrRelation reln)
+mdopen(SMgrRelation reln, bool allowNotFound)
{
+ MdfdVec *mdfd;
char *path;
File fd;
if (fd < 0)
{
pfree(path);
+ if (allowNotFound && errno == ENOENT)
+ return NULL;
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not open relation %u/%u: %m",
pfree(path);
- reln->md_fd = _fdvec_alloc();
+ reln->md_fd = mdfd = _fdvec_alloc();
- reln->md_fd->mdfd_vfd = fd;
+ mdfd->mdfd_vfd = fd;
+ mdfd->mdfd_segno = 0;
#ifndef LET_OS_MANAGE_FILESIZE
- reln->md_fd->mdfd_chain = NULL;
+ mdfd->mdfd_chain = NULL;
Assert(_mdnblocks(fd, BLCKSZ) <= ((BlockNumber) RELSEG_SIZE));
#endif
- return reln->md_fd;
+ return mdfd;
}
/*
int nbytes;
MdfdVec *v;
- v = _mdfd_getseg(reln, blocknum);
+ v = _mdfd_getseg(reln, blocknum, false);
#ifndef LET_OS_MANAGE_FILESIZE
seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
long seekpos;
MdfdVec *v;
- v = _mdfd_getseg(reln, blocknum);
+ v = _mdfd_getseg(reln, blocknum, false);
#ifndef LET_OS_MANAGE_FILESIZE
seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
if (FileWrite(v->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ)
return false;
+ if (!register_dirty_segment(reln, v))
+ return false;
+
return true;
}
BlockNumber
mdnblocks(SMgrRelation reln)
{
- MdfdVec *v = mdopen(reln);
+ MdfdVec *v = mdopen(reln, false);
#ifndef LET_OS_MANAGE_FILESIZE
BlockNumber nblocks;
if (nblocks == curnblk)
return nblocks; /* no work */
- v = mdopen(reln);
+ v = mdopen(reln, false);
#ifndef LET_OS_MANAGE_FILESIZE
priorblocks = 0;
}
/*
- * mdcommit() -- Commit a transaction.
+ * mdsync() -- Sync previous writes to stable storage.
+ *
+ * This is only called during checkpoints, and checkpoints should only
+ * occur in processes that have created a pendingOpsTable.
+ *
+ * Returns false if this process has no pendingOpsTable, or as soon as an
+ * fsync fails for a reason other than ENOENT (the failure is logged, and
+ * the failing entry plus any entries not yet visited stay in the table).
+ * Returns true once every pending entry has been processed and removed.
*/
bool
-mdcommit(void)
+mdsync(void)
{
+ HASH_SEQ_STATUS hstat;
+ PendingOperationEntry *entry;
+
+ if (!pendingOpsTable)
+ return false; /* this process never created a pending-ops table */
+
/*
- * We don't actually have to do anything here...
+ * If we are in the bgwriter, the sync had better include all fsync
+ * requests that were queued by backends before the checkpoint REDO
+ * point was determined. We do slightly better than that by accepting
+ * all requests queued up to the point where we start fsync'ing.
*/
+ AbsorbFsyncRequests();
+
+ hash_seq_init(&hstat, pendingOpsTable);
+ while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
+ {
+ /*
+ * If fsync is off then we don't have to bother opening the file
+ * at all. (We delay checking until this point so that changing
+ * fsync on the fly behaves sensibly.)
+ */
+ if (enableFsync)
+ {
+ SMgrRelation reln;
+ MdfdVec *seg;
+
+ /*
+ * Find or create an smgr hash entry for this relation.
+ * This may seem a bit unclean -- md calling smgr? But it's
+ * really the best solution. It ensures that the open file
+ * reference isn't permanently leaked if we get an error here.
+ * (You may say "but an unreferenced SMgrRelation is still a
+ * leak!" Not really, because the only case in which a checkpoint
+ * is done by a process that isn't about to shut down is in the
+ * bgwriter, and it will periodically do smgrcloseall(). This
+ * fact justifies our not closing the reln in the success path
+ * either, which is a good thing since in non-bgwriter cases
+ * we couldn't safely do that.) Furthermore, in many cases
+ * the relation will have been dirtied through this same smgr
+ * relation, and so we can save a file open/close cycle.
+ */
+ reln = smgropen(entry->rnode);
+
+ /*
+ * It is possible that the relation has been dropped or truncated
+ * since the fsync request was entered. Therefore, we have to
+ * allow file-not-found errors. This applies both during
+ * _mdfd_getseg() and during FileSync, since fd.c might have
+ * closed the file behind our back.
+ */
+ seg = _mdfd_getseg(reln,
+ entry->segno * ((BlockNumber) RELSEG_SIZE),
+ true);
+ if (seg)
+ {
+ if (FileSync(seg->mdfd_vfd) < 0 &&
+ errno != ENOENT)
+ {
+ ereport(LOG,
+ (errcode_for_file_access(),
+ errmsg("could not fsync segment %u of relation %u/%u: %m",
+ entry->segno,
+ entry->rnode.tblNode,
+ entry->rnode.relNode)));
+ return false; /* the entry is not removed, so the request stays pending */
+ }
+ }
+ }
+
+ /* Okay, delete this entry (also reached when fsync is disabled or the file is gone) */
+ if (hash_search(pendingOpsTable, entry,
+ HASH_REMOVE, NULL) == NULL)
+ elog(ERROR, "pendingOpsTable corrupted");
+ }
+
return true;
}
/*
- * mdabort() -- Abort a transaction.
+ * register_dirty_segment() -- Mark a relation segment as needing fsync
+ *
+ * If there is a local pending-ops table, just make an entry in it for
+ * mdsync to process later. Otherwise, try to pass off the fsync request
+ * to the background writer process. If that fails, just do the fsync
+ * locally before returning (we expect this will not happen often enough
+ * to be a performance problem). The same local fsync is the fallback
+ * when the entry cannot be added to the local table (out of memory).
+ *
+ * A false result implies I/O failure during local fsync. errno will be
+ * valid for error reporting.
*/
-bool
-mdabort(void)
+static bool
+register_dirty_segment(SMgrRelation reln, MdfdVec *seg)
{
- /*
- * We don't actually have to do anything here...
- */
+ if (pendingOpsTable)
+ {
+ PendingOperationEntry entry;
+
+ /* ensure any pad bytes in the struct are zeroed: the whole struct is the hash key */
+ MemSet(&entry, 0, sizeof(entry));
+ entry.rnode = reln->smgr_rnode;
+ entry.segno = seg->mdfd_segno;
+
+ if (hash_search(pendingOpsTable, &entry, HASH_ENTER, NULL) != NULL)
+ return true; /* request is recorded locally; mdsync will fsync it */
+ /* out of memory: fall through to do it locally */
+ }
+ else
+ {
+ if (ForwardFsyncRequest(reln->smgr_rnode, seg->mdfd_segno))
+ return true; /* request was handed to the bgwriter */
+ }
+
+ if (FileSync(seg->mdfd_vfd) < 0) /* neither queued nor forwarded: fsync right now */
+ return false;
return true;
}
/*
- * mdsync() -- Sync previous writes to stable storage.
+ * RememberFsyncRequest() -- callback from bgwriter side of fsync request
+ *
+ * We stuff the fsync request into the local hash table for execution
+ * during the bgwriter's next checkpoint.
+ *
+ * The calling process must have a pendingOpsTable (this is asserted).
+ * Unlike register_dirty_segment, there is no fallback here: failure to
+ * enter the request is reported as a FATAL out-of-memory error.
*/
-bool
-mdsync(void)
+void
+RememberFsyncRequest(RelFileNode rnode, BlockNumber segno)
{
- sync();
- if (IsUnderPostmaster)
- pg_usleep(2000000L);
- sync();
- return true;
+ PendingOperationEntry entry;
+
+ Assert(pendingOpsTable);
+
+ /* ensure any pad bytes in the struct are zeroed: the whole struct is the hash key */
+ MemSet(&entry, 0, sizeof(entry));
+ entry.rnode = rnode;
+ entry.segno = segno;
+
+ if (hash_search(pendingOpsTable, &entry, HASH_ENTER, NULL) == NULL)
+ ereport(FATAL,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory")));
}
/*
static MdfdVec *
_fdvec_alloc(void)
{
- MdfdVec *v;
-
- v = (MdfdVec *) MemoryContextAlloc(MdCxt, sizeof(MdfdVec));
- v->mdfd_vfd = -1;
-#ifndef LET_OS_MANAGE_FILESIZE
- v->mdfd_chain = NULL;
-#endif
-
- return v;
+ return (MdfdVec *) MemoryContextAlloc(MdCxt, sizeof(MdfdVec)); /* fields are no longer preset here: each caller must fill in mdfd_vfd, mdfd_segno and mdfd_chain */
}
#ifndef LET_OS_MANAGE_FILESIZE
+
/*
* Open the specified segment of the relation,
* and make a MdfdVec object for it. Returns NULL on failure.
char *path,
*fullpath;
- /* be sure we have enough space for the '.segno', if any */
path = relpath(reln->smgr_rnode);
if (segno > 0)
{
+ /* be sure we have enough space for the '.segno' */
fullpath = (char *) palloc(strlen(path) + 12);
sprintf(fullpath, "%s.%u", path, segno);
pfree(path);
/* fill the entry */
v->mdfd_vfd = fd;
+ v->mdfd_segno = segno;
v->mdfd_chain = NULL;
Assert(_mdnblocks(fd, BLCKSZ) <= ((BlockNumber) RELSEG_SIZE));
/* all done */
return v;
}
-#endif
+
+#endif /* LET_OS_MANAGE_FILESIZE */
/*
* _mdfd_getseg() -- Find the segment of the relation holding the
- * specified block. ereport's on failure.
+ * specified block. ereport's on failure.
+ * (Optionally, can return NULL instead of ereport for ENOENT.)
*/
static MdfdVec *
-_mdfd_getseg(SMgrRelation reln, BlockNumber blkno)
+_mdfd_getseg(SMgrRelation reln, BlockNumber blkno, bool allowNotFound)
{
- MdfdVec *v = mdopen(reln);
-
+ MdfdVec *v = mdopen(reln, allowNotFound);
#ifndef LET_OS_MANAGE_FILESIZE
- BlockNumber segno;
- BlockNumber i;
+ BlockNumber segstogo;
+ BlockNumber nextsegno;
- for (segno = blkno / ((BlockNumber) RELSEG_SIZE), i = 1;
- segno > 0;
- i++, segno--)
- {
+ if (!v)
+ return NULL; /* only possible if allowNotFound */
+ for (segstogo = blkno / ((BlockNumber) RELSEG_SIZE), nextsegno = 1;
+ segstogo > 0;
+ nextsegno++, segstogo--)
+ {
if (v->mdfd_chain == NULL)
{
/*
* one new segment per call, so this restriction seems
* reasonable.
*/
- v->mdfd_chain = _mdfd_openseg(reln, i, (segno == 1) ? O_CREAT : 0);
-
+ v->mdfd_chain = _mdfd_openseg(reln,
+ nextsegno,
+ (segstogo == 1) ? O_CREAT : 0);
if (v->mdfd_chain == NULL)
+ {
+ if (allowNotFound && errno == ENOENT)
+ return NULL;
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not open segment %u of relation %u/%u (target block %u): %m",
- i,
+ nextsegno,
reln->smgr_rnode.tblNode,
reln->smgr_rnode.relNode,
blkno)));
+ }
}
v = v->mdfd_chain;
}
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/storage/smgr/smgr.c,v 1.70 2004/02/11 22:55:25 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/storage/smgr/smgr.c,v 1.71 2004/05/31 03:48:06 tgl Exp $
*
*-------------------------------------------------------------------------
*/
static const f_smgr smgrsw[] = {
/* magnetic disk */
{mdinit, NULL, mdclose, mdcreate, mdunlink, mdextend,
- mdread, mdwrite, mdnblocks, mdtruncate, mdcommit, mdabort, mdsync
+ mdread, mdwrite, mdnblocks, mdtruncate, NULL, NULL, mdsync /* the two NULLs take the place of mdcommit/mdabort: md has no commit or abort hook now, and callers skip NULL hooks */
}
};
* Get rid of any leftover buffers for the rel (shouldn't be any in the
* commit case, but there can be in the abort case).
*/
- DropRelFileNodeBuffers(rnode, isTemp);
+ DropRelFileNodeBuffers(rnode, isTemp, 0);
/*
* Tell the free space map to forget this relation. It won't be accessed
if (smgrsw[i].smgr_commit)
{
if (! (*(smgrsw[i].smgr_commit)) ())
- elog(FATAL, "transaction commit failed on %s: %m",
+ elog(ERROR, "transaction commit failed on %s: %m",
DatumGetCString(DirectFunctionCall1(smgrout,
Int16GetDatum(i))));
}
if (smgrsw[i].smgr_abort)
{
if (! (*(smgrsw[i].smgr_abort)) ())
- elog(FATAL, "transaction abort failed on %s: %m",
+ elog(ERROR, "transaction abort failed on %s: %m",
DatumGetCString(DirectFunctionCall1(smgrout,
Int16GetDatum(i))));
}
if (smgrsw[i].smgr_sync)
{
if (! (*(smgrsw[i].smgr_sync)) ())
- elog(PANIC, "storage sync failed on %s: %m",
+ elog(ERROR, "storage sync failed on %s: %m",
DatumGetCString(DirectFunctionCall1(smgrout,
Int16GetDatum(i))));
}
reln = smgropen(xlrec->rnode);
+ /*
+ * First, force bufmgr to drop any buffers it has for the to-be-
+ * truncated blocks. We must do this, else subsequent XLogReadBuffer
+ * operations will not re-extend the file properly.
+ */
+ DropRelFileNodeBuffers(xlrec->rnode, false, xlrec->blkno);
+
/* Can't use smgrtruncate because it would try to xlog */
/*
* Portions Copyright (c) 2003, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $PostgreSQL: pgsql/src/include/access/slru.h,v 1.5 2004/05/28 05:13:17 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/access/slru.h,v 1.6 2004/05/31 03:48:08 tgl Exp $
*/
#ifndef SLRU_H
#define SLRU_H
-#include "access/xlog.h"
+#include "storage/lwlock.h"
-/* exported because lwlock.c needs it */
-#define NUM_CLOG_BUFFERS 8
-/*
- * Note: the separation between SlruLockData and SlruSharedData is purely
- * historical; the structs could be combined.
- */
-typedef struct SlruLockData
-{
- LWLockId ControlLock;
- LWLockId BufferLocks[NUM_CLOG_BUFFERS]; /* Per-buffer I/O locks */
-} SlruLockData;
-typedef SlruLockData *SlruLock;
+/* Opaque structs known only in slru.c */
+typedef struct SlruSharedData *SlruShared;
+typedef struct SlruFlushData *SlruFlush;
/*
* SlruCtlData is an unshared structure that points to the active information
*/
typedef struct SlruCtlData
{
- void *shared; /* pointer to SlruSharedData */
- SlruLock locks;
+ SlruShared shared; /* opaque pointer; struct SlruSharedData is known only in slru.c */
+
+ LWLockId ControlLock; /* held directly here now, replacing the old locks->ControlLock indirection */
/*
- * Dir is set during SimpleLruShmemInit and does not change thereafter.
- * The value is automatically inherited by backends via fork, and
- * doesn't need to be in shared memory.
+ * Dir is set during SimpleLruInit and does not change thereafter.
+ * Since it's always the same, it doesn't need to be in shared memory.
*/
char Dir[MAXPGPATH];
bool (*PagePrecedes) (int, int);
} SlruCtlData;
+
typedef SlruCtlData *SlruCtl;
+
extern int SimpleLruShmemSize(void);
extern void SimpleLruInit(SlruCtl ctl, const char *name, const char *subdir);
extern int SimpleLruZeroPage(SlruCtl ctl, int pageno);
-extern char *SimpleLruReadPage(SlruCtl ctl, int pageno, TransactionId xid, bool forwrite);
-extern void SimpleLruWritePage(SlruCtl ctl, int slotno);
+extern char *SimpleLruReadPage(SlruCtl ctl, int pageno,
+ TransactionId xid, bool forwrite);
+extern void SimpleLruWritePage(SlruCtl ctl, int slotno, SlruFlush fdata);
extern void SimpleLruSetLatestPage(SlruCtl ctl, int pageno);
extern void SimpleLruFlush(SlruCtl ctl, bool checkpoint);
extern void SimpleLruTruncate(SlruCtl ctl, int cutoffPage);
*
* Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
*
- * $PostgreSQL: pgsql/src/include/postmaster/bgwriter.h,v 1.1 2004/05/29 22:48:23 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/postmaster/bgwriter.h,v 1.2 2004/05/31 03:48:09 tgl Exp $
*
*-------------------------------------------------------------------------
*/
#ifndef _BGWRITER_H
#define _BGWRITER_H
+#include "storage/block.h"
+#include "storage/relfilenode.h"
+
+
/* GUC options */
extern int BgWriterDelay;
extern int BgWriterPercent;
extern void RequestCheckpoint(bool waitforit);
+extern bool ForwardFsyncRequest(RelFileNode rnode, BlockNumber segno);
+extern void AbsorbFsyncRequests(void);
+
extern int BgWriterShmemSize(void);
extern void BgWriterShmemInit(void);
* Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $PostgreSQL: pgsql/src/include/storage/bufmgr.h,v 1.80 2004/05/29 22:48:23 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/storage/bufmgr.h,v 1.81 2004/05/31 03:48:10 tgl Exp $
*
*-------------------------------------------------------------------------
*/
extern void RelationTruncate(Relation rel, BlockNumber nblocks);
extern int FlushRelationBuffers(Relation rel, BlockNumber firstDelBlock);
extern void DropRelationBuffers(Relation rel);
-extern void DropRelFileNodeBuffers(RelFileNode rnode, bool istemp);
+extern void DropRelFileNodeBuffers(RelFileNode rnode, bool istemp,
+ BlockNumber firstDelBlock);
extern void DropBuffers(Oid dbid);
#ifdef NOT_USED
* Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $PostgreSQL: pgsql/src/include/storage/fd.h,v 1.44 2004/02/23 23:03:10 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/storage/fd.h,v 1.45 2004/05/31 03:48:10 tgl Exp $
*
*-------------------------------------------------------------------------
*/
/*
* calls:
*
- * File {Close, Read, Write, Seek, Tell, MarkDirty, Sync}
+ * File {Close, Read, Write, Seek, Tell, Sync}
* {File Name Open, Allocate, Free} File
*
* These are NOT JUST RENAMINGS OF THE UNIX ROUTINES.
extern void FileUnlink(File file);
extern int FileRead(File file, char *buffer, int amount);
extern int FileWrite(File file, char *buffer, int amount);
+extern int FileSync(File file);
extern long FileSeek(File file, long offset, int whence);
extern int FileTruncate(File file, long offset);
* Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $PostgreSQL: pgsql/src/include/storage/lwlock.h,v 1.10 2003/12/20 17:31:21 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/storage/lwlock.h,v 1.11 2004/05/31 03:48:10 tgl Exp $
*
*-------------------------------------------------------------------------
*/
ControlFileLock,
CheckpointLock,
RelCacheInitLock,
+ BgWriterCommLock,
NumFixedLWLocks, /* must be last except for
* MaxDynamicLWLock */
* Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $PostgreSQL: pgsql/src/include/storage/smgr.h,v 1.41 2004/02/11 22:55:26 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/storage/smgr.h,v 1.42 2004/05/31 03:48:10 tgl Exp $
*
*-------------------------------------------------------------------------
*/
extern bool mdwrite(SMgrRelation reln, BlockNumber blocknum, char *buffer);
extern BlockNumber mdnblocks(SMgrRelation reln);
extern BlockNumber mdtruncate(SMgrRelation reln, BlockNumber nblocks);
-extern bool mdcommit(void);
-extern bool mdabort(void);
extern bool mdsync(void);
+extern void RememberFsyncRequest(RelFileNode rnode, BlockNumber segno);
+
/* smgrtype.c */
extern Datum smgrout(PG_FUNCTION_ARGS);
extern Datum smgrin(PG_FUNCTION_ARGS);