*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.125 2007/01/05 22:19:38 momjian Exp $
+ * $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.126 2007/01/17 00:17:21 tgl Exp $
*
*-------------------------------------------------------------------------
*/
/* interval for calling AbsorbFsyncRequests in mdsync */
#define FSYNCS_PER_ABSORB 10
+/*
+ * On Windows, we have to interpret EACCES as possibly meaning the same as
+ * ENOENT, because if a file is unlinked-but-not-yet-gone on that platform,
+ * that's what you get. Ugh. This code is designed so that we don't
+ * actually believe these cases are okay without further evidence (namely,
+ * a pending fsync request getting revoked ... see mdsync).
+ *
+ * FILE_POSSIBLY_DELETED(err) takes an errno value and is true when that
+ * value could mean the file no longer exists: ENOENT everywhere, and
+ * additionally EACCES when WIN32 is defined.
+ */
+#ifndef WIN32
+#define FILE_POSSIBLY_DELETED(err) ((err) == ENOENT)
+#else
+#define FILE_POSSIBLY_DELETED(err) ((err) == ENOENT || (err) == EACCES)
+#endif
+
/*
* The magnetic disk storage manager keeps track of open file
* descriptors in its own descriptor pool. This is done to make it
* we keep track of pending fsync operations: we need to remember all relation
* segments that have been written since the last checkpoint, so that we can
* fsync them down to disk before completing the next checkpoint. This hash
- * table remembers the pending operations. We use a hash table not because
- * we want to look up individual operations, but simply as a convenient way
- * of eliminating duplicate requests.
+ * table remembers the pending operations. We use a hash table mostly as
+ * a convenient way of eliminating duplicate requests.
*
* (Regular backends do not track pending operations locally, but forward
* them to the bgwriter.)
{
RelFileNode rnode; /* the targeted relation */
BlockNumber segno; /* which segment */
+} PendingOperationTag;
+
+typedef struct
+{
+ PendingOperationTag tag; /* hash table key (must be first!) */
+ int failures; /* number of failed attempts to fsync */
} PendingOperationEntry;
static HTAB *pendingOpsTable = NULL;
HASHCTL hash_ctl;
MemSet(&hash_ctl, 0, sizeof(hash_ctl));
- hash_ctl.keysize = sizeof(PendingOperationEntry);
+ hash_ctl.keysize = sizeof(PendingOperationTag);
hash_ctl.entrysize = sizeof(PendingOperationEntry);
hash_ctl.hash = tag_hash;
hash_ctl.hcxt = MdCxt;
{
char *path;
+ /*
+ * We have to clean out any pending fsync requests for the doomed relation,
+ * else the next mdsync() will fail.
+ */
+ if (pendingOpsTable)
+ {
+ /* standalone backend or startup process: fsync state is local */
+ RememberFsyncRequest(rnode, InvalidBlockNumber);
+ }
+ else if (IsUnderPostmaster)
+ {
+ /*
+ * Notify the bgwriter about it. If we fail to queue the revoke
+ * message, we have to sleep and try again ... ugly, but hopefully
+ * won't happen often.
+ *
+ * XXX should we CHECK_FOR_INTERRUPTS in this loop? Escaping with
+ * an error would leave the no-longer-used file still present on
+ * disk, which would be bad, so I'm inclined to assume that the
+ * bgwriter will always empty the queue soon.
+ */
+ while (!ForwardFsyncRequest(rnode, InvalidBlockNumber))
+ pg_usleep(10000L); /* 10 msec seems a good number */
+ /*
+ * Note we don't wait for the bgwriter to actually absorb the
+ * revoke message; see mdsync() for the implications.
+ */
+ }
+
path = relpath(rnode);
/* Delete the first segment, or only segment if not doing segmenting */
if (fd < 0)
{
pfree(path);
- if (behavior == EXTENSION_RETURN_NULL && errno == ENOENT)
+ if (behavior == EXTENSION_RETURN_NULL &&
+ FILE_POSSIBLY_DELETED(errno))
return NULL;
ereport(ERROR,
(errcode_for_file_access(),
void
mdsync(void)
{
- HASH_SEQ_STATUS hstat;
- PendingOperationEntry *entry;
- int absorb_counter;
+ bool need_retry;
if (!pendingOpsTable)
elog(ERROR, "cannot sync without a pendingOpsTable");
/*
- * If we are in the bgwriter, the sync had better include all fsync
- * requests that were queued by backends before the checkpoint REDO point
- * was determined. We go that a little better by accepting all requests
- * queued up to the point where we start fsync'ing.
+ * The fsync table could contain requests to fsync relations that have
+ * been deleted (unlinked) by the time we get to them. Rather than
+ * just hoping an ENOENT (or EACCES on Windows) error can be ignored,
+ * what we will do is retry the whole process after absorbing fsync
+ * request messages again. Since mdunlink() queues a "revoke" message
+ * before actually unlinking, the fsync request is guaranteed to be gone
+ * the second time if it really was this case.
+ *
+ * need_retry records whether the pass below has to be repeated: it is
+ * cleared at the start of each pass, and the do-loop keeps running until
+ * a pass finishes without setting it again.
*/
- AbsorbFsyncRequests();
+ do {
+ HASH_SEQ_STATUS hstat;
+ PendingOperationEntry *entry;
+ int absorb_counter;
+
+ need_retry = false;
- absorb_counter = FSYNCS_PER_ABSORB;
- hash_seq_init(&hstat, pendingOpsTable);
- while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
- {
/*
- * If fsync is off then we don't have to bother opening the file at
- * all. (We delay checking until this point so that changing fsync on
- * the fly behaves sensibly.)
+ * If we are in the bgwriter, the sync had better include all fsync
+ * requests that were queued by backends before the checkpoint REDO
+ * point was determined. We go that a little better by accepting all
+ * requests queued up to the point where we start fsync'ing.
*/
- if (enableFsync)
- {
- SMgrRelation reln;
- MdfdVec *seg;
+ AbsorbFsyncRequests();
+ absorb_counter = FSYNCS_PER_ABSORB;
+ hash_seq_init(&hstat, pendingOpsTable);
+ while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
+ {
/*
- * If in bgwriter, absorb pending requests every so often to
- * prevent overflow of the fsync request queue. The hashtable
- * code does not specify whether entries added by this will be
- * visited by our search, but we don't really care: it's OK if we
- * do, and OK if we don't.
+ * If fsync is off then we don't have to bother opening the file
+ * at all. (We delay checking until this point so that changing
+ * fsync on the fly behaves sensibly.)
*/
- if (--absorb_counter <= 0)
+ if (enableFsync)
{
- AbsorbFsyncRequests();
- absorb_counter = FSYNCS_PER_ABSORB;
- }
-
- /*
- * Find or create an smgr hash entry for this relation. This may
- * seem a bit unclean -- md calling smgr? But it's really the
- * best solution. It ensures that the open file reference isn't
- * permanently leaked if we get an error here. (You may say "but
- * an unreferenced SMgrRelation is still a leak!" Not really,
- * because the only case in which a checkpoint is done by a
- * process that isn't about to shut down is in the bgwriter, and
- * it will periodically do smgrcloseall(). This fact justifies
- * our not closing the reln in the success path either, which is a
- * good thing since in non-bgwriter cases we couldn't safely do
- * that.) Furthermore, in many cases the relation will have been
- * dirtied through this same smgr relation, and so we can save a
- * file open/close cycle.
- */
- reln = smgropen(entry->rnode);
+ SMgrRelation reln;
+ MdfdVec *seg;
+
+ /*
+ * If in bgwriter, we want to absorb pending requests every so
+ * often to prevent overflow of the fsync request queue. This
+ * could result in deleting the current entry out from under
+ * our hashtable scan, so the procedure is to fall out of the
+ * scan and start over from the top of the function.
+ *
+ * Starting over means another trip through the do-loop,
+ * which calls AbsorbFsyncRequests() and resets absorb_counter
+ * to FSYNCS_PER_ABSORB before beginning a fresh scan.
+ */
+ if (--absorb_counter <= 0)
+ {
+ need_retry = true;
+ break;
+ }
- /*
- * It is possible that the relation has been dropped or truncated
- * since the fsync request was entered. Therefore, we have to
- * allow file-not-found errors. This applies both during
- * _mdfd_getseg() and during FileSync, since fd.c might have
- * closed the file behind our back.
- */
- seg = _mdfd_getseg(reln,
- entry->segno * ((BlockNumber) RELSEG_SIZE),
- false, EXTENSION_RETURN_NULL);
- if (seg)
- {
- if (FileSync(seg->mdfd_vfd) < 0 &&
- errno != ENOENT)
- ereport(ERROR,
- (errcode_for_file_access(),
- errmsg("could not fsync segment %u of relation %u/%u/%u: %m",
- entry->segno,
- entry->rnode.spcNode,
- entry->rnode.dbNode,
- entry->rnode.relNode)));
+ /*
+ * Find or create an smgr hash entry for this relation. This
+ * may seem a bit unclean -- md calling smgr? But it's really
+ * the best solution. It ensures that the open file reference
+ * isn't permanently leaked if we get an error here. (You may
+ * say "but an unreferenced SMgrRelation is still a leak!" Not
+ * really, because the only case in which a checkpoint is done
+ * by a process that isn't about to shut down is in the
+ * bgwriter, and it will periodically do smgrcloseall(). This
+ * fact justifies our not closing the reln in the success path
+ * either, which is a good thing since in non-bgwriter cases
+ * we couldn't safely do that.) Furthermore, in many cases
+ * the relation will have been dirtied through this same smgr
+ * relation, and so we can save a file open/close cycle.
+ */
+ reln = smgropen(entry->tag.rnode);
+
+ /*
+ * It is possible that the relation has been dropped or
+ * truncated since the fsync request was entered. Therefore,
+ * allow ENOENT, but only if we didn't fail once already on
+ * this file. This applies both during _mdfd_getseg() and
+ * during FileSync, since fd.c might have closed the file
+ * behind our back.
+ */
+ seg = _mdfd_getseg(reln,
+ entry->tag.segno * ((BlockNumber) RELSEG_SIZE),
+ false, EXTENSION_RETURN_NULL);
+ if (seg == NULL ||
+ FileSync(seg->mdfd_vfd) < 0)
+ {
+ /*
+ * XXX is there any point in allowing more than one try?
+ * Don't see one at the moment, but easy to change the
+ * test here if so.
+ *
+ * As written: the first failure whose errno satisfies
+ * FILE_POSSIBLY_DELETED is logged at DEBUG1, and the
+ * entry is kept so that the retry pass sees it again
+ * unless a revoke has removed it meanwhile. Any other
+ * errno, or a second failure on the same entry, is
+ * reported at ERROR level.
+ */
+ if (!FILE_POSSIBLY_DELETED(errno) ||
+ ++(entry->failures) > 1)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not fsync segment %u of relation %u/%u/%u: %m",
+ entry->tag.segno,
+ entry->tag.rnode.spcNode,
+ entry->tag.rnode.dbNode,
+ entry->tag.rnode.relNode)));
+ else
+ ereport(DEBUG1,
+ (errcode_for_file_access(),
+ errmsg("could not fsync segment %u of relation %u/%u/%u, but retrying: %m",
+ entry->tag.segno,
+ entry->tag.rnode.spcNode,
+ entry->tag.rnode.dbNode,
+ entry->tag.rnode.relNode)));
+ need_retry = true;
+ continue; /* don't delete the hashtable entry */
+ }
}
- }
- /* Okay, delete this entry */
- if (hash_search(pendingOpsTable, entry,
- HASH_REMOVE, NULL) == NULL)
- elog(ERROR, "pendingOpsTable corrupted");
- }
+ /* Okay, delete this entry */
+ if (hash_search(pendingOpsTable, &entry->tag,
+ HASH_REMOVE, NULL) == NULL)
+ elog(ERROR, "pendingOpsTable corrupted");
+ }
+ } while (need_retry);
}
/*
{
if (pendingOpsTable)
{
- PendingOperationEntry entry;
-
- /* ensure any pad bytes in the struct are zeroed */
- MemSet(&entry, 0, sizeof(entry));
- entry.rnode = reln->smgr_rnode;
- entry.segno = seg->mdfd_segno;
-
- (void) hash_search(pendingOpsTable, &entry, HASH_ENTER, NULL);
+ /* push it into local pending-ops table */
+ RememberFsyncRequest(reln->smgr_rnode, seg->mdfd_segno);
}
else
{
*
* We stuff the fsync request into the local hash table for execution
* during the bgwriter's next checkpoint.
+ *
+ * segno == InvalidBlockNumber is a "revoke" request: remove any pending
+ * fsync requests for the whole relation.
*/
void
RememberFsyncRequest(RelFileNode rnode, BlockNumber segno)
{
- PendingOperationEntry entry;
-
Assert(pendingOpsTable);
- /* ensure any pad bytes in the struct are zeroed */
- MemSet(&entry, 0, sizeof(entry));
- entry.rnode = rnode;
- entry.segno = segno;
+ if (segno != InvalidBlockNumber)
+ {
+ /*
+ * Enter a request to fsync this segment. If an entry for this
+ * (rnode, segno) pair is already present it is reused, and its
+ * failure count is left as it was.
+ */
+ PendingOperationTag key;
+ PendingOperationEntry *entry;
+ bool found;
+
+ /* ensure any pad bytes in the hash key are zeroed */
+ MemSet(&key, 0, sizeof(key));
+ key.rnode = rnode;
+ key.segno = segno;
+
+ entry = (PendingOperationEntry *) hash_search(pendingOpsTable,
+ &key,
+ HASH_ENTER,
+ &found);
+ if (!found) /* new entry, so initialize it */
+ entry->failures = 0;
+ }
+ else
+ {
+ /*
+ * Remove any pending requests for the entire relation. (This is a
+ * tad slow but it doesn't seem worth rethinking the table structure.)
+ *
+ * The whole table is scanned; every entry whose tag.rnode equals
+ * rnode is removed regardless of its segment number, and entries
+ * for other relations are left in place.
+ */
+ HASH_SEQ_STATUS hstat;
+ PendingOperationEntry *entry;
- (void) hash_search(pendingOpsTable, &entry, HASH_ENTER, NULL);
+ hash_seq_init(&hstat, pendingOpsTable);
+ while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
+ {
+ if (RelFileNodeEquals(entry->tag.rnode, rnode))
+ {
+ /* Okay, delete this entry */
+ if (hash_search(pendingOpsTable, &entry->tag,
+ HASH_REMOVE, NULL) == NULL)
+ elog(ERROR, "pendingOpsTable corrupted");
+ }
+ }
+ }
}
/*
}
if (v->mdfd_chain == NULL)
{
- if (behavior == EXTENSION_RETURN_NULL && errno == ENOENT)
+ if (behavior == EXTENSION_RETURN_NULL &&
+ FILE_POSSIBLY_DELETED(errno))
return NULL;
ereport(ERROR,
(errcode_for_file_access(),