*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/commands/tablespace.c,v 1.49 2007/08/01 22:45:08 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/commands/tablespace.c,v 1.50 2007/11/15 20:36:40 tgl Exp $
*
*-------------------------------------------------------------------------
*/
#include "commands/comment.h"
#include "commands/tablespace.h"
#include "miscadmin.h"
+#include "postmaster/bgwriter.h"
#include "storage/fd.h"
#include "utils/acl.h"
#include "utils/builtins.h"
LWLockAcquire(TablespaceCreateLock, LW_EXCLUSIVE);
/*
- * Try to remove the physical infrastructure
+ * Try to remove the physical infrastructure.
*/
if (!remove_tablespace_directories(tablespaceoid, false))
- ereport(ERROR,
- (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
- errmsg("tablespace \"%s\" is not empty",
- tablespacename)));
+ {
+ /*
+ * Not all files deleted? However, there can be lingering empty files
+ * in the directories, left behind by for example DROP TABLE, that
+ * have been scheduled for deletion at next checkpoint (see comments
+ * in mdunlink() for details). We could just delete them immediately,
+ * but we can't tell them apart from important data files that we
+ * mustn't delete. So instead, we force a checkpoint which will clean
+ * out any lingering files, and try again.
+ */
+ RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_FORCE | CHECKPOINT_WAIT);
+ if (!remove_tablespace_directories(tablespaceoid, false))
+ {
+ /* Still not empty, the files must be important then */
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("tablespace \"%s\" is not empty",
+ tablespacename)));
+ }
+ }
/* Record the filesystem change in XLOG */
{
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.129 2007/07/03 14:51:24 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.130 2007/11/15 20:36:40 tgl Exp $
*
*-------------------------------------------------------------------------
*/
/* special values for the segno arg to RememberFsyncRequest */
#define FORGET_RELATION_FSYNC (InvalidBlockNumber)
#define FORGET_DATABASE_FSYNC (InvalidBlockNumber-1)
+#define UNLINK_RELATION_REQUEST (InvalidBlockNumber-2)
/*
* On Windows, we have to interpret EACCES as possibly meaning the same as
* table remembers the pending operations. We use a hash table mostly as
* a convenient way of eliminating duplicate requests.
*
+ * We use a similar mechanism to remember no-longer-needed files that can
+ * be deleted after the next checkpoint, but we use a linked list instead of
+ * a hash table, because we don't expect there to be any duplicate requests.
+ *
* (Regular backends do not track pending operations locally, but forward
* them to the bgwriter.)
*/
CycleCtr cycle_ctr; /* mdsync_cycle_ctr when request was made */
} PendingOperationEntry;
+typedef struct
+{
+ RelFileNode rnode; /* the dead relation whose file is to be unlinked later */
+ CycleCtr cycle_ctr; /* mdckpt_cycle_ctr value when the request was queued */
+} PendingUnlinkEntry;
+
static HTAB *pendingOpsTable = NULL;
+static List *pendingUnlinks = NIL;
static CycleCtr mdsync_cycle_ctr = 0;
+static CycleCtr mdckpt_cycle_ctr = 0;
typedef enum /* behavior for mdopen & _mdfd_getseg */
/* local routines */
static MdfdVec *mdopen(SMgrRelation reln, ExtensionBehavior behavior);
static void register_dirty_segment(SMgrRelation reln, MdfdVec *seg);
+static void register_unlink(RelFileNode rnode);
static MdfdVec *_fdvec_alloc(void);
#ifndef LET_OS_MANAGE_FILESIZE
100L,
&hash_ctl,
HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
+ pendingUnlinks = NIL;
}
}
* Note that we're passed a RelFileNode --- by the time this is called,
* there won't be an SMgrRelation hashtable entry anymore.
*
+ * Actually, we don't unlink the first segment file of the relation, but
+ * just truncate it to zero length, and record a request to unlink it after
+ * the next checkpoint. Additional segments can be unlinked immediately,
+ * however. Leaving the empty file in place prevents that relfilenode
+ * number from being reused. The scenario this protects us from is:
+ * 1. We delete a relation (and commit, and actually remove its file).
+ * 2. We create a new relation, which by chance gets the same relfilenode as
+ * the just-deleted one (OIDs must've wrapped around for that to happen).
+ * 3. We crash before another checkpoint occurs.
+ * During replay, we would delete the file and then recreate it, which is fine
+ * if the contents of the file were repopulated by subsequent WAL entries.
+ * But if we didn't WAL-log insertions, but instead relied on fsyncing the
+ * file after populating it (as for instance CLUSTER and CREATE INDEX do),
+ * the contents of the file would be lost forever. By leaving the empty file
+ * until after the next checkpoint, we prevent reassignment of the relfilenode
+ * number until it's safe, because relfilenode assignment skips over any
+ * existing file.
+ *
* If isRedo is true, it's okay for the relation to be already gone.
- * Also, any failure should be reported as WARNING not ERROR, because
+ * Also, we should remove the file immediately instead of queuing a request
+ * for later, since during redo there's no possibility of creating a
+ * conflicting relation.
+ *
+ * Note: any failure should be reported as WARNING not ERROR, because
* we are usually not in a transaction anymore when this is called.
*/
void
mdunlink(RelFileNode rnode, bool isRedo)
{
char *path;
+ int ret;
/*
* We have to clean out any pending fsync requests for the doomed relation,
path = relpath(rnode);
- /* Delete the first segment, or only segment if not doing segmenting */
- if (unlink(path) < 0)
+ /*
+ * Delete or truncate the first segment, or only segment if not doing
+ * segmenting
+ */
+ if (isRedo)
+ ret = unlink(path);
+ else
+ ret = truncate(path, 0);
+ if (ret < 0)
{
if (!isRedo || errno != ENOENT)
ereport(WARNING,
#endif
pfree(path);
+
+ /* Register request to unlink first segment later */
+ if (!isRedo)
+ register_unlink(rnode);
}
/*
mdsync_in_progress = false;
}
+/*
+ *	mdpreckpt() -- Pre-checkpoint bookkeeping for pending unlinks
+ *
+ * Unlink requests that were queued before this checkpoint began have to be
+ * told apart from those that show up while it is running.  As with fsync
+ * requests, a cycle counter does that job, and this is where it advances.
+ *
+ * Call this *before* the checkpoint REDO point is determined; that is what
+ * guarantees no file gets deleted too soon.
+ *
+ * Nothing done here may rely on the checkpoint actually completing.
+ */
+void
+mdpreckpt(void)
+{
+	ListCell   *lc;
+
+	/*
+	 * The previous checkpoint may not have completed, leaving entries that
+	 * carry an older stamp.  Whatever is already queued when a checkpoint
+	 * starts is safe to delete once that checkpoint finishes, no matter
+	 * when it was requested, so give every entry the current counter value.
+	 */
+	foreach(lc, pendingUnlinks)
+		((PendingUnlinkEntry *) lfirst(lc))->cycle_ctr = mdckpt_cycle_ctr;
+
+	/*
+	 * Advance the counter: requests queued from now on get the new value
+	 * and are held back until the checkpoint after this one.
+	 */
+	mdckpt_cycle_ctr += 1;
+}
+
+/*
+ *	mdpostckpt() -- Do post-checkpoint work
+ *
+ * Remove any lingering files that can now be safely removed: every entry
+ * of pendingUnlinks that was queued before the latest mdpreckpt() call.
+ * Entries queued since then carry the current mdckpt_cycle_ctr and stay
+ * in the list until a later checkpoint.
+ *
+ * Failure to unlink a file is reported as a WARNING; the entry is dropped
+ * from the list either way.
+ */
+void
+mdpostckpt(void)
+{
+	while (pendingUnlinks != NIL)
+	{
+		PendingUnlinkEntry *entry = (PendingUnlinkEntry *) linitial(pendingUnlinks);
+		char	   *path;
+
+		/*
+		 * New entries are appended to the end, so if the entry is new
+		 * we've reached the end of old entries.  "New" must be judged
+		 * against mdckpt_cycle_ctr, the counter unlink requests are
+		 * stamped with; mdsync_cycle_ctr belongs to the fsync machinery
+		 * and says nothing about unlink requests.
+		 */
+		if (entry->cycle_ctr == mdckpt_cycle_ctr)
+			break;
+
+		/* Else assert we haven't missed it */
+		Assert((CycleCtr) (entry->cycle_ctr + 1) == mdckpt_cycle_ctr);
+
+		/* Unlink the file */
+		path = relpath(entry->rnode);
+		if (unlink(path) < 0)
+		{
+			/*
+			 * ENOENT shouldn't happen, but it doesn't really matter
+			 * because we would've deleted the file now anyway.
+			 */
+			if (errno != ENOENT)
+				ereport(WARNING,
+						(errcode_for_file_access(),
+						 errmsg("could not remove relation %u/%u/%u: %m",
+								entry->rnode.spcNode,
+								entry->rnode.dbNode,
+								entry->rnode.relNode)));
+		}
+		pfree(path);
+
+		/* Done with this request, whether or not the unlink succeeded */
+		pendingUnlinks = list_delete_first(pendingUnlinks);
+		pfree(entry);
+	}
+}
+
/*
* register_dirty_segment() -- Mark a relation segment as needing fsync
*
}
}
+/*
+ * register_unlink() -- Queue a file for deletion after the next checkpoint
+ *
+ * Like register_dirty_segment, the request lands either in this process's
+ * own pending-ops machinery or, when there is none, in the bgwriter's.
+ */
+static void
+register_unlink(RelFileNode rnode)
+{
+	/* We keep pending operations ourselves: record the request locally */
+	if (pendingOpsTable != NULL)
+	{
+		RememberFsyncRequest(rnode, UNLINK_RELATION_REQUEST);
+		return;
+	}
+
+	/*
+	 * Otherwise hand the request to the bgwriter.  Deleting the file right
+	 * away is not an option, so if the request message cannot be queued we
+	 * sleep and retry until it can.  Ugly, but hopefully rare.
+	 *
+	 * XXX should we just leave the file orphaned instead?
+	 */
+	Assert(IsUnderPostmaster);
+	for (;;)
+	{
+		if (ForwardFsyncRequest(rnode, UNLINK_RELATION_REQUEST))
+			break;
+		pg_usleep(10000L);		/* 10 msec seems a good number */
+	}
+}
+
/*
* RememberFsyncRequest() -- callback from bgwriter side of fsync request
*
- * We stuff the fsync request into the local hash table for execution
- * during the bgwriter's next checkpoint.
+ * We stuff most fsync requests into the local hash table for execution
+ * during the bgwriter's next checkpoint. UNLINK requests go into a
+ * separate linked list, however, because they get processed separately.
*
* The range of possible segment numbers is way less than the range of
* BlockNumber, so we can reserve high values of segno for special purposes.
- * We define two: FORGET_RELATION_FSYNC means to cancel pending fsyncs for
- * a relation, and FORGET_DATABASE_FSYNC means to cancel pending fsyncs for
- * a whole database. (These are a tad slow because the hash table has to be
- * searched linearly, but it doesn't seem worth rethinking the table structure
- * for them.)
+ * We define three:
+ * - FORGET_RELATION_FSYNC means to cancel pending fsyncs for a relation
+ * - FORGET_DATABASE_FSYNC means to cancel pending fsyncs for a whole database
+ * - UNLINK_RELATION_REQUEST is a request to delete the file after the next
+ * checkpoint.
+ *
+ * (Handling the FORGET_* requests is a tad slow because the hash table has
+ * to be searched linearly, but it doesn't seem worth rethinking the table
+ * structure for them.)
*/
void
RememberFsyncRequest(RelFileNode rnode, BlockNumber segno)
}
}
}
+ else if (segno == UNLINK_RELATION_REQUEST)
+ {
+ /* Unlink request: put it in the linked list */
+ MemoryContext oldcxt = MemoryContextSwitchTo(MdCxt);
+ PendingUnlinkEntry *entry;
+
+ entry = palloc(sizeof(PendingUnlinkEntry));
+ entry->rnode = rnode;
+ entry->cycle_ctr = mdckpt_cycle_ctr;
+
+ pendingUnlinks = lappend(pendingUnlinks, entry);
+
+ MemoryContextSwitchTo(oldcxt);
+ }
else
{
/* Normal case: enter a request to fsync this segment */
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/storage/smgr/smgr.c,v 1.106 2007/09/05 18:10:48 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/storage/smgr/smgr.c,v 1.107 2007/11/15 20:36:40 tgl Exp $
*
*-------------------------------------------------------------------------
*/
void (*smgr_truncate) (SMgrRelation reln, BlockNumber nblocks,
bool isTemp);
void (*smgr_immedsync) (SMgrRelation reln);
- void (*smgr_commit) (void); /* may be NULL */
- void (*smgr_abort) (void); /* may be NULL */
- void (*smgr_sync) (void); /* may be NULL */
+ void (*smgr_commit) (void); /* may be NULL */
+ void (*smgr_abort) (void); /* may be NULL */
+ void (*smgr_pre_ckpt) (void); /* may be NULL */
+ void (*smgr_sync) (void); /* may be NULL */
+ void (*smgr_post_ckpt) (void); /* may be NULL */
} f_smgr;
/* magnetic disk */
{mdinit, NULL, mdclose, mdcreate, mdunlink, mdextend,
mdread, mdwrite, mdnblocks, mdtruncate, mdimmedsync,
- NULL, NULL, mdsync
+ NULL, NULL, mdpreckpt, mdsync, mdpostckpt
}
};
}
/*
- * smgrsync() -- Sync files to disk at checkpoint time.
+ * smgrpreckpt() -- Prepare for checkpoint.
+ */
+void
+smgrpreckpt(void)
+{
+	int			which;
+
+	/* Run the pre-checkpoint hook of each storage manager that has one */
+	for (which = 0; which < NSmgr; which++)
+	{
+		if (smgrsw[which].smgr_pre_ckpt == NULL)
+			continue;			/* hook is optional */
+
+		(*(smgrsw[which].smgr_pre_ckpt)) ();
+	}
+}
+
+/*
+ * smgrsync() -- Sync files to disk during checkpoint.
*/
void
smgrsync(void)
}
}
+/*
+ *	smgrpostckpt() -- Post-checkpoint cleanup.
+ *
+ * Invokes the smgr_post_ckpt hook of every storage manager in smgrsw[]
+ * that provides one; managers with a NULL hook are skipped.
+ */
+void
+smgrpostckpt(void)
+{
+	int			which;
+
+	for (which = 0; which < NSmgr; which++)
+	{
+		if (smgrsw[which].smgr_post_ckpt == NULL)
+			continue;			/* hook is optional */
+
+		(*(smgrsw[which].smgr_post_ckpt)) ();
+	}
+}
+
void
smgr_redo(XLogRecPtr lsn, XLogRecord *record)