* mapped catalogs can only be relocated by operations such as VACUUM FULL
* and CLUSTER, which make no transactionally-significant changes: it must be
* safe for the new file to replace the old, even if the transaction itself
- * aborts. An important factor here is that the indexes and toast table of
+ * aborts. An important factor here is that the indexes and toast table of
* a mapped catalog must also be mapped, so that the rewrites/relocations of
* all these files commit in a single map file update rather than being tied
* to transaction commit.
*
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
#include <unistd.h>
#include "access/xact.h"
+#include "access/xlog.h"
+#include "access/xloginsert.h"
#include "catalog/catalog.h"
#include "catalog/pg_tablespace.h"
#include "catalog/storage.h"
#include "miscadmin.h"
+#include "pgstat.h"
#include "storage/fd.h"
#include "storage/lwlock.h"
#include "utils/inval.h"
-#include "utils/pg_crc.h"
#include "utils/relmapper.h"
/*
* The map file is critical data: we have no automatic method for recovering
* from loss or corruption of it. We use a CRC so that we can detect
- * corruption. To minimize the risk of failed updates, the map file should
+ * corruption. To minimize the risk of failed updates, the map file should
* be kept to no more than one standard-size disk sector (ie 512 bytes),
* and we use overwrite-in-place rather than playing renaming games.
* The struct layout below is designed to occupy exactly 512 bytes, which
* might make filesystem updates a bit more efficient.
*
- * Entries in the mappings[] array are in no particular order. We could
+ * Entries in the mappings[] array are in no particular order. We could
* speed searching by insisting on OID order, but it really shouldn't be
* worth the trouble given the intended size of the mapping sets.
*/
#define RELMAPPER_FILENAME "pg_filenode.map"
-#define RELMAPPER_FILEMAGIC 0x592717 /* version ID value */
+#define RELMAPPER_FILEMAGIC 0x592717 /* version ID value */
-#define MAX_MAPPINGS 62 /* 62 * 8 + 16 = 512 */
+#define MAX_MAPPINGS 62 /* 62 * 8 + 16 = 512 */
typedef struct RelMapping
{
int32 magic; /* always RELMAPPER_FILEMAGIC */
int32 num_mappings; /* number of valid RelMapping entries */
RelMapping mappings[MAX_MAPPINGS];
- int32 crc; /* CRC of all above */
+ pg_crc32c crc; /* CRC of all above */
int32 pad; /* to make the struct size be 512 exactly */
} RelMapFile;
+/*
+ * State for serializing local and shared relmappings for parallel workers
+ * (active states only). See notes on active_* and pending_* updates state.
+ *
+ * Only the two active_* maps are carried. SerializeRelationMap fills this
+ * struct by plain struct assignment, and RestoreRelationMap copies the two
+ * members back into the same-named static variables.
+ */
+typedef struct SerializedActiveRelMaps
+{
+ RelMapFile active_shared_updates; /* copy of static active_shared_updates */
+ RelMapFile active_local_updates; /* copy of static active_local_updates */
+} SerializedActiveRelMaps;
+
/*
* The currently known contents of the shared map file and our database's
- * local map file are stored here. These can be reloaded from disk
+ * local map file are stored here. These can be reloaded from disk
* immediately whenever we receive an update sinval message.
*/
static RelMapFile shared_map;
* they will become active at the next CommandCounterIncrement. This setup
* lets map updates act similarly to updates of pg_class rows, ie, they
* become visible only at the next CommandCounterIncrement boundary.
+ *
+ * Active shared and active local updates are serialized by the parallel
+ * infrastructure, and deserialized within parallel workers.
*/
static RelMapFile active_shared_updates;
static RelMapFile active_local_updates;
/* non-export function prototypes */
static void apply_map_update(RelMapFile *map, Oid relationId, Oid fileNode,
- bool add_okay);
+ bool add_okay);
static void merge_map_updates(RelMapFile *map, const RelMapFile *updates,
- bool add_okay);
+ bool add_okay);
static void load_relmap_file(bool shared);
static void write_relmap_file(bool shared, RelMapFile *newmap,
- bool write_wal, bool send_sinval, bool preserve_files,
- Oid dbid, Oid tsid, const char *dbpath);
+ bool write_wal, bool send_sinval, bool preserve_files,
+ Oid dbid, Oid tsid, const char *dbpath);
static void perform_relmap_update(bool shared, const RelMapFile *updates);
return InvalidOid;
}
+/*
+ * RelationMapFilenodeToOid
+ *
+ * Do the reverse of the normal direction of mapping done in
+ * RelationMapOidToFilenode.
+ *
+ * This is not supposed to be used during normal running but rather for
+ * information purposes when looking at the filesystem or xlog.
+ *
+ * "shared" selects which pair of maps is searched: the shared ones
+ * (active_shared_updates, then shared_map) when true, otherwise the local
+ * ones (active_local_updates, then local_map). Each map is scanned
+ * linearly, and the mapoid of the first entry whose mapfilenode equals
+ * "filenode" is returned. The pending_* update maps are not consulted.
+ *
+ * Returns InvalidOid if no entry matches the filenode; this can easily
+ * happen if the relfilenode doesn't pertain to a mapped relation.
+ */
+Oid
+RelationMapFilenodeToOid(Oid filenode, bool shared)
+{
+ const RelMapFile *map;
+ int32 i;
+
+ /*
+ * If there are active updates, believe those over the main maps: the
+ * active map is always searched first, the main map only as a fallback.
+ */
+ if (shared)
+ {
+ map = &active_shared_updates;
+ for (i = 0; i < map->num_mappings; i++)
+ {
+ if (filenode == map->mappings[i].mapfilenode)
+ return map->mappings[i].mapoid;
+ }
+ map = &shared_map; /* no active hit: fall back to the main shared map */
+ for (i = 0; i < map->num_mappings; i++)
+ {
+ if (filenode == map->mappings[i].mapfilenode)
+ return map->mappings[i].mapoid;
+ }
+ }
+ else
+ {
+ map = &active_local_updates;
+ for (i = 0; i < map->num_mappings; i++)
+ {
+ if (filenode == map->mappings[i].mapfilenode)
+ return map->mappings[i].mapoid;
+ }
+ map = &local_map; /* no active hit: fall back to the main local map */
+ for (i = 0; i < map->num_mappings; i++)
+ {
+ if (filenode == map->mappings[i].mapfilenode)
+ return map->mappings[i].mapoid;
+ }
+ }
+
+ return InvalidOid; /* filenode not found in any of the searched maps */
+}
+
/*
* RelationMapUpdateMap
*
else
{
/*
- * We don't currently support map changes within subtransactions. This
- * could be done with more bookkeeping infrastructure, but it doesn't
- * presently seem worth it.
+ * We don't currently support map changes within subtransactions, or
+ * when in parallel mode. This could be done with more bookkeeping
+ * infrastructure, but it doesn't presently seem worth it.
*/
if (GetCurrentTransactionNestLevel() > 1)
elog(ERROR, "cannot change relation mapping within subtransaction");
+ if (IsInParallelMode())
+ elog(ERROR, "cannot change relation mapping in parallel mode");
+
if (immediate)
{
/* Make it active, but only locally */
* RelationMapRemoveMapping
*
* Remove a relation's entry in the map. This is only allowed for "active"
- * (but not committed) local mappings. We need it so we can back out the
+ * (but not committed) local mappings. We need it so we can back out the
* entry for the transient target file when doing VACUUM FULL/CLUSTER on
* a mapped relation.
*/
* RelationMapInvalidate
*
* This routine is invoked for SI cache flush messages. We must re-read
- * the indicated map file. However, we might receive a SI message in a
+ * the indicated map file. However, we might receive a SI message in a
* process that hasn't yet, and might never, load the mapping files;
* for example the autovacuum launcher, which *must not* try to read
* a local map since it is attached to no particular database.
*
* During commit, this must be called as late as possible before the actual
* transaction commit, so as to minimize the window where the transaction
- * could still roll back after committing map changes. Although nothing
+ * could still roll back after committing map changes. Although nothing
* critically bad happens in such a case, we still would prefer that it
* not happen, since we'd possibly be losing useful updates to the relations'
* pg_class row(s).
*
* During abort, we just have to throw away any pending map changes.
* Normal post-abort cleanup will take care of fixing relcache entries.
+ * Parallel worker commit/abort is handled by resetting active mappings
+ * that may have been received from the leader process. (There should be
+ * no pending updates in parallel workers.)
*/
void
-AtEOXact_RelationMap(bool isCommit)
+AtEOXact_RelationMap(bool isCommit, bool isParallelWorker)
{
- if (isCommit)
+ if (isCommit && !isParallelWorker)
{
/*
* We should not get here with any "pending" updates. (We could
}
else
{
- /* Abort --- drop all local and pending updates */
+ /* Abort or parallel worker --- drop all local and pending updates */
+ Assert(!isParallelWorker || pending_shared_updates.num_mappings == 0);
+ Assert(!isParallelWorker || pending_local_updates.num_mappings == 0);
+
active_shared_updates.num_mappings = 0;
active_local_updates.num_mappings = 0;
pending_shared_updates.num_mappings = 0;
/*
* CheckPointRelationMap
*
- * This is called during a checkpoint. It must ensure that any relation map
+ * This is called during a checkpoint. It must ensure that any relation map
* updates that were WAL-logged before the start of the checkpoint are
* securely flushed to disk and will not need to be replayed later. This
* seems unlikely to be a performance-critical issue, so we use a simple
load_relmap_file(false);
}
+/*
+ * EstimateRelationMapSpace
+ *
+ * Estimate space needed to pass active shared and local relmaps to parallel
+ * workers.
+ *
+ * The result is simply sizeof(SerializedActiveRelMaps): both maps are
+ * passed as whole fixed-size structs, so the amount does not depend on how
+ * many mappings are currently active.
+ */
+Size
+EstimateRelationMapSpace(void)
+{
+ return sizeof(SerializedActiveRelMaps);
+}
+
+/*
+ * SerializeRelationMap
+ *
+ * Serialize active shared and local relmap state for parallel workers.
+ *
+ * "startAddress" points at the destination buffer and "maxSize" is its
+ * size; the size is checked only by an Assert against
+ * EstimateRelationMapSpace(). The two active_* maps are copied into the
+ * buffer by struct assignment. The pending_* maps are not serialized.
+ *
+ * NOTE(review): the buffer is accessed through a SerializedActiveRelMaps
+ * pointer, so it is presumably expected to be suitably aligned; nothing
+ * here verifies that -- confirm against the caller.
+ */
+void
+SerializeRelationMap(Size maxSize, char *startAddress)
+{
+ SerializedActiveRelMaps *relmaps;
+
+ Assert(maxSize >= EstimateRelationMapSpace());
+
+ relmaps = (SerializedActiveRelMaps *) startAddress;
+ relmaps->active_shared_updates = active_shared_updates;
+ relmaps->active_local_updates = active_local_updates;
+}
+
+/*
+ * RestoreRelationMap
+ *
+ * Restore active shared and local relmap state within a parallel worker.
+ *
+ * "startAddress" points at a buffer laid out as SerializedActiveRelMaps
+ * (the layout SerializeRelationMap writes). All four update maps (active
+ * and pending, shared and local) must be empty on entry; otherwise an
+ * ERROR is raised and nothing is overwritten. On success only the two
+ * active_* maps are replaced; the pending_* maps are left untouched.
+ */
+void
+RestoreRelationMap(char *startAddress)
+{
+ SerializedActiveRelMaps *relmaps;
+
+ if (active_shared_updates.num_mappings != 0 ||
+ active_local_updates.num_mappings != 0 ||
+ pending_shared_updates.num_mappings != 0 ||
+ pending_local_updates.num_mappings != 0)
+ elog(ERROR, "parallel worker has existing mappings");
+
+ relmaps = (SerializedActiveRelMaps *) startAddress;
+ active_shared_updates = relmaps->active_shared_updates;
+ active_local_updates = relmaps->active_local_updates;
+}
+
/*
* load_relmap_file -- load data from the shared or local map file
*
{
RelMapFile *map;
char mapfilename[MAXPGPATH];
- pg_crc32 crc;
+ pg_crc32c crc;
int fd;
+ int r;
if (shared)
{
}
/* Read data ... */
- fd = BasicOpenFile(mapfilename, O_RDONLY | PG_BINARY, S_IRUSR | S_IWUSR);
+ fd = OpenTransientFile(mapfilename, O_RDONLY | PG_BINARY);
if (fd < 0)
ereport(FATAL,
(errcode_for_file_access(),
- errmsg("could not open relation mapping file \"%s\": %m",
+ errmsg("could not open file \"%s\": %m",
mapfilename)));
/*
* look, the sinval signaling mechanism will make us re-read it before we
* are able to access any relation that's affected by the change.
*/
- if (read(fd, map, sizeof(RelMapFile)) != sizeof(RelMapFile))
+ pgstat_report_wait_start(WAIT_EVENT_RELATION_MAP_READ);
+ r = read(fd, map, sizeof(RelMapFile));
+ if (r != sizeof(RelMapFile))
+ {
+ if (r < 0)
+ ereport(FATAL,
+ (errcode_for_file_access(),
+ errmsg("could not read file \"%s\": %m", mapfilename)));
+ else
+ ereport(FATAL,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("could not read file \"%s\": read %d of %zu",
+ mapfilename, r, sizeof(RelMapFile))));
+ }
+ pgstat_report_wait_end();
+
+ if (CloseTransientFile(fd) != 0)
ereport(FATAL,
(errcode_for_file_access(),
- errmsg("could not read relation mapping file \"%s\": %m",
+ errmsg("could not close file \"%s\": %m",
mapfilename)));
- close(fd);
-
/* check for correct magic number, etc */
if (map->magic != RELMAPPER_FILEMAGIC ||
map->num_mappings < 0 ||
mapfilename)));
/* verify the CRC */
- INIT_CRC32(crc);
- COMP_CRC32(crc, (char *) map, offsetof(RelMapFile, crc));
- FIN_CRC32(crc);
+ INIT_CRC32C(crc);
+ COMP_CRC32C(crc, (char *) map, offsetof(RelMapFile, crc));
+ FIN_CRC32C(crc);
- if (!EQ_CRC32(crc, map->crc))
+ if (!EQ_CRC32C(crc, map->crc))
ereport(FATAL,
- (errmsg("relation mapping file \"%s\" contains incorrect checksum",
- mapfilename)));
+ (errmsg("relation mapping file \"%s\" contains incorrect checksum",
+ mapfilename)));
}
/*
* The magic number and CRC are automatically updated in *newmap. On
* success, we copy the data to the appropriate permanent static variable.
*
- * If write_wal is TRUE then an appropriate WAL message is emitted.
+ * If write_wal is true then an appropriate WAL message is emitted.
* (It will be false for bootstrap and WAL replay cases.)
*
- * If send_sinval is TRUE then a SI invalidation message is sent.
+ * If send_sinval is true then a SI invalidation message is sent.
* (This should be true except in bootstrap case.)
*
- * If preserve_files is TRUE then the storage manager is warned not to
+ * If preserve_files is true then the storage manager is warned not to
* delete the files listed in the map.
*
* Because this may be called during WAL replay when MyDatabaseId,
* DatabasePath, etc aren't valid, we require the caller to pass in suitable
- * values. The caller is also responsible for being sure no concurrent
+ * values. The caller is also responsible for being sure no concurrent
* map update could be happening.
*/
static void
if (newmap->num_mappings < 0 || newmap->num_mappings > MAX_MAPPINGS)
elog(ERROR, "attempt to write bogus relation mapping");
- INIT_CRC32(newmap->crc);
- COMP_CRC32(newmap->crc, (char *) newmap, offsetof(RelMapFile, crc));
- FIN_CRC32(newmap->crc);
+ INIT_CRC32C(newmap->crc);
+ COMP_CRC32C(newmap->crc, (char *) newmap, offsetof(RelMapFile, crc));
+ FIN_CRC32C(newmap->crc);
/*
* Open the target file. We prefer to do this before entering the
* critical section, so that an open() failure need not force PANIC.
- *
- * Note: since we use BasicOpenFile, we are nominally responsible for
- * ensuring the fd is closed on error. In practice, this isn't important
- * because either an error happens inside the critical section, or we are
- * in bootstrap or WAL replay; so an error past this point is always fatal
- * anyway.
*/
if (shared)
{
realmap = &local_map;
}
- fd = BasicOpenFile(mapfilename,
- O_WRONLY | O_CREAT | PG_BINARY,
- S_IRUSR | S_IWUSR);
+ fd = OpenTransientFile(mapfilename, O_WRONLY | O_CREAT | PG_BINARY);
if (fd < 0)
ereport(ERROR,
(errcode_for_file_access(),
- errmsg("could not open relation mapping file \"%s\": %m",
+ errmsg("could not open file \"%s\": %m",
mapfilename)));
if (write_wal)
{
xl_relmap_update xlrec;
- XLogRecData rdata[2];
XLogRecPtr lsn;
/* now errors are fatal ... */
xlrec.tsid = tsid;
xlrec.nbytes = sizeof(RelMapFile);
- rdata[0].data = (char *) (&xlrec);
- rdata[0].len = MinSizeOfRelmapUpdate;
- rdata[0].buffer = InvalidBuffer;
- rdata[0].next = &(rdata[1]);
- rdata[1].data = (char *) newmap;
- rdata[1].len = sizeof(RelMapFile);
- rdata[1].buffer = InvalidBuffer;
- rdata[1].next = NULL;
+ XLogBeginInsert();
+ XLogRegisterData((char *) (&xlrec), MinSizeOfRelmapUpdate);
+ XLogRegisterData((char *) newmap, sizeof(RelMapFile));
- lsn = XLogInsert(RM_RELMAP_ID, XLOG_RELMAP_UPDATE, rdata);
+ lsn = XLogInsert(RM_RELMAP_ID, XLOG_RELMAP_UPDATE);
/* As always, WAL must hit the disk before the data update does */
XLogFlush(lsn);
}
errno = 0;
+ pgstat_report_wait_start(WAIT_EVENT_RELATION_MAP_WRITE);
if (write(fd, newmap, sizeof(RelMapFile)) != sizeof(RelMapFile))
{
/* if write didn't set errno, assume problem is no disk space */
+ /*
+ * errno was zeroed just before write(), so a nonzero value here is the
+ * real failure reason and must be kept for the %m below; substitute
+ * ENOSPC only for a short write that left errno untouched.
+ */
+ if (errno == 0)
errno = ENOSPC;
ereport(ERROR,
(errcode_for_file_access(),
- errmsg("could not write to relation mapping file \"%s\": %m",
+ errmsg("could not write file \"%s\": %m",
mapfilename)));
}
+ pgstat_report_wait_end();
/*
* We choose to fsync the data to disk before considering the task done.
* issue, but it would complicate checkpointing --- see notes for
* CheckPointRelationMap.
*/
+ pgstat_report_wait_start(WAIT_EVENT_RELATION_MAP_SYNC);
if (pg_fsync(fd) != 0)
- ereport(ERROR,
+ ereport(data_sync_elevel(ERROR),
(errcode_for_file_access(),
- errmsg("could not fsync relation mapping file \"%s\": %m",
+ errmsg("could not fsync file \"%s\": %m",
mapfilename)));
+ pgstat_report_wait_end();
- if (close(fd))
+ if (CloseTransientFile(fd) != 0)
ereport(ERROR,
(errcode_for_file_access(),
- errmsg("could not close relation mapping file \"%s\": %m",
+ errmsg("could not close file \"%s\": %m",
mapfilename)));
/*
/*
* Make sure that the files listed in the map are not deleted if the outer
- * transaction aborts. This had better be within the critical section
+ * transaction aborts. This had better be within the critical section
* too: it's not likely to fail, but if it did, we'd arrive at transaction
* abort with the files still vulnerable. PANICing will leave things in a
* good state on-disk.
rnode.spcNode = tsid;
rnode.dbNode = dbid;
rnode.relNode = newmap->mappings[i].mapfilenode;
- RelationPreserveStorage(rnode);
+ RelationPreserveStorage(rnode, false);
}
}
else
memcpy(&newmap, &local_map, sizeof(RelMapFile));
- /* Apply the updates to newmap. No new mappings should appear. */
- merge_map_updates(&newmap, updates, false);
+ /*
+ * Apply the updates to newmap. No new mappings should appear, unless
+ * somebody is adding indexes to system catalogs.
+ */
+ merge_map_updates(&newmap, updates, allowSystemTableMods);
/* Write out the updated map and do other necessary tasks */
write_relmap_file(shared, &newmap, true, true, true,
* RELMAP resource manager's routines
*/
void
-relmap_redo(XLogRecPtr lsn, XLogRecord *record)
+relmap_redo(XLogReaderState *record)
{
- uint8 info = record->xl_info & ~XLR_INFO_MASK;
+ uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
/* Backup blocks are not used in relmap records */
- Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));
+ Assert(!XLogRecHasAnyBlockRefs(record));
if (info == XLOG_RELMAP_UPDATE)
{
else
elog(PANIC, "relmap_redo: unknown op code %u", info);
}
-
-void
-relmap_desc(StringInfo buf, uint8 xl_info, char *rec)
-{
- uint8 info = xl_info & ~XLR_INFO_MASK;
-
- if (info == XLOG_RELMAP_UPDATE)
- {
- xl_relmap_update *xlrec = (xl_relmap_update *) rec;
-
- appendStringInfo(buf, "update relmap: database %u tablespace %u size %u",
- xlrec->dbid, xlrec->tsid, xlrec->nbytes);
- }
- else
- appendStringInfo(buf, "UNKNOWN");
-}