]> granicus.if.org Git - postgresql/commitdiff
Fix bugs in vacuum of shared rels, by keeping their relcache entries current.
authorAndres Freund <andres@anarazel.de>
Tue, 12 Jun 2018 18:13:21 +0000 (11:13 -0700)
committerAndres Freund <andres@anarazel.de>
Tue, 12 Jun 2018 18:13:21 +0000 (11:13 -0700)
When vacuum processes a relation it uses the corresponding relcache
entry's relfrozenxid / relminmxid as a cutoff for when to remove
tuples etc. Unfortunately for nailed relations (i.e. critical system
catalogs) bugs could frequently lead to the corresponding relcache
entry being stale.

This set of bugs could cause actual data corruption as vacuum would
potentially not remove the correct row versions, potentially reviving
them at a later point.  After 699bf7d05c some corruptions in this vein
were prevented, but the additional error checks could also trigger
spuriously. Examples of such errors are:
  ERROR: found xmin ... from before relfrozenxid ...
and
  ERROR: found multixact ... from before relminmxid ...
To be caused by this bug the errors have to occur on system catalog
tables.

The two bugs are:

1) Invalidations for nailed relations were ignored, based on the
   theory that the relcache entry for such tables doesn't
   change. Which is largely true, except for fields like relfrozenxid
   etc.  This means that changes to relations vacuumed in other
   sessions weren't picked up by already existing sessions.  Luckily
   autovacuum doesn't have particularly longrunning sessions.

2) For shared *and* nailed relations, the shared relcache init file
   was never invalidated while running.  That means that for such
   tables (e.g. pg_authid, pg_database) it's not just already existing
   sessions that are affected, but even new connections are as well.
   That explains why the reports usually were about pg_authid et. al.

To fix 1), revalidate the rd_rel portion of a relcache entry when
invalid. This implies a bit of extra complexity to deal with
bootstrapping, but it's not too bad.  The fix for 2) is simpler,
simply always remove both the shared and local init files.

Author: Andres Freund
Reviewed-By: Alvaro Herrera
Discussion:
    https://postgr.es/m/20180525203736.crkbg36muzxrjj5e@alap3.anarazel.de
    https://postgr.es/m/CAMa1XUhKSJd98JW4o9StWPrfS=11bPgG+_GDMxe25TvUY4Sugg@mail.gmail.com
    https://postgr.es/m/CAKMFJucqbuoDRfxPDX39WhA3vJyxweRg_zDVXzncr6+5wOguWA@mail.gmail.com
    https://postgr.es/m/CAGewt-ujGpMLQ09gXcUFMZaZsGJC98VXHEFbF-tpPB0fB13K+A@mail.gmail.com
Backpatch: 9.3-

src/backend/utils/cache/inval.c
src/backend/utils/cache/relcache.c
src/include/storage/standbydefs.h

index d0e54b85352bf1fed3eac57f447fb12eb9527260..2226b325720e57503c9ab295a0e5f19f12ceec80 100644 (file)
@@ -521,12 +521,12 @@ RegisterRelcacheInvalidation(Oid dbId, Oid relId)
        (void) GetCurrentCommandId(true);
 
        /*
-        * If the relation being invalidated is one of those cached in the local
-        * relcache init file, mark that we need to zap that file at commit. Same
-        * is true when we are invalidating whole relcache.
+        * If the relation being invalidated is one of those cached in a relcache
+        * init file, mark that we need to zap that file at commit. For simplicity
+        * invalidations for a specific database always invalidate the shared file
+        * as well.  Also zap when we are invalidating whole relcache.
         */
-       if (OidIsValid(dbId) &&
-               (RelationIdIsInInitFile(relId) || relId == InvalidOid))
+       if (relId == InvalidOid || RelationIdIsInInitFile(relId))
                transInvalInfo->RelcacheInitFileInval = true;
 }
 
@@ -881,18 +881,26 @@ ProcessCommittedInvalidationMessages(SharedInvalidationMessage *msgs,
 
        if (RelcacheInitFileInval)
        {
+               elog(trace_recovery(DEBUG4), "removing relcache init files for database %u",
+                        dbid);
+
                /*
-                * RelationCacheInitFilePreInvalidate requires DatabasePath to be set,
-                * but we should not use SetDatabasePath during recovery, since it is
+                * RelationCacheInitFilePreInvalidate, when the invalidation message
+                * is for a specific database, requires DatabasePath to be set, but we
+                * should not use SetDatabasePath during recovery, since it is
                 * intended to be used only once by normal backends.  Hence, a quick
                 * hack: set DatabasePath directly then unset after use.
                 */
-               DatabasePath = GetDatabasePath(dbid, tsid);
-               elog(trace_recovery(DEBUG4), "removing relcache init file in \"%s\"",
-                        DatabasePath);
+               if (OidIsValid(dbid))
+                       DatabasePath = GetDatabasePath(dbid, tsid);
+
                RelationCacheInitFilePreInvalidate();
-               pfree(DatabasePath);
-               DatabasePath = NULL;
+
+               if (OidIsValid(dbid))
+               {
+                       pfree(DatabasePath);
+                       DatabasePath = NULL;
+               }
        }
 
        SendSharedInvalidMessages(msgs, nmsgs);
index c48ec20eee32e320642c9929159ac9b24544a2f0..d58237f5a1561c3635cd7a76e3717cbc5ef722e2 100644 (file)
@@ -248,6 +248,7 @@ static void RelationDestroyRelation(Relation relation, bool remember_tupdesc);
 static void RelationClearRelation(Relation relation, bool rebuild);
 
 static void RelationReloadIndexInfo(Relation relation);
+static void RelationReloadNailed(Relation relation);
 static void RelationFlushRelation(Relation relation);
 static void RememberToFreeTupleDescAtEOX(TupleDesc td);
 static void AtEOXact_cleanup(Relation relation, bool isCommit);
@@ -285,7 +286,7 @@ static void IndexSupportInitialize(oidvector *indclass,
 static OpClassCacheEnt *LookupOpclassInfo(Oid operatorClassOid,
                                  StrategyNumber numSupport);
 static void RelationCacheInitFileRemoveInDir(const char *tblspcpath);
-static void unlink_initfile(const char *initfilename);
+static void unlink_initfile(const char *initfilename, int elevel);
 static bool equalPartitionDescs(PartitionKey key, PartitionDesc partdesc1,
                                        PartitionDesc partdesc2);
 
@@ -2052,7 +2053,16 @@ RelationIdGetRelation(Oid relationId)
                                RelationReloadIndexInfo(rd);
                        else
                                RelationClearRelation(rd, true);
-                       Assert(rd->rd_isvalid);
+
+                       /*
+                        * Normally entries need to be valid here, but before the relcache
+                        * has been initialized, not enough infrastructure exists to
+                        * perform pg_class lookups. The structure of such entries doesn't
+                        * change, but we still want to update the rd_rel entry. So
+                        * rd_isvalid = false is left in place for a later lookup.
+                        */
+                       Assert(rd->rd_isvalid ||
+                                  (rd->rd_isnailed && !criticalRelcachesBuilt));
                }
                return rd;
        }
@@ -2255,6 +2265,81 @@ RelationReloadIndexInfo(Relation relation)
        relation->rd_isvalid = true;
 }
 
+/*
+ * RelationReloadNailed - reload minimal information for nailed relations.
+ *
+ * The structure of a nailed relation can never change (which is good, because
+ * we rely on knowing their structure to be able to read catalog content). But
+ * some parts, e.g. pg_class.relfrozenxid, are still important to have
+ * accurate content for. Therefore those need to be reloaded after the arrival
+ * of invalidations.
+ */
+static void
+RelationReloadNailed(Relation relation)
+{
+       Assert(relation->rd_isnailed);
+
+       /*
+        * Redo RelationInitPhysicalAddr in case it is a mapped relation whose
+        * mapping changed.
+        */
+       RelationInitPhysicalAddr(relation);
+
+       /* flag as needing to be revalidated */
+       relation->rd_isvalid = false;
+
+       /*
+        * Can only reread catalog contents if in a transaction.  If the relation
+        * is currently open (not counting the nailed refcount), do so
+        * immediately. Otherwise we've already marked the entry as possibly
+        * invalid, and it'll be fixed when next opened.
+        */
+       if (!IsTransactionState() || relation->rd_refcnt <= 1)
+               return;
+
+       if (relation->rd_rel->relkind == RELKIND_INDEX)
+       {
+               /*
+                * If it's a nailed-but-not-mapped index, then we need to re-read the
+                * pg_class row to see if its relfilenode changed.
+                */
+               RelationReloadIndexInfo(relation);
+       }
+       else
+       {
+               /*
+                * Reload a non-index entry.  We can't easily do so if relcaches
+                * aren't yet built, but that's fine because at that stage the
+                * attributes that need to be current (like relfrozenxid) aren't yet
+                * accessed.  To ensure the entry will later be revalidated, we leave
+                * it in invalid state, but allow use (cf. RelationIdGetRelation()).
+                */
+               if (criticalRelcachesBuilt)
+               {
+                       HeapTuple       pg_class_tuple;
+                       Form_pg_class relp;
+
+                       /*
+                        * NB: Mark the entry as valid before starting to scan, to avoid
+                        * self-recursion when re-building pg_class.
+                        */
+                       relation->rd_isvalid = true;
+
+                       pg_class_tuple = ScanPgRelation(RelationGetRelid(relation),
+                                                                                       true, false);
+                       relp = (Form_pg_class) GETSTRUCT(pg_class_tuple);
+                       memcpy(relation->rd_rel, relp, CLASS_TUPLE_SIZE);
+                       heap_freetuple(pg_class_tuple);
+
+                       /*
+                        * Again mark as valid, to protect against concurrently arriving
+                        * invalidations.
+                        */
+                       relation->rd_isvalid = true;
+               }
+       }
+}
+
 /*
  * RelationDestroyRelation
  *
@@ -2368,26 +2453,12 @@ RelationClearRelation(Relation relation, bool rebuild)
        RelationCloseSmgr(relation);
 
        /*
-        * Never, never ever blow away a nailed-in system relation, because we'd
-        * be unable to recover.  However, we must redo RelationInitPhysicalAddr
-        * in case it is a mapped relation whose mapping changed.
-        *
-        * If it's a nailed-but-not-mapped index, then we need to re-read the
-        * pg_class row to see if its relfilenode changed. We do that immediately
-        * if we're inside a valid transaction and the relation is open (not
-        * counting the nailed refcount).  Otherwise just mark the entry as
-        * possibly invalid, and it'll be fixed when next opened.
+        * Treat nailed-in system relations separately, they always need to be
+        * accessible, so we can't blow them away.
         */
        if (relation->rd_isnailed)
        {
-               RelationInitPhysicalAddr(relation);
-
-               if (relation->rd_rel->relkind == RELKIND_INDEX)
-               {
-                       relation->rd_isvalid = false;   /* needs to be revalidated */
-                       if (relation->rd_refcnt > 1 && IsTransactionState())
-                               RelationReloadIndexInfo(relation);
-               }
+               RelationReloadNailed(relation);
                return;
        }
 
@@ -5904,24 +5975,26 @@ write_item(const void *data, Size len, FILE *fp)
 
 /*
  * Determine whether a given relation (identified by OID) is one of the ones
- * we should store in the local relcache init file.
+ * we should store in a relcache init file.
  *
  * We must cache all nailed rels, and for efficiency we should cache every rel
  * that supports a syscache.  The former set is almost but not quite a subset
- * of the latter.  Currently, we must special-case TriggerRelidNameIndexId,
- * which RelationCacheInitializePhase3 chooses to nail for efficiency reasons,
- * but which does not support any syscache.
- *
- * Note: this function is currently never called for shared rels.  If it were,
- * we'd probably also need a special case for DatabaseNameIndexId, which is
- * critical but does not support a syscache.
+ * of the latter. The special cases are relations where
+ * RelationCacheInitializePhase2/3 chooses to nail for efficiency reasons, but
+ * which do not support any syscache.
  */
 bool
 RelationIdIsInInitFile(Oid relationId)
 {
-       if (relationId == TriggerRelidNameIndexId)
+       if (relationId == SharedSecLabelRelationId ||
+               relationId == TriggerRelidNameIndexId ||
+               relationId == DatabaseNameIndexId ||
+               relationId == SharedSecLabelObjectIndexId)
        {
-               /* If this Assert fails, we don't need this special case anymore. */
+               /*
+                * If this Assert fails, we don't need the applicable special case
+                * anymore.
+                */
                Assert(!RelationSupportsSysCache(relationId));
                return true;
        }
@@ -5991,38 +6064,30 @@ RelationHasUnloggedIndex(Relation rel)
  * We take the lock and do the unlink in RelationCacheInitFilePreInvalidate,
  * then release the lock in RelationCacheInitFilePostInvalidate.  Caller must
  * send any pending SI messages between those calls.
- *
- * Notice this deals only with the local init file, not the shared init file.
- * The reason is that there can never be a "significant" change to the
- * relcache entry of a shared relation; the most that could happen is
- * updates of noncritical fields such as relpages/reltuples.  So, while
- * it's worth updating the shared init file from time to time, it can never
- * be invalid enough to make it necessary to remove it.
  */
 void
 RelationCacheInitFilePreInvalidate(void)
 {
-       char            initfilename[MAXPGPATH];
+       char            localinitfname[MAXPGPATH];
+       char            sharedinitfname[MAXPGPATH];
 
-       snprintf(initfilename, sizeof(initfilename), "%s/%s",
-                        DatabasePath, RELCACHE_INIT_FILENAME);
+       if (DatabasePath)
+               snprintf(localinitfname, sizeof(localinitfname), "%s/%s",
+                                DatabasePath, RELCACHE_INIT_FILENAME);
+       snprintf(sharedinitfname, sizeof(sharedinitfname), "global/%s",
+                        RELCACHE_INIT_FILENAME);
 
        LWLockAcquire(RelCacheInitLock, LW_EXCLUSIVE);
 
-       if (unlink(initfilename) < 0)
-       {
-               /*
-                * The file might not be there if no backend has been started since
-                * the last removal.  But complain about failures other than ENOENT.
-                * Fortunately, it's not too late to abort the transaction if we can't
-                * get rid of the would-be-obsolete init file.
-                */
-               if (errno != ENOENT)
-                       ereport(ERROR,
-                                       (errcode_for_file_access(),
-                                        errmsg("could not remove cache file \"%s\": %m",
-                                                       initfilename)));
-       }
+       /*
+        * The files might not be there if no backend has been started since the
+        * last removal.  But complain about failures other than ENOENT with
+        * ERROR.  Fortunately, it's not too late to abort the transaction if we
+        * can't get rid of the would-be-obsolete init file.
+        */
+       if (DatabasePath)
+               unlink_initfile(localinitfname, ERROR);
+       unlink_initfile(sharedinitfname, ERROR);
 }
 
 void
@@ -6048,13 +6113,9 @@ RelationCacheInitFileRemove(void)
        struct dirent *de;
        char            path[MAXPGPATH + 10 + sizeof(TABLESPACE_VERSION_DIRECTORY)];
 
-       /*
-        * We zap the shared cache file too.  In theory it can't get out of sync
-        * enough to be a problem, but in data-corruption cases, who knows ...
-        */
        snprintf(path, sizeof(path), "global/%s",
                         RELCACHE_INIT_FILENAME);
-       unlink_initfile(path);
+       unlink_initfile(path, LOG);
 
        /* Scan everything in the default tablespace */
        RelationCacheInitFileRemoveInDir("base");
@@ -6106,7 +6167,7 @@ RelationCacheInitFileRemoveInDir(const char *tblspcpath)
                        /* Try to remove the init file in each database */
                        snprintf(initfilename, sizeof(initfilename), "%s/%s/%s",
                                         tblspcpath, de->d_name, RELCACHE_INIT_FILENAME);
-                       unlink_initfile(initfilename);
+                       unlink_initfile(initfilename, LOG);
                }
        }
 
@@ -6114,12 +6175,15 @@ RelationCacheInitFileRemoveInDir(const char *tblspcpath)
 }
 
 static void
-unlink_initfile(const char *initfilename)
+unlink_initfile(const char *initfilename, int elevel)
 {
        if (unlink(initfilename) < 0)
        {
                /* It might not be there, but log any error other than ENOENT */
                if (errno != ENOENT)
-                       elog(LOG, "could not remove cache file \"%s\": %m", initfilename);
+                       ereport(elevel,
+                                       (errcode_for_file_access(),
+                                        errmsg("could not remove cache file \"%s\": %m",
+                                                       initfilename)));
        }
 }
index a0af6788e96beef942e653adad046e1cd9c09667..ff2ac05de87f627222449c0efc69221449ab4dc7 100644 (file)
@@ -64,7 +64,7 @@ typedef struct xl_invalidations
 {
        Oid                     dbId;                   /* MyDatabaseId */
        Oid                     tsId;                   /* MyDatabaseTableSpace */
-       bool            relcacheInitFileInval;  /* invalidate relcache init file */
+       bool            relcacheInitFileInval;  /* invalidate relcache init files */
        int                     nmsgs;                  /* number of shared inval msgs */
        SharedInvalidationMessage msgs[FLEXIBLE_ARRAY_MEMBER];
 } xl_invalidations;