granicus.if.org Git - postgresql/blob - src/backend/utils/cache/relmapper.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * relmapper.c
   4  *        Catalog-to-filenode mapping
   5  *
   6  * For most tables, the physical file underlying the table is specified by
   7  * pg_class.relfilenode.  However, that obviously won't work for pg_class
   8  * itself, nor for the other "nailed" catalogs for which we have to be able
   9  * to set up working Relation entries without access to pg_class.  It also
  10  * does not work for shared catalogs, since there is no practical way to
  11  * update other databases' pg_class entries when relocating a shared catalog.
  12  * Therefore, for these special catalogs (henceforth referred to as "mapped
  13  * catalogs") we rely on a separately maintained file that shows the mapping
  14  * from catalog OIDs to filenode numbers.  Each database has a map file for
  15  * its local mapped catalogs, and there is a separate map file for shared
  16  * catalogs.  Mapped catalogs have zero in their pg_class.relfilenode entries.
  17  *
  18  * Relocation of a normal table is committed (ie, the new physical file becomes
  19  * authoritative) when the pg_class row update commits.  For mapped catalogs,
  20  * the act of updating the map file is effectively commit of the relocation.
  21  * We postpone the file update till just before commit of the transaction
  22  * doing the rewrite, but there is necessarily a window between.  Therefore
  23  * mapped catalogs can only be relocated by operations such as VACUUM FULL
  24  * and CLUSTER, which make no transactionally-significant changes: it must be
  25  * safe for the new file to replace the old, even if the transaction itself
  26  * aborts.  An important factor here is that the indexes and toast table of
  27  * a mapped catalog must also be mapped, so that the rewrites/relocations of
  28  * all these files commit in a single map file update rather than being tied
  29  * to transaction commit.
  30  *
  31  * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
  32  * Portions Copyright (c) 1994, Regents of the University of California
  33  *
  34  *
  35  * IDENTIFICATION
  36  *        src/backend/utils/cache/relmapper.c
  37  *
  38  *-------------------------------------------------------------------------
  39  */
  40 #include "postgres.h"
  41
  42 #include <fcntl.h>
  43 #include <sys/stat.h>
  44 #include <unistd.h>
  45
  46 #include "access/xact.h"
  47 #include "access/xlog.h"
  48 #include "access/xloginsert.h"
  49 #include "catalog/catalog.h"
  50 #include "catalog/pg_tablespace.h"
  51 #include "catalog/storage.h"
  52 #include "miscadmin.h"
  53 #include "pgstat.h"
  54 #include "storage/fd.h"
  55 #include "storage/lwlock.h"
  56 #include "utils/inval.h"
  57 #include "utils/relmapper.h"
  58
  59
  60 /*
  61  * The map file is critical data: we have no automatic method for recovering
  62  * from loss or corruption of it.  We use a CRC so that we can detect
  63  * corruption.  To minimize the risk of failed updates, the map file should
  64  * be kept to no more than one standard-size disk sector (ie 512 bytes),
  65  * and we use overwrite-in-place rather than playing renaming games.
  66  * The struct layout below is designed to occupy exactly 512 bytes, which
  67  * might make filesystem updates a bit more efficient.
  68  *
  69  * Entries in the mappings[] array are in no particular order.  We could
  70  * speed searching by insisting on OID order, but it really shouldn't be
  71  * worth the trouble given the intended size of the mapping sets.
  72  */
  73 #define RELMAPPER_FILENAME              "pg_filenode.map"
  74
  75 #define RELMAPPER_FILEMAGIC             0x592717        /* version ID value */
  76
  77 #define MAX_MAPPINGS                    62      /* 62 * 8 + 16 = 512 */
  78
  79 typedef struct RelMapping
  80 {
  81         Oid                     mapoid;                 /* OID of a catalog */
  82         Oid                     mapfilenode;    /* its filenode number */
  83 } RelMapping;
  84
  85 typedef struct RelMapFile
  86 {
  87         int32           magic;                  /* always RELMAPPER_FILEMAGIC */
  88         int32           num_mappings;   /* number of valid RelMapping entries */
  89         RelMapping      mappings[MAX_MAPPINGS];
  90         pg_crc32c       crc;                    /* CRC of all above */
  91         int32           pad;                    /* to make the struct size be 512 exactly */
  92 } RelMapFile;
  93
  94 /*
  95  * The currently known contents of the shared map file and our database's
  96  * local map file are stored here.  These can be reloaded from disk
  97  * immediately whenever we receive an update sinval message.
  98  */
  99 static RelMapFile shared_map;
 100 static RelMapFile local_map;
 101
 102 /*
 103  * We use the same RelMapFile data structure to track uncommitted local
 104  * changes in the mappings (but note the magic and crc fields are not made
 105  * valid in these variables).  Currently, map updates are not allowed within
 106  * subtransactions, so one set of transaction-level changes is sufficient.
 107  *
 108  * The active_xxx variables contain updates that are valid in our transaction
 109  * and should be honored by RelationMapOidToFilenode.  The pending_xxx
 110  * variables contain updates we have been told about that aren't active yet;
 111  * they will become active at the next CommandCounterIncrement.  This setup
 112  * lets map updates act similarly to updates of pg_class rows, ie, they
 113  * become visible only at the next CommandCounterIncrement boundary.
 114  */
 115 static RelMapFile active_shared_updates;
 116 static RelMapFile active_local_updates;
 117 static RelMapFile pending_shared_updates;
 118 static RelMapFile pending_local_updates;
 119
 120
 121 /* non-export function prototypes */
 122 static void apply_map_update(RelMapFile *map, Oid relationId, Oid fileNode,
 123                                  bool add_okay);
 124 static void merge_map_updates(RelMapFile *map, const RelMapFile *updates,
 125                                   bool add_okay);
 126 static void load_relmap_file(bool shared);
 127 static void write_relmap_file(bool shared, RelMapFile *newmap,
 128                                   bool write_wal, bool send_sinval, bool preserve_files,
 129                                   Oid dbid, Oid tsid, const char *dbpath);
 130 static void perform_relmap_update(bool shared, const RelMapFile *updates);
 131
 132
 133 /*
 134  * RelationMapOidToFilenode
 135  *
 136  * The raison d' etre ... given a relation OID, look up its filenode.
 137  *
 138  * Although shared and local relation OIDs should never overlap, the caller
 139  * always knows which we need --- so pass that information to avoid useless
 140  * searching.
 141  *
 142  * Returns InvalidOid if the OID is not known (which should never happen,
 143  * but the caller is in a better position to report a meaningful error).
 144  */
 145 Oid
 146 RelationMapOidToFilenode(Oid relationId, bool shared)
 147 {
 148         const RelMapFile *map;
 149         int32           i;
 150
 151         /* If there are active updates, believe those over the main maps */
 152         if (shared)
 153         {
 154                 map = &active_shared_updates;
 155                 for (i = 0; i < map->num_mappings; i++)
 156                 {
 157                         if (relationId == map->mappings[i].mapoid)
 158                                 return map->mappings[i].mapfilenode;
 159                 }
 160                 map = &shared_map;
 161                 for (i = 0; i < map->num_mappings; i++)
 162                 {
 163                         if (relationId == map->mappings[i].mapoid)
 164                                 return map->mappings[i].mapfilenode;
 165                 }
 166         }
 167         else
 168         {
 169                 map = &active_local_updates;
 170                 for (i = 0; i < map->num_mappings; i++)
 171                 {
 172                         if (relationId == map->mappings[i].mapoid)
 173                                 return map->mappings[i].mapfilenode;
 174                 }
 175                 map = &local_map;
 176                 for (i = 0; i < map->num_mappings; i++)
 177                 {
 178                         if (relationId == map->mappings[i].mapoid)
 179                                 return map->mappings[i].mapfilenode;
 180                 }
 181         }
 182
 183         return InvalidOid;
 184 }
 185
 186 /*
 187  * RelationMapFilenodeToOid
 188  *
 189  * Do the reverse of the normal direction of mapping done in
 190  * RelationMapOidToFilenode.
 191  *
 192  * This is not supposed to be used during normal running but rather for
 193  * information purposes when looking at the filesystem or xlog.
 194  *
 195  * Returns InvalidOid if the OID is not known; this can easily happen if the
 196  * relfilenode doesn't pertain to a mapped relation.
 197  */
 198 Oid
 199 RelationMapFilenodeToOid(Oid filenode, bool shared)
 200 {
 201         const RelMapFile *map;
 202         int32           i;
 203
 204         /* If there are active updates, believe those over the main maps */
 205         if (shared)
 206         {
 207                 map = &active_shared_updates;
 208                 for (i = 0; i < map->num_mappings; i++)
 209                 {
 210                         if (filenode == map->mappings[i].mapfilenode)
 211                                 return map->mappings[i].mapoid;
 212                 }
 213                 map = &shared_map;
 214                 for (i = 0; i < map->num_mappings; i++)
 215                 {
 216                         if (filenode == map->mappings[i].mapfilenode)
 217                                 return map->mappings[i].mapoid;
 218                 }
 219         }
 220         else
 221         {
 222                 map = &active_local_updates;
 223                 for (i = 0; i < map->num_mappings; i++)
 224                 {
 225                         if (filenode == map->mappings[i].mapfilenode)
 226                                 return map->mappings[i].mapoid;
 227                 }
 228                 map = &local_map;
 229                 for (i = 0; i < map->num_mappings; i++)
 230                 {
 231                         if (filenode == map->mappings[i].mapfilenode)
 232                                 return map->mappings[i].mapoid;
 233                 }
 234         }
 235
 236         return InvalidOid;
 237 }
 238
 239 /*
 240  * RelationMapUpdateMap
 241  *
 242  * Install a new relfilenode mapping for the specified relation.
 243  *
 244  * If immediate is true (or we're bootstrapping), the mapping is activated
 245  * immediately.  Otherwise it is made pending until CommandCounterIncrement.
 246  */
 247 void
 248 RelationMapUpdateMap(Oid relationId, Oid fileNode, bool shared,
 249                                          bool immediate)
 250 {
 251         RelMapFile *map;
 252
 253         if (IsBootstrapProcessingMode())
 254         {
 255                 /*
 256                  * In bootstrap mode, the mapping gets installed in permanent map.
 257                  */
 258                 if (shared)
 259                         map = &shared_map;
 260                 else
 261                         map = &local_map;
 262         }
 263         else
 264         {
 265                 /*
 266                  * We don't currently support map changes within subtransactions. This
 267                  * could be done with more bookkeeping infrastructure, but it doesn't
 268                  * presently seem worth it.
 269                  */
 270                 if (GetCurrentTransactionNestLevel() > 1)
 271                         elog(ERROR, "cannot change relation mapping within subtransaction");
 272
 273                 if (immediate)
 274                 {
 275                         /* Make it active, but only locally */
 276                         if (shared)
 277                                 map = &active_shared_updates;
 278                         else
 279                                 map = &active_local_updates;
 280                 }
 281                 else
 282                 {
 283                         /* Make it pending */
 284                         if (shared)
 285                                 map = &pending_shared_updates;
 286                         else
 287                                 map = &pending_local_updates;
 288                 }
 289         }
 290         apply_map_update(map, relationId, fileNode, true);
 291 }
 292
 293 /*
 294  * apply_map_update
 295  *
 296  * Insert a new mapping into the given map variable, replacing any existing
 297  * mapping for the same relation.
 298  *
 299  * In some cases the caller knows there must be an existing mapping; pass
 300  * add_okay = false to draw an error if not.
 301  */
 302 static void
 303 apply_map_update(RelMapFile *map, Oid relationId, Oid fileNode, bool add_okay)
 304 {
 305         int32           i;
 306
 307         /* Replace any existing mapping */
 308         for (i = 0; i < map->num_mappings; i++)
 309         {
 310                 if (relationId == map->mappings[i].mapoid)
 311                 {
 312                         map->mappings[i].mapfilenode = fileNode;
 313                         return;
 314                 }
 315         }
 316
 317         /* Nope, need to add a new mapping */
 318         if (!add_okay)
 319                 elog(ERROR, "attempt to apply a mapping to unmapped relation %u",
 320                          relationId);
 321         if (map->num_mappings >= MAX_MAPPINGS)
 322                 elog(ERROR, "ran out of space in relation map");
 323         map->mappings[map->num_mappings].mapoid = relationId;
 324         map->mappings[map->num_mappings].mapfilenode = fileNode;
 325         map->num_mappings++;
 326 }
 327
 328 /*
 329  * merge_map_updates
 330  *
 331  * Merge all the updates in the given pending-update map into the target map.
 332  * This is just a bulk form of apply_map_update.
 333  */
 334 static void
 335 merge_map_updates(RelMapFile *map, const RelMapFile *updates, bool add_okay)
 336 {
 337         int32           i;
 338
 339         for (i = 0; i < updates->num_mappings; i++)
 340         {
 341                 apply_map_update(map,
 342                                                  updates->mappings[i].mapoid,
 343                                                  updates->mappings[i].mapfilenode,
 344                                                  add_okay);
 345         }
 346 }
 347
 348 /*
 349  * RelationMapRemoveMapping
 350  *
 351  * Remove a relation's entry in the map.  This is only allowed for "active"
 352  * (but not committed) local mappings.  We need it so we can back out the
 353  * entry for the transient target file when doing VACUUM FULL/CLUSTER on
 354  * a mapped relation.
 355  */
 356 void
 357 RelationMapRemoveMapping(Oid relationId)
 358 {
 359         RelMapFile *map = &active_local_updates;
 360         int32           i;
 361
 362         for (i = 0; i < map->num_mappings; i++)
 363         {
 364                 if (relationId == map->mappings[i].mapoid)
 365                 {
 366                         /* Found it, collapse it out */
 367                         map->mappings[i] = map->mappings[map->num_mappings - 1];
 368                         map->num_mappings--;
 369                         return;
 370                 }
 371         }
 372         elog(ERROR, "could not find temporary mapping for relation %u",
 373                  relationId);
 374 }
 375
 376 /*
 377  * RelationMapInvalidate
 378  *
 379  * This routine is invoked for SI cache flush messages.  We must re-read
 380  * the indicated map file.  However, we might receive a SI message in a
 381  * process that hasn't yet, and might never, load the mapping files;
 382  * for example the autovacuum launcher, which *must not* try to read
 383  * a local map since it is attached to no particular database.
 384  * So, re-read only if the map is valid now.
 385  */
 386 void
 387 RelationMapInvalidate(bool shared)
 388 {
 389         if (shared)
 390         {
 391                 if (shared_map.magic == RELMAPPER_FILEMAGIC)
 392                         load_relmap_file(true);
 393         }
 394         else
 395         {
 396                 if (local_map.magic == RELMAPPER_FILEMAGIC)
 397                         load_relmap_file(false);
 398         }
 399 }
 400
 401 /*
 402  * RelationMapInvalidateAll
 403  *
 404  * Reload all map files.  This is used to recover from SI message buffer
 405  * overflow: we can't be sure if we missed an inval message.
 406  * Again, reload only currently-valid maps.
 407  */
 408 void
 409 RelationMapInvalidateAll(void)
 410 {
 411         if (shared_map.magic == RELMAPPER_FILEMAGIC)
 412                 load_relmap_file(true);
 413         if (local_map.magic == RELMAPPER_FILEMAGIC)
 414                 load_relmap_file(false);
 415 }
 416
 417 /*
 418  * AtCCI_RelationMap
 419  *
 420  * Activate any "pending" relation map updates at CommandCounterIncrement time.
 421  */
 422 void
 423 AtCCI_RelationMap(void)
 424 {
 425         if (pending_shared_updates.num_mappings != 0)
 426         {
 427                 merge_map_updates(&active_shared_updates,
 428                                                   &pending_shared_updates,
 429                                                   true);
 430                 pending_shared_updates.num_mappings = 0;
 431         }
 432         if (pending_local_updates.num_mappings != 0)
 433         {
 434                 merge_map_updates(&active_local_updates,
 435                                                   &pending_local_updates,
 436                                                   true);
 437                 pending_local_updates.num_mappings = 0;
 438         }
 439 }
 440
 441 /*
 442  * AtEOXact_RelationMap
 443  *
 444  * Handle relation mapping at main-transaction commit or abort.
 445  *
 446  * During commit, this must be called as late as possible before the actual
 447  * transaction commit, so as to minimize the window where the transaction
 448  * could still roll back after committing map changes.  Although nothing
 449  * critically bad happens in such a case, we still would prefer that it
 450  * not happen, since we'd possibly be losing useful updates to the relations'
 451  * pg_class row(s).
 452  *
 453  * During abort, we just have to throw away any pending map changes.
 454  * Normal post-abort cleanup will take care of fixing relcache entries.
 455  */
 456 void
 457 AtEOXact_RelationMap(bool isCommit)
 458 {
 459         if (isCommit)
 460         {
 461                 /*
 462                  * We should not get here with any "pending" updates.  (We could
 463                  * logically choose to treat such as committed, but in the current
 464                  * code this should never happen.)
 465                  */
 466                 Assert(pending_shared_updates.num_mappings == 0);
 467                 Assert(pending_local_updates.num_mappings == 0);
 468
 469                 /*
 470                  * Write any active updates to the actual map files, then reset them.
 471                  */
 472                 if (active_shared_updates.num_mappings != 0)
 473                 {
 474                         perform_relmap_update(true, &active_shared_updates);
 475                         active_shared_updates.num_mappings = 0;
 476                 }
 477                 if (active_local_updates.num_mappings != 0)
 478                 {
 479                         perform_relmap_update(false, &active_local_updates);
 480                         active_local_updates.num_mappings = 0;
 481                 }
 482         }
 483         else
 484         {
 485                 /* Abort --- drop all local and pending updates */
 486                 active_shared_updates.num_mappings = 0;
 487                 active_local_updates.num_mappings = 0;
 488                 pending_shared_updates.num_mappings = 0;
 489                 pending_local_updates.num_mappings = 0;
 490         }
 491 }
 492
 493 /*
 494  * AtPrepare_RelationMap
 495  *
 496  * Handle relation mapping at PREPARE.
 497  *
 498  * Currently, we don't support preparing any transaction that changes the map.
 499  */
 500 void
 501 AtPrepare_RelationMap(void)
 502 {
 503         if (active_shared_updates.num_mappings != 0 ||
 504                 active_local_updates.num_mappings != 0 ||
 505                 pending_shared_updates.num_mappings != 0 ||
 506                 pending_local_updates.num_mappings != 0)
 507                 ereport(ERROR,
 508                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 509                                  errmsg("cannot PREPARE a transaction that modified relation mapping")));
 510 }
 511
 512 /*
 513  * CheckPointRelationMap
 514  *
 515  * This is called during a checkpoint.  It must ensure that any relation map
 516  * updates that were WAL-logged before the start of the checkpoint are
 517  * securely flushed to disk and will not need to be replayed later.  This
 518  * seems unlikely to be a performance-critical issue, so we use a simple
 519  * method: we just take and release the RelationMappingLock.  This ensures
 520  * that any already-logged map update is complete, because write_relmap_file
 521  * will fsync the map file before the lock is released.
 522  */
 523 void
 524 CheckPointRelationMap(void)
 525 {
 526         LWLockAcquire(RelationMappingLock, LW_SHARED);
 527         LWLockRelease(RelationMappingLock);
 528 }
 529
 530 /*
 531  * RelationMapFinishBootstrap
 532  *
 533  * Write out the initial relation mapping files at the completion of
 534  * bootstrap.  All the mapped files should have been made known to us
 535  * via RelationMapUpdateMap calls.
 536  */
 537 void
 538 RelationMapFinishBootstrap(void)
 539 {
 540         Assert(IsBootstrapProcessingMode());
 541
 542         /* Shouldn't be anything "pending" ... */
 543         Assert(active_shared_updates.num_mappings == 0);
 544         Assert(active_local_updates.num_mappings == 0);
 545         Assert(pending_shared_updates.num_mappings == 0);
 546         Assert(pending_local_updates.num_mappings == 0);
 547
 548         /* Write the files; no WAL or sinval needed */
 549         write_relmap_file(true, &shared_map, false, false, false,
 550                                           InvalidOid, GLOBALTABLESPACE_OID, NULL);
 551         write_relmap_file(false, &local_map, false, false, false,
 552                                           MyDatabaseId, MyDatabaseTableSpace, DatabasePath);
 553 }
 554
 555 /*
 556  * RelationMapInitialize
 557  *
 558  * This initializes the mapper module at process startup.  We can't access the
 559  * database yet, so just make sure the maps are empty.
 560  */
 561 void
 562 RelationMapInitialize(void)
 563 {
 564         /* The static variables should initialize to zeroes, but let's be sure */
 565         shared_map.magic = 0;           /* mark it not loaded */
 566         local_map.magic = 0;
 567         shared_map.num_mappings = 0;
 568         local_map.num_mappings = 0;
 569         active_shared_updates.num_mappings = 0;
 570         active_local_updates.num_mappings = 0;
 571         pending_shared_updates.num_mappings = 0;
 572         pending_local_updates.num_mappings = 0;
 573 }
 574
 575 /*
 576  * RelationMapInitializePhase2
 577  *
 578  * This is called to prepare for access to pg_database during startup.
 579  * We should be able to read the shared map file now.
 580  */
 581 void
 582 RelationMapInitializePhase2(void)
 583 {
 584         /*
 585          * In bootstrap mode, the map file isn't there yet, so do nothing.
 586          */
 587         if (IsBootstrapProcessingMode())
 588                 return;
 589
 590         /*
 591          * Load the shared map file, die on error.
 592          */
 593         load_relmap_file(true);
 594 }
 595
 596 /*
 597  * RelationMapInitializePhase3
 598  *
 599  * This is called as soon as we have determined MyDatabaseId and set up
 600  * DatabasePath.  At this point we should be able to read the local map file.
 601  */
 602 void
 603 RelationMapInitializePhase3(void)
 604 {
 605         /*
 606          * In bootstrap mode, the map file isn't there yet, so do nothing.
 607          */
 608         if (IsBootstrapProcessingMode())
 609                 return;
 610
 611         /*
 612          * Load the local map file, die on error.
 613          */
 614         load_relmap_file(false);
 615 }
 616
 617 /*
 618  * load_relmap_file -- load data from the shared or local map file
 619  *
 620  * Because the map file is essential for access to core system catalogs,
 621  * failure to read it is a fatal error.
 622  *
 623  * Note that the local case requires DatabasePath to be set up.
 624  */
 625 static void
 626 load_relmap_file(bool shared)
 627 {
 628         RelMapFile *map;
 629         char            mapfilename[MAXPGPATH];
 630         pg_crc32c       crc;
 631         int                     fd;
 632         int                     r;
 633
 634         if (shared)
 635         {
 636                 snprintf(mapfilename, sizeof(mapfilename), "global/%s",
 637                                  RELMAPPER_FILENAME);
 638                 map = &shared_map;
 639         }
 640         else
 641         {
 642                 snprintf(mapfilename, sizeof(mapfilename), "%s/%s",
 643                                  DatabasePath, RELMAPPER_FILENAME);
 644                 map = &local_map;
 645         }
 646
 647         /* Read data ... */
 648         fd = OpenTransientFile(mapfilename, O_RDONLY | PG_BINARY);
 649         if (fd < 0)
 650                 ereport(FATAL,
 651                                 (errcode_for_file_access(),
 652                                  errmsg("could not open file \"%s\": %m",
 653                                                 mapfilename)));
 654
 655         /*
 656          * Note: we could take RelationMappingLock in shared mode here, but it
 657          * seems unnecessary since our read() should be atomic against any
 658          * concurrent updater's write().  If the file is updated shortly after we
 659          * look, the sinval signaling mechanism will make us re-read it before we
 660          * are able to access any relation that's affected by the change.
 661          */
 662         pgstat_report_wait_start(WAIT_EVENT_RELATION_MAP_READ);
 663         r = read(fd, map, sizeof(RelMapFile));
 664         if (r != sizeof(RelMapFile))
 665         {
 666                 if (r < 0)
 667                         ereport(FATAL,
 668                                         (errcode_for_file_access(),
 669                                          errmsg("could not read file \"%s\": %m", mapfilename)));
 670                 else
 671                         ereport(FATAL,
 672                                         (errcode(ERRCODE_DATA_CORRUPTED),
 673                                          errmsg("could not read file \"%s\": read %d of %zu",
 674                                                         mapfilename, r, sizeof(RelMapFile))));
 675         }
 676         pgstat_report_wait_end();
 677
 678         CloseTransientFile(fd);
 679
 680         /* check for correct magic number, etc */
 681         if (map->magic != RELMAPPER_FILEMAGIC ||
 682                 map->num_mappings < 0 ||
 683                 map->num_mappings > MAX_MAPPINGS)
 684                 ereport(FATAL,
 685                                 (errmsg("relation mapping file \"%s\" contains invalid data",
 686                                                 mapfilename)));
 687
 688         /* verify the CRC */
 689         INIT_CRC32C(crc);
 690         COMP_CRC32C(crc, (char *) map, offsetof(RelMapFile, crc));
 691         FIN_CRC32C(crc);
 692
 693         if (!EQ_CRC32C(crc, map->crc))
 694                 ereport(FATAL,
 695                                 (errmsg("relation mapping file \"%s\" contains incorrect checksum",
 696                                                 mapfilename)));
 697 }
 698
 699 /*
 700  * Write out a new shared or local map file with the given contents.
 701  *
 702  * The magic number and CRC are automatically updated in *newmap.  On
 703  * success, we copy the data to the appropriate permanent static variable.
 704  *
 705  * If write_wal is true then an appropriate WAL message is emitted.
 706  * (It will be false for bootstrap and WAL replay cases.)
 707  *
 708  * If send_sinval is true then a SI invalidation message is sent.
 709  * (This should be true except in bootstrap case.)
 710  *
 711  * If preserve_files is true then the storage manager is warned not to
 712  * delete the files listed in the map.
 713  *
 714  * Because this may be called during WAL replay when MyDatabaseId,
 715  * DatabasePath, etc aren't valid, we require the caller to pass in suitable
 716  * values.  The caller is also responsible for being sure no concurrent
 717  * map update could be happening.
 718  */
 719 static void
 720 write_relmap_file(bool shared, RelMapFile *newmap,
 721                                   bool write_wal, bool send_sinval, bool preserve_files,
 722                                   Oid dbid, Oid tsid, const char *dbpath)
 723 {
 724         int                     fd;
 725         RelMapFile *realmap;
 726         char            mapfilename[MAXPGPATH];
 727
 728         /*
 729          * Fill in the overhead fields and update CRC.
 730          */
 731         newmap->magic = RELMAPPER_FILEMAGIC;
 732         if (newmap->num_mappings < 0 || newmap->num_mappings > MAX_MAPPINGS)
 733                 elog(ERROR, "attempt to write bogus relation mapping");
 734
 735         INIT_CRC32C(newmap->crc);
 736         COMP_CRC32C(newmap->crc, (char *) newmap, offsetof(RelMapFile, crc));
 737         FIN_CRC32C(newmap->crc);
 738
 739         /*
 740          * Open the target file.  We prefer to do this before entering the
 741          * critical section, so that an open() failure need not force PANIC.
 742          */
 743         if (shared)
 744         {
 745                 snprintf(mapfilename, sizeof(mapfilename), "global/%s",
 746                                  RELMAPPER_FILENAME);
 747                 realmap = &shared_map;
 748         }
 749         else
 750         {
 751                 snprintf(mapfilename, sizeof(mapfilename), "%s/%s",
 752                                  dbpath, RELMAPPER_FILENAME);
 753                 realmap = &local_map;
 754         }
 755
 756         fd = OpenTransientFile(mapfilename, O_WRONLY | O_CREAT | PG_BINARY);
 757         if (fd < 0)
 758                 ereport(ERROR,
 759                                 (errcode_for_file_access(),
 760                                  errmsg("could not open file \"%s\": %m",
 761                                                 mapfilename)));
 762
 763         if (write_wal)
 764         {
 765                 xl_relmap_update xlrec;
 766                 XLogRecPtr      lsn;
 767
 768                 /* now errors are fatal ... */
 769                 START_CRIT_SECTION();
 770
 771                 xlrec.dbid = dbid;
 772                 xlrec.tsid = tsid;
 773                 xlrec.nbytes = sizeof(RelMapFile);
 774
 775                 XLogBeginInsert();
 776                 XLogRegisterData((char *) (&xlrec), MinSizeOfRelmapUpdate);
 777                 XLogRegisterData((char *) newmap, sizeof(RelMapFile));
 778
 779                 lsn = XLogInsert(RM_RELMAP_ID, XLOG_RELMAP_UPDATE);
 780
 781                 /* As always, WAL must hit the disk before the data update does */
 782                 XLogFlush(lsn);
 783         }
 784
 785         errno = 0;
 786         pgstat_report_wait_start(WAIT_EVENT_RELATION_MAP_WRITE);
 787         if (write(fd, newmap, sizeof(RelMapFile)) != sizeof(RelMapFile))
 788         {
 789                 /* if write didn't set errno, assume problem is no disk space */
 790                 if (errno == 0)
 791                         errno = ENOSPC;
 792                 ereport(ERROR,
 793                                 (errcode_for_file_access(),
 794                                  errmsg("could not write file \"%s\": %m",
 795                                                 mapfilename)));
 796         }
 797         pgstat_report_wait_end();
 798
 799         /*
 800          * We choose to fsync the data to disk before considering the task done.
 801          * It would be possible to relax this if it turns out to be a performance
 802          * issue, but it would complicate checkpointing --- see notes for
 803          * CheckPointRelationMap.
 804          */
 805         pgstat_report_wait_start(WAIT_EVENT_RELATION_MAP_SYNC);
 806         if (pg_fsync(fd) != 0)
 807                 ereport(ERROR,
 808                                 (errcode_for_file_access(),
 809                                  errmsg("could not fsync file \"%s\": %m",
 810                                                 mapfilename)));
 811         pgstat_report_wait_end();
 812
 813         if (CloseTransientFile(fd))
 814                 ereport(ERROR,
 815                                 (errcode_for_file_access(),
 816                                  errmsg("could not close file \"%s\": %m",
 817                                                 mapfilename)));
 818
 819         /*
 820          * Now that the file is safely on disk, send sinval message to let other
 821          * backends know to re-read it.  We must do this inside the critical
 822          * section: if for some reason we fail to send the message, we have to
 823          * force a database-wide PANIC.  Otherwise other backends might continue
 824          * execution with stale mapping information, which would be catastrophic
 825          * as soon as others began to use the now-committed data.
 826          */
 827         if (send_sinval)
 828                 CacheInvalidateRelmap(dbid);
 829
 830         /*
 831          * Make sure that the files listed in the map are not deleted if the outer
 832          * transaction aborts.  This had better be within the critical section
 833          * too: it's not likely to fail, but if it did, we'd arrive at transaction
 834          * abort with the files still vulnerable.  PANICing will leave things in a
 835          * good state on-disk.
 836          *
 837          * Note: we're cheating a little bit here by assuming that mapped files
 838          * are either in pg_global or the database's default tablespace.
 839          */
 840         if (preserve_files)
 841         {
 842                 int32           i;
 843
 844                 for (i = 0; i < newmap->num_mappings; i++)
 845                 {
 846                         RelFileNode rnode;
 847
 848                         rnode.spcNode = tsid;
 849                         rnode.dbNode = dbid;
 850                         rnode.relNode = newmap->mappings[i].mapfilenode;
 851                         RelationPreserveStorage(rnode, false);
 852                 }
 853         }
 854
 855         /* Success, update permanent copy */
 856         memcpy(realmap, newmap, sizeof(RelMapFile));
 857
 858         /* Critical section done */
 859         if (write_wal)
 860                 END_CRIT_SECTION();
 861 }
 862
 863 /*
 864  * Merge the specified updates into the appropriate "real" map,
 865  * and write out the changes.  This function must be used for committing
 866  * updates during normal multiuser operation.
 867  */
 868 static void
 869 perform_relmap_update(bool shared, const RelMapFile *updates)
 870 {
 871         RelMapFile      newmap;
 872
 873         /*
 874          * Anyone updating a relation's mapping info should take exclusive lock on
 875          * that rel and hold it until commit.  This ensures that there will not be
 876          * concurrent updates on the same mapping value; but there could easily be
 877          * concurrent updates on different values in the same file. We cover that
 878          * by acquiring the RelationMappingLock, re-reading the target file to
 879          * ensure it's up to date, applying the updates, and writing the data
 880          * before releasing RelationMappingLock.
 881          *
 882          * There is only one RelationMappingLock.  In principle we could try to
 883          * have one per mapping file, but it seems unlikely to be worth the
 884          * trouble.
 885          */
 886         LWLockAcquire(RelationMappingLock, LW_EXCLUSIVE);
 887
 888         /* Be certain we see any other updates just made */
 889         load_relmap_file(shared);
 890
 891         /* Prepare updated data in a local variable */
 892         if (shared)
 893                 memcpy(&newmap, &shared_map, sizeof(RelMapFile));
 894         else
 895                 memcpy(&newmap, &local_map, sizeof(RelMapFile));
 896
 897         /*
 898          * Apply the updates to newmap.  No new mappings should appear, unless
 899          * somebody is adding indexes to system catalogs.
 900          */
 901         merge_map_updates(&newmap, updates, allowSystemTableMods);
 902
 903         /* Write out the updated map and do other necessary tasks */
 904         write_relmap_file(shared, &newmap, true, true, true,
 905                                           (shared ? InvalidOid : MyDatabaseId),
 906                                           (shared ? GLOBALTABLESPACE_OID : MyDatabaseTableSpace),
 907                                           DatabasePath);
 908
 909         /* Now we can release the lock */
 910         LWLockRelease(RelationMappingLock);
 911 }
 912
 913 /*
 914  * RELMAP resource manager's routines
 915  */
 916 void
 917 relmap_redo(XLogReaderState *record)
 918 {
 919         uint8           info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
 920
 921         /* Backup blocks are not used in relmap records */
 922         Assert(!XLogRecHasAnyBlockRefs(record));
 923
 924         if (info == XLOG_RELMAP_UPDATE)
 925         {
 926                 xl_relmap_update *xlrec = (xl_relmap_update *) XLogRecGetData(record);
 927                 RelMapFile      newmap;
 928                 char       *dbpath;
 929
 930                 if (xlrec->nbytes != sizeof(RelMapFile))
 931                         elog(PANIC, "relmap_redo: wrong size %u in relmap update record",
 932                                  xlrec->nbytes);
 933                 memcpy(&newmap, xlrec->data, sizeof(newmap));
 934
 935                 /* We need to construct the pathname for this database */
 936                 dbpath = GetDatabasePath(xlrec->dbid, xlrec->tsid);
 937
 938                 /*
 939                  * Write out the new map and send sinval, but of course don't write a
 940                  * new WAL entry.  There's no surrounding transaction to tell to
 941                  * preserve files, either.
 942                  *
 943                  * There shouldn't be anyone else updating relmaps during WAL replay,
 944                  * so we don't bother to take the RelationMappingLock.  We would need
 945                  * to do so if load_relmap_file needed to interlock against writers.
 946                  */
 947                 write_relmap_file((xlrec->dbid == InvalidOid), &newmap,
 948                                                   false, true, false,
 949                                                   xlrec->dbid, xlrec->tsid, dbpath);
 950
 951                 pfree(dbpath);
 952         }
 953         else
 954                 elog(PANIC, "relmap_redo: unknown op code %u", info);
 955 }