granicus.if.org Git - postgresql/blob - src/backend/commands/cluster.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * cluster.c
   4  *        CLUSTER a table on an index.  This is now also used for VACUUM FULL.
   5  *
   6  * There is hardly anything left of Paul Brown's original implementation...
   7  *
   8  *
   9  * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
  10  * Portions Copyright (c) 1994-5, Regents of the University of California
  11  *
  12  *
  13  * IDENTIFICATION
  14  *        src/backend/commands/cluster.c
  15  *
  16  *-------------------------------------------------------------------------
  17  */
  18 #include "postgres.h"
  19
  20 #include "access/amapi.h"
  21 #include "access/multixact.h"
  22 #include "access/relscan.h"
  23 #include "access/rewriteheap.h"
  24 #include "access/transam.h"
  25 #include "access/tuptoaster.h"
  26 #include "access/xact.h"
  27 #include "access/xlog.h"
  28 #include "catalog/pg_am.h"
  29 #include "catalog/catalog.h"
  30 #include "catalog/dependency.h"
  31 #include "catalog/heap.h"
  32 #include "catalog/index.h"
  33 #include "catalog/namespace.h"
  34 #include "catalog/objectaccess.h"
  35 #include "catalog/toasting.h"
  36 #include "commands/cluster.h"
  37 #include "commands/tablecmds.h"
  38 #include "commands/vacuum.h"
  39 #include "miscadmin.h"
  40 #include "optimizer/planner.h"
  41 #include "storage/bufmgr.h"
  42 #include "storage/lmgr.h"
  43 #include "storage/predicate.h"
  44 #include "storage/smgr.h"
  45 #include "utils/acl.h"
  46 #include "utils/fmgroids.h"
  47 #include "utils/inval.h"
  48 #include "utils/lsyscache.h"
  49 #include "utils/memutils.h"
  50 #include "utils/pg_rusage.h"
  51 #include "utils/relmapper.h"
  52 #include "utils/snapmgr.h"
  53 #include "utils/syscache.h"
  54 #include "utils/tqual.h"
  55 #include "utils/tuplesort.h"
  56
  57
  58 /*
  59  * This struct is used to pass around the information on tables to be
  60  * clustered. We need this so we can make a list of them when invoked without
  61  * a specific table/index pair.
  62  */
  63 typedef struct
  64 {
  65         Oid                     tableOid;
  66         Oid                     indexOid;
  67 } RelToCluster;
  68
  69
  70 static void rebuild_relation(Relation OldHeap, Oid indexOid, bool verbose);
  71 static void copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex,
  72                            bool verbose, bool *pSwapToastByContent,
  73                            TransactionId *pFreezeXid, MultiXactId *pCutoffMulti);
  74 static List *get_tables_to_cluster(MemoryContext cluster_context);
  75 static void reform_and_rewrite_tuple(HeapTuple tuple,
  76                                                  TupleDesc oldTupDesc, TupleDesc newTupDesc,
  77                                                  Datum *values, bool *isnull,
  78                                                  bool newRelHasOids, RewriteState rwstate);
  79
  80
  81 /*---------------------------------------------------------------------------
  82  * This cluster code allows for clustering multiple tables at once. Because
  83  * of this, we cannot just run everything on a single transaction, or we
  84  * would be forced to acquire exclusive locks on all the tables being
  85  * clustered, simultaneously --- very likely leading to deadlock.
  86  *
  87  * To solve this we follow a similar strategy to VACUUM code,
  88  * clustering each relation in a separate transaction. For this to work,
  89  * we need to:
  90  *      - provide a separate memory context so that we can pass information in
  91  *        a way that survives across transactions
  92  *      - start a new transaction every time a new relation is clustered
  93  *      - check for validity of the information on to-be-clustered relations,
  94  *        as someone might have deleted a relation behind our back, or
  95  *        clustered one on a different index
  96  *      - end the transaction
  97  *
  98  * The single-relation case does not have any such overhead.
  99  *
 100  * We also allow a relation to be specified without index.  In that case,
 101  * the indisclustered bit will be looked up, and an ERROR will be thrown
 102  * if there is no index with the bit set.
 103  *---------------------------------------------------------------------------
 104  */
 105 void
 106 cluster(ClusterStmt *stmt, bool isTopLevel)
 107 {
 108         if (stmt->relation != NULL)
 109         {
 110                 /* This is the single-relation case. */
 111                 Oid                     tableOid,
 112                                         indexOid = InvalidOid;
 113                 Relation        rel;
 114
 115                 /* Find, lock, and check permissions on the table */
 116                 tableOid = RangeVarGetRelidExtended(stmt->relation,
 117                                                                                         AccessExclusiveLock,
 118                                                                                         0,
 119                                                                                         RangeVarCallbackOwnsTable, NULL);
 120                 rel = heap_open(tableOid, NoLock);
 121
 122                 /*
 123                  * Reject clustering a remote temp table ... their local buffer
 124                  * manager is not going to cope.
 125                  */
 126                 if (RELATION_IS_OTHER_TEMP(rel))
 127                         ereport(ERROR,
 128                                         (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 129                                          errmsg("cannot cluster temporary tables of other sessions")));
 130
 131                 /*
 132                  * Reject clustering a partitioned table.
 133                  */
 134                 if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
 135                         ereport(ERROR,
 136                                         (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 137                                          errmsg("cannot cluster a partitioned table")));
 138
 139                 if (stmt->indexname == NULL)
 140                 {
 141                         ListCell   *index;
 142
 143                         /* We need to find the index that has indisclustered set. */
 144                         foreach(index, RelationGetIndexList(rel))
 145                         {
 146                                 HeapTuple       idxtuple;
 147                                 Form_pg_index indexForm;
 148
 149                                 indexOid = lfirst_oid(index);
 150                                 idxtuple = SearchSysCache1(INDEXRELID,
 151                                                                                    ObjectIdGetDatum(indexOid));
 152                                 if (!HeapTupleIsValid(idxtuple))
 153                                         elog(ERROR, "cache lookup failed for index %u", indexOid);
 154                                 indexForm = (Form_pg_index) GETSTRUCT(idxtuple);
 155                                 if (indexForm->indisclustered)
 156                                 {
 157                                         ReleaseSysCache(idxtuple);
 158                                         break;
 159                                 }
 160                                 ReleaseSysCache(idxtuple);
 161                                 indexOid = InvalidOid;
 162                         }
 163
 164                         if (!OidIsValid(indexOid))
 165                                 ereport(ERROR,
 166                                                 (errcode(ERRCODE_UNDEFINED_OBJECT),
 167                                                  errmsg("there is no previously clustered index for table \"%s\"",
 168                                                                 stmt->relation->relname)));
 169                 }
 170                 else
 171                 {
 172                         /*
 173                          * The index is expected to be in the same namespace as the
 174                          * relation.
 175                          */
 176                         indexOid = get_relname_relid(stmt->indexname,
 177                                                                                  rel->rd_rel->relnamespace);
 178                         if (!OidIsValid(indexOid))
 179                                 ereport(ERROR,
 180                                                 (errcode(ERRCODE_UNDEFINED_OBJECT),
 181                                                  errmsg("index \"%s\" for table \"%s\" does not exist",
 182                                                                 stmt->indexname, stmt->relation->relname)));
 183                 }
 184
 185                 /* close relation, keep lock till commit */
 186                 heap_close(rel, NoLock);
 187
 188                 /* Do the job. */
 189                 cluster_rel(tableOid, indexOid, false, stmt->verbose);
 190         }
 191         else
 192         {
 193                 /*
 194                  * This is the "multi relation" case. We need to cluster all tables
 195                  * that have some index with indisclustered set.
 196                  */
 197                 MemoryContext cluster_context;
 198                 List       *rvs;
 199                 ListCell   *rv;
 200
 201                 /*
 202                  * We cannot run this form of CLUSTER inside a user transaction block;
 203                  * we'd be holding locks way too long.
 204                  */
 205                 PreventInTransactionBlock(isTopLevel, "CLUSTER");
 206
 207                 /*
 208                  * Create special memory context for cross-transaction storage.
 209                  *
 210                  * Since it is a child of PortalContext, it will go away even in case
 211                  * of error.
 212                  */
 213                 cluster_context = AllocSetContextCreate(PortalContext,
 214                                                                                                 "Cluster",
 215                                                                                                 ALLOCSET_DEFAULT_SIZES);
 216
 217                 /*
 218                  * Build the list of relations to cluster.  Note that this lives in
 219                  * cluster_context.
 220                  */
 221                 rvs = get_tables_to_cluster(cluster_context);
 222
 223                 /* Commit to get out of starting transaction */
 224                 PopActiveSnapshot();
 225                 CommitTransactionCommand();
 226
 227                 /* Ok, now that we've got them all, cluster them one by one */
 228                 foreach(rv, rvs)
 229                 {
 230                         RelToCluster *rvtc = (RelToCluster *) lfirst(rv);
 231
 232                         /* Start a new transaction for each relation. */
 233                         StartTransactionCommand();
 234                         /* functions in indexes may want a snapshot set */
 235                         PushActiveSnapshot(GetTransactionSnapshot());
 236                         /* Do the job. */
 237                         cluster_rel(rvtc->tableOid, rvtc->indexOid, true, stmt->verbose);
 238                         PopActiveSnapshot();
 239                         CommitTransactionCommand();
 240                 }
 241
 242                 /* Start a new transaction for the cleanup work. */
 243                 StartTransactionCommand();
 244
 245                 /* Clean up working storage */
 246                 MemoryContextDelete(cluster_context);
 247         }
 248 }
 249
 250 /*
 251  * cluster_rel
 252  *
 253  * This clusters the table by creating a new, clustered table and
 254  * swapping the relfilenodes of the new table and the old table, so
 255  * the OID of the original table is preserved.  Thus we do not lose
 256  * GRANT, inheritance nor references to this table (this was a bug
 257  * in releases through 7.3).
 258  *
 259  * Indexes are rebuilt too, via REINDEX. Since we are effectively bulk-loading
 260  * the new table, it's better to create the indexes afterwards than to fill
 261  * them incrementally while we load the table.
 262  *
 263  * If indexOid is InvalidOid, the table will be rewritten in physical order
 264  * instead of index order.  This is the new implementation of VACUUM FULL,
 265  * and error messages should refer to the operation as VACUUM not CLUSTER.
 266  */
 267 void
 268 cluster_rel(Oid tableOid, Oid indexOid, bool recheck, bool verbose)
 269 {
 270         Relation        OldHeap;
 271
 272         /* Check for user-requested abort. */
 273         CHECK_FOR_INTERRUPTS();
 274
 275         /*
 276          * We grab exclusive access to the target rel and index for the duration
 277          * of the transaction.  (This is redundant for the single-transaction
 278          * case, since cluster() already did it.)  The index lock is taken inside
 279          * check_index_is_clusterable.
 280          */
 281         OldHeap = try_relation_open(tableOid, AccessExclusiveLock);
 282
 283         /* If the table has gone away, we can skip processing it */
 284         if (!OldHeap)
 285                 return;
 286
 287         /*
 288          * Since we may open a new transaction for each relation, we have to check
 289          * that the relation still is what we think it is.
 290          *
 291          * If this is a single-transaction CLUSTER, we can skip these tests. We
 292          * *must* skip the one on indisclustered since it would reject an attempt
 293          * to cluster a not-previously-clustered index.
 294          */
 295         if (recheck)
 296         {
 297                 HeapTuple       tuple;
 298                 Form_pg_index indexForm;
 299
 300                 /* Check that the user still owns the relation */
 301                 if (!pg_class_ownercheck(tableOid, GetUserId()))
 302                 {
 303                         relation_close(OldHeap, AccessExclusiveLock);
 304                         return;
 305                 }
 306
 307                 /*
 308                  * Silently skip a temp table for a remote session.  Only doing this
 309                  * check in the "recheck" case is appropriate (which currently means
 310                  * somebody is executing a database-wide CLUSTER), because there is
 311                  * another check in cluster() which will stop any attempt to cluster
 312                  * remote temp tables by name.  There is another check in cluster_rel
 313                  * which is redundant, but we leave it for extra safety.
 314                  */
 315                 if (RELATION_IS_OTHER_TEMP(OldHeap))
 316                 {
 317                         relation_close(OldHeap, AccessExclusiveLock);
 318                         return;
 319                 }
 320
 321                 if (OidIsValid(indexOid))
 322                 {
 323                         /*
 324                          * Check that the index still exists
 325                          */
 326                         if (!SearchSysCacheExists1(RELOID, ObjectIdGetDatum(indexOid)))
 327                         {
 328                                 relation_close(OldHeap, AccessExclusiveLock);
 329                                 return;
 330                         }
 331
 332                         /*
 333                          * Check that the index is still the one with indisclustered set.
 334                          */
 335                         tuple = SearchSysCache1(INDEXRELID, ObjectIdGetDatum(indexOid));
 336                         if (!HeapTupleIsValid(tuple))   /* probably can't happen */
 337                         {
 338                                 relation_close(OldHeap, AccessExclusiveLock);
 339                                 return;
 340                         }
 341                         indexForm = (Form_pg_index) GETSTRUCT(tuple);
 342                         if (!indexForm->indisclustered)
 343                         {
 344                                 ReleaseSysCache(tuple);
 345                                 relation_close(OldHeap, AccessExclusiveLock);
 346                                 return;
 347                         }
 348                         ReleaseSysCache(tuple);
 349                 }
 350         }
 351
 352         /*
 353          * We allow VACUUM FULL, but not CLUSTER, on shared catalogs.  CLUSTER
 354          * would work in most respects, but the index would only get marked as
 355          * indisclustered in the current database, leading to unexpected behavior
 356          * if CLUSTER were later invoked in another database.
 357          */
 358         if (OidIsValid(indexOid) && OldHeap->rd_rel->relisshared)
 359                 ereport(ERROR,
 360                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 361                                  errmsg("cannot cluster a shared catalog")));
 362
 363         /*
 364          * Don't process temp tables of other backends ... their local buffer
 365          * manager is not going to cope.
 366          */
 367         if (RELATION_IS_OTHER_TEMP(OldHeap))
 368         {
 369                 if (OidIsValid(indexOid))
 370                         ereport(ERROR,
 371                                         (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 372                                          errmsg("cannot cluster temporary tables of other sessions")));
 373                 else
 374                         ereport(ERROR,
 375                                         (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 376                                          errmsg("cannot vacuum temporary tables of other sessions")));
 377         }
 378
 379         /*
 380          * Also check for active uses of the relation in the current transaction,
 381          * including open scans and pending AFTER trigger events.
 382          */
 383         CheckTableNotInUse(OldHeap, OidIsValid(indexOid) ? "CLUSTER" : "VACUUM");
 384
 385         /* Check heap and index are valid to cluster on */
 386         if (OidIsValid(indexOid))
 387                 check_index_is_clusterable(OldHeap, indexOid, recheck, AccessExclusiveLock);
 388
 389         /*
 390          * Quietly ignore the request if this is a materialized view which has not
 391          * been populated from its query. No harm is done because there is no data
 392          * to deal with, and we don't want to throw an error if this is part of a
 393          * multi-relation request -- for example, CLUSTER was run on the entire
 394          * database.
 395          */
 396         if (OldHeap->rd_rel->relkind == RELKIND_MATVIEW &&
 397                 !RelationIsPopulated(OldHeap))
 398         {
 399                 relation_close(OldHeap, AccessExclusiveLock);
 400                 return;
 401         }
 402
 403         /*
 404          * All predicate locks on the tuples or pages are about to be made
 405          * invalid, because we move tuples around.  Promote them to relation
 406          * locks.  Predicate locks on indexes will be promoted when they are
 407          * reindexed.
 408          */
 409         TransferPredicateLocksToHeapRelation(OldHeap);
 410
 411         /* rebuild_relation does all the dirty work */
 412         rebuild_relation(OldHeap, indexOid, verbose);
 413
 414         /* NB: rebuild_relation does heap_close() on OldHeap */
 415 }
 416
 417 /*
 418  * Verify that the specified heap and index are valid to cluster on
 419  *
 420  * Side effect: obtains lock on the index.  The caller may
 421  * in some cases already have AccessExclusiveLock on the table, but
 422  * not in all cases so we can't rely on the table-level lock for
 423  * protection here.
 424  */
 425 void
 426 check_index_is_clusterable(Relation OldHeap, Oid indexOid, bool recheck, LOCKMODE lockmode)
 427 {
 428         Relation        OldIndex;
 429
 430         OldIndex = index_open(indexOid, lockmode);
 431
 432         /*
 433          * Check that index is in fact an index on the given relation
 434          */
 435         if (OldIndex->rd_index == NULL ||
 436                 OldIndex->rd_index->indrelid != RelationGetRelid(OldHeap))
 437                 ereport(ERROR,
 438                                 (errcode(ERRCODE_WRONG_OBJECT_TYPE),
 439                                  errmsg("\"%s\" is not an index for table \"%s\"",
 440                                                 RelationGetRelationName(OldIndex),
 441                                                 RelationGetRelationName(OldHeap))));
 442
 443         /* Index AM must allow clustering */
 444         if (!OldIndex->rd_amroutine->amclusterable)
 445                 ereport(ERROR,
 446                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 447                                  errmsg("cannot cluster on index \"%s\" because access method does not support clustering",
 448                                                 RelationGetRelationName(OldIndex))));
 449
 450         /*
 451          * Disallow clustering on incomplete indexes (those that might not index
 452          * every row of the relation).  We could relax this by making a separate
 453          * seqscan pass over the table to copy the missing rows, but that seems
 454          * expensive and tedious.
 455          */
 456         if (!heap_attisnull(OldIndex->rd_indextuple, Anum_pg_index_indpred, NULL))
 457                 ereport(ERROR,
 458                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 459                                  errmsg("cannot cluster on partial index \"%s\"",
 460                                                 RelationGetRelationName(OldIndex))));
 461
 462         /*
 463          * Disallow if index is left over from a failed CREATE INDEX CONCURRENTLY;
 464          * it might well not contain entries for every heap row, or might not even
 465          * be internally consistent.  (But note that we don't check indcheckxmin;
 466          * the worst consequence of following broken HOT chains would be that we
 467          * might put recently-dead tuples out-of-order in the new table, and there
 468          * is little harm in that.)
 469          */
 470         if (!IndexIsValid(OldIndex->rd_index))
 471                 ereport(ERROR,
 472                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 473                                  errmsg("cannot cluster on invalid index \"%s\"",
 474                                                 RelationGetRelationName(OldIndex))));
 475
 476         /* Drop relcache refcnt on OldIndex, but keep lock */
 477         index_close(OldIndex, NoLock);
 478 }
 479
 480 /*
 481  * mark_index_clustered: mark the specified index as the one clustered on
 482  *
 483  * With indexOid == InvalidOid, will mark all indexes of rel not-clustered.
 484  */
 485 void
 486 mark_index_clustered(Relation rel, Oid indexOid, bool is_internal)
 487 {
 488         HeapTuple       indexTuple;
 489         Form_pg_index indexForm;
 490         Relation        pg_index;
 491         ListCell   *index;
 492
 493         /* Disallow applying to a partitioned table */
 494         if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
 495                 ereport(ERROR,
 496                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 497                                  errmsg("cannot mark index clustered in partitioned table")));
 498
 499         /*
 500          * If the index is already marked clustered, no need to do anything.
 501          */
 502         if (OidIsValid(indexOid))
 503         {
 504                 indexTuple = SearchSysCache1(INDEXRELID, ObjectIdGetDatum(indexOid));
 505                 if (!HeapTupleIsValid(indexTuple))
 506                         elog(ERROR, "cache lookup failed for index %u", indexOid);
 507                 indexForm = (Form_pg_index) GETSTRUCT(indexTuple);
 508
 509                 if (indexForm->indisclustered)
 510                 {
 511                         ReleaseSysCache(indexTuple);
 512                         return;
 513                 }
 514
 515                 ReleaseSysCache(indexTuple);
 516         }
 517
 518         /*
 519          * Check each index of the relation and set/clear the bit as needed.
 520          */
 521         pg_index = heap_open(IndexRelationId, RowExclusiveLock);
 522
 523         foreach(index, RelationGetIndexList(rel))
 524         {
 525                 Oid                     thisIndexOid = lfirst_oid(index);
 526
 527                 indexTuple = SearchSysCacheCopy1(INDEXRELID,
 528                                                                                  ObjectIdGetDatum(thisIndexOid));
 529                 if (!HeapTupleIsValid(indexTuple))
 530                         elog(ERROR, "cache lookup failed for index %u", thisIndexOid);
 531                 indexForm = (Form_pg_index) GETSTRUCT(indexTuple);
 532
 533                 /*
 534                  * Unset the bit if set.  We know it's wrong because we checked this
 535                  * earlier.
 536                  */
 537                 if (indexForm->indisclustered)
 538                 {
 539                         indexForm->indisclustered = false;
 540                         CatalogTupleUpdate(pg_index, &indexTuple->t_self, indexTuple);
 541                 }
 542                 else if (thisIndexOid == indexOid)
 543                 {
 544                         /* this was checked earlier, but let's be real sure */
 545                         if (!IndexIsValid(indexForm))
 546                                 elog(ERROR, "cannot cluster on invalid index %u", indexOid);
 547                         indexForm->indisclustered = true;
 548                         CatalogTupleUpdate(pg_index, &indexTuple->t_self, indexTuple);
 549                 }
 550
 551                 InvokeObjectPostAlterHookArg(IndexRelationId, thisIndexOid, 0,
 552                                                                          InvalidOid, is_internal);
 553
 554                 heap_freetuple(indexTuple);
 555         }
 556
 557         heap_close(pg_index, RowExclusiveLock);
 558 }
 559
 560 /*
 561  * rebuild_relation: rebuild an existing relation in index or physical order
 562  *
 563  * OldHeap: table to rebuild --- must be opened and exclusive-locked!
 564  * indexOid: index to cluster by, or InvalidOid to rewrite in physical order.
 565  *
 566  * NB: this routine closes OldHeap at the right time; caller should not.
 567  */
 568 static void
 569 rebuild_relation(Relation OldHeap, Oid indexOid, bool verbose)
 570 {
 571         Oid                     tableOid = RelationGetRelid(OldHeap);
 572         Oid                     tableSpace = OldHeap->rd_rel->reltablespace;
 573         Oid                     OIDNewHeap;
 574         char            relpersistence;
 575         bool            is_system_catalog;
 576         bool            swap_toast_by_content;
 577         TransactionId frozenXid;
 578         MultiXactId cutoffMulti;
 579
 580         /* Mark the correct index as clustered */
 581         if (OidIsValid(indexOid))
 582                 mark_index_clustered(OldHeap, indexOid, true);
 583
 584         /* Remember info about rel before closing OldHeap */
 585         relpersistence = OldHeap->rd_rel->relpersistence;
 586         is_system_catalog = IsSystemRelation(OldHeap);
 587
 588         /* Close relcache entry, but keep lock until transaction commit */
 589         heap_close(OldHeap, NoLock);
 590
 591         /* Create the transient table that will receive the re-ordered data */
 592         OIDNewHeap = make_new_heap(tableOid, tableSpace,
 593                                                            relpersistence,
 594                                                            AccessExclusiveLock);
 595
 596         /* Copy the heap data into the new table in the desired order */
 597         copy_heap_data(OIDNewHeap, tableOid, indexOid, verbose,
 598                                    &swap_toast_by_content, &frozenXid, &cutoffMulti);
 599
 600         /*
 601          * Swap the physical files of the target and transient tables, then
 602          * rebuild the target's indexes and throw away the transient table.
 603          */
 604         finish_heap_swap(tableOid, OIDNewHeap, is_system_catalog,
 605                                          swap_toast_by_content, false, true,
 606                                          frozenXid, cutoffMulti,
 607                                          relpersistence);
 608 }
 609
 610
 611 /*
 612  * Create the transient table that will be filled with new data during
 613  * CLUSTER, ALTER TABLE, and similar operations.  The transient table
 614  * duplicates the logical structure of the OldHeap, but is placed in
 615  * NewTableSpace which might be different from OldHeap's.  Also, it's built
 616  * with the specified persistence, which might differ from the original's.
 617  *
 618  * After this, the caller should load the new heap with transferred/modified
 619  * data, then call finish_heap_swap to complete the operation.
 620  */
 621 Oid
 622 make_new_heap(Oid OIDOldHeap, Oid NewTableSpace, char relpersistence,
 623                           LOCKMODE lockmode)
 624 {
 625         TupleDesc       OldHeapDesc;
 626         char            NewHeapName[NAMEDATALEN];
 627         Oid                     OIDNewHeap;
 628         Oid                     toastid;
 629         Relation        OldHeap;
 630         HeapTuple       tuple;
 631         Datum           reloptions;
 632         bool            isNull;
 633         Oid                     namespaceid;
 634
 635         OldHeap = heap_open(OIDOldHeap, lockmode);
 636         OldHeapDesc = RelationGetDescr(OldHeap);
 637
 638         /*
 639          * Note that the NewHeap will not receive any of the defaults or
 640          * constraints associated with the OldHeap; we don't need 'em, and there's
 641          * no reason to spend cycles inserting them into the catalogs only to
 642          * delete them.
 643          */
 644
 645         /*
 646          * But we do want to use reloptions of the old heap for new heap.
 647          */
 648         tuple = SearchSysCache1(RELOID, ObjectIdGetDatum(OIDOldHeap));
 649         if (!HeapTupleIsValid(tuple))
 650                 elog(ERROR, "cache lookup failed for relation %u", OIDOldHeap);
 651         reloptions = SysCacheGetAttr(RELOID, tuple, Anum_pg_class_reloptions,
 652                                                                  &isNull);
 653         if (isNull)
 654                 reloptions = (Datum) 0;
 655
 656         if (relpersistence == RELPERSISTENCE_TEMP)
 657                 namespaceid = LookupCreationNamespace("pg_temp");
 658         else
 659                 namespaceid = RelationGetNamespace(OldHeap);
 660
 661         /*
 662          * Create the new heap, using a temporary name in the same namespace as
 663          * the existing table.  NOTE: there is some risk of collision with user
 664          * relnames.  Working around this seems more trouble than it's worth; in
 665          * particular, we can't create the new heap in a different namespace from
 666          * the old, or we will have problems with the TEMP status of temp tables.
 667          *
 668          * Note: the new heap is not a shared relation, even if we are rebuilding
 669          * a shared rel.  However, we do make the new heap mapped if the source is
 670          * mapped.  This simplifies swap_relation_files, and is absolutely
 671          * necessary for rebuilding pg_class, for reasons explained there.
 672          */
 673         snprintf(NewHeapName, sizeof(NewHeapName), "pg_temp_%u", OIDOldHeap);
 674
 675         OIDNewHeap = heap_create_with_catalog(NewHeapName,
 676                                                                                   namespaceid,
 677                                                                                   NewTableSpace,
 678                                                                                   InvalidOid,
 679                                                                                   InvalidOid,
 680                                                                                   InvalidOid,
 681                                                                                   OldHeap->rd_rel->relowner,
 682                                                                                   OldHeapDesc,
 683                                                                                   NIL,
 684                                                                                   RELKIND_RELATION,
 685                                                                                   relpersistence,
 686                                                                                   false,
 687                                                                                   RelationIsMapped(OldHeap),
 688                                                                                   true,
 689                                                                                   0,
 690                                                                                   ONCOMMIT_NOOP,
 691                                                                                   reloptions,
 692                                                                                   false,
 693                                                                                   true,
 694                                                                                   true,
 695                                                                                   OIDOldHeap,
 696                                                                                   NULL);
 697         Assert(OIDNewHeap != InvalidOid);
 698
 699         ReleaseSysCache(tuple);
 700
 701         /*
 702          * Advance command counter so that the newly-created relation's catalog
 703          * tuples will be visible to heap_open.
 704          */
 705         CommandCounterIncrement();
 706
 707         /*
 708          * If necessary, create a TOAST table for the new relation.
 709          *
 710          * If the relation doesn't have a TOAST table already, we can't need one
 711          * for the new relation.  The other way around is possible though: if some
 712          * wide columns have been dropped, NewHeapCreateToastTable can decide that
 713          * no TOAST table is needed for the new table.
 714          *
 715          * Note that NewHeapCreateToastTable ends with CommandCounterIncrement, so
 716          * that the TOAST table will be visible for insertion.
 717          */
 718         toastid = OldHeap->rd_rel->reltoastrelid;
 719         if (OidIsValid(toastid))
 720         {
 721                 /* keep the existing toast table's reloptions, if any */
 722                 tuple = SearchSysCache1(RELOID, ObjectIdGetDatum(toastid));
 723                 if (!HeapTupleIsValid(tuple))
 724                         elog(ERROR, "cache lookup failed for relation %u", toastid);
 725                 reloptions = SysCacheGetAttr(RELOID, tuple, Anum_pg_class_reloptions,
 726                                                                          &isNull);
 727                 if (isNull)
 728                         reloptions = (Datum) 0;
 729
 730                 NewHeapCreateToastTable(OIDNewHeap, reloptions, lockmode);
 731
 732                 ReleaseSysCache(tuple);
 733         }
 734
 735         heap_close(OldHeap, NoLock);
 736
 737         return OIDNewHeap;
 738 }
 739
 740 /*
 741  * Do the physical copying of heap data.
 742  *
      * Every tuple of the heap OIDOldHeap is classified with
      * HeapTupleSatisfiesVacuum; everything that is not HEAPTUPLE_DEAD is written
      * into the heap OIDNewHeap through the heap rewrite module, while dead
      * tuples are only reported to it via rewrite_heap_dead_tuple.
      *
      * If OIDOldIndex is valid, the old heap is read either with an index scan on
      * that index or, for a btree index when plan_cluster_use_sort says so, with
      * a sequential scan whose output is fed through a tuplesort.  If
      * OIDOldIndex is invalid, a plain sequential scan is used.
      *
      * verbose selects the level of the progress messages: INFO when true,
      * DEBUG2 otherwise.
      *
      * Before returning, relpages and reltuples of the new heap's pg_class row
      * are set from the number of blocks in the new heap and the number of
      * tuples copied (the row itself is not updated when the old heap is
      * pg_class), and the command counter is incremented.
      *
 743  * There are three output parameters:
 744  * *pSwapToastByContent is set true if toast tables must be swapped by content.
 745  * *pFreezeXid receives the TransactionId used as freeze cutoff point.
 746  * *pCutoffMulti receives the MultiXactId used as a cutoff point.
 747  */
 748 static void
 749 copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, bool verbose,
 750                            bool *pSwapToastByContent, TransactionId *pFreezeXid,
 751                            MultiXactId *pCutoffMulti)
 752 {
 753         Relation        NewHeap,
 754                                 OldHeap,
 755                                 OldIndex;
 756         Relation        relRelation;
 757         HeapTuple       reltup;
 758         Form_pg_class relform;
 759         TupleDesc       oldTupDesc;
 760         TupleDesc       newTupDesc;
 761         int                     natts;
 762         Datum      *values;
 763         bool       *isnull;
 764         IndexScanDesc indexScan;
 765         HeapScanDesc heapScan;
 766         bool            use_wal;
 767         bool            is_system_catalog;
 768         TransactionId OldestXmin;
 769         TransactionId FreezeXid;
 770         MultiXactId MultiXactCutoff;
 771         RewriteState rwstate;
 772         bool            use_sort;
 773         Tuplesortstate *tuplesort;
 774         double          num_tuples = 0,
 775                                 tups_vacuumed = 0,
 776                                 tups_recently_dead = 0;
 777         BlockNumber num_pages;
 778         int                     elevel = verbose ? INFO : DEBUG2;
 779         PGRUsage        ru0;
 780
             /* Start resource accounting; shown in the final log message below */
 781         pg_rusage_init(&ru0);
 782
 783         /*
 784          * Open the relations we need.
              *
              * OldIndex is left NULL when no valid index OID was supplied; later
              * code uses that to tell the index-ordered case from the plain one.
 785          */
 786         NewHeap = heap_open(OIDNewHeap, AccessExclusiveLock);
 787         OldHeap = heap_open(OIDOldHeap, AccessExclusiveLock);
 788         if (OidIsValid(OIDOldIndex))
 789                 OldIndex = index_open(OIDOldIndex, AccessExclusiveLock);
 790         else
 791                 OldIndex = NULL;
 792
 793         /*
 794          * Their tuple descriptors should be exactly alike, but here we only need
 795          * assume that they have the same number of columns.
 796          */
 797         oldTupDesc = RelationGetDescr(OldHeap);
 798         newTupDesc = RelationGetDescr(NewHeap);
 799         Assert(newTupDesc->natts == oldTupDesc->natts);
 800
 801         /* Preallocate values/isnull arrays */
             /* (one slot per column; handed to reform_and_rewrite_tuple per tuple) */
 802         natts = newTupDesc->natts;
 803         values = (Datum *) palloc(natts * sizeof(Datum));
 804         isnull = (bool *) palloc(natts * sizeof(bool));
 805
 806         /*
 807          * If the OldHeap has a toast table, get lock on the toast table to keep
 808          * it from being vacuumed.  This is needed because autovacuum processes
 809          * toast tables independently of their main tables, with no lock on the
 810          * latter.  If an autovacuum were to start on the toast table after we
 811          * compute our OldestXmin below, it would use a later OldestXmin, and then
 812          * possibly remove as DEAD toast tuples belonging to main tuples we think
 813          * are only RECENTLY_DEAD.  Then we'd fail while trying to copy those
 814          * tuples.
 815          *
 816          * We don't need to open the toast relation here, just lock it.  The lock
 817          * will be held till end of transaction.
 818          */
 819         if (OldHeap->rd_rel->reltoastrelid)
 820                 LockRelationOid(OldHeap->rd_rel->reltoastrelid, AccessExclusiveLock);
 821
 822         /*
 823          * We need to log the copied data in WAL iff WAL archiving/streaming is
 824          * enabled AND it's a WAL-logged rel.
 825          */
 826         use_wal = XLogIsNeeded() && RelationNeedsWAL(NewHeap);
 827
 828         /* use_wal off requires smgr_targblock be initially invalid */
 829         Assert(RelationGetTargetBlock(NewHeap) == InvalidBlockNumber);
 830
 831         /*
 832          * If both tables have TOAST tables, perform toast swap by content.  It is
 833          * possible that the old table has a toast table but the new one doesn't,
 834          * if toastable columns have been dropped.  In that case we have to do
 835          * swap by links.  This is okay because swap by content is only essential
 836          * for system catalogs, and we don't support schema changes for them.
 837          */
 838         if (OldHeap->rd_rel->reltoastrelid && NewHeap->rd_rel->reltoastrelid)
 839         {
 840                 *pSwapToastByContent = true;
 841
 842                 /*
 843                  * When doing swap by content, any toast pointers written into NewHeap
 844                  * must use the old toast table's OID, because that's where the toast
 845                  * data will eventually be found.  Set this up by setting rd_toastoid.
 846                  * This also tells toast_save_datum() to preserve the toast value
 847                  * OIDs, which we want so as not to invalidate toast pointers in
 848                  * system catalog caches, and to avoid making multiple copies of a
 849                  * single toast value.
 850                  *
 851                  * Note that we must hold NewHeap open until we are done writing data,
 852                  * since the relcache will not guarantee to remember this setting once
 853                  * the relation is closed.  Also, this technique depends on the fact
 854                  * that no one will try to read from the NewHeap until after we've
 855                  * finished writing it and swapping the rels --- otherwise they could
 856                  * follow the toast pointers to the wrong place.  (It would actually
 857                  * work for values copied over from the old toast table, but not for
 858                  * any values that we toast which were previously not toasted.)
 859                  */
 860                 NewHeap->rd_toastoid = OldHeap->rd_rel->reltoastrelid;
 861         }
 862         else
 863                 *pSwapToastByContent = false;
 864
 865         /*
 866          * Compute xids used to freeze and weed out dead tuples and multixacts.
 867          * Since we're going to rewrite the whole table anyway, there's no reason
 868          * not to be aggressive about this.
 869          */
 870         vacuum_set_xid_limits(OldHeap, 0, 0, 0, 0,
 871                                                   &OldestXmin, &FreezeXid, NULL, &MultiXactCutoff,
 872                                                   NULL);
 873
 874         /*
 875          * FreezeXid will become the table's new relfrozenxid, and that mustn't go
 876          * backwards, so take the max.
 877          */
 878         if (TransactionIdPrecedes(FreezeXid, OldHeap->rd_rel->relfrozenxid))
 879                 FreezeXid = OldHeap->rd_rel->relfrozenxid;
 880
 881         /*
 882          * MultiXactCutoff, similarly, shouldn't go backwards either.
 883          */
 884         if (MultiXactIdPrecedes(MultiXactCutoff, OldHeap->rd_rel->relminmxid))
 885                 MultiXactCutoff = OldHeap->rd_rel->relminmxid;
 886
 887         /* return selected values to caller */
 888         *pFreezeXid = FreezeXid;
 889         *pCutoffMulti = MultiXactCutoff;
 890
 891         /* Remember if it's a system catalog */
             /* (only used to suppress the "concurrent ... in progress" warnings) */
 892         is_system_catalog = IsSystemRelation(OldHeap);
 893
 894         /* Initialize the rewrite operation */
 895         rwstate = begin_heap_rewrite(OldHeap, NewHeap, OldestXmin, FreezeXid,
 896                                                                  MultiXactCutoff, use_wal);
 897
 898         /*
 899          * Decide whether to use an indexscan or seqscan-and-optional-sort to scan
 900          * the OldHeap.  We know how to use a sort to duplicate the ordering of a
 901          * btree index, and will use seqscan-and-sort for that case if the planner
 902          * tells us it's cheaper.  Otherwise, always indexscan if an index is
 903          * provided, else plain seqscan.
 904          */
 905         if (OldIndex != NULL && OldIndex->rd_rel->relam == BTREE_AM_OID)
 906                 use_sort = plan_cluster_use_sort(OIDOldHeap, OIDOldIndex);
 907         else
 908                 use_sort = false;
 909
 910         /* Set up sorting if wanted */
 911         if (use_sort)
 912                 tuplesort = tuplesort_begin_cluster(oldTupDesc, OldIndex,
 913                                                                                         maintenance_work_mem,
 914                                                                                         NULL, false);
 915         else
 916                 tuplesort = NULL;
 917
 918         /*
 919          * Prepare to scan the OldHeap.  To ensure we see recently-dead tuples
 920          * that still need to be copied, we scan with SnapshotAny and use
 921          * HeapTupleSatisfiesVacuum for the visibility test.
              *
              * Exactly one of indexScan and heapScan is non-NULL after this point.
 922          */
 923         if (OldIndex != NULL && !use_sort)
 924         {
 925                 heapScan = NULL;
 926                 indexScan = index_beginscan(OldHeap, OldIndex, SnapshotAny, 0, 0);
 927                 index_rescan(indexScan, NULL, 0, NULL, 0);
 928         }
 929         else
 930         {
 931                 heapScan = heap_beginscan(OldHeap, SnapshotAny, 0, (ScanKey) NULL);
 932                 indexScan = NULL;
 933         }
 934
 935         /* Log what we're doing */
 936         if (indexScan != NULL)
 937                 ereport(elevel,
 938                                 (errmsg("clustering \"%s.%s\" using index scan on \"%s\"",
 939                                                 get_namespace_name(RelationGetNamespace(OldHeap)),
 940                                                 RelationGetRelationName(OldHeap),
 941                                                 RelationGetRelationName(OldIndex))));
 942         else if (tuplesort != NULL)
 943                 ereport(elevel,
 944                                 (errmsg("clustering \"%s.%s\" using sequential scan and sort",
 945                                                 get_namespace_name(RelationGetNamespace(OldHeap)),
 946                                                 RelationGetRelationName(OldHeap))));
 947         else
 948                 ereport(elevel,
 949                                 (errmsg("vacuuming \"%s.%s\"",
 950                                                 get_namespace_name(RelationGetNamespace(OldHeap)),
 951                                                 RelationGetRelationName(OldHeap))));
 952
 953         /*
 954          * Scan through the OldHeap, either in OldIndex order or sequentially;
 955          * copy each tuple into the NewHeap, or transiently to the tuplesort
 956          * module.  Note that we don't bother sorting dead tuples (they won't get
 957          * to the new table anyway).
 958          */
 959         for (;;)
 960         {
 961                 HeapTuple       tuple;
 962                 Buffer          buf;
 963                 bool            isdead;
 964
 965                 CHECK_FOR_INTERRUPTS();
 966
                     /* Fetch the next tuple and the buffer it lives in; NULL ends the scan */
 967                 if (indexScan != NULL)
 968                 {
 969                         tuple = index_getnext(indexScan, ForwardScanDirection);
 970                         if (tuple == NULL)
 971                                 break;
 972
 973                         /* Since we used no scan keys, should never need to recheck */
 974                         if (indexScan->xs_recheck)
 975                                 elog(ERROR, "CLUSTER does not support lossy index conditions");
 976
 977                         buf = indexScan->xs_cbuf;
 978                 }
 979                 else
 980                 {
 981                         tuple = heap_getnext(heapScan, ForwardScanDirection);
 982                         if (tuple == NULL)
 983                                 break;
 984
 985                         buf = heapScan->rs_cbuf;
 986                 }
 987
                     /* The visibility test below runs with the buffer share-locked */
 988                 LockBuffer(buf, BUFFER_LOCK_SHARE);
 989
 990                 switch (HeapTupleSatisfiesVacuum(tuple, OldestXmin, buf))
 991                 {
 992                         case HEAPTUPLE_DEAD:
 993                                 /* Definitely dead */
 994                                 isdead = true;
 995                                 break;
 996                         case HEAPTUPLE_RECENTLY_DEAD:
 997                                 tups_recently_dead += 1;
 998                                 /* fall through */
 999                         case HEAPTUPLE_LIVE:
1000                                 /* Live or recently dead, must copy it */
1001                                 isdead = false;
1002                                 break;
1003                         case HEAPTUPLE_INSERT_IN_PROGRESS:
1004
1005                                 /*
1006                                  * Since we hold exclusive lock on the relation, normally the
1007                                  * only way to see this is if it was inserted earlier in our
1008                                  * own transaction.  However, it can happen in system
1009                                  * catalogs, since we tend to release write lock before commit
1010                                  * there.  Give a warning if neither case applies; but in any
1011                                  * case we had better copy it.
1012                                  */
1013                                 if (!is_system_catalog &&
1014                                         !TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmin(tuple->t_data)))
1015                                         elog(WARNING, "concurrent insert in progress within table \"%s\"",
1016                                                  RelationGetRelationName(OldHeap));
1017                                 /* treat as live */
1018                                 isdead = false;
1019                                 break;
1020                         case HEAPTUPLE_DELETE_IN_PROGRESS:
1021
1022                                 /*
1023                                  * Similar situation to INSERT_IN_PROGRESS case.
1024                                  */
1025                                 if (!is_system_catalog &&
1026                                         !TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetUpdateXid(tuple->t_data)))
1027                                         elog(WARNING, "concurrent delete in progress within table \"%s\"",
1028                                                  RelationGetRelationName(OldHeap));
1029                                 /* treat as recently dead */
1030                                 tups_recently_dead += 1;
1031                                 isdead = false;
1032                                 break;
1033                         default:
1034                                 elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
1035                                 isdead = false; /* keep compiler quiet */
1036                                 break;
1037                 }
1038
1039                 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
1040
1041                 if (isdead)
1042                 {
1043                         tups_vacuumed += 1;
1044                         /* heap rewrite module still needs to see it... */
1045                         if (rewrite_heap_dead_tuple(rwstate, tuple))
1046                         {
1047                                 /* A previous recently-dead tuple is now known dead */
1048                                 tups_vacuumed += 1;
1049                                 tups_recently_dead -= 1;
1050                         }
1051                         continue;
1052                 }
1053
                     /*
                      * num_tuples counts every tuple we keep, which includes the
                      * recently-dead and in-progress ones classified above.  The tuple
                      * goes to the tuplesort in sort mode, else straight to the new heap.
                      */
1054                 num_tuples += 1;
1055                 if (tuplesort != NULL)
1056                         tuplesort_putheaptuple(tuplesort, tuple);
1057                 else
1058                         reform_and_rewrite_tuple(tuple,
1059                                                                          oldTupDesc, newTupDesc,
1060                                                                          values, isnull,
1061                                                                          NewHeap->rd_rel->relhasoids, rwstate);
1062         }
1063
             /* End whichever scan was started above */
1064         if (indexScan != NULL)
1065                 index_endscan(indexScan);
1066         if (heapScan != NULL)
1067                 heap_endscan(heapScan);
1068
1069         /*
1070          * In scan-and-sort mode, complete the sort, then read out all the tuples
1071          * from the tuplesort and write them to the new relation.
1072          */
1073         if (tuplesort != NULL)
1074         {
1075                 tuplesort_performsort(tuplesort);
1076
1077                 for (;;)
1078                 {
1079                         HeapTuple       tuple;
1080
1081                         CHECK_FOR_INTERRUPTS();
1082
1083                         tuple = tuplesort_getheaptuple(tuplesort, true);
1084                         if (tuple == NULL)
1085                                 break;
1086
1087                         reform_and_rewrite_tuple(tuple,
1088                                                                          oldTupDesc, newTupDesc,
1089                                                                          values, isnull,
1090                                                                          NewHeap->rd_rel->relhasoids, rwstate);
1091                 }
1092
1093                 tuplesort_end(tuplesort);
1094         }
1095
1096         /* Write out any remaining tuples, and fsync if needed */
1097         end_heap_rewrite(rwstate);
1098
1099         /* Reset rd_toastoid just to be tidy --- it shouldn't be looked at again */
1100         NewHeap->rd_toastoid = InvalidOid;
1101
             /* Block count of the rewritten heap; stored as relpages further down */
1102         num_pages = RelationGetNumberOfBlocks(NewHeap);
1103
1104         /* Log what we did */
             /* (the page count in this message is OldHeap's, not num_pages) */
1105         ereport(elevel,
1106                         (errmsg("\"%s\": found %.0f removable, %.0f nonremovable row versions in %u pages",
1107                                         RelationGetRelationName(OldHeap),
1108                                         tups_vacuumed, num_tuples,
1109                                         RelationGetNumberOfBlocks(OldHeap)),
1110                          errdetail("%.0f dead row versions cannot be removed yet.\n"
1111                                            "%s.",
1112                                            tups_recently_dead,
1113                                            pg_rusage_show(&ru0))));
1114
1115         /* Clean up */
1116         pfree(values);
1117         pfree(isnull);
1118
             /* NoLock is passed to each close, so no lock release is requested here */
1119         if (OldIndex != NULL)
1120                 index_close(OldIndex, NoLock);
1121         heap_close(OldHeap, NoLock);
1122         heap_close(NewHeap, NoLock);
1123
1124         /* Update pg_class to reflect the correct values of pages and tuples. */
1125         relRelation = heap_open(RelationRelationId, RowExclusiveLock);
1126
             /* Work on a modifiable copy of the new heap's pg_class tuple */
1127         reltup = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(OIDNewHeap));
1128         if (!HeapTupleIsValid(reltup))
1129                 elog(ERROR, "cache lookup failed for relation %u", OIDNewHeap);
1130         relform = (Form_pg_class) GETSTRUCT(reltup);
1131
1132         relform->relpages = num_pages;
1133         relform->reltuples = num_tuples;
1134
1135         /* Don't update the stats for pg_class.  See swap_relation_files. */
             /* (in that case CacheInvalidateRelcacheByTuple is called instead) */
1136         if (OIDOldHeap != RelationRelationId)
1137                 CatalogTupleUpdate(relRelation, &reltup->t_self, reltup);
1138         else
1139                 CacheInvalidateRelcacheByTuple(reltup);
1140
1141         /* Clean up. */
1142         heap_freetuple(reltup);
1143         heap_close(relRelation, RowExclusiveLock);
1144
1145         /* Make the update visible */
1146         CommandCounterIncrement();
1147 }
1148
1149 /*
1150  * Swap the physical files of two given relations.
1151  *
1152  * We swap the physical identity (reltablespace, relfilenode) while keeping the
1153  * same logical identities of the two relations.  relpersistence is also
1154  * swapped, which is critical since it determines where buffers live for each
1155  * relation.
1156  *
1157  * We can swap associated TOAST data in either of two ways: recursively swap
1158  * the physical content of the toast tables (and their indexes), or swap the
1159  * TOAST links in the given relations' pg_class entries.  The former is needed
1160  * to manage rewrites of shared catalogs (where we cannot change the pg_class
1161  * links) while the latter is the only way to handle cases in which a toast
1162  * table is added or removed altogether.
1163  *
1164  * Additionally, the first relation is marked with relfrozenxid set to
1165  * frozenXid.  It seems a bit ugly to have this here, but the caller would
1166  * have to do it anyway, so having it here saves a heap_update.  Note: in
1167  * the swap-toast-links case, we assume we don't need to change the toast
1168  * table's relfrozenxid: the new version of the toast table should already
1169  * have relfrozenxid set to RecentXmin, which is good enough.
1170  *
1171  * Lastly, if r2 and its toast table and toast index (if any) are mapped,
1172  * their OIDs are emitted into mapped_tables[].  This is hacky but beats
1173  * having to look the information up again later in finish_heap_swap.
1174  */
1175 static void
1176 swap_relation_files(Oid r1, Oid r2, bool target_is_pg_class,
1177                                         bool swap_toast_by_content,
1178                                         bool is_internal,
1179                                         TransactionId frozenXid,
1180                                         MultiXactId cutoffMulti,
1181                                         Oid *mapped_tables)
1182 {
1183         Relation        relRelation;
1184         HeapTuple       reltup1,
1185                                 reltup2;
1186         Form_pg_class relform1,
1187                                 relform2;
1188         Oid                     relfilenode1,
1189                                 relfilenode2;
1190         Oid                     swaptemp;
1191         char            swptmpchr;
1192
1193         /* We need writable copies of both pg_class tuples. */
1194         relRelation = heap_open(RelationRelationId, RowExclusiveLock);
1195
1196         reltup1 = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(r1));
1197         if (!HeapTupleIsValid(reltup1))
1198                 elog(ERROR, "cache lookup failed for relation %u", r1);
1199         relform1 = (Form_pg_class) GETSTRUCT(reltup1);
1200
1201         reltup2 = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(r2));
1202         if (!HeapTupleIsValid(reltup2))
1203                 elog(ERROR, "cache lookup failed for relation %u", r2);
1204         relform2 = (Form_pg_class) GETSTRUCT(reltup2);
1205
1206         relfilenode1 = relform1->relfilenode;
1207         relfilenode2 = relform2->relfilenode;
1208
1209         if (OidIsValid(relfilenode1) && OidIsValid(relfilenode2))
1210         {
1211                 /*
1212                  * Normal non-mapped relations: swap relfilenodes, reltablespaces,
1213                  * relpersistence
1214                  */
1215                 Assert(!target_is_pg_class);
1216
1217                 swaptemp = relform1->relfilenode;
1218                 relform1->relfilenode = relform2->relfilenode;
1219                 relform2->relfilenode = swaptemp;
1220
1221                 swaptemp = relform1->reltablespace;
1222                 relform1->reltablespace = relform2->reltablespace;
1223                 relform2->reltablespace = swaptemp;
1224
1225                 swptmpchr = relform1->relpersistence;
1226                 relform1->relpersistence = relform2->relpersistence;
1227                 relform2->relpersistence = swptmpchr;
1228
1229                 /* Also swap toast links, if we're swapping by links */
1230                 if (!swap_toast_by_content)
1231                 {
1232                         swaptemp = relform1->reltoastrelid;
1233                         relform1->reltoastrelid = relform2->reltoastrelid;
1234                         relform2->reltoastrelid = swaptemp;
1235                 }
1236         }
1237         else
1238         {
1239                 /*
1240                  * Mapped-relation case.  Here we have to swap the relation mappings
1241                  * instead of modifying the pg_class columns.  Both must be mapped.
1242                  */
1243                 if (OidIsValid(relfilenode1) || OidIsValid(relfilenode2))
1244                         elog(ERROR, "cannot swap mapped relation \"%s\" with non-mapped relation",
1245                                  NameStr(relform1->relname));
1246
1247                 /*
1248                  * We can't change the tablespace nor persistence of a mapped rel, and
1249                  * we can't handle toast link swapping for one either, because we must
1250                  * not apply any critical changes to its pg_class row.  These cases
1251                  * should be prevented by upstream permissions tests, so these checks
1252                  * are non-user-facing emergency backstop.
1253                  */
1254                 if (relform1->reltablespace != relform2->reltablespace)
1255                         elog(ERROR, "cannot change tablespace of mapped relation \"%s\"",
1256                                  NameStr(relform1->relname));
1257                 if (relform1->relpersistence != relform2->relpersistence)
1258                         elog(ERROR, "cannot change persistence of mapped relation \"%s\"",
1259                                  NameStr(relform1->relname));
1260                 if (!swap_toast_by_content &&
1261                         (relform1->reltoastrelid || relform2->reltoastrelid))
1262                         elog(ERROR, "cannot swap toast by links for mapped relation \"%s\"",
1263                                  NameStr(relform1->relname));
1264
1265                 /*
1266                  * Fetch the mappings --- shouldn't fail, but be paranoid
1267                  */
1268                 relfilenode1 = RelationMapOidToFilenode(r1, relform1->relisshared);
1269                 if (!OidIsValid(relfilenode1))
1270                         elog(ERROR, "could not find relation mapping for relation \"%s\", OID %u",
1271                                  NameStr(relform1->relname), r1);
1272                 relfilenode2 = RelationMapOidToFilenode(r2, relform2->relisshared);
1273                 if (!OidIsValid(relfilenode2))
1274                         elog(ERROR, "could not find relation mapping for relation \"%s\", OID %u",
1275                                  NameStr(relform2->relname), r2);
1276
1277                 /*
1278                  * Send replacement mappings to relmapper.  Note these won't actually
1279                  * take effect until CommandCounterIncrement.
1280                  */
1281                 RelationMapUpdateMap(r1, relfilenode2, relform1->relisshared, false);
1282                 RelationMapUpdateMap(r2, relfilenode1, relform2->relisshared, false);
1283
1284                 /* Pass OIDs of mapped r2 tables back to caller */
1285                 *mapped_tables++ = r2;
1286         }
1287
1288         /*
1289          * In the case of a shared catalog, these next few steps will only affect
1290          * our own database's pg_class row; but that's okay, because they are all
1291          * noncritical updates.  That's also an important fact for the case of a
1292          * mapped catalog, because it's possible that we'll commit the map change
1293          * and then fail to commit the pg_class update.
1294          */
1295
1296         /* set rel1's frozen Xid and minimum MultiXid */
1297         if (relform1->relkind != RELKIND_INDEX)
1298         {
1299                 Assert(TransactionIdIsNormal(frozenXid));
1300                 relform1->relfrozenxid = frozenXid;
1301                 Assert(MultiXactIdIsValid(cutoffMulti));
1302                 relform1->relminmxid = cutoffMulti;
1303         }
1304
1305         /* swap size statistics too, since new rel has freshly-updated stats */
1306         {
1307                 int32           swap_pages;
1308                 float4          swap_tuples;
1309                 int32           swap_allvisible;
1310
1311                 swap_pages = relform1->relpages;
1312                 relform1->relpages = relform2->relpages;
1313                 relform2->relpages = swap_pages;
1314
1315                 swap_tuples = relform1->reltuples;
1316                 relform1->reltuples = relform2->reltuples;
1317                 relform2->reltuples = swap_tuples;
1318
1319                 swap_allvisible = relform1->relallvisible;
1320                 relform1->relallvisible = relform2->relallvisible;
1321                 relform2->relallvisible = swap_allvisible;
1322         }
1323
1324         /*
1325          * Update the tuples in pg_class --- unless the target relation of the
1326          * swap is pg_class itself.  In that case, there is zero point in making
1327          * changes because we'd be updating the old data that we're about to throw
1328          * away.  Because the real work being done here for a mapped relation is
1329          * just to change the relation map settings, it's all right to not update
1330          * the pg_class rows in this case. The most important changes will instead
1331          * be performed later, in finish_heap_swap() itself.
1332          */
1333         if (!target_is_pg_class)
1334         {
1335                 CatalogIndexState indstate;
1336
1337                 indstate = CatalogOpenIndexes(relRelation);
1338                 CatalogTupleUpdateWithInfo(relRelation, &reltup1->t_self, reltup1,
1339                                                                    indstate);
1340                 CatalogTupleUpdateWithInfo(relRelation, &reltup2->t_self, reltup2,
1341                                                                    indstate);
1342                 CatalogCloseIndexes(indstate);
1343         }
1344         else
1345         {
1346                 /* no update ... but we do still need relcache inval */
1347                 CacheInvalidateRelcacheByTuple(reltup1);
1348                 CacheInvalidateRelcacheByTuple(reltup2);
1349         }
1350
1351         /*
1352          * Post alter hook for modified relations. The change to r2 is always
1353          * internal, but r1 depends on the invocation context.
1354          */
1355         InvokeObjectPostAlterHookArg(RelationRelationId, r1, 0,
1356                                                                  InvalidOid, is_internal);
1357         InvokeObjectPostAlterHookArg(RelationRelationId, r2, 0,
1358                                                                  InvalidOid, true);
1359
1360         /*
1361          * If we have toast tables associated with the relations being swapped,
1362          * deal with them too.
1363          */
1364         if (relform1->reltoastrelid || relform2->reltoastrelid)
1365         {
1366                 if (swap_toast_by_content)
1367                 {
1368                         if (relform1->reltoastrelid && relform2->reltoastrelid)
1369                         {
1370                                 /* Recursively swap the contents of the toast tables */
1371                                 swap_relation_files(relform1->reltoastrelid,
1372                                                                         relform2->reltoastrelid,
1373                                                                         target_is_pg_class,
1374                                                                         swap_toast_by_content,
1375                                                                         is_internal,
1376                                                                         frozenXid,
1377                                                                         cutoffMulti,
1378                                                                         mapped_tables);
1379                         }
1380                         else
1381                         {
1382                                 /* caller messed up */
1383                                 elog(ERROR, "cannot swap toast files by content when there's only one");
1384                         }
1385                 }
1386                 else
1387                 {
1388                         /*
1389                          * We swapped the ownership links, so we need to change dependency
1390                          * data to match.
1391                          *
1392                          * NOTE: it is possible that only one table has a toast table.
1393                          *
1394                          * NOTE: at present, a TOAST table's only dependency is the one on
1395                          * its owning table.  If more are ever created, we'd need to use
1396                          * something more selective than deleteDependencyRecordsFor() to
1397                          * get rid of just the link we want.
1398                          */
1399                         ObjectAddress baseobject,
1400                                                 toastobject;
1401                         long            count;
1402
1403                         /*
1404                          * We disallow this case for system catalogs, to avoid the
1405                          * possibility that the catalog we're rebuilding is one of the
1406                          * ones the dependency changes would change.  It's too late to be
1407                          * making any data changes to the target catalog.
1408                          */
1409                         if (IsSystemClass(r1, relform1))
1410                                 elog(ERROR, "cannot swap toast files by links for system catalogs");
1411
1412                         /* Delete old dependencies */
1413                         if (relform1->reltoastrelid)
1414                         {
1415                                 count = deleteDependencyRecordsFor(RelationRelationId,
1416                                                                                                    relform1->reltoastrelid,
1417                                                                                                    false);
1418                                 if (count != 1)
1419                                         elog(ERROR, "expected one dependency record for TOAST table, found %ld",
1420                                                  count);
1421                         }
1422                         if (relform2->reltoastrelid)
1423                         {
1424                                 count = deleteDependencyRecordsFor(RelationRelationId,
1425                                                                                                    relform2->reltoastrelid,
1426                                                                                                    false);
1427                                 if (count != 1)
1428                                         elog(ERROR, "expected one dependency record for TOAST table, found %ld",
1429                                                  count);
1430                         }
1431
1432                         /* Register new dependencies */
1433                         baseobject.classId = RelationRelationId;
1434                         baseobject.objectSubId = 0;
1435                         toastobject.classId = RelationRelationId;
1436                         toastobject.objectSubId = 0;
1437
1438                         if (relform1->reltoastrelid)
1439                         {
1440                                 baseobject.objectId = r1;
1441                                 toastobject.objectId = relform1->reltoastrelid;
1442                                 recordDependencyOn(&toastobject, &baseobject,
1443                                                                    DEPENDENCY_INTERNAL);
1444                         }
1445
1446                         if (relform2->reltoastrelid)
1447                         {
1448                                 baseobject.objectId = r2;
1449                                 toastobject.objectId = relform2->reltoastrelid;
1450                                 recordDependencyOn(&toastobject, &baseobject,
1451                                                                    DEPENDENCY_INTERNAL);
1452                         }
1453                 }
1454         }
1455
1456         /*
1457          * If we're swapping two toast tables by content, do the same for their
1458          * valid index. The swap can actually be safely done only if the relations
1459          * have indexes.
1460          */
1461         if (swap_toast_by_content &&
1462                 relform1->relkind == RELKIND_TOASTVALUE &&
1463                 relform2->relkind == RELKIND_TOASTVALUE)
1464         {
1465                 Oid                     toastIndex1,
1466                                         toastIndex2;
1467
1468                 /* Get valid index for each relation */
1469                 toastIndex1 = toast_get_valid_index(r1,
1470                                                                                         AccessExclusiveLock);
1471                 toastIndex2 = toast_get_valid_index(r2,
1472                                                                                         AccessExclusiveLock);
1473
1474                 swap_relation_files(toastIndex1,
1475                                                         toastIndex2,
1476                                                         target_is_pg_class,
1477                                                         swap_toast_by_content,
1478                                                         is_internal,
1479                                                         InvalidTransactionId,
1480                                                         InvalidMultiXactId,
1481                                                         mapped_tables);
1482         }
1483
1484         /* Clean up. */
1485         heap_freetuple(reltup1);
1486         heap_freetuple(reltup2);
1487
1488         heap_close(relRelation, RowExclusiveLock);
1489
1490         /*
1491          * Close both relcache entries' smgr links.  We need this kluge because
1492          * both links will be invalidated during upcoming CommandCounterIncrement.
1493          * Whichever of the rels is the second to be cleared will have a dangling
1494          * reference to the other's smgr entry.  Rather than trying to avoid this
1495          * by ordering operations just so, it's easiest to close the links first.
1496          * (Fortunately, since one of the entries is local in our transaction,
1497          * it's sufficient to clear out our own relcache this way; the problem
1498          * cannot arise for other backends when they see our update on the
1499          * non-transient relation.)
1500          *
1501          * Caution: the placement of this step interacts with the decision to
1502          * handle toast rels by recursion.  When we are trying to rebuild pg_class
1503          * itself, the smgr close on pg_class must happen after all accesses in
1504          * this function.
1505          */
1506         RelationCloseSmgrByOid(r1);
1507         RelationCloseSmgrByOid(r2);
1508 }
1509
1510 /*
1511  * Remove the transient table that was built by make_new_heap, and finish
1512  * cleaning up (including rebuilding all indexes on the old heap).
1513  */
1514 void
1515 finish_heap_swap(Oid OIDOldHeap, Oid OIDNewHeap,
1516                                  bool is_system_catalog,
1517                                  bool swap_toast_by_content,
1518                                  bool check_constraints,
1519                                  bool is_internal,
1520                                  TransactionId frozenXid,
1521                                  MultiXactId cutoffMulti,
1522                                  char newrelpersistence)
1523 {
1524         ObjectAddress object;
1525         Oid                     mapped_tables[4];
1526         int                     reindex_flags;
1527         int                     i;
1528
1529         /* Zero out possible results from swapped_relation_files */
1530         memset(mapped_tables, 0, sizeof(mapped_tables));
1531
1532         /*
1533          * Swap the contents of the heap relations (including any toast tables).
1534          * Also set old heap's relfrozenxid to frozenXid.
1535          */
1536         swap_relation_files(OIDOldHeap, OIDNewHeap,
1537                                                 (OIDOldHeap == RelationRelationId),
1538                                                 swap_toast_by_content, is_internal,
1539                                                 frozenXid, cutoffMulti, mapped_tables);
1540
1541         /*
1542          * If it's a system catalog, queue a sinval message to flush all catcaches
1543          * on the catalog when we reach CommandCounterIncrement.
1544          */
1545         if (is_system_catalog)
1546                 CacheInvalidateCatalog(OIDOldHeap);
1547
1548         /*
1549          * Rebuild each index on the relation (but not the toast table, which is
1550          * all-new at this point).  It is important to do this before the DROP
1551          * step because if we are processing a system catalog that will be used
1552          * during DROP, we want to have its indexes available.  There is no
1553          * advantage to the other order anyway because this is all transactional,
1554          * so no chance to reclaim disk space before commit.  We do not need a
1555          * final CommandCounterIncrement() because reindex_relation does it.
1556          *
1557          * Note: because index_build is called via reindex_relation, it will never
1558          * set indcheckxmin true for the indexes.  This is OK even though in some
1559          * sense we are building new indexes rather than rebuilding existing ones,
1560          * because the new heap won't contain any HOT chains at all, let alone
1561          * broken ones, so it can't be necessary to set indcheckxmin.
1562          */
1563         reindex_flags = REINDEX_REL_SUPPRESS_INDEX_USE;
1564         if (check_constraints)
1565                 reindex_flags |= REINDEX_REL_CHECK_CONSTRAINTS;
1566
1567         /*
1568          * Ensure that the indexes have the same persistence as the parent
1569          * relation.
1570          */
1571         if (newrelpersistence == RELPERSISTENCE_UNLOGGED)
1572                 reindex_flags |= REINDEX_REL_FORCE_INDEXES_UNLOGGED;
1573         else if (newrelpersistence == RELPERSISTENCE_PERMANENT)
1574                 reindex_flags |= REINDEX_REL_FORCE_INDEXES_PERMANENT;
1575
1576         reindex_relation(OIDOldHeap, reindex_flags, 0);
1577
1578         /*
1579          * If the relation being rebuild is pg_class, swap_relation_files()
1580          * couldn't update pg_class's own pg_class entry (check comments in
1581          * swap_relation_files()), thus relfrozenxid was not updated. That's
1582          * annoying because a potential reason for doing a VACUUM FULL is a
1583          * imminent or actual anti-wraparound shutdown.  So, now that we can
1584          * access the new relation using its indices, update relfrozenxid.
1585          * pg_class doesn't have a toast relation, so we don't need to update the
1586          * corresponding toast relation. Not that there's little point moving all
1587          * relfrozenxid updates here since swap_relation_files() needs to write to
1588          * pg_class for non-mapped relations anyway.
1589          */
1590         if (OIDOldHeap == RelationRelationId)
1591         {
1592                 Relation        relRelation;
1593                 HeapTuple       reltup;
1594                 Form_pg_class relform;
1595
1596                 relRelation = heap_open(RelationRelationId, RowExclusiveLock);
1597
1598                 reltup = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(OIDOldHeap));
1599                 if (!HeapTupleIsValid(reltup))
1600                         elog(ERROR, "cache lookup failed for relation %u", OIDOldHeap);
1601                 relform = (Form_pg_class) GETSTRUCT(reltup);
1602
1603                 relform->relfrozenxid = frozenXid;
1604                 relform->relminmxid = cutoffMulti;
1605
1606                 CatalogTupleUpdate(relRelation, &reltup->t_self, reltup);
1607
1608                 heap_close(relRelation, RowExclusiveLock);
1609         }
1610
1611         /* Destroy new heap with old filenode */
1612         object.classId = RelationRelationId;
1613         object.objectId = OIDNewHeap;
1614         object.objectSubId = 0;
1615
1616         /*
1617          * The new relation is local to our transaction and we know nothing
1618          * depends on it, so DROP_RESTRICT should be OK.
1619          */
1620         performDeletion(&object, DROP_RESTRICT, PERFORM_DELETION_INTERNAL);
1621
1622         /* performDeletion does CommandCounterIncrement at end */
1623
1624         /*
1625          * Now we must remove any relation mapping entries that we set up for the
1626          * transient table, as well as its toast table and toast index if any. If
1627          * we fail to do this before commit, the relmapper will complain about new
1628          * permanent map entries being added post-bootstrap.
1629          */
1630         for (i = 0; OidIsValid(mapped_tables[i]); i++)
1631                 RelationMapRemoveMapping(mapped_tables[i]);
1632
1633         /*
1634          * At this point, everything is kosher except that, if we did toast swap
1635          * by links, the toast table's name corresponds to the transient table.
1636          * The name is irrelevant to the backend because it's referenced by OID,
1637          * but users looking at the catalogs could be confused.  Rename it to
1638          * prevent this problem.
1639          *
1640          * Note no lock required on the relation, because we already hold an
1641          * exclusive lock on it.
1642          */
1643         if (!swap_toast_by_content)
1644         {
1645                 Relation        newrel;
1646
1647                 newrel = heap_open(OIDOldHeap, NoLock);
1648                 if (OidIsValid(newrel->rd_rel->reltoastrelid))
1649                 {
1650                         Oid                     toastidx;
1651                         char            NewToastName[NAMEDATALEN];
1652
1653                         /* Get the associated valid index to be renamed */
1654                         toastidx = toast_get_valid_index(newrel->rd_rel->reltoastrelid,
1655                                                                                          AccessShareLock);
1656
1657                         /* rename the toast table ... */
1658                         snprintf(NewToastName, NAMEDATALEN, "pg_toast_%u",
1659                                          OIDOldHeap);
1660                         RenameRelationInternal(newrel->rd_rel->reltoastrelid,
1661                                                                    NewToastName, true);
1662
1663                         /* ... and its valid index too. */
1664                         snprintf(NewToastName, NAMEDATALEN, "pg_toast_%u_index",
1665                                          OIDOldHeap);
1666
1667                         RenameRelationInternal(toastidx,
1668                                                                    NewToastName, true);
1669                 }
1670                 relation_close(newrel, NoLock);
1671         }
1672
1673         /* if it's not a catalog table, clear any missing attribute settings */
1674         if (!is_system_catalog)
1675         {
1676                 Relation        newrel;
1677
1678                 newrel = heap_open(OIDOldHeap, NoLock);
1679                 RelationClearMissing(newrel);
1680                 relation_close(newrel, NoLock);
1681         }
1682 }
1683
1684
1685 /*
1686  * Get a list of tables that the current user owns and
1687  * have indisclustered set.  Return the list in a List * of rvsToCluster
1688  * with the tableOid and the indexOid on which the table is already
1689  * clustered.
1690  */
1691 static List *
1692 get_tables_to_cluster(MemoryContext cluster_context)
1693 {
1694         Relation        indRelation;
1695         HeapScanDesc scan;
1696         ScanKeyData entry;
1697         HeapTuple       indexTuple;
1698         Form_pg_index index;
1699         MemoryContext old_context;
1700         RelToCluster *rvtc;
1701         List       *rvs = NIL;
1702
1703         /*
1704          * Get all indexes that have indisclustered set and are owned by
1705          * appropriate user. System relations or nailed-in relations cannot ever
1706          * have indisclustered set, because CLUSTER will refuse to set it when
1707          * called with one of them as argument.
1708          */
1709         indRelation = heap_open(IndexRelationId, AccessShareLock);
1710         ScanKeyInit(&entry,
1711                                 Anum_pg_index_indisclustered,
1712                                 BTEqualStrategyNumber, F_BOOLEQ,
1713                                 BoolGetDatum(true));
1714         scan = heap_beginscan_catalog(indRelation, 1, &entry);
1715         while ((indexTuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
1716         {
1717                 index = (Form_pg_index) GETSTRUCT(indexTuple);
1718
1719                 if (!pg_class_ownercheck(index->indrelid, GetUserId()))
1720                         continue;
1721
1722                 /*
1723                  * We have to build the list in a different memory context so it will
1724                  * survive the cross-transaction processing
1725                  */
1726                 old_context = MemoryContextSwitchTo(cluster_context);
1727
1728                 rvtc = (RelToCluster *) palloc(sizeof(RelToCluster));
1729                 rvtc->tableOid = index->indrelid;
1730                 rvtc->indexOid = index->indexrelid;
1731                 rvs = lcons(rvtc, rvs);
1732
1733                 MemoryContextSwitchTo(old_context);
1734         }
1735         heap_endscan(scan);
1736
1737         relation_close(indRelation, AccessShareLock);
1738
1739         return rvs;
1740 }
1741
1742
1743 /*
1744  * Reconstruct and rewrite the given tuple
1745  *
1746  * We cannot simply copy the tuple as-is, for several reasons:
1747  *
1748  * 1. We'd like to squeeze out the values of any dropped columns, both
1749  * to save space and to ensure we have no corner-case failures. (It's
1750  * possible for example that the new table hasn't got a TOAST table
1751  * and so is unable to store any large values of dropped cols.)
1752  *
1753  * 2. The tuple might not even be legal for the new table; this is
1754  * currently only known to happen as an after-effect of ALTER TABLE
1755  * SET WITHOUT OIDS.
1756  *
1757  * So, we must reconstruct the tuple from component Datums.
1758  */
1759 static void
1760 reform_and_rewrite_tuple(HeapTuple tuple,
1761                                                  TupleDesc oldTupDesc, TupleDesc newTupDesc,
1762                                                  Datum *values, bool *isnull,
1763                                                  bool newRelHasOids, RewriteState rwstate)
1764 {
1765         HeapTuple       copiedTuple;
1766         int                     i;
1767
1768         heap_deform_tuple(tuple, oldTupDesc, values, isnull);
1769
1770         /* Be sure to null out any dropped columns */
1771         for (i = 0; i < newTupDesc->natts; i++)
1772         {
1773                 if (TupleDescAttr(newTupDesc, i)->attisdropped)
1774                         isnull[i] = true;
1775         }
1776
1777         copiedTuple = heap_form_tuple(newTupDesc, values, isnull);
1778
1779         /* Preserve OID, if any */
1780         if (newRelHasOids)
1781                 HeapTupleSetOid(copiedTuple, HeapTupleGetOid(tuple));
1782
1783         /* The heap rewrite module does the rest */
1784         rewrite_heap_tuple(rwstate, tuple, copiedTuple);
1785
1786         heap_freetuple(copiedTuple);
1787 }