tableam: relation creation, VACUUM FULL/CLUSTER, SET TABLESPACE.

author Andres Freund <andres@anarazel.de>

Fri, 29 Mar 2019 03:01:14 +0000 (20:01 -0700)

committer Andres Freund <andres@anarazel.de>

Fri, 29 Mar 2019 03:01:43 +0000 (20:01 -0700)
author Andres Freund <andres@anarazel.de>
Fri, 29 Mar 2019 03:01:14 +0000 (20:01 -0700)
committer Andres Freund <andres@anarazel.de>
Fri, 29 Mar 2019 03:01:43 +0000 (20:01 -0700)
diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c

index 1e4394a665ba1bc3387b786934f674b687a52efa..581a6bd9d16d8e645ae78a99fd808ca89c33fc84 100644 (file)
--- a/src/backend/access/heap/heapam_handler.c
+++ b/src/backend/access/heap/heapam_handler.c
@@ -23,16 +23,32 @@
  
  #include "access/genam.h"
  #include "access/heapam.h"
+#include "access/multixact.h"
+#include "access/rewriteheap.h"
  #include "access/tableam.h"
  #include "access/xact.h"
  #include "catalog/catalog.h"
  #include "catalog/index.h"
+#include "catalog/storage.h"
+#include "catalog/storage_xlog.h"
+#include "commands/progress.h"
  #include "executor/executor.h"
+#include "pgstat.h"
  #include "storage/bufmgr.h"
  #include "storage/bufpage.h"
+#include "storage/bufmgr.h"
  #include "storage/lmgr.h"
+#include "storage/predicate.h"
  #include "storage/procarray.h"
+#include "storage/smgr.h"
  #include "utils/builtins.h"
+#include "utils/rel.h"
+
+
+/* Helper used by heapam_relation_copy_for_cluster(); defined further down. */
+static void
+reform_and_rewrite_tuple(HeapTuple tuple,
+                                                Relation OldHeap, Relation NewHeap,
+                                                Datum *values, bool *isnull, RewriteState rwstate);
  
  
  static const TableAmRoutine heapam_methods;
@@ -523,6 +539,388 @@ tuple_lock_retry:
   * ------------------------------------------------------------------------
   */
  
+/*
+ * Table AM callback: create the storage for rel's relfilenode and report the
+ * initial freeze horizons for it.
+ *
+ * rel         - relation whose rd_node identifies the storage to create
+ * persistence - persistence the new storage is created with
+ * freezeXid   - output, set to RecentXmin
+ * minmulti    - output, set to GetOldestMultiXactId()
+ *
+ * When the new storage is unlogged, an init fork is additionally created,
+ * WAL-logged and immediately synced.
+ */
+static void
+heapam_relation_set_new_filenode(Relation rel, char persistence,
+                                                                TransactionId *freezeXid,
+                                                                MultiXactId *minmulti)
+{
+       /*
+        * Initialize to the minimum XID that could put tuples in the table. We
+        * know that no xacts older than RecentXmin are still running, so that
+        * will do.
+        */
+       *freezeXid = RecentXmin;
+
+       /*
+        * Similarly, initialize the minimum Multixact to the first value that
+        * could possibly be stored in tuples in the table.  Running transactions
+        * could reuse values from their local cache, so we are careful to
+        * consider all currently running multis.
+        *
+        * XXX this could be refined further, but is it worth the hassle?
+        */
+       *minmulti = GetOldestMultiXactId();
+
+       RelationCreateStorage(rel->rd_node, persistence);
+
+       /*
+        * If required, set up an init fork for an unlogged table so that it can
+        * be correctly reinitialized on restart.  An immediate sync is required
+        * even if the page has been logged, because the write did not go through
+        * shared_buffers and therefore a concurrent checkpoint may have moved the
+        * redo pointer past our xlog record.  Recovery may as well remove it
+        * while replaying, for example, XLOG_DBASE_CREATE or XLOG_TBLSPC_CREATE
+        * record. Therefore, logging is necessary even if wal_level=minimal.
+        *
+        * Decide based on the persistence the storage was just created with,
+        * not on rel->rd_rel->relpersistence, so that the main fork and the init
+        * fork always agree even if the caller passes a persistence that differs
+        * from the relcache entry.
+        */
+       if (persistence == RELPERSISTENCE_UNLOGGED)
+       {
+               Assert(rel->rd_rel->relkind == RELKIND_RELATION ||
+                          rel->rd_rel->relkind == RELKIND_MATVIEW ||
+                          rel->rd_rel->relkind == RELKIND_TOASTVALUE);
+               RelationOpenSmgr(rel);
+               smgrcreate(rel->rd_smgr, INIT_FORKNUM, false);
+               log_smgrcreate(&rel->rd_smgr->smgr_rnode.node, INIT_FORKNUM);
+               smgrimmedsync(rel->rd_smgr, INIT_FORKNUM);
+       }
+}
+
+/*
+ * Table AM callback for non-transactional truncation: truncates the
+ * relation's storage by calling RelationTruncate(rel, 0).
+ */
+static void
+heapam_relation_nontransactional_truncate(Relation rel)
+{
+       RelationTruncate(rel, 0);
+}
+
+/*
+ * Table AM callback: copy rel's storage into the new relfilenode "newrnode".
+ *
+ * The target storage is created, the main fork and every other fork that
+ * exists on the source are copied, and finally the old storage is scheduled
+ * for removal.
+ */
+static void
+heapam_relation_copy_data(Relation rel, RelFileNode newrnode)
+{
+       char            relpersistence = rel->rd_rel->relpersistence;
+       SMgrRelation target;
+       ForkNumber      forknum;
+
+       target = smgropen(newrnode, rel->rd_backend);
+       RelationOpenSmgr(rel);
+
+       /*
+        * Create the new storage first; a clash on the relfilenode value is
+        * detected inside RelationCreateStorage().
+        */
+       RelationCreateStorage(newrnode, relpersistence);
+
+       /* The main fork always exists, so copy it unconditionally */
+       RelationCopyStorage(rel->rd_smgr, target, MAIN_FORKNUM, relpersistence);
+
+       /* Remaining forks are copied only if present on the source */
+       for (forknum = MAIN_FORKNUM + 1; forknum <= MAX_FORKNUM; forknum++)
+       {
+               bool            log_create;
+
+               if (!smgrexists(rel->rd_smgr, forknum))
+                       continue;
+
+               smgrcreate(target, forknum, false);
+
+               /*
+                * Fork creation is WAL-logged for permanent relations, and for the
+                * init fork of an unlogged relation.
+                */
+               log_create = (relpersistence == RELPERSISTENCE_PERMANENT) ||
+                       (relpersistence == RELPERSISTENCE_UNLOGGED &&
+                        forknum == INIT_FORKNUM);
+               if (log_create)
+                       log_smgrcreate(&newrnode, forknum);
+
+               RelationCopyStorage(rel->rd_smgr, target, forknum, relpersistence);
+       }
+
+       /* Schedule removal of the old files and release the target handle */
+       RelationDropStorage(rel);
+       smgrclose(target);
+}
+
+/*
+ * Table AM callback: copy the contents of OldHeap into NewHeap through the
+ * heap rewrite module (begin_heap_rewrite / rewrite_heap_tuple /
+ * end_heap_rewrite).
+ *
+ * Scan order:
+ *  - OldIndex != NULL and !use_sort: tuples are fetched via an index scan on
+ *    OldIndex and written as they are read;
+ *  - otherwise OldHeap is scanned sequentially; with use_sort the tuples are
+ *    first fed to a tuplesort and written out after the sort completes,
+ *    without use_sort they are written as they are read.
+ *
+ * OldestXmin is the cutoff passed to HeapTupleSatisfiesVacuum(); FreezeXid
+ * and MultiXactCutoff are handed to begin_heap_rewrite() unchanged.
+ *
+ * Output counters are only incremented/decremented here, never initialized
+ * (presumably the caller zeroes them -- confirm against the caller):
+ *  *num_tuples         - tuples not classified as dead, i.e. copied
+ *  *tups_vacuumed      - tuples found dead and therefore not copied
+ *  *tups_recently_dead - recently-dead or delete-in-progress tuples
+ */
+static void
+heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap,
+                                                                Relation OldIndex, bool use_sort,
+                                                                TransactionId OldestXmin,
+                                                                TransactionId FreezeXid,
+                                                                MultiXactId MultiXactCutoff,
+                                                                double *num_tuples,
+                                                                double *tups_vacuumed,
+                                                                double *tups_recently_dead)
+{
+       RewriteState rwstate;
+       IndexScanDesc indexScan;
+       TableScanDesc tableScan;
+       HeapScanDesc heapScan;
+       bool            use_wal;
+       bool            is_system_catalog;
+       Tuplesortstate *tuplesort;
+       TupleDesc       oldTupDesc = RelationGetDescr(OldHeap);
+       TupleDesc       newTupDesc = RelationGetDescr(NewHeap);
+       TupleTableSlot *slot;
+       int                     natts;
+       Datum      *values;
+       bool       *isnull;
+       BufferHeapTupleTableSlot *hslot;
+
+       /* Remember if it's a system catalog */
+       is_system_catalog = IsSystemRelation(OldHeap);
+
+       /*
+        * We need to log the copied data in WAL iff WAL archiving/streaming is
+        * enabled AND it's a WAL-logged rel.
+        */
+       use_wal = XLogIsNeeded() && RelationNeedsWAL(NewHeap);
+
+       /* use_wal off requires smgr_targblock be initially invalid */
+       Assert(RelationGetTargetBlock(NewHeap) == InvalidBlockNumber);
+
+       /* Preallocate values/isnull arrays */
+       natts = newTupDesc->natts;
+       values = (Datum *) palloc(natts * sizeof(Datum));
+       isnull = (bool *) palloc(natts * sizeof(bool));
+
+       /* Initialize the rewrite operation */
+       rwstate = begin_heap_rewrite(OldHeap, NewHeap, OldestXmin, FreezeXid,
+                                                                MultiXactCutoff, use_wal);
+
+
+       /* Set up sorting if wanted */
+       if (use_sort)
+               tuplesort = tuplesort_begin_cluster(oldTupDesc, OldIndex,
+                                                                                       maintenance_work_mem,
+                                                                                       NULL, false);
+       else
+               tuplesort = NULL;
+
+       /*
+        * Prepare to scan the OldHeap.  To ensure we see recently-dead tuples
+        * that still need to be copied, we scan with SnapshotAny and use
+        * HeapTupleSatisfiesVacuum for the visibility test.
+        */
+       if (OldIndex != NULL && !use_sort)
+       {
+               const int   ci_index[] = {
+                       PROGRESS_CLUSTER_PHASE,
+                       PROGRESS_CLUSTER_INDEX_RELID
+               };
+               int64       ci_val[2];
+
+               /* Report the phase and the OID of OldIndex in the progress columns */
+               ci_val[0] = PROGRESS_CLUSTER_PHASE_INDEX_SCAN_HEAP;
+               ci_val[1] = RelationGetRelid(OldIndex);
+               pgstat_progress_update_multi_param(2, ci_index, ci_val);
+
+               tableScan = NULL;
+               heapScan = NULL;
+               indexScan = index_beginscan(OldHeap, OldIndex, SnapshotAny, 0, 0);
+               index_rescan(indexScan, NULL, 0, NULL, 0);
+       }
+       else
+       {
+               /* In scan-and-sort mode and also VACUUM FULL, set phase */
+               pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE,
+                                                                        PROGRESS_CLUSTER_PHASE_SEQ_SCAN_HEAP);
+
+               tableScan = table_beginscan(OldHeap, SnapshotAny, 0, (ScanKey) NULL);
+               heapScan = (HeapScanDesc) tableScan;
+               indexScan = NULL;
+
+               /* Set total heap blocks */
+               pgstat_progress_update_param(PROGRESS_CLUSTER_TOTAL_HEAP_BLKS,
+                                                                        heapScan->rs_nblocks);
+       }
+
+       /* hslot is the same slot, viewed so that its buffer field can be read */
+       slot = table_slot_create(OldHeap, NULL);
+       hslot = (BufferHeapTupleTableSlot *) slot;
+
+       /*
+        * Scan through the OldHeap, either in OldIndex order or sequentially;
+        * copy each tuple into the NewHeap, or transiently to the tuplesort
+        * module.  Note that we don't bother sorting dead tuples (they won't get
+        * to the new table anyway).
+        */
+       for (;;)
+       {
+               HeapTuple       tuple;
+               Buffer          buf;
+               bool            isdead;
+
+               CHECK_FOR_INTERRUPTS();
+
+               if (indexScan != NULL)
+               {
+                       if (!index_getnext_slot(indexScan, ForwardScanDirection, slot))
+                               break;
+
+                       /* Since we used no scan keys, should never need to recheck */
+                       if (indexScan->xs_recheck)
+                               elog(ERROR, "CLUSTER does not support lossy index conditions");
+               }
+               else
+               {
+                       if (!table_scan_getnextslot(tableScan, ForwardScanDirection, slot))
+                               break;
+
+                       /* In scan-and-sort mode and also VACUUM FULL, set heap blocks scanned */
+                       pgstat_progress_update_param(PROGRESS_CLUSTER_HEAP_BLKS_SCANNED,
+                                                                                heapScan->rs_cblock + 1);
+               }
+
+               tuple = ExecFetchSlotHeapTuple(slot, false, NULL);
+               buf = hslot->buffer;
+
+               /* The visibility test below runs with the buffer share-locked */
+               LockBuffer(buf, BUFFER_LOCK_SHARE);
+
+               switch (HeapTupleSatisfiesVacuum(tuple, OldestXmin, buf))
+               {
+                       case HEAPTUPLE_DEAD:
+                               /* Definitely dead */
+                               isdead = true;
+                               break;
+                       case HEAPTUPLE_RECENTLY_DEAD:
+                               *tups_recently_dead += 1;
+                               /* fall through */
+                       case HEAPTUPLE_LIVE:
+                               /* Live or recently dead, must copy it */
+                               isdead = false;
+                               break;
+                       case HEAPTUPLE_INSERT_IN_PROGRESS:
+
+                               /*
+                                * Since we hold exclusive lock on the relation, normally the
+                                * only way to see this is if it was inserted earlier in our
+                                * own transaction.  However, it can happen in system
+                                * catalogs, since we tend to release write lock before commit
+                                * there.  Give a warning if neither case applies; but in any
+                                * case we had better copy it.
+                                */
+                               if (!is_system_catalog &&
+                                       !TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmin(tuple->t_data)))
+                                       elog(WARNING, "concurrent insert in progress within table \"%s\"",
+                                                RelationGetRelationName(OldHeap));
+                               /* treat as live */
+                               isdead = false;
+                               break;
+                       case HEAPTUPLE_DELETE_IN_PROGRESS:
+
+                               /*
+                                * Similar situation to INSERT_IN_PROGRESS case.
+                                */
+                               if (!is_system_catalog &&
+                                       !TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetUpdateXid(tuple->t_data)))
+                                       elog(WARNING, "concurrent delete in progress within table \"%s\"",
+                                                RelationGetRelationName(OldHeap));
+                               /* treat as recently dead */
+                               *tups_recently_dead += 1;
+                               isdead = false;
+                               break;
+                       default:
+                               elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
+                               isdead = false; /* keep compiler quiet */
+                               break;
+               }
+
+               LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+
+               if (isdead)
+               {
+                       *tups_vacuumed += 1;
+                       /* heap rewrite module still needs to see it... */
+                       if (rewrite_heap_dead_tuple(rwstate, tuple))
+                       {
+                               /* A previous recently-dead tuple is now known dead */
+                               *tups_vacuumed += 1;
+                               *tups_recently_dead -= 1;
+                       }
+                       continue;
+               }
+
+               *num_tuples += 1;
+               if (tuplesort != NULL)
+               {
+                       tuplesort_putheaptuple(tuplesort, tuple);
+
+                       /* In scan-and-sort mode, report increase in number of tuples scanned */
+                       pgstat_progress_update_param(PROGRESS_CLUSTER_HEAP_TUPLES_SCANNED,
+                                                                                *num_tuples);
+               }
+               else
+               {
+                       const int   ct_index[] = {
+                               PROGRESS_CLUSTER_HEAP_TUPLES_SCANNED,
+                               PROGRESS_CLUSTER_HEAP_TUPLES_WRITTEN
+                       };
+                       int64       ct_val[2];
+
+                       reform_and_rewrite_tuple(tuple, OldHeap, NewHeap,
+                                                                        values, isnull, rwstate);
+
+                       /*
+                        * In indexscan mode and also VACUUM FULL, report increase in
+                        * number of tuples scanned and written
+                        */
+                       ct_val[0] = *num_tuples;
+                       ct_val[1] = *num_tuples;
+                       pgstat_progress_update_multi_param(2, ct_index, ct_val);
+               }
+       }
+
+       if (indexScan != NULL)
+               index_endscan(indexScan);
+       if (tableScan != NULL)
+               table_endscan(tableScan);
+       if (slot)
+               ExecDropSingleTupleTableSlot(slot);
+
+       /*
+        * In scan-and-sort mode, complete the sort, then read out all live tuples
+        * from the tuplesort and write them to the new relation.
+        */
+       if (tuplesort != NULL)
+       {
+               double n_tuples = 0;
+               /* Report that we are now sorting tuples */
+               pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE,
+                                                                        PROGRESS_CLUSTER_PHASE_SORT_TUPLES);
+
+               tuplesort_performsort(tuplesort);
+
+               /* Report that we are now writing new heap */
+               pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE,
+                                                                        PROGRESS_CLUSTER_PHASE_WRITE_NEW_HEAP);
+
+               for (;;)
+               {
+                       HeapTuple       tuple;
+
+                       CHECK_FOR_INTERRUPTS();
+
+                       tuple = tuplesort_getheaptuple(tuplesort, true);
+                       if (tuple == NULL)
+                               break;
+
+                       n_tuples += 1;
+                       reform_and_rewrite_tuple(tuple,
+                                                                        OldHeap, NewHeap,
+                                                                        values, isnull,
+                                                                        rwstate);
+                       /* Report n_tuples */
+                       pgstat_progress_update_param(PROGRESS_CLUSTER_HEAP_TUPLES_WRITTEN,
+                                                                                n_tuples);
+               }
+
+               tuplesort_end(tuplesort);
+       }
+
+       /* Write out any remaining tuples, and fsync if needed */
+       end_heap_rewrite(rwstate);
+
+       /* Clean up */
+       pfree(values);
+       pfree(isnull);
+}
+
  static double
  heapam_index_build_range_scan(Relation heapRelation,
                                                           Relation indexRelation,
@@ -1256,6 +1654,55 @@ heapam_index_validate_scan(Relation heapRelation,
  }
  
  
+/* ----------------------------------------------------------------------------
+ *  Helper functions for the above.
+ * ----------------------------------------------------------------------------
+ */
+
+/*
+ * Rebuild "tuple" in the row format of NewHeap and pass it to the heap
+ * rewrite module.
+ *
+ * A byte-for-byte copy of the old tuple is not good enough:
+ *
+ * 1. Values of dropped columns should be squeezed out, both to save space
+ * and to avoid corner-case failures.  (For example, the new table might
+ * have no TOAST table and so be unable to store large values of dropped
+ * columns.)
+ *
+ * 2. The old tuple might not even be legal for the new table; currently
+ * this is only known to happen as an after-effect of ALTER TABLE SET
+ * WITHOUT OIDS.
+ *
+ * Hence the tuple is taken apart into Datums and formed again.  "values"
+ * and "isnull" are caller-provided work arrays that are overwritten here.
+ */
+static void
+reform_and_rewrite_tuple(HeapTuple tuple,
+                                                Relation OldHeap, Relation NewHeap,
+                                                Datum *values, bool *isnull, RewriteState rwstate)
+{
+       TupleDesc       srcDesc = RelationGetDescr(OldHeap);
+       TupleDesc       dstDesc = RelationGetDescr(NewHeap);
+       HeapTuple       rebuilt;
+
+       /* Split the old tuple into per-column values using the old layout */
+       heap_deform_tuple(tuple, srcDesc, values, isnull);
+
+       /* Columns dropped in the new table must be stored as NULL */
+       for (int attno = 0; attno < dstDesc->natts; attno++)
+       {
+               if (!TupleDescAttr(dstDesc, attno)->attisdropped)
+                       continue;
+               isnull[attno] = true;
+       }
+
+       /* Form the tuple in the new layout; the rewrite module does the rest */
+       rebuilt = heap_form_tuple(dstDesc, values, isnull);
+       rewrite_heap_tuple(rwstate, tuple, rebuilt);
+
+       heap_freetuple(rebuilt);
+}
+
+
  /* ------------------------------------------------------------------------
   * Definition of the heap table access method.
   * ------------------------------------------------------------------------
@@ -1292,6 +1739,10 @@ static const TableAmRoutine heapam_methods = {
         .tuple_satisfies_snapshot = heapam_tuple_satisfies_snapshot,
         .compute_xid_horizon_for_tuples = heap_compute_xid_horizon_for_tuples,
  
+       .relation_set_new_filenode = heapam_relation_set_new_filenode,
+       .relation_nontransactional_truncate = heapam_relation_nontransactional_truncate,
+       .relation_copy_data = heapam_relation_copy_data,
+       .relation_copy_for_cluster = heapam_relation_copy_for_cluster,
         .index_build_range_scan = heapam_index_build_range_scan,
         .index_validate_scan = heapam_index_validate_scan,
  };
diff --git a/src/backend/bootstrap/bootparse.y b/src/backend/bootstrap/bootparse.y

index fef6e7c3dc4ff2043f86ea623c93e5586704933a..6d7e11645d2f71d7fbe85c3f923ace4a61823f51 100644 (file)
--- a/src/backend/bootstrap/bootparse.y
+++ b/src/backend/bootstrap/bootparse.y
@@ -209,6 +209,9 @@ Boot_CreateStmt:
  
                                         if ($4)
                                         {
+                                               TransactionId relfrozenxid;
+                                               MultiXactId relminmxid;
+
                                                 if (boot_reldesc)
                                                 {
                                                         elog(DEBUG4, "create bootstrap: warning, open relation exists, closing first");
@@ -226,7 +229,9 @@ Boot_CreateStmt:
                                                                                                    RELPERSISTENCE_PERMANENT,
                                                                                                    shared_relation,
                                                                                                    mapped_relation,
-                                                                                                  true);
+                                                                                                  true,
+                                                                                                  &relfrozenxid,
+                                                                                                  &relminmxid);
                                                 elog(DEBUG4, "bootstrap relation created");
                                         }
                                         else
diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c

index fc682e0b5212b6482fc39e6de88da2576675050d..71ad8c43c32d757d01b49f34395d21355235995c 100644 (file)
--- a/src/backend/catalog/heap.c
+++ b/src/backend/catalog/heap.c
@@ -35,6 +35,7 @@
  #include "access/relation.h"
  #include "access/sysattr.h"
  #include "access/table.h"
+#include "access/tableam.h"
  #include "access/transam.h"
  #include "access/xact.h"
  #include "access/xlog.h"
@@ -98,6 +99,8 @@ static void AddNewRelationTuple(Relation pg_class_desc,
                                         Oid reloftype,
                                         Oid relowner,
                                         char relkind,
+                                       TransactionId relfrozenxid,
+                                       TransactionId relminmxid,
                                         Datum relacl,
                                         Datum reloptions);
  static ObjectAddress AddNewRelationType(const char *typeName,
@@ -300,7 +303,9 @@ heap_create(const char *relname,
                         char relpersistence,
                         bool shared_relation,
                         bool mapped_relation,
-                       bool allow_system_table_mods)
+                       bool allow_system_table_mods,
+                       TransactionId *relfrozenxid,
+                       MultiXactId *relminmxid)
  {
         bool            create_storage;
         Relation        rel;
@@ -327,6 +332,9 @@ heap_create(const char *relname,
                                                 get_namespace_name(relnamespace), relname),
                                  errdetail("System catalog modifications are currently disallowed.")));
  
+       *relfrozenxid = InvalidTransactionId;
+       *relminmxid = InvalidMultiXactId;
+
         /* Handle reltablespace for specific relkinds. */
         switch (relkind)
         {
@@ -400,13 +408,36 @@ heap_create(const char *relname,
         /*
          * Have the storage manager create the relation's disk file, if needed.
          *
-        * We only create the main fork here, other forks will be created on
-        * demand.
+        * For relations the callback creates both the main and the init fork, for
+        * indexes only the main fork is created. The other forks will be created
+        * on demand.
          */
         if (create_storage)
         {
                 RelationOpenSmgr(rel);
-               RelationCreateStorage(rel->rd_node, relpersistence);
+
+               switch (rel->rd_rel->relkind)
+               {
+                       case RELKIND_VIEW:
+                       case RELKIND_COMPOSITE_TYPE:
+                       case RELKIND_FOREIGN_TABLE:
+                       case RELKIND_PARTITIONED_TABLE:
+                       case RELKIND_PARTITIONED_INDEX:
+                               Assert(false);
+                               break;
+
+                       case RELKIND_INDEX:
+                       case RELKIND_SEQUENCE:
+                               RelationCreateStorage(rel->rd_node, relpersistence);
+                               break;
+
+                       case RELKIND_RELATION:
+                       case RELKIND_TOASTVALUE:
+                       case RELKIND_MATVIEW:
+                               table_relation_set_new_filenode(rel, relpersistence,
+                                                                          relfrozenxid, relminmxid);
+                               break;
+               }
         }
  
         return rel;
@@ -892,6 +923,8 @@ AddNewRelationTuple(Relation pg_class_desc,
                                         Oid reloftype,
                                         Oid relowner,
                                         char relkind,
+                                       TransactionId relfrozenxid,
+                                       TransactionId relminmxid,
                                         Datum relacl,
                                         Datum reloptions)
  {
@@ -928,40 +961,8 @@ AddNewRelationTuple(Relation pg_class_desc,
                         break;
         }
  
-       /* Initialize relfrozenxid and relminmxid */
-       if (relkind == RELKIND_RELATION ||
-               relkind == RELKIND_MATVIEW ||
-               relkind == RELKIND_TOASTVALUE)
-       {
-               /*
-                * Initialize to the minimum XID that could put tuples in the table.
-                * We know that no xacts older than RecentXmin are still running, so
-                * that will do.
-                */
-               new_rel_reltup->relfrozenxid = RecentXmin;
-
-               /*
-                * Similarly, initialize the minimum Multixact to the first value that
-                * could possibly be stored in tuples in the table.  Running
-                * transactions could reuse values from their local cache, so we are
-                * careful to consider all currently running multis.
-                *
-                * XXX this could be refined further, but is it worth the hassle?
-                */
-               new_rel_reltup->relminmxid = GetOldestMultiXactId();
-       }
-       else
-       {
-               /*
-                * Other relation types will not contain XIDs, so set relfrozenxid to
-                * InvalidTransactionId.  (Note: a sequence does contain a tuple, but
-                * we force its xmin to be FrozenTransactionId always; see
-                * commands/sequence.c.)
-                */
-               new_rel_reltup->relfrozenxid = InvalidTransactionId;
-               new_rel_reltup->relminmxid = InvalidMultiXactId;
-       }
-
+       new_rel_reltup->relfrozenxid = relfrozenxid;
+       new_rel_reltup->relminmxid = relminmxid;
         new_rel_reltup->relowner = relowner;
         new_rel_reltup->reltype = new_type_oid;
         new_rel_reltup->reloftype = reloftype;
@@ -1089,6 +1090,8 @@ heap_create_with_catalog(const char *relname,
         Oid                     new_type_oid;
         ObjectAddress new_type_addr;
         Oid                     new_array_oid = InvalidOid;
+       TransactionId relfrozenxid;
+       MultiXactId relminmxid;
  
         pg_class_desc = table_open(RelationRelationId, RowExclusiveLock);
  
@@ -1220,7 +1223,9 @@ heap_create_with_catalog(const char *relname,
                                                            relpersistence,
                                                            shared_relation,
                                                            mapped_relation,
-                                                          allow_system_table_mods);
+                                                          allow_system_table_mods,
+                                                          &relfrozenxid,
+                                                          &relminmxid);
  
         Assert(relid == RelationGetRelid(new_rel_desc));
  
@@ -1319,6 +1324,8 @@ heap_create_with_catalog(const char *relname,
                                                 reloftypeid,
                                                 ownerid,
                                                 relkind,
+                                               relfrozenxid,
+                                               relminmxid,
                                                 PointerGetDatum(relacl),
                                                 reloptions);
  
@@ -1407,14 +1414,6 @@ heap_create_with_catalog(const char *relname,
         if (oncommit != ONCOMMIT_NOOP)
                 register_on_commit_action(relid, oncommit);
  
-       /*
-        * Unlogged objects need an init fork, except for partitioned tables which
-        * have no storage at all.
-        */
-       if (relpersistence == RELPERSISTENCE_UNLOGGED &&
-               relkind != RELKIND_PARTITIONED_TABLE)
-               heap_create_init_fork(new_rel_desc);
-
         /*
          * ok, the relation has been cataloged, so close our relations and return
          * the OID of the newly created relation.
@@ -1425,27 +1424,6 @@ heap_create_with_catalog(const char *relname,
         return relid;
  }
  
-/*
- * Set up an init fork for an unlogged table so that it can be correctly
- * reinitialized on restart.  An immediate sync is required even if the
- * page has been logged, because the write did not go through
- * shared_buffers and therefore a concurrent checkpoint may have moved
- * the redo pointer past our xlog record.  Recovery may as well remove it
- * while replaying, for example, XLOG_DBASE_CREATE or XLOG_TBLSPC_CREATE
- * record. Therefore, logging is necessary even if wal_level=minimal.
- */
-void
-heap_create_init_fork(Relation rel)
-{
-       Assert(rel->rd_rel->relkind == RELKIND_RELATION ||
-                  rel->rd_rel->relkind == RELKIND_MATVIEW ||
-                  rel->rd_rel->relkind == RELKIND_TOASTVALUE);
-       RelationOpenSmgr(rel);
-       smgrcreate(rel->rd_smgr, INIT_FORKNUM, false);
-       log_smgrcreate(&rel->rd_smgr->smgr_rnode.node, INIT_FORKNUM);
-       smgrimmedsync(rel->rd_smgr, INIT_FORKNUM);
-}
-
  /*
   *             RelationRemoveInheritance
   *
@@ -3168,8 +3146,8 @@ heap_truncate_one_rel(Relation rel)
         if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
                 return;
  
-       /* Truncate the actual file (and discard buffers) */
-       RelationTruncate(rel, 0);
+       /* Truncate the underlying relation */
+       table_relation_nontransactional_truncate(rel);
  
         /* If the relation has indexes, truncate the indexes too */
         RelationTruncateIndexes(rel);
@@ -3180,7 +3158,7 @@ heap_truncate_one_rel(Relation rel)
         {
                 Relation        toastrel = table_open(toastrelid, AccessExclusiveLock);
  
-               RelationTruncate(toastrel, 0);
+               table_relation_nontransactional_truncate(toastrel);
                 RelationTruncateIndexes(toastrel);
                 /* keep the lock... */
                 table_close(toastrel, NoLock);
diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c

index 104a8cceb78e2f5952605681d988f7d31f0cdfab..337361a652291b7d73da348878bf4faca7305815 100644 (file)
--- a/src/backend/catalog/index.c
+++ b/src/backend/catalog/index.c
@@ -739,6 +739,8 @@ index_create(Relation heapRelation,
         bool            concurrent = (flags & INDEX_CREATE_CONCURRENT) != 0;
         bool            partitioned = (flags & INDEX_CREATE_PARTITIONED) != 0;
         char            relkind;
+       TransactionId relfrozenxid;
+       MultiXactId relminmxid;
  
         /* constraint flags can only be set when a constraint is requested */
         Assert((constr_flags == 0) ||
@@ -899,8 +901,12 @@ index_create(Relation heapRelation,
                                                                 relpersistence,
                                                                 shared_relation,
                                                                 mapped_relation,
-                                                               allow_system_table_mods);
+                                                               allow_system_table_mods,
+                                                               &relfrozenxid,
+                                                               &relminmxid);
  
+       Assert(relfrozenxid == InvalidTransactionId);
+       Assert(relminmxid == InvalidMultiXactId);
         Assert(indexRelationId == RelationGetRelid(indexRelation));
  
         /*
@@ -2850,8 +2856,7 @@ reindex_index(Oid indexId, bool skip_constraint_checks, char persistence,
                 }
  
                 /* We'll build a new physical relation for the index */
-               RelationSetNewRelfilenode(iRel, persistence, InvalidTransactionId,
-                                                                 InvalidMultiXactId);
+               RelationSetNewRelfilenode(iRel, persistence);
  
                 /* Initialize the index and rebuild */
                 /* Note: we do not need to re-establish pkey setting */
diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c

index 0302507e6ff5f44cb36f763ae0d0b792e2d9c0f7..72242b24761f87e4162af0b747306cb0d3283c9b 100644 (file)
--- a/src/backend/catalog/storage.c
+++ b/src/backend/catalog/storage.c
@@ -19,6 +19,8 @@
  
  #include "postgres.h"
  
+#include "miscadmin.h"
+
  #include "access/visibilitymap.h"
  #include "access/xact.h"
  #include "access/xlog.h"
@@ -290,6 +292,92 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
         smgrtruncate(rel->rd_smgr, MAIN_FORKNUM, nblocks);
  }
  
+/*
+ * Copy a fork's data, block by block.
+ */
+void
+RelationCopyStorage(SMgrRelation src, SMgrRelation dst,
+                                       ForkNumber forkNum, char relpersistence)
+{
+       PGAlignedBlock buf;
+       Page            page;
+       bool            use_wal;
+       bool            copying_initfork;
+       BlockNumber nblocks;
+       BlockNumber blkno;
+
+       page = (Page) buf.data;
+
+       /*
+        * The init fork for an unlogged relation in many respects has to be
+        * treated the same as a normal relation: changes need to be WAL-logged
+        * and it needs to be synced to disk.
+        */
+       copying_initfork = relpersistence == RELPERSISTENCE_UNLOGGED &&
+               forkNum == INIT_FORKNUM;
+
+       /*
+        * We need to log the copied data in WAL iff WAL archiving/streaming is
+        * enabled AND it's a permanent relation or an unlogged rel's init fork.
+        */
+       use_wal = XLogIsNeeded() &&
+               (relpersistence == RELPERSISTENCE_PERMANENT || copying_initfork);
+
+       nblocks = smgrnblocks(src, forkNum);
+
+       for (blkno = 0; blkno < nblocks; blkno++)
+       {
+               /* If we got a cancel signal during the copy of the data, quit */
+               CHECK_FOR_INTERRUPTS();
+
+               smgrread(src, forkNum, blkno, buf.data);
+
+               if (!PageIsVerified(page, blkno))
+                       ereport(ERROR,
+                                       (errcode(ERRCODE_DATA_CORRUPTED),
+                                        errmsg("invalid page in block %u of relation %s",
+                                                       blkno,
+                                                       relpathbackend(src->smgr_rnode.node,
+                                                                                  src->smgr_rnode.backend,
+                                                                                  forkNum))));
+
+               /*
+                * WAL-log the copied page. Unfortunately we don't know what kind of a
+                * page this is, so we have to log the full page including any unused
+                * space.
+                */
+               if (use_wal)
+                       log_newpage(&dst->smgr_rnode.node, forkNum, blkno, page, false);
+
+               PageSetChecksumInplace(page, blkno);
+
+               /*
+                * Now write the page.  We say isTemp = true even if it's not a temp
+                * rel, because there's no need for smgr to schedule an fsync for this
+                * write; we'll do it ourselves below.
+                */
+               smgrextend(dst, forkNum, blkno, buf.data, true);
+       }
+
+       /*
+        * If the rel is WAL-logged, must fsync before commit.  Only the fork
+        * just copied is synced here, via smgrimmedsync.  (For a temp or
+        * unlogged rel we don't care since the data will be gone after a crash
+        * anyway; an unlogged rel's init fork is the exception.)
+        *
+        * It's obvious that we must do this when not WAL-logging the copy. It's
+        * less obvious that we have to do it even if we did WAL-log the copied
+        * pages. The reason is that since we're copying outside shared buffers, a
+        * CHECKPOINT occurring during the copy has no way to flush the previously
+        * written data to disk (indeed it won't know the new rel even exists).  A
+        * crash later on would replay WAL from the checkpoint, therefore it
+        * wouldn't replay our earlier WAL entries. If we do not fsync those pages
+        * here, they might still not be on disk when the crash occurs.
+        */
+       if (relpersistence == RELPERSISTENCE_PERMANENT || copying_initfork)
+               smgrimmedsync(dst, forkNum);
+}
+
  /*
   *     smgrDoPendingDeletes() -- Take care of relation deletes at end of xact.
   *
diff --git a/src/backend/commands/cluster.c b/src/backend/commands/cluster.c

index 205070b83d228efff11b42e73791414a24d3c5cf..4f4be1efbfc322ac5e1460be03ac75d7fe7c1db2 100644 (file)
--- a/src/backend/commands/cluster.c
+++ b/src/backend/commands/cluster.c
@@ -21,7 +21,6 @@
  #include "access/heapam.h"
  #include "access/multixact.h"
  #include "access/relscan.h"
-#include "access/rewriteheap.h"
  #include "access/tableam.h"
  #include "access/transam.h"
  #include "access/tuptoaster.h"
@@ -45,7 +44,6 @@
  #include "storage/bufmgr.h"
  #include "storage/lmgr.h"
  #include "storage/predicate.h"
-#include "storage/smgr.h"
  #include "utils/acl.h"
  #include "utils/fmgroids.h"
  #include "utils/inval.h"
@@ -71,14 +69,10 @@ typedef struct
  
  
  static void rebuild_relation(Relation OldHeap, Oid indexOid, bool verbose);
-static void copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex,
+static void copy_table_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex,
                            bool verbose, bool *pSwapToastByContent,
                            TransactionId *pFreezeXid, MultiXactId *pCutoffMulti);
  static List *get_tables_to_cluster(MemoryContext cluster_context);
-static void reform_and_rewrite_tuple(HeapTuple tuple,
-                                                TupleDesc oldTupDesc, TupleDesc newTupDesc,
-                                                Datum *values, bool *isnull,
-                                                RewriteState rwstate);
  
  
  /*---------------------------------------------------------------------------
@@ -619,7 +613,7 @@ rebuild_relation(Relation OldHeap, Oid indexOid, bool verbose)
                                                            AccessExclusiveLock);
  
         /* Copy the heap data into the new table in the desired order */
-       copy_heap_data(OIDNewHeap, tableOid, indexOid, verbose,
+       copy_table_data(OIDNewHeap, tableOid, indexOid, verbose,
                                    &swap_toast_by_content, &frozenXid, &cutoffMulti);
  
         /*
@@ -762,7 +756,7 @@ make_new_heap(Oid OIDOldHeap, Oid NewTableSpace, char relpersistence,
  }
  
  /*
- * Do the physical copying of heap data.
+ * Do the physical copying of table data.
   *
   * There are three output parameters:
   * *pSwapToastByContent is set true if toast tables must be swapped by content.
@@ -770,9 +764,9 @@ make_new_heap(Oid OIDOldHeap, Oid NewTableSpace, char relpersistence,
   * *pCutoffMulti receives the MultiXactId used as a cutoff point.
   */
  static void
-copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, bool verbose,
-                          bool *pSwapToastByContent, TransactionId *pFreezeXid,
-                          MultiXactId *pCutoffMulti)
+copy_table_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, bool verbose,
+                               bool *pSwapToastByContent, TransactionId *pFreezeXid,
+                               MultiXactId *pCutoffMulti)
  {
         Relation        NewHeap,
                                 OldHeap,
@@ -780,30 +774,18 @@ copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, bool verbose,
         Relation        relRelation;
         HeapTuple       reltup;
         Form_pg_class relform;
-       TupleDesc       oldTupDesc;
-       TupleDesc       newTupDesc;
-       int                     natts;
-       Datum      *values;
-       bool       *isnull;
-       IndexScanDesc indexScan;
-       TableScanDesc tableScan;
-       HeapScanDesc heapScan;
-       bool            use_wal;
-       bool            is_system_catalog;
+       TupleDesc       oldTupDesc PG_USED_FOR_ASSERTS_ONLY;
+       TupleDesc       newTupDesc PG_USED_FOR_ASSERTS_ONLY;
         TransactionId OldestXmin;
         TransactionId FreezeXid;
         MultiXactId MultiXactCutoff;
-       RewriteState rwstate;
         bool            use_sort;
-       Tuplesortstate *tuplesort;
         double          num_tuples = 0,
                                 tups_vacuumed = 0,
                                 tups_recently_dead = 0;
         BlockNumber num_pages;
         int                     elevel = verbose ? INFO : DEBUG2;
         PGRUsage        ru0;
-       TupleTableSlot *slot;
-       BufferHeapTupleTableSlot *hslot;
  
         pg_rusage_init(&ru0);
  
@@ -825,11 +807,6 @@ copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, bool verbose,
         newTupDesc = RelationGetDescr(NewHeap);
         Assert(newTupDesc->natts == oldTupDesc->natts);
  
-       /* Preallocate values/isnull arrays */
-       natts = newTupDesc->natts;
-       values = (Datum *) palloc(natts * sizeof(Datum));
-       isnull = (bool *) palloc(natts * sizeof(bool));
-
         /*
          * If the OldHeap has a toast table, get lock on the toast table to keep
          * it from being vacuumed.  This is needed because autovacuum processes
@@ -846,15 +823,6 @@ copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, bool verbose,
         if (OldHeap->rd_rel->reltoastrelid)
                 LockRelationOid(OldHeap->rd_rel->reltoastrelid, AccessExclusiveLock);
  
-       /*
-        * We need to log the copied data in WAL iff WAL archiving/streaming is
-        * enabled AND it's a WAL-logged rel.
-        */
-       use_wal = XLogIsNeeded() && RelationNeedsWAL(NewHeap);
-
-       /* use_wal off requires smgr_targblock be initially invalid */
-       Assert(RelationGetTargetBlock(NewHeap) == InvalidBlockNumber);
-
         /*
          * If both tables have TOAST tables, perform toast swap by content.  It is
          * possible that the old table has a toast table but the new one doesn't,
@@ -915,13 +883,6 @@ copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, bool verbose,
         *pFreezeXid = FreezeXid;
         *pCutoffMulti = MultiXactCutoff;
  
-       /* Remember if it's a system catalog */
-       is_system_catalog = IsSystemRelation(OldHeap);
-
-       /* Initialize the rewrite operation */
-       rwstate = begin_heap_rewrite(OldHeap, NewHeap, OldestXmin, FreezeXid,
-                                                                MultiXactCutoff, use_wal);
-
         /*
          * Decide whether to use an indexscan or seqscan-and-optional-sort to scan
          * the OldHeap.  We know how to use a sort to duplicate the ordering of a
@@ -934,63 +895,14 @@ copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, bool verbose,
         else
                 use_sort = false;
  
-       /* Set up sorting if wanted */
-       if (use_sort)
-               tuplesort = tuplesort_begin_cluster(oldTupDesc, OldIndex,
-                                                                                       maintenance_work_mem,
-                                                                                       NULL, false);
-       else
-               tuplesort = NULL;
-
-       /*
-        * Prepare to scan the OldHeap.  To ensure we see recently-dead tuples
-        * that still need to be copied, we scan with SnapshotAny and use
-        * HeapTupleSatisfiesVacuum for the visibility test.
-        */
-       if (OldIndex != NULL && !use_sort)
-       {
-               const int   ci_index[] = {
-                       PROGRESS_CLUSTER_PHASE,
-                       PROGRESS_CLUSTER_INDEX_RELID
-               };
-               int64       ci_val[2];
-
-               /* Set phase and OIDOldIndex to columns */
-               ci_val[0] = PROGRESS_CLUSTER_PHASE_INDEX_SCAN_HEAP;
-               ci_val[1] = OIDOldIndex;
-               pgstat_progress_update_multi_param(2, ci_index, ci_val);
-
-               tableScan = NULL;
-               heapScan = NULL;
-               indexScan = index_beginscan(OldHeap, OldIndex, SnapshotAny, 0, 0);
-               index_rescan(indexScan, NULL, 0, NULL, 0);
-       }
-       else
-       {
-               /* In scan-and-sort mode and also VACUUM FULL, set phase */
-               pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE,
-                                                                        PROGRESS_CLUSTER_PHASE_SEQ_SCAN_HEAP);
-
-               tableScan = table_beginscan(OldHeap, SnapshotAny, 0, (ScanKey) NULL);
-               heapScan = (HeapScanDesc) tableScan;
-               indexScan = NULL;
-
-               /* Set total heap blocks */
-               pgstat_progress_update_param(PROGRESS_CLUSTER_TOTAL_HEAP_BLKS,
-                                                                        heapScan->rs_nblocks);
-       }
-
-       slot = table_slot_create(OldHeap, NULL);
-       hslot = (BufferHeapTupleTableSlot *) slot;
-
         /* Log what we're doing */
-       if (indexScan != NULL)
+       if (OldIndex != NULL && !use_sort)
                 ereport(elevel,
                                 (errmsg("clustering \"%s.%s\" using index scan on \"%s\"",
                                                 get_namespace_name(RelationGetNamespace(OldHeap)),
                                                 RelationGetRelationName(OldHeap),
                                                 RelationGetRelationName(OldIndex))));
-       else if (tuplesort != NULL)
+       else if (use_sort)
                 ereport(elevel,
                                 (errmsg("clustering \"%s.%s\" using sequential scan and sort",
                                                 get_namespace_name(RelationGetNamespace(OldHeap)),
@@ -1002,188 +914,13 @@ copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, bool verbose,
                                                 RelationGetRelationName(OldHeap))));
  
         /*
-        * Scan through the OldHeap, either in OldIndex order or sequentially;
-        * copy each tuple into the NewHeap, or transiently to the tuplesort
-        * module.  Note that we don't bother sorting dead tuples (they won't get
-        * to the new table anyway).
+        * Hand off the actual copying to the AM-specific function; the generic
+        * code cannot know how to deal with visibility across AMs.
          */
-       for (;;)
-       {
-               HeapTuple       tuple;
-               Buffer          buf;
-               bool            isdead;
-
-               CHECK_FOR_INTERRUPTS();
-
-               if (indexScan != NULL)
-               {
-                       if (!index_getnext_slot(indexScan, ForwardScanDirection, slot))
-                               break;
-
-                       /* Since we used no scan keys, should never need to recheck */
-                       if (indexScan->xs_recheck)
-                               elog(ERROR, "CLUSTER does not support lossy index conditions");
-
-                       tuple = hslot->base.tuple;
-                       buf = hslot->buffer;
-               }
-               else
-               {
-                       tuple = heap_getnext(tableScan, ForwardScanDirection);
-                       if (tuple == NULL)
-                               break;
-
-                       buf = heapScan->rs_cbuf;
-
-                       /* In scan-and-sort mode and also VACUUM FULL, set heap blocks scanned */
-                       pgstat_progress_update_param(PROGRESS_CLUSTER_HEAP_BLKS_SCANNED,
-                                                                                heapScan->rs_cblock + 1);
-               }
-
-               LockBuffer(buf, BUFFER_LOCK_SHARE);
-
-               switch (HeapTupleSatisfiesVacuum(tuple, OldestXmin, buf))
-               {
-                       case HEAPTUPLE_DEAD:
-                               /* Definitely dead */
-                               isdead = true;
-                               break;
-                       case HEAPTUPLE_RECENTLY_DEAD:
-                               tups_recently_dead += 1;
-                               /* fall through */
-                       case HEAPTUPLE_LIVE:
-                               /* Live or recently dead, must copy it */
-                               isdead = false;
-                               break;
-                       case HEAPTUPLE_INSERT_IN_PROGRESS:
-
-                               /*
-                                * Since we hold exclusive lock on the relation, normally the
-                                * only way to see this is if it was inserted earlier in our
-                                * own transaction.  However, it can happen in system
-                                * catalogs, since we tend to release write lock before commit
-                                * there.  Give a warning if neither case applies; but in any
-                                * case we had better copy it.
-                                */
-                               if (!is_system_catalog &&
-                                       !TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmin(tuple->t_data)))
-                                       elog(WARNING, "concurrent insert in progress within table \"%s\"",
-                                                RelationGetRelationName(OldHeap));
-                               /* treat as live */
-                               isdead = false;
-                               break;
-                       case HEAPTUPLE_DELETE_IN_PROGRESS:
-
-                               /*
-                                * Similar situation to INSERT_IN_PROGRESS case.
-                                */
-                               if (!is_system_catalog &&
-                                       !TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetUpdateXid(tuple->t_data)))
-                                       elog(WARNING, "concurrent delete in progress within table \"%s\"",
-                                                RelationGetRelationName(OldHeap));
-                               /* treat as recently dead */
-                               tups_recently_dead += 1;
-                               isdead = false;
-                               break;
-                       default:
-                               elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
-                               isdead = false; /* keep compiler quiet */
-                               break;
-               }
-
-               LockBuffer(buf, BUFFER_LOCK_UNLOCK);
-
-               if (isdead)
-               {
-                       tups_vacuumed += 1;
-                       /* heap rewrite module still needs to see it... */
-                       if (rewrite_heap_dead_tuple(rwstate, tuple))
-                       {
-                               /* A previous recently-dead tuple is now known dead */
-                               tups_vacuumed += 1;
-                               tups_recently_dead -= 1;
-                       }
-                       continue;
-               }
-
-               num_tuples += 1;
-               if (tuplesort != NULL)
-               {
-                       tuplesort_putheaptuple(tuplesort, tuple);
-
-                       /* In scan-and-sort mode, report increase in number of tuples scanned */
-                       pgstat_progress_update_param(PROGRESS_CLUSTER_HEAP_TUPLES_SCANNED,
-                                                                                num_tuples);
-               }
-               else
-               {
-                       const int   ct_index[] = {
-                               PROGRESS_CLUSTER_HEAP_TUPLES_SCANNED,
-                               PROGRESS_CLUSTER_HEAP_TUPLES_WRITTEN
-                       };
-                       int64       ct_val[2];
-
-                       reform_and_rewrite_tuple(tuple,
-                                                                        oldTupDesc, newTupDesc,
-                                                                        values, isnull,
-                                                                        rwstate);
-
-                       /* In indexscan mode and also VACUUM FULL, report increase in number of tuples scanned and written */
-                       ct_val[0] = num_tuples;
-                       ct_val[1] = num_tuples;
-                       pgstat_progress_update_multi_param(2, ct_index, ct_val);
-               }
-       }
-
-       if (indexScan != NULL)
-               index_endscan(indexScan);
-       if (heapScan != NULL)
-               table_endscan(tableScan);
-       if (slot)
-               ExecDropSingleTupleTableSlot(slot);
-
-       /*
-        * In scan-and-sort mode, complete the sort, then read out all live tuples
-        * from the tuplestore and write them to the new relation.
-        */
-       if (tuplesort != NULL)
-       {
-               double n_tuples = 0;
-               /* Report that we are now sorting tuples */
-               pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE,
-                                                                        PROGRESS_CLUSTER_PHASE_SORT_TUPLES);
-
-               tuplesort_performsort(tuplesort);
-
-               /* Report that we are now writing new heap */
-               pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE,
-                                                                        PROGRESS_CLUSTER_PHASE_WRITE_NEW_HEAP);
-
-               for (;;)
-               {
-                       HeapTuple       tuple;
-
-                       CHECK_FOR_INTERRUPTS();
-
-                       tuple = tuplesort_getheaptuple(tuplesort, true);
-                       if (tuple == NULL)
-                               break;
-
-                       n_tuples += 1;
-                       reform_and_rewrite_tuple(tuple,
-                                                                        oldTupDesc, newTupDesc,
-                                                                        values, isnull,
-                                                                        rwstate);
-                       /* Report n_tuples */
-                       pgstat_progress_update_param(PROGRESS_CLUSTER_HEAP_TUPLES_WRITTEN,
-                                                                                n_tuples);
-               }
-
-               tuplesort_end(tuplesort);
-       }
-
-       /* Write out any remaining tuples, and fsync if needed */
-       end_heap_rewrite(rwstate);
+       table_relation_copy_for_cluster(OldHeap, NewHeap, OldIndex, use_sort,
+                                                                       OldestXmin, FreezeXid, MultiXactCutoff,
+                                                                       &num_tuples, &tups_vacuumed,
+                                                                       &tups_recently_dead);
  
         /* Reset rd_toastoid just to be tidy --- it shouldn't be looked at again */
         NewHeap->rd_toastoid = InvalidOid;
@@ -1201,10 +938,6 @@ copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, bool verbose,
                                            tups_recently_dead,
                                            pg_rusage_show(&ru0))));
  
-       /* Clean up */
-       pfree(values);
-       pfree(isnull);
-
         if (OldIndex != NULL)
                 index_close(OldIndex, NoLock);
         table_close(OldHeap, NoLock);
@@ -1839,46 +1572,3 @@ get_tables_to_cluster(MemoryContext cluster_context)
  
         return rvs;
  }
-
-
-/*
- * Reconstruct and rewrite the given tuple
- *
- * We cannot simply copy the tuple as-is, for several reasons:
- *
- * 1. We'd like to squeeze out the values of any dropped columns, both
- * to save space and to ensure we have no corner-case failures. (It's
- * possible for example that the new table hasn't got a TOAST table
- * and so is unable to store any large values of dropped cols.)
- *
- * 2. The tuple might not even be legal for the new table; this is
- * currently only known to happen as an after-effect of ALTER TABLE
- * SET WITHOUT OIDS (in an older version, via pg_upgrade).
- *
- * So, we must reconstruct the tuple from component Datums.
- */
-static void
-reform_and_rewrite_tuple(HeapTuple tuple,
-                                                TupleDesc oldTupDesc, TupleDesc newTupDesc,
-                                                Datum *values, bool *isnull,
-                                                RewriteState rwstate)
-{
-       HeapTuple       copiedTuple;
-       int                     i;
-
-       heap_deform_tuple(tuple, oldTupDesc, values, isnull);
-
-       /* Be sure to null out any dropped columns */
-       for (i = 0; i < newTupDesc->natts; i++)
-       {
-               if (TupleDescAttr(newTupDesc, i)->attisdropped)
-                       isnull[i] = true;
-       }
-
-       copiedTuple = heap_form_tuple(newTupDesc, values, isnull);
-
-       /* The heap rewrite module does the rest */
-       rewrite_heap_tuple(rwstate, tuple, copiedTuple);
-
-       heap_freetuple(copiedTuple);
-}
diff --git a/src/backend/commands/sequence.c b/src/backend/commands/sequence.c

index 574b46a28123947324aa6ea02e76aa25151f5bf4..e9add1b9873a25d071d61a5758740544090b019f 100644 (file)
--- a/src/backend/commands/sequence.c
+++ b/src/backend/commands/sequence.c
@@ -312,12 +312,17 @@ ResetSequence(Oid seq_relid)
         seq->log_cnt = 0;
  
         /*
-        * Create a new storage file for the sequence.  We want to keep the
-        * sequence's relfrozenxid at 0, since it won't contain any unfrozen XIDs.
-        * Same with relminmxid, since a sequence will never contain multixacts.
+        * Create a new storage file for the sequence.
          */
-       RelationSetNewRelfilenode(seq_rel, seq_rel->rd_rel->relpersistence,
-                                                         InvalidTransactionId, InvalidMultiXactId);
+       RelationSetNewRelfilenode(seq_rel, seq_rel->rd_rel->relpersistence);
+
+       /*
+        * Ensure sequence's relfrozenxid is at 0, since it won't contain any
+        * unfrozen XIDs.  Same with relminmxid, since a sequence will never
+        * contain multixacts.
+        */
+       Assert(seq_rel->rd_rel->relfrozenxid == InvalidTransactionId);
+       Assert(seq_rel->rd_rel->relminmxid == InvalidMultiXactId);
  
         /*
          * Insert the modified tuple into the new storage file.
@@ -482,12 +487,17 @@ AlterSequence(ParseState *pstate, AlterSeqStmt *stmt)
  
                 /*
                  * Create a new storage file for the sequence, making the state
-                * changes transactional.  We want to keep the sequence's relfrozenxid
-                * at 0, since it won't contain any unfrozen XIDs.  Same with
-                * relminmxid, since a sequence will never contain multixacts.
+                * changes transactional.
+                */
+               RelationSetNewRelfilenode(seqrel, seqrel->rd_rel->relpersistence);
+
+               /*
+                * Ensure sequence's relfrozenxid is at 0, since it won't contain any
+                * unfrozen XIDs.  Same with relminmxid, since a sequence will never
+                * contain multixacts.
                  */
-               RelationSetNewRelfilenode(seqrel, seqrel->rd_rel->relpersistence,
-                                                                 InvalidTransactionId, InvalidMultiXactId);
+               Assert(seqrel->rd_rel->relfrozenxid == InvalidTransactionId);
+               Assert(seqrel->rd_rel->relminmxid == InvalidMultiXactId);
  
                 /*
                  * Insert the modified tuple into the new storage file.
diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c

index 048c1196685b1026e156729707162fae73eb12fd..06e7caa9cff9db8a57caf9d42f73c2c70710a14a 100644 (file)
--- a/src/backend/commands/tablecmds.c
+++ b/src/backend/commands/tablecmds.c
@@ -20,6 +20,7 @@
  #include "access/multixact.h"
  #include "access/reloptions.h"
  #include "access/relscan.h"
+#include "access/tableam.h"
  #include "access/sysattr.h"
  #include "access/tableam.h"
  #include "access/tupconvert.h"
@@ -473,8 +474,7 @@ static void ATExecEnableRowSecurity(Relation rel);
  static void ATExecDisableRowSecurity(Relation rel);
  static void ATExecForceNoForceRowSecurity(Relation rel, bool force_rls);
  
-static void copy_relation_data(SMgrRelation rel, SMgrRelation dst,
-                                  ForkNumber forkNum, char relpersistence);
+static void index_copy_data(Relation rel, RelFileNode newrnode);
  static const char *storage_name(char c);
  
  static void RangeVarCallbackForDropRelation(const RangeVar *rel, Oid relOid,
@@ -1697,7 +1697,6 @@ ExecuteTruncateGuts(List *explicit_rels, List *relids, List *relids_logged,
                 {
                         Oid                     heap_relid;
                         Oid                     toast_relid;
-                       MultiXactId minmulti;
  
                         /*
                          * This effectively deletes all rows in the table, and may be done
@@ -1707,8 +1706,6 @@ ExecuteTruncateGuts(List *explicit_rels, List *relids, List *relids_logged,
                          */
                         CheckTableForSerializableConflictIn(rel);
  
-                       minmulti = GetOldestMultiXactId();
-
                         /*
                          * Need the full transaction-safe pushups.
                          *
@@ -1716,10 +1713,7 @@ ExecuteTruncateGuts(List *explicit_rels, List *relids, List *relids_logged,
                          * as the relfilenode value. The old storage file is scheduled for
                          * deletion at commit.
                          */
-                       RelationSetNewRelfilenode(rel, rel->rd_rel->relpersistence,
-                                                                         RecentXmin, minmulti);
-                       if (rel->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED)
-                               heap_create_init_fork(rel);
+                       RelationSetNewRelfilenode(rel, rel->rd_rel->relpersistence);
  
                         heap_relid = RelationGetRelid(rel);
  
@@ -1731,12 +1725,8 @@ ExecuteTruncateGuts(List *explicit_rels, List *relids, List *relids_logged,
                         {
                                 Relation        toastrel = relation_open(toast_relid,
                                                                                                          AccessExclusiveLock);
-
                                 RelationSetNewRelfilenode(toastrel,
-                                                                                 toastrel->rd_rel->relpersistence,
-                                                                                 RecentXmin, minmulti);
-                               if (toastrel->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED)
-                                       heap_create_init_fork(toastrel);
+                                                                                 toastrel->rd_rel->relpersistence);
                                 table_close(toastrel, NoLock);
                         }
  
@@ -4928,13 +4918,7 @@ ATRewriteTable(AlteredTableInfo *tab, Oid OIDNewHeap, LOCKMODE lockmode)
  
                         /* Write the tuple out to the new relation */
                         if (newrel)
-                       {
-                               HeapTuple       tuple;
-
-                               tuple = ExecFetchSlotHeapTuple(newslot, true, NULL);
-                               heap_insert(newrel, tuple, mycid, hi_options, bistate);
-                               ItemPointerCopy(&tuple->t_self, &newslot->tts_tid);
-                       }
+                               table_insert(newrel, insertslot, mycid, hi_options, bistate);
  
                         ResetExprContext(econtext);
  
@@ -11492,11 +11476,9 @@ ATExecSetTableSpace(Oid tableOid, Oid newTableSpace, LOCKMODE lockmode)
         Oid                     reltoastrelid;
         Oid                     newrelfilenode;
         RelFileNode newrnode;
-       SMgrRelation dstrel;
         Relation        pg_class;
         HeapTuple       tuple;
         Form_pg_class rd_rel;
-       ForkNumber      forkNum;
         List       *reltoastidxids = NIL;
         ListCell   *lc;
  
@@ -11581,46 +11563,19 @@ ATExecSetTableSpace(Oid tableOid, Oid newTableSpace, LOCKMODE lockmode)
         newrnode = rel->rd_node;
         newrnode.relNode = newrelfilenode;
         newrnode.spcNode = newTableSpace;
-       dstrel = smgropen(newrnode, rel->rd_backend);
-
-       RelationOpenSmgr(rel);
-
-       /*
-        * Create and copy all forks of the relation, and schedule unlinking of
-        * old physical files.
-        *
-        * NOTE: any conflict in relfilenode value will be caught in
-        * RelationCreateStorage().
-        */
-       RelationCreateStorage(newrnode, rel->rd_rel->relpersistence);
-
-       /* copy main fork */
-       copy_relation_data(rel->rd_smgr, dstrel, MAIN_FORKNUM,
-                                          rel->rd_rel->relpersistence);
  
-       /* copy those extra forks that exist */
-       for (forkNum = MAIN_FORKNUM + 1; forkNum <= MAX_FORKNUM; forkNum++)
+       /* hand off to AM to actually create the new filenode and copy the data */
+       if (rel->rd_rel->relkind == RELKIND_INDEX)
         {
-               if (smgrexists(rel->rd_smgr, forkNum))
-               {
-                       smgrcreate(dstrel, forkNum, false);
-
-                       /*
-                        * WAL log creation if the relation is persistent, or this is the
-                        * init fork of an unlogged relation.
-                        */
-                       if (rel->rd_rel->relpersistence == RELPERSISTENCE_PERMANENT ||
-                               (rel->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED &&
-                                forkNum == INIT_FORKNUM))
-                               log_smgrcreate(&newrnode, forkNum);
-                       copy_relation_data(rel->rd_smgr, dstrel, forkNum,
-                                                          rel->rd_rel->relpersistence);
-               }
+               index_copy_data(rel, newrnode);
+       }
+       else
+       {
+               Assert(rel->rd_rel->relkind == RELKIND_RELATION ||
+                          rel->rd_rel->relkind == RELKIND_MATVIEW ||
+                          rel->rd_rel->relkind == RELKIND_TOASTVALUE);
+               table_relation_copy_data(rel, newrnode);
         }
-
-       /* drop old relation, and close new one */
-       RelationDropStorage(rel);
-       smgrclose(dstrel);
  
         /* update the pg_class row */
         rd_rel->reltablespace = (newTableSpace == MyDatabaseTableSpace) ? InvalidOid : newTableSpace;
@@ -11882,90 +11837,51 @@ AlterTableMoveAll(AlterTableMoveAllStmt *stmt)
         return new_tablespaceoid;
  }
  
-/*
- * Copy data, block by block
- */
  static void
-copy_relation_data(SMgrRelation src, SMgrRelation dst,
-                                  ForkNumber forkNum, char relpersistence)
+index_copy_data(Relation rel, RelFileNode newrnode)
  {
-       PGAlignedBlock buf;
-       Page            page;
-       bool            use_wal;
-       bool            copying_initfork;
-       BlockNumber nblocks;
-       BlockNumber blkno;
-
-       page = (Page) buf.data;
+       SMgrRelation dstrel;
  
-       /*
-        * The init fork for an unlogged relation in many respects has to be
-        * treated the same as normal relation, changes need to be WAL logged and
-        * it needs to be synced to disk.
-        */
-       copying_initfork = relpersistence == RELPERSISTENCE_UNLOGGED &&
-               forkNum == INIT_FORKNUM;
+       dstrel = smgropen(newrnode, rel->rd_backend);
+       RelationOpenSmgr(rel);
  
         /*
-        * We need to log the copied data in WAL iff WAL archiving/streaming is
-        * enabled AND it's a permanent relation.
+        * Create and copy all forks of the relation, and schedule unlinking of
+        * old physical files.
+        *
+        * NOTE: any conflict in relfilenode value will be caught in
+        * RelationCreateStorage().
          */
-       use_wal = XLogIsNeeded() &&
-               (relpersistence == RELPERSISTENCE_PERMANENT || copying_initfork);
+       RelationCreateStorage(newrnode, rel->rd_rel->relpersistence);
  
-       nblocks = smgrnblocks(src, forkNum);
+       /* copy main fork */
+       RelationCopyStorage(rel->rd_smgr, dstrel, MAIN_FORKNUM,
+                                               rel->rd_rel->relpersistence);
  
-       for (blkno = 0; blkno < nblocks; blkno++)
+       /* copy those extra forks that exist */
+       for (ForkNumber forkNum = MAIN_FORKNUM + 1;
+                forkNum <= MAX_FORKNUM; forkNum++)
         {
-               /* If we got a cancel signal during the copy of the data, quit */
-               CHECK_FOR_INTERRUPTS();
-
-               smgrread(src, forkNum, blkno, buf.data);
-
-               if (!PageIsVerified(page, blkno))
-                       ereport(ERROR,
-                                       (errcode(ERRCODE_DATA_CORRUPTED),
-                                        errmsg("invalid page in block %u of relation %s",
-                                                       blkno,
-                                                       relpathbackend(src->smgr_rnode.node,
-                                                                                  src->smgr_rnode.backend,
-                                                                                  forkNum))));
-
-               /*
-                * WAL-log the copied page. Unfortunately we don't know what kind of a
-                * page this is, so we have to log the full page including any unused
-                * space.
-                */
-               if (use_wal)
-                       log_newpage(&dst->smgr_rnode.node, forkNum, blkno, page, false);
-
-               PageSetChecksumInplace(page, blkno);
+               if (smgrexists(rel->rd_smgr, forkNum))
+               {
+                       smgrcreate(dstrel, forkNum, false);
  
-               /*
-                * Now write the page.  We say isTemp = true even if it's not a temp
-                * rel, because there's no need for smgr to schedule an fsync for this
-                * write; we'll do it ourselves below.
-                */
-               smgrextend(dst, forkNum, blkno, buf.data, true);
+                       /*
+                        * WAL log creation if the relation is persistent, or this is the
+                        * init fork of an unlogged relation.
+                        */
+                       if (rel->rd_rel->relpersistence == RELPERSISTENCE_PERMANENT ||
+                               (rel->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED &&
+                                forkNum == INIT_FORKNUM))
+                               log_smgrcreate(&newrnode, forkNum);
+                       RelationCopyStorage(rel->rd_smgr, dstrel, forkNum,
+                                                               rel->rd_rel->relpersistence);
+               }
         }
  
-       /*
-        * If the rel is WAL-logged, must fsync before commit.  We use heap_sync
-        * to ensure that the toast table gets fsync'd too.  (For a temp or
-        * unlogged rel we don't care since the data will be gone after a crash
-        * anyway.)
-        *
-        * It's obvious that we must do this when not WAL-logging the copy. It's
-        * less obvious that we have to do it even if we did WAL-log the copied
-        * pages. The reason is that since we're copying outside shared buffers, a
-        * CHECKPOINT occurring during the copy has no way to flush the previously
-        * written data to disk (indeed it won't know the new rel even exists).  A
-        * crash later on would replay WAL from the checkpoint, therefore it
-        * wouldn't replay our earlier WAL entries. If we do not fsync those pages
-        * here, they might still not be on disk when the crash occurs.
-        */
-       if (relpersistence == RELPERSISTENCE_PERMANENT || copying_initfork)
-               smgrimmedsync(dst, forkNum);
+       /* drop old relation, and close new one */
+       RelationDropStorage(rel);
+       smgrclose(dstrel);
  }
  
  /*
diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c

index 84609e07253607634639b9003ae1b35fc62afb71..12f813f0bc5f372c9b4b91fe46db0659a522c07a 100644 (file)
--- a/src/backend/utils/cache/relcache.c
+++ b/src/backend/utils/cache/relcache.c
@@ -3422,31 +3422,16 @@ RelationBuildLocalRelation(const char *relname,
   * such as TRUNCATE or rebuilding an index from scratch.
   *
   * Caller must already hold exclusive lock on the relation.
- *
- * The relation is marked with relfrozenxid = freezeXid (InvalidTransactionId
- * must be passed for indexes and sequences).  This should be a lower bound on
- * the XIDs that will be put into the new relation contents.
- *
- * The new filenode's persistence is set to the given value.  This is useful
- * for the cases that are changing the relation's persistence; other callers
- * need to pass the original relpersistence value.
   */
  void
-RelationSetNewRelfilenode(Relation relation, char persistence,
-                                                 TransactionId freezeXid, MultiXactId minmulti)
+RelationSetNewRelfilenode(Relation relation, char persistence)
  {
         Oid                     newrelfilenode;
-       RelFileNodeBackend newrnode;
         Relation        pg_class;
         HeapTuple       tuple;
         Form_pg_class classform;
-
-       /* Indexes, sequences must have Invalid frozenxid; other rels must not */
-       Assert((relation->rd_rel->relkind == RELKIND_INDEX ||
-                       relation->rd_rel->relkind == RELKIND_SEQUENCE) ?
-                  freezeXid == InvalidTransactionId :
-                  TransactionIdIsNormal(freezeXid));
-       Assert(TransactionIdIsNormal(freezeXid) == MultiXactIdIsValid(minmulti));
+       MultiXactId minmulti = InvalidMultiXactId;
+       TransactionId freezeXid = InvalidTransactionId;
  
         /* Allocate a new relfilenode */
         newrelfilenode = GetNewRelFileNode(relation->rd_rel->reltablespace, NULL,
@@ -3464,18 +3449,6 @@ RelationSetNewRelfilenode(Relation relation, char persistence,
                          RelationGetRelid(relation));
         classform = (Form_pg_class) GETSTRUCT(tuple);
  
-       /*
-        * Create storage for the main fork of the new relfilenode.
-        *
-        * NOTE: any conflict in relfilenode value will be caught here, if
-        * GetNewRelFileNode messes up for any reason.
-        */
-       newrnode.node = relation->rd_node;
-       newrnode.node.relNode = newrelfilenode;
-       newrnode.backend = relation->rd_backend;
-       RelationCreateStorage(newrnode.node, persistence);
-       smgrclosenode(newrnode);
-
         /*
          * Schedule unlinking of the old storage at transaction commit.
          */
@@ -3490,9 +3463,51 @@ RelationSetNewRelfilenode(Relation relation, char persistence,
                 RelationMapUpdateMap(RelationGetRelid(relation),
                                                          newrelfilenode,
                                                          relation->rd_rel->relisshared,
-                                                        false);
+                                                        true);
         else
+       {
+               relation->rd_rel->relfilenode = newrelfilenode;
                 classform->relfilenode = newrelfilenode;
+       }
+
+       RelationInitPhysicalAddr(relation);
+
+       /*
+        * Create storage for the main fork of the new relfilenode. If it's a
+        * table-like object, call into the table AM to do so, which'll also
+        * create the table's init fork.
+        *
+        * NOTE: any conflict in relfilenode value will be caught here, if
+        * GetNewRelFileNode messes up for any reason.
+        */
+
+       /*
+        * Create storage for relation.
+        */
+       switch (relation->rd_rel->relkind)
+       {
+               /* shouldn't be called for these */
+               case RELKIND_VIEW:
+               case RELKIND_COMPOSITE_TYPE:
+               case RELKIND_FOREIGN_TABLE:
+               case RELKIND_PARTITIONED_TABLE:
+               case RELKIND_PARTITIONED_INDEX:
+                       elog(ERROR, "should not have storage");
+                       break;
+
+               case RELKIND_INDEX:
+               case RELKIND_SEQUENCE:
+                       RelationCreateStorage(relation->rd_node, persistence);
+                       RelationOpenSmgr(relation);
+                       break;
+
+               case RELKIND_RELATION:
+               case RELKIND_TOASTVALUE:
+               case RELKIND_MATVIEW:
+                       table_relation_set_new_filenode(relation, persistence,
+                                                                                       &freezeXid, &minmulti);
+                       break;
+       }
  
         /* These changes are safe even for a mapped relation */
         if (relation->rd_rel->relkind != RELKIND_SEQUENCE)
diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h

index 85398e641ee31b4921580f48d1351fc0a8d1a231..7101d46c02c6fa712e7c6b4d8a54f2cec9ae3de1 100644 (file)
--- a/src/include/access/tableam.h
+++ b/src/include/access/tableam.h
@@ -378,6 +378,46 @@ typedef struct TableAmRoutine
          * ------------------------------------------------------------------------
          */
  
+       /*
+        * This callback needs to create a new relation filenode for `rel`, with
+        * appropriate durability behaviour for `persistence`.
+        *
+        * On output *freezeXid, *minmulti must be set to the values appropriate
+        * for pg_class.{relfrozenxid, relminmxid}. For AMs that don't need those
+        * fields to be filled they can be set to InvalidTransactionId and
+        * InvalidMultiXactId, respectively.
+        *
+        * See also table_relation_set_new_filenode().
+        */
+       void            (*relation_set_new_filenode) (Relation rel,
+                                                                                         char persistence,
+                                                                                         TransactionId *freezeXid,
+                                                                                         MultiXactId *minmulti);
+
+       /*
+        * This callback needs to remove all contents from `rel`'s current
+        * relfilenode. No provisions for transactional behaviour need to be
+        * made. Often this can be implemented by truncating the underlying
+        * storage to its minimal size.
+        *
+        * See also table_relation_nontransactional_truncate().
+        */
+       void            (*relation_nontransactional_truncate) (Relation rel);
+
+       /*
+        * See table_relation_copy_data().
+        *
+        * This can typically be implemented by directly copying the underlying
+        * storage, unless it contains references to the tablespace internally.
+        */
+       void            (*relation_copy_data) (Relation rel, RelFileNode newrnode);
+
+       /* See table_relation_copy_for_cluster() */
+       void            (*relation_copy_for_cluster) (Relation NewHeap, Relation OldHeap, Relation OldIndex,
+                                                                                         bool use_sort,
+                                                                                         TransactionId OldestXmin, TransactionId FreezeXid, MultiXactId MultiXactCutoff,
+                                                                                         double *num_tuples, double *tups_vacuumed, double *tups_recently_dead);
+
         /* see table_index_build_range_scan for reference about parameters */
         double          (*index_build_range_scan) (Relation heap_rel,
                                                                                    Relation index_rel,
@@ -961,6 +1001,83 @@ table_lock_tuple(Relation rel, ItemPointer tid, Snapshot snapshot,
   * ------------------------------------------------------------------------
   */
  
+/*
+ * Create a new relation filenode for `rel`, with persistence set to
+ * `persistence`.
+ *
+ * This is used both during relation creation and various DDL operations to
+ * create a new relfilenode that can be filled from scratch.
+ *
+ * *freezeXid, *minmulti are set to the xid / multixact horizon for the table
+ * that pg_class.{relfrozenxid, relminmxid} have to be set to.
+ */
+static inline void
+table_relation_set_new_filenode(Relation rel, char persistence,
+                                                               TransactionId *freezeXid,
+                                                               MultiXactId *minmulti)
+{
+       rel->rd_tableam->relation_set_new_filenode(rel, persistence,
+                                                                                          freezeXid, minmulti);
+}
+
+/*
+ * Remove all table contents from `rel`, in a non-transactional manner.
+ * Non-transactional meaning that there's no need to support rollbacks. This
+ * is commonly only used to perform truncations for relfilenodes created in
+ * the current transaction.
+ */
+static inline void
+table_relation_nontransactional_truncate(Relation rel)
+{
+       rel->rd_tableam->relation_nontransactional_truncate(rel);
+}
+
+/*
+ * Copy data from `rel` into the new relfilenode `newrnode`. The new
+ * relfilenode must not yet have storage associated when this function is
+ * called. This is only supposed to be used for low level operations like
+ * changing a relation's tablespace.
+ */
+static inline void
+table_relation_copy_data(Relation rel, RelFileNode newrnode)
+{
+       rel->rd_tableam->relation_copy_data(rel, newrnode);
+}
+
+/*
+ * Copy data from `OldHeap` into `NewHeap`, as part of a CLUSTER or VACUUM
+ * FULL.
+ *
+ * If `use_sort` is true, the table contents are sorted appropriately for
+ * `OldIndex`; if use_sort is false and OldIndex is not InvalidOid, the data
+ * is copied in that index's order; if use_sort is false and OldIndex is
+ * InvalidOid, no sorting is performed.
+ *
+ * OldestXmin, FreezeXid, MultiXactCutoff must be currently valid values for
+ * the table.
+ *
+ * *num_tuples, *tups_vacuumed, *tups_recently_dead will contain statistics
+ * computed while copying for the relation. Not all might make sense for every
+ * AM.
+ */
+static inline void
+table_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap,
+                                                               Relation OldIndex,
+                                                               bool use_sort,
+                                                               TransactionId OldestXmin,
+                                                               TransactionId FreezeXid,
+                                                               MultiXactId MultiXactCutoff,
+                                                               double *num_tuples,
+                                                               double *tups_vacuumed,
+                                                               double *tups_recently_dead)
+{
+       OldHeap->rd_tableam->relation_copy_for_cluster(OldHeap, NewHeap, OldIndex,
+                                                                                                  use_sort, OldestXmin,
+                                                                                                  FreezeXid, MultiXactCutoff,
+                                                                                                  num_tuples, tups_vacuumed,
+                                                                                                  tups_recently_dead);
+}
+
  /*
   * table_index_build_range_scan - scan the table to find tuples to be indexed
   *
diff --git a/src/include/catalog/heap.h b/src/include/catalog/heap.h

index 85076d0743723c9c6ae9fe3ddcaa2738e75c6bbd..f58d74edca19d59fc83908d7a4b5e40db490c7d9 100644 (file)
--- a/src/include/catalog/heap.h
+++ b/src/include/catalog/heap.h
@@ -55,7 +55,9 @@ extern Relation heap_create(const char *relname,
                         char relpersistence,
                         bool shared_relation,
                         bool mapped_relation,
-                       bool allow_system_table_mods);
+                       bool allow_system_table_mods,
+                       TransactionId *relfrozenxid,
+                       MultiXactId *relminmxid);
  
  extern Oid heap_create_with_catalog(const char *relname,
                                                  Oid relnamespace,
@@ -79,8 +81,6 @@ extern Oid heap_create_with_catalog(const char *relname,
                                                  Oid relrewrite,
                                                  ObjectAddress *typaddress);
  
-extern void heap_create_init_fork(Relation rel);
-
  extern void heap_drop_with_catalog(Oid relid);
  
  extern void heap_truncate(List *relids);
diff --git a/src/include/catalog/storage.h b/src/include/catalog/storage.h

index 9f638be9249bef5310f2259d5b47215a32356bbb..882dc65c893abffd514f5cb20c65591a43dc2bcb 100644 (file)
--- a/src/include/catalog/storage.h
+++ b/src/include/catalog/storage.h
@@ -16,12 +16,15 @@
  
  #include "storage/block.h"
  #include "storage/relfilenode.h"
+#include "storage/smgr.h"
  #include "utils/relcache.h"
  
  extern void RelationCreateStorage(RelFileNode rnode, char relpersistence);
  extern void RelationDropStorage(Relation rel);
  extern void RelationPreserveStorage(RelFileNode rnode, bool atCommit);
  extern void RelationTruncate(Relation rel, BlockNumber nblocks);
+extern void RelationCopyStorage(SMgrRelation src, SMgrRelation dst,
+                                                               ForkNumber forkNum, char relpersistence);
  
  /*
   * These functions used to be in storage/smgr/smgr.c, which explains the
diff --git a/src/include/utils/relcache.h b/src/include/utils/relcache.h

index 8f5bd67649813f7e6952998c588169bc62baa284..809d6aa12363ecf3beee407101844fbee92b7192 100644 (file)
--- a/src/include/utils/relcache.h
+++ b/src/include/utils/relcache.h
@@ -110,8 +110,7 @@ extern Relation RelationBuildLocalRelation(const char *relname,
  /*
   * Routine to manage assignment of new relfilenode to a relation
   */
-extern void RelationSetNewRelfilenode(Relation relation, char persistence,
-                                                 TransactionId freezeXid, MultiXactId minmulti);
+extern void RelationSetNewRelfilenode(Relation relation, char persistence);
  
  /*
   * Routines for flushing/rebuilding relcache entries in various scenarios
author	Andres Freund <andres@anarazel.de>
	Fri, 29 Mar 2019 03:01:14 +0000 (20:01 -0700)
committer	Andres Freund <andres@anarazel.de>
	Fri, 29 Mar 2019 03:01:43 +0000 (20:01 -0700)
src/backend/access/heap/heapam_handler.c		patch \| blob \| history
src/backend/bootstrap/bootparse.y		patch \| blob \| history
src/backend/catalog/heap.c		patch \| blob \| history
src/backend/catalog/index.c		patch \| blob \| history
src/backend/catalog/storage.c		patch \| blob \| history
src/backend/commands/cluster.c		patch \| blob \| history
src/backend/commands/sequence.c		patch \| blob \| history
src/backend/commands/tablecmds.c		patch \| blob \| history
src/backend/utils/cache/relcache.c		patch \| blob \| history
src/include/access/tableam.h		patch \| blob \| history
src/include/catalog/heap.h		patch \| blob \| history
src/include/catalog/storage.h		patch \| blob \| history
src/include/utils/relcache.h		patch \| blob \| history