Teach CLUSTER to skip writing WAL if not needed (i.e., not using archiving)

author Tom Lane <tgl@sss.pgh.pa.us>

Thu, 29 Mar 2007 00:15:39 +0000 (00:15 +0000)

committer Tom Lane <tgl@sss.pgh.pa.us>

Thu, 29 Mar 2007 00:15:39 +0000 (00:15 +0000)
author Tom Lane <tgl@sss.pgh.pa.us>
Thu, 29 Mar 2007 00:15:39 +0000 (00:15 +0000)
committer Tom Lane <tgl@sss.pgh.pa.us>
Thu, 29 Mar 2007 00:15:39 +0000 (00:15 +0000)
diff --git a/doc/src/sgml/perform.sgml b/doc/src/sgml/perform.sgml

index 5c2a9599455d82ee8b64186ec3897b8e279566b7..dcd0d1d2d330b3f8552541f156fba34d261f9d96 100644 (file)
--- a/doc/src/sgml/perform.sgml
+++ b/doc/src/sgml/perform.sgml
@@ -1,4 +1,4 @@
-<!-- $PostgreSQL: pgsql/doc/src/sgml/perform.sgml,v 1.63 2007/02/01 19:10:24 momjian Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/perform.sgml,v 1.64 2007/03/29 00:15:36 tgl Exp $ -->
  
   <chapter id="performance-tips">
    <title>Performance Tips</title>
@@ -801,7 +801,7 @@ SELECT * FROM x, y, a, b, c WHERE something AND somethingelse;
      <command>EXECUTE</command> as many times as required.  This avoids
      some of the overhead of repeatedly parsing and planning
      <command>INSERT</command>. Different interfaces provide this facility
-    in different ways; look for Prepared Statements in the interface
+    in different ways; look for <quote>prepared statements</> in the interface
      documentation.
     </para>
  
@@ -815,14 +815,12 @@ SELECT * FROM x, y, a, b, c WHERE something AND somethingelse;
     <para>
      <command>COPY</command> is fastest when used within the same
      transaction as an earlier <command>CREATE TABLE</command> or
-    <command>TRUNCATE</command> command. In those cases, no WAL
-    needs to be written because in case of an error, the files
-    containing the newly loaded data will be removed automatically.
-    <command>CREATE TABLE AS SELECT</command> is also optimized 
-    to avoid writing WAL. <command>COPY</command> and
-    <command>CREATE TABLE AS SELECT</command> will write WAL 
-    when <xref linkend="guc-archive-command"> is set and will not
-    therefore be optimized in that case.
+    <command>TRUNCATE</command> command. In such cases no WAL
+    needs to be written, because in case of an error, the files
+    containing the newly loaded data will be removed anyway.
+    However, this consideration does not apply when
+    <xref linkend="guc-archive-command"> is set, as all commands
+    must write WAL in that case.
     </para>
  
    </sect2>
@@ -897,23 +895,51 @@ SELECT * FROM x, y, a, b, c WHERE something AND somethingelse;
     <title>Turn off <varname>archive_command</varname></title>
  
     <para>
-    When loading large amounts of data you might want to unset the
-    <xref linkend="guc-archive-command"> before loading. It might be
-    faster to take a new base backup once the load has completed
-    than to allow a large archive to accumulate.
+    When loading large amounts of data into an installation that uses
+    WAL archiving, you might want to disable archiving (unset the
+    <xref linkend="guc-archive-command"> configuration variable)
+    while loading.  It might be
+    faster to take a new base backup after the load has completed
+    than to process a large amount of incremental WAL data.
     </para>
  
     <para>
-    This is particularly important advice because certain commands 
-    will perform more slowly when <varname>archive_command</varname>
-    is set, as a result of their needing to write large amounts of WAL.
+    Aside from avoiding the time for the archiver to process the WAL data,
+    doing this will actually make certain commands faster, because they
+    are designed not to write WAL at all if <varname>archive_command</varname>
+    is unset.  (They can guarantee crash safety more cheaply by doing an
+    <function>fsync</> at the end than by writing WAL.)
      This applies to the following commands: 
-    <command>CREATE TABLE AS SELECT</command>, 
-    <command>CREATE INDEX</command> and also <command>COPY</command>, when
-    it is executed in the same transaction as a prior 
-    <command>CREATE TABLE</command> or <command>TRUNCATE</command> command.
+    <itemizedlist>
+     <listitem>
+      <para>
+       <command>CREATE TABLE AS SELECT</command>
+      </para>
+     </listitem>
+     <listitem>
+      <para>
+       <command>CREATE INDEX</command> (and variants such as
+       <command>ALTER TABLE ADD PRIMARY KEY</command>)
+      </para>
+     </listitem>
+     <listitem>
+      <para>
+       <command>ALTER TABLE SET TABLESPACE</command>
+      </para>
+     </listitem>
+     <listitem>
+      <para>
+       <command>CLUSTER</command>
+      </para>
+     </listitem>
+     <listitem>
+      <para>
+       <command>COPY FROM</command>, when the target table has been
+       created or truncated earlier in the same transaction
+      </para>
+     </listitem>
+    </itemizedlist>
     </para>
-
    </sect2>
  
    <sect2 id="populate-analyze">
@@ -950,15 +976,43 @@ SELECT * FROM x, y, a, b, c WHERE something AND somethingelse;
      By default, <application>pg_dump</> uses <command>COPY</>, and when
      it is generating a complete schema-and-data dump, it is careful to
      load data before creating indexes and foreign keys.  So in this case
-    the first several guidelines are handled automatically.  What is left
-    for you to do is to set appropriate (i.e., larger than normal) values
-    for <varname>maintenance_work_mem</varname> and
-    <varname>checkpoint_segments</varname>, as well as unsetting 
-    <varname>archive_command</varname> before loading the dump script,
-    and then to run <command>ANALYZE</> afterwards and resetting
-    <varname>archive_command</varname> if required. All of the 
-    parameters can be reset once the load has completed without needing
-    to restart the server, as described in <xref linkend="config-setting">.
+    several guidelines are handled automatically.  What is left
+    for you to do is to:
+    <itemizedlist>
+     <listitem>
+      <para>
+       Set appropriate (i.e., larger than normal) values for
+       <varname>maintenance_work_mem</varname> and
+       <varname>checkpoint_segments</varname>.
+      </para>
+     </listitem>
+     <listitem>
+      <para>
+       If using WAL archiving, consider disabling it during the restore.
+       To do that, unset <varname>archive_command</varname> before loading the
+       dump script, and afterwards restore <varname>archive_command</varname>
+       and take a fresh base backup.
+      </para>
+     </listitem>
+     <listitem>
+      <para>
+       Consider whether the whole dump should be restored as a single
+       transaction.  To do that, pass the <option>-1</> or
+       <option>--single-transaction</> command-line option to
+       <application>psql</> or <application>pg_restore</>. When using this
+       mode, even the smallest of errors will roll back the entire restore,
+       possibly discarding many hours of processing.  Depending on how
+       interrelated the data is, that might seem preferable to manual cleanup,
+       or not.  <command>COPY</> commands will run fastest if you use a single
+       transaction and have WAL archiving turned off.
+      </para>
+     </listitem>
+     <listitem>
+      <para>
+       Run <command>ANALYZE</> afterwards.
+      </para>
+     </listitem>
+    </itemizedlist>
     </para>
  
     <para>
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c

index 367831a515a8b9943002ffa280e09bee7175b4ca..a99aa4ced0af7024d295990f23033af79adb3eb9 100644 (file)
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -8,7 +8,7 @@
   *
   *
   * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/access/heap/heapam.c,v 1.229 2007/03/25 19:45:13 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/access/heap/heapam.c,v 1.230 2007/03/29 00:15:37 tgl Exp $
   *
   *
   * INTERFACE ROUTINES
@@ -1360,11 +1360,14 @@ heap_get_latest_tid(Relation relation,
   * that all new tuples go into new pages not containing any tuples from other
   * transactions, that the relation gets fsync'd before commit, and that the
   * transaction emits at least one WAL record to ensure RecordTransactionCommit
- * will decide to WAL-log the commit. (see heap_sync() comments also)
+ * will decide to WAL-log the commit.  (See also heap_sync() comments.)
   *
   * use_fsm is passed directly to RelationGetBufferForTuple, which see for
   * more info.
   *
+ * Note that use_wal and use_fsm will be applied when inserting into the
+ * heap's TOAST table, too, if the tuple requires any out-of-line data.
+ *
   * The return value is the OID assigned to the tuple (either here or by the
   * caller), or InvalidOid if no OID.  The header fields of *tup are updated
   * to match the stored tuple; in particular tup->t_self receives the actual
@@ -1418,7 +1421,8 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid,
          * into the relation; tup is the caller's original untoasted data.
          */
         if (HeapTupleHasExternal(tup) || tup->t_len > TOAST_TUPLE_THRESHOLD)
-               heaptup = toast_insert_or_update(relation, tup, NULL, use_wal);
+               heaptup = toast_insert_or_update(relation, tup, NULL,
+                                                                                use_wal, use_fsm);
         else
                 heaptup = tup;
  
@@ -1526,8 +1530,10 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid,
   *     simple_heap_insert - insert a tuple
   *
   * Currently, this routine differs from heap_insert only in supplying
- * a default command ID.  But it should be used rather than using
- * heap_insert directly in most places where we are modifying system catalogs.
+ * a default command ID and not allowing access to the speedup options.
+ *
+ * This should be used rather than using heap_insert directly in most places
+ * where we are modifying system catalogs.
   */
  Oid
  simple_heap_insert(Relation relation, HeapTuple tup)
@@ -1535,18 +1541,6 @@ simple_heap_insert(Relation relation, HeapTuple tup)
         return heap_insert(relation, tup, GetCurrentCommandId(), true, true);
  }
  
-/*
- *     fast_heap_insert - insert a tuple with options to improve speed
- *
- * Currently, this routine allows specifying additional options for speed
- * in certain cases, such as WAL-avoiding COPY command
- */
-Oid
-fast_heap_insert(Relation relation, HeapTuple tup, bool use_wal)
-{
-       return heap_insert(relation, tup, GetCurrentCommandId(), use_wal, use_wal);
-}
-
  /*
   *     heap_delete - delete a tuple
   *
@@ -2112,7 +2106,9 @@ l2:
                  */
                 if (need_toast)
                 {
-                       heaptup = toast_insert_or_update(relation, newtup, &oldtup, true);
+                       /* Note we always use WAL and FSM during updates */
+                       heaptup = toast_insert_or_update(relation, newtup, &oldtup,
+                                                                                        true, true);
                         newtupsize = MAXALIGN(heaptup->t_len);
                 }
                 else
@@ -3988,23 +3984,40 @@ heap2_desc(StringInfo buf, uint8 xl_info, char *rec)
                 appendStringInfo(buf, "UNKNOWN");
  }
  
-/* ----------------
- *             heap_sync - sync a heap, for use when no WAL has been written
- *
- * ----------------
+/*
+ *     heap_sync               - sync a heap, for use when no WAL has been written
+ *
+ * This forces the heap contents (including TOAST heap if any) down to disk.
+ * If we skipped using WAL, and it's not a temp relation, we must force the
+ * relation down to disk before it's safe to commit the transaction.  This
+ * requires writing out any dirty buffers and then doing a forced fsync.
+ *
+ * Indexes are not touched.  (Currently, index operations associated with
+ * the commands that use this are WAL-logged and so do not need fsync.
+ * That behavior might change someday, but in any case it's likely that
+ * any fsync decisions required would be per-index and hence not appropriate
+ * to be done here.)
   */
  void
  heap_sync(Relation rel)
  {
-       if (!rel->rd_istemp)
+       /* temp tables never need fsync */
+       if (rel->rd_istemp)
+               return;
+
+       /* main heap */
+       FlushRelationBuffers(rel);
+       /* FlushRelationBuffers will have opened rd_smgr */
+       smgrimmedsync(rel->rd_smgr);
+
+       /* toast heap, if any */
+       if (OidIsValid(rel->rd_rel->reltoastrelid))
         {
-               /*
-                * If we skipped using WAL, and it's not a temp relation,
-                * we must force the relation down to disk before it's
-                * safe to commit the transaction.  This requires forcing
-                * out any dirty buffers and then doing a forced fsync.
-                */
-               FlushRelationBuffers(rel);
-               smgrimmedsync(rel->rd_smgr);
+               Relation                toastrel;
+
+               toastrel = heap_open(rel->rd_rel->reltoastrelid, AccessShareLock);
+               FlushRelationBuffers(toastrel);
+               smgrimmedsync(toastrel->rd_smgr);
+               heap_close(toastrel, AccessShareLock);
         }
  }
diff --git a/src/backend/access/heap/tuptoaster.c b/src/backend/access/heap/tuptoaster.c

index b1eb8aea4d362e0d308477a90bb864290d76dbf1..b1e02e13755c0110f3159295029f729c6b6b3696 100644 (file)
--- a/src/backend/access/heap/tuptoaster.c
+++ b/src/backend/access/heap/tuptoaster.c
@@ -8,7 +8,7 @@
   *
   *
   * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/access/heap/tuptoaster.c,v 1.71 2007/02/27 23:48:07 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/access/heap/tuptoaster.c,v 1.72 2007/03/29 00:15:37 tgl Exp $
   *
   *
   * INTERFACE ROUTINES
@@ -33,6 +33,7 @@
  #include "access/genam.h"
  #include "access/heapam.h"
  #include "access/tuptoaster.h"
+#include "access/xact.h"
  #include "catalog/catalog.h"
  #include "utils/fmgroids.h"
  #include "utils/pg_lzcompress.h"
@@ -42,7 +43,8 @@
  #undef TOAST_DEBUG
  
  static void toast_delete_datum(Relation rel, Datum value);
-static Datum toast_save_datum(Relation rel, Datum value, bool use_wal);
+static Datum toast_save_datum(Relation rel, Datum value,
+                                                         bool use_wal, bool use_fsm);
  static varattrib *toast_fetch_datum(varattrib *attr);
  static varattrib *toast_fetch_datum_slice(varattrib *attr,
                                                 int32 sliceoffset, int32 length);
@@ -333,6 +335,7 @@ toast_delete(Relation rel, HeapTuple oldtup)
   * Inputs:
   *     newtup: the candidate new tuple to be inserted
   *     oldtup: the old row version for UPDATE, or NULL for INSERT
+ *     use_wal, use_fsm: flags to be passed to heap_insert() for toast rows
   * Result:
   *     either newtup if no toasting is needed, or a palloc'd modified tuple
   *     that is what should actually get stored
@@ -342,7 +345,8 @@ toast_delete(Relation rel, HeapTuple oldtup)
   * ----------
   */
  HeapTuple
-toast_insert_or_update(Relation rel, HeapTuple newtup, HeapTuple oldtup, bool use_wal)
+toast_insert_or_update(Relation rel, HeapTuple newtup, HeapTuple oldtup,
+                                          bool use_wal, bool use_fsm)
  {
         HeapTuple       result_tuple;
         TupleDesc       tupleDesc;
@@ -618,7 +622,8 @@ toast_insert_or_update(Relation rel, HeapTuple newtup, HeapTuple oldtup, bool us
                 i = biggest_attno;
                 old_value = toast_values[i];
                 toast_action[i] = 'p';
-               toast_values[i] = toast_save_datum(rel, toast_values[i], use_wal);
+               toast_values[i] = toast_save_datum(rel, toast_values[i],
+                                                                                  use_wal, use_fsm);
                 if (toast_free[i])
                         pfree(DatumGetPointer(old_value));
  
@@ -729,7 +734,8 @@ toast_insert_or_update(Relation rel, HeapTuple newtup, HeapTuple oldtup, bool us
                 i = biggest_attno;
                 old_value = toast_values[i];
                 toast_action[i] = 'p';
-               toast_values[i] = toast_save_datum(rel, toast_values[i], use_wal);
+               toast_values[i] = toast_save_datum(rel, toast_values[i],
+                                                                                  use_wal, use_fsm);
                 if (toast_free[i])
                         pfree(DatumGetPointer(old_value));
  
@@ -977,7 +983,8 @@ toast_compress_datum(Datum value)
   * ----------
   */
  static Datum
-toast_save_datum(Relation rel, Datum value, bool use_wal)
+toast_save_datum(Relation rel, Datum value,
+                                bool use_wal, bool use_fsm)
  {
         Relation        toastrel;
         Relation        toastidx;
@@ -985,6 +992,7 @@ toast_save_datum(Relation rel, Datum value, bool use_wal)
         TupleDesc       toasttupDesc;
         Datum           t_values[3];
         bool            t_isnull[3];
+       CommandId       mycid = GetCurrentCommandId();
         varattrib  *result;
         struct
         {
@@ -1063,7 +1071,7 @@ toast_save_datum(Relation rel, Datum value, bool use_wal)
                 if (!HeapTupleIsValid(toasttup))
                         elog(ERROR, "failed to build TOAST tuple");
  
-               fast_heap_insert(toastrel, toasttup, use_wal);
+               heap_insert(toastrel, toasttup, mycid, use_wal, use_fsm);
  
                 /*
                  * Create the index entry.      We cheat a little here by not using
diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c

index b660a94aabc4a04772c0204ec48dabfac1694671..377bc9f4f2cbac495551efd19392aa6c40aad4f5 100644 (file)
--- a/src/backend/catalog/index.c
+++ b/src/backend/catalog/index.c
@@ -8,7 +8,7 @@
   *
   *
   * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/catalog/index.c,v 1.281 2007/03/25 19:45:14 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/catalog/index.c,v 1.282 2007/03/29 00:15:37 tgl Exp $
   *
   *
   * INTERFACE ROUTINES
@@ -1248,12 +1248,11 @@ setNewRelfilenode(Relation relation)
  
         heap_close(pg_class, RowExclusiveLock);
  
-       /* Remember we did this in current transaction, to allow later optimisations */
-       relation->rd_newRelfilenodeSubid = GetCurrentSubTransactionId();
-       RelationCacheResetAtEOXact();
-
         /* Make sure the relfilenode change is visible */
         CommandCounterIncrement();
+
+       /* Mark the rel as having a new relfilenode in current transaction */
+       RelationCacheMarkNewRelfilenode(relation);
  }
  
  
diff --git a/src/backend/commands/cluster.c b/src/backend/commands/cluster.c

index aa911369409584e02cfda92b641af23d5dfd8089..ac771b77a605127165778a72d148fe194b6494ef 100644 (file)
--- a/src/backend/commands/cluster.c
+++ b/src/backend/commands/cluster.c
@@ -11,7 +11,7 @@
   *
   *
   * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/commands/cluster.c,v 1.157 2007/03/13 00:33:39 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/commands/cluster.c,v 1.158 2007/03/29 00:15:37 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -653,6 +653,8 @@ copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex)
         char       *nulls;
         IndexScanDesc scan;
         HeapTuple       tuple;
+       CommandId       mycid = GetCurrentCommandId();
+       bool            use_wal;
  
         /*
          * Open the relations we need.
@@ -675,6 +677,17 @@ copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex)
         nulls = (char *) palloc(natts * sizeof(char));
         memset(nulls, 'n', natts * sizeof(char));
  
+       /*
+        * We need to log the copied data in WAL iff WAL archiving is enabled AND
+        * it's not a temp rel.  (Since we know the target relation is new and
+        * can't have any FSM data, we can always tell heap_insert to ignore FSM,
+        * even when using WAL.)
+        */
+       use_wal = XLogArchivingActive() && !NewHeap->rd_istemp;
+
+       /* use_wal off requires rd_targblock be initially invalid */
+       Assert(NewHeap->rd_targblock == InvalidBlockNumber);
+
         /*
          * Scan through the OldHeap on the OldIndex and copy each tuple into the
          * NewHeap.
@@ -722,7 +735,7 @@ copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex)
                 if (NewHeap->rd_rel->relhasoids)
                         HeapTupleSetOid(copiedTuple, HeapTupleGetOid(tuple));
  
-               simple_heap_insert(NewHeap, copiedTuple);
+               heap_insert(NewHeap, copiedTuple, mycid, use_wal, false);
  
                 heap_freetuple(copiedTuple);
  
@@ -734,6 +747,9 @@ copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex)
         pfree(values);
         pfree(nulls);
  
+       if (!use_wal)
+               heap_sync(NewHeap);
+
         index_close(OldIndex, NoLock);
         heap_close(OldHeap, NoLock);
         heap_close(NewHeap, NoLock);
diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c

index a2e1939ea252b49be37d6dcfe28bd3bde15e1214..99d347f590796941d2338e57e1fac031b9e8a3fc 100644 (file)
--- a/src/backend/commands/copy.c
+++ b/src/backend/commands/copy.c
@@ -8,7 +8,7 @@
   *
   *
   * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/commands/copy.c,v 1.278 2007/03/13 00:33:39 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/commands/copy.c,v 1.279 2007/03/29 00:15:38 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -1125,11 +1125,10 @@ DoCopy(const CopyStmt *stmt, const char *queryString)
         cstate->copy_dest = COPY_FILE;          /* default */
         cstate->filename = stmt->filename;
  
-       if (is_from)                            /* copy from file to database */
-               CopyFrom(cstate);
+       if (is_from)
+               CopyFrom(cstate);               /* copy from file to database */
         else
-               /* copy from database to file */
-               DoCopyTo(cstate);
+               DoCopyTo(cstate);               /* copy from database to file */
  
         /*
          * Close the relation or query.  If reading, we can release the
@@ -1640,7 +1639,9 @@ CopyFrom(CopyState cstate)
         ExprContext *econtext;          /* used for ExecEvalExpr for default atts */
         MemoryContext oldcontext = CurrentMemoryContext;
         ErrorContextCallback errcontext;
-       bool            use_wal = true; /* By default, we use WAL to log db changes */
+       CommandId       mycid = GetCurrentCommandId();
+       bool            use_wal = true;         /* by default, use WAL logging */
+       bool            use_fsm = true;         /* by default, use FSM for free space */
  
         Assert(cstate->rel);
  
@@ -1663,6 +1664,48 @@ CopyFrom(CopyState cstate)
                                                         RelationGetRelationName(cstate->rel))));
         }
  
+       /*----------
+        * Check to see if we can avoid writing WAL
+        *
+        * If archive logging is not enabled *and* either
+        *      - table was created in same transaction as this COPY
+        *      - data is being written to relfilenode created in this transaction
+        * then we can skip writing WAL.  It's safe because if the transaction
+        * doesn't commit, we'll discard the table (or the new relfilenode file).
+        * If it does commit, we'll have done the heap_sync at the bottom of this
+        * routine first.
+        *
+        * As mentioned in comments in utils/rel.h, the in-same-transaction test
+        * is not completely reliable, since in rare cases rd_createSubid or
+        * rd_newRelfilenodeSubid can be cleared before the end of the transaction.
+        * However this is OK since at worst we will fail to make the optimization.
+        *
+        * When skipping WAL it's entirely possible that COPY itself will write no
+        * WAL records at all.  This is of concern because RecordTransactionCommit
+        * might decide it doesn't need to log our eventual commit, which we
+        * certainly need it to do.  However, we need no special action here for
+        * that, because if we have a new table or new relfilenode then there
+        * must have been a WAL-logged pg_class update earlier in the transaction.
+        *
+        * Also, if the target file is new-in-transaction, we assume that checking
+        * FSM for free space is a waste of time, even if we must use WAL because
+        * of archiving.  This could possibly be wrong, but it's unlikely.
+        *
+        * The comments for heap_insert and RelationGetBufferForTuple specify that
+        * skipping WAL logging is only safe if we ensure that our tuples do not
+        * go into pages containing tuples from any other transactions --- but this
+        * must be the case if we have a new table or new relfilenode, so we need
+        * no additional work to enforce that.
+        *----------
+        */
+       if (cstate->rel->rd_createSubid != InvalidSubTransactionId ||
+               cstate->rel->rd_newRelfilenodeSubid != InvalidSubTransactionId)
+       {
+               use_fsm = false;
+               if (!XLogArchivingActive())
+                       use_wal = false;
+       }
+
         if (pipe)
         {
                 if (whereToSendOutput == DestRemote)
@@ -1832,28 +1875,6 @@ CopyFrom(CopyState cstate)
         nfields = file_has_oids ? (attr_count + 1) : attr_count;
         field_strings = (char **) palloc(nfields * sizeof(char *));
  
-       /*
-        * Check for performance optimization by avoiding WAL writes
-        *
-        * If archive logging is not be enabled *and* either
-        * - table is created in same transaction as this COPY
-        * - table data is now being written to new relfilenode
-        * then we can safely avoid writing WAL. Why? 
-        * The data files for the table plus toast table/index, plus any indexes
-        * will all be dropped at the end of the transaction if it fails, so we
-        * do not need to worry about inconsistent states.
-        * As mentioned in comments in utils/rel.h, the in-same-transaction test is
-        * not completely reliable, since rd_createSubId can be reset to zero in
-        * certain cases before the end of the creating transaction. 
-        * We are doing this for performance only, so we only need to know: 
-        * if rd_createSubid != InvalidSubTransactionId then it is *always* just 
-        * created. If we have PITR enabled, then we *must* use_wal
-        */
-       if ((cstate->rel->rd_createSubid                 != InvalidSubTransactionId ||
-            cstate->rel->rd_newRelfilenodeSubid != InvalidSubTransactionId)
-               && !XLogArchivingActive())
-               use_wal = false;
-
         /* Initialize state variables */
         cstate->fe_eof = false;
         cstate->eol_type = EOL_UNKNOWN;
@@ -2087,7 +2108,7 @@ CopyFrom(CopyState cstate)
                                 ExecConstraints(resultRelInfo, slot, estate);
  
                         /* OK, store the tuple and create index entries for it */
-                       fast_heap_insert(cstate->rel, tuple, use_wal);
+                       heap_insert(cstate->rel, tuple, mycid, use_wal, use_fsm);
  
                         if (resultRelInfo->ri_NumIndices > 0)
                                 ExecInsertIndexTuples(slot, &(tuple->t_self), estate, false);
@@ -2104,32 +2125,6 @@ CopyFrom(CopyState cstate)
                 }
         }
  
-       /* 
-        * If we skipped writing WAL for heaps, then we need to sync
-        */
-       if (!use_wal)
-       {
-               /* main heap */
-               heap_sync(cstate->rel);
-
-               /* main heap indexes, if any */
-               /* we always use WAL for index inserts, so no need to sync */
-
-               /* toast heap, if any */
-               if (OidIsValid(cstate->rel->rd_rel->reltoastrelid))
-               {
-                        Relation               toastrel;
-
-                        toastrel = heap_open(cstate->rel->rd_rel->reltoastrelid,
-                                                                 AccessShareLock);
-                        heap_sync(toastrel);
-                        heap_close(toastrel, AccessShareLock);
-               }
-
-               /* toast index, if toast heap */
-               /* we always use WAL for index inserts, so no need to sync */
-       }
-
         /* Done, clean up */
         error_context_stack = errcontext.previous;
  
@@ -2164,6 +2159,13 @@ CopyFrom(CopyState cstate)
                                          errmsg("could not read from file \"%s\": %m",
                                                         cstate->filename)));
         }
+
+       /* 
+        * If we skipped writing WAL, then we need to sync the heap (but not
+        * indexes since those use WAL anyway)
+        */
+       if (!use_wal)
+               heap_sync(cstate->rel);
  }
  
  
diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c

index b2f7159e8c0099a76e065fb02c092be7cf8a62e5..cacd7c6fe7099235367580dce4e0ff0705a118a2 100644 (file)
--- a/src/backend/executor/execMain.c
+++ b/src/backend/executor/execMain.c
@@ -26,7 +26,7 @@
   *
   *
   * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/executor/execMain.c,v 1.291 2007/03/25 19:45:14 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/executor/execMain.c,v 1.292 2007/03/29 00:15:38 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -2541,14 +2541,8 @@ CloseIntoRel(QueryDesc *queryDesc)
         /* OpenIntoRel might never have gotten called */
         if (estate->es_into_relation_descriptor)
         {
-               /*
-                * If we skipped using WAL, and it's not a temp relation, we must
-                * force the relation down to disk before it's safe to commit the
-                * transaction.  This requires forcing out any dirty buffers and then
-                * doing a forced fsync.
-                */
-               if (!estate->es_into_relation_use_wal &&
-                       !estate->es_into_relation_descriptor->rd_istemp)
+               /* If we skipped using WAL, must heap_sync before commit */
+               if (!estate->es_into_relation_use_wal)
                         heap_sync(estate->es_into_relation_descriptor);
  
                 /* close rel, but keep lock until commit */
diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c

index 91b7f146b43abd96b9a8d1dcf2d9e0e169908856..d8bd36bc94f6d9dc50b23f45c90031c37be613f7 100644 (file)
--- a/src/backend/utils/cache/relcache.c
+++ b/src/backend/utils/cache/relcache.c
@@ -8,7 +8,7 @@
   *
   *
   * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/utils/cache/relcache.c,v 1.258 2007/03/19 23:38:29 wieck Exp $
+ *       $PostgreSQL: pgsql/src/backend/utils/cache/relcache.c,v 1.259 2007/03/29 00:15:38 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -1572,7 +1572,8 @@ RelationClose(Relation relation)
  
  #ifdef RELCACHE_FORCE_RELEASE
         if (RelationHasReferenceCountZero(relation) &&
-               relation->rd_createSubid == InvalidSubTransactionId)
+               relation->rd_createSubid == InvalidSubTransactionId &&
+               relation->rd_newRelfilenodeSubid == InvalidSubTransactionId)
                 RelationClearRelation(relation, false);
  #endif
  }
@@ -1759,11 +1760,12 @@ RelationClearRelation(Relation relation, bool rebuild)
         {
                 /*
                  * When rebuilding an open relcache entry, must preserve ref count and
-                * rd_createSubid state.  Also attempt to preserve the tupledesc and
-                * rewrite-rule substructures in place.  (Note: the refcount mechanism
-                * for tupledescs may eventually ensure that we don't really need to
-                * preserve the tupledesc in-place, but for now there are still a lot
-                * of places that assume an open rel's tupledesc won't move.)
+                * rd_createSubid/rd_newRelfilenodeSubid state.  Also attempt to
+                * preserve the tupledesc and rewrite-rule substructures in place.
+                * (Note: the refcount mechanism for tupledescs may eventually ensure
+                * that we don't really need to preserve the tupledesc in-place, but
+                * for now there are still a lot of places that assume an open rel's
+                * tupledesc won't move.)
                  *
                  * Note that this process does not touch CurrentResourceOwner; which
                  * is good because whatever ref counts the entry may have do not
@@ -1839,7 +1841,7 @@ RelationFlushRelation(Relation relation)
                 /*
                  * New relcache entries are always rebuilt, not flushed; else we'd
                  * forget the "new" status of the relation, which is a useful
-                * optimization to have.
+                * optimization to have.  Ditto for the new-relfilenode status.
                  */
                 rebuild = true;
         }
@@ -1916,6 +1918,8 @@ RelationCacheInvalidateEntry(Oid relationId)
   *      so we do not touch new-in-transaction relations; they cannot be targets
   *      of cross-backend SI updates (and our own updates now go through a
   *      separate linked list that isn't limited by the SI message buffer size).
+ *      Likewise, we need not discard new-relfilenode-in-transaction hints,
+ *      since any invalidation of those would be a local event.
   *
   *      We do this in two phases: the first pass deletes deletable items, and
   *      the second one rebuilds the rebuildable items.  This is essential for
@@ -1958,14 +1962,6 @@ RelationCacheInvalidate(void)
                 if (relation->rd_createSubid != InvalidSubTransactionId)
                         continue;
  
-               /* 
-                * Reset newRelfilenode hint. It is never used for correctness, only
-                * for performance optimization. An incorrectly set hint can lead
-                * to data loss in some circumstances, so play safe.
-                */
-               if (relation->rd_newRelfilenodeSubid != InvalidSubTransactionId)
-                       relation->rd_newRelfilenodeSubid = InvalidSubTransactionId;
-
                 relcacheInvalsReceived++;
  
                 if (RelationHasReferenceCountZero(relation))
@@ -2017,17 +2013,6 @@ RelationCacheInvalidate(void)
         list_free(rebuildList);
  }
  
-/*
- * RelationCacheResetAtEOXact
- *
- *  Register that work will be required at main-transaction commit or abort
- */
-void
-RelationCacheResetAtEOXact(void)
-{
-       need_eoxact_work = true;
-}
-
  /*
   * AtEOXact_RelationCache
   *
@@ -2056,9 +2041,10 @@ AtEOXact_RelationCache(bool isCommit)
          * the debug-only Assert checks, most transactions don't create any work
          * for us to do here, so we keep a static flag that gets set if there is
          * anything to do.      (Currently, this means either a relation is created in
-        * the current xact, or an index list is forced.)  For simplicity, the
-        * flag remains set till end of top-level transaction, even though we
-        * could clear it at subtransaction end in some cases.
+        * the current xact, or one is given a new relfilenode, or an index list
+        * is forced.)  For simplicity, the flag remains set till end of top-level
+        * transaction, even though we could clear it at subtransaction end in
+        * some cases.
          */
         if (!need_eoxact_work
  #ifdef USE_ASSERT_CHECKING
@@ -2111,6 +2097,10 @@ AtEOXact_RelationCache(bool isCommit)
                                 continue;
                         }
                 }
+
+               /*
+                * Likewise, reset the hint about the relfilenode being new.
+                */
                 relation->rd_newRelfilenodeSubid = InvalidSubTransactionId;
  
                 /*
@@ -2173,6 +2163,10 @@ AtEOSubXact_RelationCache(bool isCommit, SubTransactionId mySubid,
                                 continue;
                         }
                 }
+
+               /*
+                * Likewise, update or drop any new-relfilenode-in-subtransaction hint.
+                */
                 if (relation->rd_newRelfilenodeSubid == mySubid)
                 {
                         if (isCommit)
@@ -2194,6 +2188,23 @@ AtEOSubXact_RelationCache(bool isCommit, SubTransactionId mySubid,
         }
  }
  
+/*
+ * RelationCacheMarkNewRelfilenode
+ *
+ *     Mark the rel as having been given a new relfilenode in the current
+ *     (sub) transaction.  This is only a hint used to optimize later
+ *     operations on the rel in the same xact; it is reset at end of xact.
+ */
+void
+RelationCacheMarkNewRelfilenode(Relation rel)
+{
+       /* Remember which (sub)transaction assigned the new relfilenode */
+       rel->rd_newRelfilenodeSubid = GetCurrentSubTransactionId();
+       /* Set the flag so AtEOXact_RelationCache scans and resets the hint */
+       need_eoxact_work = true;
+}
+
+
  /*
   *             RelationBuildLocalRelation
   *                     Build a relcache entry for an about-to-be-created relation,
@@ -2272,7 +2283,7 @@ RelationBuildLocalRelation(const char *relname,
         rel->rd_newRelfilenodeSubid = InvalidSubTransactionId;
  
         /* must flag that we have rels created in this transaction */
-       RelationCacheResetAtEOXact();
+       need_eoxact_work = true;
  
         /* is it a temporary relation? */
         rel->rd_istemp = isTempNamespace(relnamespace);
@@ -2928,7 +2939,7 @@ RelationSetIndexList(Relation relation, List *indexIds, Oid oidIndex)
         relation->rd_oidindex = oidIndex;
         relation->rd_indexvalid = 2;    /* mark list as forced */
         /* must flag that we have a forced index list */
-       RelationCacheResetAtEOXact();
+       need_eoxact_work = true;
  }
  
  /*
diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h

index 3a689599570cc14f3346b7ddd87f5df70305c0ab..6c7c98b3f286719f1a024d709b56432354b85741 100644 (file)
--- a/src/include/access/heapam.h
+++ b/src/include/access/heapam.h
@@ -7,7 +7,7 @@
   * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
- * $PostgreSQL: pgsql/src/include/access/heapam.h,v 1.120 2007/01/25 02:17:26 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/access/heapam.h,v 1.121 2007/03/29 00:15:39 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -178,9 +178,6 @@ extern void simple_heap_delete(Relation relation, ItemPointer tid);
  extern void simple_heap_update(Relation relation, ItemPointer otid,
                                    HeapTuple tup);
  
-extern Oid fast_heap_insert(Relation relation, HeapTuple tup, bool use_wal);
-               
-
  extern void heap_markpos(HeapScanDesc scan);
  extern void heap_restrpos(HeapScanDesc scan);
  
diff --git a/src/include/access/tuptoaster.h b/src/include/access/tuptoaster.h

index 3693379dba72edb0312539daa41b1dd123aee3c1..6cc0bdcbe8c960233908dbb803c7960a04c2f82f 100644 (file)
--- a/src/include/access/tuptoaster.h
+++ b/src/include/access/tuptoaster.h
@@ -6,7 +6,7 @@
   *
   * Copyright (c) 2000-2007, PostgreSQL Global Development Group
   *
- * $PostgreSQL: pgsql/src/include/access/tuptoaster.h,v 1.32 2007/02/05 04:22:18 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/access/tuptoaster.h,v 1.33 2007/03/29 00:15:39 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -79,7 +79,8 @@
   * ----------
   */
  extern HeapTuple toast_insert_or_update(Relation rel,
-                                          HeapTuple newtup, HeapTuple oldtup, bool use_wal);
+                                                                               HeapTuple newtup, HeapTuple oldtup,
+                                                                               bool use_wal, bool use_fsm);
  
  /* ----------
   * toast_delete -
diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h

index 2963cc6616aae31030224a7fdf17869ed80aab71..33795de2bf8acb17a278c74e7e69045bbffeef40 100644 (file)
--- a/src/include/utils/rel.h
+++ b/src/include/utils/rel.h
@@ -7,7 +7,7 @@
   * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
- * $PostgreSQL: pgsql/src/include/utils/rel.h,v 1.99 2007/03/19 23:38:32 wieck Exp $
+ * $PostgreSQL: pgsql/src/include/utils/rel.h,v 1.100 2007/03/29 00:15:39 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -138,13 +138,17 @@ typedef struct RelationData
         char            rd_indexvalid;  /* state of rd_indexlist: 0 = not valid, 1 =
                                                                  * valid, 2 = temporarily forced */
         SubTransactionId rd_createSubid;        /* rel was created in current xact */
-       SubTransactionId rd_newRelfilenodeSubid;        /* rel had new relfilenode in current xact */
+       SubTransactionId rd_newRelfilenodeSubid;        /* new relfilenode assigned
+                                                                                                * in current xact */
  
         /*
          * rd_createSubid is the ID of the highest subtransaction the rel has
          * survived into; or zero if the rel was not created in the current top
          * transaction.  This should be relied on only for optimization purposes;
          * it is possible for new-ness to be "forgotten" (eg, after CLUSTER).
+        * Likewise, rd_newRelfilenodeSubid is the ID of the highest subtransaction
+        * the relfilenode change has survived into, or zero if not changed in
+        * the current transaction (or we have forgotten changing it).
          */
         Form_pg_class rd_rel;           /* RELATION tuple */
         TupleDesc       rd_att;                 /* tuple descriptor */
diff --git a/src/include/utils/relcache.h b/src/include/utils/relcache.h

index c7b549d1cff2fdfe689f8f1fd66b210da366ef16..25b60082a093f44609f2a4683246f4ecaf477b34 100644 (file)
--- a/src/include/utils/relcache.h
+++ b/src/include/utils/relcache.h
@@ -7,7 +7,7 @@
   * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
- * $PostgreSQL: pgsql/src/include/utils/relcache.h,v 1.58 2007/03/03 20:08:41 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/utils/relcache.h,v 1.59 2007/03/29 00:15:39 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -60,12 +60,12 @@ extern void RelationCacheInvalidateEntry(Oid relationId);
  
  extern void RelationCacheInvalidate(void);
  
-extern void RelationCacheResetAtEOXact(void);
-
  extern void AtEOXact_RelationCache(bool isCommit);
  extern void AtEOSubXact_RelationCache(bool isCommit, SubTransactionId mySubid,
                                                   SubTransactionId parentSubid);
  
+extern void RelationCacheMarkNewRelfilenode(Relation rel);
+
  /*
   * Routines to help manage rebuilding of relcache init file
   */
author	Tom Lane <tgl@sss.pgh.pa.us>
	Thu, 29 Mar 2007 00:15:39 +0000 (00:15 +0000)
committer	Tom Lane <tgl@sss.pgh.pa.us>
	Thu, 29 Mar 2007 00:15:39 +0000 (00:15 +0000)
doc/src/sgml/perform.sgml		patch \| blob \| history
src/backend/access/heap/heapam.c		patch \| blob \| history
src/backend/access/heap/tuptoaster.c		patch \| blob \| history
src/backend/catalog/index.c		patch \| blob \| history
src/backend/commands/cluster.c		patch \| blob \| history
src/backend/commands/copy.c		patch \| blob \| history
src/backend/executor/execMain.c		patch \| blob \| history
src/backend/utils/cache/relcache.c		patch \| blob \| history
src/include/access/heapam.h		patch \| blob \| history
src/include/access/tuptoaster.h		patch \| blob \| history
src/include/utils/rel.h		patch \| blob \| history
src/include/utils/relcache.h		patch \| blob \| history