granicus.if.org Git - postgresql/commitdiff
Avoid WAL-logging individual tuple insertions during CREATE TABLE AS
author: Tom Lane <tgl@sss.pgh.pa.us>
Mon, 20 Jun 2005 18:37:02 +0000 (18:37 +0000)
committer: Tom Lane <tgl@sss.pgh.pa.us>
Mon, 20 Jun 2005 18:37:02 +0000 (18:37 +0000)
Avoid WAL-logging individual tuple insertions during CREATE TABLE AS
(a/k/a SELECT INTO).  Instead, flush and fsync the whole relation before
committing.  We do still need the WAL log when PITR is active, however.
Simon Riggs and Tom Lane.

src/backend/access/heap/heapam.c
src/backend/access/heap/hio.c
src/backend/executor/execMain.c
src/backend/executor/execUtils.c
src/backend/storage/smgr/md.c
src/backend/storage/smgr/smgr.c
src/include/access/heapam.h
src/include/access/hio.h
src/include/nodes/execnodes.h

index 74f76c1d16aeeef14506ca83d78b4b6a9a809874..843b2909ef27afdb0a1a93249c601fc20cfabb45 100644 (file)
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/access/heap/heapam.c,v 1.194 2005/06/08 15:50:21 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/access/heap/heapam.c,v 1.195 2005/06/20 18:37:01 tgl Exp $
  *
  *
  * INTERFACE ROUTINES
@@ -1034,9 +1034,20 @@ heap_get_latest_tid(Relation relation,
  *
  * The new tuple is stamped with current transaction ID and the specified
  * command ID.
+ *
+ * If use_wal is false, the new tuple is not logged in WAL, even for a
+ * non-temp relation.  Safe usage of this behavior requires that we arrange
+ * that all new tuples go into new pages not containing any tuples from other
+ * transactions, that the relation gets fsync'd before commit, and that the
+ * transaction emits at least one WAL record to ensure RecordTransactionCommit
+ * will decide to WAL-log the commit.
+ *
+ * use_fsm is passed directly to RelationGetBufferForTuple, which see for
+ * more info.
  */
 Oid
-heap_insert(Relation relation, HeapTuple tup, CommandId cid)
+heap_insert(Relation relation, HeapTuple tup, CommandId cid,
+                       bool use_wal, bool use_fsm)
 {
        TransactionId xid = GetCurrentTransactionId();
        Buffer          buffer;
@@ -1086,7 +1097,8 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid)
                heap_tuple_toast_attrs(relation, tup, NULL);
 
        /* Find buffer to insert this tuple into */
-       buffer = RelationGetBufferForTuple(relation, tup->t_len, InvalidBuffer);
+       buffer = RelationGetBufferForTuple(relation, tup->t_len,
+                                                                          InvalidBuffer, use_fsm);
 
        /* NO EREPORT(ERROR) from here till changes are logged */
        START_CRIT_SECTION();
@@ -1096,7 +1108,12 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid)
        pgstat_count_heap_insert(&relation->pgstat_info);
 
        /* XLOG stuff */
-       if (!relation->rd_istemp)
+       if (relation->rd_istemp)
+       {
+               /* No XLOG record, but still need to flag that XID exists on disk */
+               MyXactMadeTempRelUpdate = true;
+       }
+       else if (use_wal)
        {
                xl_heap_insert xlrec;
                xl_heap_header xlhdr;
@@ -1151,11 +1168,6 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid)
                PageSetLSN(page, recptr);
                PageSetTLI(page, ThisTimeLineID);
        }
-       else
-       {
-               /* No XLOG record, but still need to flag that XID exists on disk */
-               MyXactMadeTempRelUpdate = true;
-       }
 
        END_CRIT_SECTION();
 
@@ -1183,7 +1195,7 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid)
 Oid
 simple_heap_insert(Relation relation, HeapTuple tup)
 {
-       return heap_insert(relation, tup, GetCurrentCommandId());
+       return heap_insert(relation, tup, GetCurrentCommandId(), true, true);
 }
 
 /*
@@ -1743,7 +1755,7 @@ l2:
                {
                        /* Assume there's no chance to put newtup on same page. */
                        newbuf = RelationGetBufferForTuple(relation, newtup->t_len,
-                                                                                          buffer);
+                                                                                          buffer, true);
                }
                else
                {
@@ -1760,7 +1772,7 @@ l2:
                                 */
                                LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
                                newbuf = RelationGetBufferForTuple(relation, newtup->t_len,
-                                                                                                  buffer);
+                                                                                                  buffer, true);
                        }
                        else
                        {
index 583bb209336c47e5b3cee00a05abbdf156e9771f..fc1b0afd21e8b5fa57913d5acc22349a5357924a 100644 (file)
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/access/heap/hio.c,v 1.56 2005/05/07 21:32:23 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/access/heap/hio.c,v 1.57 2005/06/20 18:37:01 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -79,12 +79,26 @@ RelationPutHeapTuple(Relation relation,
  *     happen if space is freed in that page after heap_update finds there's not
  *     enough there).  In that case, the page will be pinned and locked only once.
  *
+ *     If use_fsm is true (the normal case), we use FSM to help us find free
+ *     space.  If use_fsm is false, we always append a new empty page to the
+ *     end of the relation if the tuple won't fit on the current target page.
+ *     This can save some cycles when we know the relation is new and doesn't
+ *     contain useful amounts of free space.
+ *
+ *     The use_fsm = false case is also useful for non-WAL-logged additions to a
+ *     relation, if the caller holds exclusive lock and is careful to invalidate
+ *     relation->rd_targblock before the first insertion --- that ensures that
+ *     all insertions will occur into newly added pages and not be intermixed
+ *     with tuples from other transactions.  That way, a crash can't risk losing
+ *     any committed data of other transactions.  (See heap_insert's comments
+ *     for additional constraints needed for safe usage of this behavior.)
+ *
  *     ereport(ERROR) is allowed here, so this routine *must* be called
  *     before any (unlogged) changes are made in buffer pool.
  */
 Buffer
 RelationGetBufferForTuple(Relation relation, Size len,
-                                                 Buffer otherBuffer)
+                                                 Buffer otherBuffer, bool use_fsm)
 {
        Buffer          buffer = InvalidBuffer;
        Page            pageHeader;
@@ -121,11 +135,14 @@ RelationGetBufferForTuple(Relation relation, Size len,
         * on each page that proves not to be suitable.)  If the FSM has no
         * record of a page with enough free space, we give up and extend the
         * relation.
+        *
+        * When use_fsm is false, we either put the tuple onto the existing
+        * target page or extend the relation.
         */
 
        targetBlock = relation->rd_targblock;
 
-       if (targetBlock == InvalidBlockNumber)
+       if (targetBlock == InvalidBlockNumber && use_fsm)
        {
                /*
                 * We have no cached target page, so ask the FSM for an initial
@@ -209,6 +226,10 @@ RelationGetBufferForTuple(Relation relation, Size len,
                        ReleaseBuffer(buffer);
                }
 
+               /* Without FSM, always fall out of the loop and extend */
+               if (!use_fsm)
+                       break;
+
                /*
                 * Update FSM as to condition of this page, and ask for another
                 * page to try.
index a390829bb8ef749dbb5882ce3b88ceb3d0fa202b..938474610ae80df3fcb45d2ac22118d6a2248481 100644 (file)
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/executor/execMain.c,v 1.249 2005/05/22 22:30:19 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/executor/execMain.c,v 1.250 2005/06/20 18:37:01 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
 #include "postgres.h"
 
 #include "access/heapam.h"
+#include "access/xlog.h"
 #include "catalog/heap.h"
 #include "catalog/namespace.h"
 #include "commands/tablecmds.h"
@@ -44,6 +45,7 @@
 #include "optimizer/clauses.h"
 #include "optimizer/var.h"
 #include "parser/parsetree.h"
+#include "storage/smgr.h"
 #include "utils/acl.h"
 #include "utils/guc.h"
 #include "utils/lsyscache.h"
@@ -784,6 +786,20 @@ InitPlan(QueryDesc *queryDesc, bool explainOnly)
                 * And open the constructed table for writing.
                 */
                intoRelationDesc = heap_open(intoRelationId, AccessExclusiveLock);
+
+               /* use_wal off requires rd_targblock be initially invalid */
+               Assert(intoRelationDesc->rd_targblock == InvalidBlockNumber);
+
+               /*
+                * We can skip WAL-logging the insertions, unless PITR is in use.
+                *
+                * Note that for a non-temp INTO table, this is safe only because
+                * we know that the catalog changes above will have been WAL-logged,
+                * and so RecordTransactionCommit will think it needs to WAL-log the
+                * eventual transaction commit.  Else the commit might be lost, even
+                * though all the data is safely fsync'd ...
+                */
+               estate->es_into_relation_use_wal = XLogArchivingActive();
        }
 
        estate->es_into_relation_descriptor = intoRelationDesc;
@@ -979,7 +995,22 @@ ExecEndPlan(PlanState *planstate, EState *estate)
         * close the "into" relation if necessary, again keeping lock
         */
        if (estate->es_into_relation_descriptor != NULL)
+       {
+               /*
+                * If we skipped using WAL, and it's not a temp relation,
+                * we must force the relation down to disk before it's
+                * safe to commit the transaction.  This requires forcing
+                * out any dirty buffers and then doing a forced fsync.
+                */
+               if (!estate->es_into_relation_use_wal &&
+                       !estate->es_into_relation_descriptor->rd_istemp)
+               {
+                       FlushRelationBuffers(estate->es_into_relation_descriptor);
+                       smgrimmedsync(estate->es_into_relation_descriptor->rd_smgr);
+               }
+
                heap_close(estate->es_into_relation_descriptor, NoLock);
+   }
 
        /*
         * close any relations selected FOR UPDATE/FOR SHARE, again keeping locks
@@ -1307,7 +1338,9 @@ ExecSelect(TupleTableSlot *slot,
 
                tuple = ExecCopySlotTuple(slot);
                heap_insert(estate->es_into_relation_descriptor, tuple,
-                                       estate->es_snapshot->curcid);
+                                       estate->es_snapshot->curcid,
+                                       estate->es_into_relation_use_wal,
+                                       false);         /* never any point in using FSM */
                /* we know there are no indexes to update */
                heap_freetuple(tuple);
                IncrAppended();
@@ -1386,7 +1419,8 @@ ExecInsert(TupleTableSlot *slot,
         * insert the tuple
         */
        newId = heap_insert(resultRelationDesc, tuple,
-                                               estate->es_snapshot->curcid);
+                                               estate->es_snapshot->curcid,
+                                               true, true);
 
        IncrAppended();
        (estate->es_processed)++;
@@ -2089,6 +2123,7 @@ EvalPlanQualStart(evalPlanQual *epq, EState *estate, evalPlanQual *priorepq)
        epqstate->es_result_relation_info = estate->es_result_relation_info;
        epqstate->es_junkFilter = estate->es_junkFilter;
        epqstate->es_into_relation_descriptor = estate->es_into_relation_descriptor;
+       epqstate->es_into_relation_use_wal = estate->es_into_relation_use_wal;
        epqstate->es_param_list_info = estate->es_param_list_info;
        if (estate->es_topPlan->nParamExec > 0)
                epqstate->es_param_exec_vals = (ParamExecData *)
index 133bf57bca23724dc5c1351c45dabbe8cf151b9a..8eaff494e3ea1d8e738aabc93adee353a376e1d9 100644 (file)
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/executor/execUtils.c,v 1.123 2005/04/28 21:47:12 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/executor/execUtils.c,v 1.124 2005/06/20 18:37:01 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -186,7 +186,9 @@ CreateExecutorState(void)
        estate->es_result_relation_info = NULL;
 
        estate->es_junkFilter = NULL;
+
        estate->es_into_relation_descriptor = NULL;
+       estate->es_into_relation_use_wal = false;
 
        estate->es_param_list_info = NULL;
        estate->es_param_exec_vals = NULL;
index 1c0cb7e240b9b61a65cfef399a9251da056bfd9f..fa7913aff74bd5db0e3559e008200a9abf1cd874 100644 (file)
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.115 2005/05/29 04:23:05 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.116 2005/06/20 18:37:01 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -660,6 +660,9 @@ mdtruncate(SMgrRelation reln, BlockNumber nblocks, bool isTemp)
 
 /*
  *     mdimmedsync() -- Immediately sync a relation to stable storage.
+ *
+ * Note that only writes already issued are synced; this routine knows
+ * nothing of dirty buffers that may exist inside the buffer manager.
  */
 bool
 mdimmedsync(SMgrRelation reln)
index 2c8cf07eec83993d063ad7fffb6b3ca90c3053ce..f286b20ee2533fdb4765504f0524ab2d652788a5 100644 (file)
@@ -11,7 +11,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/storage/smgr/smgr.c,v 1.90 2005/06/17 22:32:46 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/storage/smgr/smgr.c,v 1.91 2005/06/20 18:37:01 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -650,7 +650,8 @@ smgrtruncate(SMgrRelation reln, BlockNumber nblocks, bool isTemp)
 /*
  *     smgrimmedsync() -- Force the specified relation to stable storage.
  *
- *             Synchronously force all of the specified relation down to disk.
+ *             Synchronously force all previous writes to the specified relation
+ *             down to disk.
  *
  *             This is useful for building completely new relations (eg, new
  *             indexes).  Instead of incrementally WAL-logging the index build
@@ -664,6 +665,10 @@ smgrtruncate(SMgrRelation reln, BlockNumber nblocks, bool isTemp)
  *
  *             The preceding writes should specify isTemp = true to avoid
  *             duplicative fsyncs.
+ *
+ *             Note that you need to do FlushRelationBuffers() first if there is
+ *             any possibility that there are dirty buffers for the relation;
+ *             otherwise the sync is not very meaningful.
  */
 void
 smgrimmedsync(SMgrRelation reln)
index 151a62f9b68b02f9c67878a3c9f68b8561ba257e..dde6fe8ecd89e23e362d1cca54c278196038543f 100644 (file)
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/access/heapam.h,v 1.101 2005/06/06 17:01:24 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/access/heapam.h,v 1.102 2005/06/20 18:37:01 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -156,7 +156,8 @@ extern ItemPointer heap_get_latest_tid(Relation relation, Snapshot snapshot,
                                        ItemPointer tid);
 extern void setLastTid(const ItemPointer tid);
 
-extern Oid     heap_insert(Relation relation, HeapTuple tup, CommandId cid);
+extern Oid     heap_insert(Relation relation, HeapTuple tup, CommandId cid,
+                                               bool use_wal, bool use_fsm);
 extern HTSU_Result heap_delete(Relation relation, ItemPointer tid, ItemPointer ctid,
                        CommandId cid, Snapshot crosscheck, bool wait);
 extern HTSU_Result heap_update(Relation relation, ItemPointer otid, HeapTuple tup,
index 49091eb202fae1f16ecb1ed5be1dd1f766780a67..e706fea4ca136454753b67a6d72e3816074a0406 100644 (file)
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/access/hio.h,v 1.27 2004/12/31 22:03:21 pgsql Exp $
+ * $PostgreSQL: pgsql/src/include/access/hio.h,v 1.28 2005/06/20 18:37:01 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -19,6 +19,6 @@
 extern void RelationPutHeapTuple(Relation relation, Buffer buffer,
                                         HeapTuple tuple);
 extern Buffer RelationGetBufferForTuple(Relation relation, Size len,
-                                                 Buffer otherBuffer);
+                                                                               Buffer otherBuffer, bool use_fsm);
 
 #endif   /* HIO_H */
index 19f264119c3c0b3814ae4db2f238103f0304676a..df41c8561084b511a237a5ab134c443c438b3eb7 100644 (file)
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/nodes/execnodes.h,v 1.134 2005/06/15 07:27:44 neilc Exp $
+ * $PostgreSQL: pgsql/src/include/nodes/execnodes.h,v 1.135 2005/06/20 18:37:02 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -304,7 +304,9 @@ typedef struct EState
        ResultRelInfo *es_result_relation_info;         /* currently active array
                                                                                                 * elt */
        JunkFilter *es_junkFilter;      /* currently active junk filter */
+
        Relation        es_into_relation_descriptor;    /* for SELECT INTO */
+       bool            es_into_relation_use_wal;
 
        /* Parameter info: */
        ParamListInfo es_param_list_info;       /* values of external params */