granicus.if.org Git - postgresql/commitdiff
Do not decode TOAST data for table rewrites
author Tomas Vondra <tomas.vondra@postgresql.org>
Wed, 28 Nov 2018 00:11:15 +0000 (01:11 +0100)
committer Tomas Vondra <tomas.vondra@postgresql.org>
Wed, 28 Nov 2018 00:44:43 +0000 (01:44 +0100)
During table rewrites (VACUUM FULL and CLUSTER), the main heap is logged
using XLOG / FPI records, and thus (correctly) ignored in decoding.
But the associated TOAST table is WAL-logged as plain INSERT records,
and so was logically decoded and passed to reorder buffer.

That has severe consequences with TOAST tables of non-trivial size.
Firstly, reorder buffer has to keep all those changes, possibly spilling
them to a file, incurring I/O costs and disk space.

Secondly, ReorderBufferCommit() was stashing all those TOAST chunks into
a hash table, which got discarded only after processing the row from the
main heap.  But as the main heap is not decoded for rewrites, this never
happened, so all the TOAST data accumulated in memory, resulting either
in excessive memory consumption or OOM.

The fix is simple, as commit e9edc1ba already introduced infrastructure
(namely HEAP_INSERT_NO_LOGICAL flag) to skip logical decoding of TOAST
tables, but applied it only to system tables.  So simply use it for
all TOAST data in raw_heap_insert().

That would however solve only the memory consumption issue - the TOAST
changes would still be decoded and added to the reorder buffer, and
spilled to disk (although without TOAST tuple data, so much smaller).
But we can solve that by tweaking DecodeInsert() to just ignore such
INSERT records altogether, using XLH_INSERT_CONTAINS_NEW_TUPLE flag,
instead of skipping them later in ReorderBufferCommit().

Review: Masahiko Sawada
Discussion: https://www.postgresql.org/message-id/flat/1a17c643-e9af-3dba-486b-fbe31bc1823a%402ndquadrant.com
Backpatch: 9.4-, where logical decoding was introduced

src/backend/access/heap/rewriteheap.c
src/backend/replication/logical/decode.c
src/backend/replication/logical/reorderbuffer.c

index 0577c540447dcc0285cafb332556f08b6abb94e8..4eb5c5b180f69682b55eaaa28b529d79f7fabadd 100644 (file)
@@ -658,12 +658,11 @@ raw_heap_insert(RewriteState state, HeapTuple tup)
                        options |= HEAP_INSERT_SKIP_WAL;
 
                /*
-                * The new relfilenode's relcache entrye doesn't have the necessary
-                * information to determine whether a relation should emit data for
-                * logical decoding.  Force it to off if necessary.
+                * While rewriting the heap for VACUUM FULL / CLUSTER, make sure data
+                * for the TOAST table are not logically decoded.  The main heap is
+                * WAL-logged as XLOG FPI records, which are not logically decoded.
                 */
-               if (!RelationIsLogicallyLogged(state->rs_old_rel))
-                       options |= HEAP_INSERT_NO_LOGICAL;
+               options |= HEAP_INSERT_NO_LOGICAL;
 
                heaptup = toast_insert_or_update(state->rs_new_rel, tup, NULL,
                                                                                 options);
index b103c7c8ca23e3957b1acdaa005b7b51ada7115d..82ef8bbe1b6f112c7f9a56114e0be53a3a2d2c30 100644 (file)
@@ -577,6 +577,9 @@ DecodeAbort(LogicalDecodingContext *ctx, XLogRecordBuffer *buf,
 static void
 DecodeInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
 {
+       Size            datalen;
+       char       *tupledata;
+       Size            tuplelen;
        XLogReaderState *r = buf->record;
        xl_heap_insert *xlrec;
        ReorderBufferChange *change;
@@ -584,6 +587,13 @@ DecodeInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
 
        xlrec = (xl_heap_insert *) XLogRecGetData(r);
 
+       /*
+        * Ignore insert records without new tuples (this does happen when
+        * raw_heap_insert marks the TOAST record as HEAP_INSERT_NO_LOGICAL).
+        */
+       if (!(xlrec->flags & XLH_INSERT_CONTAINS_NEW_TUPLE))
+               return;
+
        /* only interested in our database */
        XLogRecGetBlockTag(r, 0, &target_node, NULL, NULL);
        if (target_node.dbNode != ctx->slot->data.database)
@@ -602,17 +612,13 @@ DecodeInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
 
        memcpy(&change->data.tp.relnode, &target_node, sizeof(RelFileNode));
 
-       if (xlrec->flags & XLH_INSERT_CONTAINS_NEW_TUPLE)
-       {
-               Size            datalen;
-               char       *tupledata = XLogRecGetBlockData(r, 0, &datalen);
-               Size            tuplelen = datalen - SizeOfHeapHeader;
+       tupledata = XLogRecGetBlockData(r, 0, &datalen);
+       tuplelen = datalen - SizeOfHeapHeader;
 
-               change->data.tp.newtuple =
-                       ReorderBufferGetTupleBuf(ctx->reorder, tuplelen);
+       change->data.tp.newtuple =
+               ReorderBufferGetTupleBuf(ctx->reorder, tuplelen);
 
-               DecodeXLogTuple(tupledata, datalen, change->data.tp.newtuple);
-       }
+       DecodeXLogTuple(tupledata, datalen, change->data.tp.newtuple);
 
        change->data.tp.clear_toast_afterwards = true;
 
index 44d9642fd86aae2c6f7d2df3570fbe2b9693fad0..9298b710c44d8a6608d55028426e1c8a6eac21d9 100644 (file)
@@ -1592,17 +1592,12 @@ ReorderBufferCommit(ReorderBuffer *rb, TransactionId xid,
                                                 * transaction's changes. Otherwise it will get
                                                 * freed/reused while restoring spooled data from
                                                 * disk.
-                                                *
-                                                * But skip doing so if there's no tuple-data. That
-                                                * happens if a non-mapped system catalog with a toast
-                                                * table is rewritten.
                                                 */
-                                               if (change->data.tp.newtuple != NULL)
-                                               {
-                                                       dlist_delete(&change->node);
-                                                       ReorderBufferToastAppendChunk(rb, txn, relation,
-                                                                                                                 change);
-                                               }
+                                               Assert(change->data.tp.newtuple != NULL);
+
+                                               dlist_delete(&change->node);
+                                               ReorderBufferToastAppendChunk(rb, txn, relation,
+                                                                                                         change);
                                        }
 
                        change_done: