granicus.if.org Git - postgresql/commitdiff
Accelerate end-of-transaction dropping of relations
author Alvaro Herrera <alvherre@alvh.no-ip.org>
Thu, 17 Jan 2013 18:55:10 +0000 (15:55 -0300)
committer Alvaro Herrera <alvherre@alvh.no-ip.org>
Thu, 17 Jan 2013 19:13:17 +0000 (16:13 -0300)
When relations are dropped, at end of transaction we need to remove the
files and clean the buffer pool of buffers containing pages of those
relations.  Previously we would scan the buffer pool once per relation
to clean up buffers.  When there are many relations to drop, the
repeated scans make this process slow; so we now instead pass a list of
relations to drop and scan the pool once, checking each buffer against
the passed list.  When the number of relations is larger than a
threshold (which as of this patch is being set to 20 relations) we sort
the array before starting, and bsearch the array; when it's smaller, we
simply scan the array linearly each time, because that's faster.  The
exact optimal threshold value depends on many factors, but the
difference is not likely to be significant enough to justify making it
user-settable.

This has been measured to be a significant win (a 15x win when dropping
100,000 relations; an extreme case, but reportedly a real one).

Author: Tomas Vondra, some tweaks by me
Reviewed by: Robert Haas, Shigeru Hanada, Andres Freund, Álvaro Herrera

src/backend/catalog/storage.c
src/backend/storage/buffer/bufmgr.c
src/backend/storage/smgr/smgr.c
src/include/storage/bufmgr.h
src/include/storage/smgr.h

index 95eda7af474bff0dc007b95feaddff2257e0518a..c43bebce851923a83326bc1c796cdb7ef3ec8b3c 100644 (file)
@@ -312,6 +312,10 @@ smgrDoPendingDeletes(bool isCommit)
        PendingRelDelete *pending;
        PendingRelDelete *prev;
        PendingRelDelete *next;
+       int                     nrels = 0,
+                               i = 0,
+                               maxrels = 8;
+       SMgrRelation *srels = palloc(maxrels * sizeof(SMgrRelation));
 
        prev = NULL;
        for (pending = pendingDeletes; pending != NULL; pending = next)
@@ -335,14 +339,32 @@ smgrDoPendingDeletes(bool isCommit)
                                SMgrRelation srel;
 
                                srel = smgropen(pending->relnode, pending->backend);
-                               smgrdounlink(srel, false);
-                               smgrclose(srel);
+
+                               /* extend the array if needed (double the size) */
+                               if (maxrels <= nrels)
+                               {
+                                       maxrels *= 2;
+                                       srels = repalloc(srels, sizeof(SMgrRelation) * maxrels);
+                               }
+
+                               srels[nrels++] = srel;
                        }
                        /* must explicitly free the list entry */
                        pfree(pending);
                        /* prev does not change */
                }
        }
+
+       if (nrels > 0)
+       {
+               smgrdounlinkall(srels, nrels, false);
+
+               for (i = 0; i < nrels; i++)
+                       smgrclose(srels[i]);
+       }
+
+       pfree(srels);
+
 }
 
 /*
index 03ed41dc15262495bd636fc4791b96812a71e246..13b80aefc5bfade510b546d38928f4f2ea64d2a5 100644 (file)
@@ -62,6 +62,7 @@
 #define BUF_WRITTEN                            0x01
 #define BUF_REUSABLE                   0x02
 
+#define DROP_RELS_BSEARCH_THRESHOLD            20
 
 /* GUC variables */
 bool           zero_damaged_pages = false;
@@ -107,6 +108,7 @@ static volatile BufferDesc *BufferAlloc(SMgrRelation smgr,
                        bool *foundPtr);
 static void FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln);
 static void AtProcExit_Buffers(int code, Datum arg);
+static int rnode_comparator(const void *p1, const void *p2);
 
 
 /*
@@ -2086,43 +2088,103 @@ DropRelFileNodeBuffers(RelFileNodeBackend rnode, ForkNumber forkNum,
 }
 
 /* ---------------------------------------------------------------------
- *             DropRelFileNodeAllBuffers
+ *             DropRelFileNodesAllBuffers
  *
  *             This function removes from the buffer pool all the pages of all
- *             forks of the specified relation.  It's equivalent to calling
- *             DropRelFileNodeBuffers once per fork with firstDelBlock = 0.
+ *             forks of the specified relations.  It's equivalent to calling
+ *             DropRelFileNodeBuffers once per fork per relation with
+ *             firstDelBlock = 0.
  * --------------------------------------------------------------------
  */
 void
-DropRelFileNodeAllBuffers(RelFileNodeBackend rnode)
+DropRelFileNodesAllBuffers(RelFileNodeBackend *rnodes, int nnodes)
 {
-       int                     i;
+       int         i,
+                               n = 0;
+       RelFileNode *nodes;
+       bool            use_bsearch;
+
+       if (nnodes == 0)
+               return;
+
+       nodes = palloc(sizeof(RelFileNode) * nnodes); /* non-local relations */
 
        /* If it's a local relation, it's localbuf.c's problem. */
-       if (RelFileNodeBackendIsTemp(rnode))
+       for (i = 0; i < nnodes; i++)
        {
-               if (rnode.backend == MyBackendId)
-                       DropRelFileNodeAllLocalBuffers(rnode.node);
+               if (RelFileNodeBackendIsTemp(rnodes[i]))
+               {
+                       if (rnodes[i].backend == MyBackendId)
+                               DropRelFileNodeAllLocalBuffers(rnodes[i].node);
+               }
+               else
+                       nodes[n++] = rnodes[i].node;
+       }
+
+       /*
+        * If there are no non-local relations, then we're done. Release the memory
+        * and return.
+        */
+       if (n == 0)
+       {
+               pfree(nodes);
                return;
        }
 
+       /*
+        * For a low number of relations to drop, just use a simple walk through
+        * to save the bsearch overhead. The threshold to use is a guess rather
+        * than an exactly determined value, as it depends on many factors (CPU
+        * and RAM speeds, amount of shared buffers etc.).
+        */
+       use_bsearch = n > DROP_RELS_BSEARCH_THRESHOLD;
+
+       /* sort the list of rnodes if necessary */
+       if (use_bsearch)
+               pg_qsort(nodes, n, sizeof(RelFileNode), rnode_comparator);
+
        for (i = 0; i < NBuffers; i++)
        {
+               RelFileNode *rnode = NULL;
                volatile BufferDesc *bufHdr = &BufferDescriptors[i];
 
                /*
                 * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
                 * and saves some cycles.
                 */
-               if (!RelFileNodeEquals(bufHdr->tag.rnode, rnode.node))
+
+               if (!use_bsearch)
+               {
+                       int             j;
+
+                       for (j = 0; j < n; j++)
+                       {
+                               if (RelFileNodeEquals(bufHdr->tag.rnode, nodes[j]))
+                               {
+                                       rnode = &nodes[j];
+                                       break;
+                               }
+                       }
+               }
+               else
+               {
+                       rnode = bsearch((const void *) &(bufHdr->tag.rnode),
+                                                       nodes, n, sizeof(RelFileNode),
+                                                       rnode_comparator);
+               }
+
+               /* buffer doesn't belong to any of the given relfilenodes; skip it */
+               if (rnode == NULL)
                        continue;
 
                LockBufHdr(bufHdr);
-               if (RelFileNodeEquals(bufHdr->tag.rnode, rnode.node))
+               if (RelFileNodeEquals(bufHdr->tag.rnode, (*rnode)))
                        InvalidateBuffer(bufHdr);       /* releases spinlock */
                else
                        UnlockBufHdr(bufHdr);
        }
+
+       pfree(nodes);
 }
 
 /* ---------------------------------------------------------------------
@@ -2953,3 +3015,30 @@ local_buffer_write_error_callback(void *arg)
                pfree(path);
        }
 }
+
+/*
+ * RelFileNode qsort/bsearch comparator; see RelFileNodeEquals.
+ *
+ * Both arguments must point to RelFileNode objects.  The ordering compares
+ * relNode first, then dbNode, then spcNode, and the result is -1, 0 or 1
+ * following the usual qsort()/bsearch() convention.  Zero is returned only
+ * when all three fields are equal.
+ */
+static int
+rnode_comparator(const void *p1, const void *p2)
+{
+       /* Keep the const qualifier and compare in place; no struct copies. */
+       const RelFileNode *n1 = (const RelFileNode *) p1;
+       const RelFileNode *n2 = (const RelFileNode *) p2;
+
+       if (n1->relNode < n2->relNode)
+               return -1;
+       else if (n1->relNode > n2->relNode)
+               return 1;
+
+       if (n1->dbNode < n2->dbNode)
+               return -1;
+       else if (n1->dbNode > n2->dbNode)
+               return 1;
+
+       if (n1->spcNode < n2->spcNode)
+               return -1;
+       else if (n1->spcNode > n2->spcNode)
+               return 1;
+       else
+               return 0;
+}
index 925238cd89204ac6ffd0a7f4bd98337296c945b7..3aa6325481f66d07e5c48e0370b45fb156fe6d3c 100644 (file)
@@ -390,7 +390,7 @@ smgrdounlink(SMgrRelation reln, bool isRedo)
         * Get rid of any remaining buffers for the relation.  bufmgr will just
         * drop them without bothering to write the contents.
         */
-       DropRelFileNodeAllBuffers(rnode);
+       DropRelFileNodesAllBuffers(&rnode, 1);
 
        /*
         * It'd be nice to tell the stats collector to forget it immediately, too.
@@ -419,6 +419,86 @@ smgrdounlink(SMgrRelation reln, bool isRedo)
        (*(smgrsw[which].smgr_unlink)) (rnode, InvalidForkNumber, isRedo);
 }
 
+/*
+ *     smgrdounlinkall() -- Immediately unlink all forks of all given relations
+ *
+ *             All forks of all given relations are removed from the store.  This
+ *             should not be used during transactional operations, since it can't be
+ *             undone.
+ *
+ *             If isRedo is true, it is okay for the underlying file(s) to be gone
+ *             already.
+ *
+ *             This is equivalent to calling smgrdounlink for each relation, but it's
+ *             significantly quicker so should be preferred when possible.
+ *
+ *             rels is an array of nrels SMgrRelation entries; when nrels is zero
+ *             the function returns without doing anything.  isRedo is passed
+ *             through unchanged to each smgr_unlink call.  The SMgrRelation
+ *             entries themselves are not released here: only the per-fork
+ *             smgr_close hook is invoked on them.
+ */
+void
+smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo)
+{
+       int             i = 0;
+       RelFileNodeBackend *rnodes;
+       ForkNumber  forknum;
+
+       /* nothing to drop, so skip the allocation and all later steps */
+       if (nrels == 0)
+               return;
+
+       /*
+        * create an array which contains all relations to be dropped, and
+        * close each relation's forks at the smgr level while at it
+        */
+       rnodes = palloc(sizeof(RelFileNodeBackend) * nrels);
+       for (i = 0; i < nrels; i++)
+       {
+               RelFileNodeBackend rnode = rels[i]->smgr_rnode;
+               int                     which = rels[i]->smgr_which;
+
+               /* remember the rnode; the later steps work from this array */
+               rnodes[i] = rnode;
+
+               /* Close the forks at smgr level */
+               for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
+                       (*(smgrsw[which].smgr_close)) (rels[i], forknum);
+       }
+
+       /*
+        * Get rid of any remaining buffers for the relations.  bufmgr will just
+        * drop them without bothering to write the contents.
+        */
+       DropRelFileNodesAllBuffers(rnodes, nrels);
+
+       /*
+        * It'd be nice to tell the stats collector to forget them immediately, too.
+        * But we can't because we don't know the OIDs.
+        */
+
+       /*
+        * Send a shared-inval message to force other backends to close any
+        * dangling smgr references they may have for these rels.  We should do
+        * this before starting the actual unlinking, in case we fail partway
+        * through that step.  Note that the sinval messages will eventually come
+        * back to this backend, too, and thereby provide a backstop that we closed
+        * our own smgr rel.
+        */
+       for (i = 0; i < nrels; i++)
+               CacheInvalidateSmgr(rnodes[i]);
+
+       /*
+        * Delete the physical file(s).
+        *
+        * Note: smgr_unlink must treat deletion failure as a WARNING, not an
+        * ERROR, because we've already decided to commit or abort the current
+        * xact.
+        */
+
+       for (i = 0; i < nrels; i++)
+       {
+               /* each relation is unlinked through its own storage manager */
+               int     which = rels[i]->smgr_which;
+               for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
+                       (*(smgrsw[which].smgr_unlink)) (rnodes[i], forknum, isRedo);
+       }
+
+       /* release the temporary rnode array allocated above */
+       pfree(rnodes);
+}
+
 /*
  *     smgrdounlinkfork() -- Immediately unlink one fork of a relation.
  *
index d34034bcb79beb0c5ad988a2ab0a394b80ea778b..2ad536b745fb74e2189554ab8211e556ae223f75 100644 (file)
@@ -188,7 +188,7 @@ extern void FlushRelationBuffers(Relation rel);
 extern void FlushDatabaseBuffers(Oid dbid);
 extern void DropRelFileNodeBuffers(RelFileNodeBackend rnode,
                                           ForkNumber forkNum, BlockNumber firstDelBlock);
-extern void DropRelFileNodeAllBuffers(RelFileNodeBackend rnode);
+extern void DropRelFileNodesAllBuffers(RelFileNodeBackend *rnodes, int nnodes);
 extern void DropDatabaseBuffers(Oid dbid);
 
 #define RelationGetNumberOfBlocks(reln) \
index 4547a0f5180751236a0f8357accb68021f8dfaad..98b6f13137422a4106a2e38940fc5cb6712f36b7 100644 (file)
@@ -85,6 +85,7 @@ extern void smgrcloseall(void);
 extern void smgrclosenode(RelFileNodeBackend rnode);
 extern void smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo);
 extern void smgrdounlink(SMgrRelation reln, bool isRedo);
+extern void smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo);
 extern void smgrdounlinkfork(SMgrRelation reln, ForkNumber forknum, bool isRedo);
 extern void smgrextend(SMgrRelation reln, ForkNumber forknum,
                   BlockNumber blocknum, char *buffer, bool skipFsync);