granicus.if.org Git - postgresql/commitdiff
Fix efficiency problems in tuplestore_trim().
author Tom Lane <tgl@sss.pgh.pa.us>
Fri, 10 Dec 2010 16:33:38 +0000 (11:33 -0500)
committer Tom Lane <tgl@sss.pgh.pa.us>
Fri, 10 Dec 2010 16:33:38 +0000 (11:33 -0500)
The original coding in tuplestore_trim() was only meant to work efficiently
in cases where each trim call deleted most of the tuples in the store,
which, in fact, was the pattern of the original usage with a Material node
supporting mark/restore operations underneath a MergeJoin.  However,
WindowAgg now uses tuplestores and it has considerably less friendly
trimming behavior.  In particular it can attempt to trim one tuple at a
time off a large tuplestore.  tuplestore_trim() had O(N^2) runtime in this
situation because of repeatedly shifting its tuple pointer array.  Fix by
avoiding shifting the array until a reasonably large number of tuples have
been deleted.  This can waste some pointer space, but we do still reclaim
the tuples themselves, so the percentage wastage should be pretty small.

Per Jie Li's report of slow percent_rank() evaluation.  cume_dist() and
ntile() would certainly be affected as well, along with any other window
function that has a moving frame start and requires reading substantially
ahead of the current row.

Back-patch to 8.4, where window functions were introduced.  There's no
need to tweak it before that.

src/backend/utils/sort/tuplestore.c

index 9bbaba43771f495fdf24e9f2afd545b69a22ecbd..8c8139c897679892e0d4ad13e69ae8d814484206 100644 (file)
@@ -145,8 +145,15 @@ struct Tuplestorestate
        /*
         * This array holds pointers to tuples in memory if we are in state INMEM.
         * In states WRITEFILE and READFILE it's not used.
+        *
+        * When memtupdeleted > 0, the first memtupdeleted pointers are already
+        * released due to a tuplestore_trim() operation, but we haven't expended
+        * the effort to slide the remaining pointers down.  These unused pointers
+        * are set to NULL to catch any invalid accesses.  Note that memtupcount
+        * includes the deleted pointers.
         */
        void      **memtuples;          /* array of pointers to palloc'd tuples */
+       int                     memtupdeleted;  /* the first N slots are currently unused */
        int                     memtupcount;    /* number of tuples currently present */
        int                     memtupsize;             /* allocated length of memtuples array */
 
@@ -252,6 +259,7 @@ tuplestore_begin_common(int eflags, bool interXact, int maxKBytes)
        state->context = CurrentMemoryContext;
        state->resowner = CurrentResourceOwner;
 
+       state->memtupdeleted = 0;
        state->memtupcount = 0;
        state->memtupsize = 1024;       /* initial guess */
        state->memtuples = (void **) palloc(state->memtupsize * sizeof(void *));
@@ -401,7 +409,7 @@ tuplestore_clear(Tuplestorestate *state)
        state->myfile = NULL;
        if (state->memtuples)
        {
-               for (i = 0; i < state->memtupcount; i++)
+               for (i = state->memtupdeleted; i < state->memtupcount; i++)
                {
                        FREEMEM(state, GetMemoryChunkSpace(state->memtuples[i]));
                        pfree(state->memtuples[i]);
@@ -409,6 +417,7 @@ tuplestore_clear(Tuplestorestate *state)
        }
        state->status = TSS_INMEM;
        state->truncated = false;
+       state->memtupdeleted = 0;
        state->memtupcount = 0;
        readptr = state->readptrs;
        for (i = 0; i < state->readptrcount; readptr++, i++)
@@ -432,7 +441,7 @@ tuplestore_end(Tuplestorestate *state)
                BufFileClose(state->myfile);
        if (state->memtuples)
        {
-               for (i = 0; i < state->memtupcount; i++)
+               for (i = state->memtupdeleted; i < state->memtupcount; i++)
                        pfree(state->memtuples[i]);
                pfree(state->memtuples);
        }
@@ -774,14 +783,14 @@ tuplestore_gettuple(Tuplestorestate *state, bool forward,
                                }
                                else
                                {
-                                       if (readptr->current <= 0)
+                                       if (readptr->current <= state->memtupdeleted)
                                        {
                                                Assert(!state->truncated);
                                                return NULL;
                                        }
                                        readptr->current--; /* last returned tuple */
                                }
-                               if (readptr->current <= 0)
+                               if (readptr->current <= state->memtupdeleted)
                                {
                                        Assert(!state->truncated);
                                        return NULL;
@@ -969,7 +978,7 @@ dumptuples(Tuplestorestate *state)
 {
        int                     i;
 
-       for (i = 0;; i++)
+       for (i = state->memtupdeleted;; i++)
        {
                TSReadPointer *readptr = state->readptrs;
                int                     j;
@@ -984,6 +993,7 @@ dumptuples(Tuplestorestate *state)
                        break;
                WRITETUP(state, state->memtuples[i]);
        }
+       state->memtupdeleted = 0;
        state->memtupcount = 0;
 }
 
@@ -1153,24 +1163,36 @@ tuplestore_trim(Tuplestorestate *state)
        nremove = oldest - 1;
        if (nremove <= 0)
                return;                                 /* nothing to do */
+
+       Assert(nremove >= state->memtupdeleted);
        Assert(nremove <= state->memtupcount);
 
        /* Release no-longer-needed tuples */
-       for (i = 0; i < nremove; i++)
+       for (i = state->memtupdeleted; i < nremove; i++)
        {
                FREEMEM(state, GetMemoryChunkSpace(state->memtuples[i]));
                pfree(state->memtuples[i]);
+               state->memtuples[i] = NULL;
        }
+       state->memtupdeleted = nremove;
+
+       /* mark tuplestore as truncated (used for Assert crosschecks only) */
+       state->truncated = true;
+
+       /*
+        * If nremove is less than 1/8th memtupcount, just stop here, leaving the
+        * "deleted" slots as NULL.  This prevents us from expending O(N^2) time
+        * repeatedly memmove-ing a large pointer array.  The worst case space
+        * wastage is pretty small, since it's just pointers and not whole tuples.
+        */
+       if (nremove < state->memtupcount / 8)
+               return;
 
        /*
-        * Slide the array down and readjust pointers.  This may look pretty
-        * stupid, but we expect that there will usually not be very many
-        * tuple-pointers to move, so this isn't that expensive; and it keeps a
-        * lot of other logic simple.
+        * Slide the array down and readjust pointers.
         *
-        * In fact, in the current usage for merge joins, it's demonstrable that
-        * there will always be exactly one non-removed tuple; so optimize that
-        * case.
+        * In mergejoin's current usage, it's demonstrable that there will always
+        * be exactly one non-removed tuple; so optimize that case.
         */
        if (nremove + 1 == state->memtupcount)
                state->memtuples[0] = state->memtuples[nremove];
@@ -1178,15 +1200,13 @@ tuplestore_trim(Tuplestorestate *state)
                memmove(state->memtuples, state->memtuples + nremove,
                                (state->memtupcount - nremove) * sizeof(void *));
 
+       state->memtupdeleted = 0;
        state->memtupcount -= nremove;
        for (i = 0; i < state->readptrcount; i++)
        {
                if (!state->readptrs[i].eof_reached)
                        state->readptrs[i].current -= nremove;
        }
-
-       /* mark tuplestore as truncated (used for Assert crosschecks only) */
-       state->truncated = true;
 }
 
 /*