]> granicus.if.org Git - postgresql/commitdiff
Restrict pgbench's zipfian parameter to ensure good performance.
authorTom Lane <tgl@sss.pgh.pa.us>
Mon, 1 Apr 2019 21:37:26 +0000 (17:37 -0400)
committerTom Lane <tgl@sss.pgh.pa.us>
Mon, 1 Apr 2019 21:37:34 +0000 (17:37 -0400)
Remove the code that supported zipfian distribution parameters less
than 1.0, as it had undocumented performance hazards, and it's not
clear that the case is useful enough to justify either fixing or
documenting those hazards.

Also, since the code path for parameter > 1.0 could perform badly
for values very close to 1.0, establish a minimum allowed value
of 1.001.  This solution seems superior to the previous vague
documentation warning about small values not performing well.

Fabien Coelho, per a gripe from Tomas Vondra

Discussion: https://postgr.es/m/b5e172e9-ad22-48a3-86a3-589afa20e8f7@2ndquadrant.com

doc/src/sgml/ref/pgbench.sgml
src/bin/pgbench/pgbench.c
src/bin/pgbench/t/001_pgbench_with_server.pl

index f11d36620d65a92538812640ecd739bf9e07b1f6..ee2501be552ec543b19d6178baef763fef265131 100644 (file)
@@ -1543,29 +1543,17 @@ f(x) = PHI(2.0 * parameter * (x - mu) / (max - min + 1)) /
       middle quarter (1.0 / 4.0) of the interval (i.e. from
       <literal>3.0 / 8.0</literal> to <literal>5.0 / 8.0</literal>) and 95% from
       the middle half (<literal>2.0 / 4.0</literal>) of the interval (second and third
-      quartiles). The minimum <replaceable>parameter</replaceable> is 2.0 for performance
-      of the Box-Muller transform.
+      quartiles). The minimum allowed <replaceable>parameter</replaceable>
+      value is 2.0.
      </para>
     </listitem>
     <listitem>
      <para>
-      <literal>random_zipfian</literal> generates an approximated bounded Zipfian
-      distribution. For <replaceable>parameter</replaceable> in (0, 1), an
-      approximated algorithm is taken from
-      "Quickly Generating Billion-Record Synthetic Databases",
-      Jim Gray et al, SIGMOD 1994. For <replaceable>parameter</replaceable>
-      in (1, 1000), a rejection method is used, based on
-      "Non-Uniform Random Variate Generation", Luc Devroye, p. 550-551,
-      Springer 1986. The distribution is not defined when the parameter's
-      value is 1.0. The function's performance is poor for parameter values
-      close and above 1.0 and on a small range.
-     </para>
-     <para>
+      <literal>random_zipfian</literal> generates a bounded Zipfian
+      distribution.
       <replaceable>parameter</replaceable> defines how skewed the distribution
       is. The larger the <replaceable>parameter</replaceable>, the more
       frequently values closer to the beginning of the interval are drawn.
-      The closer to 0 <replaceable>parameter</replaceable> is,
-      the flatter (more uniform) the output distribution.
       The distribution is such that, assuming the range starts from 1,
       the ratio of the probability of drawing <replaceable>k</replaceable>
       versus drawing <replaceable>k+1</replaceable> is
@@ -1576,6 +1564,13 @@ f(x) = PHI(2.0 * parameter * (x - mu) / (max - min + 1)) /
       itself is produced <literal>(3/2)*2.5 = 2.76</literal> times more
       frequently than <literal>3</literal>, and so on.
      </para>
+     <para>
+      <application>pgbench</application>'s implementation is based on
+      "Non-Uniform Random Variate Generation", Luc Devroye, p. 550-551,
+      Springer 1986.  Due to limitations of that algorithm,
+      the <replaceable>parameter</replaceable> value is restricted to
+      the range [1.001, 1000].
+     </para>
     </listitem>
    </itemizedlist>
 
index 93e2d350e733e3125f2bdb1723ef7f6576a4a3c6..e5078af79610b62369eb574fe05d354c467a12b4 100644 (file)
@@ -135,10 +135,10 @@ static int        pthread_join(pthread_t th, void **thread_return);
 #define LOG_STEP_SECONDS       5       /* seconds between log messages */
 #define DEFAULT_NXACTS 10              /* default nxacts */
 
-#define ZIPF_CACHE_SIZE        15              /* cache cells number */
-
 #define MIN_GAUSSIAN_PARAM             2.0 /* minimum parameter for gauss */
-#define MAX_ZIPFIAN_PARAM              1000    /* maximum parameter for zipfian */
+
+#define MIN_ZIPFIAN_PARAM              1.001   /* minimum parameter for zipfian */
+#define MAX_ZIPFIAN_PARAM              1000.0  /* maximum parameter for zipfian */
 
 int                    nxacts = 0;                     /* number of transactions per client */
 int                    duration = 0;           /* duration in seconds */
@@ -410,35 +410,6 @@ typedef struct
        int                     ecnt;                   /* error count */
 } CState;
 
-/*
- * Cache cell for random_zipfian call
- */
-typedef struct
-{
-       /* cell keys */
-       double          s;                              /* s - parameter of random_zipfian function */
-       int64           n;                              /* number of elements in range (max - min + 1) */
-
-       double          harmonicn;              /* generalizedHarmonicNumber(n, s) */
-       double          alpha;
-       double          beta;
-       double          eta;
-
-       uint64          last_used;              /* last used logical time */
-} ZipfCell;
-
-/*
- * Zipf cache for zeta values
- */
-typedef struct
-{
-       uint64          current;                /* counter for LRU cache replacement algorithm */
-
-       int                     nb_cells;               /* number of filled cells */
-       int                     overflowCount;  /* number of cache overflows */
-       ZipfCell        cells[ZIPF_CACHE_SIZE];
-} ZipfCache;
-
 /*
  * Thread state
  */
@@ -460,8 +431,6 @@ typedef struct
 
        int64           throttle_trigger;       /* previous/next throttling (us) */
        FILE       *logfile;            /* where to log, or NULL */
-       ZipfCache       zipf_cache;             /* for thread-safe  zipfian random number
-                                                                * generation */
 
        /* per thread collected stats */
        instr_time      start_time;             /* thread start time */
@@ -975,77 +944,12 @@ getPoissonRand(RandomState *random_state, double center)
        return (int64) (-log(uniform) * center + 0.5);
 }
 
-/* helper function for getZipfianRand */
-static double
-generalizedHarmonicNumber(int64 n, double s)
-{
-       int                     i;
-       double          ans = 0.0;
-
-       for (i = n; i > 1; i--)
-               ans += pow(i, -s);
-       return ans + 1.0;
-}
-
-/* set harmonicn and other parameters to cache cell */
-static void
-zipfSetCacheCell(ZipfCell *cell, int64 n, double s)
-{
-       double          harmonic2;
-
-       cell->n = n;
-       cell->s = s;
-
-       harmonic2 = generalizedHarmonicNumber(2, s);
-       cell->harmonicn = generalizedHarmonicNumber(n, s);
-
-       cell->alpha = 1.0 / (1.0 - s);
-       cell->beta = pow(0.5, s);
-       cell->eta = (1.0 - pow(2.0 / n, 1.0 - s)) / (1.0 - harmonic2 / cell->harmonicn);
-}
-
-/*
- * search for cache cell with keys (n, s)
- * and create new cell if it does not exist
- */
-static ZipfCell *
-zipfFindOrCreateCacheCell(ZipfCache *cache, int64 n, double s)
-{
-       int                     i,
-                               least_recently_used = 0;
-       ZipfCell   *cell;
-
-       /* search cached cell for given parameters */
-       for (i = 0; i < cache->nb_cells; i++)
-       {
-               cell = &cache->cells[i];
-               if (cell->n == n && cell->s == s)
-                       return &cache->cells[i];
-
-               if (cell->last_used < cache->cells[least_recently_used].last_used)
-                       least_recently_used = i;
-       }
-
-       /* create new one if it does not exist */
-       if (cache->nb_cells < ZIPF_CACHE_SIZE)
-               i = cache->nb_cells++;
-       else
-       {
-               /* replace LRU cell if cache is full */
-               i = least_recently_used;
-               cache->overflowCount++;
-       }
-
-       zipfSetCacheCell(&cache->cells[i], n, s);
-
-       cache->cells[i].last_used = cache->current++;
-       return &cache->cells[i];
-}
-
 /*
  * Computing zipfian using rejection method, based on
  * "Non-Uniform Random Variate Generation",
  * Luc Devroye, p. 550-551, Springer 1986.
+ *
+ * This works for s > 1.0, but may perform badly for s very close to 1.0.
  */
 static int64
 computeIterativeZipfian(RandomState *random_state, int64 n, double s)
@@ -1056,6 +960,10 @@ computeIterativeZipfian(RandomState *random_state, int64 n, double s)
                                u,
                                v;
 
+       /* Ensure n is sane */
+       if (n <= 1)
+               return 1;
+
        while (true)
        {
                /* random variates */
@@ -1072,39 +980,16 @@ computeIterativeZipfian(RandomState *random_state, int64 n, double s)
        return (int64) x;
 }
 
-/*
- * Computing zipfian using harmonic numbers, based on algorithm described in
- * "Quickly Generating Billion-Record Synthetic Databases",
- * Jim Gray et al, SIGMOD 1994
- */
-static int64
-computeHarmonicZipfian(ZipfCache *zipf_cache, RandomState *random_state,
-                                          int64 n, double s)
-{
-       ZipfCell   *cell = zipfFindOrCreateCacheCell(zipf_cache, n, s);
-       double          uniform = pg_erand48(random_state->xseed);
-       double          uz = uniform * cell->harmonicn;
-
-       if (uz < 1.0)
-               return 1;
-       if (uz < 1.0 + cell->beta)
-               return 2;
-       return 1 + (int64) (cell->n * pow(cell->eta * uniform - cell->eta + 1.0, cell->alpha));
-}
-
 /* random number generator: zipfian distribution from min to max inclusive */
 static int64
-getZipfianRand(ZipfCache *zipf_cache, RandomState *random_state, int64 min,
-                          int64 max, double s)
+getZipfianRand(RandomState *random_state, int64 min, int64 max, double s)
 {
        int64           n = max - min + 1;
 
        /* abort if parameter is invalid */
-       Assert(s > 0.0 && s != 1.0 && s <= MAX_ZIPFIAN_PARAM);
+       Assert(MIN_ZIPFIAN_PARAM <= s && s <= MAX_ZIPFIAN_PARAM);
 
-       return min - 1 + ((s > 1)
-                                         ? computeIterativeZipfian(random_state, n, s)
-                                         : computeHarmonicZipfian(zipf_cache, random_state, n, s));
+       return min - 1 + computeIterativeZipfian(random_state, n, s);
 }
 
 /*
@@ -2426,17 +2311,17 @@ evalStandardFunc(TState *thread, CState *st,
                                        }
                                        else if (func == PGBENCH_RANDOM_ZIPFIAN)
                                        {
-                                               if (param <= 0.0 || param == 1.0 || param > MAX_ZIPFIAN_PARAM)
+                                               if (param < MIN_ZIPFIAN_PARAM || param > MAX_ZIPFIAN_PARAM)
                                                {
                                                        fprintf(stderr,
-                                                                       "zipfian parameter must be in range (0, 1) U (1, %d]"
-                                                                       " (got %f)\n", MAX_ZIPFIAN_PARAM, param);
+                                                                       "zipfian parameter must be in range [%.3f, %.0f]"
+                                                                       " (not %f)\n",
+                                                                       MIN_ZIPFIAN_PARAM, MAX_ZIPFIAN_PARAM, param);
                                                        return false;
                                                }
+
                                                setIntValue(retval,
-                                                                       getZipfianRand(&thread->zipf_cache,
-                                                                                                  &st->cs_func_rs,
-                                                                                                  imin, imax, param));
+                                                                       getZipfianRand(&st->cs_func_rs, imin, imax, param));
                                        }
                                        else            /* exponential */
                                        {
@@ -2444,7 +2329,7 @@ evalStandardFunc(TState *thread, CState *st,
                                                {
                                                        fprintf(stderr,
                                                                        "exponential parameter must be greater than zero"
-                                                                       " (got %f)\n", param);
+                                                                       " (not %f)\n", param);
                                                        return false;
                                                }
 
@@ -4996,8 +4881,6 @@ printResults(TState *threads, StatsData *total, instr_time total_time,
                                tps_include,
                                tps_exclude;
        int64           ntx = total->cnt - total->skipped;
-       int                     i,
-                               totalCacheOverflows = 0;
 
        time_include = INSTR_TIME_GET_DOUBLE(total_time);
 
@@ -5025,15 +4908,6 @@ printResults(TState *threads, StatsData *total, instr_time total_time,
                printf("number of transactions actually processed: " INT64_FORMAT "\n",
                           ntx);
        }
-       /* Report zipfian cache overflow */
-       for (i = 0; i < nthreads; i++)
-       {
-               totalCacheOverflows += threads[i].zipf_cache.overflowCount;
-       }
-       if (totalCacheOverflows > 0)
-       {
-               printf("zipfian cache array overflowed %d time(s)\n", totalCacheOverflows);
-       }
 
        /* Remaining stats are nonsensical if we failed to execute any xacts */
        if (total->cnt <= 0)
@@ -5916,9 +5790,6 @@ main(int argc, char **argv)
                initRandomState(&thread->ts_sample_rs);
                thread->logfile = NULL; /* filled in later */
                thread->latency_late = 0;
-               thread->zipf_cache.nb_cells = 0;
-               thread->zipf_cache.current = 0;
-               thread->zipf_cache.overflowCount = 0;
                initStats(&thread->stats, 0);
 
                nclients_dealt += thread->nstate;
index ad3a7a79f8773d690f9ee93bbec65857ff613ddf..62906d5e55d8309f95667cfbc2624d1848799cbd 100644 (file)
@@ -666,13 +666,13 @@ SELECT LEAST(}.join(', ', (':i') x 256).q{)}
        [
                'set zipfian param to 1',
                2,
-               [qr{zipfian parameter must be in range \(0, 1\) U \(1, \d+\]}],
+               [qr{zipfian parameter must be in range \[1\.001, 1000\]}],
                q{\set i random_zipfian(0, 10, 1)}
        ],
        [
                'set zipfian param too large',
                2,
-               [qr{zipfian parameter must be in range \(0, 1\) U \(1, \d+\]}],
+               [qr{zipfian parameter must be in range \[1\.001, 1000\]}],
                q{\set i random_zipfian(0, 10, 1000000)}
        ],
        [
@@ -802,33 +802,6 @@ for my $e (@errors)
                { $n => $script });
 }
 
-# zipfian cache array overflow
-pgbench(
-       '-t 1', 0,
-       [ qr{processed: 1/1}, qr{zipfian cache array overflowed 1 time\(s\)} ],
-       [qr{^}],
-       'pgbench zipfian array overflow on random_zipfian',
-       {
-               '001_pgbench_random_zipfian' => q{
-\set i random_zipfian(1, 100, 0.5)
-\set i random_zipfian(2, 100, 0.5)
-\set i random_zipfian(3, 100, 0.5)
-\set i random_zipfian(4, 100, 0.5)
-\set i random_zipfian(5, 100, 0.5)
-\set i random_zipfian(6, 100, 0.5)
-\set i random_zipfian(7, 100, 0.5)
-\set i random_zipfian(8, 100, 0.5)
-\set i random_zipfian(9, 100, 0.5)
-\set i random_zipfian(10, 100, 0.5)
-\set i random_zipfian(11, 100, 0.5)
-\set i random_zipfian(12, 100, 0.5)
-\set i random_zipfian(13, 100, 0.5)
-\set i random_zipfian(14, 100, 0.5)
-\set i random_zipfian(15, 100, 0.5)
-\set i random_zipfian(16, 100, 0.5)
-}
-       });
-
 # throttling
 pgbench(
        '-t 100 -S --rate=100000 --latency-limit=1000000 -c 2 -n -r',