]> granicus.if.org Git - postgresql/commitdiff
pgbench: Allow \setrandom to generate Gaussian/exponential distributions.
authorRobert Haas <rhaas@postgresql.org>
Wed, 30 Jul 2014 17:22:08 +0000 (13:22 -0400)
committerRobert Haas <rhaas@postgresql.org>
Wed, 30 Jul 2014 17:22:08 +0000 (13:22 -0400)
Mitsumasa KONDO and Fabien COELHO, with further wordsmithing by me.

contrib/pgbench/pgbench.c
doc/src/sgml/pgbench.sgml

index 4aa8a5031a09bda403447eb1ac76c1906cb64894..ad55c3cc030bd255746f54526cc37d4c0729e67e 100644 (file)
@@ -98,6 +98,8 @@ static int    pthread_join(pthread_t th, void **thread_return);
 #define LOG_STEP_SECONDS       5       /* seconds between log messages */
 #define DEFAULT_NXACTS 10              /* default nxacts */
 
+#define MIN_GAUSSIAN_THRESHOLD         2.0     /* minimum threshold for gauss */
+
 int                    nxacts = 0;                     /* number of transactions per client */
 int                    duration = 0;           /* duration in seconds */
 
@@ -471,6 +473,76 @@ getrand(TState *thread, int64 min, int64 max)
        return min + (int64) ((max - min + 1) * pg_erand48(thread->random_state));
 }
 
+/*
+ * random number generator: exponential distribution from min to max inclusive.
+ * the threshold is so that the density of probability for the last cut-off max
+ * value is exp(-threshold).
+ */
+static int64
+getExponentialRand(TState *thread, int64 min, int64 max, double threshold)
+{
+       double cut, uniform, rand;
+       Assert(threshold > 0.0);
+       cut = exp(-threshold);
+       /* erand in [0, 1), uniform in (0, 1] */
+       uniform = 1.0 - pg_erand48(thread->random_state);
+       /*
+        * inner expresion in (cut, 1] (if threshold > 0),
+        * rand in [0, 1)
+        */
+       Assert((1.0 - cut) != 0.0);
+       rand = - log(cut + (1.0 - cut) * uniform) / threshold;
+       /* return int64 random number within between min and max */
+       return min + (int64)((max - min + 1) * rand);
+}
+
+/* random number generator: gaussian distribution from min to max inclusive */
+static int64
+getGaussianRand(TState *thread, int64 min, int64 max, double threshold)
+{
+       double          stdev;
+       double          rand;
+
+       /*
+        * Get user specified random number from this loop, with
+        * -threshold < stdev <= threshold
+        *
+        * This loop is executed until the number is in the expected range.
+        *
+        * As the minimum threshold is 2.0, the probability of looping is low:
+        * sqrt(-2 ln(r)) <= 2 => r >= e^{-2} ~ 0.135, then when taking the average
+        * sinus multiplier as 2/pi, we have a 8.6% looping probability in the
+        * worst case. For a 5.0 threshold value, the looping probability
+        * is about e^{-5} * 2 / pi ~ 0.43%.
+        */
+       do
+       {
+               /*
+                * pg_erand48 generates [0,1), but for the basic version of the
+                * Box-Muller transform the two uniformly distributed random numbers
+                * are expected in (0, 1] (see http://en.wikipedia.org/wiki/Box_muller)
+                */
+               double rand1 = 1.0 - pg_erand48(thread->random_state);
+               double rand2 = 1.0 - pg_erand48(thread->random_state);
+
+               /* Box-Muller basic form transform */
+               double var_sqrt = sqrt(-2.0 * log(rand1));
+               stdev = var_sqrt * sin(2.0 * M_PI * rand2);
+
+               /*
+                * we may try with cos, but there may be a bias induced if the previous
+                * value fails the test. To be on the safe side, let us try over.
+                */
+       }
+       while (stdev < -threshold || stdev >= threshold);
+
+       /* stdev is in [-threshold, threshold), normalization to [0,1) */
+       rand = (stdev + threshold) / (threshold * 2.0);
+
+       /* return int64 random number within between min and max */
+       return min + (int64)((max - min + 1) * rand);
+}
+
 /* call PQexec() and exit() on failure */
 static void
 executeStatement(PGconn *con, const char *sql)
@@ -1319,6 +1391,7 @@ top:
                        char       *var;
                        int64           min,
                                                max;
+                       double          threshold = 0;
                        char            res[64];
 
                        if (*argv[2] == ':')
@@ -1364,11 +1437,11 @@ top:
                        }
 
                        /*
-                        * getrand() needs to be able to subtract max from min and add one
-                        * to the result without overflowing.  Since we know max > min, we
-                        * can detect overflow just by checking for a negative result. But
-                        * we must check both that the subtraction doesn't overflow, and
-                        * that adding one to the result doesn't overflow either.
+                        * Generate random number functions need to be able to subtract
+                        * max from min and add one to the result without overflowing.
+                        * Since we know max > min, we can detect overflow just by checking
+                        * for a negative result. But we must check both that the subtraction
+                        * doesn't overflow, and that adding one to the result doesn't overflow either.
                         */
                        if (max - min < 0 || (max - min) + 1 < 0)
                        {
@@ -1377,10 +1450,64 @@ top:
                                return true;
                        }
 
+                       if (argc == 4 || /* uniform without or with "uniform" keyword */
+                               (argc == 5 && pg_strcasecmp(argv[4], "uniform") == 0))
+                       {
+#ifdef DEBUG
+                               printf("min: " INT64_FORMAT " max: " INT64_FORMAT " random: " INT64_FORMAT "\n", min, max, getrand(thread, min, max));
+#endif
+                               snprintf(res, sizeof(res), INT64_FORMAT, getrand(thread, min, max));
+                       }
+                       else if (argc == 6 &&
+                                        ((pg_strcasecmp(argv[4], "gaussian") == 0) ||
+                                         (pg_strcasecmp(argv[4], "exponential") == 0)))
+                       {
+                               if (*argv[5] == ':')
+                               {
+                                       if ((var = getVariable(st, argv[5] + 1)) == NULL)
+                                       {
+                                               fprintf(stderr, "%s: invalid threshold number %s\n", argv[0], argv[5]);
+                                               st->ecnt++;
+                                               return true;
+                                       }
+                                       threshold = strtod(var, NULL);
+                               }
+                               else
+                                       threshold = strtod(argv[5], NULL);
+
+                               if (pg_strcasecmp(argv[4], "gaussian") == 0)
+                               {
+                                       if (threshold < MIN_GAUSSIAN_THRESHOLD)
+                                       {
+                                               fprintf(stderr, "%s: gaussian threshold must be at least %f\n,", argv[5], MIN_GAUSSIAN_THRESHOLD);
+                                               st->ecnt++;
+                                               return true;
+                                       }
+#ifdef DEBUG
+                                       printf("min: " INT64_FORMAT " max: " INT64_FORMAT " random: " INT64_FORMAT "\n", min, max, getGaussianRand(thread, min, max, threshold));
+#endif
+                                       snprintf(res, sizeof(res), INT64_FORMAT, getGaussianRand(thread, min, max, threshold));
+                               }
+                               else if (pg_strcasecmp(argv[4], "exponential") == 0)
+                               {
+                                       if (threshold <= 0.0)
+                                       {
+                                               fprintf(stderr, "%s: exponential threshold must be strictly positive\n,", argv[5]);
+                                               st->ecnt++;
+                                               return true;
+                                       }
 #ifdef DEBUG
-                       printf("min: " INT64_FORMAT " max: " INT64_FORMAT " random: " INT64_FORMAT "\n", min, max, getrand(thread, min, max));
+                                       printf("min: " INT64_FORMAT " max: " INT64_FORMAT " random: " INT64_FORMAT "\n", min, max, getExponentialRand(thread, min, max, threshold));
 #endif
-                       snprintf(res, sizeof(res), INT64_FORMAT, getrand(thread, min, max));
+                                       snprintf(res, sizeof(res), INT64_FORMAT, getExponentialRand(thread, min, max, threshold));
+                               }
+                       }
+                       else /* this means an error somewhere in the parsing phase... */
+                       {
+                               fprintf(stderr, "%s: unexpected arguments\n", argv[0]);
+                               st->ecnt++;
+                               return true;
+                       }
 
                        if (!putVariable(st, argv[0], argv[1], res))
                        {
@@ -1914,15 +2041,51 @@ process_commands(char *buf)
 
                if (pg_strcasecmp(my_commands->argv[0], "setrandom") == 0)
                {
+                       /* parsing:
+                        * \setrandom variable min max [uniform]
+                        * \setrandom variable min max (gaussian|exponential) threshold
+                        */
+
                        if (my_commands->argc < 4)
                        {
                                fprintf(stderr, "%s: missing argument\n", my_commands->argv[0]);
                                exit(1);
                        }
+                       /* argc >= 4 */
 
-                       for (j = 4; j < my_commands->argc; j++)
-                               fprintf(stderr, "%s: extra argument \"%s\" ignored\n",
-                                               my_commands->argv[0], my_commands->argv[j]);
+                       if (my_commands->argc == 4 || /* uniform without/with "uniform" keyword */
+                               (my_commands->argc == 5 &&
+                                pg_strcasecmp(my_commands->argv[4], "uniform") == 0))
+                       {
+                               /* nothing to do */
+                       }
+                       else if (/* argc >= 5 */
+                                        (pg_strcasecmp(my_commands->argv[4], "gaussian") == 0) ||
+                                        (pg_strcasecmp(my_commands->argv[4], "exponential") == 0))
+                       {
+                               if (my_commands->argc < 6)
+                               {
+                                       fprintf(stderr, "%s(%s): missing threshold argument\n", my_commands->argv[0], my_commands->argv[4]);
+                                       exit(1);
+                               }
+                               else if (my_commands->argc > 6)
+                               {
+                                       fprintf(stderr, "%s(%s): too many arguments (extra:",
+                                                       my_commands->argv[0], my_commands->argv[4]);
+                                       for (j = 6; j < my_commands->argc; j++)
+                                               fprintf(stderr, " %s", my_commands->argv[j]);
+                                       fprintf(stderr, ")\n");
+                                       exit(1);
+                               }
+                       }
+                       else /* cannot parse, unexpected arguments */
+                       {
+                               fprintf(stderr, "%s: unexpected arguments (bad:", my_commands->argv[0]);
+                               for (j = 4; j < my_commands->argc; j++)
+                                       fprintf(stderr, " %s", my_commands->argv[j]);
+                               fprintf(stderr, ")\n");
+                               exit(1);
+                       }
                }
                else if (pg_strcasecmp(my_commands->argv[0], "set") == 0)
                {
index f264c245ec0f1e301b17b0656f7ca8560d9c7ffd..b7d88f3000536ec7b0d1e323d5b20c93a201d073 100644 (file)
@@ -748,8 +748,8 @@ pgbench <optional> <replaceable>options</> </optional> <replaceable>dbname</>
 
    <varlistentry>
     <term>
-     <literal>\setrandom <replaceable>varname</> <replaceable>min</> <replaceable>max</></literal>
-    </term>
+     <literal>\setrandom <replaceable>varname</> <replaceable>min</> <replaceable>max</> [ uniform | [ { gaussian | exponential } <replaceable>threshold</> ] ]</literal>
+     </term>
 
     <listitem>
      <para>
@@ -760,10 +760,65 @@ pgbench <optional> <replaceable>options</> </optional> <replaceable>dbname</>
       having an integer value.
      </para>
 
+     <para>
+      By default, or when <literal>uniform</> is specified, all values in the
+      range are drawn with equal probability.  Specifiying <literal>gaussian</>
+      or  <literal>exponential</> options modifies this behavior; each
+      requires a mandatory threshold which determines the precise shape of the
+      distribution.
+     </para>
+
+     <para>
+      For a Gaussian distribution, the interval is mapped onto a standard
+      normal distribution (the classical bell-shaped Gaussian curve) truncated
+      at <literal>-threshold</> on the left and <literal>+threshold</>
+      on the right.
+      To be precise, if <literal>PHI(x)</> is the cumulative distribution
+      function of the standard normal distribution, with mean <literal>mu</>
+      defined as <literal>(max + min) / 2.0</>, then value <replaceable>i</>
+      between <replaceable>min</> and <replaceable>max</> inclusive is drawn
+      with probability:
+      <literal>
+        (PHI(2.0 * threshold * (i - min - mu + 0.5) / (max - min + 1)) -
+         PHI(2.0 * threshold * (i - min - mu - 0.5) / (max - min + 1))) /
+         (2.0 * PHI(threshold) - 1.0)
+      </>
+      Intuitively, the larger the <replaceable>threshold</>, the more
+      frequently values close to the middle of the interval are drawn, and the
+      less frequently values close to the <replaceable>min</> and
+      <replaceable>max</> bounds.
+      About 67% of values are drawn from the middle <literal>1.0 / threshold</>
+      and 95% in the middle <literal>2.0 / threshold</>; for instance, if
+      <replaceable>threshold</> is 4.0, 67% of values are drawn from the middle
+      quarter and 95% from the middle half of the interval.
+      The minimum <replaceable>threshold</> is 2.0 for performance of
+      the Box-Muller transform.
+     </para>
+
+     <para>
+      For an exponential distribution, the <replaceable>threshold</>
+      parameter controls the distribution by truncating a quickly-decreasing
+      exponential distribution at <replaceable>threshold</>, and then
+      projecting onto integers between the bounds.
+      To be precise, value <replaceable>i</> between <replaceable>min</> and
+      <replaceable>max</> inclusive is drawn with probability:
+      <literal>(exp(-threshold*(i-min)/(max+1-min)) -
+       exp(-threshold*(i+1-min)/(max+1-min))) / (1.0 - exp(-threshold))</>.
+      Intuitively, the larger the <replaceable>threshold</>, the more
+      frequently values close to <replaceable>min</> are accessed, and the
+      less frequently values close to <replaceable>max</> are accessed.
+      The closer to 0 the threshold, the flatter (more uniform) the access
+      distribution.
+      A crude approximation of the distribution is that the most frequent 1%
+      values in the range, close to <replaceable>min</>, are drawn
+      <replaceable>threshold</>%  of the time.
+      The <replaceable>threshold</> value must be strictly positive.
+     </para>
+
      <para>
       Example:
 <programlisting>
-\setrandom aid 1 :naccounts
+\setrandom aid 1 :naccounts gaussian 5.0
 </programlisting></para>
     </listitem>
    </varlistentry>