granicus.if.org Git - postgresql/commitdiff
Add --sampling-rate option to pgbench.
author Heikki Linnakangas <heikki.linnakangas@iki.fi>
Wed, 3 Oct 2012 12:37:42 +0000 (15:37 +0300)
committer Heikki Linnakangas <heikki.linnakangas@iki.fi>
Wed, 3 Oct 2012 12:37:42 +0000 (15:37 +0300)
This allows logging only some fraction of transactions, greatly reducing
the amount of log generated.

Tomas Vondra, reviewed by Robert Haas and Jeff Janes.

contrib/pgbench/pgbench.c
doc/src/sgml/pgbench.sgml

index a669cf406ba0b639cdb24e06aabeeea21f3262f2..c399d59d25b448a92c6b3df23c02d149c2ab5434 100644 (file)
@@ -129,6 +129,11 @@ int                        foreign_keys = 0;
  */
 int                    unlogged_tables = 0;
 
+/*
+ * log sampling rate (1.0 = log everything, 0.0 = option not given)
+ */
+double         sample_rate = 0.0;
+
 /*
  * tablespace selection
  */
@@ -370,6 +375,8 @@ usage(void)
                   "  -f FILENAME  read transaction script from FILENAME\n"
                   "  -j NUM       number of threads (default: 1)\n"
                   "  -l           write transaction times to log file\n"
+                  "  --sampling-rate NUM\n"
+                  "               fraction of transactions to log (e.g. 0.01 for 1%% sample)\n"
                   "  -M simple|extended|prepared\n"
                   "               protocol for submitting queries to server (default: simple)\n"
                   "  -n           do not run VACUUM before tests\n"
@@ -883,21 +890,30 @@ top:
                        instr_time      diff;
                        double          usec;
 
-                       INSTR_TIME_SET_CURRENT(now);
-                       diff = now;
-                       INSTR_TIME_SUBTRACT(diff, st->txn_begin);
-                       usec = (double) INSTR_TIME_GET_MICROSEC(diff);
+                       /*
+                        * write the log entry if this row belongs to the random sample,
+                        * or no sampling rate was given which means log everything.
+                        */
+                       if (sample_rate == 0.0 ||
+                               pg_erand48(thread->random_state) <= sample_rate)
+                       {
+
+                               INSTR_TIME_SET_CURRENT(now);
+                               diff = now;
+                               INSTR_TIME_SUBTRACT(diff, st->txn_begin);
+                               usec = (double) INSTR_TIME_GET_MICROSEC(diff);
 
 #ifndef WIN32
-                       /* This is more than we really ought to know about instr_time */
-                       fprintf(logfile, "%d %d %.0f %d %ld %ld\n",
-                                       st->id, st->cnt, usec, st->use_file,
-                                       (long) now.tv_sec, (long) now.tv_usec);
+                               /* This is more than we really ought to know about instr_time */
+                               fprintf(logfile, "%d %d %.0f %d %ld %ld\n",
+                                               st->id, st->cnt, usec, st->use_file,
+                                               (long) now.tv_sec, (long) now.tv_usec);
 #else
-                       /* On Windows, instr_time doesn't provide a timestamp anyway */
-                       fprintf(logfile, "%d %d %.0f %d 0 0\n",
-                                       st->id, st->cnt, usec, st->use_file);
+                               /* On Windows, instr_time doesn't provide a timestamp anyway */
+                               fprintf(logfile, "%d %d %.0f %d 0 0\n",
+                                               st->id, st->cnt, usec, st->use_file);
 #endif
+                       }
                }
 
                if (commands[st->state]->type == SQL_COMMAND)
@@ -1926,6 +1942,7 @@ main(int argc, char **argv)
                {"index-tablespace", required_argument, NULL, 3},
                {"tablespace", required_argument, NULL, 2},
                {"unlogged-tables", no_argument, &unlogged_tables, 1},
+               {"sampling-rate", required_argument, NULL, 4},
                {NULL, 0, NULL, 0}
        };
 
@@ -2131,6 +2148,14 @@ main(int argc, char **argv)
                        case 3:                         /* index-tablespace */
                                index_tablespace = optarg;
                                break;
+                       case 4:
+                               sample_rate = atof(optarg);
+                               if (sample_rate <= 0.0 || sample_rate > 1.0)
+                               {
+                                       fprintf(stderr, "invalid sampling rate: %f\n", sample_rate);
+                                       exit(1);
+                               }
+                               break;
                        default:
                                fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname);
                                exit(1);
@@ -2166,6 +2191,13 @@ main(int argc, char **argv)
                exit(1);
        }
 
+       /* --sampling-rate may be used only with -l */
+       if (sample_rate > 0.0 && !use_log)
+       {
+               fprintf(stderr, "log sampling rate is allowed only when logging transactions (-l) \n");
+               exit(1);
+       }
+
        /*
         * is_latencies only works with multiple threads in thread-based
         * implementations, not fork-based ones, because it supposes that the
index 437fceab2e482e9821e09be3e8e988e18c2289cd..91530abe5b0ab737ca944bed6712760d1d84aae3 100644 (file)
@@ -316,6 +316,24 @@ pgbench <optional> <replaceable>options</> </optional> <replaceable>dbname</>
       </listitem>
      </varlistentry>
 
+     <varlistentry>
+      <term><option>--sampling-rate</option> <replaceable>rate</></term>
+      <listitem>
+       <para>
+        Sampling rate, used when writing data into the log, to reduce the
+        amount of log generated. If this option is given, only the specified
+        fraction of transactions is logged (1.0 logs all transactions, 0.05
+        only 5%). It is only allowed together with <option>-l</option>.
+       </para>
+       <para>
+        Remember to take the sampling rate into account when processing the
+        log file. For example, when computing tps values, you need to scale
+        the numbers accordingly (e.g. with a 0.01 sample rate, the log contains
+        only 1/100 of the transactions, so multiply the computed tps by 100).
+       </para>
+      </listitem>
+     </varlistentry>
+
      <varlistentry>
       <term><option>-M</option> <replaceable>querymode</></term>
       <listitem>
@@ -750,6 +768,12 @@ END;
  0 201 2513 0 1175850569 608
  0 202 2038 0 1175850569 2663
 </screen></para>
+
+  <para>
+   When running a long test on hardware that can handle a lot of transactions,
+   the log files can become very large.  The <option>--sampling-rate</> option
+   can be used to log only a random sample of transactions.
+  </para>
  </refsect2>
 
  <refsect2>