From e1be1df51facd3c25317c809fd72ecac8b93b4e7 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 3 Oct 2012 15:37:42 +0300 Subject: [PATCH] Add --sampling-rate option to pgbench. This allows logging only some fraction of transactions, greatly reducing the amount of log generated. Tomas Vondra, reviewed by Robert Haas and Jeff Janes. --- contrib/pgbench/pgbench.c | 54 +++++++++++++++++++++++++++++++-------- doc/src/sgml/pgbench.sgml | 24 +++++++++++++++++ 2 files changed, 67 insertions(+), 11 deletions(-) diff --git a/contrib/pgbench/pgbench.c b/contrib/pgbench/pgbench.c index a669cf406b..c399d59d25 100644 --- a/contrib/pgbench/pgbench.c +++ b/contrib/pgbench/pgbench.c @@ -129,6 +129,11 @@ int foreign_keys = 0; */ int unlogged_tables = 0; +/* + * log sampling rate (1.0 = log everything, 0.0 = option not given) + */ +double sample_rate = 0.0; + /* * tablespace selection */ @@ -370,6 +375,8 @@ usage(void) " -f FILENAME read transaction script from FILENAME\n" " -j NUM number of threads (default: 1)\n" " -l write transaction times to log file\n" + " --sampling-rate NUM\n" + " fraction of transactions to log (e.g. 0.01 for 1%% sample)\n" " -M simple|extended|prepared\n" " protocol for submitting queries to server (default: simple)\n" " -n do not run VACUUM before tests\n" @@ -883,21 +890,30 @@ top: instr_time diff; double usec; - INSTR_TIME_SET_CURRENT(now); - diff = now; - INSTR_TIME_SUBTRACT(diff, st->txn_begin); - usec = (double) INSTR_TIME_GET_MICROSEC(diff); + /* + * write the log entry if this row belongs to the random sample, + * or no sampling rate was given which means log everything. 
+ */ + if (sample_rate == 0.0 || + pg_erand48(thread->random_state) <= sample_rate) + { + + INSTR_TIME_SET_CURRENT(now); + diff = now; + INSTR_TIME_SUBTRACT(diff, st->txn_begin); + usec = (double) INSTR_TIME_GET_MICROSEC(diff); #ifndef WIN32 - /* This is more than we really ought to know about instr_time */ - fprintf(logfile, "%d %d %.0f %d %ld %ld\n", - st->id, st->cnt, usec, st->use_file, - (long) now.tv_sec, (long) now.tv_usec); + /* This is more than we really ought to know about instr_time */ + fprintf(logfile, "%d %d %.0f %d %ld %ld\n", + st->id, st->cnt, usec, st->use_file, + (long) now.tv_sec, (long) now.tv_usec); #else - /* On Windows, instr_time doesn't provide a timestamp anyway */ - fprintf(logfile, "%d %d %.0f %d 0 0\n", - st->id, st->cnt, usec, st->use_file); + /* On Windows, instr_time doesn't provide a timestamp anyway */ + fprintf(logfile, "%d %d %.0f %d 0 0\n", + st->id, st->cnt, usec, st->use_file); #endif + } } if (commands[st->state]->type == SQL_COMMAND) @@ -1926,6 +1942,7 @@ main(int argc, char **argv) {"index-tablespace", required_argument, NULL, 3}, {"tablespace", required_argument, NULL, 2}, {"unlogged-tables", no_argument, &unlogged_tables, 1}, + {"sampling-rate", required_argument, NULL, 4}, {NULL, 0, NULL, 0} }; @@ -2131,6 +2148,14 @@ main(int argc, char **argv) case 3: /* index-tablespace */ index_tablespace = optarg; break; + case 4: + sample_rate = atof(optarg); + if (sample_rate <= 0.0 || sample_rate > 1.0) + { + fprintf(stderr, "invalid sampling rate: %f\n", sample_rate); + exit(1); + } + break; default: fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname); exit(1); @@ -2166,6 +2191,13 @@ main(int argc, char **argv) exit(1); } + /* --sampling-rate may be used only with -l */ + if (sample_rate > 0.0 && !use_log) + { + fprintf(stderr, "log sampling rate is allowed only when logging transactions (-l) \n"); + exit(1); + } + /* * is_latencies only works with multiple threads in thread-based * implementations, not 
fork-based ones, because it supposes that the diff --git a/doc/src/sgml/pgbench.sgml b/doc/src/sgml/pgbench.sgml index 437fceab2e..91530abe5b 100644 --- a/doc/src/sgml/pgbench.sgml +++ b/doc/src/sgml/pgbench.sgml @@ -316,6 +316,24 @@ pgbench options dbname + + rate + + + Sampling rate, used when writing data into the log, to reduce the + amount of log generated. If this option is given, only the specified + fraction of transactions are logged. 1.0 means all transactions will + be logged, 0.05 means only 5% of the transactions will be logged. + + + Remember to take the sampling rate into account when processing the + log file. For example, when computing tps values, you need to multiply + the numbers accordingly (e.g. with 0.01 sample rate, you'll only get + 1/100 of the actual tps). + + + + querymode @@ -750,6 +768,12 @@ END; 0 201 2513 0 1175850569 608 0 202 2038 0 1175850569 2663 + + + When running a long test on hardware that can handle a lot of transactions, + the log files can become very large. The --sampling-rate option + can be used to log only a random sample of transactions. + -- 2.40.0