From cee523867db29c0bfc5de7ec638ce0a4ad9b3817 Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Tue, 27 Mar 2012 16:14:00 -0400 Subject: [PATCH] pg_test_timing utility, to measure clock monotonicity and timing cost. Ants Aasma, Greg Smith --- contrib/Makefile | 1 + contrib/pg_test_timing/.gitignore | 1 + contrib/pg_test_timing/Makefile | 18 ++ contrib/pg_test_timing/pg_test_timing.c | 162 +++++++++++++++ doc/src/sgml/config.sgml | 4 +- doc/src/sgml/contrib.sgml | 1 + doc/src/sgml/filelist.sgml | 1 + doc/src/sgml/perform.sgml | 4 +- doc/src/sgml/pgtesttiming.sgml | 261 ++++++++++++++++++++++++ 9 files changed, 451 insertions(+), 2 deletions(-) create mode 100644 contrib/pg_test_timing/.gitignore create mode 100644 contrib/pg_test_timing/Makefile create mode 100644 contrib/pg_test_timing/pg_test_timing.c create mode 100644 doc/src/sgml/pgtesttiming.sgml diff --git a/contrib/Makefile b/contrib/Makefile index ac0a80a014..d230451a12 100644 --- a/contrib/Makefile +++ b/contrib/Makefile @@ -35,6 +35,7 @@ SUBDIRS = \ pg_standby \ pg_stat_statements \ pg_test_fsync \ + pg_test_timing \ pg_trgm \ pg_upgrade \ pg_upgrade_support \ diff --git a/contrib/pg_test_timing/.gitignore b/contrib/pg_test_timing/.gitignore new file mode 100644 index 0000000000..f6c664c765 --- /dev/null +++ b/contrib/pg_test_timing/.gitignore @@ -0,0 +1 @@ +/pg_test_timing diff --git a/contrib/pg_test_timing/Makefile b/contrib/pg_test_timing/Makefile new file mode 100644 index 0000000000..b8b266a2fc --- /dev/null +++ b/contrib/pg_test_timing/Makefile @@ -0,0 +1,18 @@ +# contrib/pg_test_timing/Makefile + +PGFILEDESC = "pg_test_timing - test timing overhead" +PGAPPICON = win32 + +PROGRAM = pg_test_timing +OBJS = pg_test_timing.o + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = contrib/pg_test_timing +top_builddir = ../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif diff --git a/contrib/pg_test_timing/pg_test_timing.c b/contrib/pg_test_timing/pg_test_timing.c new file mode 100644 index 0000000000..cdfa07f6eb --- /dev/null +++ b/contrib/pg_test_timing/pg_test_timing.c @@ -0,0 +1,162 @@ +/* + * pg_test_timing.c + * tests overhead of timing calls and their monotonicity: that + * they always move forward + */ + +#include "postgres_fe.h" + +#include "getopt_long.h" +#include "portability/instr_time.h" + +static const char *progname; + +static int32 test_duration = 3; + +static void handle_args(int argc, char *argv[]); +static void test_timing(int32); + +int +main(int argc, char *argv[]) +{ + progname = get_progname(argv[0]); + + handle_args(argc, argv); + + test_timing(test_duration); + + return 0; +} + +static void +handle_args(int argc, char *argv[]) +{ + static struct option long_options[] = { + {"duration", required_argument, NULL, 'd'}, + {NULL, 0, NULL, 0} + }; + int option; /* Command line option */ + int optindex = 0; /* used by getopt_long */ + + if (argc > 1) + { + if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-h") == 0 || + strcmp(argv[1], "-?") == 0) + { + printf("Usage: %s [-d DURATION]\n", progname); + exit(0); + } + if (strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-V") == 0) + { + puts("pg_test_timing (PostgreSQL) " PG_VERSION); + exit(0); + } + } + + while ((option = getopt_long(argc, argv, "d:", + long_options, &optindex)) != -1) + { + switch (option) + { + case 'd': + test_duration = atoi(optarg); + break; + + default: + fprintf(stderr, "Try \"%s --help\" for more information.\n", + progname); + exit(1); + break; + } + } + + if (argc > optind) + { + fprintf(stderr, + "%s: too many command-line arguments (first is \"%s\")\n", + progname, argv[optind]); + fprintf(stderr, "Try \"%s --help\" for more information.\n", + progname); + exit(1); + } + + if (test_duration > 0) + { + printf("Testing timing overhead for %d seconds.\n", test_duration); + } + else + { + fprintf(stderr, + "%s: duration must be a positive integer (duration is \"%d\")\n", + progname, test_duration); + fprintf(stderr, "Try \"%s --help\" for more information.\n", + progname); + exit(1); + } +} + +static void +test_timing(int32 duration) +{ + uint64 total_time; + int64 time_elapsed = 0; + uint64 loop_count = 0; + uint64 prev, cur; + int32 diff, i, bits, found; + + instr_time start_time, end_time, temp; + + static int64 histogram[32]; + + total_time = duration > 0 ? duration * 1000000 : 0; + + INSTR_TIME_SET_CURRENT(start_time); + cur = INSTR_TIME_GET_MICROSEC(start_time); + + while (time_elapsed < total_time) + { + prev = cur; + INSTR_TIME_SET_CURRENT(temp); + cur = INSTR_TIME_GET_MICROSEC(temp); + diff = cur - prev; + + if (diff < 0) + { + printf("Detected clock going backwards in time.\n"); + printf("Time warp: %d microseconds\n", diff); + exit(1); + } + + bits = 0; + while (diff) + { + diff >>= 1; + bits++; + } + histogram[bits]++; + + loop_count++; + INSTR_TIME_SUBTRACT(temp, start_time); + time_elapsed = INSTR_TIME_GET_MICROSEC(temp); + } + + INSTR_TIME_SET_CURRENT(end_time); + + INSTR_TIME_SUBTRACT(end_time, start_time); + + printf("Per loop time including overhead: %0.2f nsec\n", + INSTR_TIME_GET_DOUBLE(end_time) * 1e9 / loop_count); + printf("Histogram of timing durations:\n"); + printf("%9s: %10s %9s\n", "< usec", "count", "percent"); + + found = 0; + for (i = 31; i >= 0; i--) + { + if (found || histogram[i]) + { + found = 1; + printf("%9ld: %10ld %8.5f%%\n", 1l << i, histogram[i], + (double) histogram[i] * 100 / loop_count); + } + } +} diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index 9baaa3fb36..0a5d519214 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -4294,7 +4294,9 @@ COPY postgres_log FROM '/full/path/to/logfile.csv' WITH csv; Enables timing of database I/O calls. This parameter is off by default, because it will repeatedly query the operating system for the current time, which may cause significant overhead on some - platforms. Only superusers can change this setting. + platforms. You can use the tool to + measure the overhead of timing on your system. Only superusers can + change this setting. diff --git a/doc/src/sgml/contrib.sgml b/doc/src/sgml/contrib.sgml index d4da4eec87..97031dddaa 100644 --- a/doc/src/sgml/contrib.sgml +++ b/doc/src/sgml/contrib.sgml @@ -121,6 +121,7 @@ CREATE EXTENSION module_name FROM unpackaged; &pgstatstatements; &pgstattuple; &pgtestfsync; + &pgtesttiming; &pgtrgm; &pgupgrade; &seg; diff --git a/doc/src/sgml/filelist.sgml b/doc/src/sgml/filelist.sgml index b5d3c6d4fc..428a167278 100644 --- a/doc/src/sgml/filelist.sgml +++ b/doc/src/sgml/filelist.sgml @@ -129,6 +129,7 @@ + diff --git a/doc/src/sgml/perform.sgml b/doc/src/sgml/perform.sgml index 8e695fd540..5a8c6fc7dc 100644 --- a/doc/src/sgml/perform.sgml +++ b/doc/src/sgml/perform.sgml @@ -770,7 +770,9 @@ ROLLBACK; network transmission costs and I/O conversion costs are not included. Second, the measurement overhead added by EXPLAIN ANALYZE can be significant, especially on machines with slow - gettimeofday() operating-system calls. + gettimeofday() operating-system calls. You can use the + tool to measure the overhead of timing + on your system. diff --git a/doc/src/sgml/pgtesttiming.sgml b/doc/src/sgml/pgtesttiming.sgml new file mode 100644 index 0000000000..9bdf3e3973 --- /dev/null +++ b/doc/src/sgml/pgtesttiming.sgml @@ -0,0 +1,261 @@ + + + + pg_test_timing + + + pg_test_timing + + + + pg_test_timing is a tool to measure the timing overhead + on your system and confirm that the system time never moves backwards. + Systems that are slow to collect timing data can give less accurate + EXPLAIN ANALYZE results. + + + + Usage + + +pg_test_timing [options] + + + + pg_test_timing accepts the following + command-line options: + + + + + + + + + Specifies the test duration, in seconds. Longer durations + give slightly better accuracy, and are more likely to discover + problems with the system clock moving backwards. The default + test duration is 3 seconds. + + + + + + + + + + + Interpreting results + + + Good results will show most (>90%) individual timing calls take less + than one microsecond. Average per loop overhead will be even lower, + below 100 nanoseconds. This example from an Intel i7-860 system using + a TSC clock source shows excellent performance: + + + +Testing timing overhead for 3 seconds. +Per loop time including overhead: 35.96 nsec +Histogram of timing durations: + < usec: count percent + 16: 2 0.00000% + 8: 13 0.00002% + 4: 126 0.00015% + 2: 2999652 3.59518% + 1: 80435604 96.40465% + + + + Note that different units are used for the per loop time than the + histogram. The loop can have resolution within a few nanoseconds + (nsec), while the individual timing calls can only resolve down to + one microsecond (usec). + + + + + Measuring executor timing overhead + + + When the query executor is running a statement using + EXPLAIN ANALYZE, individual operations are + timed as well as showing a summary. The overhead of your system + can be checked by counting rows with the psql program: + + + +CREATE TABLE t AS SELECT * FROM generate_series(1,100000); +\timing +SELECT COUNT(*) FROM t; +EXPLAIN ANALYZE SELECT COUNT(*) FROM t; + + + + The i7-860 system measured runs the count query in 9.8 ms while + the EXPLAIN ANALYZE version takes 16.6 ms, + each processing just over 100,000 rows. That 6.8 ms difference + means the timing overhead per row is 68 ns, about twice what + pg_test_timing estimated it would be. Even that relatively + small amount of overhead is making the fully timed count statement + take almost 70% longer. On more substantial queries, the + timing overhead would be less problematic. + + + + + Changing time sources + + On some newer Linux systems, it's possible to change the clock + source used to collect timing data at any time. A second example + shows the slowdown possible from switching to the slower acpi_pm + time source, on the same system used for the fast results above: + + + +# cat /sys/devices/system/clocksource/clocksource0/available_clocksource +tsc hpet acpi_pm +# echo acpi_pm > /sys/devices/system/clocksource/clocksource0/current_clocksource +# pg_test_timing +Per loop time including overhead: 722.92 nsec +Histogram of timing durations: + < usec: count percent + 16: 3 0.00007% + 8: 563 0.01357% + 4: 3241 0.07810% + 2: 2990371 72.05956% + 1: 1155682 27.84870% + + + + In this configuration, the sample EXPLAIN ANALYZE + above takes 115.9 ms. That's 1061 nsec of timing overhead, again + a small multiple of what's measured directly by this utility. + That much timing overhead means the actual query itself is only + taking a tiny fraction of the accounted for time, most of it + is being consumed in overhead instead. In this configuration, + any EXPLAIN ANALYZE totals involving many + timed operations would be inflated significantly by timing overhead. + + + + FreeBSD also allows changing the time source on the fly, and + it logs information about the timer selected during boot: + + + +dmesg | grep "Timecounter" +sysctl kern.timecounter.hardware=TSC + + + + Other systems may only allow setting the time source on boot. + On older Linux systems the "clock" kernel setting is the only way + to make this sort of change. And even on some more recent ones, + the only option you'll see for a clock source is "jiffies". Jiffies + are the older Linux software clock implementation, which can have + good resolution when it's backed by fast enough timing hardware, + as in this example: + + + +$ cat /sys/devices/system/clocksource/clocksource0/available_clocksource +jiffies +$ dmesg | grep time.c +time.c: Using 3.579545 MHz WALL PM GTOD PIT/TSC timer. +time.c: Detected 2400.153 MHz processor. +$ ./pg_test_timing +Testing timing overhead for 3 seconds. +Per timing duration including loop overhead: 97.75 ns +Histogram of timing durations: + < usec: count percent + 32: 1 0.00000% + 16: 1 0.00000% + 8: 22 0.00007% + 4: 3010 0.00981% + 2: 2993204 9.75277% + 1: 27694571 90.23734% + + + + + Clock hardware and timing accuracy + + + Collecting accurate timing information is normally done on computers + using hardware clocks with various levels of accuracy. With some + hardware the operating systems can pass the system clock time almost + directly to programs. A system clock can also be derived from a chip + that simply provides timing interrupts, periodic ticks at some known + time interval. In either case, operating system kernels provide + a clock source that hides these details. But the accuracy of that + clock source and how quickly it can return results varies based + on the underlying hardware. + + + + Inaccurate time keeping can result in system instability. Test + any change to the clock source very carefully. Operating system + defaults are sometimes made to favor reliability over best + accuracy. And if you are using a virtual machine, look into the + recommended time sources compatible with it. Virtual hardware + faces additional difficulties when emulating timers, and there + are often per operating system settings suggested by vendors. + + + + The Time Stamp Counter (TSC) clock source is the most accurate one + available on current generation CPUs. It's the preferred way to track + the system time when it's supported by the operating system and the + TSC clock is reliable. There are several ways that TSC can fail + to provide an accurate timing source, making it unreliable. Older + systems can have a TSC clock that varies based on the CPU + temperature, making it unusable for timing. Trying to use TSC on some + older multi-core CPUs can give a reported time that's inconsistent + among multiple cores. This can result in the time going backwards, a + problem this program checks for. And even the newest systems can + fail to provide accurate TSC timing with very aggressive power saving + configurations. + + + + Newer operating systems may check for the known TSC problems and + switch to a slower, more stable clock source when they are seen. + If your system supports TSC time but doesn't default to that, it + may be disabled for a good reason. And some operating systems may + not detect all the possible problems correctly, or will allow using + TSC even in situations where it's known to be inaccurate. + + + + The High Precision Event Timer (HPET) is the preferred timer on + systems where it's available and TSC is not accurate. The timer + chip itself is programmable to allow up to 100 nanosecond resolution, + but you may not see that much accuracy in your system clock. + + + + Advanced Configuration and Power Interface (ACPI) provides a + Power Management (PM) Timer, which Linux refers to as the acpi_pm. + The clock derived from acpi_pm will at best provide 300 nanosecond + resolution. + + + + Timers used on older PC hardware including the 8254 Programmable + Interval Timer (PIT), the real-time clock (RTC), the Advanced + Programmable Interrupt Controller (APIC) timer, and the Cyclone + timer. These timers aim for millisecond resolution. + + + + + Author + + + Ants Aasma ants.aasma@eesti.ee + + + + -- 2.40.0