--- /dev/null
+/*
+ * pg_test_timing.c
+ * tests overhead of timing calls and their monotonicity: that
+ * they always move forward
+ */
+
+#include "postgres_fe.h"
+
+#include "getopt_long.h"
+#include "portability/instr_time.h"
+
+static const char *progname;
+
+static int32 test_duration = 3;
+
+static void handle_args(int argc, char *argv[]);
+static void test_timing(int32);
+
+int
+main(int argc, char *argv[])
+{
+ progname = get_progname(argv[0]);
+
+ handle_args(argc, argv);
+
+ test_timing(test_duration);
+
+ return 0;
+}
+
+static void
+handle_args(int argc, char *argv[])
+{
+ static struct option long_options[] = {
+ {"duration", required_argument, NULL, 'd'},
+ {NULL, 0, NULL, 0}
+ };
+ int option; /* Command line option */
+ int optindex = 0; /* used by getopt_long */
+
+ if (argc > 1)
+ {
+ if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-h") == 0 ||
+ strcmp(argv[1], "-?") == 0)
+ {
+ printf("Usage: %s [-d DURATION]\n", progname);
+ exit(0);
+ }
+ if (strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-V") == 0)
+ {
+ puts("pg_test_timing (PostgreSQL) " PG_VERSION);
+ exit(0);
+ }
+ }
+
+ while ((option = getopt_long(argc, argv, "d:",
+ long_options, &optindex)) != -1)
+ {
+ switch (option)
+ {
+ case 'd':
+ test_duration = atoi(optarg);
+ break;
+
+ default:
+ fprintf(stderr, "Try \"%s --help\" for more information.\n",
+ progname);
+ exit(1);
+ break;
+ }
+ }
+
+ if (argc > optind)
+ {
+ fprintf(stderr,
+ "%s: too many command-line arguments (first is \"%s\")\n",
+ progname, argv[optind]);
+ fprintf(stderr, "Try \"%s --help\" for more information.\n",
+ progname);
+ exit(1);
+ }
+
+ if (test_duration > 0)
+ {
+ printf("Testing timing overhead for %d seconds.\n", test_duration);
+ }
+ else
+ {
+ fprintf(stderr,
+ "%s: duration must be a positive integer (duration is \"%d\")\n",
+ progname, test_duration);
+ fprintf(stderr, "Try \"%s --help\" for more information.\n",
+ progname);
+ exit(1);
+ }
+}
+
+static void
+test_timing(int32 duration)
+{
+ uint64 total_time;
+ int64 time_elapsed = 0;
+ uint64 loop_count = 0;
+ uint64 prev, cur;
+ int32 diff, i, bits, found;
+
+ instr_time start_time, end_time, temp;
+
+ static int64 histogram[32];
+
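+ /* convert the test duration from seconds to microseconds */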
+ total_time = duration > 0 ? duration * 1000000 : 0;
+
+ INSTR_TIME_SET_CURRENT(start_time);
+ cur = INSTR_TIME_GET_MICROSEC(start_time);
+
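+ /*
+ * Read the clock in a tight loop until the requested duration has
+ * elapsed, bucketing each successive difference into a power-of-two
+ * histogram and bailing out if the clock ever appears to move backwards.
+ */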
+ while (time_elapsed < total_time)
+ {
+ prev = cur;
+ INSTR_TIME_SET_CURRENT(temp);
+ cur = INSTR_TIME_GET_MICROSEC(temp);
+ diff = cur - prev;
+
+ if (diff < 0)
+ {
+ printf("Detected clock going backwards in time.\n");
+ printf("Time warp: %d microseconds\n", diff);
+ exit(1);
+ }
+
+ bits = 0;
+ while (diff)
+ {
+ diff >>= 1;
+ bits++;
+ }
+ histogram[bits]++;
+
+ loop_count++;
+ INSTR_TIME_SUBTRACT(temp, start_time);
+ time_elapsed = INSTR_TIME_GET_MICROSEC(temp);
+ }
+
+ INSTR_TIME_SET_CURRENT(end_time);
+
+ INSTR_TIME_SUBTRACT(end_time, start_time);
+
+ printf("Per loop time including overhead: %0.2f nsec\n",
+ INSTR_TIME_GET_DOUBLE(end_time) * 1e9 / loop_count);
+ printf("Histogram of timing durations:\n");
+ printf("%9s: %10s %9s\n", "< usec", "count", "percent");
+
+ found = 0;
+ for (i = 31; i >= 0; i--)
+ {
+ if (found || histogram[i])
+ {
+ found = 1;
+ printf("%9ld: %10ld %8.5f%%\n", 1l << i, histogram[i],
+ (double) histogram[i] * 100 / loop_count);
+ }
+ }
+}
--- /dev/null
+<!-- doc/src/sgml/pgtesttiming.sgml -->
+
+<sect1 id="pgtesttiming" xreflabel="pg_test_timing">
+ <title>pg_test_timing</title>
+
+ <indexterm zone="pgtesttiming">
+ <primary>pg_test_timing</primary>
+ </indexterm>
+
+ <para>
+ <application>pg_test_timing</application> is a tool to measure the timing overhead
+ on your system and confirm that the system time never moves backwards.
+ Systems that are slow to collect timing data can give less accurate
+ <command>EXPLAIN ANALYZE</command> results.
+ </para>
+
+ <sect2>
+ <title>Usage</title>
+
+<synopsis>
+pg_test_timing [options]
+</synopsis>
+
+ <para>
+ <application>pg_test_timing</application> accepts the following
+ command-line options:
+
+ <variablelist>
+
+ <varlistentry>
+ <term><option>-d</option></term>
+ <term><option>--duration</option></term>
+ <listitem>
+ <para>
+ Specifies the test duration, in seconds. Longer durations
+ give slightly better accuracy, and are more likely to discover
+ problems with the system clock moving backwards. The default
+ test duration is 3 seconds.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ </variablelist>
+ </para>
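+
+ <para>
+ For example, to run the test for ten seconds instead of the default
+ three seconds:
+ </para>
+
+<screen>
+pg_test_timing --duration=10
+</screen>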
+
+ </sect2>
+
+ <sect2>
+ <title>Interpreting results</title>
+
+ <para>
+ Good results will show most (>90%) individual timing calls take less
+ than one microsecond. Average per loop overhead will be even lower,
+ below 100 nanoseconds. This example from an Intel i7-860 system using
+ a TSC clock source shows excellent performance:
+ </para>
+
+<screen>
+Testing timing overhead for 3 seconds.
+Per loop time including overhead: 35.96 nsec
+Histogram of timing durations:
+   &lt; usec:      count   percent
+        16:          2  0.00000%
+         8:         13  0.00002%
+         4:        126  0.00015%
+         2:    2999652  3.59518%
+         1:   80435604 96.40465%
+</screen>
+
+ <para>
+ Note that the per loop time is reported in different units than the
+ histogram. The loop can have resolution within a few nanoseconds
+ (nsec), while the individual timing calls can only resolve down to
+ one microsecond (usec).
+ </para>
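+
+ <para>
+ The per loop figure is simply the total elapsed time divided by the
+ number of timing calls made. In the example above, the histogram
+ counts add up to roughly 83.4 million calls over the 3 second run,
+ which works out to about 36 nsec per call including loop overhead,
+ even though each individual call only reports whole microseconds.
+ </para>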
+
+ </sect2>
+ <sect2>
+ <title>Measuring executor timing overhead</title>
+
+ <para>
+ When the query executor is running a statement using
+ <command>EXPLAIN ANALYZE</command>, individual operations are
+ timed in addition to the overall summary. The timing overhead
+ on your system can be estimated by counting rows with the
+ <application>psql</application> program:
+ </para>
+
+<screen>
+CREATE TABLE t AS SELECT * FROM generate_series(1,100000);
+\timing
+SELECT COUNT(*) FROM t;
+EXPLAIN ANALYZE SELECT COUNT(*) FROM t;
+</screen>
+
+ <para>
+ On the i7-860 system measured above, the count query runs in 9.8 ms
+ while the <command>EXPLAIN ANALYZE</command> version takes 16.6 ms,
+ each processing just over 100,000 rows. That 6.8 ms difference
+ means the timing overhead per row is 68 ns, about twice what
+ <application>pg_test_timing</application> estimated it would be.
+ Even that relatively
+ small amount of overhead is making the fully timed count statement
+ take almost 70% longer. On more substantial queries, the
+ timing overhead would be less problematic.
+ </para>
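+
+ <para>
+ That factor of two is expected: each row produced by an instrumented
+ plan node is bracketed by a pair of timing calls, one before and one
+ after, so roughly 36 nsec per call adds up to about 70 nsec of
+ overhead per row.
+ </para>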
+
+ </sect2>
+ <sect2>
+ <title>Changing time sources</title>
+ <para>
+ On some newer Linux systems, it's possible to change the clock
+ source used to collect timing data at any time. A second example
+ shows the slowdown possible from switching to the slower acpi_pm
+ time source, on the same system used for the fast results above:
+ </para>
+
+<screen>
+# cat /sys/devices/system/clocksource/clocksource0/available_clocksource
+tsc hpet acpi_pm
+# echo acpi_pm > /sys/devices/system/clocksource/clocksource0/current_clocksource
+# pg_test_timing
+Per loop time including overhead: 722.92 nsec
+Histogram of timing durations:
+   &lt; usec:      count   percent
+        16:          3  0.00007%
+         8:        563  0.01357%
+         4:       3241  0.07810%
+         2:    2990371 72.05956%
+         1:    1155682 27.84870%
+</screen>
+
+ <para>
+ In this configuration, the sample <command>EXPLAIN ANALYZE</command>
+ above takes 115.9 ms. That's 1061 nsec of timing overhead per row,
+ again a small multiple of what's measured directly by this utility.
+ That much timing overhead means the actual query itself is only
+ taking a tiny fraction of the accounted-for time; most of it
+ is being consumed in overhead instead. In this configuration,
+ any <command>EXPLAIN ANALYZE</command> totals involving many
+ timed operations would be inflated significantly by timing overhead.
+ </para>
+
+ <para>
+ FreeBSD also allows changing the time source on the fly, and
+ it logs information about the timer selected during boot:
+ </para>
+
+<screen>
+dmesg | grep "Timecounter"
+sysctl kern.timecounter.hardware=TSC
+</screen>
+
+ <para>
+ Other systems may only allow setting the time source on boot.
+ On older Linux systems the "clock" kernel setting is the only way
+ to make this sort of change. And even on some more recent ones,
+ the only option you'll see for a clock source is "jiffies". Jiffies
+ are the older Linux software clock implementation, which can have
+ good resolution when it's backed by fast enough timing hardware,
+ as in this example:
+ </para>
+
+<screen>
+$ cat /sys/devices/system/clocksource/clocksource0/available_clocksource
+jiffies
+$ dmesg | grep time.c
+time.c: Using 3.579545 MHz WALL PM GTOD PIT/TSC timer.
+time.c: Detected 2400.153 MHz processor.
+$ ./pg_test_timing
+Testing timing overhead for 3 seconds.
+Per loop time including overhead: 97.75 nsec
+Histogram of timing durations:
+   &lt; usec:      count   percent
+        32:          1  0.00000%
+        16:          1  0.00000%
+         8:         22  0.00007%
+         4:       3010  0.00981%
+         2:    2993204  9.75277%
+         1:   27694571 90.23734%
+</screen>
+
+ </sect2>
+ <sect2>
+ <title>Clock hardware and timing accuracy</title>
+
+ <para>
+ Collecting accurate timing information is normally done on computers
+ using hardware clocks with various levels of accuracy. With some
+ hardware the operating systems can pass the system clock time almost
+ directly to programs. A system clock can also be derived from a chip
+ that simply provides timing interrupts, periodic ticks at some known
+ time interval. In either case, operating system kernels provide
+ a clock source that hides these details. But the accuracy of that
+ clock source and how quickly it can return results varies based
+ on the underlying hardware.
+ </para>
+
+ <para>
+ Inaccurate timekeeping can result in system instability. Test
+ any change to the clock source very carefully. Operating system
+ defaults are sometimes chosen to favor reliability over the best
+ possible accuracy. And if you are using a virtual machine, look into
+ the recommended time sources compatible with it. Virtual hardware
+ faces additional difficulties when emulating timers, and there
+ are often per-operating-system settings suggested by vendors.
+ </para>
+
+ <para>
+ The Time Stamp Counter (TSC) clock source is the most accurate one
+ available on current generation CPUs. It's the preferred way to track
+ the system time when it's supported by the operating system and the
+ TSC clock is reliable. There are several ways that TSC can fail
+ to provide an accurate timing source, making it unreliable. Older
+ systems can have a TSC clock that varies based on the CPU
+ temperature, making it unusable for timing. Trying to use TSC on some
+ older multi-core CPUs can give a reported time that's inconsistent
+ among multiple cores. This can result in the time going backwards, a
+ problem this program checks for. And even the newest systems can
+ fail to provide accurate TSC timing with very aggressive power saving
+ configurations.
+ </para>
+
+ <para>
+ Newer operating systems may check for the known TSC problems and
+ switch to a slower, more stable clock source when they are seen.
+ If your system supports TSC time but doesn't default to that, it
+ may be disabled for a good reason. And some operating systems may
+ not detect all the possible problems correctly, or will allow using
+ TSC even in situations where it's known to be inaccurate.
+ </para>
+
+ <para>
+ The High Precision Event Timer (HPET) is the preferred timer on
+ systems where it's available and TSC is not accurate. The timer
+ chip itself is programmable to allow up to 100 nanosecond resolution,
+ but you may not see that much accuracy in your system clock.
+ </para>
+
+ <para>
+ Advanced Configuration and Power Interface (ACPI) provides a
+ Power Management (PM) Timer, which Linux refers to as the acpi_pm.
+ The clock derived from acpi_pm will at best provide 300 nanosecond
+ resolution.
+ </para>
+
+ <para>
+ Timers used on older PC hardware include the 8254 Programmable
+ Interval Timer (PIT), the real-time clock (RTC), the Advanced
+ Programmable Interrupt Controller (APIC) timer, and the Cyclone
+ timer. These timers aim for millisecond resolution.
+ </para>
+ </sect2>
+
+ <sect2>
+ <title>Author</title>
+
+ <para>
+ Ants Aasma <email>ants.aasma@eesti.ee</email>
+ </para>
+ </sect2>
+
+</sect1>