2 * src/test/isolation/isolationtester.c
5 * Runs an isolation test specified by a spec file.
8 #include "postgres_fe.h"
11 #ifdef HAVE_SYS_SELECT_H
12 #include <sys/select.h>
15 #include "datatype/timestamp.h"
17 #include "pqexpbuffer.h"
18 #include "pg_getopt.h"
20 #include "isolationtester.h"
22 #define PREP_WAITING "isolationtester_waiting"
25 * conns[0] is the global setup, teardown, and watchdog connection. Additional
26 * connections represent spec-defined sessions.
28 static PGconn **conns = NULL;
29 static const char **backend_pids = NULL;
30 static int nconns = 0;
32 /* In dry run only output permutations to be run by the tester. */
33 static int dry_run = false;
35 static void run_testspec(TestSpec *testspec);
36 static void run_all_permutations(TestSpec *testspec);
37 static void run_all_permutations_recurse(TestSpec *testspec, int nsteps,
39 static void run_named_permutations(TestSpec *testspec);
40 static void run_permutation(TestSpec *testspec, int nsteps, Step **steps);
42 #define STEP_NONBLOCK 0x1 /* return 0 as soon as cmd waits for a lock */
43 #define STEP_RETRY 0x2 /* this is a retry of a previously-waiting cmd */
44 static bool try_complete_step(Step *step, int flags);
46 static int step_qsort_cmp(const void *a, const void *b);
47 static int step_bsearch_cmp(const void *a, const void *b);
49 static void printResultSet(PGresult *res);
50 static void isotesterNoticeProcessor(void *arg, const char *message);
51 static void blackholeNoticeProcessor(void *arg, const char *message);
54 disconnect_atexit(void)
58 for (i = 0; i < nconns; i++)
64 main(int argc, char **argv)
72 PQExpBufferData wait_query;
77 while ((opt = getopt(argc, argv, "nV")) != -1)
85 puts("isolationtester (PostgreSQL) " PG_VERSION);
88 fprintf(stderr, "Usage: isolationtester [-n] [CONNINFO]\n");
94 * Make stdout unbuffered to match stderr; and ensure stderr is unbuffered
95 * too, which it should already be everywhere except sometimes in Windows.
101 * If the user supplies a non-option parameter on the command line, use it
102 * as the conninfo string; otherwise default to setting dbname=postgres
103 * and using environment variables or defaults for all other connection
107 conninfo = argv[optind];
109 conninfo = "dbname = postgres";
111 /* Read the test spec from stdin */
113 testspec = &parseresult;
115 /* Create a lookup table of all steps. */
117 for (i = 0; i < testspec->nsessions; i++)
118 nallsteps += testspec->sessions[i]->nsteps;
120 allsteps = pg_malloc(nallsteps * sizeof(Step *));
123 for (i = 0; i < testspec->nsessions; i++)
125 for (j = 0; j < testspec->sessions[i]->nsteps; j++)
126 allsteps[n++] = testspec->sessions[i]->steps[j];
129 qsort(allsteps, nallsteps, sizeof(Step *), &step_qsort_cmp);
130 testspec->nallsteps = nallsteps;
131 testspec->allsteps = allsteps;
133 /* Verify that all step names are unique */
134 for (i = 1; i < testspec->nallsteps; i++)
136 if (strcmp(testspec->allsteps[i - 1]->name,
137 testspec->allsteps[i]->name) == 0)
139 fprintf(stderr, "duplicate step name: %s\n",
140 testspec->allsteps[i]->name);
146 * In dry-run mode, just print the permutations that would be run, and
151 run_testspec(testspec);
155 printf("Parsed test spec with %d sessions\n", testspec->nsessions);
158 * Establish connections to the database, one for each session and an
159 * extra for lock wait detection and global work.
161 nconns = 1 + testspec->nsessions;
162 conns = calloc(nconns, sizeof(PGconn *));
163 atexit(disconnect_atexit);
164 backend_pids = calloc(nconns, sizeof(*backend_pids));
165 for (i = 0; i < nconns; i++)
167 conns[i] = PQconnectdb(conninfo);
168 if (PQstatus(conns[i]) != CONNECTION_OK)
170 fprintf(stderr, "Connection %d to database failed: %s",
171 i, PQerrorMessage(conns[i]));
176 * Set up notice processors for the user-defined connections, so that
177 * messages can get printed prefixed with the session names. The
178 * control connection gets a "blackhole" processor instead (hides all
182 PQsetNoticeProcessor(conns[i],
183 isotesterNoticeProcessor,
184 (void *) (testspec->sessions[i - 1]->name));
186 PQsetNoticeProcessor(conns[i],
187 blackholeNoticeProcessor,
190 /* Get the backend pid for lock wait checking. */
191 res = PQexec(conns[i], "SELECT pg_catalog.pg_backend_pid()");
192 if (PQresultStatus(res) == PGRES_TUPLES_OK)
194 if (PQntuples(res) == 1 && PQnfields(res) == 1)
195 backend_pids[i] = pg_strdup(PQgetvalue(res, 0, 0));
198 fprintf(stderr, "backend pid query returned %d rows and %d columns, expected 1 row and 1 column",
199 PQntuples(res), PQnfields(res));
205 fprintf(stderr, "backend pid query failed: %s",
206 PQerrorMessage(conns[i]));
212 /* Set the session index fields in steps. */
213 for (i = 0; i < testspec->nsessions; i++)
215 Session *session = testspec->sessions[i];
218 for (stepindex = 0; stepindex < session->nsteps; stepindex++)
219 session->steps[stepindex]->session = i;
223 * Build the query we'll use to detect lock contention among sessions in
224 * the test specification. Most of the time, we could get away with
225 * simply checking whether a session is waiting for *any* lock: we don't
226 * exactly expect concurrent use of test tables. However, autovacuum will
227 * occasionally take AccessExclusiveLock to truncate a table, and we must
228 * ignore that transient wait.
230 initPQExpBuffer(&wait_query);
231 appendPQExpBufferStr(&wait_query,
232 "SELECT pg_catalog.pg_isolation_test_session_is_blocked($1, '{");
233 /* The spec syntax requires at least one session; assume that here. */
234 appendPQExpBufferStr(&wait_query, backend_pids[1]);
235 for (i = 2; i < nconns; i++)
236 appendPQExpBuffer(&wait_query, ",%s", backend_pids[i]);
237 appendPQExpBufferStr(&wait_query, "}')");
239 res = PQprepare(conns[0], PREP_WAITING, wait_query.data, 0, NULL);
240 if (PQresultStatus(res) != PGRES_COMMAND_OK)
242 fprintf(stderr, "prepare of lock wait query failed: %s",
243 PQerrorMessage(conns[0]));
247 termPQExpBuffer(&wait_query);
250 * Run the permutations specified in the spec, or all if none were
251 * explicitly specified.
253 run_testspec(testspec);
261 * Run the permutations specified in the spec, or all if none were
262 * explicitly specified.
265 run_testspec(TestSpec *testspec)
267 if (testspec->permutations)
268 run_named_permutations(testspec);
270 run_all_permutations(testspec);
274 * Run all permutations of the steps and sessions.
277 run_all_permutations(TestSpec *testspec)
283 /* Count the total number of steps in all sessions */
285 for (i = 0; i < testspec->nsessions; i++)
286 nsteps += testspec->sessions[i]->nsteps;
288 steps = pg_malloc(sizeof(Step *) * nsteps);
291 * To generate the permutations, we conceptually put the steps of each
292 * session on a pile. To generate a permutation, we pick steps from the
293 * piles until all piles are empty. By picking steps from piles in
294 * different order, we get different permutations.
296 * A pile is actually just an integer which tells how many steps we've
297 * already picked from this pile.
299 piles = pg_malloc(sizeof(int) * testspec->nsessions);
300 for (i = 0; i < testspec->nsessions; i++)
303 run_all_permutations_recurse(testspec, 0, steps);
307 run_all_permutations_recurse(TestSpec *testspec, int nsteps, Step **steps)
312 for (i = 0; i < testspec->nsessions; i++)
314 /* If there's any more steps in this pile, pick it and recurse */
315 if (piles[i] < testspec->sessions[i]->nsteps)
317 steps[nsteps] = testspec->sessions[i]->steps[piles[i]];
320 run_all_permutations_recurse(testspec, nsteps + 1, steps);
328 /* If all the piles were empty, this permutation is completed. Run it */
330 run_permutation(testspec, nsteps, steps);
334 * Run permutations given in the test spec
337 run_named_permutations(TestSpec *testspec)
342 for (i = 0; i < testspec->npermutations; i++)
344 Permutation *p = testspec->permutations[i];
347 steps = pg_malloc(p->nsteps * sizeof(Step *));
349 /* Find all the named steps using the lookup table */
350 for (j = 0; j < p->nsteps; j++)
352 Step **this = (Step **) bsearch(p->stepnames[j],
360 fprintf(stderr, "undefined step \"%s\" specified in permutation\n",
368 run_permutation(testspec, p->nsteps, steps);
375 step_qsort_cmp(const void *a, const void *b)
377 Step *stepa = *((Step **) a);
378 Step *stepb = *((Step **) b);
380 return strcmp(stepa->name, stepb->name);
384 step_bsearch_cmp(const void *a, const void *b)
386 char *stepname = (char *) a;
387 Step *step = *((Step **) b);
389 return strcmp(stepname, step->name);
393 * If a step caused an error to be reported, print it out and clear it.
396 report_error_message(Step *step)
400 fprintf(stdout, "%s\n", step->errormsg);
401 free(step->errormsg);
402 step->errormsg = NULL;
407 * As above, but reports messages possibly emitted by multiple steps. This is
408 * useful when we have a blocked command awakened by another one; we want to
409 * report all messages identically, for the case where we don't care which
410 * one fails due to a timeout such as deadlock timeout.
413 report_multiple_error_messages(Step *step, int nextra, Step **extrastep)
415 PQExpBufferData buffer;
420 report_error_message(step);
424 initPQExpBuffer(&buffer);
425 appendPQExpBufferStr(&buffer, step->name);
427 for (n = 0; n < nextra; ++n)
428 appendPQExpBuffer(&buffer, " %s", extrastep[n]->name);
432 fprintf(stdout, "error in steps %s: %s\n", buffer.data,
434 free(step->errormsg);
435 step->errormsg = NULL;
438 for (n = 0; n < nextra; ++n)
440 if (extrastep[n]->errormsg == NULL)
442 fprintf(stdout, "error in steps %s: %s\n",
443 buffer.data, extrastep[n]->errormsg);
444 free(extrastep[n]->errormsg);
445 extrastep[n]->errormsg = NULL;
448 termPQExpBuffer(&buffer);
452 * Run one permutation
455 run_permutation(TestSpec *testspec, int nsteps, Step **steps)
466 * In dry run mode, just display the permutation in the same format used
467 * by spec files, and return.
471 printf("permutation");
472 for (i = 0; i < nsteps; i++)
473 printf(" \"%s\"", steps[i]->name);
478 waiting = pg_malloc(sizeof(Step *) * testspec->nsessions);
479 errorstep = pg_malloc(sizeof(Step *) * testspec->nsessions);
481 printf("\nstarting permutation:");
482 for (i = 0; i < nsteps; i++)
483 printf(" %s", steps[i]->name);
487 for (i = 0; i < testspec->nsetupsqls; i++)
489 res = PQexec(conns[0], testspec->setupsqls[i]);
490 if (PQresultStatus(res) == PGRES_TUPLES_OK)
494 else if (PQresultStatus(res) != PGRES_COMMAND_OK)
496 fprintf(stderr, "setup failed: %s", PQerrorMessage(conns[0]));
502 /* Perform per-session setup */
503 for (i = 0; i < testspec->nsessions; i++)
505 if (testspec->sessions[i]->setupsql)
507 res = PQexec(conns[i + 1], testspec->sessions[i]->setupsql);
508 if (PQresultStatus(res) == PGRES_TUPLES_OK)
512 else if (PQresultStatus(res) != PGRES_COMMAND_OK)
514 fprintf(stderr, "setup of session %s failed: %s",
515 testspec->sessions[i]->name,
516 PQerrorMessage(conns[i + 1]));
524 for (i = 0; i < nsteps; i++)
526 Step *step = steps[i];
527 PGconn *conn = conns[1 + step->session];
528 Step *oldstep = NULL;
532 * Check whether the session that needs to perform the next step is
533 * still blocked on an earlier step. If so, wait for it to finish.
535 * (In older versions of this tool, we allowed precisely one session
536 * to be waiting at a time. If we reached a step that required that
537 * session to execute the next command, we would declare the whole
538 * permutation invalid, cancel everything, and move on to the next
539 * one. Unfortunately, that made it impossible to test the deadlock
540 * detector using this framework, unless the number of processes
541 * involved in the deadlock was precisely two. We now assume that if
542 * we reach a step that is still blocked, we need to wait for it to
545 for (w = 0; w < nwaiting; ++w)
547 if (step->session == waiting[w]->session)
549 oldstep = waiting[w];
551 /* Wait for previous step on this connection. */
552 try_complete_step(oldstep, STEP_RETRY);
554 /* Remove that step from the waiting[] array. */
555 if (w + 1 < nwaiting)
556 memmove(&waiting[w], &waiting[w + 1],
557 (nwaiting - (w + 1)) * sizeof(Step *));
566 * Check for completion of any steps that were previously waiting.
567 * Remove any that have completed from waiting[], and include them
568 * in the list for report_multiple_error_messages().
574 if (try_complete_step(waiting[w], STEP_NONBLOCK | STEP_RETRY))
576 /* Still blocked on a lock, leave it alone. */
581 /* This one finished, too! */
582 errorstep[nerrorstep++] = waiting[w];
583 if (w + 1 < nwaiting)
584 memmove(&waiting[w], &waiting[w + 1],
585 (nwaiting - (w + 1)) * sizeof(Step *));
590 /* Report all errors together. */
591 report_multiple_error_messages(oldstep, nerrorstep, errorstep);
594 /* Send the query for this step. */
595 if (!PQsendQuery(conn, step->sql))
597 fprintf(stdout, "failed to send query for step %s: %s\n",
598 step->name, PQerrorMessage(conn));
602 /* Try to complete this step without blocking. */
603 mustwait = try_complete_step(step, STEP_NONBLOCK);
605 /* Check for completion of any steps that were previously waiting. */
610 if (try_complete_step(waiting[w], STEP_NONBLOCK | STEP_RETRY))
614 errorstep[nerrorstep++] = waiting[w];
615 if (w + 1 < nwaiting)
616 memmove(&waiting[w], &waiting[w + 1],
617 (nwaiting - (w + 1)) * sizeof(Step *));
622 /* Report any error from this step, and any steps that it unblocked. */
623 report_multiple_error_messages(step, nerrorstep, errorstep);
625 /* If this step is waiting, add it to the array of waiters. */
627 waiting[nwaiting++] = step;
630 /* Wait for any remaining queries. */
631 for (w = 0; w < nwaiting; ++w)
633 try_complete_step(waiting[w], STEP_RETRY);
634 report_error_message(waiting[w]);
637 /* Perform per-session teardown */
638 for (i = 0; i < testspec->nsessions; i++)
640 if (testspec->sessions[i]->teardownsql)
642 res = PQexec(conns[i + 1], testspec->sessions[i]->teardownsql);
643 if (PQresultStatus(res) == PGRES_TUPLES_OK)
647 else if (PQresultStatus(res) != PGRES_COMMAND_OK)
649 fprintf(stderr, "teardown of session %s failed: %s",
650 testspec->sessions[i]->name,
651 PQerrorMessage(conns[i + 1]));
652 /* don't exit on teardown failure */
658 /* Perform teardown */
659 if (testspec->teardownsql)
661 res = PQexec(conns[0], testspec->teardownsql);
662 if (PQresultStatus(res) == PGRES_TUPLES_OK)
666 else if (PQresultStatus(res) != PGRES_COMMAND_OK)
668 fprintf(stderr, "teardown failed: %s",
669 PQerrorMessage(conns[0]));
670 /* don't exit on teardown failure */
680 * Our caller already sent the query associated with this step. Wait for it
681 * to either complete or (if given the STEP_NONBLOCK flag) to block while
682 * waiting for a lock. We assume that any lock wait will persist until we
683 * have executed additional steps in the permutation.
685 * When calling this function on behalf of a given step for a second or later
686 * time, pass the STEP_RETRY flag. This only affects the messages printed.
688 * If the query returns an error, the message is saved in step->errormsg.
689 * Caller should call report_error_message shortly after this, to have it
690 * printed and cleared.
692 * If the STEP_NONBLOCK flag was specified and the query is waiting to acquire
693 * a lock, returns true. Otherwise, returns false.
696 try_complete_step(Step *step, int flags)
698 PGconn *conn = conns[1 + step->session];
700 struct timeval start_time;
701 struct timeval timeout;
702 int sock = PQsocket(conn);
705 bool canceled = false;
709 fprintf(stderr, "invalid socket: %s", PQerrorMessage(conn));
713 gettimeofday(&start_time, NULL);
716 while (PQisBusy(conn))
718 FD_SET(sock, &read_set);
720 timeout.tv_usec = 10000; /* Check for lock waits every 10ms. */
722 ret = select(sock + 1, &read_set, NULL, NULL, &timeout);
723 if (ret < 0) /* error in select() */
727 fprintf(stderr, "select failed: %s\n", strerror(errno));
730 else if (ret == 0) /* select() timeout: check for lock wait */
732 struct timeval current_time;
735 /* If it's OK for the step to block, check whether it has. */
736 if (flags & STEP_NONBLOCK)
740 res = PQexecPrepared(conns[0], PREP_WAITING, 1,
741 &backend_pids[step->session + 1],
743 if (PQresultStatus(res) != PGRES_TUPLES_OK ||
746 fprintf(stderr, "lock wait query failed: %s",
747 PQerrorMessage(conns[0]));
750 waiting = ((PQgetvalue(res, 0, 0))[0] == 't');
753 if (waiting) /* waiting to acquire a lock */
756 * Since it takes time to perform the lock-check query,
757 * some data --- notably, NOTICE messages --- might have
758 * arrived since we looked. We must call PQconsumeInput
759 * and then PQisBusy to collect and process any such
760 * messages. In the (unlikely) case that PQisBusy then
761 * returns false, we might as well go examine the
764 if (!PQconsumeInput(conn))
766 fprintf(stderr, "PQconsumeInput failed: %s\n",
767 PQerrorMessage(conn));
774 * conn is still busy, so conclude that the step really is
777 if (!(flags & STEP_RETRY))
778 printf("step %s: %s <waiting ...>\n",
779 step->name, step->sql);
782 /* else, not waiting */
785 /* Figure out how long we've been waiting for this step. */
786 gettimeofday(¤t_time, NULL);
787 td = (int64) current_time.tv_sec - (int64) start_time.tv_sec;
789 td += (int64) current_time.tv_usec - (int64) start_time.tv_usec;
792 * After 180 seconds, try to cancel the query.
794 * If the user tries to test an invalid permutation, we don't want
795 * to hang forever, especially when this is running in the
796 * buildfarm. This will presumably lead to this permutation
797 * failing, but remaining permutations and tests should still be
800 if (td > 180 * USECS_PER_SEC && !canceled)
802 PGcancel *cancel = PQgetCancel(conn);
808 if (PQcancel(cancel, buf, sizeof(buf)))
811 fprintf(stderr, "PQcancel failed: %s\n", buf);
812 PQfreeCancel(cancel);
817 * After 200 seconds, just give up and die.
819 * Since cleanup steps won't be run in this case, this may cause
820 * later tests to fail. That stinks, but it's better than waiting
821 * forever for the server to respond to the cancel.
823 if (td > 200 * USECS_PER_SEC)
825 fprintf(stderr, "step %s timed out after 200 seconds\n",
830 else if (!PQconsumeInput(conn)) /* select(): data available */
832 fprintf(stderr, "PQconsumeInput failed: %s\n",
833 PQerrorMessage(conn));
838 if (flags & STEP_RETRY)
839 printf("step %s: <... completed>\n", step->name);
841 printf("step %s: %s\n", step->name, step->sql);
843 while ((res = PQgetResult(conn)))
845 switch (PQresultStatus(res))
847 case PGRES_COMMAND_OK:
849 case PGRES_TUPLES_OK:
852 case PGRES_FATAL_ERROR:
853 if (step->errormsg != NULL)
855 printf("WARNING: this step had a leftover error message\n");
856 printf("%s\n", step->errormsg);
860 * Detail may contain XID values, so we want to just show
861 * primary. Beware however that libpq-generated error results
862 * may not contain subfields, only an old-style message.
865 const char *sev = PQresultErrorField(res,
867 const char *msg = PQresultErrorField(res,
868 PG_DIAG_MESSAGE_PRIMARY);
871 step->errormsg = psprintf("%s: %s", sev, msg);
873 step->errormsg = pg_strdup(PQresultErrorMessage(res));
877 printf("unexpected result status: %s\n",
878 PQresStatus(PQresultStatus(res)));
887 printResultSet(PGresult *res)
893 /* first, print out the attribute names */
894 nFields = PQnfields(res);
895 for (i = 0; i < nFields; i++)
896 printf("%-15s", PQfname(res, i));
899 /* next, print out the rows */
900 for (i = 0; i < PQntuples(res); i++)
902 for (j = 0; j < nFields; j++)
903 printf("%-15s", PQgetvalue(res, i, j));
908 /* notice processor, prefixes each message with the session name */
910 isotesterNoticeProcessor(void *arg, const char *message)
912 printf("%s: %s", (char *) arg, message);
915 /* notice processor, hides the message */
917 blackholeNoticeProcessor(void *arg, const char *message)