From 38f8bdcac4982215beb9f65a19debecaf22fd470 Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Thu, 11 Feb 2016 08:36:30 -0500 Subject: [PATCH] Modify the isolation tester so that multiple sessions can wait. This allows testing of deadlock scenarios. Scenarios that would previously have been considered invalid are now simply taken as a scenario in which more than one backend will wait. --- src/test/isolation/README | 9 +- src/test/isolation/isolationtester.c | 330 +++++++++++++++------------ 2 files changed, 194 insertions(+), 145 deletions(-) diff --git a/src/test/isolation/README b/src/test/isolation/README index 490be8e615..bea278a856 100644 --- a/src/test/isolation/README +++ b/src/test/isolation/README @@ -107,10 +107,11 @@ Each step may contain commands that block until further action has been taken (most likely, some other session runs a step that unblocks it or causes a deadlock). A test that uses this ability must manually specify valid permutations, i.e. those that would not expect a blocked session to execute a -command. If the test fails to follow that rule, the test is aborted. - -Currently, at most one step can be waiting at a time. As long as one -step is waiting, subsequent steps are run to completion synchronously. +command. If a test fails to follow that rule, isolationtester will cancel it +after 60 seconds. If the cancel doesn't work, isolationtester will exit +uncleanly after a total of 75 seconds of wait time. Testing invalid +permutations should be avoided because they can make the isolation tests take +a very long time to run, and they serve no useful testing purpose. Note that isolationtester recognizes that a command has blocked by looking to see if it is shown as waiting in the pg_locks view; therefore, only diff --git a/src/test/isolation/isolationtester.c b/src/test/isolation/isolationtester.c index ab4150143f..0d48cdbdac 100644 --- a/src/test/isolation/isolationtester.c +++ b/src/test/isolation/isolationtester.c @@ -15,6 +15,7 @@ #include #endif +#include "datatype/timestamp.h" #include "libpq-fe.h" #include "pqexpbuffer.h" #include "pg_getopt.h" @@ -237,54 +238,6 @@ main(int argc, char **argv) appendPQExpBufferStr(&wait_query, ") " - "AND holder.mode = ANY (CASE waiter.mode " - "WHEN 'AccessShareLock' THEN ARRAY[" - "'AccessExclusiveLock'] " - "WHEN 'RowShareLock' THEN ARRAY[" - "'ExclusiveLock'," - "'AccessExclusiveLock'] " - "WHEN 'RowExclusiveLock' THEN ARRAY[" - "'ShareLock'," - "'ShareRowExclusiveLock'," - "'ExclusiveLock'," - "'AccessExclusiveLock'] " - "WHEN 'ShareUpdateExclusiveLock' THEN ARRAY[" - "'ShareUpdateExclusiveLock'," - "'ShareLock'," - "'ShareRowExclusiveLock'," - "'ExclusiveLock'," - "'AccessExclusiveLock'] " - "WHEN 'ShareLock' THEN ARRAY[" - "'RowExclusiveLock'," - "'ShareUpdateExclusiveLock'," - "'ShareRowExclusiveLock'," - "'ExclusiveLock'," - "'AccessExclusiveLock'] " - "WHEN 'ShareRowExclusiveLock' THEN ARRAY[" - "'RowExclusiveLock'," - "'ShareUpdateExclusiveLock'," - "'ShareLock'," - "'ShareRowExclusiveLock'," - "'ExclusiveLock'," - "'AccessExclusiveLock'] " - "WHEN 'ExclusiveLock' THEN ARRAY[" - "'RowShareLock'," - "'RowExclusiveLock'," - "'ShareUpdateExclusiveLock'," - "'ShareLock'," - "'ShareRowExclusiveLock'," - "'ExclusiveLock'," - "'AccessExclusiveLock'] " - "WHEN 'AccessExclusiveLock' THEN ARRAY[" - "'AccessShareLock'," - "'RowShareLock'," - "'RowExclusiveLock'," - "'ShareUpdateExclusiveLock'," - "'ShareLock'," - "'ShareRowExclusiveLock'," - "'ExclusiveLock'," - "'AccessExclusiveLock'] END) " - "AND holder.locktype IS NOT DISTINCT FROM waiter.locktype " "AND holder.database IS NOT DISTINCT FROM waiter.database " "AND holder.relation IS NOT DISTINCT FROM waiter.relation " @@ -467,34 +420,48 @@ report_error_message(Step *step) } /* - * As above, but reports messages possibly emitted by two steps. This is + * As above, but reports messages possibly emitted by multiple steps. This is * useful when we have a blocked command awakened by another one; we want to - * report both messages identically, for the case where we don't care which + * report all messages identically, for the case where we don't care which * one fails due to a timeout such as deadlock timeout. */ static void -report_two_error_messages(Step *step1, Step *step2) +report_multiple_error_messages(Step *step, int nextra, Step **extrastep) { - char *prefix; - - prefix = psprintf("%s %s", step1->name, step2->name); + PQExpBufferData buffer; + int n; - if (step1->errormsg) + if (nextra == 0) { - fprintf(stdout, "error in steps %s: %s\n", prefix, - step1->errormsg); - free(step1->errormsg); - step1->errormsg = NULL; + report_error_message(step); + return; } - if (step2->errormsg) + + initPQExpBuffer(&buffer); + appendPQExpBufferStr(&buffer, step->name); + + for (n = 0; n < nextra; ++n) + appendPQExpBuffer(&buffer, " %s", extrastep[n]->name); + + if (step->errormsg) { - fprintf(stdout, "error in steps %s: %s\n", prefix, - step2->errormsg); - free(step2->errormsg); - step2->errormsg = NULL; + fprintf(stdout, "error in steps %s: %s\n", buffer.data, + step->errormsg); + free(step->errormsg); + step->errormsg = NULL; } - free(prefix); + for (n = 0; n < nextra; ++n) + { + if (extrastep[n]->errormsg == NULL) + continue; + fprintf(stdout, "error in steps %s: %s\n", + buffer.data, extrastep[n]->errormsg); + free(extrastep[n]->errormsg); + extrastep[n]->errormsg = NULL; + } + + termPQExpBuffer(&buffer); } /* @@ -505,7 +472,14 @@ run_permutation(TestSpec *testspec, int nsteps, Step **steps) { PGresult *res; int i; - Step *waiting = NULL; + int w; + int nwaiting = 0; + int nerrorstep = 0; + Step **waiting; + Step **errorstep; + + waiting = malloc(sizeof(Step *) * testspec->nsessions); + errorstep = malloc(sizeof(Step *) * testspec->nsessions); /* * In dry run mode, just display the permutation in the same format used @@ -566,54 +540,76 @@ run_permutation(TestSpec *testspec, int nsteps, Step **steps) for (i = 0; i < nsteps; i++) { Step *step = steps[i]; + Step *oldstep = NULL; PGconn *conn = conns[1 + step->session]; + bool mustwait; - if (waiting != NULL && step->session == waiting->session) + /* + * Check whether the session that needs to perform the next step + * is still blocked on an earlier step. If so, wait for it to finish. + * + * In older versions of this tool, when allowed precisely one session + * to be waiting at a time. If we reached a step which required that + * session to execute the next command, we would declare the whole + * permutation invalid, cancel everything, and move on to the next one. + * Unfortunately, that made it impossible to test the deadlock detector + * using this framework unless the numebr of processes involved in the + * deadlock was precisely two. We now assume that if we reach a step + * that is still blocked, we need to wait for it to unblock itself. + */ + for (w = 0; w < nwaiting; ++w) { - PGcancel *cancel; - PGresult *res; - int j; - - /* - * This permutation is invalid: it can never happen in real life. - * - * A session is blocked on an earlier step (waiting) and no - * further steps from this session can run until it is unblocked, - * but it can only be unblocked by running steps from other - * sessions. - */ - fprintf(stderr, "invalid permutation detected\n"); - - /* Cancel the waiting statement from this session. */ - cancel = PQgetCancel(conn); - if (cancel != NULL) + if (step->session == waiting[w]->session) { - char buf[256]; - - if (!PQcancel(cancel, buf, sizeof(buf))) - fprintf(stderr, "PQcancel failed: %s\n", buf); - - /* Be sure to consume the error message. */ - while ((res = PQgetResult(conn)) != NULL) - PQclear(res); - - PQfreeCancel(cancel); + oldstep = waiting[w]; + break; } + } + if (oldstep != NULL) + { + /* Wait for previous step on this connection. */ + try_complete_step(oldstep, STEP_RETRY); /* - * Now we really have to complete all the running transactions to - * make sure teardown doesn't block. + * Attempt to complete any steps that were previously waiting. + * Remove any that have completed, and also remove the one for + * which we just waited. */ - for (j = 1; j < nconns; j++) + w = 0; + nerrorstep = 0; + while (w < nwaiting) { - res = PQexec(conns[j], "ROLLBACK"); - if (res != NULL) - PQclear(res); + if (waiting[w] == oldstep) + { + /* We just waited for this; remove it. */ + if (w + 1 < nwaiting) + memcpy(&waiting[w], &waiting[w + 1], + (nwaiting - (w + 1)) * sizeof(Step *)); + nwaiting--; + continue; + } + + if (try_complete_step(waiting[w], STEP_NONBLOCK | STEP_RETRY)) + { + /* Still blocked on a lock, leave it alone. */ + w++; + } + else + { + /* This one finished, too! */ + errorstep[nerrorstep++] = waiting[w]; + if (w + 1 < nwaiting) + memcpy(&waiting[w], &waiting[w + 1], + (nwaiting - (w + 1)) * sizeof(Step *)); + nwaiting--; + } } - goto teardown; + /* Report all errors together. */ + report_multiple_error_messages(oldstep, nerrorstep, errorstep); } + /* Send the query for this step. */ if (!PQsendQuery(conn, step->sql)) { fprintf(stdout, "failed to send query for step %s: %s\n", @@ -621,39 +617,41 @@ run_permutation(TestSpec *testspec, int nsteps, Step **steps) exit_nicely(); } - if (waiting != NULL) - { - /* Some other step is already waiting: just block. */ - try_complete_step(step, 0); + /* Try to complete this step without blocking. */ + mustwait = try_complete_step(step, STEP_NONBLOCK); - /* - * See if this step unblocked the waiting step; report both error - * messages together if so. - */ - if (!try_complete_step(waiting, STEP_NONBLOCK | STEP_RETRY)) + /* Attempt to complete any steps that were previously waiting. */ + w = 0; + nerrorstep = 0; + while (w < nwaiting) + { + if (try_complete_step(waiting[w], STEP_NONBLOCK | STEP_RETRY)) + w++; + else { - report_two_error_messages(step, waiting); - waiting = NULL; + errorstep[nerrorstep++] = waiting[w]; + if (w + 1 < nwaiting) + memcpy(&waiting[w], &waiting[w + 1], + (nwaiting - (w + 1)) * sizeof(Step *)); + nwaiting--; } - else - report_error_message(step); - } - else - { - if (try_complete_step(step, STEP_NONBLOCK)) - waiting = step; - report_error_message(step); } + + /* Report any error from this step, and any steps that it unblocked. */ + report_multiple_error_messages(step, nerrorstep, errorstep); + + /* If this step is waiting, add it to the array of waiters. */ + if (mustwait) + waiting[nwaiting++] = step; } /* Finish any waiting query. */ - if (waiting != NULL) + for (w = 0; w < nwaiting; ++w) { - try_complete_step(waiting, STEP_RETRY); - report_error_message(waiting); + try_complete_step(waiting[w], STEP_RETRY); + report_error_message(waiting[w]); } -teardown: /* Perform per-session teardown */ for (i = 0; i < testspec->nsessions; i++) { @@ -714,14 +712,17 @@ try_complete_step(Step *step, int flags) { PGconn *conn = conns[1 + step->session]; fd_set read_set; + struct timeval start_time; struct timeval timeout; int sock = PQsocket(conn); int ret; PGresult *res; + bool canceled = false; + gettimeofday(&start_time, NULL); FD_ZERO(&read_set); - while ((flags & STEP_NONBLOCK) && PQisBusy(conn)) + while (PQisBusy(conn)) { FD_SET(sock, &read_set); timeout.tv_sec = 0; @@ -737,28 +738,75 @@ try_complete_step(Step *step, int flags) } else if (ret == 0) /* select() timeout: check for lock wait */ { - int ntuples; + struct timeval current_time; + int64 td; - res = PQexecPrepared(conns[0], PREP_WAITING, 1, - &backend_pids[step->session + 1], - NULL, NULL, 0); - if (PQresultStatus(res) != PGRES_TUPLES_OK) + /* If it's OK for the step to block, check whether it has. */ + if (flags & STEP_NONBLOCK) { - fprintf(stderr, "lock wait query failed: %s", - PQerrorMessage(conn)); - exit_nicely(); + int ntuples; + + res = PQexecPrepared(conns[0], PREP_WAITING, 1, + &backend_pids[step->session + 1], + NULL, NULL, 0); + if (PQresultStatus(res) != PGRES_TUPLES_OK) + { + fprintf(stderr, "lock wait query failed: %s", + PQerrorMessage(conn)); + exit_nicely(); + } + ntuples = PQntuples(res); + PQclear(res); + + if (ntuples >= 1) /* waiting to acquire a lock */ + { + if (!(flags & STEP_RETRY)) + printf("step %s: %s \n", + step->name, step->sql); + return true; + } + /* else, not waiting */ } - ntuples = PQntuples(res); - PQclear(res); - if (ntuples >= 1) /* waiting to acquire a lock */ + /* Figure out how long we've been waiting for this step. */ + gettimeofday(¤t_time, NULL); + td = (int64) current_time.tv_sec - (int64) start_time.tv_sec; + td *= USECS_PER_SEC; + td += (int64) current_time.tv_usec - (int64) start_time.tv_usec; + + /* + * After 60 seconds, try to cancel the query. + * + * If the user tries to test an invalid permutation, we don't + * want to hang forever, especially when this is running in the + * buildfarm. So try to cancel it after a minute. This will + * presumably lead to this permutation failing, but remaining + * permutations and tests should still be OK. + */ + if (td > 60 * USECS_PER_SEC && !canceled) { - if (!(flags & STEP_RETRY)) - printf("step %s: %s \n", - step->name, step->sql); - return true; + PGcancel *cancel = PQgetCancel(conn); + char buf[256]; + + if (cancel != NULL && !PQcancel(cancel, buf, sizeof(buf))) + fprintf(stderr, "PQcancel failed: %s\n", buf); + if (cancel != NULL) + PQfreeCancel(cancel); + } + + /* + * After 75 seconds, just give up and die. + * + * Since cleanup steps won't be run in this case, this may cause + * later tests to fail. That stinks, but it's better than waiting + * forever for the server to respond to the cancel. + */ + if (td > 75 * USECS_PER_SEC) + { + fprintf(stderr, "step %s timed out after 75 seconds\n", + step->name); + exit_nicely(); } - /* else, not waiting: give it more time */ } else if (!PQconsumeInput(conn)) /* select(): data available */ { -- 2.40.0