[master] 3c67da447 varnishtest: Make barriers reactive on error
Dridi Boukelmoune
dridi.boukelmoune at gmail.com
Wed Jun 23 07:00:09 UTC 2021
commit 3c67da44753cccbf5db4d2bef881abe5b7c817f4
Author: Dridi Boukelmoune <dridi.boukelmoune at gmail.com>
Date: Wed Jun 23 08:06:22 2021 +0200
varnishtest: Make barriers reactive on error
A failing test may take up to the vtc timeout to effectively fail if a
thread is waiting for a barrier to sync. Barriers now check every 100ms
whether the vtc is done to bail out.
The socket barriers were already polling for 1s, so now they poll for
100ms and leave the loop early when needed.
The condvar barriers now loop over a timedwait, but critical sections
were too wide. There were vtc_fatal() calls under barrier locks that
would defeat the timedwait loop.
Thus critical sections were more carefully laid, allowing multiple
barrier commands to run simultaneously. This also means that besides
initialization, socket barriers no longer touch the lock.
Consider the following diff:
--- bin/varnishtest/tests/v00056.vtc
+++ bin/varnishtest/tests/v00056.vtc
@@ -72,7 +72,7 @@ client c2 {
} -run
logexpect l2 -v v1 -g raw -q Backend_health {
- expect 0 0 Backend_health "default Went healthy"
+ expect 0 0 Backend_health "default Went wrong"
} -start
barrier b3 sync
With this change l2 would fail early, typically in less than 2s, but the
barriers stuck in a condwait would prevent further progress.
diff --git a/bin/varnishtest/vtc_barrier.c b/bin/varnishtest/vtc_barrier.c
index 3a83b771d..25e63bb86 100644
--- a/bin/varnishtest/vtc_barrier.c
+++ b/bin/varnishtest/vtc_barrier.c
@@ -41,6 +41,7 @@
#include "vtc.h"
#include "vtcp.h"
+#include "vtim.h"
#include "vsa.h"
enum barrier_e {
@@ -119,8 +120,10 @@ barrier_cond(struct barrier *b, const char *av, struct vtclog *vl)
{
CHECK_OBJ_NOTNULL(b, BARRIER_MAGIC);
+ AZ(pthread_mutex_lock(&b->mtx));
barrier_expect(b, av, vl);
b->type = BARRIER_COND;
+ AZ(pthread_mutex_unlock(&b->mtx));
}
static void *
@@ -170,11 +173,11 @@ barrier_sock_thread(void *priv)
conns = calloc(b->expected, sizeof *conns);
AN(conns);
- while (b->active) {
+ while (b->active && !vtc_stop && !vtc_error) {
pfd[0].fd = sock;
pfd[0].events = POLLIN;
- i = poll(pfd, 1, 1000);
+ i = poll(pfd, 1, 100);
if (i == 0)
continue;
if (i < 0) {
@@ -222,6 +225,19 @@ barrier_sock_thread(void *priv)
b->active = 0;
}
+ if (vtc_stop || vtc_error) {
+ AN(b->active);
+ /* wake up outstanding waiters */
+ for (i = 0; i < b->waiters; i++)
+ closefd(&conns[i]);
+ b->waiters = 0;
+ b->active = 0;
+ }
+
+ if (b->cyclic && b->waiters > 0)
+ vtc_fatal(vl, "Barrier(%s) has %u outstanding waiters",
+ b->name, b->waiters);
+
macro_undef(vl, b->name, "addr");
macro_undef(vl, b->name, "port");
macro_undef(vl, b->name, "sock");
@@ -237,6 +253,7 @@ barrier_sock(struct barrier *b, const char *av, struct vtclog *vl)
{
CHECK_OBJ_NOTNULL(b, BARRIER_MAGIC);
+ AZ(pthread_mutex_lock(&b->mtx));
barrier_expect(b, av, vl);
b->type = BARRIER_SOCK;
b->active = 1;
@@ -247,23 +264,33 @@ barrier_sock(struct barrier *b, const char *av, struct vtclog *vl)
*/
AZ(pthread_create(&b->thread, NULL, barrier_sock_thread, b));
AZ(pthread_cond_wait(&b->cond, &b->mtx));
+ AZ(pthread_mutex_unlock(&b->mtx));
}
static void
barrier_cyclic(struct barrier *b, struct vtclog *vl)
{
+ enum barrier_e t;
+ int w;
CHECK_OBJ_NOTNULL(b, BARRIER_MAGIC);
- if (b->type == BARRIER_NONE)
+ AZ(pthread_mutex_lock(&b->mtx));
+ t = b->type;
+ w = b->waiters;
+ AZ(pthread_mutex_unlock(&b->mtx));
+
+ if (t == BARRIER_NONE)
vtc_fatal(vl,
"Barrier(%s) use error: not initialized", b->name);
- if (b->waiters != 0)
+ if (w != 0)
vtc_fatal(vl,
"Barrier(%s) use error: already in use", b->name);
+ AZ(pthread_mutex_lock(&b->mtx));
b->cyclic = 1;
+ AZ(pthread_mutex_unlock(&b->mtx));
}
/**********************************************************************
@@ -273,17 +300,29 @@ barrier_cyclic(struct barrier *b, struct vtclog *vl)
static void
barrier_cond_sync(struct barrier *b, struct vtclog *vl)
{
+ struct timespec ts;
+ int r, w;
CHECK_OBJ_NOTNULL(b, BARRIER_MAGIC);
assert(b->type == BARRIER_COND);
- assert(b->waiters <= b->expected);
- if (b->waiters == b->expected)
+ AZ(pthread_mutex_lock(&b->mtx));
+ w = b->waiters;
+ assert(w <= b->expected);
+
+ if (w == b->expected)
+ w = -1;
+ else
+ b->waiters = ++w;
+ AZ(pthread_mutex_unlock(&b->mtx));
+
+ if (w < 0)
vtc_fatal(vl,
"Barrier(%s) use error: more waiters than the %u expected",
b->name, b->expected);
- if (++b->waiters == b->expected) {
+ AZ(pthread_mutex_lock(&b->mtx));
+ if (w == b->expected) {
vtc_log(vl, 4, "Barrier(%s) wake %u", b->name, b->expected);
if (b->cyclic)
b->waiters = 0;
@@ -291,8 +330,13 @@ barrier_cond_sync(struct barrier *b, struct vtclog *vl)
} else {
vtc_log(vl, 4, "Barrier(%s) wait %u of %u",
b->name, b->waiters, b->expected);
- AZ(pthread_cond_wait(&b->cond, &b->mtx));
+ do {
+ ts = VTIM_timespec(VTIM_real() + .1);
+ r = pthread_cond_timedwait(&b->cond, &b->mtx, &ts);
+ assert(r == 0 || r == ETIMEDOUT);
+ } while (!vtc_stop && !vtc_error && r == ETIMEDOUT);
}
+ AZ(pthread_mutex_unlock(&b->mtx));
}
static void
@@ -317,11 +361,7 @@ barrier_sock_sync(struct barrier *b, struct vtclog *vl)
VSB_destroy(&vsb);
- /* emulate pthread_cond_wait's behavior */
- AZ(pthread_mutex_unlock(&b->mtx));
sz = read(sock, buf, sizeof buf); /* XXX loop with timeout? */
- AZ(pthread_mutex_lock(&b->mtx));
-
i = errno;
closefd(&sock);
@@ -420,7 +460,6 @@ cmd_barrier(CMD_ARGS)
break;
case BARRIER_SOCK:
if (b->need_join) {
- b->active = 0;
AZ(pthread_join(b->thread, NULL));
b->need_join = 0;
}
@@ -445,7 +484,6 @@ cmd_barrier(CMD_ARGS)
b = barrier_new(av[0], vl);
av++;
- AZ(pthread_mutex_lock(&b->mtx));
for (; *av != NULL; av++) {
if (!strcmp(*av, "cond")) {
av++;
@@ -469,5 +507,4 @@ cmd_barrier(CMD_ARGS)
}
vtc_fatal(vl, "Unknown barrier argument: %s", *av);
}
- AZ(pthread_mutex_unlock(&b->mtx));
}
More information about the varnish-commit
mailing list