[master] fe5e82650 When connecting to backends, respect the administrative health

Tue Apr 28 16:52:08 UTC 2020

commit fe5e82650788c1ce3dfbd29c8eeacf2b78c04ff4
Author: Nils Goroll <nils.goroll at uplex.de>
Date:   Tue Apr 28 18:06:38 2020 +0200

    When connecting to backends, respect the administrative health
    
    When making a connection a "real" backend (VBE), we checked the probed
    health state and did not take into account the administrative health
    state as set with `varnishadm backend.set_health ... {healthy,sick}`.
    
    Our documentation was not particularly explicit on this aspect either,
    but the administrative states `sick` and `healthy` made no sense if
    `auto` semantics was implied always. Also, the semantics were implicitly
    documented for `backend.list`.
    
    Implementation note:
    
    The relevant change is to call `VRT_Healthy()`, which does check the
    administrative health, in place of checking `(struct backend *)->sick`
    in `vbe_dir_getfd()`.
    
    As a `VRT_CTX` is required by `VRT_Healthy()`, we change the arguments of
    `vbe_dir_getfd()` accordingly: The busyobj can now be taken from the ctx,
    but the worker argument differs for pipe mode vs. fetch, so we preserve
    it as an explicit argument.
    
    A test for overriding a probed backend as healthy has been added to
    c00048.vtc, which requires a second probe to hit server s1 and fail.
    This is timing sensitive, so I hope that the backend probe interval
    of 5 seconds is long enough for all our test environments. If not,
    we probably need to make it longer or add another vtc.
    
    Fixes #3299

diff --git a/bin/varnishd/cache/cache_backend.c b/bin/varnishd/cache/cache_backend.c
index 9cecd54a3..5c1f5b58d 100644
--- a/bin/varnishd/cache/cache_backend.c
+++ b/bin/varnishd/cache/cache_backend.c
@@ -110,12 +110,15 @@ VBE_Connect_Error(struct VSC_vbe *vsc, int err)
 
 /*--------------------------------------------------------------------
  * Get a connection to the backend
+ *
+ * note: wrk is a separate argument because it differs for pipe vs. fetch
  */
 
 static struct pfd *
-vbe_dir_getfd(struct worker *wrk, struct backend *bp, struct busyobj *bo,
+vbe_dir_getfd(VRT_CTX, struct worker *wrk, struct backend *bp,
     unsigned force_fresh)
 {
+	struct busyobj *bo;
 	struct pfd *pfd;
 	int *fdp, err;
 	vtim_dur tmod;
@@ -123,11 +126,12 @@ vbe_dir_getfd(struct worker *wrk, struct backend *bp, struct busyobj *bo,
 	char pbuf1[VTCP_PORTBUFSIZE], pbuf2[VTCP_PORTBUFSIZE];
 
 	CHECK_OBJ_NOTNULL(wrk, WORKER_MAGIC);
-	CHECK_OBJ_NOTNULL(bo, BUSYOBJ_MAGIC);
+	CHECK_OBJ_NOTNULL(ctx->bo, BUSYOBJ_MAGIC);
+	bo = ctx->bo;
 	CHECK_OBJ_NOTNULL(bp, BACKEND_MAGIC);
 	AN(bp->vsc);
 
-	if (bp->sick) {
+	if (! VRT_Healthy(ctx, bp->director, NULL)) {
 		VSLb(bo->vsl, SLT_FetchError,
 		     "backend %s: unhealthy", VRT_BACKEND_string(bp->director));
 		bp->vsc->unhealthy++;
@@ -280,7 +284,7 @@ vbe_dir_gethdrs(VRT_CTX, VCL_BACKEND d)
 		http_PrintfHeader(bo->bereq, "Host: %s", bp->hosthdr);
 
 	do {
-		pfd = vbe_dir_getfd(wrk, bp, bo, extrachance == 0 ? 1 : 0);
+		pfd = vbe_dir_getfd(ctx, wrk, bp, extrachance == 0 ? 1 : 0);
 		if (pfd == NULL)
 			return (-1);
 		AN(bo->htc);
@@ -365,7 +369,7 @@ vbe_dir_http1pipe(VRT_CTX, VCL_BACKEND d)
 
 	ctx->req->res_mode = RES_PIPE;
 
-	pfd = vbe_dir_getfd(ctx->req->wrk, bp, ctx->bo, 0);
+	pfd = vbe_dir_getfd(ctx, ctx->req->wrk, bp, 0);
 
 	if (pfd == NULL) {
 		retval = SC_TX_ERROR;
diff --git a/bin/varnishtest/tests/c00048.vtc b/bin/varnishtest/tests/c00048.vtc
index 161c0d502..a5739503b 100644
--- a/bin/varnishtest/tests/c00048.vtc
+++ b/bin/varnishtest/tests/c00048.vtc
@@ -1,6 +1,27 @@
 varnishtest "Forcing health of backends"
 
-server s1 -repeat 3 {
+barrier b1 cond 2
+
+server s1 {
+	# probe
+	rxreq
+	txresp
+
+	# req
+	accept
+	rxreq
+	txresp
+	rxreq
+	txresp -hdr "Connection: close"
+
+	# probe sick
+	accept
+	rxreq
+	txresp -status 500
+	barrier b1 sync
+
+	accept
+	# req
 	rxreq
 	txresp
 } -start
@@ -13,7 +34,7 @@ varnish v1 -vcl {
 			.window = 8;
 			.initial = 7;
 			.threshold = 8;
-			.interval = 10s;
+			.interval = 5s;
 		}
 	}
 
@@ -60,6 +81,18 @@ client c1 {
 	expect resp.status == 200
 } -run
 
+# wait for sick probe
+barrier b1 sync
+
+# healthy overrides probe
+varnish v1 -cliok "backend.list"
+
+client c1 {
+	txreq
+	rxresp
+	expect resp.status == 200
+} -run
+
 varnish v1 -vsl_catchup
 
 varnish v1 -clierr 106 "backend.set_health s1 foo"
diff --git a/include/tbl/cli_cmds.h b/include/tbl/cli_cmds.h
index 84b600f80..3e737c641 100644
--- a/include/tbl/cli_cmds.h
+++ b/include/tbl/cli_cmds.h
@@ -312,8 +312,11 @@ CLI_CMD(BACKEND_LIST,
 CLI_CMD(BACKEND_SET_HEALTH,
 	"backend.set_health",
 	"backend.set_health <backend_pattern> [auto|healthy|sick]",
-	"Set health status on the backends.",
-	"",
+	"Set health status of backend(s) matching <backend_pattern>.",
+	"  * With ``auto``, the health status is determined by a probe\n"
+	"    or some other dynamic mechanism, if any\n"
+	"  * ``healthy`` sets the backend as usable\n"
+	"  * ``sick`` sets the backend as unsable\n",
 	2, 2
 )