When connecting to backends, respect the administrative health

When making a connection to a "real" backend (VBE), we checked the
probed health state but did not take into account the administrative
health state as set with `varnishadm backend.set_health ... {healthy,sick}`.
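
For illustration, using the CLI syntax documented further below (the
backend name here is made up):

    varnishadm backend.set_health boot.default sick     # force unusable
    varnishadm backend.set_health boot.default healthy  # force usable
    varnishadm backend.set_health boot.default auto     # follow the probe again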

Our documentation was not particularly explicit on this aspect either,
yet the administrative states `sick` and `healthy` would make no sense
if `auto` semantics were always implied. Also, their semantics were
only documented implicitly, under `backend.list`.

Implementation note:

The relevant change is to call `VRT_Healthy()`, which does check the
administrative health, in place of checking `(struct backend *)->sick`
in `vbe_dir_getfd()`.
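
To make the intended semantics concrete, here is a minimal sketch of
the decision now delegated to `VRT_Healthy()` (illustration only, not
the actual implementation; the enum and function names are made up):

    enum admin_health { AH_AUTO, AH_HEALTHY, AH_SICK };

    /* Effective health: the administrative state, unless it is
     * "auto", overrides whatever the probe has determined. */
    static int
    effective_healthy(enum admin_health admin, int probe_ok)
    {
        switch (admin) {
        case AH_HEALTHY:
            return (1);        /* forced usable, probe ignored */
        case AH_SICK:
            return (0);        /* forced unusable, probe ignored */
        default:
            return (probe_ok); /* auto: follow the probe */
        }
    }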

As a `VRT_CTX` is required by `VRT_Healthy()`, we change the arguments of
`vbe_dir_getfd()` accordingly: The busyobj can now be taken from the ctx,
but the worker argument differs for pipe mode vs. fetch, so we preserve
it as an explicit argument.

A test for overriding a probed backend as healthy has been added to
c00048.vtc; it requires a second probe to hit server s1 and fail.
This is timing sensitive, so I hope that the backend probe interval
of 5 seconds is long enough for all our test environments. If not,
we probably need to make it longer or add another vtc.
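
For the record, the probe arithmetic behind that (assuming the usual
window/threshold semantics): with .window = 8 and .initial = 7, the
first successful probe brings the good-probe count in the window to 8,
meeting .threshold = 8, so the backend starts out healthy; any single
failed probe then drops the count to 7 < 8 and the probed state goes
sick, which is exactly what the forced `healthy` setting must override.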

Fixes #3299
parent 448e2edc
@@ -110,12 +110,15 @@ VBE_Connect_Error(struct VSC_vbe *vsc, int err)
 /*--------------------------------------------------------------------
  * Get a connection to the backend
+ *
+ * note: wrk is a separate argument because it differs for pipe vs. fetch
  */
 static struct pfd *
-vbe_dir_getfd(struct worker *wrk, struct backend *bp, struct busyobj *bo,
+vbe_dir_getfd(VRT_CTX, struct worker *wrk, struct backend *bp,
     unsigned force_fresh)
 {
+	struct busyobj *bo;
 	struct pfd *pfd;
 	int *fdp, err;
 	vtim_dur tmod;
@@ -123,11 +126,12 @@ vbe_dir_getfd(struct worker *wrk, struct backend *bp, struct busyobj *bo,
 	char pbuf1[VTCP_PORTBUFSIZE], pbuf2[VTCP_PORTBUFSIZE];

 	CHECK_OBJ_NOTNULL(wrk, WORKER_MAGIC);
-	CHECK_OBJ_NOTNULL(bo, BUSYOBJ_MAGIC);
+	CHECK_OBJ_NOTNULL(ctx->bo, BUSYOBJ_MAGIC);
+	bo = ctx->bo;
 	CHECK_OBJ_NOTNULL(bp, BACKEND_MAGIC);
 	AN(bp->vsc);

-	if (bp->sick) {
+	if (! VRT_Healthy(ctx, bp->director, NULL)) {
 		VSLb(bo->vsl, SLT_FetchError,
 		    "backend %s: unhealthy", VRT_BACKEND_string(bp->director));
 		bp->vsc->unhealthy++;
@@ -280,7 +284,7 @@ vbe_dir_gethdrs(VRT_CTX, VCL_BACKEND d)
 	http_PrintfHeader(bo->bereq, "Host: %s", bp->hosthdr);

 	do {
-		pfd = vbe_dir_getfd(wrk, bp, bo, extrachance == 0 ? 1 : 0);
+		pfd = vbe_dir_getfd(ctx, wrk, bp, extrachance == 0 ? 1 : 0);
 		if (pfd == NULL)
 			return (-1);
 		AN(bo->htc);
@@ -365,7 +369,7 @@ vbe_dir_http1pipe(VRT_CTX, VCL_BACKEND d)
 	ctx->req->res_mode = RES_PIPE;

-	pfd = vbe_dir_getfd(ctx->req->wrk, bp, ctx->bo, 0);
+	pfd = vbe_dir_getfd(ctx, ctx->req->wrk, bp, 0);
 	if (pfd == NULL) {
 		retval = SC_TX_ERROR;

c00048.vtc:

varnishtest "Forcing health of backends"
server s1 -repeat 3 {
barrier b1 cond 2
server s1 {
# probe
rxreq
txresp
# req
accept
rxreq
txresp
rxreq
txresp -hdr "Connection: close"
# probe sick
accept
rxreq
txresp -status 500
barrier b1 sync
accept
# req
rxreq
txresp
} -start
@@ -13,7 +34,7 @@ varnish v1 -vcl {
 		.window = 8;
 		.initial = 7;
 		.threshold = 8;
-		.interval = 10s;
+		.interval = 5s;
 	}
 }
@@ -60,6 +81,18 @@ client c1 {
 	expect resp.status == 200
 } -run

+# wait for sick probe
+barrier b1 sync
+
+# healthy overrides probe
+varnish v1 -cliok "backend.list"
+
+client c1 {
+	txreq
+	rxresp
+	expect resp.status == 200
+} -run
+
 varnish v1 -vsl_catchup
 varnish v1 -clierr 106 "backend.set_health s1 foo"
@@ -312,8 +312,11 @@ CLI_CMD(BACKEND_LIST,
 CLI_CMD(BACKEND_SET_HEALTH,
 	"backend.set_health",
 	"backend.set_health <backend_pattern> [auto|healthy|sick]",
-	"Set health status on the backends.",
-	"",
+	"Set health status of backend(s) matching <backend_pattern>.",
+	" * With ``auto``, the health status is determined by a probe\n"
+	"   or some other dynamic mechanism, if any\n"
+	" * ``healthy`` sets the backend as usable\n"
+	" * ``sick`` sets the backend as unusable\n",
 	2, 2
 )