Commit 4db967e7 authored by Poul-Henning Kamp, committed by Dridi Boukelmoune

Add a watchdog to worker pools.

If the worker pool is configured too small, it can deadlock.

Recovering from this would require a lot of complicated code, to
discover queued but unscheduled tasks which can be cancelled
(because the client went away or otherwise) and code to do the
cancelling etc. etc.

But fundamentally either people configured their pools wrong, in
which case we want them to notice, or they are under DoS, in which
case recovering gracefully is unlikely to be a major improvement over
a restart.

Instead we implement a per-pool watchdog and kill the child process
if nothing has been dequeued for too long.

Default value 10 seconds, open to discussion.

Band-aid for:	#2418

Test-case by:	@Dridi
parent a2357876
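
Before the diff itself, here is a minimal self-contained sketch of the progress-watchdog idea the message describes. It assumes a hypothetical pool_state struct and helper names (now, watchdog_check) that are illustrative only and not taken from the Varnish tree:

#include <assert.h>
#include <stdint.h>
#include <time.h>

/* Hypothetical pool state: the queue length plus a counter that is
 * bumped every time a task is dequeued; only the fact that it changes
 * matters, not the direction of the change. */
struct pool_state {
	unsigned	lqueue;		/* tasks waiting in the queue */
	uintmax_t	ndequeued;	/* changes on every dequeue */
};

static double
now(void)
{
	struct timespec ts;

	assert(clock_gettime(CLOCK_REALTIME, &ts) == 0);
	return (ts.tv_sec + 1e-9 * ts.tv_nsec);
}

/* Called periodically from the herder loop.  *seen and *since remember
 * the counter value and the wall-clock time it was last observed to
 * change.  If the queue is non-empty and the counter has not moved for
 * longer than timeout seconds, give up; the real code does the same via
 * WRONG(), which panics the child so the manager can restart it. */
static void
watchdog_check(const struct pool_state *pp, uintmax_t *seen,
    double *since, double timeout)
{
	if (*seen != pp->ndequeued) {
		*seen = pp->ndequeued;
		*since = now();
	} else if (pp->lqueue > 0 && now() - *since > timeout)
		assert(!"worker pool queue does not move");
}

In the actual change below, the equivalent of *seen and *since are the dq and dqt locals in pool_herder(), using VTIM_real() for the clock and WRONG() for the panic.
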
......@@ -53,6 +53,7 @@ struct pool {
uintmax_t sdropped;
uintmax_t rdropped;
uintmax_t nqueued;
uintmax_t ndequeued;
struct VSC_main_wrk *a_stat;
struct VSC_main_wrk *b_stat;
......
......@@ -341,6 +341,7 @@ Pool_Work_Thread(struct pool *pp, struct worker *wrk)
tp = VTAILQ_FIRST(&pp->queues[i]);
if (tp != NULL) {
pp->lqueue--;
pp->ndequeued--;
VTAILQ_REMOVE(&pp->queues[i], tp, list);
break;
}
......@@ -487,6 +488,8 @@ pool_herder(void *priv)
struct worker *wrk;
double delay;
int wthread_min;
uintmax_t dq = (1ULL << 31);
double dqt;
CAST_OBJ_NOTNULL(pp, priv, POOL_MAGIC);
......@@ -494,6 +497,29 @@ pool_herder(void *priv)
THR_Init();
while (!pp->die || pp->nthr > 0) {
/*
* If the worker pool is configured too small, we can
* end up deadlocking it (see #2418 for details).
*
* Recovering from this would require a lot of complicated
* code, and fundamentally, either people configured their
* pools wrong, in which case we want them to notice, or
* they are under DoS, in which case recovering gracefully
* is unlikely to be a major improvement.
*
* Instead we implement a watchdog and kill the worker process
* if nothing has been dequeued for longer than the
* thread_pool_watchdog timeout.
*/
if (dq != pp->ndequeued) {
dq = pp->ndequeued;
dqt = VTIM_real();
} else if (pp->lqueue &&
VTIM_real() - dqt > cache_param->wthread_watchdog) {
VSL(SLT_Error, 0,
"Pool Herder: Queue does not move ql=%u dt=%f",
pp->lqueue, VTIM_real() - dqt);
WRONG("Worker Pool Queue does not move");
}
wthread_min = cache_param->wthread_min;
if (pp->die)
wthread_min = 0;
......
......@@ -105,6 +105,7 @@ struct params {
double wthread_add_delay;
double wthread_fail_delay;
double wthread_destroy_delay;
double wthread_watchdog;
unsigned wthread_stats_rate;
ssize_t wthread_stacksize;
unsigned wthread_queue_limit;
......
......@@ -148,6 +148,15 @@ struct parspec WRK_parspec[] = {
"for at least this long, will be destroyed.",
EXPERIMENTAL | DELAYED_EFFECT,
"300", "seconds" },
{ "thread_pool_watchdog",
tweak_timeout, &mgt_param.wthread_watchdog,
"0.1", NULL,
"Thread queue stuck watchdog.\n"
"\n"
"If no queued work have been released for this long,"
" the worker process panics itself.",
EXPERIMENTAL,
"10", "seconds" },
{ "thread_pool_destroy_delay",
tweak_timeout, &mgt_param.wthread_destroy_delay,
"0.01", NULL,
......
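
For reference, the new knob is a timeout parameter like the other thread pool delays and can be adjusted at runtime through the CLI, for example (the value here is chosen only for illustration):

	varnishadm param.set thread_pool_watchdog 60

The test case below sets it to 5 seconds so the panic triggers quickly.
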
varnishtest "h2 queuing deadlock"
barrier b1 cond 2
# A reserve of 1 thread in a pool of 3 leaves a maximum
# of 2 running sessions; the streams will be queued (except
# stream 0, which is part of the h2 session).
varnish v1 -cliok "param.set thread_pools 1"
varnish v1 -cliok "param.set thread_pool_min 3"
varnish v1 -cliok "param.set thread_pool_max 3"
varnish v1 -cliok "param.set thread_pool_reserve 1"
varnish v1 -cliok "param.set thread_pool_watchdog 5"
varnish v1 -cliok "param.set feature +http2"
varnish v1 -cliok "param.set feature +no_coredump"
varnish v1 -vcl {
backend b1 { .host = "${bad_ip}"; }
sub vcl_recv {
return (synth(200));
}
} -start
logexpect l1 -v v1 -g raw {
expect * * Error "Pool Herder: Queue does not move*"
} -start
# Starve the pool with h2 sessions
client c1 {
txpri
stream 0 rxsettings -run
barrier b1 sync
stream 1 {
txreq
# can't be scheduled, don't rx
} -run
} -start
client c2 {
txpri
stream 0 rxsettings -run
barrier b1 sync
stream 1 {
txreq
# can't be scheduled, don't rx
} -run
} -start
client c1 -wait
client c2 -wait
varnish v1 -vsl_catchup
# At this point c1 and c2 closed their connections
client c3 {
txreq
delay 10
} -run
logexpect l1 -wait
varnish v1 -cliok panic.show
varnish v1 -cliok panic.clear
varnish v1 -expectexit 0x20
......@@ -1195,6 +1195,24 @@ PARAM(
/* func */ NULL
)
/* actual location mgt_pool.c */
PARAM(
/* name */ thread_pool_watchdog,
/* typ */ timeout,
/* min */ "0.1",
/* max */ NULL,
/* default */ "10.000",
/* units */ "seconds",
/* flags */ EXPERIMENTAL,
/* s-text */
"Thread queue stuck watchdog.\n"
"\n"
"If no queued work have been released for this long,"
" the worker process panics itself.",
/* l-text */ "",
/* func */ NULL
)
/* actual location mgt_pool.c */
PARAM(
/* name */ thread_pool_destroy_delay,
......