generalize the worker pool reserve to avoid deadlocks

Previously, we used a minimum number of idle threads (the reserve) to ensure that we do not assign all threads with client requests and no threads left over for backend requests. This was actually only a special case of the more general issue exposed by h2: Lower priority tasks depend on higher priority tasks (for h2, sessions need streams, which need requests, which may need backend requests). To solve this problem, we divide the reserve by the number of priority classes and schedule lower priority tasks only if there are enough idle threads to run higher priority tasks eventually. This change does not guarantee any upper limit on the amount of time it can take for a task to be scheduled (e.g. backend requests could be blocking on arbitrarily long timeouts), so the thread pool watchdog is still warranted. But this change should guarantee that we do make progress eventually. With the reserves, thread_pool_min needs to be no smaller than the number of priority classes (TASK_QUEUE__END). Ideally, we should have an even higher minimum (@Dridi rightly suggested to make it 2 * TASK_QUEUE__END), but that would prevent the very useful test t02011.vtc. For now, the value of TASK_QUEUE__END (5) is hardcoded as such for the parameter configuration and documentation because auto-generating it would require include/macro dances which I consider over the top for now. Instead, the respective places are marked and an assert is in place to ensure we do not start a worker with too small a number of workers. I dicided against checks in the manager to avoid include pollution from the worker (cache.h) into the manager. Fixes #2418 for real

generalize the worker pool reserve to avoid deadlocks
Previously, we used a minimum number of idle threads (the reserve) to ensure that we do not assign all threads with client requests and no threads left over for backend requests. This was actually only a special case of the more general issue exposed by h2: Lower priority tasks depend on higher priority tasks (for h2, sessions need streams, which need requests, which may need backend requests). To solve this problem, we divide the reserve by the number of priority classes and schedule lower priority tasks only if there are enough idle threads to run higher priority tasks eventually. This change does not guarantee any upper limit on the amount of time it can take for a task to be scheduled (e.g. backend requests could be blocking on arbitrarily long timeouts), so the thread pool watchdog is still warranted. But this change should guarantee that we do make progress eventually. With the reserves, thread_pool_min needs to be no smaller than the number of priority classes (TASK_QUEUE__END). Ideally, we should have an even higher minimum (@Dridi rightly suggested to make it 2 * TASK_QUEUE__END), but that would prevent the very useful test t02011.vtc. For now, the value of TASK_QUEUE__END (5) is hardcoded as such for the parameter configuration and documentation because auto-generating it would require include/macro dances which I consider over the top for now. Instead, the respective places are marked and an assert is in place to ensure we do not start a worker with too small a number of workers. I dicided against checks in the manager to avoid include pollution from the worker (cache.h) into the manager. Fixes #2418 for real
3bb8b84c · Nils Goroll · 75cca3cd · 3bb8b84c · 3bb8b84c · 3bb8b84c
Commit 3bb8b84c authored Oct 09, 2018 by Nils Goroll
7 changed files
--- a/bin/varnishd/cache/cache.h
+++ b/bin/varnishd/cache/cache.h
@@ -214,22 +214,20 @@ struct pool_task {
 /*
 * tasks are taken off the queues in this order
 *
- * prios up to TASK_QUEUE_RESERVE are run from the reserve
- *
 * TASK_QUEUE_{REQ|STR} are new req's (H1/H2), and subject to queue limit.
 *
- * TASK_QUEUE_RUSH is req's returning from waiting list, they are
- * not subject to TASK_QUEUE_CLIENT because we cannot safely clean
- * them up if scheduling them fails.
+ * TASK_QUEUE_RUSH is req's returning from waiting list
+ *
+ * NOTE: When changing the number of classes, update places marked with
+ * TASK_QUEUE__END in mgt_pool.c
 */
 enum task_prio {
 	TASK_QUEUE_BO,
-#define TASK_QUEUE_RESERVE	TASK_QUEUE_BO
 	TASK_QUEUE_RUSH,
 	TASK_QUEUE_REQ,
 	TASK_QUEUE_STR,
 	TASK_QUEUE_VCA,
-	TASK_QUEUE_END
+	TASK_QUEUE__END
 };

 #define TASK_QUEUE_CLIENT(prio) \

--- a/bin/varnishd/cache/cache_pool.c
+++ b/bin/varnishd/cache/cache_pool.c
@@ -148,7 +148,7 @@ pool_mkpool(unsigned pool_no)

 	VTAILQ_INIT(&pp->idle_queue);
 	VTAILQ_INIT(&pp->poolsocks);
-	for (i = 0; i < TASK_QUEUE_END; i++)
+	for (i = 0; i < TASK_QUEUE__END; i++)
 		VTAILQ_INIT(&pp->queues[i]);
 	AZ(pthread_cond_init(&pp->herder_cond, NULL));
 	AZ(pthread_create(&pp->herder_thr, NULL, pool_herder, pp));

--- a/bin/varnishd/cache/cache_pool.h
+++ b/bin/varnishd/cache/cache_pool.h
@@ -46,7 +46,7 @@ struct pool {
 	struct lock			mtx;
 	unsigned			nidle;
 	struct taskhead			idle_queue;
-	struct taskhead			queues[TASK_QUEUE_END];
+	struct taskhead			queues[TASK_QUEUE__END];
 	unsigned			nthr;
 	unsigned			lqueue;
 	uintmax_t			sdropped;

--- a/bin/varnishd/cache/cache_wrk.c
+++ b/bin/varnishd/cache/cache_wrk.c
@@ -172,12 +172,16 @@ pool_reserve(void)
 {
 	unsigned lim;

-	if (cache_param->wthread_reserve == 0)
-		return (cache_param->wthread_min / 20 + 1);
-	lim = cache_param->wthread_min * 950 / 1000;
-	if (cache_param->wthread_reserve > lim)
-		return (lim);
-	return (cache_param->wthread_reserve);
+	if (cache_param->wthread_reserve == 0) {
+		lim = cache_param->wthread_min / 20 + 1;
+	} else {
+		lim = cache_param->wthread_min * 950 / 1000;
+		if (cache_param->wthread_reserve < lim)
+			lim = cache_param->wthread_reserve;
+	}
+	if (lim < TASK_QUEUE__END)
+		return (TASK_QUEUE__END);
+	return (lim);
 }

 /*--------------------------------------------------------------------*/
@@ -190,7 +194,7 @@ pool_getidleworker(struct pool *pp, enum task_prio prio)

 	CHECK_OBJ_NOTNULL(pp, POOL_MAGIC);
 	Lck_AssertHeld(&pp->mtx);
-	if (prio <= TASK_QUEUE_RESERVE || pp->nidle > pool_reserve()) {
+	if (pp->nidle > (pool_reserve() * prio / TASK_QUEUE__END)) {
 		pt = VTAILQ_FIRST(&pp->idle_queue);
 		if (pt == NULL)
 			AZ(pp->nidle);
@@ -262,7 +266,7 @@ Pool_Task(struct pool *pp, struct pool_task *task, enum task_prio prio)
 	CHECK_OBJ_NOTNULL(pp, POOL_MAGIC);
 	AN(task);
 	AN(task->func);
-	assert(prio < TASK_QUEUE_END);
+	assert(prio < TASK_QUEUE__END);

 	if (prio == TASK_QUEUE_REQ && reqpoolfail) {
 		retval = reqpoolfail & 1;
@@ -334,7 +338,7 @@ Pool_Work_Thread(struct pool *pp, struct worker *wrk)
 	struct pool_task *tp = NULL;
 	struct pool_task tpx, tps;
 	vtim_real tmo;
-	int i, prio_lim;
+	int i, reserve;

 	CHECK_OBJ_NOTNULL(pp, POOL_MAGIC);
 	wrk->pool = pp;
@@ -345,12 +349,11 @@ Pool_Work_Thread(struct pool *pp, struct worker *wrk)
 		AZ(wrk->vsl);

 		Lck_Lock(&pp->mtx);
-		if (pp->nidle < pool_reserve())
-			prio_lim = TASK_QUEUE_RESERVE + 1;
-		else
-			prio_lim = TASK_QUEUE_END;
+		reserve = pool_reserve();

-		for (i = 0; i < prio_lim; i++) {
+		for (i = 0; i < TASK_QUEUE__END; i++) {
+			if (pp->nidle < (reserve * i / TASK_QUEUE__END))
+				break;
 			tp = VTAILQ_FIRST(&pp->queues[i]);
 			if (tp != NULL) {
 				pp->lqueue--;
@@ -656,7 +659,6 @@ static struct cli_proto debug_cmds[] = {
 void
 WRK_Init(void)
 {
-
+	assert(cache_param->wthread_min >= TASK_QUEUE__END);
 	CLI_AddFuncs(debug_cmds);
 }
-
--- a/bin/varnishd/mgt/mgt_pool.c
+++ b/bin/varnishd/mgt/mgt_pool.c
@@ -57,7 +57,6 @@ static int
 tweak_thread_pool_min(struct vsb *vsb, const struct parspec *par,
    const char *arg)
 {
-
 	if (tweak_uint(vsb, par, arg))
 		return (-1);

@@ -115,13 +114,16 @@ struct parspec WRK_parspec[] = {
 		"5000", "threads",
 		"thread_pool_min" },
 	{ "thread_pool_min", tweak_thread_pool_min, &mgt_param.wthread_min,
-		NULL, NULL,
+		"5", // TASK_QUEUE__END
+		NULL,
 		"The minimum number of worker threads in each pool.\n"
 		"\n"
 		"Increasing this may help ramp up faster from low load "
 		"situations or when threads have expired.\n"
 		"\n"
-		"Minimum is 10 threads.",
+		"Technical minimum is 5 threads, " // TASK_QUEUE__END
+		"but this parameter is strongly recommended to be "
+		"at least 10", // 2 * TASK_QUEUE__END
 		DELAYED_EFFECT,
 		"100", "threads",
 		NULL, "thread_pool_max" },
@@ -132,17 +134,16 @@ struct parspec WRK_parspec[] = {
 		"in each pool.\n"
 		"\n"
 		"Tasks may require other tasks to complete (for example, "
-		"client requests may require backend requests). This reserve "
-		"is to ensure that such tasks still get to run even under high "
-		"load.\n"
-		"\n"
-		"Increasing the reserve may help setups with a high number of "
-		"backend requests at the expense of client performance. "
-		"Setting it too high will waste resources by keeping threads "
-		"unused.\n"
+		"client requests may require backend requests, http2 sessions "
+		"require streams, which require requests). This reserve is to "
+		"ensure that lower priority tasks do not prevent higher "
+		"priority tasks from running even under high load.\n"
 		"\n"
-		"Default is 0 to auto-tune (currently 5% of thread_pool_min).\n"
-		"Minimum is 1 otherwise.",
+		"The effective value is at least 5 (the number of internal "
+		//				 ^ TASK_QUEUE__END
+		"priority classes), irrespective of this parameter.\n"
+		"Default is 0 to auto-tune (5% of thread_pool_min).\n"
+		"Minimum is 1 otherwise, maximum is 95% of thread_pool_min.",
 		DELAYED_EFFECT,
 		"0", "threads",
 		NULL, "95% of thread_pool_min" },

--- a/bin/varnishtest/tests/r01490.vtc
+++ b/bin/varnishtest/tests/r01490.vtc
@@ -6,8 +6,8 @@ server s1 {
 varnish v1 \
 	-arg "-p debug=+syncvsl" \
 	-arg "-p vsl_mask=+WorkThread" \
-	-arg "-p thread_pool_min=2" \
-	-arg "-p thread_pool_max=3" \
+	-arg "-p thread_pool_min=5" \
+	-arg "-p thread_pool_max=6" \
 	-arg "-p thread_pools=1" \
 	-arg "-p thread_pool_timeout=10" \
 	-vcl+backend {}
@@ -16,27 +16,27 @@ varnish v1 -start
 # we might have over-bred
 delay 11

-varnish v1 -expect threads == 2
+varnish v1 -expect threads == 5

 logexpect l1 -v v1 -g raw {
 	expect * 0 WorkThread {^\S+ start$}
 	expect * 0 WorkThread {^\S+ end$}
 } -start

-varnish v1 -cliok "param.set thread_pool_min 3"
+varnish v1 -cliok "param.set thread_pool_min 6"

 # Have to wait longer than thread_pool_timeout
 delay 11

-varnish v1 -expect threads == 3
+varnish v1 -expect threads == 6

-varnish v1 -cliok "param.set thread_pool_min 2"
-varnish v1 -cliok "param.set thread_pool_max 2"
+varnish v1 -cliok "param.set thread_pool_min 5"
+varnish v1 -cliok "param.set thread_pool_max 5"

 # Have to wait longer than thread_pool_timeout
 delay 11

-varnish v1 -expect threads == 2
+varnish v1 -expect threads == 5

 # Use logexpect to see that the thread actually exited
 logexpect l1 -wait
--- a/bin/varnishtest/tests/t02011.vtc
+++ b/bin/varnishtest/tests/t02011.vtc
@@ -16,15 +16,12 @@ server s1 {
 # - one for stream 1
 # - one for the backend request
 #
-# To work around the reserve's default logic, an additional thread can be
-# created, but won't be consumed for stream 3 because the pool will reserve
-# it for a backend transaction. Otherwise it could starve the pool and dead
-# lock v1.
+# thread priorities ensure that there is exactly one thread per class
+# at this point, so when we try to get a second stream, we fail.

 varnish v1 -cliok "param.set thread_pools 1"
-varnish v1 -cliok "param.set thread_pool_min 4"
-varnish v1 -cliok "param.set thread_pool_max 4"
-varnish v1 -cliok "param.set thread_pool_reserve 1"
+varnish v1 -cliok "param.set thread_pool_min 5"
+varnish v1 -cliok "param.set thread_pool_max 5"
 varnish v1 -cliok "param.set thread_queue_limit 0"
 varnish v1 -cliok "param.set thread_stats_rate 1"
 varnish v1 -cliok "param.set feature +http2"
@@ -69,7 +66,10 @@ client c1 {
 } -run

 # trigger an update of the stats
-varnish v1 -cliok "param.set thread_pool_min 3"
+varnish v1 -cliok "param.set thread_pool_max 6"
+varnish v1 -cliok "param.set thread_pool_min 6"
+delay 1
+varnish v1 -cliok "param.set thread_pool_min 5"
 delay 1
 varnish v1 -vsl_catchup
 varnish v1 -expect sess_dropped == 0