Commit 3bb8b84c authored by Nils Goroll's avatar Nils Goroll

generalize the worker pool reserve to avoid deadlocks

Previously, we used a minimum number of idle threads (the reserve) to
ensure that we do not assign all threads with client requests and no
threads left over for backend requests.

This was actually only a special case of the more general issue
exposed by h2: Lower priority tasks depend on higher priority tasks
(for h2, sessions need streams, which need requests, which may need
backend requests).

To solve this problem, we divide the reserve by the number of priority
classes and schedule lower priority tasks only if there are enough
idle threads to run higher priority tasks eventually.

This change does not guarantee any upper limit on the amount of time
it can take for a task to be scheduled (e.g. backend requests could be
blocking on arbitrarily long timeouts), so the thread pool watchdog is
still warranted. But this change should guarantee that we do make
progress eventually.

With the reserves, thread_pool_min needs to be no smaller than the
number of priority classes (TASK_QUEUE__END). Ideally, we should have
an even higher minimum (@Dridi rightly suggested to make it 2 *
TASK_QUEUE__END), but that would prevent the very useful test
t02011.vtc.

For now, the value of TASK_QUEUE__END (5) is hardcoded as such for the
parameter configuration and documentation because auto-generating it
would require include/macro dances which I consider over the top for
now. Instead, the respective places are marked and an assert is in
place to ensure we do not start a worker with too small a number of
workers. I dicided against checks in the manager to avoid include
pollution from the worker (cache.h) into the manager.

Fixes #2418 for real
parent 75cca3cd
......@@ -214,22 +214,20 @@ struct pool_task {
/*
* tasks are taken off the queues in this order
*
* prios up to TASK_QUEUE_RESERVE are run from the reserve
*
* TASK_QUEUE_{REQ|STR} are new req's (H1/H2), and subject to queue limit.
*
* TASK_QUEUE_RUSH is req's returning from waiting list, they are
* not subject to TASK_QUEUE_CLIENT because we cannot safely clean
* them up if scheduling them fails.
* TASK_QUEUE_RUSH is req's returning from waiting list
*
* NOTE: When changing the number of classes, update places marked with
* TASK_QUEUE__END in mgt_pool.c
*/
enum task_prio {
TASK_QUEUE_BO,
#define TASK_QUEUE_RESERVE TASK_QUEUE_BO
TASK_QUEUE_RUSH,
TASK_QUEUE_REQ,
TASK_QUEUE_STR,
TASK_QUEUE_VCA,
TASK_QUEUE_END
TASK_QUEUE__END
};
#define TASK_QUEUE_CLIENT(prio) \
......
......@@ -148,7 +148,7 @@ pool_mkpool(unsigned pool_no)
VTAILQ_INIT(&pp->idle_queue);
VTAILQ_INIT(&pp->poolsocks);
for (i = 0; i < TASK_QUEUE_END; i++)
for (i = 0; i < TASK_QUEUE__END; i++)
VTAILQ_INIT(&pp->queues[i]);
AZ(pthread_cond_init(&pp->herder_cond, NULL));
AZ(pthread_create(&pp->herder_thr, NULL, pool_herder, pp));
......
......@@ -46,7 +46,7 @@ struct pool {
struct lock mtx;
unsigned nidle;
struct taskhead idle_queue;
struct taskhead queues[TASK_QUEUE_END];
struct taskhead queues[TASK_QUEUE__END];
unsigned nthr;
unsigned lqueue;
uintmax_t sdropped;
......
......@@ -172,12 +172,16 @@ pool_reserve(void)
{
unsigned lim;
if (cache_param->wthread_reserve == 0)
return (cache_param->wthread_min / 20 + 1);
lim = cache_param->wthread_min * 950 / 1000;
if (cache_param->wthread_reserve > lim)
return (lim);
return (cache_param->wthread_reserve);
if (cache_param->wthread_reserve == 0) {
lim = cache_param->wthread_min / 20 + 1;
} else {
lim = cache_param->wthread_min * 950 / 1000;
if (cache_param->wthread_reserve < lim)
lim = cache_param->wthread_reserve;
}
if (lim < TASK_QUEUE__END)
return (TASK_QUEUE__END);
return (lim);
}
/*--------------------------------------------------------------------*/
......@@ -190,7 +194,7 @@ pool_getidleworker(struct pool *pp, enum task_prio prio)
CHECK_OBJ_NOTNULL(pp, POOL_MAGIC);
Lck_AssertHeld(&pp->mtx);
if (prio <= TASK_QUEUE_RESERVE || pp->nidle > pool_reserve()) {
if (pp->nidle > (pool_reserve() * prio / TASK_QUEUE__END)) {
pt = VTAILQ_FIRST(&pp->idle_queue);
if (pt == NULL)
AZ(pp->nidle);
......@@ -262,7 +266,7 @@ Pool_Task(struct pool *pp, struct pool_task *task, enum task_prio prio)
CHECK_OBJ_NOTNULL(pp, POOL_MAGIC);
AN(task);
AN(task->func);
assert(prio < TASK_QUEUE_END);
assert(prio < TASK_QUEUE__END);
if (prio == TASK_QUEUE_REQ && reqpoolfail) {
retval = reqpoolfail & 1;
......@@ -334,7 +338,7 @@ Pool_Work_Thread(struct pool *pp, struct worker *wrk)
struct pool_task *tp = NULL;
struct pool_task tpx, tps;
vtim_real tmo;
int i, prio_lim;
int i, reserve;
CHECK_OBJ_NOTNULL(pp, POOL_MAGIC);
wrk->pool = pp;
......@@ -345,12 +349,11 @@ Pool_Work_Thread(struct pool *pp, struct worker *wrk)
AZ(wrk->vsl);
Lck_Lock(&pp->mtx);
if (pp->nidle < pool_reserve())
prio_lim = TASK_QUEUE_RESERVE + 1;
else
prio_lim = TASK_QUEUE_END;
reserve = pool_reserve();
for (i = 0; i < prio_lim; i++) {
for (i = 0; i < TASK_QUEUE__END; i++) {
if (pp->nidle < (reserve * i / TASK_QUEUE__END))
break;
tp = VTAILQ_FIRST(&pp->queues[i]);
if (tp != NULL) {
pp->lqueue--;
......@@ -656,7 +659,6 @@ static struct cli_proto debug_cmds[] = {
void
WRK_Init(void)
{
assert(cache_param->wthread_min >= TASK_QUEUE__END);
CLI_AddFuncs(debug_cmds);
}
......@@ -57,7 +57,6 @@ static int
tweak_thread_pool_min(struct vsb *vsb, const struct parspec *par,
const char *arg)
{
if (tweak_uint(vsb, par, arg))
return (-1);
......@@ -115,13 +114,16 @@ struct parspec WRK_parspec[] = {
"5000", "threads",
"thread_pool_min" },
{ "thread_pool_min", tweak_thread_pool_min, &mgt_param.wthread_min,
NULL, NULL,
"5", // TASK_QUEUE__END
NULL,
"The minimum number of worker threads in each pool.\n"
"\n"
"Increasing this may help ramp up faster from low load "
"situations or when threads have expired.\n"
"\n"
"Minimum is 10 threads.",
"Technical minimum is 5 threads, " // TASK_QUEUE__END
"but this parameter is strongly recommended to be "
"at least 10", // 2 * TASK_QUEUE__END
DELAYED_EFFECT,
"100", "threads",
NULL, "thread_pool_max" },
......@@ -132,17 +134,16 @@ struct parspec WRK_parspec[] = {
"in each pool.\n"
"\n"
"Tasks may require other tasks to complete (for example, "
"client requests may require backend requests). This reserve "
"is to ensure that such tasks still get to run even under high "
"load.\n"
"\n"
"Increasing the reserve may help setups with a high number of "
"backend requests at the expense of client performance. "
"Setting it too high will waste resources by keeping threads "
"unused.\n"
"client requests may require backend requests, http2 sessions "
"require streams, which require requests). This reserve is to "
"ensure that lower priority tasks do not prevent higher "
"priority tasks from running even under high load.\n"
"\n"
"Default is 0 to auto-tune (currently 5% of thread_pool_min).\n"
"Minimum is 1 otherwise.",
"The effective value is at least 5 (the number of internal "
// ^ TASK_QUEUE__END
"priority classes), irrespective of this parameter.\n"
"Default is 0 to auto-tune (5% of thread_pool_min).\n"
"Minimum is 1 otherwise, maximum is 95% of thread_pool_min.",
DELAYED_EFFECT,
"0", "threads",
NULL, "95% of thread_pool_min" },
......
......@@ -6,8 +6,8 @@ server s1 {
varnish v1 \
-arg "-p debug=+syncvsl" \
-arg "-p vsl_mask=+WorkThread" \
-arg "-p thread_pool_min=2" \
-arg "-p thread_pool_max=3" \
-arg "-p thread_pool_min=5" \
-arg "-p thread_pool_max=6" \
-arg "-p thread_pools=1" \
-arg "-p thread_pool_timeout=10" \
-vcl+backend {}
......@@ -16,27 +16,27 @@ varnish v1 -start
# we might have over-bred
delay 11
varnish v1 -expect threads == 2
varnish v1 -expect threads == 5
logexpect l1 -v v1 -g raw {
expect * 0 WorkThread {^\S+ start$}
expect * 0 WorkThread {^\S+ end$}
} -start
varnish v1 -cliok "param.set thread_pool_min 3"
varnish v1 -cliok "param.set thread_pool_min 6"
# Have to wait longer than thread_pool_timeout
delay 11
varnish v1 -expect threads == 3
varnish v1 -expect threads == 6
varnish v1 -cliok "param.set thread_pool_min 2"
varnish v1 -cliok "param.set thread_pool_max 2"
varnish v1 -cliok "param.set thread_pool_min 5"
varnish v1 -cliok "param.set thread_pool_max 5"
# Have to wait longer than thread_pool_timeout
delay 11
varnish v1 -expect threads == 2
varnish v1 -expect threads == 5
# Use logexpect to see that the thread actually exited
logexpect l1 -wait
......@@ -16,15 +16,12 @@ server s1 {
# - one for stream 1
# - one for the backend request
#
# To work around the reserve's default logic, an additional thread can be
# created, but won't be consumed for stream 3 because the pool will reserve
# it for a backend transaction. Otherwise it could starve the pool and dead
# lock v1.
# thread priorities ensure that there is exactly one thread per class
# at this point, so when we try to get a second stream, we fail.
varnish v1 -cliok "param.set thread_pools 1"
varnish v1 -cliok "param.set thread_pool_min 4"
varnish v1 -cliok "param.set thread_pool_max 4"
varnish v1 -cliok "param.set thread_pool_reserve 1"
varnish v1 -cliok "param.set thread_pool_min 5"
varnish v1 -cliok "param.set thread_pool_max 5"
varnish v1 -cliok "param.set thread_queue_limit 0"
varnish v1 -cliok "param.set thread_stats_rate 1"
varnish v1 -cliok "param.set feature +http2"
......@@ -69,7 +66,10 @@ client c1 {
} -run
# trigger an update of the stats
varnish v1 -cliok "param.set thread_pool_min 3"
varnish v1 -cliok "param.set thread_pool_max 6"
varnish v1 -cliok "param.set thread_pool_min 6"
delay 1
varnish v1 -cliok "param.set thread_pool_min 5"
delay 1
varnish v1 -vsl_catchup
varnish v1 -expect sess_dropped == 0
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment