[master] 3bb8b84cd generalize the worker pool reserve to avoid deadlocks
Nils Goroll
nils.goroll at uplex.de
Wed Jan 15 16:10:11 UTC 2020
commit 3bb8b84cd86a4d2b95c7e2508ace6d868ced412c
Author: Nils Goroll <nils.goroll at uplex.de>
Date: Tue Oct 9 13:16:10 2018 +0200
generalize the worker pool reserve to avoid deadlocks
Previously, we used a minimum number of idle threads (the reserve) to
ensure that we do not assign all threads with client requests and no
threads left over for backend requests.
This was actually only a special case of the more general issue
exposed by h2: Lower priority tasks depend on higher priority tasks
(for h2, sessions need streams, which need requests, which may need
backend requests).
To solve this problem, we divide the reserve by the number of priority
classes and schedule lower priority tasks only if there are enough
idle threads to run higher priority tasks eventually.
This change does not guarantee any upper limit on the amount of time
it can take for a task to be scheduled (e.g. backend requests could be
blocking on arbitrarily long timeouts), so the thread pool watchdog is
still warranted. But this change should guarantee that we do make
progress eventually.
With the reserves, thread_pool_min needs to be no smaller than the
number of priority classes (TASK_QUEUE__END). Ideally, we should have
an even higher minimum (@Dridi rightly suggested to make it 2 *
TASK_QUEUE__END), but that would prevent the very useful test
t02011.vtc.
For now, the value of TASK_QUEUE__END (5) is hardcoded as such for the
parameter configuration and documentation because auto-generating it
would require include/macro dances which I consider over the top for
now. Instead, the respective places are marked and an assert is in
place to ensure we do not start a worker with too small a number of
workers. I dicided against checks in the manager to avoid include
pollution from the worker (cache.h) into the manager.
Fixes #2418 for real
diff --git a/bin/varnishd/cache/cache.h b/bin/varnishd/cache/cache.h
index a8f021645..0cf024c9c 100644
--- a/bin/varnishd/cache/cache.h
+++ b/bin/varnishd/cache/cache.h
@@ -214,22 +214,20 @@ struct pool_task {
/*
* tasks are taken off the queues in this order
*
- * prios up to TASK_QUEUE_RESERVE are run from the reserve
- *
* TASK_QUEUE_{REQ|STR} are new req's (H1/H2), and subject to queue limit.
*
- * TASK_QUEUE_RUSH is req's returning from waiting list, they are
- * not subject to TASK_QUEUE_CLIENT because we cannot safely clean
- * them up if scheduling them fails.
+ * TASK_QUEUE_RUSH is req's returning from waiting list
+ *
+ * NOTE: When changing the number of classes, update places marked with
+ * TASK_QUEUE__END in mgt_pool.c
*/
enum task_prio {
TASK_QUEUE_BO,
-#define TASK_QUEUE_RESERVE TASK_QUEUE_BO
TASK_QUEUE_RUSH,
TASK_QUEUE_REQ,
TASK_QUEUE_STR,
TASK_QUEUE_VCA,
- TASK_QUEUE_END
+ TASK_QUEUE__END
};
#define TASK_QUEUE_CLIENT(prio) \
diff --git a/bin/varnishd/cache/cache_pool.c b/bin/varnishd/cache/cache_pool.c
index 22b2a7036..58846b4e9 100644
--- a/bin/varnishd/cache/cache_pool.c
+++ b/bin/varnishd/cache/cache_pool.c
@@ -148,7 +148,7 @@ pool_mkpool(unsigned pool_no)
VTAILQ_INIT(&pp->idle_queue);
VTAILQ_INIT(&pp->poolsocks);
- for (i = 0; i < TASK_QUEUE_END; i++)
+ for (i = 0; i < TASK_QUEUE__END; i++)
VTAILQ_INIT(&pp->queues[i]);
AZ(pthread_cond_init(&pp->herder_cond, NULL));
AZ(pthread_create(&pp->herder_thr, NULL, pool_herder, pp));
diff --git a/bin/varnishd/cache/cache_pool.h b/bin/varnishd/cache/cache_pool.h
index 4e668fa0b..b4b5890c5 100644
--- a/bin/varnishd/cache/cache_pool.h
+++ b/bin/varnishd/cache/cache_pool.h
@@ -46,7 +46,7 @@ struct pool {
struct lock mtx;
unsigned nidle;
struct taskhead idle_queue;
- struct taskhead queues[TASK_QUEUE_END];
+ struct taskhead queues[TASK_QUEUE__END];
unsigned nthr;
unsigned lqueue;
uintmax_t sdropped;
diff --git a/bin/varnishd/cache/cache_wrk.c b/bin/varnishd/cache/cache_wrk.c
index 9b90dc10b..d66bfa809 100644
--- a/bin/varnishd/cache/cache_wrk.c
+++ b/bin/varnishd/cache/cache_wrk.c
@@ -172,12 +172,16 @@ pool_reserve(void)
{
unsigned lim;
- if (cache_param->wthread_reserve == 0)
- return (cache_param->wthread_min / 20 + 1);
- lim = cache_param->wthread_min * 950 / 1000;
- if (cache_param->wthread_reserve > lim)
- return (lim);
- return (cache_param->wthread_reserve);
+ if (cache_param->wthread_reserve == 0) {
+ lim = cache_param->wthread_min / 20 + 1;
+ } else {
+ lim = cache_param->wthread_min * 950 / 1000;
+ if (cache_param->wthread_reserve < lim)
+ lim = cache_param->wthread_reserve;
+ }
+ if (lim < TASK_QUEUE__END)
+ return (TASK_QUEUE__END);
+ return (lim);
}
/*--------------------------------------------------------------------*/
@@ -190,7 +194,7 @@ pool_getidleworker(struct pool *pp, enum task_prio prio)
CHECK_OBJ_NOTNULL(pp, POOL_MAGIC);
Lck_AssertHeld(&pp->mtx);
- if (prio <= TASK_QUEUE_RESERVE || pp->nidle > pool_reserve()) {
+ if (pp->nidle > (pool_reserve() * prio / TASK_QUEUE__END)) {
pt = VTAILQ_FIRST(&pp->idle_queue);
if (pt == NULL)
AZ(pp->nidle);
@@ -262,7 +266,7 @@ Pool_Task(struct pool *pp, struct pool_task *task, enum task_prio prio)
CHECK_OBJ_NOTNULL(pp, POOL_MAGIC);
AN(task);
AN(task->func);
- assert(prio < TASK_QUEUE_END);
+ assert(prio < TASK_QUEUE__END);
if (prio == TASK_QUEUE_REQ && reqpoolfail) {
retval = reqpoolfail & 1;
@@ -334,7 +338,7 @@ Pool_Work_Thread(struct pool *pp, struct worker *wrk)
struct pool_task *tp = NULL;
struct pool_task tpx, tps;
vtim_real tmo;
- int i, prio_lim;
+ int i, reserve;
CHECK_OBJ_NOTNULL(pp, POOL_MAGIC);
wrk->pool = pp;
@@ -345,12 +349,11 @@ Pool_Work_Thread(struct pool *pp, struct worker *wrk)
AZ(wrk->vsl);
Lck_Lock(&pp->mtx);
- if (pp->nidle < pool_reserve())
- prio_lim = TASK_QUEUE_RESERVE + 1;
- else
- prio_lim = TASK_QUEUE_END;
+ reserve = pool_reserve();
- for (i = 0; i < prio_lim; i++) {
+ for (i = 0; i < TASK_QUEUE__END; i++) {
+ if (pp->nidle < (reserve * i / TASK_QUEUE__END))
+ break;
tp = VTAILQ_FIRST(&pp->queues[i]);
if (tp != NULL) {
pp->lqueue--;
@@ -656,7 +659,6 @@ static struct cli_proto debug_cmds[] = {
void
WRK_Init(void)
{
-
+ assert(cache_param->wthread_min >= TASK_QUEUE__END);
CLI_AddFuncs(debug_cmds);
}
-
diff --git a/bin/varnishd/mgt/mgt_pool.c b/bin/varnishd/mgt/mgt_pool.c
index b32224d72..3d7201380 100644
--- a/bin/varnishd/mgt/mgt_pool.c
+++ b/bin/varnishd/mgt/mgt_pool.c
@@ -57,7 +57,6 @@ static int
tweak_thread_pool_min(struct vsb *vsb, const struct parspec *par,
const char *arg)
{
-
if (tweak_uint(vsb, par, arg))
return (-1);
@@ -115,13 +114,16 @@ struct parspec WRK_parspec[] = {
"5000", "threads",
"thread_pool_min" },
{ "thread_pool_min", tweak_thread_pool_min, &mgt_param.wthread_min,
- NULL, NULL,
+ "5", // TASK_QUEUE__END
+ NULL,
"The minimum number of worker threads in each pool.\n"
"\n"
"Increasing this may help ramp up faster from low load "
"situations or when threads have expired.\n"
"\n"
- "Minimum is 10 threads.",
+ "Technical minimum is 5 threads, " // TASK_QUEUE__END
+ "but this parameter is strongly recommended to be "
+ "at least 10", // 2 * TASK_QUEUE__END
DELAYED_EFFECT,
"100", "threads",
NULL, "thread_pool_max" },
@@ -132,17 +134,16 @@ struct parspec WRK_parspec[] = {
"in each pool.\n"
"\n"
"Tasks may require other tasks to complete (for example, "
- "client requests may require backend requests). This reserve "
- "is to ensure that such tasks still get to run even under high "
- "load.\n"
- "\n"
- "Increasing the reserve may help setups with a high number of "
- "backend requests at the expense of client performance. "
- "Setting it too high will waste resources by keeping threads "
- "unused.\n"
+ "client requests may require backend requests, http2 sessions "
+ "require streams, which require requests). This reserve is to "
+ "ensure that lower priority tasks do not prevent higher "
+ "priority tasks from running even under high load.\n"
"\n"
- "Default is 0 to auto-tune (currently 5% of thread_pool_min).\n"
- "Minimum is 1 otherwise.",
+ "The effective value is at least 5 (the number of internal "
+ // ^ TASK_QUEUE__END
+ "priority classes), irrespective of this parameter.\n"
+ "Default is 0 to auto-tune (5% of thread_pool_min).\n"
+ "Minimum is 1 otherwise, maximum is 95% of thread_pool_min.",
DELAYED_EFFECT,
"0", "threads",
NULL, "95% of thread_pool_min" },
diff --git a/bin/varnishtest/tests/r01490.vtc b/bin/varnishtest/tests/r01490.vtc
index fa7c8af67..5a91b94d7 100644
--- a/bin/varnishtest/tests/r01490.vtc
+++ b/bin/varnishtest/tests/r01490.vtc
@@ -6,8 +6,8 @@ server s1 {
varnish v1 \
-arg "-p debug=+syncvsl" \
-arg "-p vsl_mask=+WorkThread" \
- -arg "-p thread_pool_min=2" \
- -arg "-p thread_pool_max=3" \
+ -arg "-p thread_pool_min=5" \
+ -arg "-p thread_pool_max=6" \
-arg "-p thread_pools=1" \
-arg "-p thread_pool_timeout=10" \
-vcl+backend {}
@@ -16,27 +16,27 @@ varnish v1 -start
# we might have over-bred
delay 11
-varnish v1 -expect threads == 2
+varnish v1 -expect threads == 5
logexpect l1 -v v1 -g raw {
expect * 0 WorkThread {^\S+ start$}
expect * 0 WorkThread {^\S+ end$}
} -start
-varnish v1 -cliok "param.set thread_pool_min 3"
+varnish v1 -cliok "param.set thread_pool_min 6"
# Have to wait longer than thread_pool_timeout
delay 11
-varnish v1 -expect threads == 3
+varnish v1 -expect threads == 6
-varnish v1 -cliok "param.set thread_pool_min 2"
-varnish v1 -cliok "param.set thread_pool_max 2"
+varnish v1 -cliok "param.set thread_pool_min 5"
+varnish v1 -cliok "param.set thread_pool_max 5"
# Have to wait longer than thread_pool_timeout
delay 11
-varnish v1 -expect threads == 2
+varnish v1 -expect threads == 5
# Use logexpect to see that the thread actually exited
logexpect l1 -wait
diff --git a/bin/varnishtest/tests/t02011.vtc b/bin/varnishtest/tests/t02011.vtc
index 45ec730c6..10ed15286 100644
--- a/bin/varnishtest/tests/t02011.vtc
+++ b/bin/varnishtest/tests/t02011.vtc
@@ -16,15 +16,12 @@ server s1 {
# - one for stream 1
# - one for the backend request
#
-# To work around the reserve's default logic, an additional thread can be
-# created, but won't be consumed for stream 3 because the pool will reserve
-# it for a backend transaction. Otherwise it could starve the pool and dead
-# lock v1.
+# thread priorities ensure that there is exactly one thread per class
+# at this point, so when we try to get a second stream, we fail.
varnish v1 -cliok "param.set thread_pools 1"
-varnish v1 -cliok "param.set thread_pool_min 4"
-varnish v1 -cliok "param.set thread_pool_max 4"
-varnish v1 -cliok "param.set thread_pool_reserve 1"
+varnish v1 -cliok "param.set thread_pool_min 5"
+varnish v1 -cliok "param.set thread_pool_max 5"
varnish v1 -cliok "param.set thread_queue_limit 0"
varnish v1 -cliok "param.set thread_stats_rate 1"
varnish v1 -cliok "param.set feature +http2"
@@ -69,7 +66,10 @@ client c1 {
} -run
# trigger an update of the stats
-varnish v1 -cliok "param.set thread_pool_min 3"
+varnish v1 -cliok "param.set thread_pool_max 6"
+varnish v1 -cliok "param.set thread_pool_min 6"
+delay 1
+varnish v1 -cliok "param.set thread_pool_min 5"
delay 1
varnish v1 -vsl_catchup
varnish v1 -expect sess_dropped == 0
More information about the varnish-commit
mailing list