[master] 3bb8b84cd generalize the worker pool reserve to avoid deadlocks

Nils Goroll nils.goroll at uplex.de
Wed Jan 15 16:10:11 UTC 2020


commit 3bb8b84cd86a4d2b95c7e2508ace6d868ced412c
Author: Nils Goroll <nils.goroll at uplex.de>
Date:   Tue Oct 9 13:16:10 2018 +0200

    generalize the worker pool reserve to avoid deadlocks
    
    Previously, we used a minimum number of idle threads (the reserve) to
    ensure that we did not assign all threads to client requests, leaving
    none left over for backend requests.
    
    This was actually only a special case of the more general issue
    exposed by h2: Lower priority tasks depend on higher priority tasks
    (for h2, sessions need streams, which need requests, which may need
    backend requests).
    
    To solve this problem, we divide the reserve by the number of priority
    classes and schedule lower priority tasks only if there are enough
    idle threads to run higher priority tasks eventually.
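    
    As a rough sketch of that condition (an illustration, not the patch
    verbatim; the helper name and constant below are made up, with 5
    standing in for TASK_QUEUE__END), a task of priority class prio --
    0 being the highest -- only gets an idle thread if enough idle
    threads remain for the higher classes:
    
        /* Hypothetical standalone helper mirroring the reserve check:
         * higher classes (lower prio value) get a lower threshold, so
         * some idle threads always remain available for them. */
        #define N_PRIO_CLASSES 5        /* stands in for TASK_QUEUE__END */
    
        int
        may_schedule(unsigned nidle, unsigned reserve, unsigned prio)
        {
                return (nidle > reserve * prio / N_PRIO_CLASSES);
        }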
    
    This change does not guarantee any upper limit on the amount of time
    it can take for a task to be scheduled (e.g. backend requests could be
    blocking on arbitrarily long timeouts), so the thread pool watchdog is
    still warranted. But this change should guarantee that we do make
    progress eventually.
    
    With the reserves, thread_pool_min needs to be no smaller than the
    number of priority classes (TASK_QUEUE__END). Ideally, we should have
    an even higher minimum (@Dridi rightly suggested to make it 2 *
    TASK_QUEUE__END), but that would prevent the very useful test
    t02011.vtc from working.
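    
    As a rough worked example under the default parameters from the
    parspec (thread_pool_min 100, thread_pool_reserve 0): pool_reserve()
    comes out to 100 / 20 + 1 = 6, and the idle-thread threshold for
    priority class p is 6 * p / 5, i.e. 0, 1, 2, 3 and 4 for the five
    classes, so each lower class leaves one more idle thread to the
    classes above it. With the reserve clamped to at least 5, the lowest
    class always needs more than 4 idle threads, which is roughly why
    thread_pool_min must be at least TASK_QUEUE__END.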
    
    For now, the value of TASK_QUEUE__END (5) is hardcoded as such for the
    parameter configuration and documentation because auto-generating it
    would require include/macro dances which I consider over the top for
    now. Instead, the respective places are marked and an assert is in
    place to ensure the worker process does not start with too small a
    minimum number of threads. I decided against checks in the manager
    to avoid include pollution from the worker (cache.h) into the manager.
    
    Fixes #2418 for real

diff --git a/bin/varnishd/cache/cache.h b/bin/varnishd/cache/cache.h
index a8f021645..0cf024c9c 100644
--- a/bin/varnishd/cache/cache.h
+++ b/bin/varnishd/cache/cache.h
@@ -214,22 +214,20 @@ struct pool_task {
 /*
  * tasks are taken off the queues in this order
  *
- * prios up to TASK_QUEUE_RESERVE are run from the reserve
- *
  * TASK_QUEUE_{REQ|STR} are new req's (H1/H2), and subject to queue limit.
  *
- * TASK_QUEUE_RUSH is req's returning from waiting list, they are
- * not subject to TASK_QUEUE_CLIENT because we cannot safely clean
- * them up if scheduling them fails.
+ * TASK_QUEUE_RUSH is req's returning from waiting list
+ *
+ * NOTE: When changing the number of classes, update places marked with
+ * TASK_QUEUE__END in mgt_pool.c
  */
 enum task_prio {
 	TASK_QUEUE_BO,
-#define TASK_QUEUE_RESERVE	TASK_QUEUE_BO
 	TASK_QUEUE_RUSH,
 	TASK_QUEUE_REQ,
 	TASK_QUEUE_STR,
 	TASK_QUEUE_VCA,
-	TASK_QUEUE_END
+	TASK_QUEUE__END
 };
 
 #define TASK_QUEUE_CLIENT(prio) \
diff --git a/bin/varnishd/cache/cache_pool.c b/bin/varnishd/cache/cache_pool.c
index 22b2a7036..58846b4e9 100644
--- a/bin/varnishd/cache/cache_pool.c
+++ b/bin/varnishd/cache/cache_pool.c
@@ -148,7 +148,7 @@ pool_mkpool(unsigned pool_no)
 
 	VTAILQ_INIT(&pp->idle_queue);
 	VTAILQ_INIT(&pp->poolsocks);
-	for (i = 0; i < TASK_QUEUE_END; i++)
+	for (i = 0; i < TASK_QUEUE__END; i++)
 		VTAILQ_INIT(&pp->queues[i]);
 	AZ(pthread_cond_init(&pp->herder_cond, NULL));
 	AZ(pthread_create(&pp->herder_thr, NULL, pool_herder, pp));
diff --git a/bin/varnishd/cache/cache_pool.h b/bin/varnishd/cache/cache_pool.h
index 4e668fa0b..b4b5890c5 100644
--- a/bin/varnishd/cache/cache_pool.h
+++ b/bin/varnishd/cache/cache_pool.h
@@ -46,7 +46,7 @@ struct pool {
 	struct lock			mtx;
 	unsigned			nidle;
 	struct taskhead			idle_queue;
-	struct taskhead			queues[TASK_QUEUE_END];
+	struct taskhead			queues[TASK_QUEUE__END];
 	unsigned			nthr;
 	unsigned			lqueue;
 	uintmax_t			sdropped;
diff --git a/bin/varnishd/cache/cache_wrk.c b/bin/varnishd/cache/cache_wrk.c
index 9b90dc10b..d66bfa809 100644
--- a/bin/varnishd/cache/cache_wrk.c
+++ b/bin/varnishd/cache/cache_wrk.c
@@ -172,12 +172,16 @@ pool_reserve(void)
 {
 	unsigned lim;
 
-	if (cache_param->wthread_reserve == 0)
-		return (cache_param->wthread_min / 20 + 1);
-	lim = cache_param->wthread_min * 950 / 1000;
-	if (cache_param->wthread_reserve > lim)
-		return (lim);
-	return (cache_param->wthread_reserve);
+	if (cache_param->wthread_reserve == 0) {
+		lim = cache_param->wthread_min / 20 + 1;
+	} else {
+		lim = cache_param->wthread_min * 950 / 1000;
+		if (cache_param->wthread_reserve < lim)
+			lim = cache_param->wthread_reserve;
+	}
+	if (lim < TASK_QUEUE__END)
+		return (TASK_QUEUE__END);
+	return (lim);
 }
 
 /*--------------------------------------------------------------------*/
@@ -190,7 +194,7 @@ pool_getidleworker(struct pool *pp, enum task_prio prio)
 
 	CHECK_OBJ_NOTNULL(pp, POOL_MAGIC);
 	Lck_AssertHeld(&pp->mtx);
-	if (prio <= TASK_QUEUE_RESERVE || pp->nidle > pool_reserve()) {
+	if (pp->nidle > (pool_reserve() * prio / TASK_QUEUE__END)) {
 		pt = VTAILQ_FIRST(&pp->idle_queue);
 		if (pt == NULL)
 			AZ(pp->nidle);
@@ -262,7 +266,7 @@ Pool_Task(struct pool *pp, struct pool_task *task, enum task_prio prio)
 	CHECK_OBJ_NOTNULL(pp, POOL_MAGIC);
 	AN(task);
 	AN(task->func);
-	assert(prio < TASK_QUEUE_END);
+	assert(prio < TASK_QUEUE__END);
 
 	if (prio == TASK_QUEUE_REQ && reqpoolfail) {
 		retval = reqpoolfail & 1;
@@ -334,7 +338,7 @@ Pool_Work_Thread(struct pool *pp, struct worker *wrk)
 	struct pool_task *tp = NULL;
 	struct pool_task tpx, tps;
 	vtim_real tmo;
-	int i, prio_lim;
+	int i, reserve;
 
 	CHECK_OBJ_NOTNULL(pp, POOL_MAGIC);
 	wrk->pool = pp;
@@ -345,12 +349,11 @@ Pool_Work_Thread(struct pool *pp, struct worker *wrk)
 		AZ(wrk->vsl);
 
 		Lck_Lock(&pp->mtx);
-		if (pp->nidle < pool_reserve())
-			prio_lim = TASK_QUEUE_RESERVE + 1;
-		else
-			prio_lim = TASK_QUEUE_END;
+		reserve = pool_reserve();
 
-		for (i = 0; i < prio_lim; i++) {
+		for (i = 0; i < TASK_QUEUE__END; i++) {
+			if (pp->nidle < (reserve * i / TASK_QUEUE__END))
+				break;
 			tp = VTAILQ_FIRST(&pp->queues[i]);
 			if (tp != NULL) {
 				pp->lqueue--;
@@ -656,7 +659,6 @@ static struct cli_proto debug_cmds[] = {
 void
 WRK_Init(void)
 {
-
+	assert(cache_param->wthread_min >= TASK_QUEUE__END);
 	CLI_AddFuncs(debug_cmds);
 }
-
diff --git a/bin/varnishd/mgt/mgt_pool.c b/bin/varnishd/mgt/mgt_pool.c
index b32224d72..3d7201380 100644
--- a/bin/varnishd/mgt/mgt_pool.c
+++ b/bin/varnishd/mgt/mgt_pool.c
@@ -57,7 +57,6 @@ static int
 tweak_thread_pool_min(struct vsb *vsb, const struct parspec *par,
     const char *arg)
 {
-
 	if (tweak_uint(vsb, par, arg))
 		return (-1);
 
@@ -115,13 +114,16 @@ struct parspec WRK_parspec[] = {
 		"5000", "threads",
 		"thread_pool_min" },
 	{ "thread_pool_min", tweak_thread_pool_min, &mgt_param.wthread_min,
-		NULL, NULL,
+		"5", // TASK_QUEUE__END
+		NULL,
 		"The minimum number of worker threads in each pool.\n"
 		"\n"
 		"Increasing this may help ramp up faster from low load "
 		"situations or when threads have expired.\n"
 		"\n"
-		"Minimum is 10 threads.",
+		"Technical minimum is 5 threads, " // TASK_QUEUE__END
+		"but this parameter is strongly recommended to be "
+		"at least 10", // 2 * TASK_QUEUE__END
 		DELAYED_EFFECT,
 		"100", "threads",
 		NULL, "thread_pool_max" },
@@ -132,17 +134,16 @@ struct parspec WRK_parspec[] = {
 		"in each pool.\n"
 		"\n"
 		"Tasks may require other tasks to complete (for example, "
-		"client requests may require backend requests). This reserve "
-		"is to ensure that such tasks still get to run even under high "
-		"load.\n"
-		"\n"
-		"Increasing the reserve may help setups with a high number of "
-		"backend requests at the expense of client performance. "
-		"Setting it too high will waste resources by keeping threads "
-		"unused.\n"
+		"client requests may require backend requests, http2 sessions "
+		"require streams, which require requests). This reserve is to "
+		"ensure that lower priority tasks do not prevent higher "
+		"priority tasks from running even under high load.\n"
 		"\n"
-		"Default is 0 to auto-tune (currently 5% of thread_pool_min).\n"
-		"Minimum is 1 otherwise.",
+		"The effective value is at least 5 (the number of internal "
+		//				 ^ TASK_QUEUE__END
+		"priority classes), irrespective of this parameter.\n"
+		"Default is 0 to auto-tune (5% of thread_pool_min).\n"
+		"Minimum is 1 otherwise, maximum is 95% of thread_pool_min.",
 		DELAYED_EFFECT,
 		"0", "threads",
 		NULL, "95% of thread_pool_min" },
diff --git a/bin/varnishtest/tests/r01490.vtc b/bin/varnishtest/tests/r01490.vtc
index fa7c8af67..5a91b94d7 100644
--- a/bin/varnishtest/tests/r01490.vtc
+++ b/bin/varnishtest/tests/r01490.vtc
@@ -6,8 +6,8 @@ server s1 {
 varnish v1 \
 	-arg "-p debug=+syncvsl" \
 	-arg "-p vsl_mask=+WorkThread" \
-	-arg "-p thread_pool_min=2" \
-	-arg "-p thread_pool_max=3" \
+	-arg "-p thread_pool_min=5" \
+	-arg "-p thread_pool_max=6" \
 	-arg "-p thread_pools=1" \
 	-arg "-p thread_pool_timeout=10" \
 	-vcl+backend {}
@@ -16,27 +16,27 @@ varnish v1 -start
 # we might have over-bred
 delay 11
 
-varnish v1 -expect threads == 2
+varnish v1 -expect threads == 5
 
 logexpect l1 -v v1 -g raw {
 	expect * 0 WorkThread {^\S+ start$}
 	expect * 0 WorkThread {^\S+ end$}
 } -start
 
-varnish v1 -cliok "param.set thread_pool_min 3"
+varnish v1 -cliok "param.set thread_pool_min 6"
 
 # Have to wait longer than thread_pool_timeout
 delay 11
 
-varnish v1 -expect threads == 3
+varnish v1 -expect threads == 6
 
-varnish v1 -cliok "param.set thread_pool_min 2"
-varnish v1 -cliok "param.set thread_pool_max 2"
+varnish v1 -cliok "param.set thread_pool_min 5"
+varnish v1 -cliok "param.set thread_pool_max 5"
 
 # Have to wait longer than thread_pool_timeout
 delay 11
 
-varnish v1 -expect threads == 2
+varnish v1 -expect threads == 5
 
 # Use logexpect to see that the thread actually exited
 logexpect l1 -wait
diff --git a/bin/varnishtest/tests/t02011.vtc b/bin/varnishtest/tests/t02011.vtc
index 45ec730c6..10ed15286 100644
--- a/bin/varnishtest/tests/t02011.vtc
+++ b/bin/varnishtest/tests/t02011.vtc
@@ -16,15 +16,12 @@ server s1 {
 # - one for stream 1
 # - one for the backend request
 #
-# To work around the reserve's default logic, an additional thread can be
-# created, but won't be consumed for stream 3 because the pool will reserve
-# it for a backend transaction. Otherwise it could starve the pool and dead
-# lock v1.
+# thread priorities ensure that there is exactly one thread per class
+# at this point, so when we try to get a second stream, we fail.
 
 varnish v1 -cliok "param.set thread_pools 1"
-varnish v1 -cliok "param.set thread_pool_min 4"
-varnish v1 -cliok "param.set thread_pool_max 4"
-varnish v1 -cliok "param.set thread_pool_reserve 1"
+varnish v1 -cliok "param.set thread_pool_min 5"
+varnish v1 -cliok "param.set thread_pool_max 5"
 varnish v1 -cliok "param.set thread_queue_limit 0"
 varnish v1 -cliok "param.set thread_stats_rate 1"
 varnish v1 -cliok "param.set feature +http2"
@@ -69,7 +66,10 @@ client c1 {
 } -run
 
 # trigger an update of the stats
-varnish v1 -cliok "param.set thread_pool_min 3"
+varnish v1 -cliok "param.set thread_pool_max 6"
+varnish v1 -cliok "param.set thread_pool_min 6"
+delay 1
+varnish v1 -cliok "param.set thread_pool_min 5"
 delay 1
 varnish v1 -vsl_catchup
 varnish v1 -expect sess_dropped == 0

