[master] ddac7e41b Add a watchdog to worker pools.

Dridi Boukelmoune dridi at varni.sh
Wed Oct 3 12:37:48 UTC 2018


On Wed, Oct 3, 2018 at 12:17 PM Poul-Henning Kamp <phk at freebsd.org> wrote:
>
>
> commit ddac7e41b64af09662305d07ef3be0376d6e2021
> Author: Poul-Henning Kamp <phk at FreeBSD.org>
> Date:   Wed Oct 3 10:12:59 2018 +0000
>
>     Add a watchdog to worker pools.
<snip>
> diff --git a/bin/varnishd/cache/cache_pool.h b/bin/varnishd/cache/cache_pool.h
> index e7a6a618b..9902e32ad 100644
> --- a/bin/varnishd/cache/cache_pool.h
> +++ b/bin/varnishd/cache/cache_pool.h
> @@ -53,6 +53,7 @@ struct pool {
>         uintmax_t                       sdropped;
>         uintmax_t                       rdropped;
>         uintmax_t                       nqueued;
> +       uintmax_t                       ndequeued;
>         struct VSC_main_wrk             *a_stat;
>         struct VSC_main_wrk             *b_stat;
>
> diff --git a/bin/varnishd/cache/cache_wrk.c b/bin/varnishd/cache/cache_wrk.c
> index 589ce68cb..f7f95cfc5 100644
> --- a/bin/varnishd/cache/cache_wrk.c
> +++ b/bin/varnishd/cache/cache_wrk.c
> @@ -341,6 +341,7 @@ Pool_Work_Thread(struct pool *pp, struct worker *wrk)
>                         tp = VTAILQ_FIRST(&pp->queues[i]);
>                         if (tp != NULL) {
>                                 pp->lqueue--;
> +                               pp->ndequeued--;

I thought we'd increment pp->ndequeued above based on the field's name.

>                                 VTAILQ_REMOVE(&pp->queues[i], tp, list);
>                                 break;
>                         }
> @@ -487,6 +488,8 @@ pool_herder(void *priv)
>         struct worker *wrk;
>         double delay;
>         int wthread_min;
> +       uintmax_t dq = (1ULL << 31);
> +       double dqt;

If we increment pp->ndequeued and let dq start at zero, and initialize
dqt here [...]

>         CAST_OBJ_NOTNULL(pp, priv, POOL_MAGIC);
>
> @@ -494,6 +497,29 @@ pool_herder(void *priv)
>         THR_Init();
>
>         while (!pp->die || pp->nthr > 0) {
> +               /*
> +                * If the worker pool is configured too small, we can
> +                * end up deadlocking it (see #2418 for details).
> +                *
> +                * Recovering from this would require a lot of complicated
> +                * code, and fundamentally, either people configured their
> +                * pools wrong, in which case we want them to notice, or
> +                * they are under DoS, in which case recovering gracefully
> +                * is unlikely be a major improvement.
> +                *
> +                * Instead we implement a watchdog and kill the worker if
> +                * nothing has been dequeued for that long.
> +                */
> +               if (dq != pp->ndequeued) {
> +                       dq = pp->ndequeued;
> +                       dqt = VTIM_real();

[...] we could log the dequeue delta (both count and time) to let
users know how to sensibly set the parameter.

> +               } else if (pp->lqueue &&
> +                   VTIM_real() - dqt > cache_param->wthread_watchdog) {
> +                       VSL(SLT_Error, 0,
> +                          "Pool Herder: Queue does not move ql=%u dt=%f",
> +                          pp->lqueue, VTIM_real() - dqt);
> +                       WRONG("Worker Pool Queue does not move");
> +               }
>                 wthread_min = cache_param->wthread_min;
>                 if (pp->die)
>                         wthread_min = 0;
<snip>

Dridi


More information about the varnish-commit mailing list