[master] 11c036735 hash: Treat a waiting list match as a revalidation

Dridi Boukelmoune dridi.boukelmoune at gmail.com
Wed Aug 27 15:23:06 UTC 2025


commit 11c03673583397cb1b0cbec87b7ebbd025d38904
Author: Dridi Boukelmoune <dridi.boukelmoune at gmail.com>
Date:   Mon Jan 22 23:17:44 2024 +0100

    hash: Treat a waiting list match as a revalidation
    
    A rush used to be indiscriminate, and it is now operated based on the
    state of an objcore, by inspecting its flags. That change of behavior
    freed call sites from having to know what kind of rush to operate, and
    more importantly removed the need to attempt a rush whenever an objcore
    reference was released. It ensured that an objcore would only get the
    OC_F_BUSY flag if it was created for the purpose of a fetch, with the
    guarantee that a rush would be triggered as soon as the fetch got an
    outcome:
    
    - withdrawn (fetch not even attempted)
    - failed
    - ready (either for streaming, or complete)
    
    All these prior changes built up to this change, allowing the effective
    implementation of the unqualified form for the cache-control no-cache
    directive.
    
    RFC 9111 describes the server-side no-cache response directive like this:
    
    > The no-cache response directive, in its unqualified form (without an
    > argument), indicates that the response MUST NOT be used to satisfy any
    > other request without forwarding it for validation and receiving a
    > successful response; see Section 4.3.
    
    This can be translated in VCL as beresp.ttl == 0 and beresp.grace == 0,
    while still allowing the response to be served from cache.
    
    Doing this naively leads to the infamous "waiting list serialization"
    phenomenon, which tends to take time to diagnose and, depending on the
    setup, may not be easy to remediate.
    
    One way to solve this from a Varnish perspective is to treat the
    waiting list as a collection of requests waiting for a response to be
    validated or revalidated. In the event of a no-cache directive (or VCL
    override to zero beresp.ttl and beresp.grace) and in the absence of
    any other criterion preventing an object from being cached, all the requests
    can consume the just-validated response.
    
    To make this possible, a few things needed to change. A happens-before
    relationship between entering the waiting list and taking part in the
    rush is needed, and the time reference for this relationship is the
    insertion of a new objcore in the cache. Let's call this time t_insert.
    
    When a rush begins there are two scenarios for requests.
    
    Requests that entered the waiting list before t_insert should evaluate
    the objcore, and when applicable, propagate the rush exponentially.
    
    Requests that entered the waiting list after t_insert already evaluated
    the objcore because by the time a rush begins, the OC_F_BUSY flag got
    cleared already. They should not evaluate the objcore again, and another
    incoming fetch outcome will eventually wake them up.
    
    What guarantees the happens-before relationship is that t_insert happens
    under the objhead lock.
    
    The relationship is effectively implemented with a generation counter
    for the objhead. Requests inherit the generation when they enter the
    waiting list. Objcores inherit the generation when they begin the rush.
    
    This allows the exponential rush propagation to know when to stop, and
    the objcore can be evaluated outside of the lookup critical section.
    Only incompatible variants should force requests to reenter the waiting
    list, reducing the scope for the serialization problem to a profusion
    of variants (like the classic 'Vary: User-Agent' pitfall).
    
    Since most spurious rushes at every corner of objhead activity are gone,
    this change confines the remaining spurious activity to incompatible
    variants alone, instead of spreading it across all objects on more
    occasions.
    
    If a cacheable object was inserted in the cache, but already expired,
    this behavior enables cache hits. This can be common with multi-tier
    Varnish setups where one Varnish server may serve a graced object to
    another, but true of any origin server that may serve stale-yet-valid
    responses.
    
    The waiting list enables a proper response-wide no-cache behavior from
    now on, but the built-in VCL prevents it by default. This is also the
    first step towards implementing no-cache and private support at the
    header field granularity.

diff --git a/bin/varnishd/cache/cache.h b/bin/varnishd/cache/cache.h
index fd6cdfab8..fc5b9cf93 100644
--- a/bin/varnishd/cache/cache.h
+++ b/bin/varnishd/cache/cache.h
@@ -358,6 +358,7 @@ struct objcore {
 	uint16_t		oa_present;
 
 	unsigned		timer_idx;	// XXX 4Gobj limit
+	unsigned		waitinglist_gen;
 	vtim_real		last_lru;
 	VTAILQ_ENTRY(objcore)	hsh_list;
 	VTAILQ_ENTRY(objcore)	lru_list;
@@ -474,6 +475,7 @@ struct req {
 	stream_close_t		doclose;
 	unsigned		restarts;
 	unsigned		max_restarts;
+	unsigned		waitinglist_gen;
 
 	const struct req_step	*req_step;
 	struct reqtop		*top;	/* esi_level == 0 request */
@@ -495,9 +497,6 @@ struct req {
 
 	struct objcore		*body_oc;
 
-	/* The busy objhead we sleep on */
-	struct objhead		*hash_objhead;
-
 	/* Built Vary string == workspace reservation */
 	uint8_t			*vary_b;
 	uint8_t			*vary_e;
diff --git a/bin/varnishd/cache/cache_hash.c b/bin/varnishd/cache/cache_hash.c
index 14259fa5d..4d3b5e290 100644
--- a/bin/varnishd/cache/cache_hash.c
+++ b/bin/varnishd/cache/cache_hash.c
@@ -101,6 +101,7 @@ hsh_initobjhead(struct objhead *oh)
 	XXXAN(oh);
 	INIT_OBJ(oh, OBJHEAD_MAGIC);
 	oh->refcnt = 1;
+	oh->waitinglist_gen = 1;
 	VTAILQ_INIT(&oh->objcs);
 	VTAILQ_INIT(&oh->waitinglist);
 	Lck_New(&oh->mtx, lck_objhdr);
@@ -377,6 +378,42 @@ hsh_insert_busyobj(const struct worker *wrk, struct objhead *oh)
 	return (oc);
 }
 
+/*---------------------------------------------------------------------
+ */
+
+static unsigned
+hsh_rush_match(struct req *req)
+{
+	struct objhead *oh;
+	struct objcore *oc;
+	const uint8_t *vary;
+
+	oc = req->objcore;
+	CHECK_OBJ_NOTNULL(oc, OBJCORE_MAGIC);
+	assert(oc->refcnt > 0);
+
+	AZ(oc->flags & OC_F_BUSY);
+	AZ(oc->flags & OC_F_PRIVATE);
+	if (oc->flags & (OC_F_WITHDRAWN|OC_F_HFM|OC_F_HFP|OC_F_CANCEL|
+	    OC_F_FAILED))
+		return (0);
+
+	if (req->vcf != NULL) /* NB: must operate under oh lock. */
+		return (0);
+
+	oh = oc->objhead;
+	CHECK_OBJ_NOTNULL(oh, OBJHEAD_MAGIC);
+
+	if (req->hash_ignore_vary)
+		return (1);
+	if (!ObjHasAttr(req->wrk, oc, OA_VARY))
+		return (1);
+
+	vary = ObjGetAttr(req->wrk, oc, OA_VARY, NULL);
+	AN(vary);
+	return (VRY_Match(req, vary));
+}
+
 /*---------------------------------------------------------------------
  */
 
@@ -407,6 +444,7 @@ HSH_Lookup(struct req *req, struct objcore **ocp, struct objcore **bocp)
 	CHECK_OBJ_NOTNULL(wrk, WORKER_MAGIC);
 	CHECK_OBJ_NOTNULL(wrk->wpriv, WORKER_PRIV_MAGIC);
 	CHECK_OBJ_NOTNULL(req->http, HTTP_MAGIC);
+	CHECK_OBJ_ORNULL(req->objcore, OBJCORE_MAGIC);
 	CHECK_OBJ_ORNULL(req->vcf, VCF_MAGIC);
 	AN(hash);
 
@@ -414,15 +452,33 @@ HSH_Lookup(struct req *req, struct objcore **ocp, struct objcore **bocp)
 	if (DO_DEBUG(DBG_HASHEDGE))
 		hsh_testmagic(req->digest);
 
-	if (req->hash_objhead != NULL) {
-		/*
-		 * This req came off the waiting list, and brings an
-		 * oh refcnt with it.
+	/*
+	 * When a req rushes off the waiting list, it brings an implicit
+	 * oh refcnt acquired at disembark time and an oc ref (with its
+	 * own distinct oh ref) acquired during rush hour.
+	 */
+
+	if (req->objcore != NULL && hsh_rush_match(req)) {
+		TAKE_OBJ_NOTNULL(oc, &req->objcore, OBJCORE_MAGIC);
+		*ocp = oc;
+		oh = oc->objhead;
+		Lck_Lock(&oh->mtx);
+		oc->hits++;
+		boc_progress = oc->boc == NULL ? -1 : oc->boc->fetched_so_far;
+		AN(hsh_deref_objhead_unlock(wrk, &oh, oc));
+		Req_LogHit(wrk, req, oc, boc_progress);
+		/* NB: since this hit comes from the waiting list instead of
+		 * a regular lookup, grace is not considered. The object is
+		 * fresh in the context of the waiting list, even expired: it
+		 * was successfully just [re]validated by a fetch task.
 		 */
-		CHECK_OBJ_NOTNULL(req->hash_objhead, OBJHEAD_MAGIC);
-		oh = req->hash_objhead;
+		return (HSH_HIT);
+	}
+
+	if (req->objcore != NULL) {
+		oh = req->objcore->objhead;
+		(void)HSH_DerefObjCore(wrk, &req->objcore);
 		Lck_Lock(&oh->mtx);
-		req->hash_objhead = NULL;
 	} else {
 		AN(wrk->wpriv->nobjhead);
 		oh = hash->lookup(wrk, req->digest, &wrk->wpriv->nobjhead);
@@ -615,13 +671,14 @@ HSH_Lookup(struct req *req, struct objcore **ocp, struct objcore **bocp)
 	AZ(req->hash_ignore_busy);
 
 	/*
-	 * The objhead reference transfers to the sess, we get it
-	 * back when the sess comes off the waiting list and
-	 * calls us again
+	 * The objhead reference is held by req while it is parked on the
+	 * waiting list. The oh pointer is taken back from the objcore that
+	 * triggers a rush of req off the waiting list.
 	 */
-	req->hash_objhead = oh;
+	assert(oh->refcnt > 1);
+
 	req->wrk = NULL;
-	req->waitinglist = 1;
+	req->waitinglist_gen = oh->waitinglist_gen;
 
 	if (DO_DEBUG(DBG_WAITINGLIST))
 		VSLb(req->vsl, SLT_Debug, "on waiting list <%p>", oh);
@@ -657,24 +714,44 @@ hsh_rush1(const struct worker *wrk, struct objcore *oc, struct rush *r)
 
 	AZ(oc->flags & OC_F_BUSY);
 	AZ(oc->flags & OC_F_PRIVATE);
+	max = cache_param->rush_exponent;
 	if (oc->flags & (OC_F_WITHDRAWN|OC_F_FAILED))
 		max = 1;
-	else if (oc->flags & (OC_F_HFM|OC_F_HFP|OC_F_CANCEL|OC_F_DYING))
-		max = cache_param->rush_exponent;
-	else
-		max = INT_MAX; /* XXX: temp */
 	assert(max > 0);
 
+	if (oc->waitinglist_gen == 0) {
+		oc->waitinglist_gen = oh->waitinglist_gen;
+		oh->waitinglist_gen++;
+	}
+
 	for (i = 0; i < max; i++) {
 		req = VTAILQ_FIRST(&oh->waitinglist);
 		if (req == NULL)
 			break;
-		CHECK_OBJ_NOTNULL(req, REQ_MAGIC);
-		wrk->stats->busy_wakeup++;
+
+		CHECK_OBJ(req, REQ_MAGIC);
+
+		/* NB: The waiting list is naturally sorted by generation.
+		 *
+		 * Because of the exponential nature of the rush, it is
+		 * possible that new requests enter the waiting list before
+		 * the rush for this oc completes. Because the OC_F_BUSY flag
+		 * was cleared before the beginning of the rush, requests
+		 * from a newer generation already got a chance to evaluate
+		 * oc during a lookup and it didn't match their criteria.
+		 *
+		 * Therefore there's no point propagating the exponential
+		 * rush of this oc when we see a newer generation.
+		 */
+		if (req->waitinglist_gen > oc->waitinglist_gen)
+			break;
+
 		AZ(req->wrk);
 		VTAILQ_REMOVE(&oh->waitinglist, req, w_list);
 		VTAILQ_INSERT_TAIL(&r->reqs, req, w_list);
-		req->waitinglist = 0;
+		req->objcore = oc;
+		oc->refcnt++;
+		wrk->stats->busy_wakeup++;
 	}
 }
 
@@ -945,8 +1022,8 @@ HSH_Withdraw(struct worker *wrk, struct objcore **ocp)
 	assert(oh->refcnt > 0);
 	oc->flags &= ~OC_F_BUSY;
 	oc->flags |= OC_F_WITHDRAWN;
-	hsh_rush1(wrk, oc, &rush);
-	AZ(HSH_DerefObjCoreUnlock(wrk, &oc));
+	hsh_rush1(wrk, oc, &rush); /* grabs up to 1 oc ref */
+	assert(HSH_DerefObjCoreUnlock(wrk, &oc) <= 1);
 
 	hsh_rush2(wrk, &rush);
 }
diff --git a/bin/varnishd/cache/cache_objhead.h b/bin/varnishd/cache/cache_objhead.h
index 94ef81db5..162aaeb7f 100644
--- a/bin/varnishd/cache/cache_objhead.h
+++ b/bin/varnishd/cache/cache_objhead.h
@@ -40,6 +40,7 @@ struct objhead {
 	struct lock		mtx;
 	VTAILQ_HEAD(,objcore)	objcs;
 	uint8_t			digest[DIGEST_LEN];
+	unsigned		waitinglist_gen;
 	VTAILQ_HEAD(, req)	waitinglist;
 
 	/*----------------------------------------------------
diff --git a/bin/varnishd/cache/cache_req_fsm.c b/bin/varnishd/cache/cache_req_fsm.c
index 77481a41d..43cb18006 100644
--- a/bin/varnishd/cache/cache_req_fsm.c
+++ b/bin/varnishd/cache/cache_req_fsm.c
@@ -576,20 +576,23 @@ cnt_lookup(struct worker *wrk, struct req *req)
 {
 	struct objcore *oc, *busy;
 	enum lookup_e lr;
-	int had_objhead = 0;
+	int waitinglist_gen;
 
 	CHECK_OBJ_NOTNULL(wrk, WORKER_MAGIC);
 	CHECK_OBJ_NOTNULL(req, REQ_MAGIC);
-	AZ(req->objcore);
 	AZ(req->stale_oc);
 
 	AN(req->vcl);
 
 	VRY_Prep(req);
+	waitinglist_gen = req->waitinglist_gen;
+
+	if (req->waitinglist_gen) {
+		CHECK_OBJ_NOTNULL(req->objcore, OBJCORE_MAGIC);
+		req->waitinglist_gen = 0;
+	} else
+		AZ(req->objcore);
 
-	AZ(req->objcore);
-	if (req->hash_objhead)
-		had_objhead = 1;
 	wrk->strangelove = 0;
 	lr = HSH_Lookup(req, &oc, &busy);
 	if (lr == HSH_BUSY) {
@@ -605,7 +608,7 @@ cnt_lookup(struct worker *wrk, struct req *req)
 	if ((unsigned)wrk->strangelove >= cache_param->vary_notice)
 		VSLb(req->vsl, SLT_Notice, "vsl: High number of variants (%d)",
 		    wrk->strangelove);
-	if (had_objhead)
+	if (waitinglist_gen)
 		VSLb_ts_req(req, "Waitinglist", W_TIM_real(wrk));
 
 	if (req->vcf != NULL) {
diff --git a/bin/varnishd/cache/cache_vary.c b/bin/varnishd/cache/cache_vary.c
index c9c46231b..672a5b598 100644
--- a/bin/varnishd/cache/cache_vary.c
+++ b/bin/varnishd/cache/cache_vary.c
@@ -226,8 +226,7 @@ vry_cmp(const uint8_t *v1, const uint8_t *v2)
 void
 VRY_Prep(struct req *req)
 {
-	if (req->hash_objhead == NULL) {
-		/* Not a waiting list return */
+	if (req->waitinglist_gen == 0) {
 		AZ(req->vary_b);
 		AZ(req->vary_e);
 		(void)WS_ReserveAll(req->ws);
diff --git a/bin/varnishtest/tests/c00125.vtc b/bin/varnishtest/tests/c00125.vtc
new file mode 100644
index 000000000..8fbca4f46
--- /dev/null
+++ b/bin/varnishtest/tests/c00125.vtc
@@ -0,0 +1,156 @@
+varnishtest "successful expired waiting list hit"
+
+barrier b1 cond 2
+barrier b2 cond 2
+barrier b3 cond 2
+barrier b4 cond 2
+
+
+server s1 {
+	rxreq
+	expect req.http.user-agent == c1
+	expect req.http.bgfetch == false
+	barrier b1 sync
+	barrier b2 sync
+	txresp -hdr "Cache-Control: max-age=60" -hdr "Age: 120"
+
+	rxreq
+	expect req.http.user-agent == c3
+	expect req.http.bgfetch == true
+	txresp
+
+	# The no-cache case only works with a complicit VCL, for now.
+	rxreq
+	expect req.http.user-agent == c4
+	expect req.http.bgfetch == false
+	barrier b3 sync
+	barrier b4 sync
+	txresp -hdr "Cache-Control: no-cache"
+
+	rxreq
+	expect req.http.user-agent == c6
+	expect req.http.bgfetch == false
+	txresp -hdr "Cache-Control: no-cache"
+} -start
+
+varnish v1 -cliok "param.set default_grace 1h"
+varnish v1 -cliok "param.set thread_pools 1"
+varnish v1 -cliok "param.set debug +syncvsl,+waitinglist"
+varnish v1 -vcl+backend {
+	sub vcl_backend_fetch {
+		set bereq.http.bgfetch = bereq.is_bgfetch;
+	}
+	sub vcl_beresp_stale {
+		# We just validated a stale object, do not mark it as
+		# uncacheable. The object remains available for grace
+		# hits and background fetches.
+		return;
+	}
+	sub vcl_beresp_control {
+		if (beresp.http.cache-control == "no-cache") {
+			# Keep beresp.uncacheable clear.
+			return;
+		}
+	}
+	sub vcl_deliver {
+		set resp.http.obj-hits = obj.hits;
+		set resp.http.obj-ttl = obj.ttl;
+	}
+} -start
+
+client c1 {
+	txreq -url "/stale-hit"
+	rxresp
+	expect resp.status == 200
+	expect resp.http.x-varnish == 1001
+	expect resp.http.obj-hits == 0
+	expect resp.http.obj-ttl < 0
+} -start
+
+barrier b1 sync
+
+client c2 {
+	txreq -url "/stale-hit"
+	rxresp
+	expect resp.status == 200
+	expect resp.http.x-varnish == "1004 1002"
+	expect resp.http.obj-hits == 1
+	expect resp.http.obj-ttl < 0
+} -start
+
+varnish v1 -expect busy_sleep == 1
+barrier b2 sync
+
+client c1 -wait
+client c2 -wait
+
+varnish v1 -vsl_catchup
+
+varnish v1 -expect cache_miss == 1
+varnish v1 -expect cache_hit == 1
+varnish v1 -expect cache_hit_grace == 0
+varnish v1 -expect s_bgfetch == 0
+
+client c3 {
+	txreq -url "/stale-hit"
+	rxresp
+	expect resp.status == 200
+	expect resp.http.x-varnish == "1006 1002"
+	expect resp.http.obj-hits == 2
+	expect resp.http.obj-ttl < 0
+} -run
+
+varnish v1 -vsl_catchup
+
+varnish v1 -expect cache_miss == 1
+varnish v1 -expect cache_hit == 2
+varnish v1 -expect cache_hit_grace == 1
+varnish v1 -expect s_bgfetch == 1
+
+# The only way for a plain no-cache to be hit is to have a non-zero keep.
+varnish v1 -cliok "param.set default_ttl 0"
+varnish v1 -cliok "param.set default_grace 0"
+varnish v1 -cliok "param.set default_keep 1h"
+
+client c4 {
+	txreq -url "/no-cache-hit"
+	rxresp
+	expect resp.status == 200
+	expect resp.http.x-varnish == 1009
+	expect resp.http.obj-hits == 0
+	expect resp.http.obj-ttl <= 0
+} -start
+
+barrier b3 sync
+
+client c5 {
+	txreq -url "/no-cache-hit"
+	rxresp
+	expect resp.status == 200
+	expect resp.http.x-varnish == "1012 1010"
+	expect resp.http.obj-hits == 1
+	expect resp.http.obj-ttl <= 0
+} -start
+
+varnish v1 -expect busy_sleep == 2
+barrier b4 sync
+
+client c4 -wait
+client c5 -wait
+
+varnish v1 -vsl_catchup
+
+varnish v1 -expect cache_miss == 2
+varnish v1 -expect cache_hit == 3
+varnish v1 -expect cache_hit_grace == 1
+varnish v1 -expect s_bgfetch == 1
+
+# No hit when not on the waiting list
+client c6 {
+	txreq -url "/no-cache-hit"
+	rxresp
+	expect resp.status == 200
+	expect resp.http.x-varnish == 1014
+	expect resp.http.obj-hits == 0
+	expect resp.http.obj-ttl <= 0
+} -run
diff --git a/include/tbl/req_flags.h b/include/tbl/req_flags.h
index dde208d16..8d73e620e 100644
--- a/include/tbl/req_flags.h
+++ b/include/tbl/req_flags.h
@@ -37,7 +37,6 @@ REQ_FLAG(hash_ignore_busy,	1, 1, "")
 REQ_FLAG(hash_ignore_vary,	1, 1, "")
 REQ_FLAG(hash_always_miss,	1, 1, "")
 REQ_FLAG(is_hit,		0, 0, "")
-REQ_FLAG(waitinglist,		0, 0, "")
 REQ_FLAG(want100cont,		0, 0, "")
 REQ_FLAG(late100cont,		0, 0, "")
 REQ_FLAG(req_reset,		0, 0, "")


More information about the varnish-commit mailing list