[master] 5ea8940c9 hash: Revert recent hash changes

Tue Mar 2 16:11:05 UTC 2021

commit 5ea8940c9d937697e8563609ac3a921e8755255f
Author: Dridi Boukelmoune <dridi.boukelmoune at gmail.com>
Date:   Tue Mar 2 16:02:48 2021 +0100

    hash: Revert recent hash changes
    
    This reverts the following commits:
    
    - e98e8e6497b9bbcca3e709a5ea0e094f2ec4b930.
      "Documentation updates for changed `vcl_hash{}` / `hash_data()`"
    - 001279ebd6ee83fafbc7fc1d1805bf2224361dd9.
      "Document proper design pattern for using hash_data() in vcl_recv,"
    - e36573e255af2aa1ed40fb367fcb5257ef7e0288.
      "Add a test-case for hash_data() in vcl_recv{}"
    - 03fe0cee1abc3741a7652ba2cf9c1bf822d65ab1.
      "Allow hash_data() in vcl_recv{}"
    - 4ebc3cfec133586cd8c4a715d8de18efb76402f1.
      "Make it possible to override the initial digest, and explain in"
    - d6ad52f5ff95daa3668fdad28b4c97e59b4c49d3
      "Change the way we calculate the hash key for the cache."
    
    Conflicts:
            doc/sphinx/reference/dp_vcl_recv_hash.rst
            doc/sphinx/reference/index.rst
    
    Concerns were raised regarding a change of the way we compute the hash
    key outside of the dot-zero release where we would expect such breaking
    changes (among other things, vmod_shard relies on hash stability).
    
    There is also no definite consensus of how to handle hashing from
    vcl_recv.

diff --git a/bin/varnishd/cache/cache_hash.c b/bin/varnishd/cache/cache_hash.c
index cc1e44c26..d94ec2ad5 100644
--- a/bin/varnishd/cache/cache_hash.c
+++ b/bin/varnishd/cache/cache_hash.c
@@ -183,6 +183,19 @@ HSH_DeleteObjHead(const struct worker *wrk, struct objhead *oh)
 	FREE_OBJ(oh);
 }
 
+void
+HSH_AddString(struct req *req, void *ctx, const char *str)
+{
+
+	CHECK_OBJ_NOTNULL(req, REQ_MAGIC);
+	AN(ctx);
+	if (str != NULL) {
+		VSHA256_Update(ctx, str, strlen(str));
+		VSLb(req->vsl, SLT_Hash, "%s", str);
+	} else
+		VSHA256_Update(ctx, &str, 1);
+}
+
 /*---------------------------------------------------------------------
  * This is a debugging hack to enable testing of boundary conditions
  * in the hash algorithm.
diff --git a/bin/varnishd/cache/cache_objhead.h b/bin/varnishd/cache/cache_objhead.h
index af1797462..bc1782379 100644
--- a/bin/varnishd/cache/cache_objhead.h
+++ b/bin/varnishd/cache/cache_objhead.h
@@ -71,6 +71,7 @@ int HSH_DerefObjCore(struct worker *, struct objcore **, int rushmax);
 
 enum lookup_e HSH_Lookup(struct req *, struct objcore **, struct objcore **);
 void HSH_Ref(struct objcore *o);
+void HSH_AddString(struct req *, void *ctx, const char *str);
 unsigned HSH_Purge(struct worker *, struct objhead *, vtim_real ttl_now,
     vtim_dur ttl, vtim_dur grace, vtim_dur keep);
 struct objcore *HSH_Private(const struct worker *wrk);
diff --git a/bin/varnishd/cache/cache_req_fsm.c b/bin/varnishd/cache/cache_req_fsm.c
index ea09689f6..1dc349b99 100644
--- a/bin/varnishd/cache/cache_req_fsm.c
+++ b/bin/varnishd/cache/cache_req_fsm.c
@@ -50,6 +50,7 @@
 #include "storage/storage.h"
 #include "common/heritage.h"
 #include "vcl.h"
+#include "vsha256.h"
 #include "vtim.h"
 
 #define REQ_STEPS \
@@ -76,14 +77,6 @@
 REQ_STEPS
 #undef REQ_STEP
 
-/*
- * In this specific context we use SHA256 only as a very good
- * hashing function.  That renders most of the normal concerns
- * about salting & seeding moot.  However, if for some reason
- * you want to salt your hashes, this is where you do it.
- */
-static const uint8_t initial_digest[DIGEST_LEN];
-
 /*--------------------------------------------------------------------
  * Handle "Expect:" and "Connection:" on incoming request
  */
@@ -898,6 +891,7 @@ static enum req_fsm_nxt v_matchproto_(req_state_f)
 cnt_recv(struct worker *wrk, struct req *req)
 {
 	unsigned recv_handling;
+	struct VSHA256Context sha256ctx;
 	const char *ci, *cp, *endpname;
 
 	CHECK_OBJ_NOTNULL(wrk, WORKER_MAGIC);
@@ -927,8 +921,6 @@ cnt_recv(struct worker *wrk, struct req *req)
 		return (REQ_FSM_DONE);
 	}
 
-	memcpy(req->digest, initial_digest, sizeof req->digest);
-
 	VCL_recv_method(req->vcl, wrk, req, NULL, NULL);
 
 	if (wrk->handling == VCL_RET_FAIL) {
@@ -970,13 +962,13 @@ cnt_recv(struct worker *wrk, struct req *req)
 		}
 	}
 
-	if (!memcmp(req->digest, initial_digest, sizeof req->digest)) {
-		VCL_hash_method(req->vcl, wrk, req, NULL, NULL);
-		if (wrk->handling == VCL_RET_FAIL)
-			recv_handling = wrk->handling;
-		else
-			assert(wrk->handling == VCL_RET_LOOKUP);
-	}
+	VSHA256_Init(&sha256ctx);
+	VCL_hash_method(req->vcl, wrk, req, NULL, &sha256ctx);
+	if (wrk->handling == VCL_RET_FAIL)
+		recv_handling = wrk->handling;
+	else
+		assert(wrk->handling == VCL_RET_LOOKUP);
+	VSHA256_Final(req->digest, &sha256ctx);
 
 	switch (recv_handling) {
 	case VCL_RET_VCL:
diff --git a/bin/varnishd/cache/cache_vrt.c b/bin/varnishd/cache/cache_vrt.c
index 6d323a22c..0a729c0e4 100644
--- a/bin/varnishd/cache/cache_vrt.c
+++ b/bin/varnishd/cache/cache_vrt.c
@@ -689,29 +689,19 @@ VRT_fail(VRT_CTX, const char *fmt, ...)
 VCL_VOID
 VRT_hashdata(VRT_CTX, VCL_STRANDS s)
 {
-	struct VSHA256Context sha256ctx;
 	int i;
 
 	CHECK_OBJ_NOTNULL(ctx, VRT_CTX_MAGIC);
 	CHECK_OBJ_NOTNULL(ctx->req, REQ_MAGIC);
-	AZ(ctx->specific);
-	VSHA256_Init(&sha256ctx);
-	VSHA256_Update(&sha256ctx, ctx->req->digest, sizeof ctx->req->digest);
+	AN(ctx->specific);
 	AN(s);
-	for (i = 0; i < s->n; i++) {
-		if (s->p[i] != NULL) {
-			VSHA256_Update(&sha256ctx, s->p[i], strlen(s->p[i]));
-			VSLb(ctx->req->vsl, SLT_Hash, "%s", s->p[i]);
-		} else {
-			VSHA256_Update(&sha256ctx, "", 1);
-		}
-	}
+	for (i = 0; i < s->n; i++)
+		HSH_AddString(ctx->req, ctx->specific, s->p[i]);
 	/*
 	 * Add a 'field-separator' to make it more difficult to
 	 * manipulate the hash.
 	 */
-	VSHA256_Update(&sha256ctx, "", 1);
-	VSHA256_Final(ctx->req->digest, &sha256ctx);
+	HSH_AddString(ctx->req, ctx->specific, NULL);
 }
 
 /*--------------------------------------------------------------------*/
diff --git a/bin/varnishtest/tests/b00051.vtc b/bin/varnishtest/tests/b00051.vtc
index 8119dc6b5..3681ad2b3 100644
--- a/bin/varnishtest/tests/b00051.vtc
+++ b/bin/varnishtest/tests/b00051.vtc
@@ -22,5 +22,5 @@ client c1 {
 	rxresp
 	expect resp.http.req_hash ~ "[[:xdigit:]]{64}"
 	expect resp.http.req_hash == resp.http.bereq_hash
-	expect resp.http.req_hash-sf == ":0jkH41nfmD0PRFsKpM1m7ucOApnxFc62B//mQWxOnmQ=:"
+	expect resp.http.req_hash-sf == ":3k0f0yRKtKt7akzkyNsTGSDOJAZOQowTwKWhu5+kIu0=:"
 } -run
diff --git a/bin/varnishtest/tests/b00076.vtc b/bin/varnishtest/tests/b00076.vtc
deleted file mode 100644
index 120339275..000000000
--- a/bin/varnishtest/tests/b00076.vtc
+++ /dev/null
@@ -1,67 +0,0 @@
-varnishtest "hash_data() in vcl_recv{}"
-
-server s1 {
-	rxreq
-	txresp -hdr "Same: One"
-} -start
-
-varnish v1 -vcl+backend {
-    sub vcl_recv {
-	if (req.url == "/2") {
-		hash_data(req.http.foo);
-	}
-    }
-    sub vcl_hash {
-	hash_data(req.url);
-	return (lookup);
-    }
-} -start
-
-varnish v1 -cliok "param.set vsl_mask +Hash"
-
-client c1 {
-	txreq -url /1
-	rxresp
-	expect resp.status == 200
-	expect resp.http.same == One
-	txreq -url /2 -hdr "foo: /1"
-	rxresp
-	expect resp.status == 200
-	expect resp.http.same == One
-} -run
-
-server s1 {
-	rxreq
-	txresp -hdr "Second: One"
-} -start
-
-varnish v1 -vcl+backend {
-    sub make_hash_key {
-        hash_data("Documented Design Pattern");
-        hash_data(req.url);
-    }
-    
-    sub vcl_hash {
-        call make_hash_key;
-        return (lookup);
-    }
- 
-    sub vcl_recv {
-        if (req.http.early) {
-            call make_hash_key;
-        }
-    }
-}
-
-client c1 {
-	txreq
-	rxresp
-	expect resp.status == 200
-	expect resp.http.second == One
-
-	txreq -hdr "early: yes"
-	rxresp
-	expect resp.status == 200
-	expect resp.http.second == One
-} -run
-
diff --git a/bin/varnishtest/tests/d00020.vtc b/bin/varnishtest/tests/d00020.vtc
index 9f70ff2d3..0c42285d7 100644
--- a/bin/varnishtest/tests/d00020.vtc
+++ b/bin/varnishtest/tests/d00020.vtc
@@ -218,9 +218,9 @@ client c1 {
 client c2 {
 	txreq -url /b/def
 	rxresp
-	expect resp.http.hash		== "2e03d4fee2722154a84036faa3e4b6851e003830eadbe0d2874e3df09c8efe55"
+	expect resp.http.hash		== "93d1c4ad76396c91dd97fa310f7f26445332662c89393dbeeb77fe49f9111ee4"
 	expect resp.http.by		== "HASH"
-	expect resp.http.key		-eq 0x2e03d4fe
+	expect resp.http.key		-eq 0x93d1c4ad
 	expect resp.http.alt		== 0
 	expect resp.http.warmup	== "-1.000"
 	expect resp.http.rampup	== "true"
@@ -228,9 +228,9 @@ client c2 {
 
 	txreq -url /b/hash
 	rxresp
-	expect resp.http.hash		== "49f33e9019891091d3dcf6edab6d9433b756678bdf5202dd41b8a667c89887b1"
+	expect resp.http.hash		== "e47da20ea4db49d4f22acdadc69f02f445002be520a2865cd3351272add62540"
 	expect resp.http.by		== "HASH"
-	expect resp.http.key		-eq 0x49f33e90
+	expect resp.http.key		-eq 0xe47da20e
 	expect resp.http.alt		== 1
 	expect resp.http.warmup	== "-1.000"
 	expect resp.http.rampup	== "true"
@@ -268,9 +268,9 @@ client c2 {
 client c3 {
 	txreq -url /b/c/hash/def
 	rxresp
-	expect resp.http.hash		== "dd70dcbbf385c398ee3b53a849f12d0d846fc21292349fb45d37b2d9d8eca25e"
+	expect resp.http.hash		== "df9a465f8a0455c334b24c1638d3adda0f6e64fbe759029ab83602e3b9138884"
 	expect resp.http.by		== "HASH"
-	expect resp.http.key		-eq 0xdd70dcbb
+	expect resp.http.key		-eq 0xdf9a465f
 	expect resp.http.alt		== 7
 	expect resp.http.warmup	== "-1.000"
 	expect resp.http.rampup	== "true"
@@ -278,9 +278,9 @@ client c3 {
 
 	txreq -url /b/c/hash/hash
 	rxresp
-	expect resp.http.hash		== "41d09b9877cd0ac0eab888359b0ad54f0bf41da0ac03dc1b8ae12aff18465a8d"
+	expect resp.http.hash		== "0eb35bc1fab5aad5902fd1bac86540bd13d43aa31c6c46f54e776b43392e66e6"
 	expect resp.http.by		== "HASH"
-	expect resp.http.key		-eq 0x41d09b98
+	expect resp.http.key		-eq 0x0eb35bc1
 	expect resp.http.alt		== 8
 	expect resp.http.warmup	== "-1.000"
 	expect resp.http.rampup	== "true"
@@ -288,9 +288,9 @@ client c3 {
 
 	txreq -url /b/c/hash/url
 	rxresp
-	expect resp.http.hash		== "dcac849e02b3322f5fd3dddf9b9f5fc26d295733e6f1c51b190dfb7239a56e28"
+	expect resp.http.hash		== "1eb67b701ea07151cac5bea1f11b6267b9de15a3ff83cec995590480cbc2c750"
 	expect resp.http.by		== "HASH"
-	expect resp.http.key		-eq 0xdcac849e
+	expect resp.http.key		-eq 0x1eb67b70
 	expect resp.http.alt		== 9
 	expect resp.http.warmup	== "0.500"
 	expect resp.http.rampup	== "true"
@@ -298,9 +298,9 @@ client c3 {
 
 	txreq -url /b/c/hash/key
 	rxresp
-	expect resp.http.hash		== "112393761506e85f0c700a5d669a48b54001c870eb2e8d95f4d2f6fdccbe80a3"
+	expect resp.http.hash		== "a11b617e21aa7db22b6205d7612002e595b1b00d8c11602017f65456a1be3a35"
 	expect resp.http.by		== "HASH"
-	expect resp.http.key		-eq 0x11239376
+	expect resp.http.key		-eq 0xa11b617e
 	expect resp.http.alt		== 10
 	expect resp.http.warmup	== "-1.000"
 	expect resp.http.rampup	== "false"
@@ -308,9 +308,9 @@ client c3 {
 
 	txreq -url /b/c/hash/blob
 	rxresp
-	expect resp.http.hash		== "5ef050c1185ac02a66d9f79703b8cd5f0636abf3b1f15b9f22e0fe64df985d28"
+	expect resp.http.hash		== "d7eecc0ac83e1727332dcd8c7c8ae9f3114123abb2bf7e3fb15ecea8c84bb239"
 	expect resp.http.by		== "HASH"
-	expect resp.http.key		-eq 0x5ef050c1
+	expect resp.http.key		-eq 0xd7eecc0a
 	expect resp.http.alt		== 11
 	expect resp.http.warmup	== "-1.000"
 	expect resp.http.rampup	== "true"
diff --git a/doc/changes.rst b/doc/changes.rst
index 146ec9f60..c11ba2cf5 100644
--- a/doc/changes.rst
+++ b/doc/changes.rst
@@ -35,15 +35,6 @@ release process.
 Varnish Cache Next (2021-03-15)
 ================================
 
-* `hash_data()` can be called from `vcl_recv`, in which case
-  `vcl_hash` is not called.  This allows `vcl_recv` and 
-  backends to take the object identity into account, for
-  instance when choosing backend and grace periods.
-
-* `hash_data()` calculates the hash-key differently than previously.
-  This means that persistent storage will be lost, and it may break
-  very specific `*.vtc` test-scripts.
-
 * counters MAIN.s_req_bodybytes and VBE.*.tools.beresp_bodybytes
   are now always the number of bodybytes moved on the wire.
 
diff --git a/doc/sphinx/reference/dp_vcl_recv_hash.rst b/doc/sphinx/reference/dp_vcl_recv_hash.rst
deleted file mode 100644
index e63607e1e..000000000
--- a/doc/sphinx/reference/dp_vcl_recv_hash.rst
+++ /dev/null
@@ -1,29 +0,0 @@
-..
-	Copyright (c) 2021 Varnish Software AS
-	SPDX-License-Identifier: BSD-2-Clause
-	See LICENSE file for full text of license
-
-.. _db_vcl_recv_hash:
-
-Hashing in `vcl_recv{}`
-=======================
-
-Calculating the `hash` used for cache lookup already in `vcl_recv{}`
-makes it possible for certain directors to offer targeted health status.
-
-To ensure consistent hashing, use this design pattern::
-
-    sub make_hash_key {
-        hash_data([…]);
-    }
-    
-    sub vcl_hash {
-        call make_hash_key;
-        return (lookup);
-    }
-
-    sub vcl_recv {
-        […]
-        call make_hash_key;
-        […]
-    }
diff --git a/doc/sphinx/reference/index.rst b/doc/sphinx/reference/index.rst
index 11fe4737c..c5c751019 100644
--- a/doc/sphinx/reference/index.rst
+++ b/doc/sphinx/reference/index.rst
@@ -27,7 +27,6 @@ VCL Design Patterns
 .. toctree::
 	:maxdepth: 1
 
-	dp_vcl_recv_hash.rst
 	dp_vcl_resp_status.rst
 
 Bundled VMODs
diff --git a/doc/sphinx/reference/vcl.rst b/doc/sphinx/reference/vcl.rst
index 02e2a64df..3dbae425b 100644
--- a/doc/sphinx/reference/vcl.rst
+++ b/doc/sphinx/reference/vcl.rst
@@ -355,9 +355,7 @@ hash_data(input)
 ~~~~~~~~~~~~~~~~
 
   Adds an input to the hash input. In the built-in VCL ``hash_data()``
-  is called on the host and URL of the request.
-  Available in ``vcl_hash`` or ``vcl_recv``.  If used in ``vcl_recv``
-  ``vcl_hash`` will not be called.
+  is called on the host and URL of the request. Available in ``vcl_hash``.
 
 synthetic(STRING)
 ~~~~~~~~~~~~~~~~~
diff --git a/doc/sphinx/users-guide/vcl-built-in-subs.rst b/doc/sphinx/users-guide/vcl-built-in-subs.rst
index 3c0990910..2f46d8b87 100644
--- a/doc/sphinx/users-guide/vcl-built-in-subs.rst
+++ b/doc/sphinx/users-guide/vcl-built-in-subs.rst
@@ -134,9 +134,8 @@ of the following keywords:
 vcl_hash
 ~~~~~~~~
 
-Called after `vcl_recv` to create a hash value for the request,
-unless `vcl_recv` already did that.
-This is used as the key to store and look up objects in the cache.
+Called after `vcl_recv` to create a hash value for the request. This is
+used as a key to look up the object in Varnish.
 
 The `vcl_hash` subroutine may terminate with calling ``return()`` with one
 of the following keywords:
diff --git a/doc/sphinx/users-guide/vcl-hashing.rst b/doc/sphinx/users-guide/vcl-hashing.rst
index 1f50a5e63..c605f5589 100644
--- a/doc/sphinx/users-guide/vcl-hashing.rst
+++ b/doc/sphinx/users-guide/vcl-hashing.rst
@@ -6,10 +6,12 @@
 Hashing
 -------
 
-Internally, when Varnish stores content in the cache indexed by a hash
-key used to find the object again. In the default setup
-this key is calculated based on `URL`, the `Host:` header, or
-if there is none, the IP address of the server::
+Internally, when Varnish stores content in the cache it stores the object
+together with a hash key to find the object again. In the default setup
+this key is calculated based on the content of the *Host* header or the
+IP address of the server and the URL.
+
+Behold the `default vcl`::
 
     sub vcl_hash {
         hash_data(req.url);
@@ -21,7 +23,7 @@ if there is none, the IP address of the server::
         return (lookup);
     }
 
-As you can see it first hashes `req.url` and then `req.http.host` if
+As you can see it first checks in `req.url` then `req.http.host` if
 it exists. It is worth pointing out that Varnish doesn't lowercase the
 hostname or the URL before hashing it so in theory having "Varnish.org/"
 and "varnish.org/" would result in different cache entries. Browsers
@@ -45,16 +47,7 @@ And then add a `vcl_hash`::
         hash_data(req.http.X-Country-Code);
     }
 
-Because there is no `return(lookup)`, the builtin VCL will take care
-of adding the URL, `Host:` or server IP# to the hash as usual.
-
-If `vcl_hash` did return, ie::
-
-    sub vcl_hash {
-        hash_data(req.http.X-Country-Code);
-        return(lookup);
-    }
-
-then *only* the country-code would matter, and Varnish would return
-seemingly random objects, ignoring the URL, (but they would always
-have the correct `X-Country-Code`).
+As the default VCL will take care of adding the host and URL to the hash
+we don't have to do anything else. Be careful calling ``return (lookup)``
+as this will abort the execution of the default VCL and Varnish can end
+up returning data based on more or less random inputs.
diff --git a/include/vrt.h b/include/vrt.h
index 70eb45179..01ca336c5 100644
--- a/include/vrt.h
+++ b/include/vrt.h
@@ -54,7 +54,6 @@
  * binary/load-time compatible, increment MAJOR version
  *
  * 13.0 (2021-03-15)
- *	VRT_hashdata() produces different hash-keys.
  *	Move VRT_synth_page() to deprecated status
  *	Add VRT_synth_strands() and VRT_synth_blob()
  *	struct vrt_type now produced by generate.py
diff --git a/lib/libvcc/vcc_action.c b/lib/libvcc/vcc_action.c
index 57a2f6daf..60fbdfa7b 100644
--- a/lib/libvcc/vcc_action.c
+++ b/lib/libvcc/vcc_action.c
@@ -452,7 +452,7 @@ vcc_Action_Init(struct vcc *tl)
 	ACT(ban,	vcc_act_ban,	0);
 	ACT(call,	vcc_act_call,	0);
 	ACT(hash_data,	vcc_act_hash_data,
-		VCL_MET_RECV | VCL_MET_HASH);
+		VCL_MET_HASH);
 	ACT(if,		vcc_Act_If,	0);
 	ACT(new,	vcc_Act_New,
 		VCL_MET_INIT);