[master] f8d611fb5 cache_ban: Add BANIDX facility to speed up BAN loading
Nils Goroll
nils.goroll at uplex.de
Tue Apr 15 06:36:05 UTC 2025
commit f8d611fb54f936006faefba07d54b20300ead611
Author: Nils Goroll <nils.goroll at uplex.de>
Date: Wed Feb 5 19:57:07 2025 +0100
cache_ban: Add BANIDX facility to speed up BAN loading
When loading a cache with a relevant number of BANs from a persistent storage,
BAN_FindBan() needs to called for every object to determine the struct ban which
the in-core object is to point to based on the persisted ban time.
This is highly inefficient because BAN_FindBAN() conducts a linear search of the
ban list, for which we would expect about half the bans to need inspection if
objects were distributed equally across all bans. But for a long BAN list, most
objects will hang near the bottom, so chances are high that we actually need to
traverse nearly _all_ of the ban list for each loaded object.
So this screams for turning the ban list into a different data structure, like a
binary tree.
Yet once loading of objects is complete, the ban list as we have it is optimal:
All operations on the ban list need to traverse it anyway and in many cases this
traversal can happen unlocked. So we really would not want to change the
structure of the ban list.
This patch suggests to add a search tree as an _additional_ data structure,
which only exists during cache load (while there are ban_holds). In particular,
the interface is made such that the only relevant change to the ban code is to
provide a hint for the linear search to start at: VTAILQ_FOREACH(ban, ...) gets
turned into
ban = BANIDX_lookup(timestamp);
VTAILQ_FOREACH_FROM(ban, ...);
and that's it.
In particular, this patch implies no change at all to setups which do not use
persistent storage, so the risk is considered very low.
The speed up is impressive:
The following tests were conducted with a SLASH/fellow storage with 7689 bans.
For each test round, the storage (residing on a ZFS volume) was rolled back to
the same state, basically like so:
zfs rollback int21/slash/10g at reproduce
varnishd ... -sfellow=fellow,/dev/zvol/int21/slash/10g,10GB,1g,32k ...
When the cache was loaded, varnishd was terminated and the test repeated.
Six runs were conducted with Varnish-Cache master as of
749a2c3fcb417563fe3c1e076f6c78349e869aa1, resulting in load times in the order
of 21 seconds:
0 Storage - fellow fellow: 289627 resurrected in 28.477966s (10235.386784/s), 1856 already expired
0 Storage - fellow fellow: 289627 resurrected in 26.170669s (11137.774192/s), 1856 already expired
0 Storage - fellow fellow: 289627 resurrected in 24.331572s (11979.620642/s), 1856 already expired
0 Storage - fellow fellow: 289626 resurrected in 21.455001s (13585.783644/s), 1857 already expired
0 Storage - fellow fellow: 289626 resurrected in 21.340564s (13658.635888/s), 1857 already expired
0 Storage - fellow fellow: 289626 resurrected in 21.568622s (13514.215014/s), 1857 already expired
(note: the change of the number of cache objects is because wall clock time has
passed the expiry time of one object)
With this patch, load times were reduced by a factor >15
0 Storage - fellow fellow: 289625 resurrected in 1.293932s (225269.103288/s), 1858 already expired
0 Storage - fellow fellow: 289625 resurrected in 1.416040s (205843.804318/s), 1858 already expired
0 Storage - fellow fellow: 289625 resurrected in 1.341485s (217283.788555/s), 1858 already expired
Naturally, the benefits are even more relevant with a higher number of bans.
diff --git a/bin/varnishd/Makefile.am b/bin/varnishd/Makefile.am
index ae4414b52..0d60f1b18 100644
--- a/bin/varnishd/Makefile.am
+++ b/bin/varnishd/Makefile.am
@@ -20,6 +20,7 @@ varnishd_SOURCES = \
cache/cache_backend_probe.c \
cache/cache_ban.c \
cache/cache_ban_build.c \
+ cache/cache_ban_idx.c \
cache/cache_ban_lurker.c \
cache/cache_busyobj.c \
cache/cache_cli.c \
diff --git a/bin/varnishd/cache/cache_ban.c b/bin/varnishd/cache/cache_ban.c
index 04468a547..77d7ed96f 100644
--- a/bin/varnishd/cache/cache_ban.c
+++ b/bin/varnishd/cache/cache_ban.c
@@ -124,8 +124,10 @@ BAN_Release(void)
assert(ban_holds > 0);
ban_holds--;
Lck_Unlock(&ban_mtx);
- if (ban_holds == 0)
+ if (ban_holds == 0) {
+ BANIDX_fini();
WRK_BgThread(&ban_thread, "ban-lurker", ban_lurker, NULL);
+ }
}
/*--------------------------------------------------------------------
@@ -294,7 +296,8 @@ BAN_FindBan(vtim_real t0)
vtim_real t1;
assert(ban_holds > 0);
- VTAILQ_FOREACH(b, &ban_head, list) {
+ b = BANIDX_lookup(t0);
+ VTAILQ_FOREACH_FROM(b, &ban_head, list) {
t1 = ban_time(b->spec);
if (t1 == t0)
return (b);
@@ -392,11 +395,13 @@ ban_reload(const uint8_t *ban, unsigned len)
vtim_real t0, t1, t2 = 9e99;
ASSERT_CLI();
Lck_AssertHeld(&ban_mtx);
+ assert(ban_holds > 0);
t0 = ban_time(ban);
assert(len == ban_len(ban));
- VTAILQ_FOREACH(b, &ban_head, list) {
+ b = BANIDX_lookup(t0);
+ VTAILQ_FOREACH_FROM(b, &ban_head, list) {
t1 = ban_time(b->spec);
assert(t1 < t2);
t2 = t1;
diff --git a/bin/varnishd/cache/cache_ban.h b/bin/varnishd/cache/cache_ban.h
index 468c54747..e2bce0113 100644
--- a/bin/varnishd/cache/cache_ban.h
+++ b/bin/varnishd/cache/cache_ban.h
@@ -156,3 +156,7 @@ vtim_real ban_time(const uint8_t *banspec);
int ban_equal(const uint8_t *bs1, const uint8_t *bs2);
void BAN_Free(struct ban *b);
void ban_kick_lurker(void);
+
+// cache_ban_idx.c
+struct ban * BANIDX_lookup(vtim_real);
+void BANIDX_fini(void);
diff --git a/bin/varnishd/cache/cache_ban_idx.c b/bin/varnishd/cache/cache_ban_idx.c
new file mode 100644
index 000000000..11a5886a0
--- /dev/null
+++ b/bin/varnishd/cache/cache_ban_idx.c
@@ -0,0 +1,135 @@
+/*-
+ * Copyright 2025 UPLEX - Nils Goroll Systemoptimierung
+ * All rights reserved.
+ *
+ * Author: Nils Goroll <nils.goroll at uplex.de>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * This code creates an index on top of the ban list to speed up lookup during
+ * cache load (while ban_holds > 0). Its only assumption is that bans do not
+ * vanish (as guaranteed by ban_holds), and it does not change any functions
+ * called _after_ cache reload by constructing the additional index during
+ * lookup only.
+ *
+ * The lookup function returns a ban for VTAILQ_FOREACH_FROM() to start from,
+ * such that changes to the ban code remain minimal.
+ *
+ */
+
+#include "config.h"
+
+#include <stdlib.h>
+
+#include "cache_varnishd.h"
+#include "cache_ban.h"
+#include "cache_objhead.h"
+
+#include "vtree.h"
+
+struct metaban {
+ unsigned magic;
+#define BANIDX_MAGIC 0x39b799f8
+ VRBT_ENTRY(metaban) tree;
+ // duplicate the ban time for efficiency
+ vtim_real time;
+ struct ban *ban;
+};
+
+static inline int
+metaban_cmp(const struct metaban *i1, struct metaban *i2)
+{
+ if (i1->time < i2->time)
+ return (-1);
+ if (i1->time > i2->time)
+ return (1);
+ return (0);
+}
+
+VRBT_HEAD(banidx_s, metaban);
+VRBT_GENERATE_REMOVE_COLOR(banidx_s, metaban, tree, static)
+VRBT_GENERATE_REMOVE(banidx_s, metaban, tree, static)
+VRBT_GENERATE_NFIND(banidx_s, metaban, tree, metaban_cmp, static)
+VRBT_GENERATE_INSERT_COLOR(banidx_s, metaban, tree, static)
+VRBT_GENERATE_INSERT_FINISH(banidx_s, metaban, tree, static)
+VRBT_GENERATE_INSERT(banidx_s, metaban, tree, metaban_cmp, static)
+VRBT_GENERATE_NEXT(banidx_s, metaban, tree, static)
+VRBT_GENERATE_MINMAX(banidx_s, metaban, tree, static)
+
+static struct banidx_s banidx = VRBT_INITIALIZER(banidx);
+static pthread_mutex_t banidxmtx = PTHREAD_MUTEX_INITIALIZER;
+
+struct ban *
+BANIDX_lookup(vtim_real t0)
+{
+ struct metaban *m, needle = {0, .time = t0};
+ struct ban *best = NULL, *b = NULL;
+ vtim_real t1;
+
+ PTOK(pthread_mutex_lock(&banidxmtx));
+ m = VRBT_NFIND(banidx_s, &banidx, &needle);
+ if (m != NULL && ! (m->time > t0)) {
+ PTOK(pthread_mutex_unlock(&banidxmtx));
+ return (m->ban);
+ }
+ /*
+ * if we have m, it is later than t0, which is higher up the list.
+ * check if there is a better match and create missing elements
+ * along the way
+ * if VRBT_NFIND did not return anything, it means it has no index for
+ * elements higher up the list and we can index from the top (implicit
+ * in VTAILQ_FOREACH_FROM())
+ */
+ if (m != NULL) {
+ best = m->ban;
+ b = VTAILQ_NEXT(best, list);
+ if (b == NULL) {
+ PTOK(pthread_mutex_unlock(&banidxmtx));
+ return (best);
+ }
+ }
+ VTAILQ_FOREACH_FROM(b, &ban_head, list) {
+ t1 = ban_time(b->spec);
+ if (t1 < t0)
+ break;
+ ALLOC_OBJ(m, BANIDX_MAGIC);
+ m->time = t1;
+ m->ban = b;
+ AZ(VRBT_INSERT(banidx_s, &banidx, m));
+ best = b;
+ }
+ PTOK(pthread_mutex_unlock(&banidxmtx));
+ return (best);
+}
+
+void
+BANIDX_fini(void)
+{
+ struct metaban *m, *mm;
+
+ VRBT_FOREACH_SAFE(m, banidx_s, &banidx, mm) {
+ VRBT_REMOVE(banidx_s, &banidx, m);
+ FREE_OBJ(m);
+ }
+}
More information about the varnish-commit
mailing list