Probes stop for minutes once backend is declared unhealthy?

Frank Farmer frank at huddler-inc.com
Wed Feb 16 22:31:20 CET 2011


I'm having an issue where my backends fall behind for a little while
(maybe 30 seconds or so), and then varnish stops probing entirely for
minutes at a time, even though the host has long since recovered.  I'm
near capacity, currently, so I can't afford to lose a backend for
minutes -- the extra traffic tends to back up my other backends, which
then also end up taken out of the pool for minutes, even though they
recover in seconds.

Is there anything I can do to control this interval?  I'd love to have
varnish never wait more than, say, 30 seconds between probes, even at
the worst of times.

Varnish version:

# varnishd -V
varnishd (varnish-2.1.3 SVN 5049:5055)
Copyright (c) 2006-2009 Linpro AS / Verdens Gang AS

Excerpt of Apache access_log showing probes received by one of my backends:

# ... NORMAL PROBE FREQUENCY
app005 - - - [16/Feb/2011:18:44:00 +0000] "GET /health.html HTTP/1.1"
200 24 "-" "-"
app005 - - - [16/Feb/2011:18:44:01 +0000] "GET /health.html HTTP/1.1"
200 24 "-" "-"
app005 - - - [16/Feb/2011:18:44:02 +0000] "GET /health.html HTTP/1.1"
200 24 "-" "-"
app005 - - - [16/Feb/2011:18:44:04 +0000] "GET /health.html HTTP/1.1"
200 24 "-" "-"
# PROBES STOP FOR 2 MINUTES
app005 - - - [16/Feb/2011:18:46:07 +0000] "GET /health.html HTTP/1.1"
200 24 "-" "-"
app005 - - - [16/Feb/2011:18:46:07 +0000] "GET /health.html HTTP/1.1"
200 24 "-" "-"
app005 - - - [16/Feb/2011:18:46:07 +0000] "GET /health.html HTTP/1.1"
200 24 "-" "-"
app005 - - - [16/Feb/2011:18:46:07 +0000] "GET /health.html HTTP/1.1"
200 24 "-" "-"
app005 - - - [16/Feb/2011:18:46:07 +0000] "GET /health.html HTTP/1.1"
200 24 "-" "-"
app005 - - - [16/Feb/2011:18:46:09 +0000] "GET /health.html HTTP/1.1"
200 24 "-" "-"
app005 - - - [16/Feb/2011:18:46:09 +0000] "GET /health.html HTTP/1.1"
200 24 "-" "-"
app005 - - - [16/Feb/2011:18:46:09 +0000] "GET /health.html HTTP/1.1"
200 24 "-" "-"
app005 - - - [16/Feb/2011:18:46:09 +0000] "GET /health.html HTTP/1.1"
200 24 "-" "-"
app005 - - - [16/Feb/2011:18:46:10 +0000] "GET /health.html HTTP/1.1"
200 24 "-" "-"
app005 - - - [16/Feb/2011:18:46:10 +0000] "GET /health.html HTTP/1.1"
200 24 "-" "-"
app005 - - - [16/Feb/2011:18:46:11 +0000] "GET /health.html HTTP/1.1"
200 24 "-" "-"
app005 - - - [16/Feb/2011:18:46:12 +0000] "GET /health.html HTTP/1.1"
200 24 "-" "-"
app005 - - - [16/Feb/2011:18:46:13 +0000] "GET /health.html HTTP/1.1"
200 24 "-" "-"
app005 - - - [16/Feb/2011:18:46:14 +0000] "GET /health.html HTTP/1.1"
200 24 "-" "-"
app005 - - - [16/Feb/2011:18:46:15 +0000] "GET /health.html HTTP/1.1"
200 24 "-" "-"
app005 - - - [16/Feb/2011:18:46:16 +0000] "GET /health.html HTTP/1.1"
200 24 "-" "-"
app005 - - - [16/Feb/2011:18:46:17 +0000] "GET /health.html HTTP/1.1"
200 24 "-" "-"
app005 - - - [16/Feb/2011:18:46:18 +0000] "GET /health.html HTTP/1.1"
200 24 "-" "-"
app005 - - - [16/Feb/2011:18:46:19 +0000] "GET /health.html HTTP/1.1"
200 24 "-" "-"
app005 - - - [16/Feb/2011:18:46:22 +0000] "GET /health.html HTTP/1.1"
200 24 "-" "-"
# PROBES STOP FOR 2 MINUTES
app005 - - - [16/Feb/2011:18:48:23 +0000] "GET /health.html HTTP/1.1"
200 24 "-" "-"
app005 - - - [16/Feb/2011:18:48:23 +0000] "GET /health.html HTTP/1.1"
200 24 "-" "-"
# ... SNIP SEVERAL MINUTES OF NORMAL PROBING
app005 - - - [16/Feb/2011:18:54:33 +0000] "GET /health.html HTTP/1.1"
200 24 "-" "-"
app005 - - - [16/Feb/2011:18:54:33 +0000] "GET /health.html HTTP/1.1"
200 24 "-" "-"
app005 - - - [16/Feb/2011:18:54:33 +0000] "GET /health.html HTTP/1.1"
200 24 "-" "-"
# PROBES STOP FOR 7 MINUTES
app005 - - - [16/Feb/2011:19:01:33 +0000] "GET /health.html HTTP/1.1"
200 24 "-" "-"
app005 - - - [16/Feb/2011:19:01:45 +0000] "GET /health.html HTTP/1.1"
200 24 "-" "-"
app005 - - - [16/Feb/2011:19:01:46 +0000] "GET /health.html HTTP/1.1"
200 24 "-" "-"
# ...PROBES OPERATE NORMALLY FOR HOURS...


Backend config:


# Backend app001: served from app001-private on port 8880, with a 2s
# limit on establishing a connection.
backend app001 {
  .host = "app001-private";
  .port = "8880";
  # Health check: request /health.html once per second, allowing each
  # probe up to 2s to complete.
  .probe = {
    .url = "/health.html";
    .timeout = 2s;
    .interval = 1s;
    # Health is judged over the 10 most recent probes; at least 8 of
    # them must have succeeded. No .initial is set for this backend.
    .window = 10;
    .threshold = 8;
  }
  .connect_timeout = 2s;
}

# Backend app002: served from app002-private on port 8880, with a 2s
# limit on establishing a connection.
backend app002 {
  .host = "app002-private";
  .port = "8880";
  # Health check: request /health.html once per second, allowing each
  # probe up to 2s to complete.
  .probe = {
    .url = "/health.html";
    .timeout = 2s;
    .interval = 1s;
    # Health is judged over the 10 most recent probes; at least 8 of
    # them must have succeeded. No .initial is set for this backend.
    .window = 10;
    .threshold = 8;
  }
  .connect_timeout = 2s;
}

# Backend app003: served from app003-private on port 8880, with a 2s
# limit on establishing a connection.
backend app003 {
  .host = "app003-private";
  .port = "8880";
  # Health check: request /health.html once per second, allowing each
  # probe up to 2s to complete.
  .probe = {
    .url = "/health.html";
    .timeout = 2s;
    .interval = 1s;
    # Health is judged over the 10 most recent probes; at least 8 of
    # them must have succeeded. No .initial is set for this backend.
    .window = 10;
    .threshold = 8;
  }
  .connect_timeout = 2s;
}

# Backend app005: served from app005-private on port 8880, with a 2s
# limit on establishing a connection. This is the backend whose access
# log is excerpted earlier in this message.
backend app005 {
  .host = "app005-private";
  .port = "8880";
  # Health check: request /health.html once per second, allowing each
  # probe up to 2s to complete.
  .probe = {
    .url = "/health.html";
    .timeout = 2s;
    .interval = 1s;
    # Health is judged over the 10 most recent probes; only 5 of them
    # need to have succeeded (app001-app003 require 8).
    .window = 10;
    .threshold = 5;
    # Count 10 probes as already good at startup; that is above the
    # threshold of 5, so the backend starts out healthy.
    .initial = 10;
  }
  .connect_timeout = 2s;
}

# Backend app006: served from app006-private on port 8880, with a 2s
# limit on establishing a connection.
backend app006 {
  .host = "app006-private";
  .port = "8880";
  # Health check: request /health.html once per second, allowing each
  # probe up to 2s to complete.
  .probe = {
    .url = "/health.html";
    .timeout = 2s;
    .interval = 1s;
    # Health is judged over the 10 most recent probes; only 5 of them
    # need to have succeeded (app001-app003 require 8).
    .window = 10;
    .threshold = 5;
    # Count 10 probes as already good at startup; that is above the
    # threshold of 5, so the backend starts out healthy.
    .initial = 10;
  }
  .connect_timeout = 2s;
}

# Random director over the five app backends. Each member's .weight sets
# its relative share of traffic; the weights here total 785, so app005
# and app006 (300 each) together account for roughly three quarters of
# the weight, while app001 (10) accounts for just over 1%.
director app_servers random {
	{
		.backend = app001;
		.weight = 10;
	}
	{
		.backend = app002;
		.weight = 100;
	}
	{
		.backend = app003;
		.weight = 75;
	}
	{
		.backend = app005;
		.weight = 300;
	}
	{
		.backend = app006;
		.weight = 300;
	}
}



More information about the varnish-misc mailing list