Bug : Assert error in exp_timer() | Child not responding to ping, killing it.
Benjamin Sonntag
benjamin at octopuce.fr
Fri Dec 12 03:25:43 CET 2008
Hi all,
(first of all, I'll be glad to obtain a login/pass on the trac so that I
may create a ticket for this one if it became a real bug ;) and help
varnish community)
I guess I found a bug :) So please find below the informations I was
able to gather to start working on this issue :
We are using varnish 2.0.2 (debian package version from lenny) on this
machine (from dell) :
Linux cache1b 2.6.25-2-amd64 #1 SMP Fri Jun 27 14:47:16 UTC 2008 x86_64
GNU/Linux
- 2 physical processors (8 pipelines total) : Intel(R) Xeon(R)
CPU X5460 @ 3.16GHz
- 8* 4GB FB-DIMM (total 32GB RAM) (yes, I like this machine ;) )
I found a previous similar error last July in the mailing list, but
didn't know how to solve it :
we put those parameters high enough (I guess) :
cli_timeout 40 [seconds]
ping_interval 12 [seconds]
The main question is : how can I create a backtrace, how can I obtain a
core dump to create this backtrace ?
Thanks for your help, I will, of course, do my best to find a solution
for this issue.
Regards,
Benjamin Sonntag
Here is what Syslog said (the most important I guess) :
Dec 11 20:17:01 cache1b /USR/SBIN/CRON[30655]: (root) CMD ( cd / &&
run-parts --report /etc/cron.hourly)
Dec 11 20:17:09 cache1b varnishd[63966]: Child (42470) not responding to
ping, killing it.
Child (42470) died signal=6
Child (42470) Panic message: Assert error in exp_timer(),
cache_expire.c line 303: Condition(oe2->timer_when >= oe->timer_when)
not true. thread = (cache-timeout)
Child cleanup complete
child (30657) Started
Child (30657) said Closed fds: 4 5 6 10 11 13 14
Child (30657) said Child starts
Child (30657) said managed to mmap 68719476736 bytes of 68719476736
Child (30657) said Ready
Child (30657) said Probe("GET /search/C=?definition=homepage HTTP/1.1^M
Child (30657) said Host: 192.168.131.101^M
Child (30657) said Connection: close^M
Child (30657) said ^M
Child (30657) said ", 4, 1)
Child (30657) said Probe("GET /search/C=?definition=homepage HTTP/1.1^M
Child (30657) said Host: 192.168.131.102^M
Child (30657) said Connection: close^M
Child (30657) said ^M
Child (30657) said ", 4, 1)
Child (30657) said Probe("GET /search/C=?definition=homepage HTTP/1.1^M
Child (30657) said Host: 192.168.131.107^M
Child (30657) said Connection: close^M
Child (30657) said ^M
Child (30657) said ", 4, 1)
Dec 11 20:20:01 cache1b /USR/SBIN/CRON[31317]: (root) CMD (if [ -x
/etc/munin...
Here is the varnishlog extract (my server is quite busy, I had a hard
time finding this place, so awk|sort|uniq|grep was my friends ;) )
1137 StatSess c 192.168.131.8 43747 0 1 1 0 0 0 233 5
0 StatAddr - 192.168.131.8 0 285468 2281231 2281230 0 0 1873552
489132873 83107957357
1139 ReqStart c 192.168.131.7 51390 2047115919
1139 RxRequest c GET
1139 RxURL c /confidential/url/blabla&purge=1
1139 RxProtocol c HTTP/1.1
1139 RxHeader c Host: varnish:30000
1139 RxHeader c Accept: */*
1139 VCL_call c recv
1139 VCL_return c lookup
1139 VCL_call c hash
1139 VCL_return c hash
1139 VCL_call c miss
1139 VCL_return c fetch
1137 BackendOpen b be1b 192.168.131.30 35113 192.168.131.101 30000
1139 Backend c 1137 lb3 be1b
1137 TxRequest b GET
1137 TxURL b /confidential/url/blabla&purge=1
1137 TxProtocol b HTTP/1.1
1137 TxHeader b Host: varnish:30000
1137 TxHeader b Accept: */*
1137 TxHeader b X-Varnish: 2047115919
1137 TxHeader b X-Forwarded-For: 192.168.131.7
1157 SessionClose c remote closed
0 WorkThread - 0x43033cb0 start
0 CLI - Rd vcl.load boot ./vcl.1P9zoqAU.so
0 CLI - Wr 0 200 Loaded "./vcl.1P9zoqAU.so" as "boot"
0 CLI - Rd vcl.load test ./vcl.FANefPfn.so
0 WorkThread - 0x45037cb0 start
0 Backend_health - be2b Still sick 4--X-S-RH 1 3 8 0.021917 0.021917
HTTP/1.1 200 OK
0 Backend_health - be1b Still sick 4--X-S--- 0 3 8 0.000000 0.000000
0 Backend_health - be3b Still sick 4--X-S--- 0 3 8 0.000000 0.000000
0 WorkThread - 0x40c85cb0 start
0 CLI - Wr 0 200 Loaded "./vcl.FANefPfn.so" as "test"
0 CLI - Rd vcl.use test
0 CLI - Wr 0 200
0 CLI - Rd start
0 Debug - "Acceptor is epoll"
0 CLI - Wr 0 200
0 WorkThread - 0x4683acb0 start
0 WorkThread - 0x4703bcb0 start
0 WorkThread - 0x4783ccb0 start
0 WorkThread - 0x4803dcb0 start
11 SessionOpen c 192.168.131.9 49159 :30000
11 ReqStart c 192.168.131.9 49159 705356244
11 RxRequest c GET
11 RxURL c /confidential/url/blabla&ttl=120
11 RxProtocol c HTTP/1.1
11 RxHeader c Host: varnish:30000
11 RxHeader c Accept: */*
11 VCL_call c recv
11 VCL_return c lookup
11 VCL_call c hash
11 VCL_return c hash
11 VCL_call c miss
11 VCL_return c fetch
11 VCL_call c error
11 VCL_return c deliver
11 Length c 5
11 VCL_call c deliver
11 VCL_return c deliver
11 TxProtocol c HTTP/1.1
11 TxStatus c 503
11 TxResponse c Service Unavailable
11 TxHeader c Server: Varnish
11 TxHeader c Retry-After: 0
11 TxHeader c Content-Type: text/html; charset=utf-8
11 TxHeader c Content-Length: 5
11 TxHeader c Date: Thu, 11 Dec 2008 19:17:10 GMT
11 TxHeader c X-Varnish: 705356244
11 TxHeader c Age: 0
11 TxHeader c Via: 1.1 varnish
11 TxHeader c Connection: close
Then our VCL code (... don't know if it's good or bad syntax, but it
used to work)
backend be1b {
.host = "192.168.131.101";
.port = "30000";
.probe = {
.url = "/search/C=?definition=homepage";
.timeout = 4s;
.interval = 1s;
.window = 8;
.threshold = 3;
}
}
backend be2b {
.host = "192.168.131.102";
.port = "30000";
.probe = {
.url = "/search/C=?definition=homepage";
.timeout = 4s;
.interval = 1s;
.window = 8;
.threshold = 3;
}
}
backend be3b {
.host = "192.168.131.107";
.port = "30000";
.probe = {
.url = "/search/C=?definition=homepage";
.timeout = 4s;
.interval = 1s;
.window = 8;
.threshold = 3;
}
}
director lb3 round-robin {
{
.backend = be1b;
}
{
.backend = be2b;
}
{
.backend = be3b;
}
}
sub vcl_recv {
set req.backend = lb3;
if (req.url ~ "&nocache=1") {
set req.url = regsub(req.url,"&nocache=1","");
pass;
}
lookup;
}
sub vcl_hash {
set req.hash += req.url;
hash;
}
sub vcl_fetch {
if (obj.status != 200 && obj.status != 403 && obj.status != 404) {
restart;
}
if (obj.ttl < 3600s) {
set obj.ttl = 3600s;
}
# Si on a &ttl=nombre, on fixe la ttl a <nombre> heures
if (req.url ~ "&ttl=") {
if (req.url ~ "&ttl=001") { set obj.ttl=3600s; }
if (req.url ~ "&ttl=002") { set obj.ttl=7200s; }
if (req.url ~ "&ttl=003") { set obj.ttl=10800s; }
if (req.url ~ "&ttl=006") { set obj.ttl=21600s; }
if (req.url ~ "&ttl=009") { set obj.ttl=32400s; }
if (req.url ~ "&ttl=012") { set obj.ttl=43200s; }
if (req.url ~ "&ttl=015") { set obj.ttl=54000s; }
if (req.url ~ "&ttl=018") { set obj.ttl=64800s; }
if (req.url ~ "&ttl=021") { set obj.ttl=75600s; }
if (req.url ~ "&ttl=024") { set obj.ttl=86400s; }
if (req.url ~ "&ttl=096") { set obj.ttl=345600s; }
if (req.url ~ "&ttl=168") { set obj.ttl=604800s; }
if (req.url ~ "&ttl=672") { set obj.ttl=2419200s; }
set req.url = regsub(req.url,"&ttl=([0-9]+)","");
}
# Si on a &purge=1, on purge l'entree demandee du cache.
if (req.url ~ "&purge=1") {
set req.url = regsub(req.url,"&purge=1","");
purge_url(req.url);
}
}
sub vcl_error {
set obj.http.Content-Type = "text/html; charset=utf-8";
synthetic {"error"};
deliver;
}
More information about the varnish-dev
mailing list