r2374 - trunk/varnish-tools/fetcher

des at projects.linpro.no des at projects.linpro.no
Wed Jan 23 14:45:48 CET 2008


Author: des
Date: 2008-01-23 14:45:48 +0100 (Wed, 23 Jan 2008)
New Revision: 2374

Modified:
   trunk/varnish-tools/fetcher/fetcher.pl
Log:
Add -c and -r options:

- If the former is specified, fetcher will go into a loop after having
  traversed the entire tree, and continuously re-fetch all known URLs.

- The latter is not yet implented, but the idea is to assign a random
  probability to each URL based on an inverse-exponential (or similar)
  distribution, and re-fetch URLs at random according to this frequency.
  This will help simulate a "short head long tail" scenario.

Some restructuring.

Add a comment about a possible improvement which will help work around
bugs in certain commonly used data sets (e.g. the Apache httpd manual)
which can result in an infinite set of URLs (which in reality map to
a fairly large but finite set of pages)


Modified: trunk/varnish-tools/fetcher/fetcher.pl
===================================================================
--- trunk/varnish-tools/fetcher/fetcher.pl	2008-01-23 10:20:12 UTC (rev 2373)
+++ trunk/varnish-tools/fetcher/fetcher.pl	2008-01-23 13:45:48 UTC (rev 2374)
@@ -43,9 +43,15 @@
 our %CHILD;
 our $BUSY;
 
+our $continue = 0;
+our $delay = 0;
 our $jobs = 1;
-our $delay = 0;
+our $quiet = 0;
+our $random = 0;
 
+our $url_re =
+    qr/\b(?:href|src)=[\'\"]([^\'\"\?\#]+)(?:[\?\#][^\'\"]*)?[\'\"]/io;
+
 sub new($$) {
     my ($this, $mux, $fh) = @_;
     my $class = ref($this) || $this;
@@ -57,27 +63,40 @@
     };
 }
 
+# Child
 sub run($$) {
     my ($self, $s) = @_;
 
-    my $ua = new LWP::UserAgent();
+    my $check = 1;
+    my $ua = new LWP::UserAgent('keep_alive' => 3);
     for (;;) {
 	$0 = "[fetcher] idle";
 	my $url = <$s>;
 	exit(0)
 	    unless defined($url);
 	chomp($url);
-	die "no more work\n"
-	    if $url eq "done";
+	if ($url eq "done") {
+	    last;
+	} elsif ($url eq "check") {
+	    $check = 1;
+	    next;
+	} elsif ($url eq "no check") {
+	    $check = 0;
+	    next;
+	}
 	$0 = "[fetcher] requesting $url";
-	print STDERR "Retrieving $url\n";
+	print STDERR "Retrieving $url\n"
+	    unless ($quiet > 1);
 	my $resp = $ua->get($url);
-	$0 = "[fetcher] checking $url";
-	if ($resp->header('Content-Type') =~ m/^text\//) {
-	    my %urls = map { $_ => 1 }
-		($resp->content =~ m/\b(?:href|src)=[\'\"]([^\'\"\?\#]+)(?:[\?\#][^\'\"]*)?[\'\"]/g);
-	    foreach (keys(%urls)) {
-		$s->write("add $_\n");
+	if ($check) {
+	    $0 = "[fetcher] checking $url";
+	    # XXX if we got a redirect, we should blacklist the
+	    # original URL and suggest the target URL instead
+	    if ($resp->header('Content-Type') =~ m/^text\//) {
+		my %urls = map { $_ => 1 } ($resp->content =~ m/$url_re/g);
+		foreach (keys(%urls)) {
+		    $s->write("add $_\n");
+		}
 	    }
 	}
 	select(undef, undef, undef, $delay)
@@ -87,6 +106,16 @@
     }
 }
 
+# Send a command for which we don't expect a response
+sub send($) {
+    my ($child, $msg) = @_;
+
+    die "child busy\n"
+	if $child->{'url'};
+    $child->{'mux'}->write($child->{'fh'}, "$msg\n");
+}
+
+# Send a URL and mark the child as busy
 sub send_url($) {
     my ($child) = @_;
 
@@ -102,6 +131,7 @@
     ++$BUSY;
 }
 
+# Convert relative to absolute, check if valid, and add to list
 sub get_url($$) {
     my ($child, $url) = @_;
 
@@ -111,13 +141,15 @@
     $url = $uri->canonical;
     if ($uri->scheme() ne 'http' ||
 	$uri->host_port() ne URI->new($child->{'url'})->host_port()) {
-	print STDERR "Rejected $url\n";
+	print STDERR "Rejected $url\n"
+	    unless ($quiet > 0);
 	return;
     }
     return if $TODO{$url} || $DONE{$url};
     $TODO{$url} = 1;
 }
 
+# Called when mux gets data from a client
 sub mux_input($$$$) {
     my ($child, $mux, $fh, $input) = @_;
 
@@ -172,32 +204,53 @@
 
     # main loop
     for (;;) {
+	for (;;) {
+	    foreach my $child (values(%CHILD)) {
+		$child->send_url()
+		    unless $child->{'url'};
+	    }
+	    last unless $BUSY;
+	    $mux->loop();
+	}
+	last unless $continue;
 	foreach my $child (values(%CHILD)) {
-	    $child->send_url()
-		unless $child->{'url'};
+	    $child->send("no check");
 	}
-	last unless $BUSY;
-	$mux->loop();
+	%TODO = %DONE;
+	%DONE = ();
+	print STDERR "Starting over...\n";
     }
 
     # done
     foreach my $child (values(%CHILD)) {
+	$child->send("done");
 	$mux->close($$child{'fh'});
     }
 }
 
+sub refetch() {
+
+    # Recycle valid URLs from initial run
+    %TODO = %DONE;
+}
+
 sub usage() {
 
-    print STDERR "usage: $0 [-d n] [-j n] URL ...\n";
+    print STDERR "usage: $0 [-cqr] [-d n] [-j n] URL ...\n";
     exit(1);
 }
 
 MAIN:{
-    GetOptions("j|jobs=i" => \$jobs,
-	       "d|delay=i" => \$delay)
+    GetOptions("c|continue" => \$continue,
+	       "d|delay=i" => \$delay,
+	       "j|jobs=i" => \$jobs,
+	       "q|quiet+" => \$quiet,
+	       "r|random" => \$random)
 	or usage();
     $jobs > 0
 	or usage();
+    $random
+	and die "-r is not yet implemented\n";
     @ARGV
 	or usage();
     fetcher(@ARGV);




More information about the varnish-commit mailing list