r2599 - trunk/varnish-tools/fetcher

des at projects.linpro.no des at projects.linpro.no
Wed Mar 12 16:11:00 CET 2008


Author: des
Date: 2008-03-12 16:11:00 +0100 (Wed, 12 Mar 2008)
New Revision: 2599

Modified:
   trunk/varnish-tools/fetcher/fetcher.pl
Log:
Refactor, implement random mode.


Modified: trunk/varnish-tools/fetcher/fetcher.pl
===================================================================
--- trunk/varnish-tools/fetcher/fetcher.pl	2008-03-12 14:07:08 UTC (rev 2598)
+++ trunk/varnish-tools/fetcher/fetcher.pl	2008-03-12 15:11:00 UTC (rev 2599)
@@ -43,11 +43,12 @@
 use Time::HiRes qw(gettimeofday tv_interval);
 use URI;
 
+our %URLS;
 our %BANNED;
-our %TODO;
-our %DONE;
+our @TODO;
 our %CHILD;
 our $BUSY;
+our $DONE;
 
 our $continue = 0;
 our $delay = 0;
@@ -144,10 +145,8 @@
     die "child busy\n"
 	if $$child{'url'};
     return undef
-	unless (keys(%TODO));
-    my $url = (keys(%TODO))[0];
-    $DONE{$url} = $TODO{$url};
-    delete $TODO{$url};
+	unless (@TODO);
+    my $url = shift(@TODO);
     $$child{'url'} = $url;
     $$child{'fh'}->write("$url\n");
     ++$BUSY;
@@ -162,8 +161,7 @@
     my $uri = URI->new_abs($1, $$child{'url'});
     $url = $uri->canonical;
     $BANNED{$url} = 1;
-    delete $TODO{$url};
-    delete $DONE{$url};
+    delete $URLS{$url};
     print(STDERR "Banned $url\n")
 	unless ($quiet > 2);
 }
@@ -183,8 +181,9 @@
 	    unless ($quiet > 0);
 	return;
     }
-    return if $TODO{$url} || $DONE{$url};
-    $TODO{$url} = 1;
+    return if $URLS{$url};
+    $URLS{$url} = 1;
+    push(@TODO, $url);
 }
 
 # Called when mux gets data from a client
@@ -199,6 +198,7 @@
 	if ($line eq "ready") {
 	    $$child{'url'} = '';
 	    --$BUSY;
+	    ++$DONE;
 	    $mux->endloop();
 	} elsif ($line =~ m/^add (.*?)$/) {
 	    get_url($child, $1);
@@ -210,16 +210,10 @@
     }
 }
 
-sub fetcher(@) {
-    my (@urls) = @_;
+my $mux = new IO::Multiplex;
 
-    my $mux = new IO::Multiplex;
+sub breed() {
 
-    # prepare work queue
-    foreach my $url (@urls) {
-	$TODO{URI->new($url)->canonical} = 1;
-    }
-
     # start children
     $BUSY = 0;
     for (my $i = 0; $i < $jobs; ++$i) {
@@ -241,51 +235,115 @@
 	    $mux->set_callback_object($child, $s1);
 	}
     }
+}
 
-    # main loop
+sub infanticide() {
+
+    foreach my $child (values(%CHILD)) {
+	$child->send("done");
+	$$child{'fh'}->close();
+    }
+}
+
+sub harvest(@) {
+    my (@urls) = @_;
+
+    # prepare work queue
+    foreach my $url (@urls) {
+	push(@TODO, URI->new($url)->canonical);
+    }
+
+    $DONE = 0;
     for (;;) {
-	my $t0 = [gettimeofday()];
+	foreach my $child (values(%CHILD)) {
+	    $child->send_url()
+		unless $$child{'url'};
+	}
+	printf(STDERR " %d/%d \r",
+	       int(keys(%URLS)) - @TODO, int(keys(%URLS)))
+	    unless ($quiet > 3);
+	last unless $BUSY;
+	$mux->loop();
+    }
+}
 
-	# keep dispatching URLs until we're done
-	for (;;) {
+sub summarize($$$) {
+    my ($count, $t0, $t1) = @_;
+
+    my $dt = tv_interval($t0, $t1);
+    printf(STDERR "retrieved %d documents in %.2f seconds - %.2f tps\n",
+	   $count, $dt, $count / $dt)
+	unless ($quiet > 3);
+}
+
+sub fetch_random() {
+
+    my $t0 = [gettimeofday()];
+    my @urls = keys(%URLS);
+    @TODO = @urls;
+    $DONE = 0;
+
+    while (@TODO) {
+	foreach my $child (values(%CHILD)) {
+	    $child->send_url()
+		unless $$child{'url'};
+	    push(@TODO, $urls[rand(@urls)]);
+	}
+	$mux->loop();
+	printf(STDERR " %d \r", $DONE)
+	    unless ($quiet > 3);
+	if ($DONE > 0 && ($DONE % 1000) == 0) {
+	    my $t1 = [gettimeofday()];
+	    summarize(1000, $t0, $t1);
+	    $t0 = $t1;
+	}
+    }
+}
+
+sub fetch_sequential() {
+
+    my $t0 = [gettimeofday()];
+    for (;;) {
+	@TODO = keys(%URLS);
+	$DONE = 0;
+
+	while (@TODO) {
 	    foreach my $child (values(%CHILD)) {
 		$child->send_url()
 		    unless $$child{'url'};
 	    }
-	    printf(STDERR " %d/%d \r", int(keys(%DONE)),
-		   int(keys(%DONE)) + int(keys(%TODO)))
+	    printf(STDERR " %d/%d \r", $DONE, int(keys(%URLS)))
 		unless ($quiet > 3);
 	    last unless $BUSY;
 	    $mux->loop();
 	}
+	my $t1 = [gettimeofday()];
+	summarize(int(keys(%URLS)), $t0, $t1);
+	$t0 = $t1;
+    }
+}
 
-	# summarize
-	my $dt = tv_interval($t0, [gettimeofday()]);
-	my $count = int(keys(%DONE)) + int(keys(%BANNED));
-	printf(STDERR "retrieved %d documents in %.2f seconds - %.2f tps\n",
-	       $count, $dt, $count / $dt)
-	    unless ($quiet > 3);
+sub fetcher(@) {
+    my (@urls) = @_;
 
-	last unless $continue;
-	foreach my $child (values(%CHILD)) {
-	    $child->send("no check");
-	}
-	%BANNED = ();
-	%TODO = %DONE;
-	%DONE = ();
-    }
+    breed();
 
-    # done
+    my $t0 = [gettimeofday()];
+    harvest(@urls);
+    my $t1 = [gettimeofday()];
+    summarize(int(keys(%URLS)), $t0, $t1);
+
     foreach my $child (values(%CHILD)) {
-	$child->send("done");
-	$$child{'fh'}->close();
+	$child->send("no check");
     }
-}
 
-sub refetch() {
+    if ($random) {
+	fetch_random();
+    } elsif ($continue) {
+	fetch_sequential();
+    }
 
-    # Recycle valid URLs from initial run
-    %TODO = %DONE;
+    infanticide();
 }
 
 sub usage() {
@@ -303,8 +361,6 @@
 	or usage();
     $jobs > 0
 	or usage();
-    $random
-	and die "-r is not yet implemented\n";
     @ARGV
 	or usage();
     fetcher(@ARGV);




More information about the varnish-commit mailing list