r2386 - trunk/varnish-tools/fetcher

des at projects.linpro.no des at projects.linpro.no
Fri Jan 25 16:38:18 CET 2008


Author: des
Date: 2008-01-25 16:38:18 +0100 (Fri, 25 Jan 2008)
New Revision: 2386

Modified:
   trunk/varnish-tools/fetcher/fetcher.pl
Log:
Roundup of old uncommitted changes: Getopt::Long cleanup, IO::Multiplex
cleanup, statistics.  Also improve banning, and avoid // which is only
available in very recent Perl versions.


Modified: trunk/varnish-tools/fetcher/fetcher.pl
===================================================================
--- trunk/varnish-tools/fetcher/fetcher.pl	2008-01-24 06:30:20 UTC (rev 2385)
+++ trunk/varnish-tools/fetcher/fetcher.pl	2008-01-25 15:38:18 UTC (rev 2386)
@@ -3,6 +3,8 @@
 # Copyright (c) 2007 Linpro AS
 # All rights reserved.
 #
+# Author: Dag-Erling Smørgrav <des at linpro.no>
+#
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
 # are met:
@@ -28,14 +30,17 @@
 # $Id$
 #
 
+our $VERSION = '$Id$';
+
 package Varnish::Fetcher;
 
 use strict;
-use Getopt::Long;
+use Getopt::Long qw(:config bundling require_order auto_version);
 use IO::Handle;
 use IO::Multiplex;
 use LWP::UserAgent;
 use Socket;
+use Time::HiRes qw(gettimeofday tv_interval);
 use URI;
 
 our %BANNED;
@@ -50,8 +55,12 @@
 our $quiet = 0;
 our $random = 0;
 
-our $url_re =
-    qr/\b(?:href|src)=[\'\"]([^\'\"\?\#]+)(?:[\?\#][^\'\"]*)?[\'\"]/io;
+our $url_re = qr{
+    \b(?:href|src)=[\'\"]\s*
+	([^\'\"\?\#]+)		# capture URL
+	(?:[\?\#][^\'\"]*)?	# discard fragment / query
+	\s*[\'\"]
+    }iox;
 
 sub new($$) {
     my ($this, $mux, $fh) = @_;
@@ -94,7 +103,7 @@
 	    $0 = "[fetcher] checking $url";
 	    if ($resp->is_redirect()) {
 		$s->write("ban $url\n");
-		$url = $resp->header('Location') //
+		$url = $resp->header('Location') ||
 		    $resp->header('Content-Location');
 		$s->write("add $url\n")
 		    if $url;
@@ -124,8 +133,8 @@
     my ($child, $msg) = @_;
 
     die "child busy\n"
-	if $child->{'url'};
-    $child->{'mux'}->write($child->{'fh'}, "$msg\n");
+	if $$child{'url'};
+    $$child{'fh'}->write("$msg\n");
 }
 
 # Send a URL and mark the child as busy
@@ -133,14 +142,14 @@
     my ($child) = @_;
 
     die "child busy\n"
-	if $child->{'url'};
+	if $$child{'url'};
     return undef
 	unless (keys(%TODO));
     my $url = (keys(%TODO))[0];
+    $DONE{$url} = $TODO{$url};
     delete $TODO{$url};
-    $DONE{$url} = 1;
-    $child->{'url'} = $url;
-    $child->{'mux'}->write($child->{'fh'}, "$url\n");
+    $$child{'url'} = $url;
+    $$child{'fh'}->write("$url\n");
     ++$BUSY;
 }
 
@@ -149,11 +158,14 @@
     my ($child, $url) = @_;
 
     die "child not busy\n"
-	unless $child->{'url'};
-    my $uri = URI->new_abs($1, $child->{'url'});
+	unless $$child{'url'};
+    my $uri = URI->new_abs($1, $$child{'url'});
     $url = $uri->canonical;
     $BANNED{$url} = 1;
-    print(STDERR "Banned $url\n");
+    delete $TODO{$url};
+    delete $DONE{$url};
+    print(STDERR "Banned $url\n")
+	unless ($quiet > 2);
 }
 
 # Convert relative to absolute, check if valid, and add to list
@@ -161,12 +173,12 @@
     my ($child, $url) = @_;
 
     die "child not busy\n"
-	unless $child->{'url'};
-    my $uri = URI->new_abs($1, $child->{'url'});
+	unless $$child{'url'};
+    my $uri = URI->new_abs($1, $$child{'url'});
     $url = $uri->canonical;
     # XXX should cache child URI to avoid new() here
     if ($BANNED{$url} || $uri->scheme() ne 'http' ||
-	$uri->host_port() ne URI->new($child->{'url'})->host_port()) {
+	$uri->host_port() ne URI->new($$child{'url'})->host_port()) {
 	print(STDERR "Rejected $url\n")
 	    unless ($quiet > 0);
 	return;
@@ -232,27 +244,41 @@
 
     # main loop
     for (;;) {
+	my $t0 = [gettimeofday()];
+
+	# keep dispatching URLs until we're done
 	for (;;) {
 	    foreach my $child (values(%CHILD)) {
 		$child->send_url()
-		    unless $child->{'url'};
+		    unless $$child{'url'};
 	    }
+	    printf(STDERR " %d/%d \r", int(keys(%DONE)),
+		   int(keys(%DONE)) + int(keys(%TODO)))
+		unless ($quiet > 3);
 	    last unless $BUSY;
 	    $mux->loop();
 	}
+
+	# summarize
+	my $dt = tv_interval($t0, [gettimeofday()]);
+	my $count = int(keys(%DONE)) + int(keys(%BANNED));
+	printf(STDERR "retrieved %d documents in %.2f seconds - %.2f tps\n",
+	       $count, $dt, $count / $dt)
+	    unless ($quiet > 3);
+
 	last unless $continue;
 	foreach my $child (values(%CHILD)) {
 	    $child->send("no check");
 	}
+	%BANNED = ();
 	%TODO = %DONE;
 	%DONE = ();
-	print(STDERR "Starting over...\n");
     }
 
     # done
     foreach my $child (values(%CHILD)) {
 	$child->send("done");
-	$mux->close($$child{'fh'});
+	$$child{'fh'}->close();
     }
 }
 
@@ -269,11 +295,11 @@
 }
 
 MAIN:{
-    GetOptions("c|continue" => \$continue,
+    GetOptions("c|continue!" => \$continue,
 	       "d|delay=i" => \$delay,
 	       "j|jobs=i" => \$jobs,
 	       "q|quiet+" => \$quiet,
-	       "r|random" => \$random)
+	       "r|random!" => \$random)
 	or usage();
     $jobs > 0
 	or usage();




More information about the varnish-commit mailing list