r4031 - trunk/varnish-tools/fetcher

des at projects.linpro.no des at projects.linpro.no
Sun Apr 12 19:13:06 CEST 2009


Author: des
Date: 2009-04-12 19:13:05 +0200 (Sun, 12 Apr 2009)
New Revision: 4031

Modified:
   trunk/varnish-tools/fetcher/fetcher.pl
Log:
Add support for saving retrieved documents to disk.


Modified: trunk/varnish-tools/fetcher/fetcher.pl
===================================================================
--- trunk/varnish-tools/fetcher/fetcher.pl	2009-04-06 12:39:13 UTC (rev 4030)
+++ trunk/varnish-tools/fetcher/fetcher.pl	2009-04-12 17:13:05 UTC (rev 4031)
@@ -1,6 +1,7 @@
 #!/usr/bin/perl -w
 #-
 # Copyright (c) 2007-2009 Linpro AS
+# Copyright (c) 2009 Dag-Erling Coïdan Smørgrav
 # All rights reserved.
 #
 # Author: Dag-Erling Smørgrav <des at des.no>
@@ -35,6 +36,8 @@
 package Varnish::Fetcher;
 
 use strict;
+use File::Basename;
+use File::Path;
 use Getopt::Long qw(:config bundling require_order auto_version);
 use IO::Handle;
 use IO::Multiplex;
@@ -43,7 +46,7 @@
 use Time::HiRes qw(gettimeofday tv_interval);
 use URI;
 
-our %URLS;
+our %URI;
 our %BANNED;
 our @TODO;
 our %CHILD;
@@ -55,6 +58,7 @@
 our $jobs = 1;
 our $quiet = 0;
 our $random = 0;
+our $save = 0;
 
 our $url_re = qr{
     \b(?:href|src)=[\'\"]\s*
@@ -71,9 +75,22 @@
 	'mux' => $mux,
 	'fh' => $fh,
 	'url' => undef,
+	'uri' => undef,
     };
 }
 
+sub info(@) {
+
+    print(STDOUT "[$$] ", join(' ', @_), "\n")
+	unless ($quiet);
+}
+
+sub error(@) {
+
+    print(STDERR "[$$] ", join(' ', @_), "\n")
+	unless ($quiet);
+}
+
 # Child
 sub run($$) {
     my ($self, $s) = @_;
@@ -97,8 +114,7 @@
 	    next;
 	}
 	$0 = "[fetcher] requesting $url";
-	print(STDERR "Retrieving $url\n")
-	    unless ($quiet > 1);
+	info("Retrieving $url");
 	my $resp = $ua->get($url);
 	if ($check) {
 	    $0 = "[fetcher] checking $url";
@@ -106,19 +122,39 @@
 		$s->write("ban $url\n");
 		$url = $resp->header('Location') ||
 		    $resp->header('Content-Location');
+		info("Redirected to $url");
 		$s->write("add $url\n")
 		    if $url;
 	    } elsif ($resp->is_success()) {
 		if ($resp->header('Content-Type') =~ m/^text\//) {
-		    my %urls = map { $_ => 1 } ($resp->content =~ m/$url_re/g);
+		    my %urls = map({ $_ => 1 } ($resp->content =~ m/$url_re/g));
 		    foreach (keys(%urls)) {
 			$s->write("add $_\n");
 		    }
 		}
+		if ($save && $url !~ m/\/$/) {
+		    my $uri = URI->new($url);
+		    my $path = "." . $uri->path;
+		    my $dir = dirname($path);
+		    eval {
+			mkpath($dir);
+			if (open(my $fh, ">", $path)) {
+			    info("Saving $url to $path");
+			    print($fh $resp->content);
+			    close($fh);
+			} else {
+			    die("open($path): $!\n");
+			}
+		    };
+		    if ($@) {
+			print(STDERR $@);
+		    }
+		}
 	    } elsif ($resp->is_error()) {
-		# XXX should we ban these?
+		error("Failed to retrieve $url");
+		$s->write("ban $url\n");
 	    } else {
-		print(STDERR "Unsupported response type:",
+		error("Unsupported response type:",
 		    $resp->status_line(), "\n");
 	    }
 	}
@@ -148,6 +184,7 @@
 	unless (@TODO);
     my $url = shift(@TODO);
     $$child{'url'} = $url;
+    $$child{'uri'} = $URI{$url};
     $$child{'fh'}->write("$url\n");
     ++$BUSY;
 }
@@ -161,7 +198,7 @@
     my $uri = URI->new_abs($1, $$child{'url'});
     $url = $uri->canonical;
     $BANNED{$url} = 1;
-    delete $URLS{$url};
+    delete $URI{$url};
     print(STDERR "Banned $url\n")
 	unless ($quiet > 2);
 }
@@ -174,15 +211,23 @@
 	unless $$child{'url'};
     my $uri = URI->new_abs($1, $$child{'url'});
     $url = $uri->canonical;
-    # XXX should cache child URI to avoid new() here
-    if ($BANNED{$url} || $uri->scheme() ne 'http' ||
-	$uri->host_port() ne URI->new($$child{'url'})->host_port()) {
-	print(STDERR "Rejected $url\n")
-	    unless ($quiet > 0);
+    if ($BANNED{$url}) {
+	info("Rejected (banned): $url");
 	return;
     }
-    return if $URLS{$url};
-    $URLS{$url} = 1;
+    if ($uri->scheme() ne 'http') {
+	info("Rejected (not HTTP): $url");
+	return;
+    }
+    if ($uri->host_port() ne $$child{'uri'}->host_port()) {
+	info("Rejected (other server): $url");
+    }
+    if (index($url, $$child{'url'}) != 0) {
+	info("Rejected (upstream): $url");
+	return;
+    }
+    return if $URI{$url};
+    $URI{$url} = $uri;
     push(@TODO, $url);
 }
 
@@ -197,6 +242,7 @@
 	my $line = $1;
 	if ($line eq "ready") {
 	    $$child{'url'} = '';
+	    $$child{'uri'} = undef;
 	    --$BUSY;
 	    ++$DONE;
 	    $mux->endloop();
@@ -250,7 +296,10 @@
 
     # prepare work queue
     foreach my $url (@urls) {
-	push(@TODO, URI->new($url)->canonical);
+	my $uri = URI->new($url);
+	$url = $uri->canonical;
+	$URI{$url} = $uri;
+	push(@TODO, $url);
     }
 
     $DONE = 0;
@@ -260,7 +309,7 @@
 		unless $$child{'url'};
 	}
 	printf(STDERR " %d/%d \r",
-	       int(keys(%URLS)) - @TODO, int(keys(%URLS)))
+	       int(keys(%URI)) - @TODO, int(keys(%URI)))
 	    unless ($quiet > 3);
 	last unless $BUSY;
 	$mux->loop();
@@ -279,7 +328,7 @@
 sub fetch_random() {
 
     my $t0 = [gettimeofday()];
-    my @urls = keys(%URLS);
+    my @urls = keys(%URI);
     @TODO = @urls;
     $DONE = 0;
 
@@ -304,7 +353,7 @@
 
     my $t0 = [gettimeofday()];
     for (;;) {
-	@TODO = keys(%URLS);
+	@TODO = keys(%URI);
 	$DONE = 0;
 
 	while (@TODO) {
@@ -312,13 +361,13 @@
 		$child->send_url()
 		    unless $$child{'url'};
 	    }
-	    printf(STDERR " %d/%d \r", $DONE, int(keys(%URLS)))
+	    printf(STDERR " %d/%d \r", $DONE, int(keys(%URI)))
 		unless ($quiet > 3);
 	    last unless $BUSY;
 	    $mux->loop();
 	}
 	my $t1 = [gettimeofday()];
-	summarize(int(keys(%URLS)), $t0, $t1);
+	summarize(int(keys(%URI)), $t0, $t1);
 	$t0 = $t1;
     }
 }
@@ -331,7 +380,7 @@
     my $t0 = [gettimeofday()];
     harvest(@urls);
     my $t1 = [gettimeofday()];
-    summarize(int(keys(%URLS)), $t0, $t1);
+    summarize(int(keys(%URI)), $t0, $t1);
 
     foreach my $child (values(%CHILD)) {
 	$child->send("no check");
@@ -357,7 +406,8 @@
 	       "d|delay=i" => \$delay,
 	       "j|jobs=i" => \$jobs,
 	       "q|quiet+" => \$quiet,
-	       "r|random!" => \$random)
+	       "r|random!" => \$random,
+	       "s|save-files!" => \$save)
 	or usage();
     $jobs > 0
 	or usage();



More information about the varnish-commit mailing list