recursive mirror script
Fabrice GAILLARD (gaillard@ina.fr)
Tue, 30 Apr 96 14:16:44 +0200
Hello,
I use libwww-perl for HTML authoring and find it very usefull. As I need a
mirror tool, and that the mirror script is not already recursive, I wrote
such a script.
I have used part Hans de Graaff checkbot to write a recursive mirror tool
for the web. I have named the result mirrorbot.pl
Here is the resulting code. It's quite a quick and dirty job, but it seems
to work (slowly). May be some of you can inprove it.
------------------------------ snap snap -----------------------------------
#!/usr/local/bin/perl -w
#
# mirrorbot - A perl5 script to mirror www sites
#
# Fabrice Gaillard <gaillard@ina.fr> 1996
# bases on Hans de Graaff checkbot <j.j.degraaff@twi.tudelft.nl>, 1994, 1995.
# see http://dutifp.twi.tudelft.nl:8000/checkbot/
# Based on Dimitri Tischenko, Delft University of Technology, 1994
# Based on the testlinks script by Roy Fielding
#
require 5.002;
use strict;
# Version information
q$Revision: 1.0 $ =~ /Revision: (\d+\.\d+)/;
$main::revision = $1;
# Get command-line arguments
use Getopt::Std;
getopts('hvru:m:x:f:t:');
# Handle arguments, some are mandatory, some have defaults
&print_help if $main::opt_h || !$main::opt_u;
$main::opt_h = 0;
$main::opt_t = 60 unless $main::opt_t;
$main::opt_v = 0 unless $main::opt_v;
$main::opt_r = 0 unless $main::opt_r;
print STDERR "*** Starting in verbose mode\n" if $main::opt_v;
# mirror Directory for output
if ($main::opt_f) {
$main::mirror = $main::opt_f;
} else {
$main::mirror = "./mirror";
}
unless(-e $main::mirror) {
print STDERR "create $main::mirror directory\n" if $main::opt_v;
mkdir($main::mirror, 0755) || die "unable to create $main::mirror directory\n";
}
# Work on the regular expressions in -m and -x: escape all regular expression
# characters
$main::opt_x =~ s/([\.\/\\\+\?\~\@])/\\$1/g if defined($main::opt_x);
$main::opt_m =~ s/([\.\/\\\+\?\~\@])/\\$1/g if defined($main::opt_m);
# overide HTML::Element::linkElements
# Link elements an the name of the link attribute
%HTML::Element::linkElements =
(
'base' => 'href',
'a' => 'href',
'img' => 'src',
'link' => 'href',
'body' => 'background',
);
# Prepare the user agent to be used:
use LWP::UserAgent;
use LWP::RobotUA;
use LWP::MediaTypes;
use HTML::Parse;
if($main::opt_r) {
$main::ua = new LWP::RobotUA "mirrorbot/$main::revision";
} else {
$main::ua = new LWP::UserAgent;
}
$main::ua->agent("mirrorbot/$main::revision");
#
# should be passed on the argument line
#
#$main::ua->env_proxy; # initialize from environment variables
#$main::ua->proxy(http => 'http://wolfy.ina.fr');
$URI::URL::StrictSchemes = 0;
# Create the start URL
$main::starturl = new URI::URL $main::opt_u;
if ($main::starturl->scheme eq 'file') {
$main::starturl->host('localhost');
}
$main::opt_m = $main::starturl->as_string unless $main::opt_m;
#### old stuff below
# hash of already visited url
%main::checked = ();
#list of internal url
@main::internal = ();
# Add the start url, it should point to others
push(@main::internal, $main::starturl->as_string . "|");
&check_internal();
exit 0;
sub check_internal {
my $line;
while ($line = shift(@main::internal)) {
&handle_url($line);
}
}
sub handle_url {
my ($line) = @_;
my ($urlstr, $urlparent) = split(/\|/, $line);
my $url = new URI::URL $urlstr;
my $reqtype;
my $response;
my $type;
# Add this URL to the ones we've seen already.
return if &add_checked($urlstr);
if ($url->scheme eq 'http' || $url->scheme eq 'file') {
# Avoid recursing which causes many additional problem links
# to appear
return if ($url->path =~ /checkbot\.html$/);
return if ($url->path =~ /=/);
# We only want to get HTML files. We can guess the extension,
# but this will not work for files ending in in a single /
if ($url->path =~ /\/$/ || $url->path eq "") {
$type = 'text/html';
} else {
$type = guess_media_type($url->path);
}
if ($type =~ /html/) {
$reqtype = 'GET';
} else {
$reqtype = 'HEAD';
}
# Output what we are going to do with this link
printf STDERR " %4s %s (%s)\n", $reqtype, $url, $type
if $main::opt_v;
# my $ref_header = new HTTP::Headers 'Referer' => $urlparent;
# my $request = new HTTP::Request($reqtype, $url, $ref_header);
# my $response = $main::ua->simple_request($request);
my $docpath = $url->epath;
my $newpath = make_new_path($docpath);
print "$docpath --> $newpath\n";
my $request = new HTTP::Request('GET', $url);
#
# I am not hapy of that as it does twice the job
#
my $response = $main::ua->mirror($url, $newpath);
my $response = $main::ua->simple_request($request);
if ($response->is_error || $response->is_redirect) {
print STDERR " ", $response->code, ' ', $response->message,
"\n" if $main::opt_v;
return;
}
# If this url moves off of this site then return now
return unless ($url =~ /$main::opt_m/);
if (defined($main::opt_x) && $url =~ /$main::opt_x/) {
print STDERR " Exclude $url\n";
return;
}
return unless ($type =~ /html/);
&handle_doc($response, $url);
} else {
# not interested in other URL's right now
print STDERR " Ignore $url\n" if $main::opt_v;
}
}
sub add_checked {
my($url) = @_;
if ($main::checked{$url}) {
return 1;
} else {
$main::checked{$url} = 1;
return 0;
}
}
# Parse document, and get the links
sub handle_doc {
my ($response, $url) = @_;
my $link;
my $linkelem;
my $doc = parse_html($response->content);
for (@{ $doc->extract_links() }) {
($link, $linkelem) = @$_;
# Ugly hack to cope with fragments for now...
($link) = split(/#/, $link) if $link =~ /#/;
# Get the url from the link.
my $newurl = new URI::URL $link, $url;
if ($newurl->abs =~ /$main::opt_m/) {
# Should check to see if this is a duplicate
push(@main::internal, $newurl->abs . "|" . $url);
} else {
# skipp external link
}
}
$doc->delete;
}
# Make all intermediate directories needed for a file
sub make_new_path {
my($file) = @_;
my @dirs = split("/",$file);
my $name = pop @dirs;
my $path = $main::mirror;
my $dir = '';
foreach $dir (@dirs) {
$path .= "/$dir";
# only make if it isn't already there
unless (-d $path) {
print "mkdir($path)\n";
unless (mkdir($path,0755)) {
if (-f $path) {
# Hm, a file already exists with that name.
# Someone must have forgotten a trailing blank
# on a directory, so let's move the file down
# one level to index.html
my $tmp = "$path$$";
link($path,$tmp);
unlink($path);
mkdir($path,0755);
link($tmp,"$path/index.html");
unlink($tmp);
print STDERR "htget: moved $path to $path/index.html\n";
}
else { print STDERR "htget: can't create $path -- $!\n";}
}
}
}
return "$path/$name";
}
sub print_help {
print "mirrorbot command line options:\n\n";
print " -v Verbose mode: display many messages about progress.\n";
print " -r Robot mode: be nice and chech robot.txt\n";
print " -u url Start URL\n";
print " -m match Check pages only if URL matches `match'\n";
print " If no match is given, the start URL is used as a match\n";
print " -x exclude Exclude pages if the URL matches 'exclude'\n";
print " -f file Write results to directory, default is ./mirror\n";
print " -t timeout Timeout for http requests (default 60)\n\n";
print "Both -m and -x can take a perl regular expression as their argument\n";
exit 0;
}
---------------------------------------------------------------------------
Fabrice Gaillard,
SysAdmin and OODB researcher computer aided cartoon design
Institut National de l'Audiovisuel (INA), 94366 Bry sur Marne Cedex, France
Listen to the best of the Afro-Caribbean Beat : http://www.ina.fr/Music/
---------------------------------------------------------------------------