recursive mirror script

Fabrice GAILLARD (gaillard@ina.fr)
Tue, 30 Apr 96 14:16:44 +0200


Hello,

I use libwww-perl for HTML authoring and find it very usefull.  As I need a  
mirror tool, and that the mirror script is not already recursive, I wrote  
such a script.
I have used part Hans de Graaff checkbot to write a recursive mirror tool  
for the web.  I have named the result mirrorbot.pl

Here is the resulting code.  It's quite a quick and dirty job, but it seems  
to work (slowly).  May be some of you can inprove it.


------------------------------ snap snap -----------------------------------
#!/usr/local/bin/perl -w
#
# mirrorbot - A perl5 script to mirror www sites
#
# Fabrice Gaillard <gaillard@ina.fr> 1996
# bases on Hans de Graaff checkbot <j.j.degraaff@twi.tudelft.nl>, 1994, 1995.
#   see http://dutifp.twi.tudelft.nl:8000/checkbot/
# Based on Dimitri Tischenko, Delft University of Technology, 1994
# Based on the testlinks script by Roy Fielding
#

require 5.002;
use strict;

# Version information
q$Revision: 1.0 $ =~ /Revision: (\d+\.\d+)/;
$main::revision = $1;

# Get command-line arguments
use Getopt::Std;
getopts('hvru:m:x:f:t:');

# Handle arguments, some are mandatory, some have defaults
&print_help if $main::opt_h || !$main::opt_u;
$main::opt_h = 0;
$main::opt_t = 60 unless $main::opt_t;
$main::opt_v = 0 unless $main::opt_v;
$main::opt_r = 0 unless $main::opt_r;
print STDERR "*** Starting in verbose mode\n" if $main::opt_v;

# mirror Directory for output
if ($main::opt_f) {
    $main::mirror = $main::opt_f;
} else {
    $main::mirror = "./mirror";
}

unless(-e $main::mirror) {
  print STDERR "create $main::mirror directory\n" if $main::opt_v;
  mkdir($main::mirror, 0755) || die "unable to create $main::mirror directory\n";
}

# Work on the regular expressions in -m and -x: escape all regular expression
# characters
$main::opt_x =~ s/([\.\/\\\+\?\~\@])/\\$1/g if defined($main::opt_x);
$main::opt_m =~ s/([\.\/\\\+\?\~\@])/\\$1/g if defined($main::opt_m);


# overide HTML::Element::linkElements
# Link elements an the name of the link attribute
%HTML::Element::linkElements =
(
 'base' => 'href',
 'a'    => 'href',
 'img'  => 'src',
 'link' => 'href',
 'body' => 'background',
);


# Prepare the user agent to be used:
use LWP::UserAgent;
use LWP::RobotUA;
use LWP::MediaTypes;
use HTML::Parse;
if($main::opt_r) {
  $main::ua = new LWP::RobotUA "mirrorbot/$main::revision";
} else {
  $main::ua = new LWP::UserAgent;
}
$main::ua->agent("mirrorbot/$main::revision");

#
# should be passed on the argument line
#
#$main::ua->env_proxy; # initialize from environment variables
#$main::ua->proxy(http  => 'http://wolfy.ina.fr');


$URI::URL::StrictSchemes = 0;

# Create the start URL
$main::starturl = new URI::URL $main::opt_u;

if ($main::starturl->scheme eq 'file') {
  $main::starturl->host('localhost');
}
$main::opt_m = $main::starturl->as_string unless $main::opt_m;

#### old stuff below

# hash of already visited url
%main::checked = ();

#list of internal url
@main::internal = ();

# Add the start url, it should point to others
push(@main::internal, $main::starturl->as_string . "|");

&check_internal();

exit 0;


sub check_internal {
  my $line;
  while ($line = shift(@main::internal)) {
    &handle_url($line);
  }
}


sub handle_url {
  my ($line) = @_;
  my ($urlstr, $urlparent) = split(/\|/, $line);
  my $url = new URI::URL $urlstr;
  my $reqtype;
  my $response;
  my $type;

  # Add this URL to the ones we've seen already.
  return if &add_checked($urlstr);

  if ($url->scheme eq 'http' || $url->scheme eq 'file') {
    # Avoid recursing which causes many additional problem links
    # to appear
    return if ($url->path =~ /checkbot\.html$/);
    return if ($url->path =~ /=/);

    # We only want to get HTML files. We can guess the extension,
    # but this will not work for files ending in in a single /
    if ($url->path =~ /\/$/ || $url->path eq "") {
        $type = 'text/html';
    } else {
        $type = guess_media_type($url->path);
    }
    if ($type =~ /html/) {
        $reqtype = 'GET';
    } else {
        $reqtype = 'HEAD';
    }

    # Output what we are going to do with this link
    printf STDERR "    %4s %s (%s)\n", $reqtype, $url, $type
      if $main::opt_v;

#    my $ref_header = new HTTP::Headers 'Referer' => $urlparent;
#    my $request = new HTTP::Request($reqtype, $url, $ref_header);
#    my $response = $main::ua->simple_request($request);

    my $docpath = $url->epath;
    my $newpath = make_new_path($docpath);
    print "$docpath --> $newpath\n";
    my $request = new HTTP::Request('GET', $url);
    #
    # I am not hapy of that as it does twice the job
    #
    my $response = $main::ua->mirror($url, $newpath);
    my $response = $main::ua->simple_request($request);

    if ($response->is_error || $response->is_redirect) {
      print STDERR "        ", $response->code, ' ', $response->message,  
"\n" if $main::opt_v;
      return;
    }

    # If this url moves off of this site then return now
    return unless ($url =~ /$main::opt_m/);
    if (defined($main::opt_x) && $url =~ /$main::opt_x/) {
      print STDERR " Exclude $url\n";
      return;
    }
    return unless ($type =~ /html/);

    &handle_doc($response, $url);

  } else {
    # not interested in other URL's right now
    print STDERR "  Ignore $url\n" if $main::opt_v;
  }
}




sub add_checked {
  my($url) = @_;

  if ($main::checked{$url}) {
    return 1;
  } else {
    $main::checked{$url} = 1;
    return 0;
  }
}



# Parse document, and get the links
sub handle_doc {
  my ($response, $url) = @_;
  my $link;
  my $linkelem;

  my $doc = parse_html($response->content);
  for (@{ $doc->extract_links() }) {
    ($link, $linkelem) = @$_;
    # Ugly hack to cope with fragments for now...
    ($link) = split(/#/, $link) if $link =~ /#/;
    # Get the url from the link.
    my $newurl = new URI::URL $link, $url;

    if ($newurl->abs =~ /$main::opt_m/) {
       # Should check to see if this is a duplicate
       push(@main::internal, $newurl->abs . "|" . $url);
     } else {
       # skipp external link
     }
   }

   $doc->delete;
}




# Make all intermediate directories needed for a file
sub make_new_path {
  my($file) = @_;
  my @dirs = split("/",$file);
  my $name = pop @dirs;
  my $path = $main::mirror;
  my $dir = '';
  foreach $dir (@dirs) {
    $path .= "/$dir";
    # only make if it isn't already there
    unless (-d $path) {
      print "mkdir($path)\n";
      unless (mkdir($path,0755)) {
        if (-f $path) {
          # Hm, a file already exists with that name.
          # Someone must have forgotten a trailing blank
          # on a directory, so let's move the file down
          # one level to index.html
          my $tmp = "$path$$";
          link($path,$tmp);
          unlink($path);
          mkdir($path,0755);
          link($tmp,"$path/index.html");
          unlink($tmp);
          print STDERR "htget: moved $path to $path/index.html\n";
        }
        else { print STDERR "htget: can't create $path -- $!\n";}
      }
    }
  }
  return "$path/$name";
}



sub print_help {
  print "mirrorbot command line options:\n\n";
  print "  -v          Verbose mode: display many messages about progress.\n";
  print "  -r          Robot mode: be nice and chech robot.txt\n";
  print "  -u url      Start URL\n";
  print "  -m match    Check pages only if URL matches `match'\n";
  print "              If no match is given, the start URL is used as a match\n";
  print "  -x exclude  Exclude pages if the URL matches 'exclude'\n";
  print "  -f file     Write results to directory, default is ./mirror\n";
  print "  -t timeout  Timeout for http requests (default 60)\n\n";
  print "Both -m and -x can take a perl regular expression as their argument\n";

  exit 0;
}


---------------------------------------------------------------------------
Fabrice Gaillard,
SysAdmin and OODB researcher                  computer aided cartoon design
Institut National de l'Audiovisuel (INA), 94366 Bry sur Marne Cedex, France
Listen to the best of the Afro-Caribbean Beat :    http://www.ina.fr/Music/
---------------------------------------------------------------------------