Re: A better mouse-trap ... er lwp-rget

Gisle Aas (gisle@aas.no)
11 Mar 1998 22:37:32 +0100


"Larry W. Virden, x2487" <lvirden@cas.org> writes:

> Hi!  I'm in search of a fully functional WWW subtree mirror package.
> lwp-rget seems really useful - however, it doesn't maintain the
> tree structure, meaning that things really get mucked up and have to
> be shuffled all about (and of course if the same file - say index.html
> is in various directories, who knows what happens?)

The second one is renamed "index001.html", the third "index002.html",
etc.

> So, I have to believe that others are dealing with this - fetching
> subtrees for mirroring purposes, whatever.
> 
> Can someone point me along to an expanded version of rget?  Seems
> like a merge perhaps with lwp-mirror (if that does the date comparisons
> I expect that it does) would be another feature one would expect.

My unreleased version of lwp-rget has patches by David D. Kilzer
<ddkilzer@madison.dseg.ti.com> that makes it honour the original
hierarchy (included below).  You might also have luck with other
packages like w3mir or GNU wget.

Regards,
Gisle


Index: lwp-rget.PL
===================================================================
RCS file: /home/cvs/aas/perl/mods/libwww-perl/bin/lwp-rget.PL,v
retrieving revision 1.12
retrieving revision 1.13
diff -u -u -r1.12 -r1.13
--- lwp-rget.PL	1997/12/03 21:21:01	1.12
+++ lwp-rget.PL	1998/03/04 13:55:33	1.13
@@ -22,7 +22,8 @@
 
 =head1 SYNOPSIS
 
- lwp-rget [--verbose] [--depth=N] [--limit=N] [--prefix=URL] <URL>
+ lwp-rget [--verbose] [--auth=USER:PASS] [--depth=N] [--hier]
+          [--limit=N] [--prefix=URL] <URL>
  lwp-rget --version
 
 =head1 DESCRIPTION
@@ -44,6 +45,13 @@
 
 =over 3
 
+=item --auth=USER:PASS<n>
+
+Set the authentication credentials to user "USER" and password "PASS" if
+any restricted parts of the web site are hit.  If there are restricted
+parts of the web site and authentication credentials are not available,
+those pages will not be downloaded.
+
 =item --depth=I<n>
 
 Limit the recursive level. Embedded images are always loaded, even if
@@ -53,6 +61,11 @@
 
 The default depth is 5.
 
+=item --hier
+
+Download files into a hierarchy that mimics the web site structure.
+The default is to put all files in the current directory.
+
 =item --limit=I<n>
 
 Limit the number of documents to get.  The default limit is 50.
@@ -110,18 +123,18 @@
 
 use strict;
 
-use Getopt::Long;
-use URI::URL;
+use Getopt::Long    qw(GetOptions);
+use URI::URL        qw(url);
 use LWP::MediaTypes qw(media_suffix);
 
 use vars qw($VERSION);
-use vars qw($MAX_DEPTH $MAX_DOCS $PREFIX $VERBOSE $QUIET $SLEEP);
+use vars qw($MAX_DEPTH $MAX_DOCS $PREFIX $VERBOSE $QUIET $SLEEP $HIER $AUTH);
 
 my $progname = $0;
 $progname =~ s|.*/||;  # only basename left
 $progname =~ s/\.\w*$//; #strip extension if any
 
-$VERSION = sprintf("%d.%02d", q$Revision: 1.12 $ =~ /(\d+)\.(\d+)/);
+$VERSION = sprintf("%d.%02d", q$Revision: 1.13 $ =~ /(\d+)\.(\d+)/);
 
 #$Getopt::Long::debug = 1;
 #$Getopt::Long::ignorecase = 0;
@@ -138,6 +151,8 @@
            'quiet!'   => \$QUIET,
 	   'sleep=i'  => \$SLEEP,
 	   'prefix:s' => \$PREFIX,
+	   'hier'     => \$HIER,
+	   'auth=s'   => \$AUTH,
 	  ) || usage();
 
 sub print_version {
@@ -146,7 +161,7 @@
     print <<"EOT";
 This is lwp-rget version $VERSION ($DISTNAME)
 
-Copyright 1996, Gisle Aas.
+Copyright 1996-1998, Gisle Aas.
 
 This program is free software; you can redistribute it and/or
 modify it under the same terms as Perl itself.
@@ -240,7 +255,9 @@
     # Fetch document 
     $no_docs++;
     sleep($SLEEP) if $SLEEP;
-    my $res = $ua->request(HTTP::Request->new(GET => $url));
+    my $req = HTTP::Request->new(GET => $url);
+    $req->authorization_basic(split (/:/, $AUTH)) if (defined $AUTH);
+    my $res = $ua->request($req);
 
     # Check outcome
     if ($res->is_success) {
@@ -259,24 +276,51 @@
 	    save($name, $doc);
 	    my $base = $res->base;
 	    # Follow and substitute links...
-	    $doc =~ s/(<\s*(img|a|body)\b[^>]+\b(?:src|href|background)\s*=\s*)(["']?)([^>\s]+)\3/new_link($1, lc($2), $3, $4, $base, $depth+1)/gie;       #"; # help emacs
+	    $doc =~ s/(<\s*(img|a|body|frame)\b[^>]+\b(?:src|href|background)\s*=\s*)(["']?)([^>\s]+)\3/new_link($1, lc($2), $3, $4, $base, $name, $depth+1)/gie;       #"; # help emacs
 	}
 	save($name, $doc);
-	return $name;					  
+	return $name;
     } else {
 	print STDERR $res->code . " " . $res->message . "\n" if $VERBOSE;
-	$seen{$plain_url->as_string} = "*BAD*";
-	return "*BAD*";
+	$seen{$plain_url->as_string} = $url->as_string;
+        $url->as_string;
     }
 }
 
 sub new_link
 {
-    my($pre, $type, $quote, $url, $base, $depth) = @_;
-    $url = url($url, $base)->abs;
-    $pre . $quote . fetch($url, $type, $depth) . $quote;
+    my($pre, $type, $quote, $url, $base, $localbase, $depth) = @_;
+
+    $url = fetch(url($url, $base)->abs, $type, $depth);
+    $url = url("file:$url", "file:$localbase")->rel
+        unless $url =~ /^[.+\-\w]+:/;
+
+    return $pre . $quote . $url . $quote;
+}
+
+
+sub mkdirp
+{
+    my($directory, $mode) = @_;
+    my @dirs = split(/\//, $directory);
+    my $path = shift(@dirs);   # build it as we go
+    my $result = 1;   # assume it will work
+
+    unless (-d $path) {
+        $result &&= mkdir($path, $mode);
+    }
+
+    foreach (@dirs) {
+        $path .= "/$_";
+        if ( ! -d $path) {
+            $result &&= mkdir($path, $mode);
+        }
+    }
+
+    return $result;
 }
 
+
 sub find_name
 {
     my($url, $type) = @_;
@@ -286,16 +330,22 @@
     my $path = $url->path;
 
     # trim path until only the basename is left
-    $path =~ s|.*/||;
-    $path =~ s|\..*||;
-    $path = "index" unless length($path);
+    $path =~ s|(.*/)||;
+    my $dirname = ".$1";
+    if (!$HIER) {
+	$dirname = "";
+    } elsif (! -d $dirname) {
+        mkdirp($dirname, 0775);
+    }
+    $path =~ s|\..*||;  # trim suffix
+    $path = "index" unless length $path;
 
     my $extra = "";  # something to make the name unique
     my $suffix = media_suffix($type);
 
     while (1) {
 	# Construct a new file name
-	my $file = $path . $extra;
+	my $file = $dirname . $path . $extra;
 	$file .= ".$suffix" if $suffix;
 	# Check if it is unique
 	return $file unless -f $file;
@@ -320,12 +370,15 @@
     close(FILE);
 }
 
+
 sub usage
 {
     die <<"";
 Usage: $progname [options] <URL>
 Allowed options are:
+  --auth=USER:PASS  Set authentication credentials for web site
   --depth=N         Maximum depth to traverse (default is $MAX_DEPTH)
+  --hier            Download into hierarchy (not all files into cwd)
   --limit=N         A limit on the number documents to get (default is $MAX_DOCS)
   --version         Print version number and quit
   --verbose         More output