Re: A better mouse-trap ... er lwp-rget
Gisle Aas (gisle@aas.no)
11 Mar 1998 22:37:32 +0100
"Larry W. Virden, x2487" <lvirden@cas.org> writes:
> Hi! I'm in search of a fully functional WWW subtree mirror package.
> lwp-rget seems really useful - however, it doesn't maintain the
> tree structure, meaning that things really get mucked up and have to
> be shuffled all about (and of course if the same file - say index.html
> is in various directories, who knows what happens?)
The second one is renamed "index001.html", the third "index002.html",
etc.
> So, I have to believe that others are dealing with this - fetching
> subtrees for mirroring purposes, whatever.
>
> Can someone point me along to an expanded version of rget? Seems
> like a merge perhaps with lwp-mirror (if that does the date comparisons
> I expect that it does) would be another feature one would expect.
My unreleased version of lwp-rget has patches by David D. Kilzer
<ddkilzer@madison.dseg.ti.com> that makes it honour the original
hierarchy (included below). You might also have luck with other
packages like w3mir or GNU wget.
Regards,
Gisle
Index: lwp-rget.PL
===================================================================
RCS file: /home/cvs/aas/perl/mods/libwww-perl/bin/lwp-rget.PL,v
retrieving revision 1.12
retrieving revision 1.13
diff -u -u -r1.12 -r1.13
--- lwp-rget.PL 1997/12/03 21:21:01 1.12
+++ lwp-rget.PL 1998/03/04 13:55:33 1.13
@@ -22,7 +22,8 @@
=head1 SYNOPSIS
- lwp-rget [--verbose] [--depth=N] [--limit=N] [--prefix=URL] <URL>
+ lwp-rget [--verbose] [--auth=USER:PASS] [--depth=N] [--hier]
+ [--limit=N] [--prefix=URL] <URL>
lwp-rget --version
=head1 DESCRIPTION
@@ -44,6 +45,13 @@
=over 3
+=item --auth=USER:PASS<n>
+
+Set the authentication credentials to user "USER" and password "PASS" if
+any restricted parts of the web site are hit. If there are restricted
+parts of the web site and authentication credentials are not available,
+those pages will not be downloaded.
+
=item --depth=I<n>
Limit the recursive level. Embedded images are always loaded, even if
@@ -53,6 +61,11 @@
The default depth is 5.
+=item --hier
+
+Download files into a hierarchy that mimics the web site structure.
+The default is to put all files in the current directory.
+
=item --limit=I<n>
Limit the number of documents to get. The default limit is 50.
@@ -110,18 +123,18 @@
use strict;
-use Getopt::Long;
-use URI::URL;
+use Getopt::Long qw(GetOptions);
+use URI::URL qw(url);
use LWP::MediaTypes qw(media_suffix);
use vars qw($VERSION);
-use vars qw($MAX_DEPTH $MAX_DOCS $PREFIX $VERBOSE $QUIET $SLEEP);
+use vars qw($MAX_DEPTH $MAX_DOCS $PREFIX $VERBOSE $QUIET $SLEEP $HIER $AUTH);
my $progname = $0;
$progname =~ s|.*/||; # only basename left
$progname =~ s/\.\w*$//; #strip extension if any
-$VERSION = sprintf("%d.%02d", q$Revision: 1.12 $ =~ /(\d+)\.(\d+)/);
+$VERSION = sprintf("%d.%02d", q$Revision: 1.13 $ =~ /(\d+)\.(\d+)/);
#$Getopt::Long::debug = 1;
#$Getopt::Long::ignorecase = 0;
@@ -138,6 +151,8 @@
'quiet!' => \$QUIET,
'sleep=i' => \$SLEEP,
'prefix:s' => \$PREFIX,
+ 'hier' => \$HIER,
+ 'auth=s' => \$AUTH,
) || usage();
sub print_version {
@@ -146,7 +161,7 @@
print <<"EOT";
This is lwp-rget version $VERSION ($DISTNAME)
-Copyright 1996, Gisle Aas.
+Copyright 1996-1998, Gisle Aas.
This program is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.
@@ -240,7 +255,9 @@
# Fetch document
$no_docs++;
sleep($SLEEP) if $SLEEP;
- my $res = $ua->request(HTTP::Request->new(GET => $url));
+ my $req = HTTP::Request->new(GET => $url);
+ $req->authorization_basic(split (/:/, $AUTH)) if (defined $AUTH);
+ my $res = $ua->request($req);
# Check outcome
if ($res->is_success) {
@@ -259,24 +276,51 @@
save($name, $doc);
my $base = $res->base;
# Follow and substitute links...
- $doc =~ s/(<\s*(img|a|body)\b[^>]+\b(?:src|href|background)\s*=\s*)(["']?)([^>\s]+)\3/new_link($1, lc($2), $3, $4, $base, $depth+1)/gie; #"; # help emacs
+ $doc =~ s/(<\s*(img|a|body|frame)\b[^>]+\b(?:src|href|background)\s*=\s*)(["']?)([^>\s]+)\3/new_link($1, lc($2), $3, $4, $base, $name, $depth+1)/gie; #"; # help emacs
}
save($name, $doc);
- return $name;
+ return $name;
} else {
print STDERR $res->code . " " . $res->message . "\n" if $VERBOSE;
- $seen{$plain_url->as_string} = "*BAD*";
- return "*BAD*";
+ $seen{$plain_url->as_string} = $url->as_string;
+ $url->as_string;
}
}
sub new_link
{
- my($pre, $type, $quote, $url, $base, $depth) = @_;
- $url = url($url, $base)->abs;
- $pre . $quote . fetch($url, $type, $depth) . $quote;
+ my($pre, $type, $quote, $url, $base, $localbase, $depth) = @_;
+
+ $url = fetch(url($url, $base)->abs, $type, $depth);
+ $url = url("file:$url", "file:$localbase")->rel
+ unless $url =~ /^[.+\-\w]+:/;
+
+ return $pre . $quote . $url . $quote;
+}
+
+
+sub mkdirp
+{
+ my($directory, $mode) = @_;
+ my @dirs = split(/\//, $directory);
+ my $path = shift(@dirs); # build it as we go
+ my $result = 1; # assume it will work
+
+ unless (-d $path) {
+ $result &&= mkdir($path, $mode);
+ }
+
+ foreach (@dirs) {
+ $path .= "/$_";
+ if ( ! -d $path) {
+ $result &&= mkdir($path, $mode);
+ }
+ }
+
+ return $result;
}
+
sub find_name
{
my($url, $type) = @_;
@@ -286,16 +330,22 @@
my $path = $url->path;
# trim path until only the basename is left
- $path =~ s|.*/||;
- $path =~ s|\..*||;
- $path = "index" unless length($path);
+ $path =~ s|(.*/)||;
+ my $dirname = ".$1";
+ if (!$HIER) {
+ $dirname = "";
+ } elsif (! -d $dirname) {
+ mkdirp($dirname, 0775);
+ }
+ $path =~ s|\..*||; # trim suffix
+ $path = "index" unless length $path;
my $extra = ""; # something to make the name unique
my $suffix = media_suffix($type);
while (1) {
# Construct a new file name
- my $file = $path . $extra;
+ my $file = $dirname . $path . $extra;
$file .= ".$suffix" if $suffix;
# Check if it is unique
return $file unless -f $file;
@@ -320,12 +370,15 @@
close(FILE);
}
+
sub usage
{
die <<"";
Usage: $progname [options] <URL>
Allowed options are:
+ --auth=USER:PASS Set authentication credentials for web site
--depth=N Maximum depth to traverse (default is $MAX_DEPTH)
+ --hier Download into hierarchy (not all files into cwd)
--limit=N A limit on the number documents to get (default is $MAX_DOCS)
--version Print version number and quit
--verbose More output