[Q] Extracting HTML links from a document
Mike Grommet (mgrommet@insolwwb.net)
Tue, 16 Dec 1997 00:48:26 -0600
Hi guys...
what is some suggested code for extracting
HTML links (not all anchors, just htm and html docs)
from a page?
I would like to basically create a list of all html file links
on our entire site, recursively searching each document and making a list of
unique urls (using an associative array). unfortunately, I keep getting all
kinds of links including wav files and such...
I am getting frustrated... this code almost works ;) so I now turn
it loose and see what you guys think of it. Maybe I'm not thinking about
this in quite the right manner.
I wish I had more examples using libwww to piddle with... thanks for your
help guys
--------------------- code follows --------------------------------
use HTML::LinkExtor;
use LWP::UserAgent;
use URI::URL;
use HTML::Parse;
use LWP::Simple;
require 5.002;
#setup the useragent
$ua = new LWP::UserAgent;
$ua->agent("$0/0.1 " . $ua->agent);
$ua->agent("Digger v1.0");
#setup the parser
$p = HTML::LinkExtor->new(\&picklinks);
@links = (); # initially set links to none
%linkcheck = (); # the list of urls we have seen so far
sub picklinks
{
my $base = $res->base; #get the base of the page we are currentlyparsing
my($tag, %attr) = @_;
return if $tag ne 'a'; # only want anchors
for my $url (values(%attr))
{
print "Checking URL: $url\n";
$url = url($url,$base)->abs; #absolutize it
# next unless $url =~ /^\Q$origbase/o; #
next unless $linkcheck{$url} == 0; # have we never seen it before?
push(@links, $url); # yes, push it
$linkcheck{$url} = 1; # and mark it
print "ADDED: $url at base $base\n";
}
}
#get the url off command line
$s = $ARGV[0];
#setup the request for the page
$req = new HTTP::Request 'GET' => $s;
$req->header('Accept' => 'text/html');
# send request
$res = $ua->request($req);
# check the outcome
if (!($res->is_success))
{
print "Error: " . $res->code . " " . $res->message;
}
#otherwise parse the document
$origbase = $res->base; # the base of the url specified on the cmd line
$p->parse($res->content); # parse the origional address
while ($checkurl = pop(@links)) # continue until we run out of links to
# check
{
print "ON FILE $checkurl\n----------\n";
$req = new HTTP::Request 'GET' => $checkurl;
$req->header('Accept' => 'text/html');
# send request
$res = $ua->request($req);
# check the outcome
if (!($res->is_success))
{
print "Error: " . $res->code . " " . $res->message . "\n";
}
else
{ $p->parse($res->content);
}
}
@allsiteurls = values(%linkcheck);