[Q] Extracting HTML links from a document

Mike Grommet (mgrommet@insolwwb.net)
Tue, 16 Dec 1997 00:48:26 -0600


Hi guys...
what is some suggested code for extracting
HTML links (not all anchors, just htm and html docs)
from a page?

I would like to basically create a list of all html file links
on our entire site, recursively searching each document and making a list of
unique urls (using an associative array).  unfortunately, I keep getting all
kinds of links including wav files and such...

I am getting frustrated... this code almost works ;) so I now turn
it loose and see what you guys think of it.  Maybe I'm  not thinking about
this in quite the right manner.
I wish I had more examples using libwww to piddle with... thanks for your
help guys

--------------------- code follows --------------------------------

use HTML::LinkExtor;
use LWP::UserAgent;
use URI::URL;
use HTML::Parse;
use LWP::Simple;

require 5.002;

#setup the useragent
  $ua = new LWP::UserAgent;
  $ua->agent("$0/0.1 " . $ua->agent);
  $ua->agent("Digger v1.0");
#setup the parser
  $p = HTML::LinkExtor->new(\&picklinks);
@links = ();   # initially set links to none
%linkcheck = ();    # the list of urls we have seen so far
sub picklinks
{
  my $base = $res->base;   #get the base of the page we are currentlyparsing
  my($tag, %attr) = @_;
  return if $tag ne 'a';   # only want anchors
  for my $url (values(%attr))
    {
     print "Checking URL: $url\n";
     $url = url($url,$base)->abs; #absolutize it
#     next unless $url =~ /^\Q$origbase/o;  #
     next unless $linkcheck{$url} == 0;  # have we never seen it before?
     push(@links, $url);  # yes, push it
     $linkcheck{$url} = 1;  # and mark it
     print "ADDED: $url at base $base\n";
    }
}

#get the url off command line
$s = $ARGV[0];
#setup the request for the page
  $req = new HTTP::Request 'GET' => $s;
  $req->header('Accept' => 'text/html');
# send request
  $res = $ua->request($req);

# check the outcome
  if (!($res->is_success))
  {
     print "Error: " . $res->code . " " . $res->message;
  }
#otherwise parse the document
$origbase = $res->base;    # the base of the url specified on the cmd line
$p->parse($res->content);  # parse the origional address
while ($checkurl = pop(@links))  # continue until we run out of links to
                                 # check
{
  print "ON FILE $checkurl\n----------\n";
  $req = new HTTP::Request 'GET' => $checkurl;
  $req->header('Accept' => 'text/html');

# send request
  $res = $ua->request($req);
# check the outcome
  if (!($res->is_success))
  {
     print "Error: " . $res->code . " " . $res->message . "\n";
  }
  else
  { $p->parse($res->content);
  }
}

@allsiteurls = values(%linkcheck);