Bug in Parser (about comments)
Calle Aasman (md4calle@mdstud.chalmers.se)
Thu, 26 Jun 1997 12:24:04 +0200 (MET DST)
Hi, I don't know if I'm doing something wrong or if it's a bug:
I have made a parser that should do some work whenever it hits specific
tags, but if an HTML document looks like this:
...
this text will be parsed out fine
<!---------------------->
this will be parsed as if it were inside a comment
<!--------------------->
this will work fine
....
Has anyone come across this problem before?
If anyone has any idea I would really appreciate it, because I'm right now
in panic mode since I thought the parser worked fine...
I've included the two files I got:
#!/usr/local/bin/perl
# Driver script: parses tst.html with the ParseXX parser (loaded from
# tagfunctions.pm) and prints everything the tag handlers collected.
require "tagfunctions.pm";

#####################main#####################################
package main;

# Text accumulated per key by lagra(); start with an empty table.
%allinfo = ();
$uselessdict = "ORD/uselesswords.lst";

my $form = ParseXX->new;

# parse_file() returns undef when the file cannot be opened; stop with a
# diagnostic instead of silently printing nothing.
$form->parse_file("tst.html")
    or die "Cannot parse tst.html: $!\n";

# Print the links.
# NOTE(review): nothing in the visible code fills @links -- confirm where
# it is supposed to be populated.
for my $link (@links) {
    print "$link\n";
}

# Dump every collected key together with its accumulated text.
while (my ($key, $value) = each %allinfo) {
    print "$key = $value\n\n";
}
####################subrutiner######################################
# Stores data in the %allinfo hash: appends $info to whatever has already
# been collected under $tag and returns the accumulated string.
sub lagra {
    my ($tag, $info) = @_;
    return $allinfo{$tag} .= $info;
}
This is the file that defines what it should do when it encounters a tag:
package ParseXX;
require HTML::Parser;
use HTML::Entities (); =20
@ISA=3Dqw(HTML::Parser);
# Start-tag callback: dispatches to a handler named "start_<tag>" if one
# exists and silently ignores every other tag.
#
#   $self     - the parser object
#   $tag      - tag name as reported by the parser
#   $attr     - hash ref of attribute name => value
#   $attrseq  - array ref of attribute names in document order
#   $origtext - the original text of the tag
#
# Returns whatever the handler returns, or nothing when no handler exists.
#
# The handler is looked up by name with can() rather than by building Perl
# source and eval'ing it: the tag name comes from the parsed document, so a
# tag such as "h1-foo" or "h1.system" would otherwise be evaluated as a Perl
# expression and could call unintended subs.
sub start {
    my ($self, $tag, $attr, $attrseq, $origtext) = @_;

    # Only plain identifier-like tag names can name a handler.
    return unless defined $tag && $tag =~ /\A\w+\z/;

    my $class   = ref($self) || __PACKAGE__;
    my $handler = $class->can("start_$tag") or return;

    return $handler->($self, $attr, $attrseq, $origtext);
}
# Text callback: decodes HTML entities in $text, collapses whitespace, and
# stores the result under the "text" key via main::lagra().
#
# While a heading is open ($takecare is set by start_header) the text is
# also collected in $tempcare so that end_header can store it as a header.
# The text is appended rather than assigned: heading content can arrive in
# several text events (for example around inline markup such as
# <h1>foo <b>bar</b></h1>), and plain assignment kept only the last piece.
#
#   $self - the parser object
#   $text - raw text between tags
#
# Returns the value returned by main::lagra().
sub text {
    my ($self, $text) = @_;

    # $decoded is left as a package variable, as in the original code.
    $decoded = HTML::Entities::decode($text);
    $decoded =~ s/\s+/ /g;    # remove newlines and whitespace runs in the text

    $tempcare .= $decoded if $takecare;

    return main::lagra("text", $decoded);
}
# End-tag callback: dispatches to a handler named "end_<tag>" if one exists
# and silently ignores every other tag.
#
#   $self - the parser object
#   $tag  - tag name as reported by the parser
#
# Returns whatever the handler returns, or nothing when no handler exists.
#
# The handler is resolved with can() and called with ($self, $tag).  The
# previous string eval invoked the handler as a bareword without arguments
# and evaluated the tag name, which comes from the parsed document, as Perl
# source.
sub end {
    my ($self, $tag) = @_;

    # Only plain identifier-like tag names can name a handler.
    return unless defined $tag && $tag =~ /\A\w+\z/;

    my $class   = ref($self) || __PACKAGE__;
    my $handler = $class->can("end_$tag") or return;

    return $handler->($self, $tag);
}
# Shared start handler for heading tags: raises the $takecare flag, which
# text() checks to decide whether to remember the text it sees.
# Called with ($self, $attr, $attrseq, $origtext); none of them is used.
# Returns 1.
sub start_header {
    return $takecare = 1;
}
# Shared end handler for heading tags: stores the remembered heading text
# (followed by a space) under the "header" key, then clears the heading
# state.  Called with ($self, $tag); neither is used.
# Returns the empty string.
sub end_header {
    main::lagra("header", "$tempcare ") unless $tempcare eq "";

    $takecare = 0;
    # Ugly to have to reset it like this.
    return $tempcare = "";
}
# Start handlers for <h1> .. <h5>: each forwards its arguments
# ($self, $attr, $attrseq, $origtext) to the shared start_header().
sub start_h1 { return start_header(@_); }
sub start_h2 { return start_header(@_); }
sub start_h3 { return start_header(@_); }
sub start_h4 { return start_header(@_); }
sub start_h5 { return start_header(@_); }
# End handlers for <h1> .. <h5>: each forwards its arguments
# ($self, $tag) to the shared end_header().
sub end_h1 { return end_header(@_); }
sub end_h2 { return end_header(@_); }
sub end_h3 { return end_header(@_); }
sub end_h4 { return end_header(@_); }
sub end_h5 { return end_header(@_); }
# Handler for <hr>: appends a newline to the collected "text" entry.
# Returns the value returned by main::lagra().
sub start_hr {
    my @entry = ("text", "\n");
    return main::lagra(@entry);
}
# Handler for <p>: appends a newline to the collected "text" entry.
# Returns the value returned by main::lagra().
sub start_p {
    my @entry = ("text", "\n");
    return main::lagra(@entry);
}
# Handler for <meta>: stores the tag's value under the key given by its
# "name" attribute via main::lagra().
#
#   $self     - the parser object (unused)
#   $attr     - hash ref of attribute name => value
#   $attrseq  - array ref of attribute names in document order
#   $origtext - the original text of the tag (unused)
#
# The value is taken from the "content" attribute when there is one.
# Otherwise it falls back to the last attribute, in document order, whose
# name does not contain "name".  The previous version picked the value by
# iterating the attribute hash with each(), so with more than one non-name
# attribute the result depended on hash ordering; it also overwrote the
# package variables $key and $value.
#
# Returns the value returned by main::lagra().
sub start_meta {
    my ($self, $attr, $attrseq, $origtext) = @_;

    # Document order when available, otherwise a stable sorted order.
    my @names = $attrseq ? @$attrseq : sort keys %$attr;

    my $stored;
    my ($content_key) = grep { lc($_) eq 'content' } @names;
    if (defined $content_key) {
        $stored = $attr->{$content_key};
    }
    else {
        for my $name (@names) {
            next if $name =~ /name/i;
            $stored = $attr->{$name};
        }
    }

    return main::lagra($attr->{'name'}, $stored);
}
# Handler for <script>: records a "+" under the "script" key, once per
# script start tag.  Called with ($self, $attr, $attrseq, $origtext); none
# of them is used.
# Returns the value returned by main::lagra().
sub start_script {
    my @entry = ("script", "+");
    return main::lagra(@entry);
}