Hi foex,
I just recognized that "<p><b> Text </b></p>" seems
to be not parsed correctly, while "<p> <b> Text </b> </p>" is.
More precisely, the start event for <p> and the end event for </b>
are not generated.
TIA, Chris
-- check.pl ------
#!/usr/bin/perl
# some html pretty printer
use HTML::Parser;
$indent = -1;
$indentstr = " ";
$p = HTML::Parser->new(api_version => 3);
$p->xml_mode(1);
$p->handler(start => \&start_handler, 'tagname,self' );
$p->handler(end => \&end_handler, 'tagname,self' );
sub start_handler {
my $tag = shift;
my $self = shift;
$self->handler(text => sub { $indent ++;
my $text = shift;
my $ind = $indentstr x $indent;
chomp($text);
$text =~ s/^\s*//gs;
$text =~ s/\r?\n/ /gs;
print $ind, "<$tag>","\n",
($text ne "" )?($ind." $text"."\n"):(""); },
"dtext");
}
sub end_handler {
my $tag = shift;
my $self = shift;
$self->handler(text => sub { print $indentstr x $indent , "</$tag>\n";
$indent --;
});
}
$p->parse_file("test.html");
------------------
-- test.html -----
text1
text2
------------------ -- Christian Recktenwald : : citecs GmbH : chris@citecs.de : Unternehmensberatung fuer : voice +49 711 601 2090 : Burgstallstrasse 54 EDV und Telekommunikation : fax +49 711 601 2092 : D-70199 Stuttgart