Patch to libwww 5.05 parse error

Jørgen Nørgaard (jnp@www.ifs.dk)
Sat, 25 Jan 97 16:12:55 +0100


libwww version 5.05 (and possibly earlier versions) is too restrictive when
parsing HTML containing tables:

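If a table contains a <P> within, e.g., a <TD> element, the table is ended
and the <P>-tagged text is placed after the table. This may or may not be
according to the standard, but it is often used and is accepted by some
otherwise strict editors (PageMill, HoTMetaL). The attached patch makes libwww
accept such usage: it adds <TD> and <TH> to the tags (previously only <LI>) at
which the search for an enclosing element to close implicitly stops, so a <P>
inside a table cell no longer closes anything outside that cell.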


Regards,
/jørgen nørgaard           | e-mail: jnp@ifs.dk
Innova Financial Solutions | Phone: +45 3917 9797
Symbion Science Park       | Fax: +45 3927 5521
Copenhagen	           | Denmark


*** TreeBuilder.pm-ORIG	Sat Jan  4 16:17:15 1997
--- TreeBuilder.pm	Sat Jan 25 15:42:18 1997
***************
*** 169,193 ****
      my $e = HTML::Element->new($tag, %$attr);
  
      if (!$self->{'_implicit_tags'}) {
  	# do nothing
      } elsif ($isBodyElement{$tag}) {
- 
  	# Ensure that we are within <body>
  	if ($pos->is_inside('head')) {
  	    $self->end('head');
  	    $pos = $self->insert_element('body', 1);
  	    $ptag = $pos->tag;
! 	} elsif (!$pos->is_inside('body')) {
  	    $pos = $self->insert_element('body', 1);
  	    $ptag = $pos->tag;
  	}
  
  	# Handle implicit endings and insert based on <tag> and position
  	if ($tag eq 'p' || $tag =~ /^h[1-6]/ || $tag eq 'form') {
  	    # Can't have <p>, <h#> or <form> inside these
! 	    $self->end([qw(p h1 h2 h3 h4 h5 h6 pre textarea)], 'li');
  	} elsif ($tag =~ /^[oud]l$/) {
  	    # Can't have lists inside <h#>
  	    if ($ptag =~ /^h[1-6]/) {
  		$self->end($ptag);
  		$pos = $self->insert_element('p', 1);
--- 169,195 ----
      my $e = HTML::Element->new($tag, %$attr);
  
      if (!$self->{'_implicit_tags'}) {
  	# do nothing
      } elsif ($isBodyElement{$tag}) {
  	# Ensure that we are within <body>
  	if ($pos->is_inside('head')) {
  	    $self->end('head');
  	    $pos = $self->insert_element('body', 1);
  	    $ptag = $pos->tag;
! 	} elsif (!$pos->is_inside('body') ) {
  	    $pos = $self->insert_element('body', 1);
  	    $ptag = $pos->tag;
  	}
  
  	# Handle implicit endings and insert based on <tag> and position
  	if ($tag eq 'p' || $tag =~ /^h[1-6]/ || $tag eq 'form') {
+ #	    warn "?1 $tag, $ptag";
+ #	    printf STDERR "Tag in: %s, Tag: %s, Parent tag: %s\n", $tag, $pos->tag, $pos->{'_parent'}->{'_tag'};
  	    # Can't have <p>, <h#> or <form> inside these
! 	    $self->end([qw(p h1 h2 h3 h4 h5 h6 pre textarea)],
! 		       qw(li td th));
  	} elsif ($tag =~ /^[oud]l$/) {
  	    # Can't have lists inside <h#>
  	    if ($ptag =~ /^h[1-6]/) {
  		$self->end($ptag);
  		$pos = $self->insert_element('p', 1);
***************
*** 277,286 ****
--- 279,289 ----
  	    my $ptag = $p->{'_tag'};
  	    for (@$tag) {
  		last PARENT if $ptag eq $_;
  	    }
  	    for (@stop) {
+ #		warn "STOP ($ptag, $_)!";
  		return if $ptag eq $_;
  	    }
  	    $p = $p->{'_parent'};
  	}
      } else {