bugs, with fixes

Brian Grossman (brian@softhome.net)
Sat, 25 Jan 1997 16:13:52 -0700


Hi, I've been building an application using libwww-5.05, CGI.pm, Linux 2.0.27,
Apache 1.2b4 and FastCGI.  (And I'm new to the list.).  I've come across
a few bugs and wish-features (enclosed).

Bug:

The input to HTML::TreeBuilder (indented intentionally).  Netscape renders
it the way I expect, which is how the markup ended up wrong in the first place:
	<table>
		TEXT_1
		<tr><td><table>
			<!-- note missing tr -->
			<td>TEXT_2</td>
			</tr>
		</table></td></tr>
		TEXT_3
	</table>

comes out of HTML::Element::as_HTML like this, which Netscape dislikes:

	<table>
		TEXT_1
		<tr><td>
		<table>
			<td>TEXT_2
		<table>
		TEXT_3
	</table>

Or something similar; I got confused while writing it.  In English:
the missing <tr> tag caused TreeBuilder (Parser?) to turn a
two-column table with a table embedded in the 2nd column into a
three-column table.  The missing <tr> was in a table embedded in the
table embedded in the 2nd column.  Yuck!


Please-features, with implementations:

# TreeBuilder doesn't define a comment method, and I need to preserve
# contents, so I wrote the following:

# TreeBuilder doesn't define a comment method, so we have to.
#
# comment($text): parser callback for an HTML comment.  Re-wraps $text
# in <!-- ... --> and appends it, as plain text content, to the element
# stored in $self->{'_pos'}, or to $self itself when no position is set.
# An empty comment is dropped and nothing is appended.
package HTML::TreeBuilder;
sub comment {
	my($self, $text) = @_;
	return unless length $text;
	# Insert at the current position if there is one, else at the root.
	my $target = defined($self->{'_pos'}) ? $self->{'_pos'} : $self;
	$target->push_content('<!--' . $text . '-->');
}


# HTML::Element::as_HTML insists on quoting text, and my text is
# already quoted, plus the quoting quotes the comments my
# HTML::TreeBuilder::comment method passes through.
#
# This also pretty-prints HTML.  The motivation was first to put
# the optional tags in, since netscape doesn't always format tables as
# expected when optional tags are left out and second to get the
# generated HTML off the single line, so I could read it.
#
# Pretty printing was a by-product, but I think it works quite well.  It
# preserves white space/lack of white space.
#
# ww is just an abbreviation of my application name.


package HTML::Element;

# Whitespace state shared with the ww_as_HTML callback: true when the
# output may take a newline plus indentation before the next tag without
# changing how the document renders.
my $ww_ws;
# When true, ww_as_HTML entity-encodes <, > and & in text nodes.  Set it
# to 0 when the text in the tree is already encoded.
my $ww_encode_text = 1;

# ww_as_HTML: render this element and its content as pretty-printed HTML.
#
# Unlike as_HTML, end tags are written for every element not listed in
# %emptyElement, and tags are put on their own indented lines (two
# spaces per depth level) wherever the neighbouring text already had
# whitespace, so no whitespace is introduced where there was none.
#
# Content inside <pre>, <xmp> and <listing> is emitted verbatim: no
# newlines or indentation are inserted and no trailing whitespace is
# stripped, because whitespace is significant there.  The callback keeps
# a count of open preformatted elements for this; it relies on traverse
# calling back with a false $start for each such element's end.
#
# Returns the HTML as a single string terminated by a newline.
sub ww_as_HTML {
	my $self = shift;
	my @html = ();
	my %preformatted = (pre => 1, xmp => 1, listing => 1);
	# The starting element may itself already sit inside preformatted text.
	my $outer_pre = $self->is_inside(qw(pre xmp listing)) ? 1 : 0;
	# Number of preformatted elements currently open below $self.
	my $pre_depth = 0;
	$ww_ws = 0;
	$self->traverse(
		sub {
			my($node, $start, $depth) = @_;
			# Sampled before the counter changes, so an opening <pre> is
			# still indented but nothing is inserted before </pre>.
			my $in_pre = $outer_pre || $pre_depth > 0;
			if (ref $node) {
				my $tag = $node->tag;
				push(@html, "\n", "  " x $depth) if $ww_ws && !$in_pre;
				if ($start) {
					push(@html, $node->starttag);
					$pre_depth++ if $preformatted{$tag};
				}
				else {
					push(@html, $node->endtag) unless $emptyElement{$tag};
					$pre_depth-- if $preformatted{$tag} && $pre_depth > 0;
				}
				$ww_ws = 1;
			}
			else {
				# simple text content
				if ($in_pre) {
					# Whitespace is significant here: leave the text alone.
					$ww_ws = 0;
				}
				else {
					# An empty text node still separates two tags.
					$node = ' ' if $node eq '';
					if ($node =~ m/^\s/) {
						# Leading whitespace may grow into a line break
						# plus indentation without changing the rendering.
						push(@html, "\n") if $node !~ m/^\s*$/;
						$node = ("  " x $depth) . $node;
					}
					if ($node =~ m/\s$/) {
						# Trailing whitespace is replaced by the newline
						# written before the next tag.
						$ww_ws = 1;
						$node =~ s/\s+\z//;
					}
					else { $ww_ws = 0 }
				}
				HTML::Entities::encode_entities($node, "<>&")
					if $ww_encode_text;
				push(@html, $node);
			}
			# Always true so that traverse descends into every element.
			1;
		}
	);
	join('', @html, "\n");
}


# HTML::Element::extract_links doesn't handle the implied link that
# exists if there isn't an action= in a <form>.  Action is optional
# in form.  This one-line change makes the link return as undefined
# instead of not at all.  It turns out I didn't need this, but
# somebody probably will, so here it is.

package HTML::Element;
# extract_links(@tags): collect the links found in this element and in
# the elements below it.
#
# @tags optionally restricts the search to the named element types
# (compared in lower case); with no arguments every element that has an
# entry in %linkElements is examined.
#
# Returns a reference to an array of [$value, $element] pairs, one per
# link attribute that is set.  Action in form is optional but is still
# an implied link, so a <form> without action= yields a pair whose
# value is undefined.
sub extract_links {
	my $self = shift;
	my $filtering = scalar(@_);
	my %wanted;
	foreach my $type (@_) {
		$wanted{lc $type} = 1;
	}
	my @found;
	my $visit = sub {
		my($elem, $entering) = @_;
		# Links are collected on the way in only.
		if ($entering) {
			my $tag = $elem->{'_tag'};
			if (!$filtering || $wanted{$tag}) {
				# One attribute name, or a reference to a list of them.
				my $names = $linkElements{$tag};
				if (defined $names) {
					foreach my $name (ref $names ? @$names : ($names)) {
						my $value = $elem->attr($name);
						next unless defined($value) || $tag eq 'form';
						push(@found, [$value, $elem]);
					}
				}
			}
		}
		# Always true so that traverse keeps descending.
		return 1;
	};
	$self->traverse($visit, 'ignoretext');
	return \@found;
}




Brian