require 'text_tags.pl';
# Notes
#
# The hardest part of this library was getting the spacing right.
#
#
# - Some tags have a "whitespace block" and some tags don't. For example,
#
.., <[oud]l>..[oud]l>,
,
block. # Excessive returns are allowed inand other html tags are # supported in... # 1. At most, the largest "whitespace block" will be two returns (\n\n) # 2. Multiple whitespace characters will be collapsed into a single space, # unless it is the beginning of a line require 'shellwords.pl' unless(defined &shellwords); # I saw a nasty HTML deviation that shellwords won't parse. it was: # ... no space between " and href package w3bbc_html; %html_tag_ignore = ( 'head',1, 'html',1, 'body',1, ); %html_tags_block = ( 'a', "'html_tag_a", 'b', "'html_tag_em", 'i', "'html_tag_em", 'em', "'html_tag_em", # map to italic 'cite', "'html_tag_em", # map to italic 'strong', "'html_tag_em", # map to bold 'dfn', "'html_tag_em", # map to bold 'h1', "'html_tag_h", 'h2', "'html_tag_h", 'h3', "'html_tag_h", 'h4', "'html_tag_h", 'h5', "'html_tag_h", 'h6', "'html_tag_h", 'pre', "'html_tag_pre", 'title', "'html_tag_title", ); %html_tags_single = ( 'img', "'html_tag_img", 'isindex', "'html_tag_isindex", ); %text_tag_ignore = ( 'head',1, 'html',1, 'body',1, ); %text_tags_block = ( 'a', 'text_tag_a', 'b', 'text_tag_em', 'i', 'text_tag_em', 'em', 'text_tag_em', # map to italic 'cite', 'text_tag_em', # map to italic 'strong', 'text_tag_em', # map to bold 'dfn', 'text_tag_em', # map to bold 'ul', 'text_tag_list', 'ol', 'text_tag_list', 'dl', 'text_tag_list', 'menu', 'text_tag_list', 'dir', 'text_tag_list', 'address', 'text_tag_address', 'h1', 'text_tag_h', 'h2', 'text_tag_h', 'h3', 'text_tag_h', 'h4', 'text_tag_h', 'h5', 'text_tag_h', 'h6', 'text_tag_h', 'pre', 'text_tag_pre', ); %text_tags_single = ( 'img', 'text_tag_img', 'p', 'text_tag_p', 'br', 'text_tag_br', 'hr', 'text_tag_hr', # These should be with/
or
'li', 'text_tag_li', 'dt', 'text_tag_dt', ); # This is a hack, so it will parse deviations from syntactically correct HTML # this was used to stop a unterminated from making the whole doc linked %tags_not_nested = ( 'a', 1, ); %html_ampersand = ( 'lt', '<', 'gt', '>', 'amp', '&', 'quot', '"', 'nbsp', ' ', ); # Interface # reset_tags($class) # Where class is: # text, html, postscript, none # text - reset to text bindings # html - reset to html bindings # postscript - reset to poscript (not yet though) # none - erase all current bindings sub reset_tags { local($class) = @_; if (!$class || $class eq 'text') { %tags_block = %text_tags_block; %tags_single = %text_tags_single; %tags_ignore = %text_tags_ignore; } elsif ($class eq 'html') { %tags_block = %html_tags_block; %tags_single = %html_tags_single; %tags_ignore = %html_tags_ignore; } elsif ($class eq 'postscript') { print STDERR "class $class not yet supported\n"; } elsif ($class eq 'none') { undef %tags_block; undef %tags_single; undef %tags_ignore; } else { print STDERR "class $class unsupported\n"; } #&'dumpvar('w3bbc_html','tags_single','tags_block'); } sub set_tags_class { local($class) = @_; #&'dumpvar('w3bbc_html','tags_single','tags_block'); if ($class eq 'text') { for (keys %text_tags_block) { $tags_block{$_} = $text_tags_block{$_}; } for (keys %text_tags_single) { $tags_single{$_} = $text_tags_single{$_}; } for (keys %text_tags_ignore) { $tags_ignore{$_} = $text_tags_ignore{$_}; } } elsif ($class eq 'html') { for (keys %html_tags_block) { $tags_block{$_} = $html_tags_block{$_}; } for (keys %html_tags_single) { $tags_single{$_} = $html_tags_single{$_}; } for (keys %html_tags_ignore) { $tags_ignore{$_} = $html_tags_ignore{$_}; } } #&'dumpvar('w3bbc_html','tags_single','tags_block'); } sub set_tags { local(*block,*single,$clear) = @_; local($_); if ($clear) { undef %tags_block; undef %tags_single; } for (keys %block) { $tags_block{$_} = $block{$_}; } for (keys %single) { $tags_single{$_} = $single{$_}; } for (keys %ignore) { $tags_ignore{$_} = $ignore{$_}; } } sub newlines { local(*opts,$num,$max) = @_; $max = 2 unless($max); #print STDERR "newlines($num)\n"; #print STDERR "caller(0)=(",join(',',caller(0)),")\n"; #print STDERR "caller(1)=(",join(',',caller(1)),")\n"; return if ($opts{'_newlines'} >= $max); #print STDERR "caller(2)=(",join(',',caller(2)),")\n"; #print STDERR "opts{'_first'}='$opts{'_first'}'\n"; #print STDERR "opts{'_line_len'}='$opts{'_line_len'}'\n"; #print STDERR "opts{'_newlines'}='$opts{'_newlines'}'\n"; local($n,$nl); if ($opts{'_first'} && !$opts{'_line_len'} && !$opts{'_newlines'}) { # Ignore newlines at the beginning if top of file and no leading text } elsif($opts{'_newlines'} < $max) { if (($opts{'_newlines'}+$num) > $max) { $nl = "\n" x ($max - $opts{'_newlines'}); $opts{'_newlines'} = $max; } else { $nl = "\n" x $num; $opts{'_newlines'} += $num; } } $opts{'_first'} = $opts{'_line_len'} = $opts{'_spaces'} = 0; #print STDERR "newlines($num,$max) = ",length($nl),"\n"; #print STDERR "opts{'_first'}='$opts{'_first'}'\n"; #print STDERR "opts{'_line_len'}='$opts{'_line_len'}'\n"; #print STDERR "opts{'_newlines'}='$opts{'_newlines'}'\n"; return($nl); } sub display_text { local(*opts,$text) = @_; #print STDERR "display_text(opts,$text)\n"; #print STDERR "caller(0)=(",join(',',caller(0)),")\n"; #print STDERR "caller(1)=(",join(',',caller(1)),")\n"; #print STDERR "caller(2)=(",join(',',caller(2)),")\n"; #return unless (length($text)); $opts{'_spaces'} = $opts{'_newlines'} = 0; #print STDERR "display_text: opts{'_line_len'}='$opts{'_line_len'}'\n"; $opts{'_line_len'} += length($text); #print STDERR "display_text: opts{'_line_len'}='$opts{'_line_len'}'\n"; $opts{'_spaces'} = 1 if ($text =~ / $/); } sub has_tag { #return(($_[0] =~ /<\S.*>/) ? 1 : 0); return(($_[0] =~ /<\S+\s*.*>/) ? 1 : 0); } sub is_tag { #return(($_[0] =~ /^<\S.*>$/) ? 1 : 0); return(($_[0] =~ /^<\S+\s*.*>$/) ? 1 : 0); } # Takes a HTML tag like: #
# and returns # ('a','src','/home/path/to/a/pic.gif','ismap','1') # which should be read like # ($tag,%tags) = &split_tag($html); # sub split_tag { local($html_tag) = shift; return($html_tag) unless($html_tag =~ /^<.*>$/); local($tag,@tags,$_,$key,$val,%ret); $tag = $html_tag; $tag =~ s/^/; $tag =~ s/\s*>$//; $tag =~ s/[\r\n]+/ /g; # to handle @tags = &'shellwords($tag); $tag = shift(@tags); $tag =~ tr/A-Z/a-z/; for (@tags) { # This will skip value's like #
if (/=/) { local($val_i); ($key,$val) = split(/=/,$_,2); $key =~ tr/A-Z/a-z/; $val =~ s/^"//; $val =~ s/"$//; next unless(length $val); # For
#$val = 1 if ($key =~ /^$val$/i); $val_i = $val; $val_i =~ tr/A-Z/a-z/; $val = 1 if ($key eq $val_i); $ret{"${tag}_$key"} = $val; } else { # For
$ret{"${tag}_$_"} = 1; } } return($tag,%ret) if (%ret); return($tag); } sub process_html_text { local(*opts,$text,$nowrap) = @_; #print STDERR "process_html_text(opts,$text)\n"; #print STDERR "caller(0)=(",join(',',caller(0)),")\n"; #print STDERR "caller(1)=(",join(',',caller(1)),")\n"; #print STDERR "caller(2)=(",join(',',caller(2)),")\n"; local($ret,$tag,$_,@t,$t,$routine); return unless(($text =~ /\S/) || ($opts{'_tag'} eq 'pre')); if ($opts{'_tag'}) { $tag = $opts{'_tag'}; local($open,$close) = ($tag,"/$tag"); local(%open,%close); for (keys %opts) { next unless(/^${tag}_/o); $open{$_} = $opts{$_}; } if ($tags_single{$tag}) { $ret .= &call_tags_single(*opts,*open); } elsif ($tags_block{$tag}) { $ret .= &call_tags_block(*opts,*open,*close_tag,$text) if ($text); } elsif ($tags_ignore{$tag}) { return undef; } return($ret); } $text =~ s/\s+/ /g; if ($opts{'_newlines'}) { $text =~ s/^\s+//; } $routine = $opts{'_filter'}; if (!$nowrap && ($opts{'_line_len'}+length($text) > $opts{'_max_len'})) { @t = &wordwrap($opts{'_line_len'},$opts{'_max_len'},$text); for $t (0..$#t) { if ($opts{'_filter'}) { $ret .= &$routine(*opts,$t[$t]); } else { $ret .= $t[$t]; &display_text(*opts,$t[$t]); } $ret .= &w3bbc_html'newlines(*opts,1) unless($t == $#t); } } else { if ($opts{'_filter'}) { $ret .= &$routine(*opts,$text); } else { $ret .= &unescape($text); &display_text(*opts,$text); } } $text = ''; return($ret); } sub call_tags_single { local(*opts,*tag) = @_; local($routine) = $tags_single{$tag}; return &$routine(*opts,*tag); } sub call_tags_block { local(*opts,*tag,*close_tag,$text) = @_; local($routine) = $tags_block{$tag}; return &$routine(*opts,*tag,*close_tag,$text); } sub process_html { local(*opts,*html_in) = @_; local($ret,$_,$l,$open,%open,$num_tags,$text,$routine,$close_tag,%close_tag); local($last_tag,$open_ndx); #unless($opts{'_textspace'}) { $opts{'_textspace'} = " "; } # 4 spaces unless($opts{'_textspace'}) { $opts{'_textspace'} = " "; } # 2 spaces unless($opts{'_max_len'}) { $opts{'_max_len'} = 75; } $open_ndx = $l = -1; while (defined $html_in[++$l]) { #print STDERR "html_in[$l]='$html_in[$l]'\n"; #next if ($html_in[$l] =~ /^<\!.*>$/); next if (substr($html_in[$l],0,2) eq ' but everyone doesn't use SGML comment form) $last_tag = $open; $html_in[$l] =~ s/\s+=/=/g; # kludge to fix $html_in[$l] =~ s/=\s+/=/g; # kludge to fix # The above should be rewritten to take a argument, check for /=/, # if not take the next, unless (/=\s*\S/) and the next.. ($open,%open) = &split_tag($html_in[$l]); #if ($text =~ /\S/) { $ret .= &process_html_text(*opts,$text); } if (($text =~ /\S/) || ($opts{'_tag'} eq 'pre')) { $ret .= &process_html_text(*opts,$text); } $text = ''; if ($tags_ignore{$open}) { next; } elsif ($tags_single{$open}) { $ret .= &call_tags_single(*opts,*open); } elsif ($tags_block{$open}) { $num_tags = 1; while (defined $html_in[++$l]) { #print STDERR "html_in[$l]='$html_in[$l]'\n"; if (($num_tags) && ($html_in[$l] =~ m!^<$open(\s.*)*\s*>$!i) && ($tags_not_nested{$open})) { # Un-nested block tags $num_tags--; ($close_tag,%close_tag) = ("/$open"); $l--; last; } elsif ($html_in[$l] =~ m!^<$open(\s.*)*\s*>$!i) { # nested block tags $open_ndx = $l+1; $num_tags++; $text .= $html_in[$l]; ($close_tag,%close_tag) = (); } elsif ($html_in[$l] =~ m!^$open(\s.*)*\s*>$!i) { $open_ndx = -1; $num_tags--; unless ($num_tags) { ($close_tag,%close_tag) = &split_tag($html_in[$l]); last; } else { $text .= $html_in[$l]; } } else { $text .= $html_in[$l]; } } if (!$close && $open_ndx != -1) { #print STDERR "$open wasn't closed, resetting..\n"; #$l = $open_ndx; #$text = ''; #next; $close = "/$open"; } $ret .= &call_tags_block(*opts,*open,*close_tag,$text) if (($text =~ /\S/) || ($opts{'_tag'} eq 'pre')); $text = ''; } else { # Otherwise it's a invalid tag, ignore it if ($opts{'_unknown'} eq 'preserve') { $text .= $html_in[$l]; } else { print "process_html: Ignoring tag '$open'\n" if ($opts{'_debug'}); } } } else { $text .= $html_in[$l]; } } if (($text =~ /\S/) || ($opts{'_tag'} eq 'pre')) { $ret .= &process_html_text(*opts,$text); } $text = ''; return($ret); } # subroutine: parse_html # Argument 1: (*data) - array pointer to return data # Argument 2-n: variables with HTML data # Returns in *data HTML split up - first is non-HTML tag, 2nd HTML tag... sub parse_html { local(*data) = shift(@_); local($save,$lt,$gt,$data,$_); NEXTLINE: for (@_) { $save .= $_; if ((($lt = index($save,'<')) == -1) || (index($save,'>',$lt) == -1)) { next; } $lt = $gt = 0; while (($lt = index($save, '<', $gt)) >= $[) { # This is the data *BEFORE* the '<' if ($lt) { # do If isn't /^ if ($gt) { $data = substr($save, ($gt+1), ($lt-$gt-1)); } else { $data = substr($save, ($gt), ($lt-$gt)); } push(@data, $data); } $gt = index($save, '>', $lt); if ($gt == -1) { $save = substr($save, $lt); next NEXTLINE; } # This is the data *INSIDE* the <> $data = substr($save, $lt, ($gt-$lt+1)); push(@data, $data); } $save = substr($save, ($gt+1)); } push(@data, $save); return(@data); } sub parse_html_list { local($type) = shift(@_); local(@links,$n,$_,@ret,$text,$oldn,@type,$key,$subtype,@subtype,$tmp); $n = 1; &w3bbc_html'parse_html(*links, @_); $subtype = ' '; for (@links) { if (m!^<(dl|ol|ul|menu|dir)\s*>$!i) { $tmp = $1; $tmp =~ tr/A-Z/a-z/; $key = "$type/$subtype/$n"; push(@type,$type); push(@subtype,$subtype); $type = $tmp; push(@ret,$key,$text) if ($text =~ /\S/); # At least one non-space char $text = ''; $subtype = ' '; $oldn = $n; $n++; } elsif (m!^(dl|ol|ul|menu|dir)\s*>$!i) { $key = "$type/$subtype/$n"; push(@ret,$key,$text) if ($text =~ /\S/); # At least one non-space char #$type = pop(@type); $type = ' '; # Continuation of previous type #$subtype = pop(@subtype); $subtype = ' '; $text = ''; $oldn = $n; $n--; } elsif (m!^
- $!i) { if ($type eq ' ') { $type = pop(@type); } $key = "$type/$subtype/$n"; push(@ret,"$key",$text) if ($text =~ /\S/); # At least one non-space char $subtype = ' '; $text = ''; } elsif (m!^
- $!i) { # Description term $key = "$type/$subtype/$n"; push(@ret,$key, $text) if ($text =~ /\S/); $subtype = 'dt'; $text = ''; } elsif (m!^
- $!i) { # Description definition $key = "$type/$subtype/$n"; push(@ret,$key, $text) if ($text =~ /\S/); $subtype = 'dd'; $text = ''; } else { $text .= $_; } } push(@ret,"$type/$subtype/$n",$text) if ($text =~ /\S/); return(@ret); } sub wordwrap { local($offset,$length,$text) = @_; #print STDERR "wordwrap($offset,$length,$text)\n"; #print STDERR "return($text) if (($offset+length($text)) <= $length);\n"; return($text) if (($offset+length($text)) <= $length); #print STDERR "caller(0)=(",join(',',caller(0)),")\n"; #print STDERR "caller(1)=(",join(',',caller(1)),")\n"; #print STDERR "caller(2)=(",join(',',caller(2)),")\n"; # Die hard - otherwise it will loop forever.. die "wordwrap called without valid length argument" unless($length); local(@r,$space,$l,$r); if ($offset) { $r = ($length - $offset) } else { $r = $length; } while (1) { last unless($text); $l = length($text); if ($l > $r) { $space = rindex($text, ' ', $r); if ($space == -1) { unless($offset) { push(@r, substr($text, 0, $r)); $text = substr($text, $r, $length); } else { push(@r,''); # skip to next line } #$text = substr($text, $length); } else { push(@r, substr($text, 0, $space)); $text = substr($text, ($space+1)); # Skip the space } } else { push(@r, $text); $text = ''; } $r = $length; $offset = 0; } return(@r); } # This routine will take a argument of a url which contains # HTML data. It will extract the
and links, # and return a array with two entries for each link. # # For links, the first entry will be the description between # and and the second will be the URL for that hyperlink # # For
links, the first will be the entire HTML tag, and the # second will be the URL for the image.. # sub extract_links_desc { local($in_url) = shift(@_); local($links,@links,%links); local(@ret,$url,$_,$desc,@headers,%headers,$content,$response); $response = &www'lrequest('GET', *in_url, *headers, *content, $request_timeout); &parse_html(*links,$content); $links = -1; while (defined $links[++$links]) { $_ = $links[$links]; if ((/^$/i) || (/^$/i)) { $url = $1; $desc = ''; while (defined $links[++$links]) { last if ($links[$links] =~ m!<\s*/\s*a\s*>!); $desc .= $links[$links]; } push(@ret,$url,$desc); } elsif ((/^\s*
$/i) || (/^\s*
$/i)) { push(@ret,$1,$_); } else { next; } } return(@ret); } sub parse_html_tag { local(*opts,*tag,*close_tag,$html) = @_; #local(*opts,*tag,*close_tag,$html) = (shift,shift,shift,shift); local(@parsed,@save,$ret); # unless ($opts{'_tag'} eq $open) { # $ret = &newlines(*opts,2); # } if (&has_tag($html)) { @parsed = (); &parse_html(*parsed,$html); $html = &process_html(*opts,*parsed); } else { &display_text(*opts,$html); } $ret .= $html; return($ret); } sub unescape { local($html) = @_; #print STDERR "unescape($html)\n"; #print STDERR "caller(0)=(",join(',',caller(0)),")\n"; #print STDERR "caller(1)=(",join(',',caller(1)),")\n"; #print STDERR "caller(2)=(",join(',',caller(2)),")\n"; local($ndx,$ctr,$amp,$c,$len,$_,$replace); return($html) if (($ndx = index($html,'&')) == -1); unless($html_ampersand_len) { # Set the max len of the keys - so I can abort early if no match for (keys %html_ampersand) { $len = length($_); if ($len > $html_ampersand_len) { $html_ampersand_len = $len; } } } $len = length($html); do { $ctr = 1; $amp = ''; $replace = 1; while (($replace) && (!$html_ampersand{$amp})) { $c = substr($html,$ndx+$ctr,1); if (($c eq '&') || ($c eq ';')) { last; } if ($ctr > $html_ampersand_len) { $replace = 0; next; } $c =~ tr/A-Z/a-z/; $amp .= $c; $ctr++; if ($ndx+$ctr >= $len) { $replace = 0; next; } } if ($replace && $html_ampersand{$amp}) { if (substr($html,$ndx+$ctr,1) eq ';') { substr($html,$ndx,$ctr+1) = $html_ampersand{$amp}; } else { substr($html,$ndx,$ctr) = $html_ampersand{$amp}; } } } while ((($ndx = index($html,'&',$ndx+1)) != -1) && ($ndx+$ctr < $len)); #print STDERR "unescape = $html\n"; return($html); } package main; 1;