# From: Marc VanHeyningen
# To: libwww-perl@ics.uci.edu
# Subject: Re: Libwww suggestions
# Date: Fri, 26 Aug 1994 12:58:52 -0500
# Message-Id: <15253.777923932@moose.cs.indiana.edu>
#
# Thus wrote: Martijn Koster
# >Marc wrote:
# >> Also, does anyone know of perl code for converting HTML entities
# >> to their 8bit text equivalents in iso-8859-1? Or maybe their appropriate
# >> if crude ASCII renditions? And possibly back the other way (at least
# >> with the 8bit stuff)? It seems that would be a useful addition to wwwhtml.
# >
# >I did hack a script to read ISOLat1 and produce a table, from
# >which you can parse some 'crude' renditions. See
# >http://web.nexor.co.uk/mak/misc/charset.html
#
# Thanks for the pointer! The resulting code is attached. Roy, feel
# free to include it in wwwhtml.pl if you think it appropriate. It
# encodes and decodes HTML entities into (potentially 8bit) text files;
# obviously it should be used in domains where the real HTML tags are
# not present.
#
# A lot of people (like me) have been willy-nilly passing text back and
# forth between HTML and plaintext without thinking about these issues.
# I'm not currently motivated enough to try to establish a mapping to
# approximate ASCII representations (e.g. map ü to "ue"); if
# someone has a table of them it shouldn't be hard to throw in.
#
#  - Marc
#
# Code excerpt for HTML entity handling
# hastily written by Marc VanHeyningen August 1994

use strict;
use warnings;

# %entity     maps an HTML entity name (without the "&" and ";") to the
#             ISO-8859-1 character it stands for.
# %charentity is the reverse map, from character to entity name.
# Both stay package globals so code that loads this file can still consult
# or extend them.
our (%entity, %charentity);

$entity{"lt"}  = '<';
$entity{"gt"}  = '>';
$entity{"amp"} = '&';

{
    # ISO-8859-1 code point of every named Latin-1 letter entity.
    my %latin1_code_for = (
        aacute => 225, Aacute => 193, acirc  => 226, Acirc  => 194,
        agrave => 224, Agrave => 192, aring  => 229, Aring  => 197,
        atilde => 227, Atilde => 195, auml   => 228, Auml   => 196,
        aelig  => 230, AElig  => 198, ccedil => 231, Ccedil => 199,
        eth    => 240, ETH    => 208, eacute => 233, Eacute => 201,
        ecirc  => 234, Ecirc  => 202, egrave => 232, Egrave => 200,
        euml   => 235, Euml   => 203, iacute => 237, Iacute => 205,
        icirc  => 238, Icirc  => 206, igrave => 236, Igrave => 204,
        iuml   => 239, Iuml   => 207, ntilde => 241, Ntilde => 209,
        oacute => 243, Oacute => 211, ocirc  => 244, Ocirc  => 212,
        ograve => 242, Ograve => 210, oslash => 248, Oslash => 216,
        otilde => 245, Otilde => 213, ouml   => 246, Ouml   => 214,
        szlig  => 223, thorn  => 254, THORN  => 222,
        uacute => 250, Uacute => 218, ucirc  => 251, Ucirc  => 219,
        ugrave => 249, Ugrave => 217, uuml   => 252, Uuml   => 220,
        yacute => 253, Yacute => 221, yuml   => 255,
    );

    # chr() yields the unsigned 8-bit character directly; pack("c", N) is a
    # signed-char template and wraps (with a warning) for N > 127.
    foreach my $name (keys %latin1_code_for) {
        $entity{$name} = chr $latin1_code_for{$name};
    }
}

foreach my $name (keys %entity) {
    $charentity{ $entity{$name} } = $name;
}

# Resolve the argument of decode_entities/encode_entities to a reference to
# the scalar that must be modified in place.  Accepts a typeglob (*text, the
# traditional calling convention), a glob reference, a glob name, or a plain
# scalar reference.  Returns nothing when no argument was supplied.
sub _entity_target {
    my ($target) = @_;
    return unless defined $target;
    return $target if ref($target) eq 'SCALAR';
    no strict 'refs';    # a glob name given as a string is a symbolic reference
    return *{$target}{SCALAR};
}

# Return the character for one entity, or nothing if it cannot be decoded.
#   $hash  "#" when the reference was written "&#...;", otherwise "".
#   $name  the text between "&" (or "&#") and ";".
# Decimal references are accepted with or without the "#" (the bare form is
# what older versions of encode_entities produced); hexadecimal references
# need "#x".  Code points above 255 have no ISO-8859-1 character and are
# reported as undecodable.
sub _entity_char {
    my ($hash, $name) = @_;
    my $code;

    if ($name =~ /\A[0-9]+\z/) {
        $code = $name;
    }
    elsif ($hash && $name =~ /\A[xX]([0-9A-Fa-f]{1,6})\z/) {
        $code = hex $1;
    }
    elsif (!$hash && exists $entity{$name}) {
        return $entity{$name};
    }

    return unless defined $code && $code < 256;
    return chr $code;
}

# decode_entities(*text)
#
# Replace HTML entities in $text, in place, by their ISO-8859-1 characters.
# Named entities from %entity, decimal references ("&#233;", or the legacy
# "&233;") and hexadecimal references ("&#xE9;") are decoded.  Anything else
# is left untouched and reported once per distinct entity via warn().
#
# The whole string is rewritten in a single pass, so the "&" produced by
# "&amp;" is never re-read as the start of another entity
# ("&amp;lt;" becomes "&lt;", not "<").
sub decode_entities {
    my $text_ref = _entity_target(@_) or return;
    return unless defined $$text_ref;

    my %warned;
    $$text_ref =~ s{(&(\#?)(\w+);)}{
        my ($whole, $hash, $name) = ($1, $2, $3);
        my $char = _entity_char($hash, $name);
        if (!defined $char && !$warned{"$hash$name"}++) {
            warn "Unknown HTML entity $hash$name!";
        }
        defined $char ? $char : $whole;
    }ge;

    return;
}

# encode_entities(*text)
#
# Replace, in place, every "&", "<", ">" and every character in the range
# 128-255 of $text by an HTML entity: the name from %charentity when there
# is one, otherwise a decimal character reference such as "&#128;".
#
# A single pass is used so that the "&" introduced by one replacement is
# never encoded again.
sub encode_entities {
    my $text_ref = _entity_target(@_) or return;
    return unless defined $$text_ref;

    $$text_ref =~ s{([&<>\x80-\xFF])}{
        exists $charentity{$1} ? "&$charentity{$1};" : '&#' . ord($1) . ';'
    }ge;

    return;
}

1;