# Date: Wed, 16 Nov 1994 18:57:06 -0500 # From: Brooks Cutter # Message-Id: <9411162357.AA07903@twilight.paradyne.att.com> # To: libwww-perl@ics.uci.edu # Subject: relative URL routine # # Enclosed is a routine that takes two a base and current URL, and # attempts to return a relative URL if possible. # # I use this with a robot that mirrors the web to a local file, # and converts the URLss into relative URLS so the mirror can # be moved around... # # -Brooks # # # ----- Begin Included Message ----- package wwwurl; # =========================================================================== # relative(): Convert a URL relative to base URL # If possible, it will drop the scheme, host and port, # and trim the leading directory # # $relative_url = &wwwurl'relative($base_url, $curr_url, $dotpath); # # where # # $base_url: the URL where $curr_url will appear. Used for comparison # # $curr_url: The current URL that should be made relative if possible # # $dotpath: Reserved for future use. (See todo below) # # Note: relative() does not require that either the $base_url or # $curr_url be absolute URL's. Here is the behavior # # Base| Curr| Action (what will happen as a result) # Yes | Yes | normal behavior. Scheme and Address are compared for equality # Yes | No | assumed that $curr_url uses same scheme & address as $base_url # No | Yes | $curr_url will be returned without modification # No | No | assumed that both use same scheme and same address and port # # Examples: # # $base_url = 'http://www.company.com/dir1/file.html'; # $curr_url = 'http://www.company.com/dir1/file2.html'; # $relative_url = &wwwurl'relative($base_url, $curr_url, $dotpath,0); # # $relative_url eq 'file2.html' # # Just the filename is returned # # $base_url = 'http://www.company.com/dir1/file.html'; # $curr_url = 'http://www.company.com/dir1/dir2/file2.html'; # $relative_url = &wwwurl'relative($base_url, $curr_url, $dotpath,0); # # $relative_url eq 'http://www.company.com/dir1/dir2/file2.html' # # the complete URL is returned, because they don't share a common parent dir # # $base_url = '/dir1/file.html'; # $curr_url = '/dir1/file2.html'; # $relative_url = &wwwurl'relative($base_url, $curr_url, $dotpath,0); # # $relative_url eq 'file2.html' # # assumed that they share the same scheme, address and port # # $base_url = 'http://www.company.com/dir1/file.html'; # $curr_url = '/dir1/file2.html'; # $relative_url = &wwwurl'relative($base_url, $curr_url, $dotpath,0); # # $relative_url eq 'file2.html' # # assumed that curr_url uses the same scheme, address and port as $base_url # # Todo: # - add support for third argument $dotpath (default is false) # if $dotpath is true, then it will attempt to return a # relative path with "../../..", etc if the host is the same. # This is needed for a robot that mirrors a web to a filesystem and # and rewrites the URL's to relative URL's, assuming they are all local. # would allow the root of the mirror to be moved to another server # or directory (since all references are to same server) # sub relative { local($base_url,$curr_url,$dotpath) = @_; return($curr_url) unless($base_url); # Can't do anything without a $base_url local($scheme, $address, $port, $path, $query, $frag); local($path_end,$last_slash); # If $base_url is the same as Base URL, ok.. # if $base_url is same as a copy of Base URL, then Base URL has a extra :80 if (($base_url ne $Burl) && ((!length($Burl2)) || ($base_url ne $Burl2))) { # Check the Base URL cache $Burl = $base_url; ($Bsch,$Baddr,$Bport,$Bpath,$Bquery,$Bfrag) = &wwwurl'parse($Burl); $Burl2 = $Burl; # make a second copy, in case I rewrite it next return($curr_url) if ($InSet{$Bsch} & $NonHierarchical); if (!$Bport && $Baddr && $Bsch) # default port used with scheme & address { # If I have to modify it, rewrite it with the :80 $Bport = 80; # set to default port, 80 $Burl = $base_url = &wwwurl'compose($Bsch, $Baddr, $Bport, $Bpath, $Bquery, $Bfrag); } $Bmem = $InSet{$Bsch} || 0; } ($scheme, $address, $port, $path, $query, $frag) = &wwwurl'parse($curr_url); # If it isn't a hiearchical URL like news: or mailto: return($curr_url) if ($InSet{$scheme} & $NonHierarchical); if (!$port && $address && $scheme) # default port used with scheme & address { # If I have to modify it, rewrite it with the :80 $port = 80; # set to default port, 80 $curr_url = &wwwurl'compose($scheme, $address, $port, $path, $query, $frag); } return($curr_url) if ($scheme && $Bsch ne $scheme); # $scheme mismatch return($curr_url) if ($address && $Baddr ne $address); # $address mismatch return($curr_url) if ($port && $Bport ne $port); # $port mismatch $path_end = index($base_url,'?'); # before this, nuthin but path if (($path_end == -1) && (($path_end = index($base_url,'#')) == -1)) { # If I can't find a ? or # then start at the last slash.. $last_slash = rindex($base_url,'/'); # search from the end } else { # start at the first ? or # $last_slash = rindex($base_url,'/',$path_end); # search from end of path } if (substr($base_url,0,$last_slash) eq substr($curr_url,0,$last_slash)) { # If the leading scheme/host/port/paths are the same, return the rest return(substr($curr_url,$last_slash+1)); # return the relative portion.. } # If the $scheme, $host and $port are common, then just return # the path, query and fragment.. return(&wwwurl'compose('','','',$path,$query,$frag)); } package main; 1;