relative URL routine

Brooks Cutter (bcutter@twilight.paradyne.att.com)
Wed, 16 Nov 1994 18:57:06 -0500


Enclosed is a routine that takes two a base and current URL, and
attempts to return a relative URL if possible.

I use this with a robot that mirrors the web to a local file, 
and converts the URLss into relative URLS so the mirror can
be moved around...

-Brooks


----- Begin Included Message -----

package wwwurl;

# ===========================================================================
# relative(): Convert a URL relative to base URL 
#             If possible, it will drop the scheme, host and port,
#             and trim the leading directory
#
# $relative_url = &wwwurl'relative($base_url, $curr_url, $dotpath);
#
# where 
#
#    $base_url: the URL where $curr_url will appear.  Used for comparison
#
# $curr_url: The current URL that should be made relative if possible
#
#     $dotpath: Reserved for future use. (See todo below)
#
# Note: relative() does not require that either the $base_url or
#       $curr_url be absolute URL's.  Here is the behavior
#
# Base| Curr| Action (what will happen as a result)
# Yes | Yes | normal behavior.  Scheme and Address are compared for equality
# Yes | No  | assumed that $curr_url uses same scheme & address as $base_url
# No  | Yes | $curr_url will be returned without modification
# No  | No  | assumed that both use same scheme and same address and port
#
# Examples:
#
# $base_url = 'http://www.company.com/dir1/file.html';
# $curr_url = 'http://www.company.com/dir1/file2.html';
# $relative_url = &wwwurl'relative($base_url, $curr_url, $dotpath,0);
#
# $relative_url eq 'file2.html'
# # Just the filename is returned
#
# $base_url = 'http://www.company.com/dir1/file.html';
# $curr_url = 'http://www.company.com/dir1/dir2/file2.html';
# $relative_url = &wwwurl'relative($base_url, $curr_url, $dotpath,0);
#
# $relative_url eq 'http://www.company.com/dir1/dir2/file2.html'
# # the complete URL is returned, because they don't share a common parent dir
#
# $base_url = '/dir1/file.html';
# $curr_url = '/dir1/file2.html';
# $relative_url = &wwwurl'relative($base_url, $curr_url, $dotpath,0);
#
# $relative_url eq 'file2.html'
# # assumed that they share the same scheme, address and port
#
# $base_url = 'http://www.company.com/dir1/file.html';
# $curr_url = '/dir1/file2.html';
# $relative_url = &wwwurl'relative($base_url, $curr_url, $dotpath,0);
#
# $relative_url eq 'file2.html'
# # assumed that curr_url uses the same scheme, address and port as $base_url
#
# Todo:
# - add support for third argument $dotpath (default is false)
#   if $dotpath is true, then it will attempt to return a 
#   relative path with "../../..", etc if the host is the same.
#   This is needed for a robot that mirrors a web to a filesystem and
#   and rewrites the URL's to relative URL's, assuming they are all local.
#   would allow the root of the mirror to be moved to another server
#   or directory (since all references are to same server)
#
sub relative {
	local($base_url,$curr_url,$dotpath) = @_;
	return($curr_url) unless($base_url); # Can't do anything without a $base_url
	local($scheme, $address, $port, $path, $query, $frag);
	local($path_end,$last_slash);

	# If $base_url is the same as Base URL, ok..
	# if $base_url is same as a copy of Base URL, then Base URL has a extra :80
	if (($base_url ne $Burl)  && ((!length($Burl2)) || ($base_url ne $Burl2))) 
	{ # Check the Base URL cache
		$Burl = $base_url;
		($Bsch,$Baddr,$Bport,$Bpath,$Bquery,$Bfrag) = &wwwurl'parse($Burl);
		$Burl2 = $Burl; # make a second copy, in case I rewrite it next
		return($curr_url) if ($InSet{$Bsch} & $NonHierarchical);
		if (!$Bport && $Baddr && $Bsch) # default port used with scheme & address
		{ # If I have to modify it, rewrite it with the :80
			$Bport = 80; # set to default port, 80
			$Burl = $base_url = 
				&wwwurl'compose($Bsch, $Baddr, $Bport, $Bpath, $Bquery, $Bfrag);
		}
		$Bmem = $InSet{$Bsch} || 0;
	}

	($scheme, $address, $port, $path, $query, $frag) = &wwwurl'parse($curr_url); 

	# If it isn't a hiearchical URL like news: or mailto:
	return($curr_url) if ($InSet{$scheme} & $NonHierarchical);

	if (!$port && $address && $scheme) # default port used with scheme & address
	{ # If I have to modify it, rewrite it with the :80
		$port = 80; # set to default port, 80
		$curr_url = 
			&wwwurl'compose($scheme, $address, $port, $path, $query, $frag);
	}
	return($curr_url) if ($scheme && $Bsch ne $scheme);     # $scheme mismatch
	return($curr_url) if ($address && $Baddr ne $address);  # $address mismatch
	return($curr_url) if ($port && $Bport ne $port);        # $port mismatch
	$path_end = index($base_url,'?'); # before this, nuthin but path
	if (($path_end == -1) && (($path_end = index($base_url,'#')) == -1)) 
	{ # If I can't find a ? or # then start at the last slash..
		$last_slash = rindex($base_url,'/'); # search from the end
	} else { # start at the first ? or #
		$last_slash = rindex($base_url,'/',$path_end); # search from end of path
	}
	if (substr($base_url,0,$last_slash) eq substr($curr_url,0,$last_slash))
	{ # If the leading scheme/host/port/paths are the same, return the rest
		return(substr($curr_url,$last_slash+1)); # return the relative portion..
	}
	# If the $scheme, $host and $port are common, then just return
	# the path, query and fragment..
	return(&wwwurl'compose('','','',$path,$query,$frag));
}

package main;

1;


----- End Included Message -----