#!YOUR_PERL_BINARY
# ---------------------------------------------------------------------------
$Version = 'oldlog2new-2.0';
#
# Copyright (c) 1994, 1996 Regents of the University of California.
#
# This software has been developed by Roy Fielding as
# part of the WebSoft project at the University of California, Irvine.
#
# See the file LICENSE for licensing and redistribution information.
#
# This program is based on an early version of the wwwstat log analyzer.
# It exists only for the purpose of converting old NCSA httpd 1.0 and 1.1
# log files to the common logfile format (CLF) used by wwwstat-1.0 and later.
# ALMOST ALL SITES WILL HAVE NO USE FOR THIS PROGRAM.
#
# It reads the old log, figures out what each entry points to, finds the
# current file size for that entity, and outputs the new format including
# a reasonable approximation of the server response code.  NOTE that this
# won't work if the logfile entries do not correspond to real files that
# are still in the place they were when the entry was logged.
#

# usage -- print the command-line synopsis and terminate the program.
#
# Takes no arguments.  The text (which interpolates $Version) is emitted via
# die(), so it goes to STDERR and the process exits with a non-zero status.
# Never returns.
sub usage {
    die <<"EndUsage";
usage: oldlog2new [-hez] [-f logfile] [-s srmfile]
$Version
Convert an NCSA httpd 1.1 access_log file to CLF access_log
Display Options:
     -h  Help -- just display this message and quit.
     -e  Display all invalid log entries on STDERR.
         (default is to ignore them)
Input Options:
     -f  Read from the following access_log file instead of the default.
     -z  Use zcat to uncompress the log file while reading [requires -f].
     -s  Get the server directives from the following srm.conf file.

EndUsage
}

# ---------------------------------------------------------------------------
# Set the default configuration options:

# Edit the next line to specify the (+/-)HHMM offset from GMT
# (appended verbatim to every converted timestamp).

$GMToffset = '-0700';

# Edit the next line to identify the server's default home page.
# Requests for "/" or "/<DirectoryIndex>" are mapped to this path
# relative to the DocumentRoot.

$ServerHome = "/";

# Edit the next two lines to specify the location of your server access log
# and your server configuration (srm.conf) file.
$access_log = '/usr/local/etc/httpd/logs/access_log';
$srm_conf   = '/usr/local/etc/httpd/conf/srm.conf';

# Edit the next line to specify the command for displaying compressed files.
# The command is split on whitespace and run directly (no shell), with the
# log file name appended as its final argument.

$zcat = 'gunzip -c';       # specify as null string if none are available

# Does the server do an automatic redirect for slashless index requests?
# When set, a request for a directory (or /~user) without a trailing slash
# whose index file exists is recorded as a 302.

$DirectoryRedirect = 1;    # (1 for httpd_1.1, 0 for httpd_1.0)

# Is the server running with rfc931 support (IdentityCheck on)?

$IdentityCheck = 0;        # Must = 1 if server uses rfc931 remote ident.

# Edit the next few lines to specify whether (1) or not (0) you want:

$PrintInvalids = 0;        # Display invalid log entries on STDERR?
$CompressedLog = 0;        # Access log has been compressed (or gzipped)?

# ==========================================================================
# Get the command-line options
#
# Getopt::Std replaces the Perl 4 "getopts.pl" library, which is no longer
# shipped with Perl.  getopts() sets the same $opt_X package variables and
# returns false when an unknown option is seen.

use Getopt::Std;
our ($opt_h, $opt_e, $opt_z, $opt_f, $opt_s);

if (!getopts('hezf:s:') || $opt_h) { usage(); }

my $have_logfile = (defined($opt_f) && length($opt_f));

if ($opt_e) { $PrintInvalids = 1; }
if ($opt_z)
{
    if ($have_logfile) { $CompressedLog = 1; }   # Require logfile name if
    else               { usage(); }              # uncompression is desired
}
if ($have_logfile) { $access_log = $opt_f; }
if (defined($opt_s) && length($opt_s)) { $srm_conf = $opt_s; }

if ($CompressedLog && !$zcat)
{
    die "No zcat decompression command has been defined, stopped";
}

# ==========================================================================
# Get the other needed configuration items from the srm.conf file

open(my $srm, '<', $srm_conf)
    || die "Error opening config file: $srm_conf: $!\n";

$UserDir        = "public_html";                  # Start with NCSA defaults
$DirectoryIndex = "index.html";
$DocumentRoot   = "/usr/local/etc/httpd/htdocs";

# Each list holds [ anchored-prefix-regex, target ] pairs in the order the
# directives appear in srm.conf, so that overlapping prefixes are resolved
# deterministically (first directive listed wins).
my (@Redirects, @Aliases, @Scripts);

while (my $directive = <$srm>)
{
    # Ignore blank and comment lines
    next if ( ($directive eq "\n") || ($directive =~ /^\#/) );

    if ($directive =~ /^DocumentRoot (.+)\s/)
    {
        $DocumentRoot = $1;
    }
    elsif ($directive =~ /^UserDir (.+)\s/)
    {
        $UserDir = $1;
    }
    elsif ($directive =~ /^DirectoryIndex (.+)\s/)
    {
        $DirectoryIndex = $1;
    }
    elsif ($directive =~ /^Redirect\s+(\S+)\s+(\S+)\s/)
    {
        my ($prefix, $target) = ($1, $2);
        push(@Redirects, [ qr/^\Q$prefix\E/, $target ]);
    }
    elsif ($directive =~ /^Alias\s+(\S+)\s+(\S+)\s/)
    {
        my ($prefix, $target) = ($1, $2);
        push(@Aliases, [ qr/^\Q$prefix\E/, $target ]);
    }
    elsif ($directive =~ /^(?:Old)?ScriptAlias\s+(\S+)\s+(\S+)\s/)
    {
        my ($prefix, $target) = ($1, $2);
        push(@Scripts, [ qr/^\Q$prefix\E/, $target ]);
    }
}
close($srm);

# ==========================================================================
# Now read log, figure out the response code and bytes, and output new format
#
my $log;
if ($CompressedLog)
{
    # List-form pipe open: the file name is passed as a plain argument and
    # is never interpreted by a shell.
    my @zcat_cmd = split(' ', $zcat);
    open($log, '-|', @zcat_cmd, $access_log)
        || die "Error opening access log file: $zcat $access_log |: $!\n";
}
else
{
    open($log, '<', $access_log)
        || die "Error opening access log file: $access_log: $!\n";
}

my %Sizes;      # Cache: request path => file size, or undef if no such file

LINE: while (my $entry = <$log>)
{
    my $saveline = $entry;
    my $ident    = "-";

    if ($IdentityCheck)            # Does log include IdentityCheck info?
    {
        # Only trust $1 when the pattern actually matched this line.
        if ($entry =~ /^(.*)\@\S+\s/)
        {
            $ident = $1 if length($1);     # Save ident for later use
            $saveline =~ s/^.*\@//;        # Remove the remote ident from log
        }
        $entry = $saveline;
    }

    # Old format:  host [Day Mon DD HH:MM:SS YYYY] METHOD URL [HTTP-version]
    my ($afield, $date, $method, $oname, $htv) =
        ($entry =~ /^(\S+) \[(.+)\] (\S+)\s+(\S+)\s(.*)$/);

    if (!($afield && $date && $method && $oname && (length($date) == 24)))
    {
        print(STDERR "$.:$saveline") if $PrintInvalids;
        next LINE;
    }

    #
    # First, we have to figure out what file or script was accessed
    #
    my $fname = $oname;
    $fname =~ s/\?.*$//;          # Remove any trailing query information
    $fname =~ s/\#.*$//;          # Remove any trailing anchor information
    $fname =~ s#//+#/#g;          # Collapse any run of slashes into one

    if (($fname eq "") || ($fname eq "HTTP/1.0"))
    {
        print(STDERR "$.:$saveline") if $PrintInvalids;
        next LINE;
    }

    my $rname = "";               # Real (filesystem) name, unknown so far
    my $rcode = 200;              # Response code to report
    my $fsize = 0;                # Bytes to report ('-' when not applicable)

  FNAME:                          # Get the document's real name
    {
        if (($fname eq "/") || ($fname eq "/$DirectoryIndex"))
        {
            $fname = "$ServerHome";     # Handle top file with extra care
            $rname = "$DocumentRoot$fname";
            last FNAME;
        }

        foreach my $redirect (@Redirects)    # Is it a redirected file?
        {
            if ($fname =~ $redirect->[0])
            {
                $rcode = 302;
                last FNAME;
            }
        }

        foreach my $alias (@Aliases)         # Is it a file name alias?
        {
            my ($prefix_re, $target) = @{$alias};
            if ($fname =~ $prefix_re)
            {
                $rname = $fname;
                $rname =~ s/$prefix_re/$target/;
                last FNAME;
            }
        }

        if ($fname =~ m{^/~(\w+)/})          # Is it a /~username/...?
        {
            my $user = $1;
            my $dir  = (getpwnam($user))[7]; # Home directory, if user exists
            if ($dir)
            {
                $rname = $fname;
                $rname =~ s{^/~\Q$user\E}{$dir/$UserDir};
            }
            else { $rcode = 404; }
            last FNAME;
        }

        if ($fname =~ m{^/~(\w+)$})          # Is it a /~username ?
        {
            my $user = $1;
            my $dir  = (getpwnam($user))[7];
            if ($dir)
            {
                $rname = $fname;
                $rname =~ s{^/~\Q$user\E}{$dir/$UserDir};
                if (-e "$rname/$DirectoryIndex")
                {
                    if ($DirectoryRedirect)
                    {
                        $rcode = 302;
                        last FNAME;
                    }
                    $rname .= "/$DirectoryIndex";
                }
                $fname .= '/';
            }
            else { $rcode = 404; }
            last FNAME;
        }

        foreach my $script (@Scripts)    # Is it a script directory alias?
        {
            if ($fname =~ $script->[0])
            {
                $fsize = '-';            # Script output size is unknowable
                last FNAME;
            }
        }

        if (-d "$DocumentRoot$fname")    # Is it a directory?
        {
            my $hasSlash = ($fname =~ s/\/$//);  # Remove any trailing slash

            if (-e "$DocumentRoot$fname/$DirectoryIndex")
            {
                if (!$hasSlash && $DirectoryRedirect)
                {
                    $rcode = 302;
                    last FNAME;
                }
                $rname = "$DocumentRoot$fname/$DirectoryIndex";
            }
            else
            {
                $rname = "$DocumentRoot$fname";
            }
            $fname .= '/';
            last FNAME;
        }

        $rname = "$DocumentRoot$fname";  # It must be a normal file
    }   # end FNAME

    if (!$fsize && ($rcode == 200) && $rname)     # Get the file size
    {                                 # through use of a cache of Sizes
        if (!exists($Sizes{$fname}))
        {
            # Distinguish "exists but empty" (size 0) from "missing" (undef);
            # -s alone returns false for both.
            $Sizes{$fname} = (-e $rname) ? ((-s _) || 0) : undef;
        }
        if (defined($Sizes{$fname})) { $fsize = $Sizes{$fname}; }
        else                         { $rcode = 404; }
    }

    if (!(($method eq 'GET') || ($method eq 'HEAD') || ($method eq 'POST')))
    {
        $rcode = 400;
    }

    if ($rcode != 200)        { $fsize = '-'; }
    elsif ($method eq 'HEAD') { $fsize = '0'; }

    if ($htv) { $oname .= ' '. $htv; }

    #
    # Phew!  Now we have to swap the date format around:
    #   "Day Mon DD HH:MM:SS YYYY"  ->  "DD/Mon/YYYY:HH:MM:SS <offset>"
    # (the 9-character time slice includes the blank that follows it)
    #
    my $newdate = substr($date,  8, 2) .'/'.
                  substr($date,  4, 3) .'/'.
                  substr($date, 20, 4) .':'.
                  substr($date, 11, 9) .
                  $GMToffset;
    $newdate =~ s/^ /0/;              # Zero-pad a single-digit day

    #
    # Now that we have categorized it, print it in the new format
    #
    print($afield,' ',$ident,' - [',$newdate,'] "',$method,' ',$oname,
          '" ',$rcode,' ',$fsize," \n");
}
close($log);

exit(0);