Re: excluding requests by URL pattern

Tomas Pospisek (tpo@spin.ch)
Mon, 24 Feb 1997 12:04:36 +0000 (GMT)


I've patched wwwstat to add an option that displays only HTML files
in the archive section.

The patch below includes >ALL< changes I've made recently. It patches
wwwstat and wwwstat.rc. If you only want the "OnlyHTML" change, search
for "OnlyHTML" in the patch and apply just the corresponding lines
(4 in total).

Look at the head of the patch for all changes.

Btw: what do you think of Roy Fielding's proposal to vote on which
patches get officially included?

+
T

--------------------------------------------------------------------------------
                            Tomas Pospisek   

           @ SPIN - Internet Services in Graubuenden/Switzerland
             for more info have a look at "http://www.spin.ch/"
--------------------------------------------------------------------------------

--- wwwstat.orig	Fri Feb 14 19:03:03 1997
+++ wwwstat	Mon Feb 24 11:35:52 1997
@@ -1,6 +1,6 @@
 #! /usr/local/bin/perl
 # ==========================================================================
-$Version = 'wwwstat-2.0';
+$Version = 'wwwstat-2.1b';
 #
 # Copyright (c) 1994, 1996 Regents of the University of California.
 #
@@ -13,6 +13,17 @@
 # See the file README  for more information.
 # See the wwwstat.1 man page for options and usage information.
 #
+# Patches added on 19.Feb.97 by T.Pospisek <tpo@spin.ch>
+#   Added configurable descriptors for Time/Date/Archive/Domain/SubDomain ...
+#   Added maximum Archive URL length
+#   Added configurable output of bars
+#   Added option to print country or domain (e.g. .com)
+#   Changed behaviour so unresolved hosts get an "Unresolved" country (for gwstat)
+#   :) T.Pospisek <tpo@spin.ch>
+#
+# Patches added on 24.Feb.97 by T.Pospisek <tpo@spin.ch>
+#   Added option to display only .htm/.html URLs in the Archive section
+
 sub usage {
     die <<"EndUsage";
 usage: $Pname [-F system_config] [-f user_config] [-helLoOuUrRvx]
@@ -20,10 +31,10 @@
                [-A IP_address] [-C code] [-D date] [-T hour] [-N archive_name] 
                [-m method]  [-M method]  [-H heading_title] [-X lastsummary]
                [-noescape] [-trunc N] [-files] [-nofiles] [-link] [-nolink]
-               [-cgi] [-nocgi] [-dns] [-nodns] [-cache filename]
-               [-daily] [-hourly] [-domain] [-subdomain] [-archive] [-ident]
-               [-all]   [-sort (key|byte|req)] [-top N]  [-both]
-               [-no (daily|hourly|domain|subdomain|archive|ident|all)]
+               [-cgi] [-nocgi] [-bars] [-nobars] [-dns] [-nodns]
+               [-cache filename] [-daily] [-hourly] [-domain] [-subdomain]
+               [-archive] [-ident] [-all] [-sort (key|byte|req)] [-top N]
+               [-both] [-no (daily|hourly|domain|subdomain|archive|ident|all)]
                [--] [ logfile | summary | + | - ]...
 $Version
    Process a sequence of httpd Common Logfile Format access_log files and/or
@@ -58,6 +69,8 @@
    -nolink  Do not add a hypertext link around each archive URL.
    -cgi     Check HTTP method and output like a CGI script.
    -nocgi   Do not produce CGI output.
+   -bars    Print bars
+   -nobars  Do not print bars
 Section Options, <section>=(all|daily|hourly|domain|subdomain|archive|ident):
    -no<section>  Exclude the given section from the output.
    -<section>    Include the given section and set scope for -sort and -top.
@@ -237,6 +250,19 @@
     $OldArchiveHeader   = 'Total Transfers from each Archive Section';
     $OldIdentHeader     = 'Total Transfers to each Remote Identifier';
 
+    # This sets some descriptors
+
+    $UnresolvedText     = 'Unresolved';
+    $DateText           = 'Date';
+    $TimeText           = 'Time';
+    $DomainText         = 'Domain';
+    $ReversedSubdomainText = 'Reversed Subdomain';
+    $ArchiveSectionText = 'Archive Section';
+    $RequestsText	= 'Hits in %';
+    # Maximum length of a URL/Archive name. 0 means unlimited
+
+    $MaxArchiveLength   = 0;
+
     # The following sets the default ordering for the daily stats.
     # Change this to 1 if you always want gwstat-style output.
 
@@ -273,6 +299,26 @@
     $Do_Subdomain    = 1;    # Display the Subdomain Statistics
     $Do_Archive      = 1;    # Display the Archive Statistics
 
+    # If you want the domain (0), the country (1) or both (2) printed
+
+    $PrintCountry    = 2;
+
+    # Print bars (1) or not (0)
+
+    $PrintBars       = 0;
+
+    # Display only (1) .htm, .html and "/"-terminated URLs in Archive Section, or all URLs (0)
+
+    $OnlyHTMLinArchiveSection = 0;
+
+    # Factors to modify the bar's length
+
+    $DateBarFactor       = 12;
+    $HourBarFactor       = 12;
+    $DomainBarFactor     = 5;
+    $SubdomainBarFactor  = 5;
+    $ArchiveBarFactor    = 75;
+
     # The following option is only useful if the server is running with
     # rfc931/ident support (i.e. "IdentityCheck on" appears in httpd.conf)
     # or for resources which require user authentication.
@@ -430,6 +476,10 @@
         {
             $Do_CGI = 1;
         }
+        elsif (/^bars$/)                            # Output Bars
+        {
+            $PrintBars = 1;
+        }
         elsif (/^dns$/)                            # Resolve IP addresses
         {
             $LookupDNS = 1;
@@ -1226,8 +1276,9 @@
             }
             else
             {
-                $domain    = 'unresolved';
-                $subdomain = $ShowUnresolved ? $host : 'Unresolved';
+                $domain    = 'unresolved '.$UnresolvedText;
+#                $domain    = 'unresolved ';
+                $subdomain = $ShowUnresolved ? $host : $UnresolvedText;
             }
         }
 
@@ -1552,6 +1603,75 @@
 }
 
 # ==========================================================================
+# Output the heading of one statistics section: an <HR>, an <H2> anchor named
+# $frag, and the column titles ($title2 labels the descriptor column).
+sub output_stat_head
+{
+    local($frag, $top, $title, $title2) = @_;
+    local($prefix) = $top ? "$PrefixTop $top" : $PrefixTotal;   # "$PrefixTop N" when $top is set, else $PrefixTotal
+
+    print "<HR>\n";
+    print "<H2><A NAME=\"$frag\">$prefix $title</A></H2>\n";
+    if ($PrintBars) {                                   # table layout: one <TD> per column plus a bar column
+        local ($PReqs, $PBytes, $Bytes, $Hits);         # the four column titles taken from $StatsHeader
+        # NOTE(review): split is on any whitespace, so a two-word title such as "Bytes Sent" shifts the labels
+	($PReqs, $PBytes, $Bytes, $Hits) = split ( /\s+/, $StatsHeader);
+        print "<TABLE>\n";
+        # the table opened here is closed by print_end_tag
+	print "<TR>\n";
+        print '	<TD ALIGN="left" VALIGN="middle"><B>'. $PReqs.  "</B></TD>\n";
+        print '	<TD ALIGN="left" VALIGN="middle"><B>'. $PBytes. "</B></TD>\n";
+        print '	<TD ALIGN="left" VALIGN="middle"><B>'. $Bytes.  "</B></TD>\n";
+        print '	<TD ALIGN="left" VALIGN="middle"><B>'. $Hits.   "</B></TD>\n";
+        print '	<TD ALIGN="left" VALIGN="middle"><B>'. $title2. "</B></TD>\n";
+        print '	<TD ALIGN="left" VALIGN="middle"><B>'. $RequestsText.
+								"</B></TD>\n";
+        print "</TR>\n";
+    } else {                                            # text layout: $StartTag, header line, rule line
+        print $StartTag;
+        print $StatsHeader, " $title2\n";
+        print $StatsRule,   "------------\n";           # always 12 dashes, whatever the length of $title2
+    }
+}
+
+# ==========================================================================
+# Close a statistics section started by output_stat_head: print </TABLE> when
+# $PrintBars is set, otherwise print $EndTag.  Reads no arguments.
+sub print_end_tag
+{
+    if ($PrintBars) {
+        print "</TABLE>\n";
+    } else {
+        print $EndTag;
+    }
+}
+
+# ==========================================================================
+# Output one row of a statistics section.  Args: ($fmt, $pctrqsts, $pctbytes,
+# $bytes, $rqsts, $descriptor, $faktor); $faktor scales the width of the bar.
+sub print_stat_line
+{
+    local ($fmt, $pctrqsts, $pctbytes, $bytes, $rqsts, $descriptor, $faktor);
+
+    ($fmt, $pctrqsts, $pctbytes, $bytes, $rqsts, $descriptor, $faktor) = @_;
+    if ($PrintBars) {                                   # table row; $fmt is not used in this branch
+	print "<TR>\n";
+        print '	<TD ALIGN="left" VALIGN="middle">'. $pctrqsts.   "</TD>\n";
+        print '	<TD ALIGN="left" VALIGN="middle">'. $pctbytes.   "</TD>\n";
+        print '	<TD ALIGN="left" VALIGN="middle">'. $bytes.      "</TD>\n";
+        print '	<TD ALIGN="left" VALIGN="middle">'. $rqsts.      "</TD>\n";
+        print '	<TD ALIGN="left" VALIGN="middle">'. $descriptor. "</TD>\n";   # printed as passed; output_archive may pass a link
+        print '	<TD ALIGN="left" VALIGN="middle">'.
+              '<IMG SRC="red.gif" ALT="bar" HEIGHT="10" WIDTH="'.
+              int($pctrqsts * $faktor) .'">'.                    "</TD>\n";   # WIDTH = percent of requests * $faktor, truncated
+        print "</TR>\n";
+        # values above are printed as passed, without the printf formatting of the text branch
+    } else {                                            # text row; $faktor is not used in this branch
+        printf $fmt, $pctrqsts, $pctbytes, $bytes, $rqsts, $descriptor;
+    }
+}
+
+# ==========================================================================
 # Output the stats for each calendar day represented in the input file(s)
 #
 sub output_daily
@@ -1559,13 +1679,8 @@
     local($frag) = @_;
     local($rqsts, $bytes, $pctrqsts, $pctbytes);
     local($top)    = $TopDaily;
-    local($prefix) = $top ? "$PrefixTop $top" : $PrefixTotal;
 
-    print "<HR>\n";
-    print "<H2><A NAME=\"$frag\">$prefix $DailyHeader</A></H2>\n";
-    print $StartTag;
-    print $StatsHeader, " Date\n";
-    print $StatsRule,   "------------\n";
+    &output_stat_head($frag, $top, $DailyHeader, $DateText);
     local($fmt) = "$StatsFormat %s\n";
     
     foreach $date (@SortedDates)
@@ -1582,11 +1697,12 @@
         } else {
             $pctbytes = sprintf("%5.2f", 100*$bytes/$TotalBytes);
         }
-        printf $fmt, $pctrqsts, $pctbytes, $bytes, $rqsts, $date;
+        &print_stat_line($fmt, $pctrqsts, $pctbytes, $bytes,
+                         $rqsts, $date, $DateBarFactor);
 
         last if ($top && (--$top == 0));
     }
-    print $EndTag;
+    &print_end_tag;
 }
 
 # ==========================================================================
@@ -1599,11 +1715,7 @@
     local($top)    = $TopHourly;
     local($prefix) = $top ? "$PrefixTop $top" : $PrefixTotal;
 
-    print "<HR>\n";
-    print "<H2><A NAME=\"$frag\">$prefix $HourlyHeader</A></H2>\n";
-    print $StartTag;
-    print $StatsHeader, " Time\n";
-    print $StatsRule,   "-----\n";
+    &output_stat_head($frag, $top, $HourlyHeader, $TimeText);
     local($fmt) = "$StatsFormat  %s\n";
 
     foreach $hour (sort hourcompare keys %HourRequests)
@@ -1620,11 +1732,12 @@
         } else {
             $pctbytes = sprintf("%5.2f", 100*$bytes/$TotalBytes);
         }
-        printf $fmt, $pctrqsts, $pctbytes, $bytes, $rqsts, $hour;
+        &print_stat_line($fmt, $pctrqsts, $pctbytes, $bytes,
+                         $rqsts, $hour, $HourBarFactor);
 
         last if ($top && (--$top == 0));
     }
-    print $EndTag;
+    &print_end_tag;
 }
 
 # ==========================================================================
@@ -1637,11 +1750,7 @@
     local($top)    = $TopDomain;
     local($prefix) = $top ? "$PrefixTop $top" : $PrefixTotal;
 
-    print "<HR>\n";
-    print "<H2><A NAME=\"$frag\">$prefix $DomainHeader</A></H2>\n";
-    print $StartTag;
-    print $StatsHeader, " Domain\n";
-    print $StatsRule,   "------------------------------------\n";
+    &output_stat_head($frag, $top, $DomainHeader, $DomainText);
     local($fmt) = "$StatsFormat %-5s %s\n";
     
     foreach $domain (sort domaincompare keys %DomainRequests)
@@ -1659,11 +1768,21 @@
         } else {
             $pctbytes = sprintf("%5.2f", 100*$bytes/$TotalBytes);
         }
-        printf $fmt, $pctrqsts, $pctbytes, $bytes, $rqsts, $domain, $country;
+
+        local ($descriptor);
+        if      ($PrintCountry == 0) {
+            $descriptor = $domain;
+        } elsif ($PrintCountry == 1) {
+            $descriptor = $country;
+        } else {
+            $descriptor = $domain."  ".$country;
+        }
+        &print_stat_line($fmt, $pctrqsts, $pctbytes,
+                         $bytes, $rqsts, $descriptor, $DomainBarFactor);
 
         last if ($top && (--$top == 0));
     }
-    print $EndTag;
+    &print_end_tag;
 }
 
 # ==========================================================================
@@ -1676,11 +1795,7 @@
     local($top)    = $TopSubdomain;
     local($prefix) = $top ? "$PrefixTop $top" : $PrefixTotal;
 
-    print "<HR>\n";
-    print "<H2><A NAME=\"$frag\">$prefix $SubdomainHeader</A></H2>\n";
-    print $StartTag;
-    print $StatsHeader, " Reversed Subdomain\n";
-    print $StatsRule,   "------------------------------------\n";
+    &output_stat_head($frag, $top, $SubdomainHeader, $ReversedSubdomainText);
     local($fmt) = "$StatsFormat %s\n";
 
     foreach $subdomain (sort subdomcompare keys %SubdomainRequests)
@@ -1697,11 +1812,12 @@
         } else {
             $pctbytes = sprintf("%5.2f", 100*$bytes/$TotalBytes);
         }
-        printf $fmt, $pctrqsts, $pctbytes, $bytes, $rqsts, $subdomain;
+        &print_stat_line($fmt, $pctrqsts, $pctbytes, $bytes,
+                         $rqsts, $subdomain, $SubdomainBarFactor);
 
         last if ($top && (--$top == 0));
     }
-    print $EndTag;
+    &print_end_tag;
 }
 
 # ==========================================================================
@@ -1714,14 +1830,10 @@
     local($top)    = $TopArchive;
     local($prefix) = $top ? "$PrefixTop $top" : $PrefixTotal;
 
-    print "<HR>\n";
-    print "<H2><A NAME=\"$frag\">$prefix $ArchiveHeader</A></H2>\n";
-    print $StartTag;
-    print $StatsHeader, " Archive Section\n";
-    print $StatsRule,   "------------------------------------\n";
+    &output_stat_head($frag, $top, $ArchiveHeader, $ArchiveSectionText);
     local($fmt) = "$StatsFormat %s\n";
 
-    foreach $section (sort archivecompare keys %ArchiveRequests)
+ARCH_LOOP: foreach $section (sort archivecompare keys %ArchiveRequests)
     {
         $rqsts = $ArchiveRequests{$section};
         $bytes = $ArchiveBytes{$section};
@@ -1736,18 +1848,28 @@
         } else {
             $pctbytes = sprintf("%5.2f", 100*$bytes/$TotalBytes);
         }
+
         $asec = $section;
+        # if set, only print .html URLs
+        $_ = $asec;
+        next ARCH_LOOP if ( $OnlyHTMLinArchiveSection
+                            && ! ( /\.html$/ || /\.htm$/ || /\/$/ ));
         $asec =~ s/\&/\&amp;/g;      # Replace HTML specials
         $asec =~ s/</\&lt;/g;
         $asec =~ s/>/\&gt;/g;
+	if ($MaxArchiveLength && length($asec) > $MaxArchiveLength ) {
+            $asec = substr( $asec, length($asec) - $MaxArchiveLength + 3);
+            $asec = '...'.$asec;
+        } 
         if ($InsertLink && ($asec =~ m:^/:)) {
             $asec = "<a href=\"$asec\">$asec</a>";
         }
-        printf $fmt, $pctrqsts, $pctbytes, $bytes, $rqsts, $asec;
+        &print_stat_line($fmt, $pctrqsts, $pctbytes,
+                         $bytes, $rqsts, $asec, $ArchiveBarFactor);
 
         last if ($top && (--$top == 0));
     }
-    print $EndTag;
+    &print_end_tag;
 }
 
 # ==========================================================================
@@ -1760,11 +1882,7 @@
     local($top)    = $TopIdent;
     local($prefix) = $top ? "$PrefixTop $top" : $PrefixTotal;
 
-    print "<HR>\n";
-    print "<H2><A NAME=\"$frag\">$prefix $IdentHeader</A></H2>\n";
-    print $StartTag;
-    print $StatsHeader, " Remote Identity\n";
-    print $StatsRule,   "------------------------------------\n";
+    &output_stat_head($frag, $top, $IdentHeader, 'Remote Identity');   # literal title: no $RemoteIdentityText is defined
     local($fmt) = "$StatsFormat %s\n";
 
     foreach $ident (sort identcompare keys %IdentRequests)
@@ -1781,11 +1899,12 @@
         } else {
             $pctbytes = sprintf("%5.2f", 100*$bytes/$TotalBytes);
         }
-        printf $fmt, $pctrqsts, $pctbytes, $bytes, $rqsts, $ident;
+        &print_stat_line($fmt, $pctrqsts, $pctbytes, $bytes,
+                         $rqsts, $ident, $IdentBarFactor);
 
         last if ($top && (--$top == 0));
     }
-    print $EndTag;
+    &print_end_tag;
 }
 
 # ==========================================================================
--- wwwstat.rc.orig	Fri Feb 14 18:48:51 1997
+++ wwwstat.rc	Fri Feb 21 14:45:33 1997
@@ -64,6 +64,13 @@
 ##  $DomainMap{'uci.edu'}     = 'University of California, Irvine';
 ##  $DomainMap{'ics.uci.edu'} = 'UCI Information and Computer Science';
 ##  $DomainMap{'co.uk'}       = 'UK Commercial';
+# $DomainMap{'spin.ch'}		= 'Spin';
+# $DomainMap{'bluewin.ch'}	= 'Bluewin';
+# $DomainMap{'spectraweb.ch'}	= 'SpectraWeb';
+# $DomainMap{'active.ch'}	= 'ActiveNet';
+# $DomainMap{'eth.ch'}		= 'ETH';
+# $DomainMap{'unresolved'}	= 'Ohne Adresse';
+
 #
 # NOTE: The key must be in lowercase.
 #
@@ -140,31 +147,48 @@
 # parsing algorithm for reading old summary files.  Test all changes!
 #
 ##  $OutputTitle        = 'World Wide Web Access Statistics for www' .
-##                        $AppendToLocalhost;
+##                      $AppendToLocalhost;
+    $OutputTitle        = 'World Wide Web Zugangs-Statistik for www.spin.ch';
 #
 ##  $UpdateHeader       = 'Last updated: ';
+    $UpdateHeader       = 'Letztes Update: ';
 ##  $LastSumHeader      = 'Previous Full Summary Period';
+    $LastSumHeader      = 'Statistiken der letzten Periode';
 ##  $TotalsHeader       = 'Totals for Summary Period:  ';
+    $TotalsHeader       = 'Totals dieser Periode:  ';
 ##  $ReqRcvHeader       = 'Requests Received During Summary Period ';
+    $ReqRcvHeader       = 'Total Hits                              ';
 ##  $BtransHeader       = 'Bytes Transmitted During Summary Period ';
+    $BtransHeader       = 'Total Bytes                             ';
 ##  $AvgReqHeader       = 'Average Requests Received Daily         ';
+    $AvgReqHeader       = 'Durchschnitt Hits pro Tag               ';
 ##  $AvgByteHeader      = 'Average Bytes Transmitted Daily         ';
+    $AvgByteHeader      = 'Durchschnitt Bytes pro Tag              ';
 #
 ##  $TotalsFormat       = "%s %14.0f\n";
 # 
 ##  $StatsHeader        = '%Reqs %Byte  Bytes Sent  Requests  ';
 ##  $StatsRule          = '----- ----- ------------ -------- |';
 ##  $StatsFormat        = '%s %s %12.0f %8d |';
+    $StatsHeader        = '%Hits %Bytes Bytes        Hits      ';
+    $StatsRule          = '----- ------ ------------ ------- |';
+    $StatsFormat        = '%s %s %12.0f %8d |';
 # 
 ##  $PrefixTop          = 'Top';
 ##  $PrefixTotal        = 'Total';
 #
 ##  $DailyHeader        = 'Transfers by Request Date';
+    $DailyHeader        = 'Transfers nach Datum';
 ##  $HourlyHeader       = 'Transfers by Request Hour';
+    $HourlyHeader       = 'Transfers nach Tageszeit';
 ##  $DomainHeader       = 'Transfers by Client Domain';
+    $DomainHeader       = 'Transfers nach Dom&auml;ne';
 ##  $SubdomainHeader    = 'Transfers by Reversed Subdomain';
+    $SubdomainHeader    = 'Transfers nach Umgekehrter Subdom&auml;ne';
 ##  $ArchiveHeader      = 'Transfers by URL/Archive Section';
+    $ArchiveHeader      = 'Transfers nach URL/Datei';
 ##  $IdentHeader        = 'Transfers by Remote Identity';
+    $IdentHeader        = 'Transfers nach Client Identit&auml;t';   # "Identitaet": the final t was missing
 #
 # These Old headers are for reading old summary files
 #
@@ -175,6 +199,20 @@
 ##  $OldArchiveHeader   = 'Total Transfers from each Archive Section';
 ##  $OldIdentHeader     = 'Total Transfers to each Remote Identifier';
 #
+# These set the column labels and the text shown for unresolved addresses
+#
+    $UnresolvedText     = 'Ohne Adresse';
+    $DateText           = 'Datum';
+    $TimeText           = 'Zeit';
+    $DomainText         = 'Dom&auml;ne';
+    $ReversedSubdomainText = 'Umgekehrte Subdom&auml;ne';   # entity needs its closing ";"
+    $ArchiveSectionText = 'Datei';
+    $RequestsText       = 'Hits in %';
+#
+# Maximum length of a URL/Archive name. 0 means unlimited
+#
+    $MaxArchiveLength   = 20;
+#
 # The following sets the default ordering for the daily stats.
 # Change this to 1 if you always want gwstat-style output.
 #
@@ -187,10 +225,10 @@
 #
 ##  $SortDaily       = 0;
 ##  $SortHourly      = 0;
-##  $SortDomain      = 0;
-##  $SortSubdomain   = 0;
-##  $SortArchive     = 0;
-##  $SortIdent       = 0;
+    $SortDomain      = 1;
+    $SortSubdomain   = 1;
+    $SortArchive     = 1;
+    $SortIdent       = 1;
 #
 # If the output of a section is sorted, you may also want to restrict
 # the output to only the N best in that section.
@@ -198,9 +236,9 @@
 ##  $TopDaily        = 0;
 ##  $TopHourly       = 0;
 ##  $TopDomain       = 0;
-##  $TopSubdomain    = 0;
-##  $TopArchive      = 0;
-##  $TopIdent        = 0;
+    $TopSubdomain    = 30;
+    $TopArchive      = 30;
+    $TopIdent        = 30;
 #
 # On the other hand, you may want to exclude (0) an entire section.
 # If set = 2, the top N is done first and then followed by normal section.
@@ -211,6 +249,26 @@
 ##  $Do_Subdomain    = 1;    # Display the Subdomain Statistics
 ##  $Do_Archive      = 1;    # Display the Archive Statistics
 ##  $Do_Ident        = 0;    # Display the Identity Statistics (DON'T PUBLISH)
+#
+# If you want the domain (0), the country (1) or both (2) printed
+#
+    $PrintCountry    = 1;
+#
+# Print bars (1) or not (0)
+#
+    $PrintBars       = 1;
+#
+# Display only (1) .htm, .html and "/"-terminated URLs in Archive Section, or all URLs (0)
+#
+    $OnlyHTMLinArchiveSection = 1;
+#
+# Factors to modify the bar's length
+#
+##  $DateBarFactor       = 12;
+##  $HourBarFactor       = 12;
+##  $DomainBarFactor     = 5;
+##  $SubdomainBarFactor  = 5;
+##  $ArchiveBarFactor    = 75;
 #
 # The rest of these options are normally only changed on the command-line
 #
----------------------------------------------------------------------------