#!/usr/local/bin/perl
# ---------------------------------------------------------------------------
$Version = 'wwwstat-1.0';   # Release tag: shown in the usage text, the verbose banner and the HTML footer
#
# Copyright (c) 1994 Regents of the University of California.
#
# This software has been developed by Roy Fielding <fielding@ics.uci.edu> as
# part of the Arcadia project at the University of California, Irvine.
#
# See the file README for licensing, distribution, usage, and installation
# information.  See the file Changes for known problems and version info.
#
# Abort the program with the command-line synopsis.  The release tag
# ($Version) and the compressed-file extension pattern ($zhandle) are
# interpolated into the text; since the text ends in a newline, die emits
# it on STDERR without appending a file/line suffix.
sub usage {
    local($synopsis) = <<"EndUsage";
usage: wwwstat [-helLoOuUrvx]  [-s srmfile] [-i pathname]
               [-a IP_address] [-c code] [-d date] [-t hour] [-n archive_name] 
               [-A IP_address] [-C code] [-D date] [-T hour] [-N archive_name] 
               [logfile ...]   [logfile.gz ...]    [logfile.Z ...]
$Version
Process a sequence of NCSA httpd access_log files and output an HTML summary.
Display Options:
     -h  Help -- just display this message and quit.
     -e  Display all invalid log entries on STDERR.
     -l  Do display full IP address of clients in my domain.
     -L  Don't (i.e. strip the machine name from local addresses).
     -o  Do display full IP address of clients from other domains.
     -O  Don't (i.e. strip the machine name from non-local addresses).
     -u  Do display IP address from unresolved domain names.
     -U  Don't (i.e. group all "unresolved" addresses under that name).
     -r  Display table of requests by each remote ident or authuser.
     -v  Verbose display (to STDERR) of each log entry processed.
     -x  Display all requests of nonexistant files to STDERR.
Input Options:
     -s  Get the server directives from the following srm.conf file.
     -i  Include the following file (assumed to be a prior wwwstat output).
    ...  Process the sequence of logfiles (compressed if extension $zhandle).
Search Options (include in summary only those log entries):
     -a  Containing a  hostname/IP address  matching the given perl regexp.
     -A  Not containing   "      "     "       "      "      "   "    "
     -c  Containing a  server response code matching the given perl regexp.
     -C  Not containing   "      "     "       "      "      "   "    "
     -d  Containing a  date ("Feb  2 1994") matching the given perl regexp.
     -D  Not containing   "      "     "       "      "      "   "    "
     -t  Containing an hour ("00" -- "23")  matching the given perl regexp.
     -T  Not containing   "      "     "       "      "      "   "    "
     -n  Containing an archive (URL) name   matching perl regexp (except +.).
     -N  Not containing   "      "     "       "      "      "   "    "
EndUsage
    die $synopsis;
}
#
# Redistribution and use in source and binary forms are permitted,
# subject to the restriction noted below, provided that the above
# copyright notice and this paragraph and the following paragraphs are
# duplicated in all such forms and that any documentation, advertising
# materials, and other materials related to such distribution and use
# acknowledge that the software was developed in part by the University of
# California, Irvine.  The name of the University may not be used to
# endorse or promote products derived from this software without
# specific prior written permission.  THIS SOFTWARE IS PROVIDED ``AS
# IS'' AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, WITHOUT
# LIMITATION, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE.
#   
# Use of this software in any way or in any form, source or binary,
# is not allowed in any country which prohibits disclaimers of any
# implied warranties of merchantability or fitness for a particular
# purpose or any disclaimers of a similar nature.
#   
# IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
# FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
# ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION
# (INCLUDING, BUT NOT LIMITED TO, LOST PROFITS) EVEN IF THE UNIVERSITY
# OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# ---------------------------------------------------------------------------
# Set the default configuration options:

# Edit the next line to specify the output document's HTML Title.
# It is printed both as the <TITLE> and as the top-level <H1> of the report.

$OutputTitle = "World-Wide Web Access Statistics for G. Ramkumar";

# Edit the next line to specify the URL of the previous summary period
# (for use only as a hypertext link).  Set it = "" if the link is unwanted.
# The three-letter abbreviation for last month is substituted at every "%M".
# Note that this is the month prior to the earliest one in this summary.

$LastSummary = "http://robotics.stanford.edu/people/ramkumar/wwwstat/summary.html";

# Edit the next line to identify the server's default home page.
# Requests for "/" (or an empty path) are reported under this name.

$ServerHome = "http://robotics.stanford.edu/home.html";

# Edit the next line to locate the country-codes file.
# This is a file of the format:
# domain text
# which will allow expansion from domain to country name.

$countrycodefile = "/u/ramkumar/misc/www/wwwstat/country-codes";

# Edit the next two lines to specify the location of your server access log
# (the default input when no logfile is named on the command line) and
# your server configuration (srm.conf) file (overridable with -s).

$access_log = '/usr/spool/ftp/www/httpd/logs/access_log';
$srm_conf   = '/usr/spool/ftp/www/httpd/conf/srm.conf';

# Specify the command for displaying compressed files to STDOUT.
# A logfile whose name ends in ".<extension>" is read through this command.

$zcat    = 'gunzip -c';    # Set to the null string if no such command is available
$zhandle = '(gz|Z|z)';     # Regexp alternation of extensions that mark a compressed log

# If the address in a log entry is one word (a local host), append what?

$AppendToLocalhost = ".stanford.edu";  # Either "" or appropriate ".sub.dom.ain"

# Edit the next two lines to customize for your domain.
# This will allow your domain to be separated in the domain listing.
# Note that you may also want to change the domain selection logic
# (where these variables are used) if you are at a site without
# multi-level subdomains.

$mydom1 = "stanford";      # Second-level part of the local domain
$mydom2 = "edu";           # Top-level part of the local domain

# Edit the next three lines to specify whether (1) or not (0) you want to
# display the IP address on reversed subdomain listings as follows
# (each can be overridden on the command line: -l/-L, -o/-O, -u/-U):

$LocalFullAddress  = 0;    # Show full address for hosts in my domain?
$OthersFullAddress = 0;    # Show full address for hosts outside my domain?
$ShowUnresolved    = 0;    # Show all unresolved addresses?

# ==========================================================================
# The following are unlikely to need changing:

# Set up the table of response codes.  A non-empty value is the Archive
# Section name under which requests with that status are grouped (unless a
# -c/-C code search is in effect); an empty value ('') means the request is
# reported under its own URL.

%RespCodes = (
    '200', '',                                 # OK response
    '302', 'Code 302 Redirected Requests',
    '304', '',                                 # Not Modified response
    '400', 'Code 400 Bad Requests',
    '401', 'Code 401 Unauthorized Requests',
    '403', 'Code 403 Forbidden Requests',
    '404', 'Code 404 Not Found Requests',
    '500', 'Code 500 Server Errors',
    '501', 'Code 501 Not Implemented Requests',
);

# Estimate the size (in bytes) of typical responses not counted in logfile.
# The estimate is added to the logged byte count when the request carried
# an HTTP/ version tag or the status is not 200.  This table also acts as
# the list of acceptable status codes: a log entry whose status has no
# entry here is treated as invalid and skipped.

%RespEstimates = (
    '200', 180,     # A normal response header
    '302', 255,     # A redirect message minus the two location URLs
    '304', 98,      # A Not Modified response to a Conditional GET
    '400', 313,     # A Bad Request response message
    '401', 281,     # An Authorization Required response message
    '403', 268,     # A Forbidden response message minus requested name
    '404', 250,     # A Not Found message minus requested name
    '500', 485,     # A Server Error response message
    '501', 482,     # A Not Implemented response message not including method
);

# The rest of the options are extremely unlikely to need changing.
# Each one is normally set from the matching command-line switch instead.

$Verbose           = 0;    # Display   valid log entries on STDERR?        (-v)
$PrintInvalids     = 0;    # Display invalid log entries on STDERR?        (-e)
$PrintNonexist     = 0;    # Display nonexistent file requests on STDERR?  (-x)
$IncludeFile       = "";   # Prior output file to include first.           (-i)
$SearchAddress     = "";   # Pattern to look for in hostname/IP addresses. (-a)
$SearchCode        = "";   # Pattern to look for in Code.                  (-c)
$SearchDate        = "";   # Pattern to look for in Date.                  (-d)
$SearchTime        = "";   # Pattern to look for in Hour.                  (-t)
$SearchArchive     = "";   # Pattern to look for in Archive names.         (-n)
$NotAddress        = "";   # Pattern to reject entry if in IP addresses.   (-A)
$NotCode           = "";   # Pattern to reject entry if in Code.           (-C)
$NotDate           = "";   # Pattern to reject entry if in Date.           (-D)
$NotTime           = "";   # Pattern to reject entry if in Hour.           (-T)
$NotArchive        = "";   # Pattern to reject entry if in Archive names.  (-N)

# The following option is only useful if the server is running with
# rfc931 support (i.e. "IdentityCheck on" appears in httpd.conf).
# NOTE: For security reasons, you should not publish to the web any report
# that lists the Remote Identifiers.  This option is intended for server
# maintenance only.  Use the -r option on the command-line instead.
#
# The value must be 0 or 2 (not 1): it is added to the number of <PRE>
# section boundaries expected when a prior report is parsed for inclusion,
# and the extra Ident table contributes two such boundaries.

$Do_Ident  = 0;    # Set = 2 ONLY if IdentityCheck on and Ident ALWAYS desired.

# ==========================================================================
# Get the command-line options

# Parse the command line with the Perl 4 "getopts.pl" library.  Each switch
# X that is present sets $opt_X; the remaining words (the logfile names)
# are left in @ARGV for the log-processing step.
require "getopts.pl";

# Getopts signals an unknown switch or a missing option argument through
# its (false) return value, not through $@, so the return value is what
# must be tested to show the usage message on a bad command line.
$opts_ok = &Getopts('helLoOuUrvxs:i:a:c:d:t:n:A:C:D:T:N:');
if (!$opts_ok || $opt_h) { &usage; }

# Display switches.  For each pair the "Don't" (upper-case) form is tested
# last, so it wins when both forms are given.
if ($opt_e) { $PrintInvalids     = 1; }
if ($opt_l) { $LocalFullAddress  = 1; }
if ($opt_L) { $LocalFullAddress  = 0; }
if ($opt_o) { $OthersFullAddress = 1; }
if ($opt_O) { $OthersFullAddress = 0; }
if ($opt_u) { $ShowUnresolved    = 1; }
if ($opt_U) { $ShowUnresolved    = 0; }
if ($opt_r) { $Do_Ident          = 2; }
if ($opt_v) { $Verbose           = 1; }
if ($opt_x) { $PrintNonexist     = 1; }

# Input switches.
if ($opt_s) { $srm_conf      = $opt_s; }
if ($opt_i) { $IncludeFile   = $opt_i; }

# Search switches: entries must match these patterns.  In address and
# archive patterns the characters # + . are taken literally, so they are
# backslash-escaped before the pattern is used inside m#...#.
if ($opt_a) {
    $SearchAddress = $opt_a;
    $SearchAddress =~ s/([#+.])/\\$1/g;       # Escape these special characters
}
if ($opt_c) { $SearchCode = $opt_c; }
if ($opt_d) { $SearchDate = $opt_d; }
if ($opt_t) { $SearchTime = $opt_t; }
if ($opt_n) {
    $SearchArchive = $opt_n;
    $SearchArchive =~ s/([#+.])/\\$1/g;
}

# Exclusion switches: entries must NOT match these patterns.
if ($opt_A) {
    $NotAddress = $opt_A;
    $NotAddress =~ s/([#+.])/\\$1/g;
}
if ($opt_C) { $NotCode = $opt_C; }
if ($opt_D) { $NotDate = $opt_D; }
if ($opt_T) { $NotTime = $opt_T; }
if ($opt_N) {
    $NotArchive = $opt_N;
    $NotArchive =~ s/([#+.])/\\$1/g;
}

# ==========================================================================
# Get the other needed configuration items from the srm.conf file

# Load the server settings this script uses from the srm.conf file:
# DocumentRoot, UserDir, DirectoryIndex and every Redirect mapping.
# The program stops if the file cannot be opened; the system error text
# is included so the cause (missing file, permissions, ...) is visible.
open (SRM,$srm_conf) || die "Error opening config file: $srm_conf: $!\n";

$UserDir        = "public_html";              # Start with NCSA defaults
$DirectoryIndex = "index.html";
$DocumentRoot   = "/usr/local/etc/httpd/htdocs";

while (<SRM>)
{
    next if ( ($_ eq "\n") || /^\#/ ); # Ignore blank and comment lines

    # A directive value runs up to the last non-blank character of the
    # line.  Any amount of blanks/tabs may separate keyword and value,
    # and a final line without a newline is still accepted.
    if (/^DocumentRoot\s+(.*\S)\s*$/)
    {
        $DocumentRoot = $1;
    }
    elsif (/^UserDir\s+(.*\S)\s*$/)
    {
        $UserDir = $1;
    }
    elsif (/^DirectoryIndex\s+(.*\S)\s*$/)
    {
        $DirectoryIndex = $1;
    }
    elsif (/^Redirect\s+(\S+)\s+(\S+)/)
    {
        $alias = $1;
        $rname = $2;
        # The alias is later used as a regexp anchored at the start of the
        # requested path, so every non-word character is backslash-escaped.
        $alias =~ s/(\W)/\\$1/g;
        $AllRedirects{$alias} = $rname;
    }
}
close(SRM);

# ==========================================================================
# If an old output file is to be included, read it into the counters.
# We assume that the old output file was created with the same options
# and that its content is disjoint from the current access_log.
# NOTE that using search options with inclusion cannot work unless the
# included file was also created with those search options.

$startTag = "<PRE>\n";       # Parse by finding the preformatted parts.
$endTag   = "</PRE>\n";      # Note that these vars are used by output code.

# When -i named a prior report, seed the counters from its tables.
#
# $sequence counts the <PRE> and </PRE> lines seen so far, so it is odd
# while inside a preformatted section: 1 = totals, 3 = daily, 5 = hourly,
# 7 = domain, 9 = subdomain, 11 = archive, 13 = ident (only with $Do_Ident).
# In every table row the two numbers before the "|" are bytes ($1) and
# requests ($2), and the text after it ($3) is the row key.
#
# A row is only recorded when it matches the expected layout; a line that
# does not match is skipped instead of re-using captures left over from an
# earlier match.
if ($IncludeFile)
{
    open (OLD,$IncludeFile) ||
         die "Error opening file for inclusion: $IncludeFile\n";

    # This code depends on the order of the output being similar
    # to the default distribution.  If you change the output content,
    # you must also check to ensure this code still works.

    $sequence = 0;

    OLDLINE: while (<OLD>)
    {
        if (!($sequence % 2))          # Are we outside a preformatted section?
        {
            if ($_ eq $startTag)       # Yes - Start of preformat?
            {
                $sequence++;
                if ($sequence != 1)    #       Unless this is first section,
                {
                    <OLD>;             #       Read past the two header lines
                    <OLD>;
                }
            }
            next OLDLINE;
        }
        elsif ($_ eq $endTag)          # No  - Is this the end of preformat?
        {
            $sequence++;
            if ($sequence == (12 + $Do_Ident)) #       Are we done yet?
            {
                last OLDLINE;
            }
            next OLDLINE;
        }

        if ($sequence == 1)            # Now at "Files Transmitted ..."
        {
            if (/\s(\d+)\n/) { $xferfiles = $1; }  # last numeric word
            $_ = <OLD>;                #     next at "Bytes Transmitted ..."
            if (/\s(\d+)\n/) { $xferbytes = $1; }  # last numeric word
            <OLD>; <OLD>;              #     skip the two averages
        }
        elsif ($sequence == 3)         # Now in Daily Transmission Statistics
        {
            if (/\s(\d+)\s+(\d+)\s+\|\s+(\S.*)\n/)
            {
                $date            = $3;             # the date after "|"
                $dayfiles{$date} = $2;             # requests
                $daybytes{$date} = $1;             # bytes
            }
        }
        elsif ($sequence == 5)         # Now in Hourly Transmission Statistics
        {
            if (/\s(\d+)\s+(\d+)\s+\|\s+(\S+)\s/)
            {
                $hour             = $3;            # the hour after "|"
                $hourfiles{$hour} = $2;            # requests
                $hourbytes{$hour} = $1;            # bytes
            }
        }
        elsif ($sequence == 7)         # Now in Client Domain
        {
            if (/\s(\d+)\s+(\d+)\s+\|\s+(\S+)\s/)
            {
                $domain               = $3;        # the domain abbrev after "|"
                $domainfiles{$domain} = $2;        # requests
                $domainbytes{$domain} = $1;        # bytes
            }
        }
        elsif ($sequence == 9)         # Now in Reversed Subdomain
        {
            if (/\s(\d+)\s+(\d+)\s+\|\s+(\S.*)\n/)
            {
                $subdomain                  = $3;  # the subdomain after "|"
                $subdomainfiles{$subdomain} = $2;  # requests
                $subdomainbytes{$subdomain} = $1;  # bytes
            }
        }
        elsif ($sequence == 11)        # Now in Archive Section
        {
            if (/\s(\d+)\s+(\d+)\s+\|\s+(\S.*)\n/)
            {
                $pathkey              = $3;        # the pathkey after "|"
                $groupfiles{$pathkey} = $2;        # requests
                $groupbytes{$pathkey} = $1;        # bytes
            }
        }
        elsif (($sequence == 13) && $Do_Ident )  # Now in Ident Section
        {
            if (/\s(\d+)\s+(\d+)\s+\|\s+(\S.*)\n/)
            {
                $ident                = $3;        # the identifier after "|"
                $identfiles{$ident}   = $2;        # requests
                $identbytes{$ident}   = $1;        # bytes
            }
        }
        else                           # Now in Hell (too many preformats)
        {
            print (STDERR "Warning: Something is wrong with $IncludeFile\n");
            last OLDLINE;
        }
    }
    close OLD;
}

# ==========================================================================
# Now process each logfile left on command-line or just the default one
# 

# Stamp the report, then feed every logfile named on the command line (or
# the configured default log when none was named) to process_log.
$thistime   = time;                     # Get the current date-time stamp
$Updated    = &wtime($thistime,'');     # Format it as local time
$UpdatedGMT = &wtime($thistime,'GMT');  #  and also as  GMT  time

if ($Verbose) { print(STDERR "$Version: $Updated\n"); }

# Test the argument COUNT rather than the truth of each name, so that a
# logfile called "0" is neither mistaken for "no arguments" nor treated as
# the end of the list.
if (@ARGV)
{
    while (@ARGV)
    {
        $access_log = shift(@ARGV);
        &process_log($access_log);
    }
}
else
{
    &process_log($access_log);
}

if ($Verbose) { print(STDERR "Done processing logs, now doing summary\n"); }

if ($xferfiles == 0) {die "There was no matching data to summarize.\n";}

# ==========================================================================
# We now need to combine slashless directory names with slashful ones
#

# Fold the counters of a name without a trailing slash into its "name/"
# twin whenever that twin has recorded requests, then drop the slashless
# entry.  keys() returns a snapshot list, so deleting inside the loop is
# safe.
foreach $section (keys %groupfiles)
{
    next if ($section =~ m#/$#);          # already ends in a slash

    $secdir = $section . '/';
    next unless $groupfiles{$secdir};     # no slashful twin to merge into

    $groupfiles{$secdir} += $groupfiles{$section};
    $groupbytes{$secdir} += $groupbytes{$section};
    delete $groupfiles{$section};
    delete $groupbytes{$section};
}

#
# And create a sorted date array for later output
#
# Note the explicit % sigil: the bareword form keys(dayfiles) was Perl 4
# syntax and is not accepted by current Perl 5.

@dates = sort datecompare keys(%dayfiles);

# ==========================================================================
# Finally, we can print out the resulting statistics as a series of forms.
#
# Note: if you have a heavily used server, you may need to increase
#       the length of the numeric output fields in the forms below.

# Section titles, used both for the table of contents and for the <H2>
# heading of each table further down.
$DailyHeader     = "Daily Transmission Statistics";
$HourlyHeader    = "Hourly Transmission Statistics";
$DomainHeader    = "Total Transfers by Client Domain";
$SubdomainHeader = "Total Transfers by Reversed Subdomain";
$ArchiveHeader   = "Total Transfers from each Archive Section";
$IdentHeader     = "Total Transfers to each Remote Identifier";

# Document head and title.
print "<HTML><HEAD>\n",
      "<TITLE>$OutputTitle</TITLE>\n",
      "</HEAD><BODY>\n",
      "<H1>$OutputTitle</H1>\n";

# Qualify the local update time with the log's zone offset when one was
# recorded, otherwise with the GMT rendering of the same instant.
$Updated .= $tmzo ? "(GMT $tmzo)" : "($UpdatedGMT)";

print "<EM>Last updated: $Updated</EM>\n";

# Table of contents linking to the sections below.
print "<UL>\n",
      "<LI><A HREF=\"\#Daily\">$DailyHeader</A>\n",
      "<LI><A HREF=\"\#Hourly\">$HourlyHeader</A>\n",
      "<LI><A HREF=\"\#Domain\">$DomainHeader</A>\n",
      "<LI><A HREF=\"\#Subdomain\">$SubdomainHeader</A>\n",
      "<LI><A HREF=\"\#Archive\">$ArchiveHeader</A>\n";
print "<LI><A HREF=\"\#Ident\">$IdentHeader</A>\n" if ($Do_Ident);
if ($LastSummary)
{
    # Fill each %M placeholder using the earliest date in this summary.
    $prevmonth = &lastmonth($dates[0]);
    $LastSummary =~ s/%M/$prevmonth/g;
    print "<LI><A HREF=\"$LastSummary\">Previous Full Summary Period</A>\n";
}
print "</UL>\n";

# Totals section: overall counts plus per-day averages over the number of
# distinct dates seen.
print "<H2>Totals for Summary Period:  " . $dates[0] . " to " .
      $dates[$#dates] . "</H2>\n";
print $startTag;
printf ("Files Transmitted During Summary Period  %14.0f\n", $xferfiles);
printf ("Bytes Transmitted During Summary Period  %14.0f\n", $xferbytes);
printf ("Average Files Transmitted Daily          %14.0f\n",
        $xferfiles / ($#dates + 1));
printf ("Average Bytes Transmitted Daily          %14.0f\n",
        $xferbytes / ($#dates + 1));
print $endTag;

# The totals become divisors for the percentage columns that follow, so
# keep them from being zero.
$xferfiles = 1 if ( $xferfiles < 1 );
$xferbytes = 1 if ( $xferbytes < 1 );

# ==========================================================================

# Daily table: one row per date in chronological order, showing that
# date's share of all requests and bytes followed by the raw counts.
print "<HR>\n";
print "<H2><A NAME=\"Daily\">$DailyHeader</A></H2>\n";
print $startTag;
print "%Reqs %Byte  Bytes Sent  Requests   Date\n";
print "----- ----- ------------ -------- |------------\n";

# keys() needs the % sigil; the bareword form keys(daybytes) was Perl 4
# syntax and is not accepted by current Perl 5.
foreach $date ( sort datecompare keys(%daybytes) )
{
    $files = $dayfiles{$date};
    $bytes = $daybytes{$date};

    # A full share is printed as "100.0" so the column stays 5 wide.
    $pctfiles = ($files == $xferfiles)
                    ? "100.0" : sprintf("%5.2f", 100*$files/$xferfiles);
    $pctbytes = ($bytes == $xferbytes)
                    ? "100.0" : sprintf("%5.2f", 100*$bytes/$xferbytes);

    printf ("%s %s %12d %8d | %s\n",$pctfiles,$pctbytes,$bytes,$files,$date);
}
print $endTag;

# ==========================================================================

# Hourly table: one row per hour value in string-sorted order, showing
# that hour's share of all requests and bytes followed by the raw counts.
print "<HR>\n";
print "<H2><A NAME=\"Hourly\">$HourlyHeader</A></H2>\n";
print $startTag;
print "%Reqs %Byte  Bytes Sent  Requests   Time\n";
print "----- ----- ------------ -------- |-----\n";

# keys() needs the % sigil; the bareword form keys(hourbytes) was Perl 4
# syntax and is not accepted by current Perl 5.
foreach $hour ( sort(keys(%hourbytes)) )
{
    $files = $hourfiles{$hour};
    $bytes = $hourbytes{$hour};

    # A full share is printed as "100.0" so the column stays 5 wide.
    $pctfiles = ($files == $xferfiles)
                    ? "100.0" : sprintf("%5.2f", 100*$files/$xferfiles);
    $pctbytes = ($bytes == $xferbytes)
                    ? "100.0" : sprintf("%5.2f", 100*$bytes/$xferbytes);

    printf ("%s %s %12d %8d |  %s\n",$pctfiles,$pctbytes,$bytes,$files,$hour);
}
print $endTag;

# ==========================================================================

# Client-domain table: one row per domain, ordered by domnamcompare,
# showing the domain's share of requests and bytes, the raw counts, the
# domain abbreviation and the name countryname() returns for it.
print "<HR>\n";
print "<H2><A NAME=\"Domain\">$DomainHeader</A></H2>\n";
print $startTag;
print "%Reqs %Byte  Bytes Sent  Requests   Domain\n";
print "----- ----- ------------ -------- |------------------------------------\n";

%codetable=&initcountryname();

# keys() needs the % sigil; the bareword form keys(domainfiles) was Perl 4
# syntax and is not accepted by current Perl 5.
foreach $domain ( sort domnamcompare keys(%domainfiles) )
{
    # NOTE(review): this pre-assignment is overwritten on the next line; it
    # is kept because countryname is defined outside this view and might
    # consult the global -- confirm before removing.
    $country = $domain;
    $country = &countryname($domain,%codetable);
    $files   = $domainfiles{$domain};
    $bytes   = $domainbytes{$domain};

    # A full share is printed as "100.0" so the column stays 5 wide.
    $pctfiles = ($files == $xferfiles)
                    ? "100.0" : sprintf("%5.2f", 100*$files/$xferfiles);
    $pctbytes = ($bytes == $xferbytes)
                    ? "100.0" : sprintf("%5.2f", 100*$bytes/$xferbytes);

    printf ("%s %s %12d %8d | %-5s %s\n", $pctfiles,$pctbytes,$bytes,$files,
                                          $domain, $country);
}
print $endTag;

# ==========================================================================

# Reversed-subdomain table: one row per reversed subdomain in
# string-sorted order, showing its share of requests and bytes followed by
# the raw counts.
print "<HR>\n";
print "<H2><A NAME=\"Subdomain\">$SubdomainHeader</A></H2>\n";
print $startTag;
print "%Reqs %Byte  Bytes Sent  Requests   Reversed Subdomain\n";
print "----- ----- ------------ -------- |------------------------------------\n";

# keys() needs the % sigil; the bareword form keys(subdomainfiles) was
# Perl 4 syntax and is not accepted by current Perl 5.
foreach $subdomain ( sort(keys(%subdomainfiles)) )
{
    $files = $subdomainfiles{$subdomain};
    $bytes = $subdomainbytes{$subdomain};

    # A full share is printed as "100.0" so the column stays 5 wide.
    $pctfiles = ($files == $xferfiles)
                    ? "100.0" : sprintf("%5.2f", 100*$files/$xferfiles);
    $pctbytes = ($bytes == $xferbytes)
                    ? "100.0" : sprintf("%5.2f", 100*$bytes/$xferbytes);

    printf ("%s %s %12d %8d | %s\n", $pctfiles,$pctbytes,$bytes,$files,
                                     $subdomain);
}
print $endTag;

# ==========================================================================

# Archive-section table: one row per archive name in string-sorted order,
# showing its share of requests and bytes followed by the raw counts.
# Names with no recorded requests are left out.
print "<HR>\n";
print "<H2><A NAME=\"Archive\">$ArchiveHeader</A></H2>\n";
print $startTag;
print "%Reqs %Byte  Bytes Sent  Requests   Archive Section\n";
print "----- ----- ------------ -------- |------------------------------------\n";

# keys() needs the % sigil; the bareword form keys(groupfiles) was Perl 4
# syntax and is not accepted by current Perl 5.
foreach $section ( sort(keys(%groupfiles)) )
{
    $files = $groupfiles{$section};
    $bytes = $groupbytes{$section};
    next unless $files;

    # A full share is printed as "100.0" so the column stays 5 wide.
    $pctfiles = ($files == $xferfiles)
                    ? "100.0" : sprintf("%5.2f", 100*$files/$xferfiles);
    $pctbytes = ($bytes == $xferbytes)
                    ? "100.0" : sprintf("%5.2f", 100*$bytes/$xferbytes);

    printf ("%s %s %12d %8d | %s\n", $pctfiles, $pctbytes, $bytes, $files,
                                     $section);
}
print $endTag;

# ==========================================================================

# Remote-identifier table, printed only when $Do_Ident is set: one row per
# identifier in string-sorted order, showing its share of requests and
# bytes followed by the raw counts.
if ($Do_Ident)                            # Is Ident Table desired?
{
    print "<HR>\n";
    print "<H2><A NAME=\"Ident\">$IdentHeader</A></H2>\n";
    print $startTag;
    print "%Reqs %Byte  Bytes Sent  Requests   Remote Identifier\n";
    print "----- ----- ------------ -------- |------------------------------------\n";

    # keys() needs the % sigil; the bareword form keys(identfiles) was
    # Perl 4 syntax and is not accepted by current Perl 5.
    foreach $ident ( sort(keys(%identfiles)) )
    {
        $files = $identfiles{$ident};
        $bytes = $identbytes{$ident};

        # A full share is printed as "100.0" so the column stays 5 wide.
        $pctfiles = ($files == $xferfiles)
                        ? "100.0" : sprintf("%5.2f", 100*$files/$xferfiles);
        $pctbytes = ($bytes == $xferbytes)
                        ? "100.0" : sprintf("%5.2f", 100*$bytes/$xferbytes);

        printf ("%s %s %12d %8d | %s\n", $pctfiles, $pctbytes, $bytes, $files,
                                         $ident);
    }
    print $endTag;
}

# ==========================================================================

# Close the document with a credit line linking to the wwwstat home page,
# labelled with the release tag, then finish with a success exit status.
print "<HR>\n",
      "<ADDRESS>This summary was generated by \n",
      "<A HREF=\"http://www.ics.uci.edu/WebSoft/wwwstat/\">",
      "$Version</A>\n",
      "</ADDRESS>\n",
      "</BODY></HTML>\n";

exit(0);

# ==========================================================================
# ==========================================================================
# Read the passed-in log and accumulate statistics for each access
#
sub process_log
{
  local($thislog) = @_;

  if ($Verbose) { print(STDERR "Processing access log \"$thislog\"\n"); }

  if ($thislog =~ m#\.$zhandle$#)
  {
      if (!$zcat)
      {
          print(STDERR "No zcat decompression command has been defined\n");
          return;
      }
      $thislog = "$zcat $thislog |";
  }

  if (!open (LOG,$thislog))
  {
      print(STDERR "Error opening access log file: $thislog\n");
      return;
  }

  LINE: while (<LOG>)
  {
    $saveline = $_;

    #
    # First, parse the new format into its seven basic components
    #

    ($host, $rfc931, $authuser, $timestamp, $request, $status, $bytes) =
        /^(\S+) (\S+) (\S+) \[(.+)\] \"(.+)\" (\S+) (\S+)\s/;

    # Now, is this garbage or is it memorex?  Note that $bytes can be 0

    if (!($host && $rfc931 && $authuser && $timestamp && $request && $status))
    {
        print(STDERR "$.:$saveline") if $PrintInvalids;
        next LINE;
    }

    if (!$RespEstimates{$status})      # Test the response code
    {
        print(STDERR "$.:$saveline") if $PrintInvalids;
        next LINE;
    }
    if ($SearchCode) { next LINE unless ( $status =~ m#$SearchCode# ); }
    if ($NotCode)    { next LINE unless ( $status !~ m#$NotCode# ); }

    if ($bytes eq '-') { $bytes = '0'; }

    if ($bytes !~ /^\d+$/ )            # Test the bytes transferred
    {
        print(STDERR "$.:$saveline") if $PrintInvalids;
        next LINE;
    }

    if ($rfc931 ne '-')                # Test the remote ident
    {
        $ident = $rfc931;                  # Save ident for later use
        $ident =~ s/\[.*\]/COOKIE/g;       # Replace all magic cookies
    }
    elsif ($authuser ne '-')
    {
        $ident = $authuser;                # Jury rig support for authuser
        $ident =~ s/\[.*\]/COOKIE/g;       # Replace all magic cookies
    }
    else { $ident = "unknown"; }

    #
    # Looks okay -- Now figure out when the request was made.
    #
    $timestamp =~ s/^0/ /;                 # Remove leading zero from day

    $hour = substr($timestamp,12, 2); 
    $date = join(' ',substr($timestamp, 3, 3),
                     substr($timestamp, 0, 2),
                     substr($timestamp, 7, 4));
    $tmzo = substr($timestamp, 21, 5);

    if (!($hour && $date && $tmzo))
    {
        print(STDERR "$.:$saveline") if $PrintInvalids;
        next LINE;
    }

    if ($SearchDate) { next LINE unless ( $date =~ m#$SearchDate# ); }
    if ($NotDate)    { next LINE unless ( $date !~ m#$NotDate# ); }
    if ($SearchTime) { next LINE unless ( $hour =~ m#$SearchTime# ); }
    if ($NotTime)    { next LINE unless ( $hour !~ m#$NotTime# ); }

    #
    # Then parse the method and URL pathname from request
    #

    ($method, $fname, $htv) = split(' ',$request,3);

    $fname =~ s/\?.*$//;          # Remove any trailing query information
    $fname =~ s/\#.*$//;          # Remove any trailing anchor information
    $fname =~ s#//#/#g;           # Remove any extra slashes

    if (!$fname || ($fname =~ m#^HTTP/#i))
    {
        print(STDERR "$.:$saveline") if $PrintInvalids;
        next LINE;
    }

    $has_head = 0;                     # Check for HTTP/1.X tag
    if ($htv)
    {
        if ($htv !~ m#^HTTP/#i)
        {
            print(STDERR "$.:$saveline") if $PrintInvalids;
            next LINE;
        }
        $has_head = 1;
    }

    #
    # Now we can update the actual bytes transferred to include header/errors
    #

    if ($has_head || ($status ne '200'))
    {
        $bytes += $RespEstimates{$status};
    }

    if ($status eq '302')           # A redirect message
    {
        $rname = "";
        foreach $redir (keys(AllRedirects))  # Is it a redirected file?
        {
            if ( $fname =~ /^$redir/ )
            {
                $rname = $fname;
                $rname = s#^$redir#$AllRedirects{$redir}#;
                last;
            }
        }
        if (!$rname)                         # Nope, must be dir redirect
        {
            $rname = $fname;
            $bytes += 60;                    # add a double http://site...
        }
        $bytes += (2 * length($rname));
    }
    elsif ($status eq '501')        # A Not Implemented response message
    {
        $bytes += length($method);
    }
    elsif ($status =~ /^40[34]$/)   # A Forbidden or Not Found message
    {
        $bytes += length($fname);
    }

    #
    # And then determine what the archive name should be
    #

    $xname = '';

    if ($rname = $RespCodes{$status})
    {
        $xname = $fname;
        $fname = $rname unless ($SearchCode || $NotCode);
    }
    else
    {
        $fname =~ s#/$DirectoryIndex$#/#;     # Remove any trailing index name

        if (($fname eq "/") || ($fname eq ""))
        {
            $fname = "$ServerHome";           # Handle top file with extra care
        }
    }

    if ($SearchArchive) { next LINE unless ( $fname =~ m#$SearchArchive# ); }
    if ($NotArchive)    { next LINE unless ( $fname !~ m#$NotArchive# ); }

    #
    # If you want to further restrict Archive Section names, do it here.
    # For example, if you wanted to show all GIFs as a single total,
    # then you would do:
    #
    #     if ($fname =~ /\.gif$/) { $pathkey = "All GIFs"; }
    #     else                    { $pathkey = $fname;     }
    #

    if ($SearchArchive || $SearchCode || $NotCode)
    {
        $pathkey = $fname;
    }
    elsif ($fname =~ /^\/Icons\/\w/)      { $pathkey = "All Icons (site)"; }
    elsif ($fname =~ /^\/icons\/\w/)      { $pathkey = "All Icons (server)"; }
    elsif ($fname =~ /^\/pictures\/\w/)   { $pathkey = "All Pictures"; }
    else                                  { $pathkey = $fname; }

    if ($pathkey eq '')
    {
        print(STDERR "$.:$saveline") if $PrintInvalids;
        next LINE;
    }


    if ($Verbose)
    {
        print(STDERR "$date $hour $bytes $fname\n");
    }

    #
    # Get hostname/IP address and determine domain and reversed subdomain.
    #

    $host  =~ tr/A-Z/a-z/;
    $ident .= '@' . $host;

    if ($SearchAddress) { next LINE unless ( $host =~ m#$SearchAddress# ); }
    if ($NotAddress)    { next LINE unless ( $host !~ m#$NotAddress# ); }

    @address = split(/\./, $host);

    if ( $#address < 1 )               # Usually caused by garbage in log
    {                                  # or perhaps a strange IP setup
        if ($AppendToLocalhost)        # or perhaps perfectly normal
        {
            $domain  = "$mydom1\.$mydom2";
            $ident  .= $AppendToLocalhost;
            $host   .= $AppendToLocalhost;
            @address =  split(/\./, $host);
            if ((!$LocalFullAddress)&&($#address > 1))
            {
                shift(@address);
            }
            $subdomain = join('.', reverse(@address));
        }
        else
        {
            print(STDERR "$.:$saveline") if $PrintInvalids;
            $domain    = "localhost";
            $subdomain = $host;
        }
    } 
    elsif ( $address[$#address] =~ /^[0-9]+$/ )
    {
        $domain = "unresolved";
        if ($ShowUnresolved)
        {
            $subdomain = join('.', reverse(@address));
        }
        else
        {
            $subdomain = "Unresolved";
        }
    }
    elsif ($address[$#address-1] eq "$mydom1" &&
           $address[$#address]   eq "$mydom2"    )
    {
        $domain = "$mydom1\.$mydom2";
        if ((!$LocalFullAddress)&&($#address > 1))
        {                         # If the address has at least 3 components
            shift(@address);      #    clip off the machine name
        }
        $subdomain = join('.', reverse(@address));
    }
    else
    {
        $domain = $address[$#address];
        if ((!$OthersFullAddress)&&($#address > 1))
        {                         # If the address has at least 3 components
            shift(@address);      #    clip off the machine name
        }
        $subdomain = join('.', reverse(@address));
    }


    if ($PrintNonexist && $xname && ($status >= 400))
    {
        print(STDERR "$status $date $hour $xname  BY $host\n");
    }

    #
    # Now that we have categorized it, add it to the corresponding counters
    #

    $xferfiles++;                             # total files sent
    $dayfiles{$date}++;                       #             per day
    $groupfiles{$pathkey}++;                  #             per file
    $domainfiles{$domain}++;                  #             per domain
    $subdomainfiles{$subdomain}++;            #             per subdomain

    $xferbytes                  += $bytes;    # total bytes sent     
    $daybytes{$date}            += $bytes;    #       bytes per day
    $groupbytes{$pathkey}       += $bytes;    #             per file
    $domainbytes{$domain}       += $bytes;    #             per domain
    $subdomainbytes{$subdomain} += $bytes;    #             per subdomain

    $hourfiles{$hour}++;                      # total files per hour
    $hourbytes{$hour}           += $bytes;    #       bytes per hour

    if ($Do_Ident)                            # Is Ident Table desired?
    {
        $identfiles{$ident}++;                #       files per ident
        $identbytes{$ident}     += $bytes;    #       bytes per ident
    }
  }
  close LOG;
}

# ==========================================================================

sub initcountryname
{
    # Read in the table of ISO codes and country names -added by jem
    #
    # Reads the file named by the global $countrycodefile.  Each line is
    # split on a run of three spaces: the first field is the ISO code
    # (folded to lower case) and the second is the country name.  Entries
    # are added to the global %code table, and a copy of that table is
    # returned to the caller.  Dies if the file cannot be opened.
    #
    local($iso, $name);

    open(COUNTRYCODES, "<$countrycodefile")
        || die "Can't open $countrycodefile: $!";
    while (<COUNTRYCODES>) {
        # Strip only a real line terminator; a blind chop would eat the
        # last character of a final line that lacks a trailing newline.
        s/\r?\n$//;
        ($iso, $name) = split('   ');
        next if ($iso eq '');          # blank line or no code: skip it
        $iso =~ y/A-Z/a-z/;
        $code{$iso} = $name;
    }
    close(COUNTRYCODES);
    return %code;
}

sub countryname
{
    # Look up an ISO code in a code => name table.
    # Arguments: the ISO code, followed by the table flattened into
    # the argument list.  Returns the table entry for that code
    # (undefined when the code is not in the table).
    local($isocode, %names) = @_;

    return $names{$isocode};
}

sub datecompare
{
    # Sort comparator for dates of the form "Feb 01 1994" held in $a and $b.
    # Each date is reduced to a single number weighted so that the year
    # dominates the month and the month dominates the day; the value of
    # the sub is the difference of the two numbers (negative when $a is
    # the earlier date).
    local($[) = 0;
    local($months) = "JanFebMarAprMayJunJulAugSepOctNovDec";
    local($key1, $key2);

    $key1 = substr($a, 7, 4) * 512
          + index($months, substr($a, 0, 3)) * 12
          + substr($a, 4, 2);

    $key2 = substr($b, 7, 4) * 512
          + index($months, substr($b, 0, 3)) * 12
          + substr($b, 4, 2);

    $key1 - $key2;
}

sub domnamcompare
{
    # Sort comparator: shorter names in $a/$b come first; names of equal
    # length fall back to plain string order.  Yields -1, 0 or 1.
    $sdiff = length($a) - length($b);

    ($sdiff <=> 0) || ($a cmp $b);
}

sub bytecompare
{
    # Sort comparator over keys of %groupbytes: the key with the larger
    # byte count comes first; keys with equal counts fall back to plain
    # string order.  Yields -1, 0 or 1.
    $bdiff = $groupbytes{$b} - $groupbytes{$a};

    ($bdiff <=> 0) || ($a cmp $b);
}

# ===========================================================================
# The following subroutine should be in a package, but I'm lazy.
# This is a modified (by Roy Fielding) version of Perl 4.036's ctime.pl
# library by Waldemar Kebsch <kebsch.pad@nixpbe.UUCP> and
# Marion Hakanson <hakanson@cse.ogi.edu>.  It is distributed under the
# Artistic License (included with your Perl distribution files).
# 
#
# wtime returns a time string in the format "Wkd, Dy Mon Year HH:MM:SS Zone"
#               with no newline appended.
#
# USAGE:
#
# wtime(time,'');     -- returns the local time with no timezone appended
#                        As in "Wed, 15 Dec 1993 23:59:59 "
#
# wtime(time,'GMT');  -- returns GMT time
#                        As in "Thu, 16 Dec 1993 07:59:59 GMT"
#

sub wtime {
    # Format an epoch time as "Wkd, Dy Mon Year HH:MM:SS Zone".
    #
    #   $time -- seconds since the epoch, as returned by time()
    #   $TZ   -- 'GMT' to format in GMT; anything else (including '')
    #            formats in local time.  $TZ itself is appended verbatim
    #            as the zone, so '' leaves a trailing blank.
    #
    # Returns the formatted string with no newline appended.
    #
    local($time, $TZ) = @_;
    local($[) = 0;
    local($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst);

    local(@DoW) = ('Sun','Mon','Tue','Wed','Thu','Fri','Sat');
    local(@MoY) = ('Jan','Feb','Mar','Apr','May','Jun',
                   'Jul','Aug','Sep','Oct','Nov','Dec');

    # Determine what time zone is in effect.  Use local time if
    # TZ is anything other than 'GMT'
    # There's no portable way to find the system default timezone.

    ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst) =
        ($TZ eq 'GMT') ? gmtime($time) : localtime($time);

    # gmtime/localtime always report the year as an offset from 1900
    # (2000 is 100, 1969 is 69), so no two-digit-year windowing applies.
    $year += 1900;

    sprintf("%s, %02d %s %4d %02d:%02d:%02d %s",
      $DoW[$wday], $mday, $MoY[$mon], $year, $hour, $min, $sec, $TZ);
}

# ===========================================================================
# This last routine returns the three letter abbreviation for the month
# before the one in the date that was passed as an argument
#

sub lastmonth {
    # Return the three letter abbreviation of the month preceding the
    # month named in the first three characters of $date ('Dec' for a
    # January date).  Returns 'Err' when those characters are not one
    # of the twelve month abbreviations.
    local($date) = @_;        # Should be in the format "Feb 01 1994"
    local($[) = 0;

    local($Mstr) = 'JanFebMarAprMayJunJulAugSepOctNovDec';
    local($mon)  = substr($date, 0, 3);
    local($midx) = index($Mstr, $mon);

    # A real month is exactly three characters long and starts on a
    # three-character boundary of $Mstr.  Without these checks an empty
    # string matches at offset 0 (reported as 'Dec') and a fragment such
    # as "anF" matches across two month names.
    if (length($mon) != 3 || $midx < 0 || ($midx % 3) != 0) { return 'Err'; }

    if ($midx == 0) { return 'Dec'; }

    return substr($Mstr, ($midx - 3), 3);
}