#!/usr/bin/perl -w
use strict;
use FileHandle;
use lib '/usr/local/myNMS/bin';
use My_Utils;
use My_DButils;
use My_Config;

my $VERSION = "0.63.2";

=pod

Copyright (c) 2000-2001 John Stumbles and the University of Reading.
All rights reserved.  This program is free software; you can redistribute
it and/or modify it under the same terms as Perl itself.

Process incoming log (etc.) files to extract data for the NMS database.

C H A N G E   H I S T O R Y
---------------------------
0.63.2  27Feb01 removed redundant smb log file processing code.
                Also, when getting the IP_MAC_user table, *DON'T* restrict
                to records >= earliest time of the records we're processing,
                as this misses records consecutive with those from a few
                days ago which we didn't retrieve: instead get records from
                a week or so back - this still saves a lot if records go
                back many months, and perhaps gives a useful picture of
                users' usage if there is a break of > a week in their
                appearances
0.63.1  20Feb01 removed $CONFIG{DIR}{BASE} from myNMS subdir specs
0.63    16Feb01 more verbose INFORM if we have SQUID OUT defined
0.62    16Feb01 if SQUID OUT is not an absolute or relative filespec,
                prepend our var dir to it
0.61    06Feb01 if SQUID OUT is defined, attempt to write logfile records
                to that file
0.6     05Feb01 allow --squid as an alternative to --www;
                process_wwwcachelogs renamed to process_squidlogs; if not
                given filenames it constructs data sources from the config
                file; open_rkv_file just tries to open whatever it is given
                in filename if it contains | (pipe symbol)
0.5     30Jan01 use modules
0.4.1   24Jan01 web log processor adds records to the Shared_Users table
                in the DB - we can use a query like:

                  select u.username, u.IPadd, u.MAC, from_unixtime(u.mtime),
                         x.deviceID, x.ifIndex, s.sysName
                  from Shared_Users as u
                  left join MAC_connections as x ON u.MAC = x.MAC
                  left join SNMP_system as s on x.deviceID = s.deviceID

                to determine whether a user is e.g. innocently using 2
                adjacent PCs in a lab.
                Also, when getting the IP_MAC_user table, restrict to
                records >= earliest time of the records we're processing
                (saves a lot on the number of records found currently, as
                the table is not being purged of ancient records)
0.4     23Jan01 change to use new directory structure
0.3.2   04Jan01 version number updated (0.3.1 was 0.3.0) and other
                cosmetic changes
0.3.1   03Jan01 YYYYMMDDhhmmss removed - it was being redefined by a sub
                of the same name in utils.pl - and the latter changed to
                take a supplied time parameter
0.3.0   04Apr00 added --smb function for processing samba logfiles (at
                present to report on machines accessing the TVD autoupd
                share, though also getting MS names of hosts)

=cut

my $DB  = 'NMS';
my $dbh = DB_init($DB);
my ($PROGNAME) = $0 =~ /(?:.*\/)?([^\/]+)/;

# rounding, chunk time, clock skew and arp cache timeout constants:
sub Tround { my $t = shift; $t =~ s/\d\d$/00/; return $t; } # round down to the nearest 100 secs
my $Tchunk      = 10000; # process logs in time chunks of this many seconds
my $Tclockerr   = 100;   # allow systems' clocks to be out by 100 secs
my $Tarptimeout = 300;   # assume the arp cache times out in 5 minutes
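# Worked example (illustrative values, not from any real log): with the
# constants above, a squid timestamp of 982159263 rounds down to
#   Tround(982159263) == 982159200   # trailing two digits zeroed
# and the matching loop then advances through the logs in windows of
# $Tchunk seconds: 982159200, 982169200, 982179200, ...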
my %Shared_Users; # users seen on multiple hosts
my $T_EARLIEST;   # global used by www log processing
my $SQUID_FOUT;   # filehandle used by squid log processing

my $GZIP = $CONFIG{PROG}{GZIP};
my $ZCAT = "$GZIP --decompress --stdout";
# my $ZOUT = "| $GZIP -c ";

my $USAGE = "$PROGNAME Version $VERSION
Usage:
    $PROGNAME --www   webcachelogfiles
    $PROGNAME --squid webcachelogfiles
webcachelogfiles is a list of files to process from different (squid) web caches
";

sub WARNING { print STDERR "$PROGNAME ($0): "; printf STDERR @_; }
sub INFORM  { print STDERR "$PROGNAME ($0): "; printf STDERR @_; }
sub INFO    { printf STDERR @_; }

die $USAGE unless scalar @ARGV;

foreach (@ARGV)
{
    if (/--(www|squid)/i) # process webcache logfiles
    {
        shift;
        process_squidlogs(@ARGV);
        exit 0;
    }
    else
    {
        die "$USAGE\n";
    }
}

##############################################################################
sub process_squidlogs
# process logfiles with names given in the list passed
{
    # see Implementation.html for explanation
    my $FUNCNAME = 'process_squidlogs';
    my @Logs = @_;
    unless (scalar @Logs) # no filenames given: construct data sources from config
    {
        die "*** ERROR *** no config info for SQUID\n" unless $CONFIG{SQUID};
        my $yesterday = YYYYMMDDhhmmss((time() - DAY()), 8);
        foreach my $squid (split(/\s+/, $CONFIG{SQUID}{HOSTS}))
        {
            push @Logs, "$CONFIG{PROG}{REMOTE_SHELL} $squid -l $CONFIG{SQUID}{USER} 'cat $CONFIG{SQUID}{LOG}.$yesterday.gz' | $ZCAT |";
        }
        if ($CONFIG{SQUID}{OUT})
        {
            my $fout = $CONFIG{SQUID}{OUT};
            $fout =~ s/^STDOUT$/-/i; # if specified as 'stdout' replace with '-' (standard output)
            # write to a file in our VAR dir unless a full or relative path (or '-') is specified:
            $fout = "$CONFIG{DIR}{VAR}/$fout" unless ($fout =~ m{^\.{0,2}/} or $fout eq '-');
            INFORM "CONFIG{SQUID}{OUT} defined: [$CONFIG{SQUID}{OUT}] - writing to [$fout]\n";
            INFO "*** FAILED *** to open $CONFIG{SQUID}{OUT} for logfile records\n"
                unless ($SQUID_FOUT = new FileHandle ">$fout");
        }
    }
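    # For illustration (assumed config values, not from any real site): with
    #   $CONFIG{PROG}{REMOTE_SHELL} = 'ssh',  $CONFIG{SQUID}{HOSTS} = 'wwwcache1',
    #   $CONFIG{SQUID}{USER} = 'squid',       $CONFIG{SQUID}{LOG} = '/var/log/squid/access.log',
    # the loop above would push a data source such as
    #   ssh wwwcache1 -l squid 'cat /var/log/squid/access.log.20010226.gz' | gzip --decompress --stdout |
    # which open_rkv_file() recognises by its trailing | and opens as a pipe.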
    ##########################################################################
    # INITIALISE:
    #   open logfiles,
    #   get first timestamp from logfiles,
    #   get %IP_MAC from DB
    #
    # open logfiles:
    my ($fn, $FH, @FH);
    foreach $fn (@Logs)
    {
        die "Could not get handle for file $fn\n" unless ($FH = open_rkv_file($fn));
        push @FH, $FH;
    }
    die "*** ERROR *** no filehandles to process\n" unless scalar(@FH);

    # get first timestamp from logfiles
    my $T = 0; # current timestamp (rounded to the nearest whatever) being processed
    my ($time, $host, $user);
    if ($T == 0) # initialise
    {
        # get the earliest time seen in the cache logs
        $T_EARLIEST = $T = get_cache_recs(\@FH, 0);
        die "Error reading from cache logs\n" unless $T;
    }

    # get %IP_MAC from DB
    my %IP_MAC = get_IP_MAC($T); # fetch IP_MAC data structure from DB
    die "Could not get ARP info from IP_MAC table\n" unless %IP_MAC;

    # $T is the earliest time seen in the logs; now round it off and add the
    # chunk size to get the period up to which to read the webcache logs:
    $T = Tround($T) + $Tchunk;
    INFO "Time: $T\t%s\n", YYYYMMDDhhmmss($T);

    ##########################################################################
    # process logfiles:
    #
    my %IP_MAC_user; # to hold results
#   my $user_by_host;
    while (1)
    {
        my $user_by_host = get_cache_recs(\@FH, $T);
        # keep on until the end of the logfiles:
        last unless scalar(keys %{$user_by_host});

        # match up host-user with IP-MAC:
        foreach $host (keys %{$user_by_host})
        {
            my $t_user; # we'll need this soon ...
            my @t_user = (keys %{$user_by_host->{$host}}); # list of times for users on this host
            # do we have IP-MAC data for this host?
            my @t_arp = (keys %{$IP_MAC{$host}}); # start times when a MAC was seen for this host IP
            if (scalar @t_arp) # yes ...
            {
                my $t_arp;
                foreach $t_arp (@t_arp) # for each start time for an ARP record of this host ...
                {
                    foreach $t_user (@t_user) # check against the list of times for users on this host
                    {
                        next if $t_user < $t_arp; # skip if user time is before the start of the arp time
                        if ($t_user <= $IP_MAC{$host}{$t_arp}{t2}) # user time before end of arp time range?
                        {
                            # update IP-MAC-user-time record
                            $IP_MAC_user{$host}{$IP_MAC{$host}{$t_arp}{MAC}}{$user_by_host->{$host}{$t_user}} = $t_user;
                        }
                    }
                }
            }
            else # no arp records for this host
            {
                foreach $t_user (@t_user)
                {
                    # make a record of host + blank MAC field + username = time seen
                    $IP_MAC_user{$host}{''}{$user_by_host->{$host}{$t_user}} = $t_user;
                }
            }
        }
        # go to the next time chunk:
        $T += $Tchunk;
        INFO "Time: $T\t%s\n", YYYYMMDDhhmmss($T);
    }

    updateDB_Host_MAC_user(%IP_MAC_user);

    if (scalar keys %Shared_Users)
    {
        printf "\nusers on multiple hosts\n";
        foreach $user (keys %Shared_Users)
        {
            printf "\n%-8s\ton\t\t\t\tat\n", $user;
            my $hx;
            foreach $hx (keys %{$Shared_Users{$user}})
            {
                # would be better to get this from IP_MAC_user, but it's not indexed conveniently for this:
                my $MACx = MAC_by_IP($hx);
                printf "\t%-15s %12s %-8s %s (%s)\n",
                    $hx, ($MACx or ''), (Vendor_by_MAC($MACx) or '(unknwn)'),
                    YYYYMMDDhhmmss($Shared_Users{$user}{$hx}), $Shared_Users{$user}{$hx};
                my $DO = sprintf "INSERT INTO Shared_Users (username, IPadd, MAC, mtime) VALUES (%s, '%s', %s, %u)",
                    $dbh->quote($user), $hx, $dbh->quote($MACx), $Shared_Users{$user}{$hx};
                $dbh->do($DO);
            }
        }
        printf "%s\n", '-' x 79;
    }
    return 1; # OK
}
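# The %IP_MAC_user structure built above is nested as host => MAC => user => time,
# e.g. (made-up values for illustration):
#   $IP_MAC_user{'134.225.1.2'}{'00A0C9123456'}{'jbloggs'} = 982159200;
#   $IP_MAC_user{'134.225.1.3'}{''}{'jdoe'}                = 982169200; # no ARP record seen
# updateDB_Host_MAC_user() flattens this into UPDATEs (for rows already in the
# IP_MAC_user table) and INSERTs (for the rest).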
##############################################################################
sub get_cache_recs
# given a ref to an array of filehandles for webcache logfiles, and a demand time.
# this function has 2 modes:
#   init, in which demand time == 0:
#     read only one record from each logfile and return the earliest timestamp seen;
#   and normal, in which demand time is > 0:
#     read cache log files until demand time is reached or EOF is reached on all
#     files, and return a ref to a hash of user by host (users seen on multiple
#     hosts are recorded in the global %Shared_Users as a side effect),
#     or a ref to an empty hash if EOF has been reached on the log files.
{
    my ($far, $T) = @_;
    my $FUNCNAME = 'get_cache_recs';
    die "$FUNCNAME called without list of refs to filehandles to process\n"
        unless ($far); # must have $far: filehandle array reference
    my $fh;
    my $tx = 0;
    my (%user_by_host, %host_by_user);
    foreach $fh (@$far)
    {
        while (1) # (exit conditions are tested within the loop)
        {
            my ($t, $h, $u) = get_next_wwwcache_record($fh, $T);
            last unless $t; # end loop when EOF is reached for this file
            $SQUID_FOUT and print $SQUID_FOUT $_; # echo logfile record to output filehandle, if it exists
            if ($T == 0) # initialisation mode
            {
                $tx = $t unless $tx; # take any time value to start with ...
                # ... thereafter remember the earliest time seen in the cache logfiles:
                $tx = $t unless $tx < $t;
                last; # exit loop immediately when in init mode
            }
            # (else) normal mode: update structures
            last unless $u; # exit loop if get_next_wwwcache_record indicated that demand time was reached
            $t = Tround($t);            # round off timestamp
            $user_by_host{$h}{$t} = $u; # save indexed by host
            # check whether user is on the Shared_Users list; if so, record more detail of their sightings (* TO-DO *)
            # do we have a record for this user on another host at this time?
            if ($host_by_user{$u}{$t})
            {
                if ($host_by_user{$u}{$t} ne $h)
                {
                    $Shared_Users{$u}{$h} = $t;
                    $Shared_Users{$u}{$host_by_user{$u}{$t}} = $t;
                }
            }
            else
            {
                $host_by_user{$u}{$t} = $h;
            }
        }
    }
    return $tx unless $T;    # init mode
    return (\%user_by_host); # normal mode
    # (when EOF has been reached on the log files the hash this ref points to will be empty)
}
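# get_next_wwwcache_record() below expects squid's native access.log format.
# A typical record (made-up example) looks like:
#   982159263.123 1034 134.225.1.2 TCP_HIT/200 4312 GET http://example.com/ jbloggs NONE/- text/html
# Splitting on whitespace, the fields used are:
#   field 1 -> $t (timestamp), field 3 -> $host (client IP),
#   field 4 -> $f4 (code/status), field 8 -> $userID (ident/auth username),
#   field 9 -> $x (peerstatus/peerhost, used only as a completeness check).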
##############################################################################
sub get_next_wwwcache_record
# given a filehandle and a demand time,
# return timestamp, host IP and user from the next good webcache record;
# if the time reaches the demand time, just return the time;
# if EOF is reached, just return the empty list.
{
    my $FUNCNAME = 'get_next_wwwcache_record';
    my ($fh, $T) = @_;
    $T = 0 unless $T;
    my ($t, $host, $f4, $userID, $x);
    my $BAD_UID_RESPONSE = 'TCP_DENIED/407';
    while (<$fh>)
    {
        ($t, undef, $host, $f4, undef, undef, undef, $userID, $x) = split /\s+/;
        next unless $t; # skip blank lines
        return ($t =~ /^(\d+)/) if ($t >= $T); # indicate that demand time has been met
        # We may lose data by ignoring the first record matching the demand time
        # (we could save it by caching it in a static data structure somewhere),
        # but chances are that the record that first matches the time may not be
        # usable anyway, and even if it is, the duplication of good data is such
        # that losing the odd record is of little consequence and not worth the
        # hassle of a workaround.
        next unless ($x);      # skip any incomplete/corrupt records we may have
        next unless ($userID); # sanity check
        next if ($f4 eq $BAD_UID_RESPONSE); # skip records where the server rejected a bad username
        next if ($userID eq "-");           # skip records we're not interested in
        # filter out leading and trailing spaces (and following passwords!) which we
        # see in some user ID strings in apparently successful web cache transactions:
        if ($userID =~ /^(%20)*([^%]+)(%20.*)?/) { $userID = $2; }
        if (length($userID) > 8)
        {
            WARNING("bad data in username field: [%s]", $_);
            next; # skip bad record
        }
        return (($t =~ /^(\d+)/), $host, $userID);
    } # (while)
    return ();
}

##############################################################################
sub updateDB_Host_MAC_user
{
    # could keep the database in a hash so we don't re-read it for each separate day's file
    my %IP_MAC_user = @_;
    return unless scalar keys(%IP_MAC_user);
    my $Table = 'IP_MAC_user';
##  my $SELECT = "SELECT IPadd, MAC, username, mtime FROM $Table";
    my $SELECT = "SELECT IPadd, MAC, username, mtime FROM $Table WHERE mtime>=" . ($T_EARLIEST - WEEK());
    my $sth = $dbh->prepare($SELECT);
    $sth->execute;
    my @DO;
    if ($sth->rows)
    {
        my ($db_IPadd, $db_MAC, $db_username, $db_mtime);
        $sth->bind_columns(undef, \($db_IPadd, $db_MAC, $db_username, $db_mtime));
        while ($sth->fetch)
        {
            # do we already have data to update this record?
            if ($IP_MAC_user{$db_IPadd}{$db_MAC}{$db_username})
            {
                # write a DB update statement and delete from the hash
                push @DO, sprintf "UPDATE $Table SET mtime=%u WHERE IPadd='%s' AND MAC='%s' AND username='%s'",
                    $IP_MAC_user{$db_IPadd}{$db_MAC}{$db_username}, # mtime to update
                    $db_IPadd, $db_MAC, $db_username;               # WHERE values
                delete $IP_MAC_user{$db_IPadd}{$db_MAC}{$db_username};
            }
        }
    }
    $sth->finish;
    my ($IPadd, $MAC, $username, $time);
    foreach $IPadd (keys %IP_MAC_user)
    {
        foreach $MAC (keys %{$IP_MAC_user{$IPadd}})
        {
            foreach $username (keys %{$IP_MAC_user{$IPadd}{$MAC}})
            {
                push @DO, sprintf "INSERT INTO $Table (IPadd, MAC, username, mtime) VALUES ('%s','%s','%s',%u)",
                    $IPadd, $MAC, $username, $IP_MAC_user{$IPadd}{$MAC}{$username};
            }
        }
    }
    my $DO;
    foreach $DO (@DO)
    {
        $dbh->do($DO) or die "*** $PROGNAME: could not complete DB update ***\n";
    }
}

##############################################################################
sub get_IP_MAC_test
# read in IP-MAC-time records from a test file
{
    my $FUNCNAME = 'get_IP_MAC_test';
    my $testfile = "wwwcache.testarp.1";
    open(IN, $testfile) or die "Could not open $testfile\n$!";
    my %IP_MAC;
    while (<IN>)
    {
        next if /^\s*#/;
        my ($IPadd, $MAC, $t1, $t2) = split /\t/;
        $t1 += $Tclockerr;
        $t2 -= $Tarptimeout;
        next if ($t1 > $t2); # skip if the arp timeout makes the time interval invalid
        $IP_MAC{$IPadd}{$t1}{MAC} = $MAC; # record MAC address and sell-by date for this IP-MAC-time combination
        $IP_MAC{$IPadd}{$t1}{t2}  = $t2;
    }
    return %IP_MAC;
}

##############################################################################
sub get_IP_MAC
# read in IP-MAC-time records from the IP_MAC table of the NMS database;
# if parameter T is specified, get records with latest time greater than that value
{
    my $FUNCNAME = 'get_IP_MAC';
    my $T = shift;
    INFORM "$FUNCNAME\n";
    my $SELECT = "SELECT IPadd, MAC, time1, time2 from IP_MAC";
    $SELECT .= " WHERE time2>=$T" if $T;
    my $sth = $dbh->prepare($SELECT);
    $sth->execute;
    unless ($sth->rows)
    {
        WARNING "Could not retrieve any data from IP_MAC table\n";
        return 0;
    }
    my ($IPadd, $MAC, $t1, $t2);
    $sth->bind_columns(undef, \($IPadd, $MAC, $t1, $t2));
    my %IP_MAC;
    while ($sth->fetch)
    {
        next if ($MAC =~ /FFFFFFFFFFFF/i); # skip broadcast addresses
        next unless $t1; # sanity check: must have a start time
        next unless $t2; # sanity check: must have a last time
        $t1 += $Tclockerr;
        $t2 -= $Tarptimeout;
        next if ($t1 > $t2); # skip if the arp timeout makes the time interval invalid
        # store IP, MAC and times in a structure from which we can retrieve
        # info for matching up with IP-user records:
        $IP_MAC{$IPadd}{$t1}{MAC} = $MAC; # record MAC address and sell-by date for this IP-MAC-time combination
        $IP_MAC{$IPadd}{$t1}{t2}  = $t2;
    }
    return %IP_MAC;
}
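# The structure returned by get_IP_MAC() above is keyed IP => trimmed start time,
# e.g. (made-up values): a row (IPadd '134.225.1.2', MAC '00A0C9123456',
# time1 982159000, time2 982160000) becomes
#   $IP_MAC{'134.225.1.2'}{982159100}{MAC} = '00A0C9123456';
#   $IP_MAC{'134.225.1.2'}{982159100}{t2}  = 982159700;
# i.e. after allowing $Tclockerr at the start and $Tarptimeout at the end, the
# usable interval for matching users to this MAC is [982159100, 982159700].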
##############################################################################
sub open_rkv_file
# open and return a handle to a file, which may be gzipped
{
    my $FUNCNAME = 'open_rkv_file';
    my $fn = shift or die "$FUNCNAME called without name of file to open\n";
    my $FH;
    if ($fn =~ /\|/) # pipe?
    {
        return new FileHandle $fn; # open and hope
    }
    if (-e $fn) # file of the given name exists?
    {
        if (-r $fn) # file is readable?
        {
            if ($fn =~ /\.gz$/) # filename ends in .gz?
            {
                return new FileHandle "$ZCAT $fn |";
            }
            # (else) open plain (not .gz) file:
            return new FileHandle $fn, "r";
        }
        # (else) exists but is not readable
        WARNING "(function $FUNCNAME) file $fn is not readable\n";
        return undef;
    }
    # (else) try filename + '.gz'
    {
        my $zfn = "$fn.gz";
        if (-e $zfn) # file exists?
        {
            if (-r $zfn) # is readable?
            {
                return new FileHandle "$ZCAT $zfn |";
            }
            # (else) not readable
            WARNING "(function $FUNCNAME) file $zfn is not readable\n";
            return undef;
        }
    }
    # (else) file not found even with .gz extension
    WARNING "(function $FUNCNAME) file $fn not found, even with .gz suffix\n";
    return undef;
}
##############################################################################
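# Usage sketch for open_rkv_file() (hypothetical filenames, for illustration):
#   my $fh = open_rkv_file("access.log");     # plain file (falls back to access.log.gz)
#   my $fh = open_rkv_file("access.log.gz");  # decompressed via $ZCAT
#   my $fh = open_rkv_file("ssh cachehost 'cat log.gz' | $ZCAT |"); # contains '|': opened as a pipe
# undef is returned (with a WARNING) if nothing usable can be opened.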