#!/usr/bin/perl

# This Perl script generates access reports for a sub-path of a web site
# from the Apache HTTPD access log.
# Report formats : HTML, CSV (Comma-Separated Values)

# Author : Didier Donsez, didier.donsez@imag
# Licence : LGPL (you can send me improvements of this script)
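#
# Usage : perl webstat.pl
# The input log file, URL prefixes, file extensions and output file names
# are configured in the "public" variables below; the reports are written
# to $reportfilename and $csvreportfilename.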

# Example of APACHE HTTPD access log entries
#129.88.38.248 - - [19/Nov/2002:11:14:45 +0100] "GET /team.html HTTP/1.1" 304 -
#129.88.38.248 - - [19/Nov/2002:11:14:45 +0100] "GET /images/us.gif HTTP/1.1" 304 -
#129.88.38.248 - - [19/Nov/2002:11:14:45 +0100] "GET /style.css HTTP/1.1" 304 -
#129.88.38.248 - - [19/Nov/2002:11:14:45 +0100] "GET /images/bground91.jpg HTTP/1.1" 404 300
#193.252.55.136 - - [19/Nov/2002:11:14:48 +0100] "GET /favicon.ico HTTP/1.1" 404 291
#129.88.38.248 - - [19/Nov/2002:11:14:57 +0100] "GET /Les.Publications/book_chapters.html HTTP/1.1" 200 4777
#192.73.228.9 - - [19/Nov/2002:11:15:33 +0100] "GET /%7Ejmfavre/ENSEIGNEMENT/CCIGL-02-03/GL/ HTTP/1.0" 200 7797
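#
# Each entry follows the Apache Common Log Format:
#   host ident authuser [timestamp] "request" status bytes
# (the bytes field is "-" when no body was returned, e.g. for a 304 response)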

# TO DO
# improve Status (404, ...) request exclusion
# descending sort by access number
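# A possible sketch for the descending sort (not wired in yet):
#   foreach $k (sort { $accessperfile{$b} <=> $accessperfile{$a} } keys %accessperfile) { ... }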

# public

$logfile="/usr/local/apache/logs/access_log";
#$logfile="access_log";
$website="http://www-adele.imag.fr";
$urlprefixs="/~donsez|/\%7Edonsez|/\%7edonsez";
$equivalenturlprefixs="/~donsez";
#$extensions="pdf|htm|html|doc|ppt|ps|zip|xml|xslt|java|js|cs|pl";
$extensions="pdf|doc|ppt|ps|zip";
@excludedip=(129,88,103);

$reportfilename="access.html";
$csvreportfilename="access.csv";
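
# Example settings for another (hypothetical) user "smith":
#  $urlprefixs="/~smith|/\%7Esmith|/\%7esmith";
#  $equivalenturlprefixs="/~smith";
#  $reportfilename="smith_access.html";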

# private
$excludedstatus="404|304";
$preval="";
$totalaccess=0;
$begin="";
$end="";

open(LOGFILE, $logfile) or die "cannot open $logfile: $!";
while( $_ = <LOGFILE> ) {
# if( $_ =~/(\d+)\.(\d+)\.(\d+)\.(\d+).*\[(.*)\].*$urlprefix(.*)($extensions).*HTTP\/\d\.\d\" (^($excludedstatus))/o ){
 if( $_ =~/(\d+)\.(\d+)\.(\d+)\.(\d+).*\[(.*)\].*($urlprefixs)(.*)($extensions).*HTTP\/\d\.\d\"\s+(\d+)\s+(.+)/o ){
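  # capture groups of the regular expression above:
  #  $1.$2.$3.$4 = client IP, $5 = timestamp, $6 = matched URL prefix,
  #  $7 = path, $8 = file extension, $9 = HTTP status, $10 = response size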
  # exclude accesses from the ADELE team's IP prefix
  # exclude 404 Not Found accesses
  if ( ("$1.$2.$3" ne join(".",@excludedip)) and ($9 != 404) ){
	  
   # set begin and end dates
   $end=$5;
   if($begin eq "") { $begin=$5; }
   
   $ipuseragent=$1 . "." . $2 . "." . $3 . "." . $4;
   $accessedurl=$equivalenturlprefixs . $7 . $8;
   $val=$ipuseragent . " " . $accessedurl;
   
#   print ">" . $9 . " ". $10 . " ";
   if($10 =~ /(\d+)/o) {
	$downloadedbytes+=$1;
#	print $1;
   }
#   print "\n";
   # count only one access if the user agent downloaded one file with multiple request
   if($val ne $prevval) {
   	$prevval=$val;
	$totalaccess++;
	# print $val . "\n";
	$accessperfile{$accessedurl} += 1;
   }
  }
 }
}
close(LOGFILE);
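
# At this point %accessperfile maps each canonicalized URL to its access
# count, e.g. "/~donsez/somepaper.pdf" => 12 (illustrative values);
# $totalaccess, $downloadedbytes, $begin and $end hold the global totals.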

# build report HTML header
$htmlheader = <<_Delim_Header;
<html>
<head>
<LINK HREF='style.css' TYPE='text/css' REL='STYLESHEET'/>
</head>
<body class='index'>
<h1 class='title'>Access Report</h1>

<p align='center'>generated by <a href='webstat.pl'>webstat.pl</a></p>

<p>
$begin to $end
</p>
<p>
$totalaccess files (.$extensions) accessed
</p>
<p>
$downloadedbytes bytes downloaded
</p>
<table border='0' width='100%'>
_Delim_Header

# build report HTML tailer
$htmltailer = <<_Delim_Tailer;
</table>
</body>
</html>
_Delim_Tailer

# generate HTML report
open(REPORT_HTML, ">$reportfilename") or die "cannot open $reportfilename: $!";
print REPORT_HTML $htmlheader;
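
# each accessed URL becomes one table row, for instance (illustrative values):
# <tr><td>12</td>
# <td><a href='http://www-adele.imag.fr/~donsez/somepaper.pdf'>http://www-adele.imag.fr/~donsez/somepaper.pdf</a></td></tr>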

@sortedkeys=keys %accessperfile;
foreach $k (sort(@sortedkeys)) {
	$url= $website . $k;
	print REPORT_HTML "<tr><td>" . $accessperfile{$k} . "</td>\n";
	print REPORT_HTML "<td><a href='". $url . "'>" . $url . "</a></td></tr>";
}

print REPORT_HTML $htmltailer;

#while( ($accessedurl, $cpt) = each(%accessperfile)) {
#	print REPORT_HTML $accessedurl." ".$cpt." times\n";
#}
close(REPORT_HTML);


# ---------------------------------------------------
# build report CSV header
$csvheader = <<_Delim_CSVHeader;
accessnum;begin;end;url
_Delim_CSVHeader

# generate CSV report
open(REPORT_CSV, ">$csvreportfilename") or die "cannot open $csvreportfilename: $!";
print REPORT_CSV $csvheader;
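
# each data line looks like (illustrative values):
# 12;19/Nov/2002:11:14:45 +0100;19/Nov/2002:18:30:00 +0100;http://www-adele.imag.fr/~donsez/somepaper.pdf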

@sortedkeys=keys %accessperfile;
foreach $k (sort(@sortedkeys)) {
	$url= $website . $k;
	print REPORT_CSV $accessperfile{$k} .";". $begin .";". $end . ";". $url . "\n";
}

close(REPORT_CSV);