#!/usr/bin/perl

# This Perl script generates access reports for a sub-path of a web site
# from the Apache HTTPD access log.
# Report formats : HTML, CSV (Comma-Separated Values)

# Author : Didier Donsez, didier.donsez@imag
# Licence : LGPL (you can send me improvements of this script)
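#
# Usage : perl webstat.pl
# The input log file, URL prefixes, file extensions and output file names
# are configured in the "public" variables below; the reports are written
# to $reportfilename and $csvreportfilename.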

# Example of APACHE HTTPD access log entries
#129.88.38.248 - - [19/Nov/2002:11:14:45 +0100] "GET /team.html HTTP/1.1" 304 -
#129.88.38.248 - - [19/Nov/2002:11:14:45 +0100] "GET /images/us.gif HTTP/1.1" 304 -
#129.88.38.248 - - [19/Nov/2002:11:14:45 +0100] "GET /style.css HTTP/1.1" 304 -
#129.88.38.248 - - [19/Nov/2002:11:14:45 +0100] "GET /images/bground91.jpg HTTP/1.1" 404 300
#193.252.55.136 - - [19/Nov/2002:11:14:48 +0100] "GET /favicon.ico HTTP/1.1" 404 291
#129.88.38.248 - - [19/Nov/2002:11:14:57 +0100] "GET /Les.Publications/book_chapters.html HTTP/1.1" 200 4777
#192.73.228.9 - - [19/Nov/2002:11:15:33 +0100] "GET /%7Ejmfavre/ENSEIGNEMENT/CCIGL-02-03/GL/ HTTP/1.0" 200 7797
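#
# Each entry follows the Apache Common Log Format:
#   host ident authuser [timestamp] "request" status bytes
# (the bytes field is "-" when no body was returned, e.g. for a 304 response)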

# TO DO
# improve Status (404, ...) request exclusion
# descending sort by access number
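# A possible sketch for the descending sort (not wired in yet):
#   foreach $k (sort { $accessperfile{$b} <=> $accessperfile{$a} } keys %accessperfile) { ... }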

# public

$logfile="/usr/local/apache/logs/access_log";
#$logfile="access_log";
$website="http://www-adele.imag.fr";
$urlprefixs="/~donsez|/\%7Edonsez|/\%7edonsez";
$equivalenturlprefixs="/~donsez";
#$extensions="pdf|htm|html|doc|ppt|ps|zip|xml|xslt|java|js|cs|pl";
$extensions="pdf|doc|ppt|ps|zip";
@excludedip=(129,88,103);

$reportfilename="access.html";
$csvreportfilename="access.csv";
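
# Example settings for another (hypothetical) user "smith":
#  $urlprefixs="/~smith|/\%7Esmith|/\%7esmith";
#  $equivalenturlprefixs="/~smith";
#  $reportfilename="smith_access.html";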

# private
$excludedstatus="404|304";
$preval="";
$totalaccess=0;
$begin="";
$end="";

open(LOGFILE, $logfile) or die "cannot open $logfile: $!";
while( $_ = <LOGFILE> ) {
# if( $_ =~/(\d+)\.(\d+)\.(\d+)\.(\d+).*\[(.*)\].*$urlprefix(.*)($extensions).*HTTP\/\d\.\d\" (^($excludedstatus))/o ){
 if( $_ =~/(\d+)\.(\d+)\.(\d+)\.(\d+).*\[(.*)\].*($urlprefixs)(.*)($extensions).*HTTP\/\d\.\d\"\s+(\d+)\s+(.+)/o ){
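  # capture groups of the regular expression above:
  #  $1.$2.$3.$4 = client IP, $5 = timestamp, $6 = matched URL prefix,
  #  $7 = path, $8 = file extension, $9 = HTTP status, $10 = response size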
  # exclude accesses from the ADELE team's IP prefix
  # exclude 404 Not Found accesses
  if ( ("$1.$2.$3" ne join(".",@excludedip)) and ($9 != 404) ){
	  
   # set begin and end dates
   $end=$5;
   if($begin eq "") { $begin=$5; }
   
   $ipuseragent=$1 . "." . $2 . "." . $3 . "." . $4;
   $accessedurl=$equivalenturlprefixs . $7 . $8;
   $val=$ipuseragent . " " . $accessedurl;
   
#   print ">" . $9 . " ". $10 . " ";
   if($10 =~ /(\d+)/o) {
	$downloadedbytes+=$1;
#	print $1;
   }
#   print "\n";
   # count only one access if the user agent downloaded one file with multiple request
   if($val ne $prevval) {
   	$prevval=$val;
	$totalaccess++;
	# print $val . "\n";
	$accessperfile{$accessedurl} += 1;
   }
  }
 }
}
close(LOGFILE);
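
# At this point %accessperfile maps each canonicalized URL to its access
# count, e.g. "/~donsez/somepaper.pdf" => 12 (illustrative values);
# $totalaccess, $downloadedbytes, $begin and $end hold the global totals.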

# build report HTML header
$htmlheader = <<_Delim_Header;
<html>
<head>
<LINK HREF='style.css' TYPE='text/css' REL='STYLESHEET'/>
</head>
<body class='index'>
<h1 class='title'>Access Report</h1>

<p align='center'>generated by <a href='webstat.pl'>webstat.pl</a></p>

<p>
$begin to $end
</p>
<p>
$totalaccess files (.$extensions) accessed
</p>
<p>
$downloadedbytes bytes downloaded
</p>
<table border='0' width='100%'>
_Delim_Header

# build report HTML tailer
$htmltailer = <<_Delim_Tailer;
</table>
</body>
</html>
_Delim_Tailer

# generate HTML report
open(REPORT_HTML, ">$reportfilename") or die "cannot open $reportfilename: $!";
print REPORT_HTML $htmlheader;
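
# each accessed URL becomes one table row, for instance (illustrative values):
# <tr><td>12</td>
# <td><a href='http://www-adele.imag.fr/~donsez/somepaper.pdf'>http://www-adele.imag.fr/~donsez/somepaper.pdf</a></td></tr>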

@sortedkeys=keys %accessperfile;
foreach $k (sort(@sortedkeys)) {
	$url= $website . $k;
	print REPORT_HTML "<tr><td>" . $accessperfile{$k} . "</td>\n";
	print REPORT_HTML "<td><a href='". $url . "'>" . $url . "</a></td></tr>";
}

print REPORT_HTML $htmltailer;

#while( ($accessedurl, $cpt) = each(%accessperfile)) {
#	print REPORT_HTML $accessedurl." ".$cpt." times\n";
#}
close(REPORT_HTML);


# ---------------------------------------------------
# build report CSV header
$csvheader = <<_Delim_CSVHeader;
accessnum;begin;end;url
_Delim_CSVHeader

# generate CSV report
open(REPORT_CSV, ">$csvreportfilename") or die "cannot open $csvreportfilename: $!";
print REPORT_CSV $csvheader;
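
# each data line looks like (illustrative values):
# 12;19/Nov/2002:11:14:45 +0100;19/Nov/2002:18:30:00 +0100;http://www-adele.imag.fr/~donsez/somepaper.pdf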

@sortedkeys=keys %accessperfile;
foreach $k (sort(@sortedkeys)) {
	$url= $website . $k;
	print REPORT_CSV $accessperfile{$k} .";". $begin .";". $end . ";". $url . "\n";
}

close(REPORT_CSV);