# home *** CD-ROM | disk | FTP | other *** search  (archive navigation header)
#!/usr/local/gnu/bin/perl
#
# "page-stats v1.3": a Perl-program by Mark Koenen <markko@sci.kun.nl>
# that 'cleverly' checks how often a WWW-page has been accessed.
# Changes by Patrick Atoon <patricka@cs.kun.nl>.
#
# Usage: page-stats.pl -h
#        page-stats.pl [ -b ] [ -i identfile ] [ -l logfile ]
#
# Where:
#   The pages that have to be counted are defined in the 'identfile'.
#   'logfile' gives the exact location of the log-file.
#
# For details on the ident-file, do "page-stats.pl -h" to view the
# manual page.
#
# Latest version on : http://www.sci.kun.nl/thalia/guide/
# A working example : http://www.sci.kun.nl/thalia/page-stats/
#

# Just some defaults; -l and -i override these.
$logfile = "/vol/www/ezel/httpd/logs/access_log";
$identfile = "page-stats_en.ident";
$total_number_req = 0;   # grand total of matched requests
$benchmark = 0;          # -b: report CPU times on exit

# Process switches:
while ($ARGV[0] =~ /^-/)
{
    $_ = shift;
    if (/^-i/)
    {
        $identfile = shift;
    }
    elsif (/^-l/)
    {
        $logfile = shift;
    }
    elsif (/^-h/)
    {
        print_help();            # was &print_help; plain call syntax
        exit(0);
    }
    elsif (/^-b/)
    {
        $benchmark = 1;
    }
    else
    {
        die "Unrecognized switch: $_.\n";
    }
}

if (! -r $logfile)
{
    die "Cannot open logfile $logfile.\n";
}

if (! -r $identfile)
{
    die "Cannot open identfile $identfile.\n";
}

# Are we benchmarking? Remember the CPU time used so far.
($u, $s) = times if ($benchmark);

# Well, the important files are there. Start generating the
# HTML-file.

# Derive the other filenames: the ".ident" suffix of the identfile
# is replaced by ".html" (output) and ".source" (optional template).
$strippedfile = substr($identfile, 0, rindex($identfile, "."));
$htmlfile = $strippedfile . ".html";
$sourcefile = $strippedfile . ".source";

@test_url = (); # (Partial) URLs to match
@true_idx = (); # Index of true URL to which (partial) URLs belong
@true_url = (); # True URLs
@url_desc = (); # Description of the true URL
@urlcount = (); # Number of hits for those true URLs

# Read the identifiers-file. Each line has the format:
#   URL@title@reference[@reference...]
open(IDENT, '<', $identfile) or die "Cannot open identfile $identfile.\n";

while (<IDENT>)
{
    chomp;          # was chop: chomp is safe on a final line without "\n"

    s/#.*//;        # Throw out comments
    s/\s+$//;       # Throw out ALL trailing whitespace (s/\s$//g missed runs)

    next if ($_ eq "");

    @info = split("@", $_);

    # Fill in the various arrays that administrate stuff.
    $index = scalar(@true_url);
    push(@true_url, shift(@info));
    push(@url_desc, shift(@info));
    push(@urlcount, 0);

    # The remaining fields are the references that count as hits
    # for this true URL.
    foreach $url (@info)
    {
        push(@true_idx, $index);
        push(@test_url, $url);
    }
}

close(IDENT);
$num_true_urls = scalar(@true_url);
$num_test_urls = scalar(@test_url);
open(LOG, '<', $logfile) or die "Cannot open logfile $logfile.\n";
#
# Read the logfile and check if the page is recognized.
#
# This is computationally the heaviest part of the script.
# Optimizing would help a lot.
#
while (<LOG>)
{
    # Since HTML pages almost always contain pictures, it is more
    # cost-effective to filter these pictures out before trying to
    # match these lines. Even if each page only contains one
    # picture, this would already save 50% of the lines to match!
    # In real life pages, this percentage will be much higher.
    #
    # If you want to be able to match pictures, outcomment the next
    # three lines by placing a "#" before them.
    next if (index($_, ".gif ") != -1);
    next if (index($_, ".jpg ") != -1);
    next if (index($_, ".jpeg ") != -1);  # the manual documents .jpeg too

    # Get URL from logline; they happen to start with " /", how handy!
    # Use index(), it's faster than using s/X/Y/.
    $begin = index($_, " /");

    # BUG FIX: the old code added 1 *before* testing for -1, so the
    # "not found" case ($begin == 0 after the +1) was never skipped.
    next if ($begin == -1);
    $begin++;

    # The next space marks the end of the URL; if there is none,
    # take everything up to the end of the line.
    $end = index($_, " ", $begin);
    $end = length($_) if ($end == -1);
    $pageurl = substr($_, $begin, $end - $begin);

    for ($i = 0; $i < $num_test_urls; $i++)
    {
        # Is this one with or without the wildcard?
        if (substr($test_url[$i], -1) eq "*")
        {
            # Wildcard: prefix match against the URL minus the "*".
            $url = $test_url[$i];
            chop($url);
            $len = length($url);

            # If this matches the pageurl, increase the counter.
            if (substr($pageurl, 0, $len) eq $url)
            {
                $urlcount[$true_idx[$i]]++;
                $total_number_req++;
                last; # No need to check others; continue with next URL.
            }
        }
        else
        {
            # If this is exactly the pageurl, increase the counter.
            if ($pageurl eq $test_url[$i])
            {
                $urlcount[$true_idx[$i]]++;
                $total_number_req++;
                last; # No need to check others; continue with next URL.
            }
        }
    }
}

close(LOG);
# Calculate some variables: the timestamps of the first and last
# logged request. Common Log Format stores them as
# "[dd/Mon/yyyy:hh:mm:ss zone]"; keep only the part before the zone.
$firstrequest = `head -1 $logfile`;
$firstrequest =~ s/^.*\[(\S*)\s.*$/$1/;

$lastrequest = `tail -1 $logfile`;
$lastrequest =~ s/^.*\[(\S*)\s.*$/$1/;

chomp($firstrequest, $lastrequest);   # was chop: only strip a real newline

# Determine the time of creation.
($sec, $min, $hour, $day, $month, $year, $wday, $yday, $isdst) = localtime();
@MONTHS = ("Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep",
           "Oct", "Nov", "Dec");
# localtime() returns years since 1900; the old hard-coded "19%02d"
# would print "19100" and worse from the year 2000 on (Y2K bug).
$date = sprintf("%02d/%s/%04d:%02d:%02d:%02d", $day, $MONTHS[$month],
                $year + 1900, $hour, $min, $sec);

# Open HTML-file for writing. Keep the bareword handle HTML: the
# write_pages/write_top subs print to it.
if (!open(HTML, '>', $htmlfile))
{
    die "Cannot open html file $htmlfile for writing.\n";
}

# Write SOURCE-file to HTML-file, substituting the template variables.
if (open(SOURCE, '<', $sourcefile))
{
    while (<SOURCE>)
    {
        # Replace requested variables
        s/\$firstrequest/$firstrequest/gi;
        s/\$lastrequest/$lastrequest/gi;
        s/\$date/$date/gi;

        $line = $_;

        # Insert the top-N pages for a "$topN" marker.
        if (/\$top/i)
        {
            # Get the limit (the digits following $top).
            $limit = $line;
            $limit =~ s/.*\$top([0-9]*).*$/$1/i;

            # Don't forget to erase the $topXX bit from the line.
            $line =~ s/\$top[0-9]*//gi;

            write_top($limit);
        }

        # Insert the complete page list for a "$list" marker.
        if ($line =~ /\$list/)
        {
            $line =~ s/\$list//gi;
            write_pages();
        }
        print HTML $line;
    }
    close(SOURCE);   # only close the handle that was actually opened
}
else
{
    # No template available: generate a default page.
    print HTML "<HTML>\n<HEAD>\n<TITLE>Page-statistics</TITLE>\n";
    print HTML "</HEAD>\n<BODY>\n";
    write_pages();
    print HTML "<HR>\n<EM>Page was generated on $date</EM>\n";
    print HTML "</BODY>\n</HTML>\n";
}

close(HTML);

if ($benchmark)
{
    ($nu, $ns) = times;
    printf "%8.4f secs user time, %8.4f secs system time.\n", ($nu - $u),
           ($ns - $s);
}

exit(0);

###############################
# Subroutines from here on
#

#
# write_pages: print the complete list of pages with their hit counts
# to the HTML handle. Leading whitespace in a title is kept in front
# of the link, so the ident file can express indentation levels.
# Reads the globals @true_url, @url_desc, @urlcount, $num_true_urls
# and $total_number_req.
#
sub write_pages
{
    my ($i, $whitespace, $desc);   # was local(); lexical scope suffices

    # Write page-stats to HTML-file
    print HTML "<PRE>\n";

    for ($i = 0; $i < $num_true_urls; $i++)
    {
        $desc = $url_desc[$i];
        $whitespace = $desc;
        $desc =~ s/^\s*//;              # title without the indentation
        $whitespace =~ s/^(\s*).*/$1/;  # just the indentation
        printf HTML ("%6d %s<A HREF=\"%s\">%s</A>\n", $urlcount[$i],
                     $whitespace, $true_url[$i], $desc);
    }

    # print out the total number of requests
    print HTML "--------------------------------\n";
    printf HTML ("%6d Total number of requests\n", $total_number_req);
    print HTML "</PRE>\n";
}

#
# write_top: print the $lim most-visited pages to the HTML handle.
# Uses a repeated maximum-selection over a scratch copy of the
# counters, so the real @urlcount is left untouched.
# Reads the globals @true_url, @url_desc, @urlcount, $num_true_urls.
#
sub write_top
{
    my ($lim) = @_;
    my ($i, $j, $max, $max_idx, $desc);
    my @tmpcount = @urlcount;   # was an unlocalized global that leaked out

    # Sanity check
    $lim = 0 if ($lim < 0);
    $lim = $num_true_urls if ($lim > $num_true_urls);

    print HTML "<PRE>\n";

    for ($i = 0; $i < $lim; $i++)
    {
        # Find the not-yet-printed entry with the highest count.
        $max = -1;
        $max_idx = -1;

        for ($j = 0; $j < $num_true_urls; $j++)
        {
            if ($tmpcount[$j] > $max)
            {
                $max = $tmpcount[$j];
                $max_idx = $j;
            }
        }

        # Everything has been printed already.
        last if ($max == -1);

        $desc = $url_desc[$max_idx];
        $desc =~ s/^\s*//;
        printf HTML ("%6d <A HREF=\"%s\">%s</A>\n", $tmpcount[$max_idx],
                     $true_url[$max_idx], $desc);
        $tmpcount[$max_idx] = -1;   # mark this entry as printed
    }

    print HTML "</PRE>\n";
}

#
# print_help: the manual is included in the program, so you can never
# lose it. Printed to STDOUT for the -h switch.
#
sub print_help
{
    print <<EOF;

NAME
     page-stats.pl - Check WWW page accesses (v1.3)

SYNOPSIS
     page-stats.pl -h
     page-stats.pl [ -b ] [ -i identfile ] [ -l logfile ]

DESCRIPTION
     page-stats.pl will examine the access log of a http daemon and
     search it for occurrences of certain references. These references
     are then counted and put into a HTML file that is ready to be
     displayed to the outside world as a "Page Statistics" page. Each
     page can be selected from the statistics page.

     The identfile contains the references that should be counted. A
     line in this file should be in the following format:

          URL\@title\@reference[\@reference...]

     which could look like this:

          ~gnu/index.html\@Gnu's pages\@/gnu.html\@~gnu*

     Comments are allowed, and should be preceded by a "#". Everything
     following that character will be ignored. Each line should at
     least contain the following:

     URL    The URL of the page, as it should be referenced from the
            "Page Statistics" page.

     title  The title of the page, as you want visitors to see it.
            Note that leading spaces are significant, so it is
            possible to make use of indentation for different levels
            of documents.

     reference
            A reference of how the page might be accessed. For
            instance, if a directory contains a file index.html, it
            can be accessed by leaving out the "index.html" part, or
            even the "/" before it. If this is the case, put all
            references behind each other, separated by "\@". You may
            use a wildcard "*" at the end of a string to match only
            the begin of an URL.

     The order of the lines in the identfile matters. Only the first
     match will be taken into account. Be careful when using
     wildcards, as they might filter out hits for lines below. Take a
     look at the (faulty) example below:

          # Wrong; second line will never be reached!
          ~gnu/index.html\@Gnu's pages\@~gnu*
          ~gnu/info/index.html\@Gnu's info files\@~gnu/info*

     The first line will match all URLs starting with "~gnu", which
     automatically means that URLs that would match "~gnu/info*" are
     matched as well. Place the second line above the first to solve
     the problem:

          # Right!
          ~gnu/info/index.html\@Gnu's info files\@~gnu/info*
          ~gnu/index.html\@Gnu's pages\@~gnu*

     Currently page-stats.pl will skip lines in the access_log that
     contain references to ".gif", ".jpg" or ".jpeg" files, even if
     you specify matching URLs. If you need the program to be able to
     handle references to those pictures, you should outcomment the
     lines as indicated in the code.

     Note that once the first matching reference is found, the quest
     for matches is ended. Only the first page will be recognized as a
     matching reference and its counter will be increased.

     The HTML "Page Statistics" file is created from two files. These
     are the ident file with references to check, and a source file
     that contains the basic HTML page as desired. The name of the
     source file is determined by replacing the mandatory ".ident"
     ending of the ident file by ".source". The HTML file that is
     created will be named in the same way, ending in ".html".

     It is possible to use certain variables in the source file. These
     variables will be replaced by page-stats.pl as it rummages
     through the file.

     \$date The current date and time will be inserted for this
            variable.

     \$firstrequest
            The date and time of the first request logged in the
            access_log will be inserted for this variable.

     \$lastrequest
            This variable is replaced by the last request logged in
            the access_log.

     \$list This will be replaced by the complete list of references
            and their number of hits.

     \$topN This will insert a sorted list of the N most visited
            pages, where N can be any number. Of course setting a
            number greater than the number of references is silly.
            There must be no space between "\$top" and the number.

OPTIONS
     -b   Benchmark; print used user and system times when ready.

     -h   Displays this manual page.

     -i identfile
          Specify the file that determines which references to look
          for in the logfile. This defaults to 'page-stats_en.ident'.

     -l logfile
          Specify the access_log of the http daemon. The default
          location is '/vol/www/ezel/httpd/logs/access_log'.

FILES
     access_log (generated by httpd)
     <identname>.ident
     <identname>.source (optional)
     <identname>.html (generated by page-stats.pl)

SEE ALSO
     httpd(1).
     http://www.sci.kun.nl/thalia/guide/#page-stats
          For the latest version.
     http://www.sci.kun.nl/thalia/page-stats/
          For a working example.

CHANGES
     03-01-1995: (v1.0) First draft of the program.
     03-17-1995: (v1.1) Added 'total number of requests' at the bottom
                 of the page.
     05-26-1995: (v1.2) Added '\$topN' and '\$list'; juggled with the
                 code. Improved performance by skipping images in
                 access_log. Allowed comments in the ident file. Also
                 moved the external README into the code.
     07-17-1995: (v1.3) You can now use wildcards to define URLs to
                 recognize. Using arrays to administrate URLs instead
                 of strings.

BUGS
     If the accesslog is big, and there are many references to check,
     this program can take very long to complete. It is recommended
     that both the size of the accesslog and the number of references
     are kept to acceptable levels.

     The program might not work because the path to Perl in the first
     line of page-stats.pl is wrong. See if the path is correct by
     doing 'which perl' at your Unix prompt. If it is not correct, you
     will have to edit the first line.

AUTHOR
     Mark Koenen <markko\@sci.kun.nl>,
     changes by Patrick Atoon <patricka\@cs.kun.nl>

EOF
}