home *** CD-ROM | disk | FTP | other *** search
- package HTML::Clean;
-
- use Carp;
- use IO;
- use Fcntl;
- use strict;
- require 5.004;
-
- use vars qw($VERSION @ISA @EXPORT @EXPORT_OK);
-
- require Exporter;
- require AutoLoader;
-
- # Items to export to callers namespace
- @EXPORT = qw();
-
- $VERSION = '0.8';
-
- =head1 NAME
-
- HTML::Clean - Cleans up HTML code for web browsers, not humans
-
- =head1 SYNOPSIS
-
- use HTML::Clean;
- $h = new HTML::Clean($filename); # or..
- $h = new HTML::Clean($htmlcode);
-
- $h->compat();
- $h->strip();
- $data = $h->data();
- print $$data;
-
- =head1 DESCRIPTION
-
- The HTML::Clean module encapsulates a number of common techniques for
- minimizing the size of HTML files. You can typically save between
- 10% and 50% of the size of a HTML file using these methods.
- It provides the following features:
-
- =over 8
-
- =item Remove unneeded whitespace (begining of line, etc)
-
- =item Remove unneeded META elements.
-
- =item Remove HTML comments (except for styles, javascript and SSI)
-
- =item Replace tags with equivilant shorter tags (<strong> --> <b>)
-
- =item etc.
-
- =back
-
- The entire proces is configurable, so you can pick and choose what you want
- to clean.
-
- =head1 THE HTML::Clean CLASS
-
- =over 4
-
- =cut
-
-
- ######################################################################
-
- =head2 $h = new HTML::Clean($dataorfile, [$level]);
-
- This creates a new HTML::Clean object. A Prerequisite for all other
- functions in this module.
-
- The $dataorfile parameter supplies the input HTML, either a filename,
- or a reference to a scalar value holding the HTML, for example:
-
- $h = new HTML::Clean("/htdocs/index.html");
- $html = "<strong>Hello!</strong>";
- $h = new HTML::Clean(\$html);
-
- An optional 'level' parameter controls the level of optimization
- performed. Levels range from 1 to 9. Level 1 includes only simple
- fast optimizations. Level 9 includes all optimizations.
-
- =cut
-
- sub new {
- my $this = shift;
- my $class = ref($this) || $this;
- my $self = {};
- bless $self, $class;
-
- my $data = shift;
- my $level = shift;
-
- if ($self->initialize($data)) {
- # set the default level
- $level = 9 if (!$level);
- $self->level($level);
- return $self;
- } else {
- undef $self;
- return undef;
- }
- }
-
-
- #
- # Set up the data in the self hash..
- #
-
- =head2 $h->initialize($dataorfile)
-
- This function allows you to reinitialize the HTML data used by the
- current object. This is useful if you are processing many files.
-
- $dataorfile has the same usage as the new method.
-
- Return 0 for an error, 1 for success.
-
- =cut
-
- sub initialize {
- my($self, $data) = @_;
- $self->{'DATA'} = undef;
-
- # Not defined? Just return true.
- return(1) if (!$data);
-
- # Check if it's a ref
- if (ref($data)) {
- $self->{DATA} = $data;
- return(1);
- }
-
- # Newline char, really an error, but just go with it..
- if ($data =~ /\n/) {
- $self->{'DATA'} = \$data;
- }
-
- # No newline? Must be a filename
- if (-f $data) {
- my $storage;
-
- sysopen(IN, "$data", O_RDONLY) || return(0);
- while (<IN>) {
- $storage .= $_;
- }
- close(IN);
- $self->{'DATA'} = \$storage;
- return(1);
- }
-
- return(0); # file not found?
- }
-
-
- =head2 $h->level([$level])
-
- Get/set the optimization level. $level is a number from 1 to 9.
-
- =cut
-
- sub level {
- my($self, $level) = @_;
-
- if (defined($level) && ($level > 0) && ($level < 10)) {
- $self->{'LEVEL'} = $level
- }
- return($self->{'LEVEL'});
- }
-
- =head2 $myref = $h->data()
-
- Returns the current HTML data as a scalar reference.
-
- =cut
-
- sub data {
- my($self) = @_;
-
- return $self->{'DATA'};
- }
-
-
- # Junk HTML comments (INTERNAL)
-
- sub _commentcheck($) {
- my($comment) = @_;
-
- $_ = $comment;
-
- # Server side include
- return($comment) if (m,^<!--\#,si);
-
- # ITU Hack.. preserve some frontpage components
- return($comment) if (m,^<!-- %,si);
- return($comment) if (m,bot="(SaveResults|Search|ConfirmationField)",si);
-
- # Javascript
- return($comment) if (m,//.*-->$,si);
- return($comment) if (m,navigator\.app(name|version),si);
-
- # Stylesheet
- return($comment) if (m,[A-z0-9]+\:[A-z0-9]+\s*\{.*\},si);
- return('');
- }
-
-
- # Remove javascript comments (INTERNAL)
-
- sub _jscomments {
- my($js) = @_;
-
- $js =~ s,\n\s*//.*?\n,\n,sig;
- $js =~ s,\s+//.*?\n,\n,sig;
-
- # insure javascript is hidden
-
- if ($js =~ m,<!--,) {
- $js =~ s,</script>,// -->\n</script>,si;
- }
- return($js);
- }
-
- # Clean up other javascript stuff..
-
- sub _javascript {
- my($js) = @_;
-
- # remove excess whitespace at the beginning and end of lines
- $js =~ s,\s*\n+\s*,\n,sig;
-
- # braces/semicolon at end of line, join next line
- $js =~ s,([;{}])\n,$1,sig;
-
- # What else is safe to do?
-
- return($js);
- }
-
- # replace #000000 -> black, etc..
- # Does the browser render faster with RGB? You would think so..
-
- sub _defcolorcheck ($) {
- my($c) = @_;
-
- $c =~ s/\#000000/black/;
- $c =~ s/\#c0c0c0/silver/i;
- $c =~ s/\#808080/gray/;
- $c =~ s/\#ffffff/white/i;
- $c =~ s/\#800000/maroon/;
- $c =~ s/\#ff0000/red/i;
- $c =~ s/\#800080/purple/;
- $c =~ s/\#ff00ff/fuchsia/i;
- $c =~ s/\#ff00ff/fuchsia/i;
- $c =~ s/\#008000/green/;
- $c =~ s/\#00ff00/lime/i;
- $c =~ s/\#808000/olive/;
- $c =~ s/\#ffff00/yellow/i;
- $c =~ s/\#000080/navy/;
- $c =~ s/\#0000ff/blue/i;
- $c =~ s/\#008080/teal/i;
- $c =~ s/\#00ffff/aqua/i;
- return($c);
- }
-
- # For replacing entities with numerics
- use vars qw/ %_ENTITIES/;
- %_ENTITIES = (
- 'Agrave' => 192,
- 'Aacute' => 193,
- 'Acirc' => 194,
- 'Atilde' => 195,
- 'Auml' => 196,
- 'Aring' => 197,
- 'AElig' => 198,
- 'Ccedil' => 199,
- 'Egrave' => 200,
- 'Eacute' => 201,
- 'Ecirc' => 202,
- 'Euml' => 203,
- 'Igrave' => 204,
- 'Iacute' => 205,
- 'Icirc' => 206,
- 'Iuml' => 207,
- 'ETH' => 208,
- 'Ntilde' => 209,
- 'Ograve' => 210,
- 'Oacute' => 211,
- 'Ocirc' => 212,
- 'Otilde' => 213,
- 'Ouml' => 214,
- 'Oslash' => 216,
- 'Ugrave' => 217,
- 'Uacute' => 218,
- 'Ucirc' => 219,
- 'Uuml' => 220,
- 'Yacute' => 221,
- 'THORN' => 222,
- 'szlig' => 223,
- 'agrave' => 224,
- 'aacute' => 225,
- 'acirc' => 226,
- 'atilde' => 227,
- 'auml' => 228,
- 'aring' => 229,
- 'aelig' => 230,
- 'ccedil' => 231,
- 'egrave' => 232,
- 'eacute' => 233,
- 'ecirc' => 234,
- 'euml' => 235,
- 'igrave' => 236,
- 'iacute' => 237,
- 'icirc' => 238,
- 'iuml' => 239,
- 'eth' => 240,
- 'ntilde' => 241,
- 'ograve' => 242,
- 'oacute' => 243,
- 'ocirc' => 244,
- 'otilde' => 245,
- 'ouml' => 246,
- 'oslash' => 248,
- 'ugrave' => 249,
- 'uacute' => 250,
- 'ucirc' => 251,
- 'uuml' => 252,
- 'yacute' => 253,
- 'thorn' => 254,
- 'yuml' => 255
- );
-
- =head2 strip(\%options);
-
- Removes excess space from HTML
-
- You can control the optimizations used by specifying them in the
- %options hash reference.
-
- The following options are recognized:
-
- =over 8
-
- =item boolean values (0 or 1 values)
-
- whitespace Remove excess whitespace
- shortertags <strong> -> <b>, etc..
- blink No blink tags.
- contenttype Remove default contenttype.
- comments Remove excess comments.
- entities " -> ", etc.
- dequote remove quotes from tag parameters where possible.
- defcolor recode colors in shorter form. (#ffffff -> white, etc.)
- javascript remove excess spaces and newlines in javascript code.
- htmldefaults remove default values for some html tags
- lowercasetags translate all HTML tags to lowercase
-
- =item parameterized values
-
- meta Takes a space separated list of meta tags to remove,
- default "GENERATOR FORMATTER"
-
- emptytags Takes a space separated list of tags to remove when there is no
- content between the start and end tag, like this: <b></b>.
- The default is 'b i font center'
-
- =back
-
- =cut
-
- use vars qw/
- $do_whitespace
- $do_shortertags
- $do_meta
- $do_blink
- $do_contenttype
- $do_comments
- $do_entities
- $do_dequote
- $do_defcolor
- $do_emptytags
- $do_javascript
- $do_htmldefaults
- $do_lowercasetags
- $do_defbaseurl
- /;
-
- $do_whitespace = 1;
- $do_shortertags = 1;
- $do_meta = "generator formatter";
- $do_blink = 1;
- $do_contenttype = 1;
- $do_comments = 1;
- $do_entities = 1;
- $do_dequote = 1;
- $do_defcolor = 1;
- $do_emptytags = 'b i font center';
- $do_javascript = 1;
- $do_htmldefaults = 1;
- $do_lowercasetags = 1;
- $do_defbaseurl = '';
-
- sub strip {
- my($self, $options) = @_;
-
- my $h = $self->{'DATA'};
- my $level = $self->{'LEVEL'};
-
- # Select a set of options based on $level, and then modify based on
- # user supplied options.
-
- _level_defaults($level);
-
- if(defined($options)) {
- no strict 'refs';
- for (keys(%$options)) {
- ${"do_" . lc($_)} = $options->{$_} if defined ${"do_" . lc($_)};
- }
- }
-
- if ($do_shortertags) {
- $$h =~ s,<strong>,<b>,sgi;
- $$h =~ s,</strong>,</b>,sgi;
- $$h =~ s,<em>,<i>,sgi;
- $$h =~ s,</em>,</i>,sgi;
- }
-
- if ($do_whitespace) {
- $$h =~ s,[\r\n]+,\n,sg; # Carriage/LF -> LF
- $$h =~ s,\s+\n,\n,sg; # empty line
- $$h =~ s,\n\s+<,\n<,sg; # space before tag
- $$h =~ s,\n\s+,\n ,sg; # other spaces
-
- $$h =~ s,>\n\s*<,><,sg; # LF/spaces between tags..
-
- # Remove excess spaces within tags.. note, we could parse out the elements
- # and rewrite for excess spaces between elements. perhaps next version.
- # removed due to problems with > and < in tag elements..
- #$$h =~ s,\s+>,>,sg;
- #$$h =~ s,<\s+,<,sg;
- # do this again later..
- }
-
- if ($do_entities) {
- $$h =~ s,",\",sg;
- # Simplify long entity names if using default charset...
- $$h =~ m,charset=([^\"]+)\",;
- if (!defined($1) || ($1 eq 'iso-8859-1')) {
- $$h =~ s,&([A-z]+);,($_ENTITIES{$1}) ? chr($_ENTITIES{$1}) : $&,sige;
- }
- }
-
- if ($do_meta) {
- foreach my $m (split(/\s+/, $do_meta)) {
- $$h =~ s,<meta name="$m"[^>]*?>,,sig;
- }
- }
- if ($do_contenttype) {
- # Don't need this, since it is the default for most web servers
- # Also gets rid of 'blinking pages' in older versions of netscape.
- $$h =~ s,<meta http-equiv="Content-Type".*?content="text/html;.*?charset=iso-8859-1">,,sig;
- }
-
- if ($do_defcolor) {
- $$h =~ s,(<[^<]+?color=['"]?\#[0-9A-Fa-f]+["']?),_defcolorcheck($&),sige;
- }
- if ($do_comments) {
- # don't strip server side includes..
- # try not to get javascript, or styles...
- $$h =~ s,<!--.*?-->,_commentcheck($&),sige;
-
- # Remove javascript comments
- $$h =~ s,<script[^>]*(java|ecma)script[^>]*>.*?</script>,_jscomments($&),sige;
- }
-
- if ($do_javascript) {
- #
- $$h =~ s,<script[^>]*(java|ecma)script[^>]*>.*?</script>,_javascript($&),sige;
- }
-
- if ($do_blink) {
- $$h =~ s,<BLINK>,,sgi;
- $$h =~ s,</BLINK>,,sgi;
- }
-
- if ($do_dequote) {
- while ($$h =~ s,<([A-z]+ [A-z]+=)(['"])([A-z0-9]+)\2(\s*?[^>]*?>),<$1$3$4,sig)
- {
- # Remove alphanumeric quotes. Note, breaks DTD..
- ;
- }
- }
- # remove <b></b>, etc..
- if ($do_emptytags) {
- my $pat = $do_emptytags;
- $pat =~ s/\s+/|/g;
-
- while ($$h =~ s,<($pat)(\s+[^>]*?)?>\s*</\1>,,siog){}
-
- }
- if ($do_htmldefaults) {
- # Tables
- # seems to break things..
- #$$h =~ s,(<table[^>]*)\s+border=0([^>]*>),$1$2,sig;
- $$h =~ s,(<td[^>]*)\s+rowspan=1([^>]*>),$1$2,sig;
- $$h =~ s,(<td[^>]*)\s+colspan=1([^>]*>),$1$2,sig;
-
- #
-
- # P, TABLE tags are default left aligned..
- # lynx is inconsistent in this manner though..
-
- $$h =~ s,<(P|table|td)( [^>]*)align=\"?left\"?([^>]*)>,<$1$2$3>,sig;
-
- # OL start=1
- $$h =~ s,(<OL [^>]*)start=\"?1\"?([^>]*>),$1$2,sig;
-
- # FORM
- $$h =~ s,(<form [^>]*)method=\"?get\"?([^>]*>),$1$2,sig;
- $$h =~ s,(<form [^>]*)enctype=\"application/x-www-form-urlencoded\"([^>]*>),$1$2,sig;
-
- # hr
- $$h =~ s,(<hr [^>]*)align=\"?center\"?([^>]*>),$1$2,sig;
- $$h =~ s,(<hr [^>]*)width=\"?100%\"?([^>]*>),$1$2,sig;
-
- # URLs
- $$h =~ s,(href|src)(=\"?http://[^/:]+):80/,$1$2/,sig;
- }
-
- if ($do_whitespace) {
- # remove space within tags <center > becomes <center>
- $$h =~ s,\s+>,>,sg;
- $$h =~ s,<\s+,<,sg;
- # join lines with a space at the beginning/end of the line
- # and a line that begins with a tag
- $$h =~ s,>\n ,> ,sig;
- $$h =~ s, \n<, <,sig;
- }
-
- if ($do_lowercasetags) {
- # translate tags to lowercase to (hopefully) improve compressability..
-
- # simple tags <H1>, </H1> etc.
- $$h =~ s,(<[/]?[a-zA-Z][a-zA-Z0-9_-]*\s*>),\L$1\E,sg;
-
- # the rest..
- $$h =~ s/(<[a-zA-Z][a-zA-Z0-9_-]*)(\s+.*?>)/_lowercasetag($1,$2)/sge;
- }
- }
-
- sub _lowercasetag {
- my($prefix, $body) = @_;
- $prefix =~ s/^(.+)$/\L$1\E/;
- $body =~ s/(\s+[a-zA-Z][a-zA-Z0-9_-]*)(\s*=\s*[^"\s]+|\s*=\s*"[^"]*"|>|\s)/\L$1\E$2/sg;
- return $prefix.$body;
- }
-
- # set options based on the level provided.. INTERNAL
-
- sub _level_defaults($) {
- my ($level) = @_;
-
- $do_whitespace = 1; # always do this...
-
- # level 2
- $do_shortertags = ($level > 1) ? 1 : 0;
- $do_meta = ($level > 1) ? "generator formatter" : "";
- $do_contenttype = ($level > 1) ? 1 : 0;
-
- # level 3
- $do_entities = ($level > 2) ? 1 : 0;
- $do_blink = ($level > 2) ? 1 : 0;
-
- # level 4
- $do_comments = ($level > 3) ? 1 : 0;
- $do_dequote = ($level > 3) ? 1 : 0;
- $do_defcolor = ($level > 3) ? 1 : 0;
- $do_emptytags = ($level > 3) ? 'b i font center' : 0;
- $do_javascript = ($level > 3) ? 1 : 0;
- $do_htmldefaults = ($level > 3) ? 1 : 0;
- $do_lowercasetags = ($level > 3) ? 1 : 0;
-
- # higher levels reserved for more intensive optimizations.
- }
-
- ######################################################################
-
- =head2 compat()
-
- This function improves the cross-platform compatibility of your HTML.
- Currently checks for the following problems:
-
- =over 8
-
- =item Insuring all IMG tags have ALT elements.
-
- =item Use of Arial, Futura, or Verdana as a font face.
-
- =item Positioning the <TITLE> tag immediately after the <head> tag.
-
- =back
-
- =cut
-
- sub compat {
- my($self, $level, $options) = @_;
-
- my $h = $self->{'DATA'};
-
- $$h =~ s/face="arial"/face="arial,helvetica,sansserif"/sgi;
- $$h =~ s/face="(verdana|futura)"/face="$1,arial,helvetica,sansserif"/sgi;
-
- # insure that <title> tag is directly after the <head> tag
- # Some search engines only search the first N chars. (PLweb for instance..)
-
- if ($$h =~ s,<title>(.*)</title>,,si) {
- my $title = $1;
- $$h =~ s,<head>,<head><title>$title</title>,si;
- }
-
- # Look for IMG without ALT tags.
- $$h =~ s/(<img[^>]+>)/_imgalt($1)/segi;
- }
-
- sub _imgalt {
- my($tag) = @_;
-
- $tag =~ s/>/ alt="">/ if ($tag !~ /alt=/i);
- return($tag);
- }
-
- =head2 defrontpage();
-
- This function converts pages created with Microsoft Frontpage to
- something a Unix server will understand a bit better. This function
- currently does the following:
-
- =over 8
-
- =item Converts Frontpage 'hit counters' into a unix specific format.
-
- =item Removes some frontpage specific html comments
-
- =back
-
- =cut
-
-
- sub defrontpage {
- my($self) = @_;
-
- my $h = $self->{'DATA'};
-
- while ($$h =~ s,<img\sSRC="[\./]*_vti_bin/fpcount.exe(/.*/).Page=(.*?)\|.*?\s(.*?)>,<img src="/counter?link=$1$2" $3>,xis) {
- print "Converted a Hitcounter.. $1, $2, $3\n";
- }
- $$h =~ s,<!--(mstheme|msthemeseparator|msnavigation)-->,,sgx;
- }
-
- =back
-
- =head1 SEE ALSO
-
- =head2 Modules
-
- FrontPage::Web, FrontPage::File
-
- =head2 Web Sites
-
- =over 6
-
- =item Distribution Site - http://people.itu.int/~lindner/
-
- =back
-
- =head1 AUTHORS
-
- Paul Lindner for the International Telecommunication Union (ITU)
-
- =head1 COPYRIGHT
-
- The HTML::Strip module is Copyright (c) 1998,99 by the ITU, Geneva Switzerland.
- All rights reserved.
-
- You may distribute under the terms of either the GNU General Public
- License or the Artistic License, as specified in the Perl README file.
-
- =cut
-
- 1;
- __END__
-