home *** CD-ROM | disk | FTP | other *** search
- #!/usr/bin/perl -w
-
- # This script cleans up an HTML document
-
- use strict;
- use HTML::Parser ();
-
# Configuration -- edit these lists to change what gets stripped.

# Attribute names that tag() deletes from start tags.  Entries must be lower
# case because tag() lower-cases each attribute name before the lookup.
my @ignore_attr = (
    qw(bgcolor background color face style),
    qw(link alink vlink text),
    qw(onblur onchange onclick ondblclick onfocus),
    qw(onkeydown onkeyup onload),
    qw(onmousedown onmousemove onmouseout onmouseover onmouseup),
    qw(onreset onselect onunload),
);

# Tag names handed to the parser's ignore_tags option.
my @ignore_tags = ('font', 'big', 'small', 'b', 'i');

# Element names handed to the parser's ignore_elements option.
my @ignore_elements = ('script', 'style');

# Set-style lookup table: every name in @ignore_attr maps to 1, so checking
# whether an attribute is unwanted is a single hash probe.
my %ignore_attr;
@ignore_attr{@ignore_attr} = (1) x @ignore_attr;
-
# Start-tag handler: print the tag's source text with every attribute whose
# (lower-cased) name is a key of %ignore_attr removed.
#
#   $pos  - array ref of token positions.  Read from the end in groups of
#           four: attribute-name offset, name length, value offset, value
#           length, all relative to $text.  Fewer than four leftover entries
#           (the tag-name position, per HTML::Parser's "tokenpos" argspec)
#           are ignored.  The array is consumed while it is processed.
#   $text - source text of the start tag.
#
# The result is printed to the currently selected filehandle.
sub tag
{
    my($pos, $text) = @_;
    if (@$pos >= 4) {
        # Work out where the final attribute ends.  A false value offset
        # means the attribute has no value, so it ends where its name ends.
        my($k_offset, $k_len, $v_offset, $v_len) = @{$pos}[-4 .. -1];
        my $next_attr = $v_offset ? $v_offset + $v_len : $k_offset + $k_len;

        my $is_last = 1;     # true only while examining the final attribute
        my $last_removed;    # set when the final attribute has been deleted

        # Walk the attributes right to left so that deleting one never
        # shifts the offsets of the attributes still to be examined.
        while (@$pos >= 4) {
            ($k_offset, $k_len, $v_offset, $v_len) = splice @$pos, -4;
            if ($ignore_attr{lc substr($text, $k_offset, $k_len)}) {
                # Delete from the attribute name up to the start of the
                # following attribute (or the end of this one if it is last).
                substr($text, $k_offset, $next_attr - $k_offset) = "";
                $last_removed = 1 if $is_last;
            }
            $next_attr = $k_offset;
            $is_last = 0;
        }

        # Deleting the final attribute leaves the whitespace that separated
        # it from whatever precedes it dangling in front of ">".  Remove it,
        # whether or not other attributes survive and whatever characters
        # the tag name contains.
        $text =~ s/\s+>\z/>/ if $last_removed;
    }
    print $text;
}
-
# Declaration handler: echo the declaration's source text only when its type
# is "doctype"; any other declaration produces no output.
#
#   first argument  - declaration type (compared against "doctype")
#   second argument - source text to print
sub decl
{
    my($kind, $source) = @_;
    if ($kind eq "doctype") {
        print $source;
    }
}
-
# Default handler: print the first argument (the event's source text)
# unchanged to the currently selected filehandle.
sub text
{
    my($chunk) = @_;
    print $chunk;
}
-
# Entry point: the first command-line argument names the HTML file to clean.
# Without this check a missing argument would reach parse_file() as undef,
# and the script would finish silently without reading anything.
my $file = shift @ARGV;
defined $file
    or die "Usage: $0 <html-file>\n";

HTML::Parser->new(api_version => 3,
                  # Start tags are rewritten by tag(), which needs both the
                  # token positions and the tag's source text.
                  start_h       => [\&tag,  "tokenpos, text"],
                  # An empty handler disables the event, so processing
                  # instructions and comments are not printed.
                  process_h     => ["", ""],
                  comment_h     => ["", ""],
                  # Declarations are filtered by decl().
                  declaration_h => [\&decl, "tagname, text"],
                  # Every other event is printed unchanged by text().
                  default_h     => [\&text, "text"],

                  ignore_tags     => \@ignore_tags,
                  ignore_elements => \@ignore_elements,
                 )
    ->parse_file($file)
    or die "Can't open file '$file': $!\n";
-
-