CLIX - Fazer Clix Custa Nix

home *** CD-ROM | disk | FTP | other *** search

/ CLIX - Fazer Clix Custa Nix / CLIX-CD.cdr / mac / lib / HTML / HeadParser.pm < prev next >

Wrap

Text File | 1997-08-18 | 6KB | 265 lines

package HTML::HeadParser; =head1 NAME HTML::HeadParser - Parse <HEAD> section of a HTML document =head1 SYNOPSIS require HTML::HeadParser; $p = HTML::HeadParser->new; $p->parse($text) and print "not finished"; $p->header('Title') # to access <title>....</title> $p->header('Content-Base') # to access <base href="http://..."> $p->header('Foo') # to access <meta http-equiv="Foo" content="..."> =head1 DESCRIPTION The I<HTML::HeadParser> is a specialized (and lightweight) I<HTML::Parser> that will only parse the E<lt>HEAD>...E<lt>/HEAD> section of a HTML document. The parse() and parse_file() methods will return a FALSE value as soon as a E<lt>BODY> element is found, and should not be called again after this. The I<HTML::HeadParser> constructor takes an optional I<HTTP::Headers> object reference as argument. The parser will update this header object as the various E<lt>HEAD> elements are recognized. If no header is given we will create an internal (and initially empty) header object. This header object can be accessed with the header() method. The following header fields are initialized from elements found in the E<lt>HEAD> section of the HTML document: =over 4 =item Content-Base: The I<Content-Base> header is initialized from the E<lt>base href="..."> element. =item Title: The I<Title> header is initialized from the E<lt>title>...E<lt>/title> element. =item Isindex: The I<Isindex> header will be added if there is a E<lt>isindex> element in the E<lt>head>. The header value is initialized from the I<prompt> attribute if it is present. If not I<prompt> attribute is given it will have '?' as the value. =item X-Meta-Foo All E<lt>meta> elements will initialize headers with the prefix "X-Meta-". If the element contains a I<http-equiv> attribute, then it will be honored as the header name. =back =head1 METHODS The following methods (in addition to those provided by the superclass) are available: =over 4 =cut require HTML::Parser; @ISA = qw(HTML::Parser); use HTML::Entities (); require HTTP::Headers; use strict; use vars qw($VERSION $DEBUG); #$DEBUG = 1; $VERSION = sprintf("%d.%02d", q$Revision: 2.4 $ =~ /(\d+)\.(\d+)/); my $FINISH = "HEAD PARSED\n"; =item $hp = HTML::HeadParser->new( [$header] ) =cut sub new { my($class, $header) = @_; $header ||= HTTP::Headers->new; my $self = bless HTML::Parser->new, $class; $self->{'header'} = $header; $self->{'tag'} = ''; # name of active element that takes textual content $self->{'text'} = ''; # the accumulated text associated with the element $self; } =item $hp->parse( $text ) Parses some HTML text (see HTML::Parser->parse()) but will return FALSE as soon as parsing should end. =cut sub parse { my $self = shift; eval { $self->SUPER::parse(@_) }; if ($@) { print $@ if $DEBUG; $self->{'_buf'} = ''; # flush rest of buffer return ''; } $self; } =item $hp->header; Returns a reference to the HTML::Header object. =item $hp->header( $key ) Returns a header value. =cut sub header { my $self = shift; return $self->{'header'} unless @_; $self->{'header'}->header(@_); } =item $hp->as_string; Same as $hp->header->as_string =cut sub as_string { my $self = shift; $self->{'header'}->as_string; } sub flush_text # internal { my $self = shift; my $tag = $self->{'tag'}; my $text = $self->{'text'}; $text =~ s/^\s+//; $text =~ s/\s+$//; $text =~ s/\s+/ /g; print "FLUSH $tag => '$text'\n" if $DEBUG; if ($tag eq 'title') { $self->{'header'}->header(title => $text); } $self->{'tag'} = $self->{'text'} = ''; } # This is an quote from the HTML3.2 DTD which shows which elements # that might be present in a <HEAD>...</HEAD>. Also note that the # <HEAD> tags themselves might be missing: # # <!ENTITY % head.content "TITLE & ISINDEX? & BASE? & STYLE? & # SCRIPT* & META* & LINK*"> # # <!ELEMENT HEAD O O (%head.content)> sub start { my($self, $tag, $attr) = @_; # $attr is reference to a HASH print "START[$tag]\n" if $DEBUG; $self->flush_text if $self->{'tag'}; if ($tag eq 'meta') { my $key = $attr->{'http-equiv'}; if (!defined $key) { return unless $attr->{'name'}; $key = "X-Meta-\u$attr->{'name'}"; } $self->{'header'}->push_header($key => $attr->{content}); } elsif ($tag eq 'base') { return unless exists $attr->{href}; $self->{'header'}->header('Content-Base' => $attr->{href}); } elsif ($tag eq 'isindex') { # This is a non-standard header. Perhaps we should just ignore # this element $self->{'header'}->header(Isindex => $attr->{prompt} || '?'); } elsif ($tag =~ /^(?:title|script|style)$/) { # Just remember tag. Initialize header when we see the end tag. $self->{'tag'} = $tag; } elsif ($tag eq 'link') { return unless exists $attr->{href}; # <link href="http:..." rel="xxx" rev="xxx" title="xxx"> my $h_val = "<" . delete($attr->{href}) . ">"; for (sort keys %{$attr}) { $h_val .= qq(; $_="$attr->{$_}"); } $self->{'header'}->header(Link => $h_val); } elsif ($tag eq 'head' || $tag eq 'html') { # ignore } else { die $FINISH; } } sub end { my($self, $tag) = @_; print "END[$tag]\n" if $DEBUG; $self->flush_text if $self->{'tag'}; die $FINISH if $tag eq 'head'; } sub text { my($self, $text) = @_; print "TEXT[$text]\n" if $DEBUG; my $tag = $self->{tag}; if (!$tag && $text =~ /\S/) { # Normal text means start of body die $FINISH; } return if $tag ne 'title'; # optimize skipping of <script> and <style> HTML::Entities::decode($text); $self->{'text'} .= $text; } 1; __END__ =head1 EXAMPLES $h = HTTP::Headers->new; $p = HTML::HeadParser->new($h); $p->parse(<<EOT); <title>Stupid example</title> <base href="http://www.sn.no/libwww-perl/"> Normal text starts here. EOT undef $p; print $h->title; # should print "Stupid example" =head1 SEE ALSO L<HTML::Parser>, L<HTTP::Headers> =head1 COPYRIGHT Copyright 1996-1997 Gisle Aas. All rights reserved. This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =head1 AUTHOR Gisle Aas E<lt>aas@sn.no> =cut