home *** CD-ROM | disk | FTP | other *** search
- package HTML::Entities;
-
- # $Id: Entities.pm,v 1.27 2003/10/10 09:56:18 gisle Exp $
-
- =head1 NAME
-
- HTML::Entities - Encode or decode strings with HTML entities
-
- =head1 SYNOPSIS
-
- use HTML::Entities;
-
- $a = "Våre norske tegn bør æres";
- decode_entities($a);
- encode_entities($a, "\200-\377");
-
- For example, this:
-
- $input = "vis-α-vis BeyoncΘ's na∩ve\npapier-mΓchΘ rΘsumΘ";
- print encode_entities($in), "\n"
-
- Prints this out:
-
- vis-à-vis Beyoncé's naïve
- papier-mâché résumé
-
- =head1 DESCRIPTION
-
- This module deals with encoding and decoding of strings with HTML
- character entities. The module provides the following functions:
-
- =over 4
-
- =item decode_entities( $string )
-
- This routine replaces HTML entities found in the $string with the
- corresponding ISO-8859-1 character, and if possible (under perl 5.8
- or later) will replace to Unicode characters. Unrecognized
- entities are left alone.
-
- This routine is exported by default.
-
- =item encode_entities( $string )
-
- =item encode_entities( $string, $unsafe_chars )
-
- This routine replaces unsafe characters in $string with their entity
- representation. A second argument can be given to specify which
- characters to consider unsafe (i.e., which to escape). The default set
- of characters to encode are control chars, high-bit chars, and the
- C<< < >>, C<< & >>, C<< > >>, and C<< " >>
- characters. But this, for example, would encode I<just> the
- C<< < >>, C<< & >>, C<< > >>, and C<< " >> characters:
-
- $escaped = encode_entities($input, '<>&"');
-
- This routine is exported by default.
-
- =item encode_entities_numeric( $string )
-
- =item encode_entities_numeric( $string, $unsafe_chars )
-
- This routine works just like encode_entities, except that the replacement
- entities are always C<I<hexnum>;> and never C<&I<entname>;>. For
- example, C<escape_entities("r\xF4le")> returns "rôle", but
- C<escape_entities_numeric("r\xF4le")> returns "rôle".
-
- This routine is I<not> exported by default. But you can always
- export it with C<use HTML::Entities qw(encode_entities_numeric);>
- or even C<use HTML::Entities qw(:DEFAULT encode_entities_numeric);>
-
- =back
-
- All these routines modify the string passed as the first argument, if
- called in a void context. In scalar and array contexts, the encoded or
- decoded string is returned (without changing the input string).
-
- If you prefer not to import these routines into your namespace, you can
- call them as:
-
- use HTML::Entities ();
- $decoded = HTML::Entities::decode($a);
- $encoded = HTML::Entities::encode($a);
- $encoded = HTML::Entities::encode_numeric($a);
-
- The module can also export the %char2entity and the %entity2char
- hashes, which contain the mapping from all characters to the
- corresponding entities (and vice versa, respectively).
-
- =head1 COPYRIGHT
-
- Copyright 1995-2003 Gisle Aas. All rights reserved.
-
- This library is free software; you can redistribute it and/or
- modify it under the same terms as Perl itself.
-
- =cut
-
- use strict;
- use vars qw(@ISA @EXPORT @EXPORT_OK $VERSION);
- use vars qw(%entity2char %char2entity);
-
- require 5.004;
- require Exporter;
- @ISA = qw(Exporter);
-
- @EXPORT = qw(encode_entities decode_entities _decode_entities);
- @EXPORT_OK = qw(%entity2char %char2entity encode_entities_numeric);
-
- $VERSION = sprintf("%d.%02d", q$Revision: 1.27 $ =~ /(\d+)\.(\d+)/);
- sub Version { $VERSION; }
-
- require HTML::Parser; # for fast XS implemented decode_entities
-
-
- %entity2char = (
- # Some normal chars that have special meaning in SGML context
- amp => '&', # ampersand
- 'gt' => '>', # greater than
- 'lt' => '<', # less than
- quot => '"', # double quote
- apos => "'", # single quote
-
- # PUBLIC ISO 8879-1986//ENTITIES Added Latin 1//EN//HTML
- AElig => '╞', # capital AE diphthong (ligature)
- Aacute => '┴', # capital A, acute accent
- Acirc => '┬', # capital A, circumflex accent
- Agrave => '└', # capital A, grave accent
- Aring => '┼', # capital A, ring
- Atilde => '├', # capital A, tilde
- Auml => '─', # capital A, dieresis or umlaut mark
- Ccedil => '╟', # capital C, cedilla
- ETH => '╨', # capital Eth, Icelandic
- Eacute => '╔', # capital E, acute accent
- Ecirc => '╩', # capital E, circumflex accent
- Egrave => '╚', # capital E, grave accent
- Euml => '╦', # capital E, dieresis or umlaut mark
- Iacute => '═', # capital I, acute accent
- Icirc => '╬', # capital I, circumflex accent
- Igrave => '╠', # capital I, grave accent
- Iuml => '╧', # capital I, dieresis or umlaut mark
- Ntilde => '╤', # capital N, tilde
- Oacute => '╙', # capital O, acute accent
- Ocirc => '╘', # capital O, circumflex accent
- Ograve => '╥', # capital O, grave accent
- Oslash => '╪', # capital O, slash
- Otilde => '╒', # capital O, tilde
- Ouml => '╓', # capital O, dieresis or umlaut mark
- THORN => '▐', # capital THORN, Icelandic
- Uacute => '┌', # capital U, acute accent
- Ucirc => '█', # capital U, circumflex accent
- Ugrave => '┘', # capital U, grave accent
- Uuml => '▄', # capital U, dieresis or umlaut mark
- Yacute => '▌', # capital Y, acute accent
- aacute => 'ß', # small a, acute accent
- acirc => 'Γ', # small a, circumflex accent
- aelig => 'µ', # small ae diphthong (ligature)
- agrave => 'α', # small a, grave accent
- aring => 'σ', # small a, ring
- atilde => 'π', # small a, tilde
- auml => 'Σ', # small a, dieresis or umlaut mark
- ccedil => 'τ', # small c, cedilla
- eacute => 'Θ', # small e, acute accent
- ecirc => 'Ω', # small e, circumflex accent
- egrave => 'Φ', # small e, grave accent
- eth => '≡', # small eth, Icelandic
- euml => 'δ', # small e, dieresis or umlaut mark
- iacute => 'φ', # small i, acute accent
- icirc => 'ε', # small i, circumflex accent
- igrave => '∞', # small i, grave accent
- iuml => '∩', # small i, dieresis or umlaut mark
- ntilde => '±', # small n, tilde
- oacute => '≤', # small o, acute accent
- ocirc => '⌠', # small o, circumflex accent
- ograve => '≥', # small o, grave accent
- oslash => '°', # small o, slash
- otilde => '⌡', # small o, tilde
- ouml => '÷', # small o, dieresis or umlaut mark
- szlig => '▀', # small sharp s, German (sz ligature)
- thorn => '■', # small thorn, Icelandic
- uacute => '·', # small u, acute accent
- ucirc => '√', # small u, circumflex accent
- ugrave => '∙', # small u, grave accent
- uuml => 'ⁿ', # small u, dieresis or umlaut mark
- yacute => '²', # small y, acute accent
- yuml => ' ', # small y, dieresis or umlaut mark
-
- # Some extra Latin 1 chars that are listed in the HTML3.2 draft (21-May-96)
- copy => '⌐', # copyright sign
- reg => '«', # registered sign
- nbsp => "\240", # non breaking space
-
- # Additional ISO-8859/1 entities listed in rfc1866 (section 14)
- iexcl => 'í',
- cent => 'ó',
- pound => 'ú',
- curren => 'ñ',
- yen => 'Ñ',
- brvbar => 'ª',
- sect => 'º',
- uml => '¿',
- ordf => '¬',
- laquo => '½',
- 'not' => '¼', # not is a keyword in perl
- shy => '¡',
- macr => '»',
- deg => '░',
- plusmn => '▒',
- sup1 => '╣',
- sup2 => '▓',
- sup3 => '│',
- acute => '┤',
- micro => '╡',
- para => '╢',
- middot => '╖',
- cedil => '╕',
- ordm => '║',
- raquo => '╗',
- frac14 => '╝',
- frac12 => '╜',
- frac34 => '╛',
- iquest => '┐',
- 'times' => '╫', # times is a keyword in perl
- divide => '≈',
-
- ( $] > 5.007 ? (
- OElig => chr(338),
- oelig => chr(339),
- Scaron => chr(352),
- scaron => chr(353),
- Yuml => chr(376),
- fnof => chr(402),
- circ => chr(710),
- tilde => chr(732),
- Alpha => chr(913),
- Beta => chr(914),
- Gamma => chr(915),
- Delta => chr(916),
- Epsilon => chr(917),
- Zeta => chr(918),
- Eta => chr(919),
- Theta => chr(920),
- Iota => chr(921),
- Kappa => chr(922),
- Lambda => chr(923),
- Mu => chr(924),
- Nu => chr(925),
- Xi => chr(926),
- Omicron => chr(927),
- Pi => chr(928),
- Rho => chr(929),
- Sigma => chr(931),
- Tau => chr(932),
- Upsilon => chr(933),
- Phi => chr(934),
- Chi => chr(935),
- Psi => chr(936),
- Omega => chr(937),
- alpha => chr(945),
- beta => chr(946),
- gamma => chr(947),
- delta => chr(948),
- epsilon => chr(949),
- zeta => chr(950),
- eta => chr(951),
- theta => chr(952),
- iota => chr(953),
- kappa => chr(954),
- lambda => chr(955),
- mu => chr(956),
- nu => chr(957),
- xi => chr(958),
- omicron => chr(959),
- pi => chr(960),
- rho => chr(961),
- sigmaf => chr(962),
- sigma => chr(963),
- tau => chr(964),
- upsilon => chr(965),
- phi => chr(966),
- chi => chr(967),
- psi => chr(968),
- omega => chr(969),
- thetasym => chr(977),
- upsih => chr(978),
- piv => chr(982),
- ensp => chr(8194),
- emsp => chr(8195),
- thinsp => chr(8201),
- zwnj => chr(8204),
- zwj => chr(8205),
- lrm => chr(8206),
- rlm => chr(8207),
- ndash => chr(8211),
- mdash => chr(8212),
- lsquo => chr(8216),
- rsquo => chr(8217),
- sbquo => chr(8218),
- ldquo => chr(8220),
- rdquo => chr(8221),
- bdquo => chr(8222),
- dagger => chr(8224),
- Dagger => chr(8225),
- bull => chr(8226),
- hellip => chr(8230),
- permil => chr(8240),
- prime => chr(8242),
- Prime => chr(8243),
- lsaquo => chr(8249),
- rsaquo => chr(8250),
- oline => chr(8254),
- frasl => chr(8260),
- euro => chr(8364),
- image => chr(8465),
- weierp => chr(8472),
- real => chr(8476),
- trade => chr(8482),
- alefsym => chr(8501),
- larr => chr(8592),
- uarr => chr(8593),
- rarr => chr(8594),
- darr => chr(8595),
- harr => chr(8596),
- crarr => chr(8629),
- lArr => chr(8656),
- uArr => chr(8657),
- rArr => chr(8658),
- dArr => chr(8659),
- hArr => chr(8660),
- forall => chr(8704),
- part => chr(8706),
- exist => chr(8707),
- empty => chr(8709),
- nabla => chr(8711),
- isin => chr(8712),
- notin => chr(8713),
- ni => chr(8715),
- prod => chr(8719),
- sum => chr(8721),
- minus => chr(8722),
- lowast => chr(8727),
- radic => chr(8730),
- prop => chr(8733),
- infin => chr(8734),
- ang => chr(8736),
- 'and' => chr(8743),
- 'or' => chr(8744),
- cap => chr(8745),
- cup => chr(8746),
- 'int' => chr(8747),
- there4 => chr(8756),
- sim => chr(8764),
- cong => chr(8773),
- asymp => chr(8776),
- 'ne' => chr(8800),
- equiv => chr(8801),
- 'le' => chr(8804),
- 'ge' => chr(8805),
- 'sub' => chr(8834),
- sup => chr(8835),
- nsub => chr(8836),
- sube => chr(8838),
- supe => chr(8839),
- oplus => chr(8853),
- otimes => chr(8855),
- perp => chr(8869),
- sdot => chr(8901),
- lceil => chr(8968),
- rceil => chr(8969),
- lfloor => chr(8970),
- rfloor => chr(8971),
- lang => chr(9001),
- rang => chr(9002),
- loz => chr(9674),
- spades => chr(9824),
- clubs => chr(9827),
- hearts => chr(9829),
- diams => chr(9830),
- ) : ())
- );
-
-
- # Make the opposite mapping
- while (my($entity, $char) = each(%entity2char)) {
- $char2entity{$char} = "&$entity;";
- }
- delete $char2entity{"'"}; # only one-way decoding
-
- # Fill in missing entities
- for (0 .. 255) {
- next if exists $char2entity{chr($_)};
- $char2entity{chr($_)} = "$_;";
- }
-
- my %subst; # compiled encoding regexps
-
- sub decode_entities_old
- {
- my $array;
- if (defined wantarray) {
- $array = [@_]; # copy
- } else {
- $array = \@_; # modify in-place
- }
- my $c;
- for (@$array) {
- s/(&\#(\d+);?)/$2 < 256 ? chr($2) : $1/eg;
- s/(&\#[xX]([0-9a-fA-F]+);?)/$c = hex($2); $c < 256 ? chr($c) : $1/eg;
- s/(&(\w+);?)/$entity2char{$2} || $1/eg;
- }
- wantarray ? @$array : $array->[0];
- }
-
- sub encode_entities
- {
- my $ref;
- if (defined wantarray) {
- my $x = $_[0];
- $ref = \$x; # copy
- } else {
- $ref = \$_[0]; # modify in-place
- }
- if (defined $_[1] and length $_[1]) {
- unless (exists $subst{$_[1]}) {
- # Because we can't compile regex we fake it with a cached sub
- my $code = "sub {\$_[0] =~ s/([$_[1]])/\$char2entity{\$1} || num_entity(\$1)/ge; }";
- $subst{$_[1]} = eval $code;
- die( $@ . " while trying to turn range: \"$_[1]\"\n "
- . "into code: $code\n "
- ) if $@;
- }
- &{$subst{$_[1]}}($$ref);
- } else {
- # Encode control chars, high bit chars and '<', '&', '>', '"'
- $$ref =~ s/([^\n\r\t !\#\$%\'-;=?-~])/$char2entity{$1} || num_entity($1)/ge;
- }
- $$ref;
- }
-
- sub encode_entities_numeric {
- local %char2entity;
- return &encode_entities; # a goto &encode_entities wouldn't work
- }
-
-
- sub num_entity {
- sprintf "%X;", ord($_[0]);
- }
-
- # Set up aliases
- *encode = \&encode_entities;
- *encode_numeric = \&encode_entities_numeric;
- *encode_numerically = \&encode_entities_numeric;
- *decode = \&decode_entities;
-
- 1;
-