home *** CD-ROM | disk | FTP | other *** search
- package MIME::Words;
-
-
- =head1 NAME
-
- MIME::Words - deal with RFC-1522 encoded words
-
-
- =head1 SYNOPSIS
-
- Before reading further, you should see L<MIME::Tools> to make sure that
- you understand where this module fits into the grand scheme of things.
- Go on, do it now. I'll wait.
-
- Ready? Ok...
-
-
- use MIME::Words qw(:all);
-
- ### Decode the string into another string, forgetting the charsets:
- $decoded = decode_mimewords(
- 'To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= <keld@dkuug.dk>',
- );
-
- ### Split string into array of decoded [DATA,CHARSET] pairs:
- @decoded = decode_mimewords(
- 'To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= <keld@dkuug.dk>',
- );
-
- ### Encode a single unsafe word:
- $encoded = encode_mimeword("\xABFran\xE7ois\xBB");
-
- ### Encode a string, trying to find the unsafe words inside it:
- $encoded = encode_mimewords("Me and \xABFran\xE7ois\xBB in town");
-
-
-
- =head1 DESCRIPTION
-
- Fellow Americans, you probably won't know what the hell this module
- is for. Europeans, Russians, et al, you probably do. C<:-)>.
-
- For example, here's a valid MIME header you might get:
-
- From: =?US-ASCII?Q?Keith_Moore?= <moore@cs.utk.edu>
- To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= <keld@dkuug.dk>
- CC: =?ISO-8859-1?Q?Andr=E9_?= Pirard <PIRARD@vm1.ulg.ac.be>
- Subject: =?ISO-8859-1?B?SWYgeW91IGNhbiByZWFkIHRoaXMgeW8=?=
- =?ISO-8859-2?B?dSB1bmRlcnN0YW5kIHRoZSBleGFtcGxlLg==?=
- =?US-ASCII?Q?.._cool!?=
-
- The fields basically decode to (sorry, I can only approximate the
- Latin characters with 7 bit sequences /o and 'e):
-
- From: Keith Moore <moore@cs.utk.edu>
- To: Keld J/orn Simonsen <keld@dkuug.dk>
- CC: Andr'e Pirard <PIRARD@vm1.ulg.ac.be>
- Subject: If you can read this you understand the example... cool!
-
-
- =head1 PUBLIC INTERFACE
-
- =over 4
-
- =cut
-
- require 5.001;
-
- ### Pragmas:
- use strict;
- use vars qw($VERSION @EXPORT_OK %EXPORT_TAGS @ISA);
-
- ### Exporting:
- use Exporter;
- %EXPORT_TAGS = (all => [qw(decode_mimewords
- encode_mimeword
- encode_mimewords
- )]);
- Exporter::export_ok_tags('all');
-
- ### Inheritance:
- @ISA = qw(Exporter);
-
- ### Other modules:
- use MIME::Base64;
- use MIME::QuotedPrint;
-
-
-
- #------------------------------
- #
- # Globals...
- #
- #------------------------------
-
- ### The package version, both in 1.23 style *and* usable by MakeMaker:
- $VERSION = substr q$Revision: 5.404 $, 10;
-
- ### Nonprintables (controls + x7F + 8bit):
- my $NONPRINT = "\\x00-\\x1F\\x7F-\\xFF";
-
-
- #------------------------------
-
- # _decode_Q STRING
- # Private: used by _decode_header() to decode "Q" encoding, which is
- # almost, but not exactly, quoted-printable. :-P
- sub _decode_Q {
- my $str = shift;
- $str =~ s/_/\x20/g; # RFC-1522, Q rule 2
- $str =~ s/=([\da-fA-F]{2})/pack("C", hex($1))/ge; # RFC-1522, Q rule 1
- $str;
- }
-
- # _encode_Q STRING
- # Private: used by _encode_header() to decode "Q" encoding, which is
- # almost, but not exactly, quoted-printable. :-P
- sub _encode_Q {
- my $str = shift;
- $str =~ s{([_\?\=$NONPRINT])}{sprintf("=%02X", ord($1))}eog;
- $str;
- }
-
- # _decode_B STRING
- # Private: used by _decode_header() to decode "B" encoding.
- sub _decode_B {
- my $str = shift;
- decode_base64($str);
- }
-
- # _encode_B STRING
- # Private: used by _decode_header() to decode "B" encoding.
- sub _encode_B {
- my $str = shift;
- encode_base64($str, '');
- }
-
-
-
- #------------------------------
-
- =item decode_mimewords ENCODED, [OPTS...]
-
- I<Function.>
- Go through the string looking for RFC-1522-style "Q"
- (quoted-printable, sort of) or "B" (base64) encoding, and decode them.
-
- B<In an array context,> splits the ENCODED string into a list of decoded
- C<[DATA, CHARSET]> pairs, and returns that list. Unencoded
- data are returned in a 1-element array C<[DATA]>, giving an effective
- CHARSET of C<undef>.
-
- $enc = '=?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= <keld@dkuug.dk>';
- foreach (decode_mimewords($enc)) {
- print "", ($_[1] || 'US-ASCII'), ": ", $_[0], "\n";
- }
-
- B<In a scalar context,> joins the "data" elements of the above
- list together, and returns that. I<Warning: this is information-lossy,>
- and probably I<not> what you want, but if you know that all charsets
- in the ENCODED string are identical, it might be useful to you.
- (Before you use this, please see L<MIME::WordDecoder/unmime>,
- which is probably what you want.)
-
- In the event of a syntax error, $@ will be set to a description
- of the error, but parsing will continue as best as possible (so as to
- get I<something> back when decoding headers).
- $@ will be false if no error was detected.
-
- Any arguments past the ENCODED string are taken to define a hash of options:
-
- =over 4
-
- =item Field
-
- Name of the mail field this string came from. I<Currently ignored.>
-
- =back
-
- =cut
-
- sub decode_mimewords {
- my $encstr = shift;
- my %params = @_;
- my @tokens;
- $@ = ''; ### error-return
-
- ### Collapse boundaries between adjacent encoded words:
- $encstr =~ s{(\?\=)\r?\n[ \t](\=\?)}{$1$2}gs;
- pos($encstr) = 0;
- ### print STDOUT "ENC = [", $encstr, "]\n";
-
- ### Decode:
- my ($charset, $encoding, $enc, $dec);
- while (1) {
- last if (pos($encstr) >= length($encstr));
- my $pos = pos($encstr); ### save it
-
- ### Case 1: are we looking at "=?..?..?="?
- if ($encstr =~ m{\G # from where we left off..
- =\?([^?]*) # "=?" + charset +
- \?([bq]) # "?" + encoding +
- \?([^?]+) # "?" + data maybe with spcs +
- \?= # "?="
- }xgi) {
- ($charset, $encoding, $enc) = ($1, lc($2), $3);
- $dec = (($encoding eq 'q') ? _decode_Q($enc) : _decode_B($enc));
- push @tokens, [$dec, $charset];
- next;
- }
-
- ### Case 2: are we looking at a bad "=?..." prefix?
- ### We need this to detect problems for case 3, which stops at "=?":
- pos($encstr) = $pos; # reset the pointer.
- if ($encstr =~ m{\G=\?}xg) {
- $@ .= qq|unterminated "=?..?..?=" in "$encstr" (pos $pos)\n|;
- push @tokens, ['=?'];
- next;
- }
-
- ### Case 3: are we looking at ordinary text?
- pos($encstr) = $pos; # reset the pointer.
- if ($encstr =~ m{\G # from where we left off...
- ([\x00-\xFF]*? # shortest possible string,
- \n*) # followed by 0 or more NLs,
- (?=(\Z|=\?)) # terminated by "=?" or EOS
- }xg) {
- length($1) or die "MIME::Words: internal logic err: empty token\n";
- push @tokens, [$1];
- next;
- }
-
- ### Case 4: bug!
- die "MIME::Words: unexpected case:\n($encstr) pos $pos\n\t".
- "Please alert developer.\n";
- }
- return (wantarray ? @tokens : join('',map {$_->[0]} @tokens));
- }
-
- #------------------------------
-
- =item encode_mimeword RAW, [ENCODING], [CHARSET]
-
- I<Function.>
- Encode a single RAW "word" that has unsafe characters.
- The "word" will be encoded in its entirety.
-
- ### Encode "<<Franc,ois>>":
- $encoded = encode_mimeword("\xABFran\xE7ois\xBB");
-
- You may specify the ENCODING (C<"Q"> or C<"B">), which defaults to C<"Q">.
- You may specify the CHARSET, which defaults to C<iso-8859-1>.
-
- =cut
-
- sub encode_mimeword {
- my $word = shift;
- my $encoding = uc(shift || 'Q');
- my $charset = uc(shift || 'ISO-8859-1');
- my $encfunc = (($encoding eq 'Q') ? \&_encode_Q : \&_encode_B);
- "=?$charset?$encoding?" . &$encfunc($word) . "?=";
- }
-
- #------------------------------
-
- =item encode_mimewords RAW, [OPTS]
-
- I<Function.>
- Given a RAW string, try to find and encode all "unsafe" sequences
- of characters:
-
- ### Encode a string with some unsafe "words":
- $encoded = encode_mimewords("Me and \xABFran\xE7ois\xBB");
-
- Returns the encoded string.
- Any arguments past the RAW string are taken to define a hash of options:
-
- =over 4
-
- =item Charset
-
- Encode all unsafe stuff with this charset. Default is 'ISO-8859-1',
- a.k.a. "Latin-1".
-
- =item Encoding
-
- The encoding to use, C<"q"> or C<"b">. The default is C<"q">.
-
- =item Field
-
- Name of the mail field this string will be used in. I<Currently ignored.>
-
- =back
-
- B<Warning:> this is a quick-and-dirty solution, intended for character
- sets which overlap ASCII. B<It does not comply with the RFC-1522
- rules regarding the use of encoded words in message headers>.
- You may want to roll your own variant,
- using C<encoded_mimeword()>, for your application.
- I<Thanks to Jan Kasprzak for reminding me about this problem.>
-
- =cut
-
- sub encode_mimewords {
- my ($rawstr, %params) = @_;
- my $charset = $params{Charset} || 'ISO-8859-1';
- my $encoding = lc($params{Encoding} || 'q');
-
- ### Encode any "words" with unsafe characters.
- ### We limit such words to 18 characters, to guarantee that the
- ### worst-case encoding give us no more than 54 + ~10 < 75 characters
- my $word;
- $rawstr =~ s{([a-zA-Z0-9\x7F-\xFF]{1,18})}{ ### get next "word"
- $word = $1;
- (($word !~ /[$NONPRINT]/o)
- ? $word ### no unsafe chars
- : encode_mimeword($word, $encoding, $charset)); ### has unsafe chars
- }xeg;
- $rawstr;
- }
-
- 1;
- __END__
-
-
- =back
-
- =head1 NOTES
-
- Exports its principle functions by default, in keeping with
- MIME::Base64 and MIME::QuotedPrint.
-
-
- =head1 AUTHOR
-
- Eryq (F<eryq@zeegee.com>), ZeeGee Software Inc (F<http://www.zeegee.com>).
-
- All rights reserved. This program is free software; you can redistribute
- it and/or modify it under the same terms as Perl itself.
-
- Thanks also to...
-
- Kent Boortz For providing the idea, and the baseline
- RFC-1522-decoding code!
- KJJ at PrimeNet For requesting that this be split into
- its own module.
- Stephane Barizien For reporting a nasty bug.
-
-
- =head1 VERSION
-
- $Revision: 5.404 $ $Date: 2000/11/10 16:45:12 $
-
- =cut
-
-
- #------------------------------
- # Execute simple test if run as a script.
- #------------------------------
- {
- package main; no strict;
- eval join('',<main::DATA>) || die "$@ $main::DATA" unless caller();
- }
- 1; # end the module
- __END__
-
-
- ### Pick up other MIME stuff, just in case...
- BEGIN { unshift @INC, ".", "./etc", "./lib" };
- import MIME::Words;
-
- my @encs = (
- '=?US-ASCII?Q?Keith_Moore?= <moore@cs.utk.edu>',
- '=?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= <keld@dkuug.dk>',
- '=?ISO-8859-1?Q?Andr=E9_?= Pirard <PIRARD@vm1.ulg.ac.be>',
- ('=?ISO-8859-1?B?SWYgeW91IGNhbiByZWFkIHRoaXMgeW8=?='.
- '=?ISO-8859-2?B?dSB1bmRlcnN0YW5kIHRoZSBleGFtcGxlLg==?='.
- '=?US-ASCII?Q?.._cool!?='));
- foreach $enc (@encs) {
- $x = decode_mimewords($enc);
- print "DEC: ", $x, "\n";
- }
-
- ### Encode a single unsafe word:
- $encoded = encode_mimeword("\xABFran\xE7ois\xBB");
- print "ENC1: ", $encoded, "\n";
-
- ### Encode a string, trying to find the unsafe words inside it:
- $encoded = encode_mimewords("Me and \xABFran\xE7ois\xBB at the beach");
- print "ENC2: ", $encoded, "\n";
-
- ### Encode "<<Franc,ois>>":
- my $unsafe = <<EOF;
- Me and \xABFran\xE7ois\xBB, down at the beach
- with Dave <dave\@ether.net>
- EOF
- $encoded = encode_mimewords($unsafe);
- print "ENC3: ", $encoded, "\n";
- print "DEC3: ", scalar(decode_mimewords($encoded)), "\n";
-
- ### So we know everything went well...
- exit 0;
-
- #------------------------------
-
-
-