home *** CD-ROM | disk | FTP | other *** search
/ PC Professionell 2004 December / PCpro_2004_12.ISO / files / webserver / xampp / xampp-perl-addon-1.4.9-installer.exe / EncodingDetect.pm < prev    next >
Encoding:
Perl POD Document  |  2002-02-03  |  6.2 KB  |  188 lines

  1. # $Id: EncodingDetect.pm,v 1.3 2002/02/03 12:25:42 matt Exp $
  2.  
  3. package XML::SAX::PurePerl; # NB, not ::EncodingDetect!
  4.  
  5. use strict;
  6.  
  # encoding_detect($parser, $reader)
  #
  # Sniffs the first few bytes of the input to guess the document encoding,
  # following the signatures below, and tells the reader about it via
  # $reader->set_encoding(...).
  #
  #   00 00 FE FF   UCS-4BE    (byte order mark)
  #   00 00 FF FE   UCS-4-2143 (byte order mark)
  #   00 00 00 3C   UCS-4BE    ("<")
  #   00 00 3C 00   UCS-4-2143 ("<")
  #   00 3C 00 00   UCS-4-3412 ("<")
  #   00 3C 00 3F   UTF-16BE   ("<?")
  #   FF FE 00 00   UCS-4LE    (byte order mark)
  #   FF FE xx yy   UTF-16LE   (byte order mark, then one 16-bit char)
  #   FE FF 00 00   UCS-4-3412 (byte order mark)
  #   FE FF 00 3C   UTF-16BE   (byte order mark, then "<")
  #   EF BB BF      UTF-8      (byte order mark)
  #   3C 00 00 00   UCS-4LE    ("<")
  #   3C 00 3F 00   UTF-16LE   ("<?")
  #   3C ...        "<", "<?", "<?x" or "<?xm": no encoding is set
  #   4C 6F A7 94   EBCDIC
  #
  # Parameters:
  #   $parser - object whose parser_error($message, $reader) is called when a
  #             recognised prefix is followed by an unexpected byte.
  #   $reader - input reader; the methods used here are match_nocheck,
  #             match_nonext, current, next, set_encoding and buffer.
  #
  # Returns nothing useful (bare return).
  #
  # NOTE(review): the code below relies on match_nocheck() consuming the byte
  # when it matches and on match_nonext() leaving the matched byte current;
  # that is why every terminal branch calls set_encoding() *before* next(), so
  # the following character is read under the new encoding. Confirm against the
  # Reader class. Characters that were consumed while sniffing but belong to
  # the document ("<", "<?", ...) are handed back with $reader->buffer(...).
  sub encoding_detect {
      my ($parser, $reader) = @_;

      my $error = "Invalid byte sequence at start of file";

      # BO == Byte Order mark
      if ($reader->match_nocheck("\x00")) {
          # maybe BO-UCS4-be, BO-UCS4-2143, UCS4-be, UCS4-2143, UCS4-3412, UTF-16BE
          if ($reader->match_nocheck("\x00")) {
              # maybe BO-UCS4-be, BO-UCS4-2143, UCS4-be, UCS4-2143
              if ($reader->match_nocheck("\xFE")) {
                  if ($reader->match_nonext("\xFF")) {
                      # BO-UCS4-be
                      $reader->set_encoding("UCS-4BE");
                      $reader->next;
                      return;
                  }
              }
              elsif ($reader->match_nocheck("\xFF")) {
                  if ($reader->match_nonext("\xFE")) {
                      # BO-UCS-4-2143
                      $reader->set_encoding("UCS-4-2143");
                      $reader->next;
                      return;
                  }
              }
              elsif ($reader->match_nocheck("\x00")) {
                  if ($reader->match_nonext("\x3C")) {
                      # UCS4-be, no byte order mark: the four bytes were "<"
                      $reader->set_encoding("UCS-4BE");
                      $reader->next;
                      $reader->buffer('<');
                      return;
                  }
              }
              elsif ($reader->match_nocheck("\x3C")) {
                  if ($reader->match_nonext("\x00")) {
                      # UCS-4-2143, no byte order mark: the four bytes were "<"
                      $reader->set_encoding("UCS-4-2143");
                      $reader->next;
                      $reader->buffer('<');
                      return;
                  }
              }
          }
          elsif ($reader->match_nocheck("\x3C")) {
              # maybe UCS4-3412, UTF-16BE
              if ($reader->match_nocheck("\x00")) {
                  if ($reader->match_nonext("\x00")) {
                      # UCS4-3412
                      $reader->set_encoding("UCS-4-3412");
                      $reader->next;
                      # these are parsable chars
                      $reader->buffer("<");
                      return;
                  }
                  elsif ($reader->match_nonext("\x3F")) {
                      # UTF-16BE
                      $reader->set_encoding("UTF-16BE");
                      # Step past the 0x3F byte that match_nonext only peeked
                      # at, exactly as every other branch does; otherwise that
                      # raw byte is still current and would be seen again after
                      # the "<?" handed back below.
                      $reader->next;
                      # these are parsable chars
                      $reader->buffer("<?");
                      return;
                  }
              }
          }

          # A leading 00 that matches none of the signatures above.
          $parser->parser_error($error, $reader);
      }
      elsif ($reader->match_nocheck("\xFF")) {
          # maybe BO-UCS-4LE, UTF-16LE
          if ($reader->match_nocheck("\xFE")) {
              if ($reader->match_nocheck("\x00")) {
                  if ($reader->match_nonext("\x00")) {
                      $reader->set_encoding("UCS-4LE");
                      $reader->next;
                      return;
                  }
                  # FF FE 00 xx with xx != 00 falls through to the error below.
              }
              else {
                  # UTF-16LE with byte order mark: assemble the first 16-bit
                  # little-endian character by hand from the next two raw
                  # bytes, then hand it back to the reader as a character.
                  my $byte1 = $reader->current;
                  $reader->next;
                  my $char = chr unpack("v", $byte1 . $reader->current);
                  $reader->set_encoding("UTF-16LE");
                  $reader->next;
                  $reader->buffer($char);
                  return;
              }
          }

          $parser->parser_error($error, $reader);
      }
      elsif ($reader->match_nocheck("\xFE")) {
          # maybe BO-UCS-4-3412, UTF-16BE
          if ($reader->match_nocheck("\xFF")) {
              if ($reader->match_nocheck("\x00")) {
                  if ($reader->match_nonext("\x00")) {
                      $reader->set_encoding("UCS-4-3412");
                      $reader->next;
                      return;
                  }
                  elsif ($reader->match_nonext("\x3C")) {
                      # UTF-16BE with byte order mark; only a document whose
                      # first character is "<" is accepted here.
                      $reader->set_encoding("UTF-16BE");
                      $reader->next;
                      $reader->buffer("<");
                      return;
                  }
              }
          }
          $parser->parser_error($error, $reader);
      }
      elsif ($reader->match_nocheck("\xEF")) {
          if ($reader->match_nocheck("\xBB")) {
              if ($reader->match_nonext("\xBF")) {
                  # OK, UTF-8
                  $reader->set_encoding("UTF-8");
                  $reader->next;
                  return;
              }
          }
          $parser->parser_error($error, $reader);
      }
      elsif ($reader->match_nocheck("\x3C")) {
          if ($reader->match_nocheck("\x00")) {
              if ($reader->match_nocheck("\x00")) {
                  if ($reader->match_nonext("\x00")) {
                      # UCS-4LE, no byte order mark
                      $reader->set_encoding("UCS-4LE");
                      $reader->next;
                      $reader->buffer("<");
                      return;
                  }
              }
              elsif ($reader->match_nocheck("\x3F")) {
                  if ($reader->match_nonext("\x00")) {
                      # UTF-16LE, no byte order mark
                      $reader->set_encoding("UTF-16LE");
                      $reader->next;
                      $reader->buffer("<?");
                      return;
                  }
              }
              # 3C 00 followed by anything else: no error is raised, control
              # drops to the final return with those bytes already consumed.
          }
          elsif ($reader->match_nocheck("\x3F")) {
              # "<?..." in single-byte form: no encoding is set, the consumed
              # characters are simply handed back.
              if ($reader->match_nocheck("\x78")) {
                  if ($reader->match_nocheck("\x6D")) {
                      # some 7 or 8 bit charset with ASCII chars in right place
                      $reader->buffer("<?xm");
                      return;
                  }
                  else {
                      $reader->buffer('<?x');
                      return;
                  }
              }
              else {
                  $reader->buffer('<?');
                  return;
              }
          }
          else {
              # assume we have "<tag", and assume UTF-8/ASCII
              $reader->buffer("<");
              return;
          }
      }
      elsif ($reader->match_nocheck("\x4C") &&
              $reader->match_nocheck("\x6F") &&
              $reader->match_nocheck("\xA7") &&
              $reader->match_nonext("\x94"))
      {
          # "<?xm" in EBCDIC. A partial match is not reported as an error and
          # the bytes consumed so far are not handed back.
          $reader->set_encoding("EBCDIC");
          $reader->next;
          return;
      }

      # lets just try parsing it...
      return;

      # $parser->parser_error($error, $reader);
  }
  185.  
  186. 1;
  187.  
  188.