home *** CD-ROM | disk | FTP | other *** search
/ Tools / WinSN5.0Ver.iso / NETSCAP.50 / WIN1998.ZIP / ns / lib / libi18n / kinsokuf.c < prev    next >
Encoding:
C/C++ Source or Header  |  1998-04-08  |  12.8 KB  |  515 lines

  1. /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*-
  2.  *
  3.  * The contents of this file are subject to the Netscape Public License
  4.  * Version 1.0 (the "NPL"); you may not use this file except in
  5.  * compliance with the NPL.  You may obtain a copy of the NPL at
  6.  * http://www.mozilla.org/NPL/
  7.  *
  8.  * Software distributed under the NPL is distributed on an "AS IS" basis,
  9.  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the NPL
  10.  * for the specific language governing rights and limitations under the
  11.  * NPL.
  12.  *
  13.  * The Initial Developer of this code under the NPL is Netscape
  14.  * Communications Corporation.  Portions created by Netscape are
  15.  * Copyright (C) 1998 Netscape Communications Corporation.  All Rights
  16.  * Reserved.
  17.  */
  18. /*    kinsokuf.c    */
  19.  
  20. #include "intlpriv.h"
  21.  
  22. /* The tables are defined in kinsokud.c */
  23. extern const char *ProhibitBegin_SJIS[];
  24. extern const char *ProhibitBegin_EUCJP[];
  25. extern const char *ProhibitBegin_BIG5[];
  26. extern const char *ProhibitBegin_GB[];
  27. extern const char *ProhibitBegin_KSC[];
  28. extern const char *ProhibitBegin_UTF8[];
  29. extern const char *ProhibitBegin_CNS[];
  30.  
  31. extern const char *ProhibitEnd_SJIS[];
  32. extern const char *ProhibitEnd_EUCJP[];
  33. extern const char *ProhibitEnd_BIG5[];
  34. extern const char *ProhibitEnd_GB[];
  35. extern const char *ProhibitEnd_KSC[];
  36. extern const char *ProhibitEnd_UTF8[];
  37. extern const char *ProhibitEnd_CNS[];
  38.  
  39. PUBLIC const char *INTL_NonBreakingSpace(uint16 win_csid)
  40. {
  41.  
  42. #ifdef XP_MAC
  43.         return "\07";        /* 0x07 */
  44. #else
  45.         return "\240";        /* 0xA0 */
  46. #endif
  47.  
  48. }
  49. /*
  50.     INTL_CharClass is used for multibyte to divide character to different type
  51. */
  52. #define IN_BETWEEN(a,b,c)    (((a) <= (b)) && ((b) <= (c)))
  53. PUBLIC int
  54. INTL_CharClass(int charset, unsigned char *pstr)
  55. {
  56.     int    c1, c2, c3;
  57.  
  58.     c1 = *pstr;
  59.  
  60.     switch (charset)
  61.     {
  62.     case CS_SJIS:
  63.         /*
  64.             SEVEN_BIT_CHAR:                    [0x00-0x7F]
  65.             HALFWIDTH_PRONOUNCE_CHAR:        [0xA0-0xE0]
  66.             FULLWIDTH_ASCII_CHAR:            [0x82]        [0x60-0x9A]
  67.                                             [0x83]        [0x9f-0xB6]    ( Really no ASCII but Greek and Cyrillic )
  68.                                             [0x83]        [0xBF-0x8F]    
  69.                                             [0x84]        [0x40-0x60]    
  70.                                             [0x84]        [0x70-0x8F]    
  71.             FULLWIDTH_PRONOUNCE_CHAR:        [0x82]        [0x9F-0xF1]
  72.                                             [0x83]        [0x40-0x96]
  73.                                             [0x81]        [0x5B-0x5D]
  74.             KANJI_CHAR:                        [0x88-0xFC] [xxxxxxxxx] (Except above)
  75.  
  76.             Note:    We count Cyrillic and Greek as FULLWIDTH_ASCII_CHAR
  77.  
  78.         */
  79.         if (c1 < 0x80)
  80.             return  SEVEN_BIT_CHAR;
  81.  
  82.         if (IN_BETWEEN(0xA0, c1, 0xE0))
  83.             return HALFWIDTH_PRONOUNCE_CHAR;
  84.  
  85.         c2 = *(pstr + 1);
  86.  
  87.         switch(c1)
  88.         {
  89.             case 0x81:
  90.                 if(IN_BETWEEN(0x5B, c2, 0x5D)) 
  91.                     return FULLWIDTH_PRONOUNCE_CHAR;
  92.                 break;
  93.  
  94.             case 0x82:
  95.                 if(IN_BETWEEN(0x60, c2, 0x9A))
  96.                        return FULLWIDTH_ASCII_CHAR;
  97.  
  98.                 if(IN_BETWEEN(0x9F, c2, 0xF1))
  99.                     return FULLWIDTH_PRONOUNCE_CHAR;
  100.                 break;
  101.  
  102.             case 0x83:
  103.                 if(IN_BETWEEN(0x9F, c2, 0xB6) || IN_BETWEEN(0xBF, c2, 0xD0))
  104.                        return FULLWIDTH_ASCII_CHAR;
  105.  
  106.                 if(IN_BETWEEN(0x40, c2, 0x96))
  107.                     return FULLWIDTH_PRONOUNCE_CHAR;
  108.                 break;
  109.  
  110.             case 0x84:
  111.                 if(IN_BETWEEN(0x40, c2, 0x8F) || IN_BETWEEN(0xBF, c2, 0xD0)) 
  112.                        return FULLWIDTH_ASCII_CHAR;
  113.                 break;
  114.         }
  115.  
  116.  
  117.         if (IN_BETWEEN(0x88, c1, 0xFC))
  118.             return KANJI_CHAR;
  119.  
  120.         return UNCLASSIFIED_CHAR;
  121.  
  122.     case CS_EUCJP:        /* TO BE TEST ON UNIX */
  123.         /*
  124.             SEVEN_BIT_CHAR:                    [0x00-0x7F]
  125.             HALFWIDTH_PRONOUNCE_CHAR:        [0x8E]
  126.             FULLWIDTH_ASCII_CHAR:            [0xA3]        [0xC1-0xDA]
  127.                                                         [0xE1-0xFA]
  128.                                             [0xA6]        [0xA1-0xB8]
  129.                                                         [0xC1-0xD8]
  130.                                             [0xA7]        [0xA1-0xC1]
  131.                                                         [0xD1-0xF1]
  132.                                             [0x8F]        [0xA6-0xAF]
  133.             FULLWIDTH_PRONOUNCE_CHAR:        [0xA4]        [xxxxxxx]
  134.                                             [0xA5]        [xxxxxxx]
  135.                                             [0x81]        [0x5B-0x5D]
  136.             KANJI_CHAR:                        [0xB0-0xFF] [xxxx]
  137.                                             [0x8F]        [>0xB0]
  138.  
  139.             Note:    We count Cyrillic and Greek as FULLWIDTH_ASCII_CHAR
  140.  
  141.         */
  142.         if (c1 < 0x80)
  143.             return  SEVEN_BIT_CHAR;
  144.  
  145.         c2 = *(pstr + 1);
  146.  
  147.         switch(c1)
  148.         {
  149.             case 0x8E:
  150.                 return HALFWIDTH_PRONOUNCE_CHAR;
  151.  
  152.             case 0x8F:
  153.                 if(IN_BETWEEN(0xA6, c2, 0xAF))
  154.                        return FULLWIDTH_ASCII_CHAR;
  155.                 break;
  156.  
  157.             case 0xA3:
  158.                 if(IN_BETWEEN(0xC1, c2, 0xDA) || IN_BETWEEN(0xE1, c2, 0xFA))
  159.                        return FULLWIDTH_ASCII_CHAR;
  160.                 break;
  161.  
  162.             case 0xA4:     case 0xA5:
  163.                 return FULLWIDTH_PRONOUNCE_CHAR;
  164.  
  165.             case 0xA6:
  166.                 if(IN_BETWEEN(0xA1, c2, 0xB8) || IN_BETWEEN(0xC1, c2, 0xD8))
  167.                        return FULLWIDTH_ASCII_CHAR;
  168.                 break;
  169.  
  170.             case 0xA7:
  171.                 if(IN_BETWEEN(0xA1, c2, 0xC1) || IN_BETWEEN(0xD1, c2, 0xF1))
  172.                        return FULLWIDTH_ASCII_CHAR;
  173.                 break;
  174.         }
  175.  
  176.  
  177.         if( 
  178.             (c1 >= 0xB0) || 
  179.             ((c1 == 0x8F) &&  (c2 > 0xB0))
  180.           )
  181.         {
  182.             return KANJI_CHAR;
  183.         }
  184.  
  185.         return UNCLASSIFIED_CHAR;
  186.  
  187.     case CS_KSC_8BIT:
  188.         /*
  189.             SEVEN_BIT_CHAR:                    [0x00-0x80]
  190.             HALFWIDTH_PRONOUNCE_CHAR:        None
  191.             FULLWIDTH_ASCII_CHAR:            [0xA3]        [0xC1-0xDA]
  192.                                                         [0xE1-0xFA]
  193.                                             [0xA5]        [0xC1-0xD8]
  194.                                                         [0xE1-0xF8]
  195.                                             [0xAC]        [0xA1-0xC2]
  196.                                                         [0xD1-0xF2]
  197.             FULLWIDTH_PRONOUNCE_CHAR:        [0xA4]        [0xA1-0xFE]
  198.                                             [0xB0-0xC8]    [xxxxxxxxx]
  199.             KANJI_CHAR:                        [0xCA-0xFD] [xxxxxxxxx]
  200.  
  201.             Note:     We didn't handle Hiragana and Katakana here
  202.                      We count Cyrillic and Greek as FULLWIDTH_ASCII_CHAR
  203.  
  204.         */        
  205.         if (c1 < 0x80)
  206.             return  SEVEN_BIT_CHAR;
  207.  
  208.         c2 = *(pstr + 1);
  209.         if (
  210.             ((c1== 0xA3) && (IN_BETWEEN(0xC1, c2, 0xDA) || IN_BETWEEN(0xE1, c2, 0xFA))) ||
  211.             ((c1== 0xA5) && (IN_BETWEEN(0xC1, c2, 0xD8) || IN_BETWEEN(0xE1, c2, 0xF8))) ||
  212.             ((c1== 0xAC) && (IN_BETWEEN(0xA1, c2, 0xC2) || IN_BETWEEN(0xD1, c2, 0xF2))) 
  213.            )
  214.         {
  215.            return FULLWIDTH_ASCII_CHAR;
  216.         }
  217.  
  218.         if (
  219.             ((c1== 0xA4) &&             (IN_BETWEEN(0xA1, c2, 0xFE))) ||
  220.              (IN_BETWEEN(0xB0, c1, 0xC8))
  221.            )
  222.         {
  223.             return FULLWIDTH_PRONOUNCE_CHAR;
  224.         }
  225.  
  226.         if (IN_BETWEEN(0xCA, c1, 0xFD))
  227.             return KANJI_CHAR;
  228.  
  229.         return UNCLASSIFIED_CHAR;
  230.  
  231.     case CS_GB_8BIT:
  232.          /*
  233.             SEVEN_BIT_CHAR:                        [0x00-0x7F]
  234.             HALFWIDTH_PRONOUNCE_CHAR:            
  235.             FULLWIDTH_ASCII_CHAR:                [0xA3]        [0xC1-0xDA]
  236.                                                             [0xE1-0xFA]
  237.                                                 [0xA6]        [0xA1-0xB8]        Greek
  238.                                                             [0xC1-0xD8]
  239.                                                 [0xA7]        [0xA1-0xC1]        Cyrillic
  240.                                                             [0xD1-0xF1]        
  241.                                                 [0xA8]        [0xA1-0xBA]        European
  242.             FULLWIDTH_PRONOUNCE_CHAR:            [0xA4,0xA5,0xA8] [xxxx]
  243.             KANJI_CHAR:
  244.         */       
  245.         if (c1 < 0x80)
  246.             return  SEVEN_BIT_CHAR;
  247.  
  248.         c2 = *(pstr + 1);
  249.         if (
  250.             ((c1== 0xA3) && (IN_BETWEEN(0xC1, c2, 0xDA) || IN_BETWEEN(0xE1, c2, 0xFA))) ||
  251.             ((c1== 0xA6) && (IN_BETWEEN(0xA1, c2, 0xB8) || IN_BETWEEN(0xC1, c2, 0xD8))) ||
  252.             ((c1== 0xA7) && (IN_BETWEEN(0xA1, c2, 0xC1) || IN_BETWEEN(0xD1, c2, 0xF1))) ||
  253.             ((c1== 0xA8) && (IN_BETWEEN(0xA1, c2, 0xBA)) ) 
  254.            )
  255.         { 
  256.             return FULLWIDTH_ASCII_CHAR;
  257.         }
  258.  
  259.         if ((c1 == 0xA4) || (c1 == 0xA5) || (c1 == 0xA8)) 
  260.             return FULLWIDTH_PRONOUNCE_CHAR;
  261.  
  262.         if (IN_BETWEEN(0xB0, c1, 0xF7))
  263.             return KANJI_CHAR;
  264.  
  265.         return UNCLASSIFIED_CHAR;
  266.  
  267.     case CS_BIG5:
  268.         /*
  269.             SEVEN_BIT_CHAR:                [0x00-0x7F]
  270.             HALFWIDTH_PRONOUNCE_CHAR:    
  271.             FULLWIDTH_ASCII_CHAR:        [0xA2]        [0xCF-0xFF]
  272.                                         [0xA3]        [0x40-0x73]
  273.             FULLWIDTH_PRONOUNCE_CHAR:    [0xA3]        [0x74-0x7E]
  274.                                                     [0xA1-0xBF]
  275.             KANJI_CHAR:                    [0xA4-0xFF]    [xxxxxxxxx]
  276.         */        
  277.         if (c1 < 0x80)
  278.             return  SEVEN_BIT_CHAR;
  279.  
  280.         c2 = *(pstr + 1);
  281.  
  282.         switch(c1)
  283.         {
  284.             case 0xA2:
  285.                 if (IN_BETWEEN(0xCF, c2, 0xFF))
  286.                     return FULLWIDTH_ASCII_CHAR;
  287.                 break;
  288.  
  289.             case 0xA3:
  290.                 if (IN_BETWEEN(0x74, c2, 0x7E) || IN_BETWEEN(0xA1, c2, 0xBF))
  291.                     return FULLWIDTH_PRONOUNCE_CHAR;
  292.  
  293.                 if (IN_BETWEEN(0x40, c2, 0x73)) 
  294.                     return FULLWIDTH_ASCII_CHAR;
  295.  
  296.                 break;
  297.         }
  298.  
  299.         if (c1 >= 0xA4)
  300.             return KANJI_CHAR;
  301.  
  302.         return UNCLASSIFIED_CHAR;
  303.  
  304.     case CS_CNS_8BIT:        /* TO BE TEST ON UNIX */
  305.         /*
  306.             SEVEN_BIT_CHAR:                [0x00-0x7F]
  307.             HALFWIDTH_PRONOUNCE_CHAR:    
  308.             FULLWIDTH_ASCII_CHAR:        [0xA4]        [0xC1-0xFE]    
  309.                                         [0xA5]        [0xA1-0xC6]
  310.             FULLWIDTH_PRONOUNCE_CHAR:    [0xA5]        [0xC7-0xF0]        
  311.             KANJI_CHAR:                    [0xC4-0xFF]    [xxxxxxxxx]        
  312.                                         [0x8E]
  313.         */        
  314.         if (c1 < 0x80)
  315.             return  SEVEN_BIT_CHAR;
  316.  
  317.         c2 = *(pstr + 1);
  318.  
  319.         switch(c1)
  320.         {
  321.             case 0xA4:
  322.                 if(IN_BETWEEN(0xC1, c2, 0xFE))
  323.                     return FULLWIDTH_ASCII_CHAR;
  324.                 break;
  325.  
  326.             case 0xA5:
  327.                 if(IN_BETWEEN(0xC7, c2, 0xF0)) 
  328.                     return FULLWIDTH_PRONOUNCE_CHAR;
  329.  
  330.                 if(IN_BETWEEN(0xA1, c2, 0xC6))
  331.                     return FULLWIDTH_ASCII_CHAR;
  332.                 break;
  333.         }
  334.  
  335.         if (IN_BETWEEN(0xC4, c1, 0x8E))
  336.             return KANJI_CHAR;
  337.  
  338.         return UNCLASSIFIED_CHAR;  
  339.  
  340.     case CS_UTF8:
  341.         /*
  342.             SEVEN_BIT_CHAR:
  343.             
  344.             FULLWIDTH_ASCII_CHAR:
  345.                 U+0000 - U+10FF
  346.                                     [C0-E0] [xxxx]                Done
  347.                                     [E1]    [80-83]    [xxxx]        Done
  348.                 U+1E00 - U+1FFF
  349.                                     [E1]    [B8-BF]                Done
  350.                 U+FF21 - U+FF3A
  351.                                     [EF]    [BC]    [A1-BA]        Done
  352.                 U+FF41 - U+FF5A
  353.                                     [EF]    [BD]    [81-9A]        Done
  354.                                     
  355.             FULLWIDTH_PRONOUNCE_CHAR:
  356.                 U+1100 - U+11FF
  357.                                     [E1]    [84-87]                Done
  358.                 U+3040 - U+318F
  359.                                     [E3]    [81-85] [xx]        Done
  360.                                     [E3]    [86]    [80-8F]        Done
  361.                 U+FF66 - U+FFDC
  362.                                     [EF]    [BD]    [AC-]
  363.                                     [EF]    [BE]    
  364.                                     [EF]    [BF]    [-9C]
  365.                 U+AC00 - U+D7FF
  366.                                     [EA]    [B0-]                Done
  367.                                     [EB-EC]    [xxx]                Done
  368.                                     [ED]    [-9F]                Done
  369.  
  370.             KANJI_CHAR:
  371.                 U+4E00 - U+9FFF
  372.                                     [E4]    [B8-]                Done
  373.                                     [E5-E9] [xx]                Done
  374.         */        
  375.         if (c1 < 0x80)
  376.             return  SEVEN_BIT_CHAR;
  377.         
  378.         if (IN_BETWEEN(0xC0, c1, 0xE0))
  379.         {
  380.             return FULLWIDTH_ASCII_CHAR;
  381.         }
  382.  
  383.         c2 = *(pstr + 1);
  384.  
  385.         switch(c1)
  386.         {
  387.             case 0xE1:
  388.                 if (IN_BETWEEN(0x80, c2, 0x83) || IN_BETWEEN(0xB8, c2, 0xBF))
  389.                     return FULLWIDTH_ASCII_CHAR;
  390.                 if (IN_BETWEEN(0x84, c2, 0x87))
  391.                     return FULLWIDTH_PRONOUNCE_CHAR;
  392.  
  393.                 break;
  394.  
  395.             case 0xE3:
  396.                 if (IN_BETWEEN(0x81, c2, 0x85))
  397.                     return FULLWIDTH_PRONOUNCE_CHAR;
  398.  
  399.                 if (c2 == 0x86)
  400.                 {
  401.                     c3 = *(pstr + 2);
  402.                     if (IN_BETWEEN(0x80, c3, 0x8F))
  403.                         return FULLWIDTH_PRONOUNCE_CHAR;
  404.                 }
  405.                 
  406.                 break;
  407.  
  408.             case 0xE4:
  409.                 if (c2 >= 0xB8)
  410.                     return KANJI_CHAR;
  411.                 break;
  412.  
  413.             case 0xE5: case 0xE6: case 0xE7: case 0xE8: case 0xE9:
  414.                 return KANJI_CHAR;
  415.                 break;
  416.  
  417.             case 0xEA:
  418.                 if (c2 >= 0xB0)
  419.                     return FULLWIDTH_PRONOUNCE_CHAR;
  420.                 break;
  421.  
  422.             case 0xEB: case 0xEC:
  423.                 return FULLWIDTH_PRONOUNCE_CHAR;
  424.                 break;
  425.  
  426.             case 0xED:
  427.                 if (c2 <= 0x9F)
  428.                     return FULLWIDTH_PRONOUNCE_CHAR;
  429.                 break;
  430.  
  431.             case 0xEF:
  432.                 c3 = *(pstr + 2);
  433.                 switch(c2)
  434.                 {
  435.                     case 0xBC:
  436.                         if (IN_BETWEEN(0xA1, c3, 0xBA))
  437.                             return FULLWIDTH_ASCII_CHAR;
  438.                         break;
  439.  
  440.                     case 0xBD:
  441.                         if (IN_BETWEEN(0x81, c3, 0x9A))
  442.                             return FULLWIDTH_ASCII_CHAR;
  443.                         if (c3 >= 0xAC)
  444.                                 return FULLWIDTH_PRONOUNCE_CHAR;
  445.                         break;
  446.  
  447.                     case 0xBE:
  448.                         return FULLWIDTH_PRONOUNCE_CHAR;
  449.                         break;
  450.  
  451.                     case 0xBF:
  452.                         if (c3 <= 0x9C)
  453.                                 return FULLWIDTH_PRONOUNCE_CHAR;
  454.                         break;
  455.                 }
  456.                 break;
  457.         }
  458.  
  459.         return UNCLASSIFIED_CHAR;  
  460.     default:
  461.         break;
  462.     }
  463.  
  464.     return UNCLASSIFIED_CHAR;
  465. }
  466.  
  467. #define IF_A_IN_ARRAY_B_THEN_RETURN_C(a,b,c)    \
  468.     {    \
  469.         int j;    \
  470.         for (j = 0; (b)[j][0]; j++)    \
  471.             if (XP_STRNCMP((char *)a, (b)[j], XP_STRLEN((b)[j])) == 0)    \
  472.                 return  (c);    \
  473.     }
  474.  
  475. #define IF_PROHIBIT_CLASS_THEN_RETURN(a,ba,ea)    \
  476.     { \
  477.         IF_A_IN_ARRAY_B_THEN_RETURN_C(a,ba,PROHIBIT_BEGIN_OF_LINE);    \
  478.         IF_A_IN_ARRAY_B_THEN_RETURN_C(a,ea,PROHIBIT_END_OF_LINE);    \
  479.     }
  480.  
  481. PUBLIC int INTL_KinsokuClass(int16 win_csid, unsigned char *pstr)
  482. {
  483.     switch (win_csid)
  484.     {
  485.     case CS_SJIS:
  486.         IF_PROHIBIT_CLASS_THEN_RETURN(pstr,ProhibitBegin_SJIS,ProhibitEnd_SJIS);
  487.         break;
  488.     case CS_EUCJP:
  489.         IF_PROHIBIT_CLASS_THEN_RETURN(pstr,ProhibitBegin_EUCJP,ProhibitEnd_EUCJP);
  490.         break;
  491.     case CS_GB_8BIT:
  492.         IF_PROHIBIT_CLASS_THEN_RETURN(pstr,ProhibitBegin_GB,ProhibitEnd_GB);
  493.         break;
  494.     case CS_BIG5:
  495.         IF_PROHIBIT_CLASS_THEN_RETURN(pstr,ProhibitBegin_BIG5,ProhibitEnd_BIG5);
  496.         break;
  497.     case CS_CNS_8BIT:
  498.         IF_PROHIBIT_CLASS_THEN_RETURN(pstr,ProhibitBegin_CNS,ProhibitEnd_CNS);
  499.         break;
  500.     case CS_KSC_8BIT:
  501.         IF_PROHIBIT_CLASS_THEN_RETURN(pstr,ProhibitBegin_KSC,ProhibitEnd_KSC);
  502.         break;
  503.     case CS_UTF8:
  504.         IF_PROHIBIT_CLASS_THEN_RETURN(pstr,ProhibitBegin_UTF8,ProhibitEnd_UTF8);
  505.         if( *pstr  <= 0xE2)     /* UCS2 < 0x2000 */
  506.             return PROHIBIT_WORD_BREAK;
  507.         break;
  508.     }
  509.  
  510.     return  PROHIBIT_NOWHERE;
  511. }
  512.  
  513.  
  514.  
  515.