home *** CD-ROM | disk | FTP | other *** search
/ AmigActive 6 / AACD06.ISO / AACD / Programming / ICU / src / icu / source / i18n / txtbdat.h < prev    next >
Encoding:
C/C++ Source or Header  |  1999-08-16  |  15.5 KB  |  401 lines

  1. /*
  2. *****************************************************************************************
  3. *                                                                                       *
  4. * COPYRIGHT:                                                                            *
  5. *   (C) Copyright Taligent, Inc.,  1997                                                 *
  6. *   (C) Copyright International Business Machines Corporation,  1997-1999                    *
  7. *   Licensed Material - Program-Property of IBM - All Rights Reserved.                  *
  8. *   US Government Users Restricted Rights - Use, duplication, or disclosure             *
  9. *   restricted by GSA ADP Schedule Contract with IBM Corp.                              *
  10. *                                                                                       *
  11. *****************************************************************************************
  12. *
  13. * File TXTBDAT.H
  14. *
  15. * Modification History:
  16. *
  17. *   Date        Name        Description
  18. *   02/18/97    aliu        Converted from OpenClass.
  19. *                           Made static data members const where appropriate.
  20. *   03/25/97    aliu        Removed subclasses, and merged their static data into this
  21. *                           class.  Instantiated four static instances for character,
  22. *                           word, sentence, and line.  Made forward(), backward(), and
  23. *                           map() methods inline.
  24. *   04/15/97    aliu        Worked around bug in AIX xlC compiler which occurs if static
  25. *                           arrays contain const elements.
  26. *   05/06/97    aliu        Made kSI, kStop, and kSI_Stop into #defines to help out
  27. *                           non-compliant compilers.
  28. *****************************************************************************************
  29. */
  30.  
  31. #ifndef TXTBDAT_H
  32. #define TXTBDAT_H
  33.  
  34. #include "utypes.h"
  35. class WordBreakTable;
  36. class UnicodeClassMapping;
  37. class SpecialMapping;
  38.  
  39. /**
  40.  * This class wraps up the data tables needed for SimpleTextBoundary.
  41.  * It is statically instantiated for each type of text boundary.  This
  42.  * class is not designed to be subclassed.
  43.  */
  44. class TextBoundaryData {
  45. public:
  46.     ~TextBoundaryData() {} // Do not subclass
  47.  
  48.     // Fast inline accessors
  49.     const WordBreakTable* forward(void) const;
  50.     const WordBreakTable* backward(void) const;
  51.     const UnicodeClassMapping* map(void) const;
  52.  
  53.     static const TextBoundaryData kCharacterBreakData;
  54.     static const TextBoundaryData kWordBreakData;
  55.     static const TextBoundaryData kLineBreakData;
  56.     static const TextBoundaryData kSentenceBreakData;
  57.  
  58.     typedef uint8_t Node;
  59.     typedef uint8_t Type;
  60.  
  61. private:
  62.     static const UChar ASCII_END_OF_TEXT;
  63.     static const UChar ASCII_HORIZONTAL_TABULATION;
  64.     static const UChar ASCII_LINEFEED;
  65.     static const UChar ASCII_VERTICAL_TABULATION;
  66.     static const UChar ASCII_FORM_FEED;
  67.     static const UChar ASCII_CARRIAGE_RETURN;
  68.     static const UChar ASCII_SPACE;
  69.     static const UChar ASCII_EXCLAMATION_MARK;
  70.     static const UChar ASCII_QUOTATION_MARK;
  71.     static const UChar ASCII_NUMBER_SIGN;
  72.     static const UChar ASCII_DOLLAR_SIGN;
  73.     static const UChar ASCII_PERCENT;
  74.     static const UChar ASCII_AMPERSAND;
  75.     static const UChar ASCII_APOSTROPHE;
  76.     static const UChar ASCII_COMMA;
  77.     static const UChar ASCII_FULL_STOP;
  78.     static const UChar ASCII_COLON;
  79.     static const UChar ASCII_SEMICOLON;
  80.     static const UChar ASCII_QUESTION_MARK;
  81.     static const UChar ASCII_NONBREAKING_SPACE;
  82.     static const UChar ASCII_CENT_SIGN;
  83.     static const UChar ASCII_POUND_SIGN;
  84.     static const UChar ASCII_YEN_SIGN;
  85.     static const UChar LATIN1_SOFTHYPHEN;
  86.     static const UChar LATIN1_DEGREE_SIGN;
  87.     static const UChar ARABIC_PERCENT_SIGN;
  88.     static const UChar ARABIC_DECIMAL_SEPARATOR;
  89.     static const UChar HANGUL_CHOSEONG_LOW;
  90.     static const UChar HANGUL_CHOSEONG_HIGH;
  91.     static const UChar HANGUL_JUNGSEONG_LOW;
  92.     static const UChar HANGUL_JUNGSEONG_HIGH;
  93.     static const UChar HANGUL_JONGSEONG_LOW;
  94.     static const UChar HANGUL_JONGSEONG_HIGH;
  95.     static const UChar FIGURE_SPACE;
  96.     static const UChar NONBREAKING_HYPHEN;
  97.     static const UChar PUNCTUATION_HYPHENATION_POINT;
  98.     static const UChar PUNCTUATION_LINE_SEPARATOR;
  99.     static const UChar PUNCTUATION_PARAGRAPH_SEPARATOR;
  100.     static const UChar PER_MILLE_SIGN;
  101.     static const UChar PER_TEN_THOUSAND_SIGN;
  102.     static const UChar PRIME;
  103.     static const UChar DOUBLE_PRIME;
  104.     static const UChar TRIPLE_PRIME;
  105.     static const UChar DEGREE_CELSIUS;
  106.     static const UChar DEGREE_FAHRENHEIT;
  107.     static const UChar PUNCTUATION_IDEOGRAPHIC_COMMA;
  108.     static const UChar PUNCTUATION_IDEOGRAPHIC_FULL_STOP; 
  109.     static const UChar IDEOGRAPHIC_ITERATION_MARK;
  110.     static const UChar HIRAGANA_LETTER_SMALL_A;
  111.     static const UChar HIRAGANA_LETTER_A;
  112.     static const UChar HIRAGANA_LETTER_SMALL_I;
  113.     static const UChar HIRAGANA_LETTER_I;
  114.     static const UChar HIRAGANA_LETTER_SMALL_U;
  115.     static const UChar HIRAGANA_LETTER_U;
  116.     static const UChar HIRAGANA_LETTER_SMALL_E;
  117.     static const UChar HIRAGANA_LETTER_E;
  118.     static const UChar HIRAGANA_LETTER_SMALL_O;
  119.     static const UChar HIRAGANA_LETTER_O;
  120.     static const UChar HIRAGANA_LETTER_DI;
  121.     static const UChar HIRAGANA_LETTER_SMALL_TU;
  122.     static const UChar HIRAGANA_LETTER_TU;
  123.     static const UChar HIRAGANA_LETTER_MO;
  124.     static const UChar HIRAGANA_LETTER_SMALL_YA;
  125.     static const UChar HIRAGANA_LETTER_YA;
  126.     static const UChar HIRAGANA_LETTER_SMALL_YU;
  127.     static const UChar HIRAGANA_LETTER_YU;
  128.     static const UChar HIRAGANA_LETTER_SMALL_YO;
  129.     static const UChar HIRAGANA_LETTER_YO;
  130.     static const UChar HIRAGANA_LETTER_RO;
  131.     static const UChar HIRAGANA_LETTER_SMALL_WA;
  132.     static const UChar HIRAGANA_LETTER_WA;
  133.     static const UChar HIRAGANA_LETTER_VU;
  134.     static const UChar COMBINING_KATAKANA_HIRAGANA_VOICED_SOUND_MARK;
  135.     static const UChar HIRAGANA_SEMIVOICED_SOUND_MARK;
  136.     static const UChar HIRAGANA_ITERATION_MARK;
  137.     static const UChar HIRAGANA_VOICED_ITERATION_MARK;
  138.     static const UChar KATAKANA_LETTER_SMALL_A;
  139.     static const UChar KATAKANA_LETTER_A;
  140.     static const UChar KATAKANA_LETTER_SMALL_I;
  141.     static const UChar KATAKANA_LETTER_I;
  142.     static const UChar KATAKANA_LETTER_SMALL_U;
  143.     static const UChar KATAKANA_LETTER_U;
  144.     static const UChar KATAKANA_LETTER_SMALL_E;
  145.     static const UChar KATAKANA_LETTER_E;
  146.     static const UChar KATAKANA_LETTER_SMALL_O;
  147.     static const UChar KATAKANA_LETTER_O;
  148.     static const UChar KATAKANA_LETTER_DI;
  149.     static const UChar KATAKANA_LETTER_SMALL_TU;
  150.     static const UChar KATAKANA_LETTER_TU;
  151.     static const UChar KATAKANA_LETTER_MO;
  152.     static const UChar KATAKANA_LETTER_SMALL_YA;
  153.     static const UChar KATAKANA_LETTER_YA;
  154.     static const UChar KATAKANA_LETTER_SMALL_YU;
  155.     static const UChar KATAKANA_LETTER_YU;
  156.     static const UChar KATAKANA_LETTER_SMALL_YO;
  157.     static const UChar KATAKANA_LETTER_YO;
  158.     static const UChar KATAKANA_LETTER_RO;
  159.     static const UChar KATAKANA_LETTER_SMALL_WA;
  160.     static const UChar KATAKANA_LETTER_WA;
  161.     static const UChar KATAKANA_LETTER_VU;
  162.     static const UChar KATAKANA_LETTER_SMALL_KA;
  163.     static const UChar KATAKANA_LETTER_SMALL_KE;
  164.     static const UChar KATAKANA_LETTER_VA;
  165.     static const UChar KATAKANA_LETTER_VO;
  166.     static const UChar KATAKANA_HIRAGANA_PROLONGED_SOUND_MARK;
  167.     static const UChar KATAKANA_ITERATION_MARK;
  168.     static const UChar KATAKANA_VOICED_ITERATION_MARK;
  169.     static const UChar UNICODE_LOW_BOUND_HAN;
  170.     static const UChar UNICODE_HIGH_BOUND_HAN;
  171.     static const UChar HANGUL_SYL_LOW;
  172.     static const UChar HANGUL_SYL_HIGH;
  173.     static const UChar CJK_COMPATIBILITY_F900;
  174.     static const UChar CJK_COMPATIBILITY_FA2D;
  175.     static const UChar UNICODE_ZERO_WIDTH_NON_BREAKING_SPACE;
  176.     static const UChar FULLWIDTH_EXCLAMATION_MARK;
  177.     static const UChar FULLWIDTH_FULL_STOP;
  178.     static const UChar FULLWIDTH_QUESTION_MARK;
  179.     static const UChar END_OF_STRING;
  180.  
  181. private:
  182.     // Character data
  183.     enum CharacterMapping
  184.     {
  185.         // These enum values must occur in this order; do not
  186.         // modify unless you know what you are doing!  The forward
  187.         // and backward data tables are indexed by these enums.
  188.         kAccent_diacritic   = 0,
  189.         kBaseForm           = 1,
  190.         kBaseCR             = 2,
  191.         kBaseLF             = 3,
  192.         kChoseong           = 4,   // Korean initial consonant
  193.         kJungseong          = 5,  // Korean vowel
  194.         kJongseong          = 6,  // Korean final consonant
  195.         kEOS                = 7,
  196.         kCharacterCol_count = 8
  197.     };
  198.  
  199.     static Node                     kCharacterForwardData[];
  200.     static const int32_t            kCharacterForwardData_length;
  201.     static WordBreakTable*          kCharacterForward;
  202.     static Node                     kCharacterBackwardData[];
  203.     static const int32_t            kCharacterBackwardData_length;
  204.     static WordBreakTable*          kCharacterBackward;
  205.     static Type                     kCharacterRawMapping[];
  206.     static const int32_t            kCharacterRawMapping_length;
  207.     static SpecialMapping           kCharacterExceptionChar[];
  208.     static const int32_t            kCharacterExceptionChar_length;
  209.     static const bool_t             kCharacterExceptionFlags[];
  210.     static UnicodeClassMapping*     kCharacterMap;
  211.     static Type                     kCharacterAsciiValues[];
  212.  
  213. private:
  214.     // Word data
  215.     enum WordMapping
  216.     {
  217.         // These enum values must occur in this order; do not
  218.         // modify unless you know what you are doing!  The forward
  219.         // and backward data tables are indexed by these enums.
  220.         kBreak          = 0,
  221.         kLetter         = 1,
  222.         kNumber         = 2,
  223.         kMidLetter      = 3,
  224.         kMidLetNum      = 4,
  225.         kPreNum         = 5,
  226.         kPostNum        = 6,
  227.         kMidNum         = 7,
  228.         kPreMidNum      = 8,
  229.         kBlank          = 9,
  230.         kLF             = 10,
  231.         kKata           = 11,
  232.         kHira           = 12,
  233.         kKanji          = 13,
  234.         kDiacrit        = 14,
  235.         kCR             = 15,
  236.         kNsm            = 16,
  237.         kwEOS           = 17,
  238.         kWordCol_count  = 18
  239.     };
  240.  
  241.     static Node                     kWordForwardData[];
  242.     static const int32_t            kWordForwardData_length;
  243.     static WordBreakTable*          kWordForward;
  244.     static Node                     kWordBackwardData[];
  245.     static const int32_t            kWordBackwardData_length;
  246.     static WordBreakTable*          kWordBackward;
  247.     static Type                     kWordRawMapping[];
  248.     static const int32_t            kWordRawMapping_length;
  249.     static SpecialMapping           kWordExceptionChar[];
  250.     static const int32_t            kWordExceptionChar_length;
  251.     static UnicodeClassMapping*     kWordMap;
  252.     static Type                     kWordAsciiValues[];
  253.     static const bool_t             kWordExceptionFlags[];
  254.  
  255. private:
  256.     // Sentence data
  257.     enum SentenceMapping
  258.     {
  259.         // These enum values must occur in this order; do not
  260.         // modify unless you know what you are doing!  The forward
  261.         // and backward data tables are indexed by these enums.
  262.         kOther              = 0,
  263.         kSpace              = 1,
  264.         kTerminator         = 2,
  265.         kAmbiguousTerm      = 3,
  266.         kOpenBracket        = 4,
  267.         kCloseBracket       = 5,
  268.         kCJK                = 6,
  269.         kParagraphBreak     = 7,
  270.         kLowerCase          = 8,
  271.         kUpperCase          = 9,
  272.         ksNumber            = 10,
  273.         kQuote              = 11,
  274.         //ksCR,
  275.         ksNsm               = 12,
  276.         ksEOS               = 13,
  277.         kSentenceCol_count  = 14
  278.     };
  279.  
  280.     static Node                     kSentenceForwardData[];
  281.     static const int32_t            kSentenceForwardData_length;
  282.     static WordBreakTable*          kSentenceForward;
  283.     static Node                     kSentenceBackwardData[];
  284.     static const int32_t            kSentenceBackwardData_length;
  285.     static WordBreakTable*          kSentenceBackward;
  286.     static Type                     kSentenceRawMapping[];
  287.     static const int32_t            kSentenceRawMapping_length;
  288.     static SpecialMapping           kSentenceExceptionChar[];
  289.     static const int32_t            kSentenceExceptionChar_length;
  290.     static UnicodeClassMapping*     kSentenceMap;
  291.     static Type                     kSentenceAsciiValues[];
  292.     static const bool_t             kSentenceExceptionFlags[];
  293.  
  294. private:
  295.     // Line data
  296.     enum LineMapping
  297.     {
  298.         // These enum values must occur in this order; do not
  299.         // modify unless you know what you are doing!  The forward
  300.         // and backward data tables are indexed by these enums.
  301.         kLineBreak,
  302.         //always breaks (must be present as first item)
  303.         kLineBlank,
  304.         //spaces, tabs, nulls.
  305.         kLineCR,
  306.         //carriage return
  307.         kLineNonBlank,
  308.         //everything not included elsewhere
  309.         kLineOp,
  310.         //hyphens....
  311.         kLineJwrd,
  312.         //hiragana, katakana, and kanji
  313.         kLinePreJwrd,
  314.         //characters that bind to the beginning of a Japanese word
  315.         kLinePostJwrd,
  316.         //characters that bind to the end of a Japanese word
  317.         kLineDigit,
  318.         //digits
  319.         kLineNumPunct,
  320.         //punctuation that can appear within a number
  321.         kLineCurrency,
  322.         //currency symbols that can precede a number
  323.         kLineNsm,
  324.         // non-spacing marks
  325.         kLineNbsp,
  326.         // non-breaking characters
  327.         kLineEOS,
  328.         kLineCol_count
  329.     };
  330.  
  331.     static Node                     kLineForwardData[];
  332.     static const int32_t            kLineForwardData_length;
  333.     static WordBreakTable*          kLineForward;
  334.     static Node                     kLineBackwardData[];
  335.     static const int32_t            kLineBackwardData_length;
  336.     static WordBreakTable*          kLineBackward;
  337.     static Type                     kLineRawMapping[];
  338.     static const int32_t            kLineRawMapping_length;
  339.     static SpecialMapping           kLineExceptionChar[];
  340.     static const int32_t            kLineExceptionChar_length;
  341.     static const bool_t             kLineExceptionFlags[];
  342.     static UnicodeClassMapping*     kLineMap;
  343.     static Type                     kLineAsciiValues[];
  344.  
  345. protected:
  346.     /**
  347.      * Copy constructor and assignment operator provided to make
  348.      * compiler happy only. DO NOT CALL.
  349.      */
  350.     TextBoundaryData(const TextBoundaryData&) {}
  351.     TextBoundaryData& operator=(const TextBoundaryData&) { return *this; }
  352.     TextBoundaryData() {} // Do not subclass
  353.     TextBoundaryData(const WordBreakTable* forward,
  354.                      const WordBreakTable* backward,
  355.                      const UnicodeClassMapping* map)
  356.                      : fForward(forward), fBackward(backward), fMap(map) {}
  357.         
  358. private:
  359.     const WordBreakTable*       fForward;
  360.     const WordBreakTable*       fBackward;
  361.     const UnicodeClassMapping*  fMap;
  362. };
  363.  
  364. inline const WordBreakTable* TextBoundaryData::forward() const
  365. {
  366.     return fForward;
  367. }
  368.  
  369. inline const WordBreakTable* TextBoundaryData::backward() const
  370. {
  371.     return fBackward;
  372. }
  373.  
  374. inline const UnicodeClassMapping* TextBoundaryData::map() const
  375. {
  376.     return fMap;
  377. }
  378.  
  379. // These used to be static consts in the class, but some compilers didn't like that.
  380. #define kStop       (0)
  381. #define kSI         (0x80)
  382. #define kSI_Stop    (kSI+kStop)
  383.  
  384. #define kSI_1       (kSI+1)
  385. #define kSI_2       (kSI+2)
  386. #define kSI_3       (kSI+3)
  387. #define kSI_4       (kSI+4)
  388. #define kSI_5       (kSI+5)
  389. #define kSI_6       (kSI+6)
  390. #define kSI_7       (kSI+7)
  391. #define kSI_8       (kSI+8)
  392. #define kSI_9       (kSI+9)
  393. #define kSI_10      (kSI+10)
  394. #define kSI_11      (kSI+11)
  395. #define kSI_12      (kSI+12)
  396. #define kSI_13      (kSI+13)
  397. #define kSI_14      (kSI+14)
  398.  
  399. #endif // TXTBDAT_H
  400. //eof
  401.