home *** CD-ROM | disk | FTP | other *** search
/ AmigActive 6 / AACD06.ISO / AACD / Programming / ICU / src / icu / source / i18n / snbkdat.cpp < prev    next >
Encoding:
C/C++ Source or Header  |  1999-08-16  |  20.6 KB  |  350 lines

  1. /*
  2. *****************************************************************************************
  3. *                                                                                       *
  4. * COPYRIGHT:                                                                            *
  5. *   (C) Copyright Taligent, Inc.,  1997                                                 *
  6. *   (C) Copyright International Business Machines Corporation,  1997-1998                    *
  7. *   Licensed Material - Program-Property of IBM - All Rights Reserved.                  *
  8. *   US Government Users Restricted Rights - Use, duplication, or disclosure             *
  9. *   restricted by GSA ADP Schedule Contract with IBM Corp.                              *
  10. *                                                                                       *
  11. *****************************************************************************************
  12. *
  13. * File SNBKDAT.CPP
  14. *
  15. * Modification History:
  16. *
  17. *   Date        Name        Description
  18. *   02/18/97    aliu        Converted from OpenClass.
  19. *                           Recoded kRawMapping table for Unicode::getType() type codes.
  20. *                           Made static data members const where appropriate.
  21. *   03/25/97    aliu        Moved into TextBoundaryData; no longer a subclass.
  22. *   04/15/97    aliu        Worked around bug in AIX xlC compiler which occurs if static
  23. *                           arrays contain const elements.
  24. *   05/06/97    aliu        Made SpecialMapping an array of objects instead of pointers,
  25. *                           to help out non-compliant compilers.
  26. *   08/14/98    helena      Sync-up JDK1.2.
  27. *   09/04/98    stephen     Sync with 8/31 JDK 1.2
  28. *****************************************************************************************
  29. */
  30.  
  31. // *****************************************************************************
  32. // This file was generated from the java source file SentenceBreakData.java
  33. // *****************************************************************************
  34.  
  35. #include "txtbdat.h"
  36. #include "wdbktbl.h"
  37. #include "unicdcm.h"
  38.  
  39. // *****************************************************************************
  40. // class SentenceBreakData
  41. // The following tables contain the transition state data for sentence break.
  42. // For more detailed explanation on the boundary break state machine, please
  43. // see the internal documentation of wdbktbl.cpp.
  44. // *****************************************************************************
  45.  
  46. // The forward transition states of sentence boundary data: one row per state (0-9), one entry per character class.
  47. TextBoundaryData::Node TextBoundaryData::kSentenceForwardData[] = {
  48.         // other       space          terminator     ambTerm
  49.         // open        close          CJK            PB
  50.         // lower       upper          digit          quote
  51.         // nsm         EOS
  52.         // (column order above, 14 per row; entries are next-state codes - the kSI_ prefix is not defined in this file, see wdbktbl.cpp)
  53.         // 0
  54.         kStop,          kStop,          kStop,          kStop,
  55.         kStop,          kStop,          kStop,          kStop,
  56.         kStop,          kStop,          kStop,          kStop,
  57.         kStop,          kStop,
  58.  
  59.         // 1
  60.         kSI_1,          kSI_1,          kSI_2,          kSI_5,
  61.         kSI_1,          kSI_1,          kSI_1,          kSI_4,
  62.         kSI_1,          kSI_1,          kSI_1,          kSI_1,
  63.         kSI_1,          kSI_Stop,
  64.  
  65.         // 2
  66.         kSI_Stop,       kSI_3,          kSI_2,          kSI_5,
  67.         kSI_Stop,       kSI_2,          kSI_Stop,       kSI_4,
  68.         kSI_Stop,       kSI_Stop,       kSI_Stop,       kSI_2,
  69.         kSI_2,          kSI_Stop,
  70.  
  71.         // 3
  72.         kSI_Stop,       kSI_3,          kSI_Stop,       kSI_Stop,
  73.         kSI_Stop,       kSI_Stop,       kSI_Stop,       kSI_4,
  74.         kSI_Stop,       kSI_Stop,       kSI_Stop,       kSI_Stop,
  75.         kSI_3,          kSI_Stop,
  76.  
  77.         // 4
  78.         kSI_Stop,       kSI_Stop,       kSI_Stop,       kSI_Stop,
  79.         kSI_Stop,       kSI_Stop,       kSI_Stop,       kSI_Stop,
  80.         kSI_Stop,       kSI_Stop,       kSI_Stop,       kSI_Stop,
  81.         kSI_Stop,       kSI_Stop,
  82.  
  83.         // 5
  84.         kSI_1,          kSI_6,          kSI_2,          kSI_5,
  85.         kSI_1,          kSI_5,          kSI_Stop,       kSI_4,
  86.         kSI_1,          kSI_1,          kSI_1,          kSI_5,
  87.         kSI_5,          kSI_Stop,
  88.  
  89.         // 6
  90.         kSI_Stop,       kSI_6,          kSI_Stop,       kSI_Stop,
  91.         kSI_7,          kSI_1,          kSI_Stop,       kSI_4,
  92.         kSI_1,          kSI_Stop,       kSI_1,          kSI_Stop,
  93.         kSI_6,          kSI_Stop,
  94.  
  95.         // 7
  96.         kSI_Stop,       kSI_Stop,       kSI_Stop,       kSI_Stop,
  97.         7,              kSI_Stop,       kSI_Stop,       kSI_Stop,   // NOTE(review): bare 7, not kSI_7 - looks deliberate; confirm against SentenceBreakData.java
  98.         kSI_1,          kStop,          kSI_Stop,       kSI_Stop,   // NOTE(review): kStop, not kSI_Stop, in the "upper" column - confirm likewise
  99.         kSI_7,          kSI_Stop,
  100.  
  101.         // 8
  102.         kSI_1,          kSI_1,          kSI_2,          kSI_8,
  103.         kSI_1,          kSI_5,          kSI_1,          kSI_4,
  104.         kSI_1,          kSI_8,          kSI_9,          kSI_5,
  105.         kSI_8,          kSI_Stop,
  106.  
  107.         // 9
  108.         kSI_1,          kSI_1,          kSI_2,          kSI_9,
  109.         kSI_1,          kSI_5,          kSI_1,          kSI_4,
  110.         kSI_1,          kSI_1,          kSI_9,          kSI_5,
  111.         kSI_9,          kSI_Stop
  112. };
  113. // Number of Node entries in kSentenceForwardData, derived from the array's size so it tracks edits to the table.
  114. const int32_t TextBoundaryData::kSentenceForwardData_length =
  115.     sizeof(TextBoundaryData::kSentenceForwardData) / sizeof(TextBoundaryData::kSentenceForwardData[0]);
  116. // Forward table object built over kSentenceForwardData; allocated with new during static initialization and not deleted anywhere in this file.
  117. WordBreakTable* TextBoundaryData::kSentenceForward = 
  118.     new WordBreakTable(kSentenceCol_count, kSentenceForwardData, kSentenceForwardData_length);
  119.  
  120. // The backward transition states of sentence boundary data: one row per state (0-4), one entry per character class.
  121. TextBoundaryData::Node TextBoundaryData::kSentenceBackwardData[] = {
  122.         // other       space          terminator     ambTerm
  123.         // open        close          CJK            PB
  124.         // lower       upper          digit          quote
  125.         // nsm         EOS
  126.         // (column order above, 14 per row; in rows 1-4 the PB and EOS columns hold plain kStop, every other entry carries the kSI_ prefix)
  127.         // 0
  128.         kStop,          kStop,          kStop,          kStop,
  129.         kStop,          kStop,          kStop,          kStop,
  130.         kStop,          kStop,          kStop,          kStop,
  131.         kStop,          kStop,
  132.  
  133.         // 1
  134.         kSI_2,          kSI_2,          kSI_2,          kSI_2,
  135.         kSI_2,          kSI_2,          kSI_3,          kStop,
  136.         kSI_2,          kSI_3,          kSI_2,          kSI_2,
  137.         kSI_1,          kStop,
  138.  
  139.         // 2
  140.         kSI_2,          kSI_2,          kSI_2,          kSI_2,
  141.         kSI_2,          kSI_2,          kSI_3,          kStop,
  142.         kSI_2,          kSI_3,          kSI_2,          kSI_2,
  143.         kSI_2,          kStop,
  144.  
  145.         // 3
  146.         kSI_2,          kSI_4,          kSI_2,          kSI_2,
  147.         kSI_2,          kSI_2,          kSI_3,          kStop,
  148.         kSI_3,          kSI_2,          kSI_2,          kSI_2,
  149.         kSI_3,          kStop,
  150.  
  151.         // 4
  152.         kSI_2,          kSI_4,          kSI_Stop,       kSI_Stop,
  153.         kSI_2,          kSI_2,          kSI_3,          kStop,
  154.         kSI_2,          kSI_3,          kSI_2,      kSI_2,
  155.         kSI_4,          kStop
  156. };
  157. // Number of Node entries in kSentenceBackwardData, derived from the array's size so it tracks edits to the table.
  158. const int32_t TextBoundaryData::kSentenceBackwardData_length =
  159.     sizeof(TextBoundaryData::kSentenceBackwardData) / sizeof(TextBoundaryData::kSentenceBackwardData[0]);
  160. // Backward table object built over kSentenceBackwardData; allocated with new during static initialization and not deleted anywhere in this file.
  161. WordBreakTable* TextBoundaryData::kSentenceBackward = new WordBreakTable(kSentenceCol_count, kSentenceBackwardData, kSentenceBackwardData_length);
  162.  
  163. // The sentence type mapping of the break table: entry i is the sentence-break class for Unicode general-category code i (0-28).
  164. TextBoundaryData::Type TextBoundaryData::kSentenceRawMapping[] = {
  165.     // Re-coded to match Unicode 2 types [LIU]; each trailing comment names the category code that indexes the entry.
  166.     kOther,         // UNASSIGNED               = 0,
  167.     kUpperCase,     // UPPERCASE_LETTER         = 1,
  168.     kLowerCase,     // LOWERCASE_LETTER         = 2,
  169.     kOther,         // TITLECASE_LETTER         = 3,
  170.     kOther,         // MODIFIER_LETTER          = 4,
  171.     kOther,         // OTHER_LETTER             = 5,
  172.     ksNsm,          // NON_SPACING_MARK         = 6,
  173.     ksNsm,          // ENCLOSING_MARK           = 7,
  174.     kOther,         // COMBINING_SPACING_MARK   = 8,
  175.     ksNumber,       // DECIMAL_DIGIT_NUMBER     = 9,
  176.     ksNumber,       // LETTER_NUMBER            = 10,
  177.     ksNumber,       // OTHER_NUMBER             = 11,
  178.     kSpace,         // SPACE_SEPARATOR          = 12,
  179.     kSpace,         // LINE_SEPARATOR           = 13,
  180.     kSpace,         // PARAGRAPH_SEPARATOR      = 14,
  181.     kOther,         // CONTROL                  = 15,
  182.     kOther,         // FORMAT                   = 16,
  183.     kOther,         // PRIVATE_USE              = 17,
  184.     kOther,         // SURROGATE                = 18,
  185.     kOther,         // DASH_PUNCTUATION         = 19,
  186.     kOpenBracket,   // START_PUNCTUATION        = 20,
  187.     kCloseBracket,  // END_PUNCTUATION          = 21,
  188.     kOther,         // CONNECTOR_PUNCTUATION    = 22,
  189.     kOther,         // OTHER_PUNCTUATION        = 23,
  190.     kOther,         // MATH_SYMBOL              = 24,
  191.     kOther,         // CURRENCY_SYMBOL          = 25,
  192.     kOther,         // MODIFIER_SYMBOL          = 26,
  193.     kOther,         // OTHER_SYMBOL             = 27,
  194.     kOther          // UNDEFINED                = 28
  195. };
  196. // Number of entries in kSentenceRawMapping (29 as written), derived from the array's size.
  197. const int32_t TextBoundaryData::kSentenceRawMapping_length =
  198.     sizeof(TextBoundaryData::kSentenceRawMapping) / sizeof(TextBoundaryData::kSentenceRawMapping[0]);
  199.  
  200. // The exceptions of the sentence break data: single characters (two arguments) or first/last ranges (three arguments) paired with an explicit class.
  201. SpecialMapping TextBoundaryData::kSentenceExceptionChar[] = {
  202.         //note: the ranges in this table must be sorted in ascending order
  203.         //as required by the UnicodeClassMapping class. (Presumably these entries take precedence over kSentenceRawMapping - confirm there.)
  204.     SpecialMapping(TextBoundaryData::ASCII_HORIZONTAL_TABULATION, TextBoundaryData::kSpace),
  205.     SpecialMapping(TextBoundaryData::ASCII_LINEFEED, TextBoundaryData::kSpace),
  206.     SpecialMapping(TextBoundaryData::ASCII_FORM_FEED, TextBoundaryData::kTerminator),
  207.     SpecialMapping(TextBoundaryData::ASCII_CARRIAGE_RETURN, TextBoundaryData::kSpace),
  208.     SpecialMapping(TextBoundaryData::ASCII_EXCLAMATION_MARK, TextBoundaryData::kTerminator),
  209.     SpecialMapping(TextBoundaryData::ASCII_QUOTATION_MARK, TextBoundaryData::kQuote),
  210.     SpecialMapping(TextBoundaryData::ASCII_APOSTROPHE, TextBoundaryData::kQuote),
  211.     SpecialMapping(TextBoundaryData::ASCII_FULL_STOP, TextBoundaryData::kAmbiguousTerm),
  212.     SpecialMapping(TextBoundaryData::ASCII_QUESTION_MARK, TextBoundaryData::kTerminator),
  213.     SpecialMapping(TextBoundaryData::ASCII_NONBREAKING_SPACE, TextBoundaryData::kOther),
  214.     SpecialMapping(TextBoundaryData::PUNCTUATION_LINE_SEPARATOR, TextBoundaryData::kSpace),
  215.     SpecialMapping(TextBoundaryData::PUNCTUATION_PARAGRAPH_SEPARATOR, TextBoundaryData::kParagraphBreak),
  216.     SpecialMapping(TextBoundaryData::PUNCTUATION_IDEOGRAPHIC_FULL_STOP, TextBoundaryData::kTerminator),
  217.     SpecialMapping(TextBoundaryData::HIRAGANA_LETTER_SMALL_A, 
  218.                    TextBoundaryData::HIRAGANA_LETTER_VU, TextBoundaryData::kCJK),
  219.     SpecialMapping(TextBoundaryData::COMBINING_KATAKANA_HIRAGANA_VOICED_SOUND_MARK,
  220.                    TextBoundaryData::HIRAGANA_SEMIVOICED_SOUND_MARK, TextBoundaryData::kCJK),         // cjk
  221.     SpecialMapping(TextBoundaryData::KATAKANA_LETTER_SMALL_A, 
  222.                    TextBoundaryData::KATAKANA_LETTER_SMALL_KE, TextBoundaryData::kCJK),   // cjk
  223.     SpecialMapping(TextBoundaryData::UNICODE_LOW_BOUND_HAN, 
  224.                    TextBoundaryData::UNICODE_HIGH_BOUND_HAN, TextBoundaryData::kCJK),
  225.     SpecialMapping(TextBoundaryData::CJK_COMPATIBILITY_F900,
  226.                    TextBoundaryData::CJK_COMPATIBILITY_FA2D,TextBoundaryData::kCJK),
  227.     SpecialMapping(TextBoundaryData::UNICODE_ZERO_WIDTH_NON_BREAKING_SPACE, TextBoundaryData::kOther),
  228.     SpecialMapping(TextBoundaryData::FULLWIDTH_EXCLAMATION_MARK, TextBoundaryData::kTerminator),
  229.     SpecialMapping(TextBoundaryData::FULLWIDTH_FULL_STOP, TextBoundaryData::kAmbiguousTerm),
  230.     SpecialMapping(TextBoundaryData::FULLWIDTH_QUESTION_MARK, TextBoundaryData::kTerminator),
  231.     SpecialMapping(TextBoundaryData::END_OF_STRING, TextBoundaryData::ksEOS)
  232. };
  233. // One flag per Unicode general-category code (0-28, same indices as kSentenceRawMapping); presumably TRUE means the category contains characters listed in kSentenceExceptionChar - confirm in UnicodeClassMapping.
  234. const bool_t TextBoundaryData::kSentenceExceptionFlags[] = {
  235.     FALSE,            // kNonCharacter         = 0,
  236.     FALSE,            // kUppercaseLetter      = 1,
  237.     FALSE,            // kLowercaseLetter      = 2,
  238.     FALSE,            // kTitlecaseLetter      = 3,
  239.     FALSE,            // kModifierLetter       = 4,
  240.     TRUE,             // kOtherLetter          = 5,
  241.     TRUE,             // kNonSpacingMark       = 6,
  242.     FALSE,            // kEnclosingMark        = 7,
  243.     FALSE,            // kCombiningSpacingMark = 8,
  244.     FALSE,            // kDecimalNumber        = 9,
  245.     FALSE,            // kLetterNumber         = 10,
  246.     FALSE,            // kOtherNumber          = 11,
  247.     TRUE,             // kSpaceSeparator       = 12,
  248.     TRUE,             // kLineSeparator        = 13,
  249.     TRUE,             // kParagraphSeparator   = 14,
  250.     TRUE,             // kControlCharacter     = 15,
  251.     TRUE,             // kFormatCharacter      = 16,
  252.     FALSE,            // kPrivateUseCharacter  = 17,
  253.     FALSE,            // kSurrogate            = 18,
  254.     FALSE,            // kDashPunctuation      = 19,
  255.     FALSE,            // kOpenPunctuation      = 20,
  256.     FALSE,            // kClosePunctuation     = 21,
  257.     FALSE,            // kConnectorPunctuation = 22,
  258.     TRUE,             // kOtherPunctuation     = 23,
  259.     FALSE,            // kMathSymbol           = 24,
  260.     FALSE,            // kCurrencySymbol       = 25,
  261.     FALSE,            // kModifierSymbol       = 26,
  262.     FALSE,             // kOtherSymbol          = 27,
  263.     FALSE            // UNDEFINED             = 28
  264. };
  265. const int32_t TextBoundaryData::kSentenceExceptionChar_length =   // entry count of kSentenceExceptionChar (the SpecialMapping table, not the flags array just above)
  266.     sizeof(TextBoundaryData::kSentenceExceptionChar) / sizeof(TextBoundaryData::kSentenceExceptionChar[0]);
  267.  
  268. TextBoundaryData::Type TextBoundaryData::kSentenceAsciiValues[] = {
  269.         //  null        soh         stx         etx         eot         enq         ask         bell
  270.             kOther,     kOther,     kOther,     kOther,     kOther,     kOther,     kOther,     kOther,
  271.         //  bs          ht          lf          vt          ff          cr          so          si
  272.             kOther,     kSpace,     kSpace,     kOther,     kTerminator, kSpace,    kOther,     kOther,
  273.         //  dle         dc1         dc2         dc3         dc4         nak         syn         etb
  274.             kOther,     kOther,     kOther,     kOther,     kOther,     kOther,     kOther,     kOther,
  275.         //  can         em          sub         esc         fs          gs          rs          us
  276.             kOther,     kOther,     kOther,     kOther,     kOther,     kOther,     kOther,     kOther,
  277.         //  sp          !           "           #           $           %           &           '
  278.             kSpace,     kTerminator, kQuote,    kOther,     kOther,     kOther,     kOther,     kQuote,
  279.         //  (           )           *           +           ,           -           .           /
  280.             kOpenBracket, kCloseBracket, kOther, kOther,    kOther,     kOther,      kAmbiguousTerm, kOther,
  281.         //  0           1           2           3           4           5           6           7
  282.             ksNumber,   ksNumber,   ksNumber,   ksNumber,   ksNumber,   ksNumber,   ksNumber,   ksNumber,
  283.         //  8           9           :           ;           <           =           >           ?
  284.             ksNumber,   ksNumber,   kOther,     kOther,     kOther,     kOther,     kOther,     kTerminator,
  285.         //  @           A           B           C           D           E           F           G
  286.             kOther,     kUpperCase, kUpperCase, kUpperCase, kUpperCase, kUpperCase, kUpperCase, kUpperCase,
  287.         //  H           I           J           K           L           M           N           O
  288.             kUpperCase, kUpperCase, kUpperCase, kUpperCase, kUpperCase, kUpperCase, kUpperCase, kUpperCase,
  289.         //  P           Q           R           S           T           U           V           W
  290.             kUpperCase, kUpperCase, kUpperCase, kUpperCase, kUpperCase, kUpperCase, kUpperCase, kUpperCase,
  291.         //  X           Y           Z           [           \           ]           ^           _
  292.             kUpperCase, kUpperCase, kUpperCase, kOpenBracket, kOther,   kCloseBracket, kOther,  kOther,
  293.         //  `           a           b           c           d           e           f           g
  294.             kOther,     kLowerCase, kLowerCase, kLowerCase, kLowerCase, kLowerCase, kLowerCase, kLowerCase,
  295.         //  h           i           j           k           l           m           n           o
  296.             kLowerCase, kLowerCase, kLowerCase, kLowerCase, kLowerCase, kLowerCase, kLowerCase, kLowerCase,
  297.         //  p           q           r           s           t           u           v           w
  298.             kLowerCase, kLowerCase, kLowerCase, kLowerCase, kLowerCase, kLowerCase, kLowerCase, kLowerCase,
  299.         //  x           y           z           {           |           }           ~           del
  300.             kLowerCase, kLowerCase, kLowerCase, kOpenBracket, kOther,   kCloseBracket, kOther,  kOther,
  301.         //  ctrl        ctrl        ctrl        ctrl        ctrl        ctrl        ctrl        ctrl
  302.             kOther,     kOther,     kOther,     kOther,     kOther,     kOther,     kOther,     kOther,
  303.         //  ctrl        ctrl        ctrl        ctrl        ctrl        ctrl        ctrl        ctrl
  304.             kOther,     kOther,     kOther,     kOther,     kOther,     kOther,     kOther,     kOther,
  305.         //  ctrl        ctrl        ctrl        ctrl        ctrl        ctrl        ctrl        ctrl
  306.             kOther,     kOther,     kOther,     kOther,     kOther,     kOther,     kOther,     kOther,
  307.         //  ctrl        ctrl        ctrl        ctrl        ctrl        ctrl        ctrl        ctrl
  308.             kOther,     kOther,     kOther,     kOther,     kOther,     kOther,     kOther,     kOther,
  309.         //  nbsp        inv-!       cents       pounds      currency    yen         broken-bar  section
  310.             kOther,     kOther,     kOther,     kOther,     kOther,     kOther,     kOther,     kOther,
  311.         //  umlaut      copyright   super-a     gui-left    not         soft-hyph   registered  macron
  312.             kOther,     kOther,     kLowerCase, kOpenBracket, kOther,   kOther,     kOther,     kOther,
  313.         //  degree      +/-         super-2     super-3     acute       micro       paragraph   bullet
  314.             kOther,     kOther,     ksNumber,   ksNumber,   kOther,     kLowerCase, kOther,     kOther,
  315.         //  cedilla     super-1     super-o     gui-right   1/4         1/2         3/4         inv-?
  316.             kOther,     kLowerCase, kOther,     kCloseBracket, ksNumber, ksNumber,  ksNumber,   kOther,
  317.         //  A-grave     A-acute     A-hat       A-tilde     A-umlaut    A-ring      AE          C-cedilla
  318.             kUpperCase, kUpperCase, kUpperCase, kUpperCase, kUpperCase, kUpperCase, kUpperCase, kUpperCase,
  319.         //  E-grave     E-acute     E-hat       E-umlaut    I-grave     I-acute     I-hat       I-umlaut
  320.             kUpperCase, kUpperCase, kUpperCase, kUpperCase, kUpperCase, kUpperCase, kUpperCase, kUpperCase,
  321.         //  Edh         N-tilde     O-grave     O-acute     O-hat       O-tilde     O-umlaut    times
  322.             kUpperCase, kUpperCase, kUpperCase, kUpperCase, kUpperCase, kUpperCase, kUpperCase, kOther,
  323.         //  O=slash     U-grave     U-acute     U-hat       U-umlaut    Y-acute     Thorn       ess-zed
  324.             kUpperCase, kUpperCase, kUpperCase, kUpperCase, kUpperCase, kUpperCase, kUpperCase, kLowerCase,
  325.         //  a-grave     a-acute     a-hat       a-tilde     a-umlaut    a-ring      ae          c-cedilla
  326.             kLowerCase, kLowerCase, kLowerCase, kLowerCase, kLowerCase, kLowerCase, kLowerCase, kLowerCase,
  327.         //  e-grave     e-acute     e-hat       e-umlaut    i-grave     i-acute     i-hat       i-umlaut
  328.             kLowerCase, kLowerCase, kLowerCase, kLowerCase, kLowerCase, kLowerCase, kLowerCase, kLowerCase,
  329.         //  edh         n-tilde     o-grave     o-acute     o-hat       o-tilde     o-umlaut    over
  330.             kLowerCase, kLowerCase, kLowerCase, kLowerCase, kLowerCase, kLowerCase, kLowerCase, kOther,
  331.         //  o-slash     u-grave     u-acute     u-hat       u-umlaut    y-acute     thorn       y-umlaut
  332.             kLowerCase, kLowerCase, kLowerCase, kLowerCase, kLowerCase, kLowerCase, kLowerCase, kLowerCase
  333.     };
  334. // Character-class mapper for sentence breaking, built from the four tables defined earlier in this file; allocated with new during static initialization and not deleted anywhere in this file.
  335. UnicodeClassMapping* TextBoundaryData::kSentenceMap = 
  336.     new UnicodeClassMapping(kSentenceRawMapping, kSentenceRawMapping_length, 
  337.                             kSentenceExceptionChar, kSentenceExceptionChar_length,
  338.                             kSentenceExceptionFlags,
  339.                             kSentenceAsciiValues);
  340.  
  341. /**
  342.  * This is the single instance of TextBoundaryData containing sentence break data.
  343.  * It receives the forward table, backward table and class map pointers, which are set by dynamic initializers earlier in this file, so this definition must stay after them.
  344.  */
  345. const TextBoundaryData TextBoundaryData::kSentenceBreakData(TextBoundaryData::kSentenceForward,
  346.                                                             TextBoundaryData::kSentenceBackward,
  347.                                                             TextBoundaryData::kSentenceMap);
  348.  
  349. //eof
  350.