home *** CD-ROM | disk | FTP | other *** search
/ Apple Developer Connection Student Program / ADC Tools Sampler CD Disk 3 1999.iso / Metrowerks CodeWarrior / Java Support / Java_Source / Java2 / src / java / text / CharacterBreakData.java < prev    next >
Encoding:
Java Source  |  1999-05-28  |  13.8 KB  |  260 lines  |  [TEXT/CWIE]

  1. /*
  2.  * @(#)CharacterBreakData.java    1.12 98/07/24
  3.  *
  4.  * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
  5.  * (C) Copyright IBM Corp. 1996 - 1998 - All Rights Reserved
  6.  *
  7.  * Portions copyright (c) 1996-1998 Sun Microsystems, Inc.
  8.  * All Rights Reserved.
  9.  *
  10.  * The original version of this source code and documentation
  11.  * is copyrighted and owned by Taligent, Inc., a wholly-owned
  12.  * subsidiary of IBM. These materials are provided under terms
  13.  * of a License Agreement between Taligent and Sun. This technology
  14.  * is protected by multiple US and International patents.
  15.  *
  16.  * This notice and attribution to Taligent may not be removed.
  17.  * Taligent is a registered trademark of Taligent, Inc.
  18.  *
  19.  * Permission to use, copy, modify, and distribute this software
  20.  * and its documentation for NON-COMMERCIAL purposes and without
  21.  * fee is hereby granted provided that this copyright notice
  22.  * appears in all copies. Please refer to the file "copyright.html"
  23.  * for further important copyright and licensing information.
  24.  *
  25.  * SUN MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE SUITABILITY OF
  26.  * THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
  27.  * TO THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
  28.  * PARTICULAR PURPOSE, OR NON-INFRINGEMENT. SUN SHALL NOT BE LIABLE FOR
  29.  * ANY DAMAGES SUFFERED BY LICENSEE AS A RESULT OF USING, MODIFYING OR
  30.  * DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
  31.  *
  32.  */
  33.  
  34. package java.text;
  35.  
  36. /**
  37.  * The CharacterBreakData contains data used by SimpleTextBoundary
  38.  * to determine character breaks.
  39.  * @see BreakIterator
  40.  */
  41. final class CharacterBreakData extends TextBoundaryData
  42. {
  43.     private static final byte accent_diacritic = 0;  // class 0, column "acct" in the table headers
  44.     private static final byte baseForm = 1;   // class 1, column "base"; the default class in the maps below
  45.     private static final byte baseCR = 2;     // class 2, column "cr"
  46.     private static final byte baseLF = 3;     // class 3, column "lf" (also used for line/paragraph separators)
  47.     private static final byte choseong = 4;   // Korean initial consonant
  48.     private static final byte jungseong = 5;  // Korean vowel
  49.     private static final byte jongseong = 6;  // Korean final consonant
  50.     private static final byte EOS = 7;        // class 7, column "EOS"; assigned to END_OF_STRING below
  51.     private static final int COL_COUNT = 8;   // number of classes above; first argument to WordBreakTable
  52.     // Table-entry pieces. NOTE(review): SI looks like a flag bit added to a target row number -- confirm in WordBreakTable.
  53.     private static final byte SI = (byte)0x80;   // only the high bit set (-128 as a byte)
  54.     private static final byte STOP = (byte) 0;   // 0; the first row of both tables is filled with it
  55.     private static final byte SI_STOP = (byte)SI + STOP;  // cast binds to SI only; the constant int sum still fits a byte
  56.     // Hands the forward table, backward table and character map defined below to TextBoundaryData.
  57.     public CharacterBreakData() {
  58.         super(kCharacterForwardTable, kCharacterBackwardTable, kCharacterMap);
  59.     }
  60.     // Forward table data: 8 rows of COL_COUNT entries; the unlabeled first row is row 0, "// n" labels rows 1-7.
  61.     private static final byte kCharacterForwardData[] =
  62.     {
  63.         // acct        base             cr              lf
  64.         // cho         jung             jong            EOS
  65.         STOP,          STOP,            STOP,           STOP,
  66.         STOP,          STOP,            STOP,           STOP,
  67.  
  68.         // 1
  69.         (byte)(SI+2),  (byte)(SI+2),    (byte)(SI+3),   (byte)(SI+7),
  70.         (byte)(SI+4),  (byte)(SI+5),    (byte)(SI+6),   SI_STOP,
  71.  
  72.         // 2
  73.         (byte)(SI+2),  SI_STOP,         SI_STOP,        SI_STOP,
  74.         SI_STOP,       SI_STOP,         SI_STOP,        SI_STOP,
  75.  
  76.         // 3
  77.         SI_STOP,       SI_STOP,         SI_STOP,        (byte)(SI+7),
  78.         SI_STOP,       SI_STOP,         SI_STOP,        SI_STOP,
  79.  
  80.         // 4
  81.         (byte)(SI+2),  SI_STOP,         SI_STOP,        SI_STOP,
  82.         (byte)(SI+4),  (byte)(SI+5),    (byte)(SI+6),   SI_STOP,
  83.  
  84.         // 5
  85.         (byte)(SI+2),  SI_STOP,         SI_STOP,        SI_STOP,
  86.         SI_STOP,      (byte)(SI+5),    (byte)(SI+6),    SI_STOP,
  87.  
  88.         // 6
  89.         (byte)(SI+2),  SI_STOP,         SI_STOP,        SI_STOP,
  90.         SI_STOP,       SI_STOP,         (byte)(SI+6),   SI_STOP,
  91.  
  92.         // 7
  93.         SI_STOP,       SI_STOP,         SI_STOP,        SI_STOP,
  94.         SI_STOP,       SI_STOP,         SI_STOP,        SI_STOP
  95.     };
  96.     private static final WordBreakTable kCharacterForwardTable =
  97.     new WordBreakTable(COL_COUNT, kCharacterForwardData);
  98.     private static final byte kCharacterBackwardData[] =  // backward table data: rows 0 and 1 only
  99.     {
  100.         // acct         base            cr              lf
  101.         // cho          jung            jong            EOS
  102.         STOP,           STOP,           STOP,           STOP,
  103.         STOP,           STOP,           STOP,           STOP,
  104.  
  105.         // 1
  106.         (byte)(SI+1),   SI_STOP,        SI_STOP,        (byte)(SI+1),
  107.         SI_STOP,        (byte)(SI+1),   (byte)(SI+1),   SI_STOP
  108.     };
  109.  
  110.     private static final WordBreakTable kCharacterBackwardTable =
  111.     new WordBreakTable(COL_COUNT, kCharacterBackwardData);
  112.     private static final int kRawMapping[] =  // class per Unicode general category; index = number in each entry's comment
  113.     {
  114.         baseForm, //UNASSIGNED      = 0,
  115.         baseForm, //UPPERCASE_LETTER    = 1,
  116.         baseForm, //LOWERCASE_LETTER    = 2,
  117.         baseForm, //TITLECASE_LETTER    = 3,
  118.         baseForm, //MODIFIER_LETTER     = 4,
  119.         baseForm, //OTHER_LETTER        = 5,
  120.         accent_diacritic, //NON_SPACING_MARK    = 6,
  121.         accent_diacritic, //ENCLOSING_MARK      = 7,
  122.         baseForm, //COMBINING_SPACING_MARK  = 8,
  123.         baseForm, //DECIMAL_DIGIT_NUMBER    = 9,
  124.         baseForm, //LETTER_NUMBER       = 10,
  125.         baseForm, //OTHER_NUMBER        = 11,
  126.         baseForm, //SPACE_SEPARATOR     = 12,
  127.         baseForm, //LINE_SEPARATOR      = 13,
  128.         baseForm, //PARAGRAPH_SEPARATOR = 14,
  129.         baseForm, //CONTROL         = 15,
  130.         baseForm, //FORMAT      = 16,
  131.         baseForm, //????            = 17,
  132.         baseForm, //PRIVATE_USE     = 18,
  133.         baseForm, //SURROGATE        = 19,
  134.         baseForm, //DASH_PUNCTUATION    = 20,
  135.         baseForm, //START_PUNCTUATION    = 21,
  136.         baseForm, //END_PUNCTUATION     = 22,
  137.         baseForm, //CONNECTOR_PUNCTUATION   = 23,
  138.         baseForm, //OTHER_PUNCTUATION   = 24,
  139.         baseForm, //MATH_SYMBOL     = 25,
  140.         baseForm, //CURRENCY_SYMBOL     = 26,
  141.         baseForm, //MODIFIER_SYMBOL     = 27,
  142.         baseForm, //OTHER_SYMBOL        = 28;
  143.     };
  144.     // Single characters and ranges that get a class other than the one kRawMapping gives their category.
  145.     private static final SpecialMapping kExceptionChar[] = // ALL_CAPS names are not declared in this class (presumably inherited)
  146.     {
  147.         new SpecialMapping(ASCII_LINEFEED, baseLF),
  148.         new SpecialMapping(ASCII_CARRIAGE_RETURN, baseCR),
  149.         new SpecialMapping(HANGUL_CHOSEONG_LOW, HANGUL_CHOSEONG_HIGH, choseong),
  150.         new SpecialMapping(HANGUL_JUNGSEONG_LOW, HANGUL_JUNGSEONG_HIGH, jungseong),
  151.         new SpecialMapping(HANGUL_JONGSEONG_LOW, HANGUL_JONGSEONG_HIGH, jongseong),
  152.         new SpecialMapping(PUNCTUATION_LINE_SEPARATOR, PUNCTUATION_PARAGRAPH_SEPARATOR, baseLF),
  153.         new SpecialMapping(END_OF_STRING, EOS)
  154.     };
  155.     // One flag per general category, indexed like kRawMapping; true for 5, 13, 14, 15 (presumably "consult kExceptionChar" -- confirm).
  156.     private static final boolean CharacterExceptionFlags[] = {
  157.         false,          // kNonCharacter            = 0,
  158.         false,          // kUppercaseLetter         = 1,
  159.         false,          // kLowercaseLetter         = 2,
  160.         false,          // kTitlecaseLetter         = 3,
  161.         false,          // kModifierLetter          = 4,
  162.         true,           // kOtherLetter             = 5,
  163.         false,          // kNonSpacingMark          = 6,
  164.         false,          // kEnclosingMark           = 7,
  165.         false,          // kCombiningSpacingMark    = 8,
  166.         false,          // kDecimalNumber           = 9,
  167.         false,          // kLetterNumber            = 10,
  168.         false,          // kOtherNumber             = 11,
  169.         false,          // kSpaceSeparator          = 12,
  170.         true,           // kLineSeparator           = 13,
  171.         true,           // kParagraphSeparator      = 14,
  172.         true,           // kControlCharacter        = 15,
  173.         false,          // kFormatCharacter         = 16,
  174.         false,          // UNDEFINED                = 17,
  175.         false,          // kPrivateUseCharacter     = 18,
  176.         false,          // kSurrogate               = 19,
  177.         false,          // kDashPunctuation         = 20,
  178.         false,          // kOpenPunctuation         = 21,
  179.         false,          // kClosePunctuation        = 22,
  180.         false,          // kConnectorPunctuation    = 23,
  181.         false,          // kOtherPunctuation        = 24,
  182.         false,          // kMathSymbol              = 25,
  183.         false,          // kCurrencySymbol          = 26,
  184.         false,          // kModifierSymbol          = 27,
  185.         false           // kOtherSymbol             = 28
  186.     };
  187.     // Class for each of the 256 code points 0x00-0xFF (32 rows of 8), i.e. Latin-1, not just ASCII; only lf and cr are not baseForm.
  188.     private static final int kCharacterAsciiValues[] = {
  189.         //  null      soh       stx       etx       eot       enq       ack       bell
  190.             baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  191.         //  bs        ht        lf      vt        ff        cr      so        si
  192.             baseForm, baseForm, baseLF, baseForm, baseForm, baseCR, baseForm, baseForm,
  193.         //  dle       dc1       dc2       dc3       dc4       nak       syn       etb
  194.             baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  195.         //  can       em        sub       esc       fs        gs        rs        us
  196.             baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  197.         //  sp        !         "         #         $         %         &         '
  198.             baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  199.         //  (         )         *         +         ,         -         .         /
  200.             baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  201.         //  0         1         2         3         4         5         6         7
  202.             baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  203.         //  8         9         :         ;         <         =         >         ?
  204.             baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  205.         //  @         A         B         C         D         E         F         G
  206.             baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  207.         //  H         I         J         K         L         M         N         O
  208.             baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  209.         //  P         Q         R         S         T         U         V         W
  210.             baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  211.         //  X         Y         Z         [         \         ]         ^         _
  212.             baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  213.         //  `         a         b         c         d         e         f         g
  214.             baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  215.         //  h         i         j         k         l         m         n         o
  216.             baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  217.         //  p         q         r         s         t         u         v         w
  218.             baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  219.         //  x         y         z         {         |         }         ~         del
  220.             baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  221.         //  ctrl      ctrl      ctrl      ctrl      ctrl      ctrl      ctrl      ctrl
  222.             baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  223.         //  ctrl      ctrl      ctrl      ctrl      ctrl      ctrl      ctrl      ctrl
  224.             baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  225.         //  ctrl      ctrl      ctrl      ctrl      ctrl      ctrl      ctrl      ctrl
  226.             baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  227.         //  ctrl      ctrl      ctrl      ctrl      ctrl      ctrl      ctrl      ctrl
  228.             baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  229.         //  nbsp      inv-!     cents     pounds    currency  yen       broken-bar  section
  230.             baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  231.         //  umlaut    copyright super-a   gui-left  not       soft-hyph registered  macron
  232.             baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  233.         //  degree    +/-       super-2   super-3   acute     micro     paragraph  bullet
  234.             baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  235.         //  cedilla   super-1   super-o   gui-right 1/4       1/2       3/4      inv-?
  236.             baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  237.         //  A-grave   A-acute   A-hat     A-tilde   A-umlaut A-ring    AE        C-cedilla
  238.             baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  239.         //  E-grave   E-acute   E-hat     E-umlaut  I-grave   I-acute   I-hat    I-umlaut
  240.             baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  241.         //  Edh       N-tilde   O-grave   O-acute   O-hat     O-tilde   O-umlaut times
  242.             baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  243.         //  O-slash   U-grave   U-acute   U-hat     U-umlaut  Y-acute   Thorn    ess-zed
  244.             baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  245.         //  a-grave   a-acute   a-hat     a-tilde   a-umlaut  a-ring    ae       c-cedilla
  246.             baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  247.         //  e-grave   e-acute   e-hat     e-umlaut  i-grave   i-acute   i-hat    i-umlaut
  248.             baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  249.         //  edh       n-tilde   o-grave   o-acute   o-hat     o-tilde   o-umlaut  over
  250.             baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm,
  251.         //  o-slash   u-grave   u-acute   u-hat     u-umlaut  y-acute   thorn    y-umlaut
  252.             baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm, baseForm
  253.     };
  254.     // Bundles the four arrays above into the character map that the constructor passes to the superclass.
  255.     private static final UnicodeClassMapping kCharacterMap
  256.         = new UnicodeClassMapping(kRawMapping, kExceptionChar, CharacterExceptionFlags,
  257.         kCharacterAsciiValues);
  258. }
  259.  
  260.