home *** CD-ROM | disk | FTP | other *** search
- /*
- *****************************************************************************************
- * *
- * COPYRIGHT: *
- * (C) Copyright Taligent, Inc., 1996, 1997 *
- * (C) Copyright International Business Machines Corporation, 1996- 1999 *
- * Licensed Material - Program-Property of IBM - All Rights Reserved. *
- * US Government Users Restricted Rights - Use, duplication, or disclosure *
- * restricted by GSA ADP Schedule Contract with IBM Corp. *
- * *
- *****************************************************************************************
- */
- // Copyright (C) 1996-1997 Taligent, Inc. All rights reserved.
- //
- // FILE NAME : unicode.h
- //
- // CREATED
- // Wednesday, December 11, 1996
- //
- // CREATED BY
- // Helena Shih
- //
- // CHANGES
- // Thursday, April 15, 1999
- // Modified the definitions of all the functions
- // C++ Wrappers for Unicode
- // CHANGES BY
- // Madhu Katragadda
- // 5/20/99 Madhu Added the function getVersion()
- //********************************************************************************************
-
-
-
- #ifndef UNICODE_H
- #define UNICODE_H
-
- #include "utypes.h"
-
-
- /**
- * The Unicode class allows you to query the properties associated with individual
- * Unicode character values.
- * <p>
- * The Unicode character information, provided implicitly by the
- * Unicode character encoding standard, includes information about the sript
- * (for example, symbols or control characters) to which the character belongs,
- * as well as semantic information such as whether a character is a digit or
- * uppercase, lowercase, or uncased.
- * <P>
- * @subclassing Do not subclass.
- */
- class U_COMMON_API Unicode
- {
- public:
- /**
- * The minimum value a UChar can have. The lowest value a
- * UChar can have is 0x0000.
- */
- static const UChar MIN_VALUE;
-
- /**
- * The maximum value a UChar can have. The greatest value a
- * UChar can have is 0xffff.
- */
- static const UChar MAX_VALUE;
-
- /**
- * Public data for enumerated Unicode general category types
- */
-
- enum EUnicodeGeneralTypes
- {
- UNASSIGNED = 0,
- UPPERCASE_LETTER = 1,
- LOWERCASE_LETTER = 2,
- TITLECASE_LETTER = 3,
- MODIFIER_LETTER = 4,
- OTHER_LETTER = 5,
- NON_SPACING_MARK = 6,
- ENCLOSING_MARK = 7,
- COMBINING_SPACING_MARK = 8,
- DECIMAL_DIGIT_NUMBER = 9,
- LETTER_NUMBER = 10,
- OTHER_NUMBER = 11,
- SPACE_SEPARATOR = 12,
- LINE_SEPARATOR = 13,
- PARAGRAPH_SEPARATOR = 14,
- CONTROL = 15,
- FORMAT = 16,
- PRIVATE_USE = 17,
- SURROGATE = 18,
- DASH_PUNCTUATION = 19,
- START_PUNCTUATION = 20,
- END_PUNCTUATION = 21,
- CONNECTOR_PUNCTUATION = 22,
- OTHER_PUNCTUATION = 23,
- MATH_SYMBOL = 24,
- CURRENCY_SYMBOL = 25,
- MODIFIER_SYMBOL = 26,
- OTHER_SYMBOL = 27,
- INITIAL_PUNCTUATION = 28,
- FINAL_PUNCTUATION = 29,
- GENERAL_TYPES_COUNT = 30
- };
-
- enum EUnicodeScript
- {
- kBasicLatin,
- kLatin1Supplement,
- kLatinExtendedA,
- kLatinExtendedB,
- kIPAExtension,
- kSpacingModifier,
- kCombiningDiacritical,
- kGreek,
- kCyrillic,
- kArmenian,
- kHebrew,
- kArabic,
- kDevanagari,
- kBengali,
- kGurmukhi,
- kGujarati,
- kOriya,
- kTamil,
- kTelugu,
- kKannada,
- kMalayalam,
- kThai,
- kLao,
- kTibetan,
- kGeorgian,
- kHangulJamo,
- kLatinExtendedAdditional,
- kGreekExtended,
- kGeneralPunctuation,
- kSuperSubScript,
- kCurrencySymbolScript,
- kSymbolCombiningMark,
- kLetterlikeSymbol,
- kNumberForm,
- kArrow,
- kMathOperator,
- kMiscTechnical,
- kControlPicture,
- kOpticalCharacter,
- kEnclosedAlphanumeric,
- kBoxDrawing,
- kBlockElement,
- kGeometricShape,
- kMiscSymbol,
- kDingbat,
- kCJKSymbolPunctuation,
- kHiragana,
- kKatakana,
- kBopomofo,
- kHangulCompatibilityJamo,
- kKanbun,
- kEnclosedCJKLetterMonth,
- kCJKCompatibility,
- kCJKUnifiedIdeograph,
- kHangulSyllable,
- kHighSurrogate,
- kHighPrivateUseSurrogate,
- kLowSurrogate,
- kPrivateUse,
- kCJKCompatibilityIdeograph,
- kAlphabeticPresentation,
- kArabicPresentationA,
- kCombiningHalfMark,
- kCJKCompatibilityForm,
- kSmallFormVariant,
- kArabicPresentationB,
- kNoScript,
- kHalfwidthFullwidthForm,
- kScriptCount
- };
-
- /**
- * This specifies the language directional property of a character set.
- */
- enum EDirectionProperty {
- LEFT_TO_RIGHT = 0,
- RIGHT_TO_LEFT = 1,
- EUROPEAN_NUMBER = 2,
- EUROPEAN_NUMBER_SEPARATOR = 3,
- EUROPEAN_NUMBER_TERMINATOR = 4,
- ARABIC_NUMBER = 5,
- COMMON_NUMBER_SEPARATOR = 6,
- BLOCK_SEPARATOR = 7,
- SEGMENT_SEPARATOR = 8,
- WHITE_SPACE_NEUTRAL = 9,
- OTHER_NEUTRAL = 10,
- LEFT_TO_RIGHT_EMBEDDING = 11,
- LEFT_TO_RIGHT_OVERRIDE = 12,
- RIGHT_TO_LEFT_ARABIC = 13,
- RIGHT_TO_LEFT_EMBEDDING = 14,
- RIGHT_TO_LEFT_OVERRIDE = 15,
- POP_DIRECTIONAL_FORMAT = 16,
- DIR_NON_SPACING_MARK = 17,
- BOUNDARY_NEUTRAL = 18
- };
-
- /**
- * Values returned by the getCellWidth() function.
- * @see Unicode#getCellWidth
- */
- enum ECellWidths
- {
- ZERO_WIDTH = 0,
- HALF_WIDTH = 1,
- FULL_WIDTH = 2,
- NEUTRAL = 3
- };
-
- /**
- * Determines whether the specified UChar is a lowercase character
- * according to Unicode 2.1.2.
- *
- * @param ch the character to be tested
- * @return true if the character is lowercase; false otherwise.
- *
- * @see Unicode#isUpperCase
- * @see Unicode#isTitleCase
- * @see Unicode#toLowerCase
- */
- static bool_t isLowerCase(UChar ch);
-
- /**
- * Determines whether the specified character is an uppercase character
- * according to Unicode 2.1.2.
- *
- * @param ch the character to be tested
- * @return true if the character is uppercase; false otherwise.
- * @see Unicode#isLowerCase
- * @see Unicode#isTitleCase
- * @see Unicode#toUpperCase
- */
- static bool_t isUpperCase(UChar ch);
-
- /**
- * Determines whether the specified character is a titlecase character
- * according to Unicode 2.1.2.
- *
- * @param ch the character to be tested
- * @return true if the character is titlecase; false otherwise.
- * @see Unicode#isUpperCase
- * @see Unicode#isLowerCase
- * @see Unicode#toTitleCase
- */
- static bool_t isTitleCase(UChar ch);
-
- /**
- * Determines whether the specified character is a digit according to Unicode
- * 2.1.2.
- *
- * @param ch the character to be tested
- * @return true if the character is a digit; false otherwise.
- */
- static bool_t isDigit(UChar ch);
-
- /**
- * Determines whether the specified numeric value is actually a defined character
- * according to Unicode 2.1.2.
- *
- * @param ch the character to be tested
- * @return true if the character has a defined Unicode meaning; false otherwise.
- *
- * @see Unicode#isDigit
- * @see Unicode#isLetter
- * @see Unicode#isLetterOrDigit
- * @see Unicode#isUpperCase
- * @see Unicode#isLowerCase
- * @see Unicode#isTitleCase
- */
- static bool_t isDefined(UChar ch);
-
- /**
- * Determines whether the specified character is a control character according
- * to Unicode 2.1.2.
- *
- * @param ch the character to be tested
- * @return true if the Unicode character is a control character; false otherwise.
- *
- * @see Unicode#isPrintable
- */
- static bool_t isControl(UChar ch);
-
- /**
- * Determines whether the specified character is a printable character according
- * to Unicode 2.1.2.
- *
- * @param ch the character to be tested
- * @return true if the Unicode character is a printable character; false otherwise.
- *
- * @see Unicode#isControl
- */
- static bool_t isPrintable(UChar ch);
-
- /**
- * Determines whether the specified character is of the base form according
- * to Unicode 2.1.2.
- *
- * @param ch the character to be tested
- * @return true if the Unicode character is of the base form; false otherwise.
- *
- * @see Unicode#isLetter
- * @see Unicode#isDigit
- */
-
- static bool_t isBaseForm(UChar ch);
- /**
- * Determines whether the specified character is a letter
- * according to Unicode 2.1.2.
- *
- * @param ch the character to be tested
- * @return true if the character is a letter; false otherwise.
- *
- *
- * @see Unicode#isDigit
- * @see Unicode#isLetterOrDigit
- * @see Unicode#isUpperCase
- * @see Unicode#isLowerCase
- * @see Unicode#isTitleCase
- */
- static bool_t isLetter(UChar ch);
-
- /**
- * A convenience method for determining if a Unicode character
- * is allowed as the first character in a Java identifier.
- * <P>
- * A character may start a Java identifier if and only if
- * it is one of the following:
- * <ul>
- * <li> a letter
- * <li> a currency symbol (such as "$")
- * <li> a connecting punctuation symbol (such as "_").
- * </ul>
- *
- * @param ch the Unicode character.
- * @return TRUE if the character may start a Java identifier;
- * FALSE otherwise.
- * @see isJavaIdentifierPart
- * @see isLetter
- * @see isUnicodeIdentifierStart
- */
- static bool_t isJavaIdentifierStart(UChar ch);
-
- /**
- * A convenience method for determining if a Unicode character
- * may be part of a Java identifier other than the starting
- * character.
- * <P>
- * A character may be part of a Java identifier if and only if
- * it is one of the following:
- * <ul>
- * <li> a letter
- * <li> a currency symbol (such as "$")
- * <li> a connecting punctuation character (such as "_").
- * <li> a digit
- * <li> a numeric letter (such as a Roman numeral character)
- * <li> a combining mark
- * <li> a non-spacing mark
- * <li> an ignorable control character
- * </ul>
- *
- * @param ch the Unicode character.
- * @return TRUE if the character may be part of a Unicode identifier;
- * FALSE otherwise.
- * @see isIdentifierIgnorable
- * @see isJavaIdentifierStart
- * @see isLetter
- * @see isDigit
- * @see isUnicodeIdentifierPart
- */
- static bool_t isJavaIdentifierPart(UChar ch);
-
- /**
- * A convenience method for determining if a Unicode character
- * is allowed to start in a Unicode identifier.
- * A character may start a Unicode identifier if and only if
- * it is a letter.
- *
- * @param ch the Unicode character.
- * @return TRUE if the character may start a Unicode identifier;
- * FALSE otherwise.
- * @see isJavaIdentifierStart
- * @see isLetter
- * @see isUnicodeIdentifierPart
- */
- static bool_t isUnicodeIdentifierStart(UChar ch);
-
- /**
- * A convenience method for determining if a Unicode character
- * may be part of a Unicode identifier other than the starting
- * character.
- * <P>
- * A character may be part of a Unicode identifier if and only if
- * it is one of the following:
- * <ul>
- * <li> a letter
- * <li> a connecting punctuation character (such as "_").
- * <li> a digit
- * <li> a numeric letter (such as a Roman numeral character)
- * <li> a combining mark
- * <li> a non-spacing mark
- * <li> an ignorable control character
- * </ul>
- *
- * @param ch the Unicode character.
- * @return TRUE if the character may be part of a Unicode identifier;
- * FALSE otherwise.
- * @see isIdentifierIgnorable
- * @see isJavaIdentifierPart
- * @see isLetterOrDigit
- * @see isUnicodeIdentifierStart
- */
- static bool_t isUnicodeIdentifierPart(UChar ch);
-
- /**
- * A convenience method for determining if a Unicode character
- * should be regarded as an ignorable character in a Java
- * identifier or a Unicode identifier.
- * <P>
- * The following Unicode characters are ignorable in a Java identifier
- * or a Unicode identifier:
- * <table>
- * <tr><td>0x0000 through 0x0008,</td>
- * <td>ISO control characters that</td></tr>
- * <tr><td>0x000E through 0x001B,</td> <td>are not whitespace</td></tr>
- * <tr><td>and 0x007F through 0x009F</td></tr>
- * <tr><td>0x200C through 0x200F</td> <td>join controls</td></tr>
- * <tr><td>0x200A through 0x200E</td> <td>bidirectional controls</td></tr>
- * <tr><td>0x206A through 0x206F</td> <td>format controls</td></tr>
- * <tr><td>0xFEFF</td> <td>zero-width no-break space</td></tr>
- * </table>
- *
- * @param ch the Unicode character.
- * @return TRUE if the character may be part of a Unicode identifier;
- * FALSE otherwise.
- * @see isJavaIdentifierPart
- * @see isUnicodeIdentifierPart
- */
- static bool_t isIdentifierIgnorable(UChar ch);
-
- /**
- * The given character is mapped to its lowercase equivalent according to
- * Unicode 2.1.2; if the character has no lowercase equivalent, the character
- * itself is returned.
- * <P>
- * A character has a lowercase equivalent if and only if a lowercase mapping
- * is specified for the character in the Unicode 2.0 attribute table.
- * <P>
- * Unicode::toLowerCase() only deals with the general letter case conversion.
- * For language specific case conversion behavior, use UnicodeString::toLower().
- * For example, the case conversion for dot-less i and dotted I in Turkish,
- * or for final sigma in Greek.
- *
- * @param ch the character to be converted
- * @return the lowercase equivalent of the character, if any;
- * otherwise the character itself.
- *
- * @see UnicodeString#toLower
- * @see Unicode#isLowerCase
- * @see Unicode#isUpperCase
- * @see Unicode#toUpperCase
- * @see Unicode#toTitleCase
- */
- static UChar toLowerCase(UChar ch);
-
- /**
- * The given character is mapped to its uppercase equivalent according to Unicode
- * 2.1.2; if the character has no uppercase equivalent, the character itself is
- * returned.
- * <P>
- * Unicode::toUpperCase() only deals with the general letter case conversion.
- * For language specific case conversion behavior, use UnicodeString::toUpper().
- * For example, the case conversion for dot-less i and dotted I in Turkish,
- * or ess-zed (i.e., "sharp S") in German.
- *
- * @param ch the character to be converted
- * @return the uppercase equivalent of the character, if any;
- * otherwise the character itself.
- *
- * @see UnicodeString#toUpper
- * @see Unicode#isUpperCase
- * @see Unicode#isLowerCase
- * @see Unicode#toLowerCase
- * @see Unicode#toTitleCase
- */
- static UChar toUpperCase(UChar ch);
-
- /**
- * The given character is mapped to its titlecase equivalent according to Unicode
- * 2.1.2. There are only four Unicode characters that are truly titlecase forms
- * that are distinct from uppercase forms. As a rule, if a character has no
- * true titlecase equivalent, its uppercase equivalent is returned.
- * <P>
- * A character has a titlecase equivalent if and only if a titlecase mapping
- * is specified for the character in the Unicode 2.1.2 data.
- *
- * @param ch the character to be converted
- * @return the titlecase equivalent of the character, if any;
- * otherwise the character itself.
- * @see Unicode#isTitleCase
- * @see Unicode#toUpperCase
- * @see Unicode#toLowerCase
- */
- static UChar toTitleCase(UChar ch);
-
- /**
- * Determines if the specified character is a Unicode space character
- * according to Unicode 2.1.2.
- *
- * @param ch the character to be tested
- * @return true if the character is a space character; false otherwise.
- */
- static bool_t isSpaceChar(UChar ch);
-
- /**
- * Returns a value indicating a character category according to Unicode
- * 2.1.2.
- * @param ch the character to be tested
- * @return a value of type int, the character category.
- * @see Unicode#UNASSIGNED
- * @see Unicode#UPPERCASE_LETTER
- * @see Unicode#LOWERCASE_LETTER
- * @see Unicode#TITLECASE_LETTER
- * @see Unicode#MODIFIER_LETTER
- * @see Unicode#OTHER_LETTER
- * @see Unicode#NON_SPACING_MARK
- * @see Unicode#ENCLOSING_MARK
- * @see Unicode#COMBINING_SPACING_MARK
- * @see Unicode#DECIMAL_DIGIT_NUMBER
- * @see Unicode#OTHER_NUMBER
- * @see Unicode#SPACE_SEPARATOR
- * @see Unicode#LINE_SEPARATOR
- * @see Unicode#PARAGRAPH_SEPARATOR
- * @see Unicode#CONTROL
- * @see Unicode#PRIVATE_USE
- * @see Unicode#SURROGATE
- * @see Unicode#DASH_PUNCTUATION
- * @see Unicode#OPEN_PUNCTUATION
- * @see Unicode#CLOSE_PUNCTUATION
- * @see Unicode#CONNECTOR_PUNCTUATION
- * @see Unicode#OTHER_PUNCTUATION
- * @see Unicode#LETTER_NUMBER
- * @see Unicode#MATH_SYMBOL
- * @see Unicode#CURRENCY_SYMBOL
- * @see Unicode#MODIFIER_SYMBOL
- * @see Unicode#OTHER_SYMBOL
- */
- static int8_t getType(UChar ch);
-
- /**
- * Returns the linguistic direction property of a character.
- * <P>
- * Returns the linguistic direction property of a character.
- * For example, 0x0041 (letter A) has the LEFT_TO_RIGHT directional
- * property.
- * @see #EDirectionProperty
- */
- static EDirectionProperty characterDirection(UChar ch);
-
- /**
- * Returns the script associated with a character.
- * @see #EUnicodeScript
- */
- static EUnicodeScript getScript(UChar ch);
-
- /**
- * Returns a value indicating the display-cell width of the character
- * when used in Asian text, according to the Unicode standard (see p. 6-130
- * of The Unicode Standard, Version 2.0). The results for various characters
- * are as follows:
- * <P>
- * ZERO_WIDTH: Characters which are considered to take up no display-cell space:
- * control characters
- * format characters
- * line and paragraph separators
- * non-spacing marks
- * combining Hangul jungseong
- * combining Hangul jongseong
- * unassigned Unicode values
- * <P>
- * HALF_WIDTH: Characters which take up half a cell in standard Asian text:
- * all characters in the General Scripts Area except combining Hangul choseong
- * and the characters called out specifically above as ZERO_WIDTH
- * alphabetic and Arabic presentation forms
- * halfwidth CJK punctuation
- * halfwidth Katakana
- * halfwidth Hangul Jamo
- * halfwidth forms, arrows, and shapes
- * <P>
- * FULL_WIDTH: Characters which take up a full cell in standard Asian text:
- * combining Hangul choseong
- * all characters in the CJK Phonetics and Symbols Area
- * all characters in the CJK Ideographs Area
- * all characters in the Hangul Syllables Area
- * CJK compatibility ideographs
- * CJK compatibility forms
- * small form variants
- * fullwidth ASCII
- * fullwidth punctuation and currency signs
- * <P>
- * NEUTRAL: Characters whose cell width is context-dependent:
- * all characters in the Symbols Area, except those specifically called out above
- * all characters in the Surrogates Area
- * all charcaters in the Private Use Area
- * <P>
- * For Korean text, this algorithm should work properly with properly normalized Korean
- * text. Precomposed Hangul syllables and non-combining jamo are all considered full-
- * width characters. For combining jamo, we treat we treat choseong (initial consonants)
- * as double-width characters and junseong (vowels) and jongseong (final consonants)
- * as non-spacing marks. This will work right in text that uses the precomposed
- * choseong characters instead of teo choseong characters in a row, and which uses the
- * choseong filler character at the beginning of syllables that don't have an initial
- * consonant. The results may be slightly off with Korean text following different
- * conventions.
- */
- static uint16_t getCellWidth(UChar ch);
-
- /**
- * Retrives the decimal numeric value of a digit character.
- * @param ch the digit character for which to get the numeric value
- * @return the numeric value of ch in decimal radix. This method returns
- * -1 if ch is not a valid digit character.
- */
- static int32_t digitValue(UChar ch);
-
- /**
- * Retrieves the Unicode Standard Version number that is used
- * @return the Unicode Standard Version Number.
- */
- static const char* getVersion(void);
-
- protected:
- // These constructors, destructor, and assignment operator must
- // be protected (not private, as they semantically are) to make
- // various UNIX compilers happy. [LIU]
- Unicode();
- Unicode( const Unicode& other);
- ~Unicode();
- const Unicode& operator=( const Unicode& other);
-
-
-
- };
-
-
- #endif
-
-
-
-
-