home *** CD-ROM | disk | FTP | other *** search
- /*
- *******************************************************************************
- * *
- * COPYRIGHT: *
- * IBM Open Class Library *
- * (C) Copyright Taligent, Inc., 1996 *
- * (C) Copyright International Business Machines Corporation, 1996-1998 *
- * Licensed Material - Program-Property of IBM - All Rights Reserved. *
- * US Government Users Restricted Rights - Use, duplication, or disclosure *
- * restricted by GSA ADP Schedule Contract with IBM Corp. *
- * *
- *******************************************************************************
- */
-
-
- #include "ucmp16.h"
- #include "dcmpdata.h"
- #include "compdata.h"
-
- #include "normlzr.h"
- #include "utypes.h"
- #include "unistr.h"
- #include "chariter.h"
- #include "schriter.h"
- #include "unicode.h"
- #include "mutex.h"
-
-
// Number of elements in a statically sized array.  Only meaningful for true
// arrays; a pointer argument silently yields sizeof(pointer)/sizeof(element).
// The parameter is parenthesized so any array-valued expression can be passed.
#define ARRAY_LENGTH(array) (sizeof(array) / sizeof((array)[0]))
-
- inline static void insert(UnicodeString& dest,
- UTextOffset pos,
- UChar ch)
- {
- dest.replace(pos, 0, &ch, 1);
- }
-
- const UChar Normalizer::DONE = 0xFFFF;
- const UChar Normalizer::HANGUL_BASE = 0xac00;
- const UChar Normalizer::HANGUL_LIMIT= 0xd7a4;
- const UChar Normalizer::JAMO_LBASE = 0x1100;
- const UChar Normalizer::JAMO_VBASE = 0x1161;
- const UChar Normalizer::JAMO_TBASE = 0x11a7;
- const int16_t Normalizer::JAMO_LCOUNT = 19;
- const int16_t Normalizer::JAMO_VCOUNT = 21;
- const int16_t Normalizer::JAMO_TCOUNT = 28;
- const int16_t Normalizer::JAMO_NCOUNT = JAMO_VCOUNT * JAMO_TCOUNT;
-
-
-
- //-------------------------------------------------------------------------
- // Constructors and other boilerplate
- //-------------------------------------------------------------------------
-
- Normalizer::Normalizer(const UnicodeString& str,
- EMode mode)
- {
- init(new StringCharacterIterator(str), mode, 0);
- }
-
- Normalizer::Normalizer(const UnicodeString& str,
- EMode mode,
- int32_t opt)
- {
- init(new StringCharacterIterator(str), mode, opt);
- }
-
- Normalizer::Normalizer(const CharacterIterator& iter,
- EMode mode)
- {
- init(iter.clone(), mode, 0);
- }
-
- Normalizer::Normalizer(const CharacterIterator& iter,
- EMode mode,
- int32_t opt)
- {
- init(iter.clone(), mode, opt);
- }
-
- void Normalizer::init(CharacterIterator* adoptIter,
- EMode mode,
- int32_t options)
- {
- bufferPos = 0;
- bufferLimit = 0;
- fOptions = options;
- currentChar = DONE;
- fMode = mode;
- text = adoptIter;
-
- minDecomp = (fMode & COMPAT_BIT) ? 0 : DecompData::MAX_COMPAT;
- }
-
- Normalizer::Normalizer(const Normalizer& copy)
- {
- init(copy.text->clone(), copy.fMode, copy.fOptions);
-
- buffer = copy.buffer;
- bufferPos = copy.bufferPos;
- bufferLimit = copy.bufferLimit;
- explodeBuf = copy.explodeBuf;
- currentChar = copy.currentChar;
- }
-
- Normalizer::~Normalizer()
- {
- delete text;
- }
-
- Normalizer*
- Normalizer::clone() const
- {
- return new Normalizer(*this);
- }
-
- /**
- * Generates a hash code for this iterator.
- */
- int32_t Normalizer::hashCode() const
- {
- return text->hashCode() + fMode + fOptions + bufferPos + bufferLimit;
- }
-
- bool_t Normalizer::operator==(const Normalizer& that) const
- {
- return *text == *(that.text)
- && currentChar == that.currentChar
- && buffer == that.buffer
- && explodeBuf == that.explodeBuf
- && bufferPos == that.bufferPos
- && bufferLimit == that.bufferLimit;
- }
-
- //-------------------------------------------------------------------------
- // Static utility methods
- //-------------------------------------------------------------------------
-
- void
- Normalizer::normalize(const UnicodeString& source,
- EMode mode,
- int32_t options,
- UnicodeString& result,
- UErrorCode &status)
- {
- switch (mode) {
- case NO_OP:
- result = source;
- break;
- case COMPOSE:
- case COMPOSE_COMPAT:
- compose(source, mode & COMPAT_BIT, options, result, status);
- break;
- case DECOMP:
- case DECOMP_COMPAT:
- decompose(source, mode & COMPAT_BIT, options, result, status);
- break;
- }
- }
-
- //-------------------------------------------------------------------------
- // Compose methods
- //-------------------------------------------------------------------------
-
- void
- Normalizer::compose(const UnicodeString& source,
- bool_t compat,
- int32_t options,
- UnicodeString& result,
- UErrorCode &status)
- {
- if (U_FAILURE(status)) {
- return;
- }
- result.truncate(0);
- UnicodeString explodeBuf;
-
- UTextOffset explodePos = EMPTY; // Position in input buffer
- UTextOffset basePos = 0; // Position of last base in output string
- uint16_t baseIndex = 0; // Index of last base in "actions" array
- uint32_t classesSeen = 0; // Combining classes seen since last base
- uint16_t action;
-
- // Compatibility explosions have lower indices; skip them if necessary
- uint16_t minExplode = compat ? 0 : ComposeData::MAX_COMPAT;
- uint16_t minDecomp = compat ? 0 : DecompData::MAX_COMPAT;
-
- UTextOffset i = 0;
- while (i < source.size() || explodePos != EMPTY) {
- // Get the next char from either the buffer or the source
- UChar ch;
- if (explodePos == EMPTY) {
- ch = source[i++];
- } else {
- ch = explodeBuf[explodePos++];
- if (explodePos >= explodeBuf.size()) {
- explodePos = EMPTY;
- explodeBuf.truncate(0);
- }
- }
-
- // Get the basic info for the character
- uint16_t charInfo = composeLookup(ch);
- uint16_t type = charInfo & ComposeData::TYPE_MASK;
- uint16_t index = charInfo >> ComposeData::INDEX_SHIFT;
-
- if (type == ComposeData::BASE) {
- classesSeen = 0;
- baseIndex = index;
- basePos = result.size();
- result += ch;
- }
- else if (type == ComposeData::COMBINING || type == ComposeData::NON_COMPOSING_COMBINING)
- {
- uint32_t cclass = ComposeData::typeMask[index];
-
- // We can only combine a character with the base if we haven't
- // already seen a combining character with the same canonical class.
- if (type == ComposeData::COMBINING && (classesSeen & cclass) == 0
- && (action = composeAction(baseIndex, index)) > 0)
- {
- if (action > ComposeData::MAX_COMPOSED) {
- // Pairwise explosion. Actions above this value are really
- // indices into an array that in turn contains indices
- // into the exploding string table
- // TODO: What if there are unprocessed chars in the explode buffer?
- UChar newBase = pairExplode(explodeBuf, action);
- explodePos = 0;
- result[basePos] = newBase;
-
- baseIndex = composeLookup(newBase) >> ComposeData::INDEX_SHIFT;
- } else {
- // Normal pairwise combination. Replace the base char
- UChar newBase = (UChar) action;
- result[basePos] = newBase;
-
- baseIndex = composeLookup(newBase) >> ComposeData::INDEX_SHIFT;
- }
- //
- // Since there are Unicode characters that cannot be combined in arbitrary
- // order, we have to re-process any combining marks that go with this
- // base character. There are only four characters in Unicode that have
- // this problem. If they are fixed in Unicode 3.0, this code can go away.
- //
- UTextOffset len = result.size();
- if (len - basePos > 1) {
- for (UTextOffset j = basePos+1; j < len; j++) {
- explodeBuf += result[j];
- }
- result.truncate(basePos+1);
- classesSeen = 0;
- if (explodePos == EMPTY) explodePos = 0;
- }
- } else {
- // No combination with this character
- bubbleAppend(result, ch, cclass);
- classesSeen |= cclass;
- }
- }
- else if (index > minExplode) {
- // Single exploding character
- explode(explodeBuf, index);
- explodePos = 0;
- }
- else if (type == ComposeData::HANGUL && minExplode == 0) {
- // If we're in compatibility mode we need to decompose Hangul to Jamo,
- // because some of the Jamo might have compatibility decompositions.
- hangulToJamo(ch, explodeBuf, minDecomp);
- explodePos = 0;
- }
- else if (type == ComposeData::INITIAL_JAMO) {
- classesSeen = 0;
- baseIndex = ComposeData::INITIAL_JAMO_INDEX;
- basePos = result.size();
- result += ch;
- }
- else if (type == ComposeData::MEDIAL_JAMO && classesSeen == 0
- && baseIndex == ComposeData::INITIAL_JAMO_INDEX) {
- // If the last character was an initial jamo, we can combine it with this
- // one to create a Hangul character.
- uint16_t l = result[basePos] - JAMO_LBASE;
- uint16_t v = ch - JAMO_VBASE;
- result[basePos] = (UChar)(HANGUL_BASE + (l*JAMO_VCOUNT + v) * JAMO_TCOUNT);
-
- baseIndex = ComposeData::MEDIAL_JAMO_INDEX;
- }
- else if (type == ComposeData::FINAL_JAMO && classesSeen == 0
- && baseIndex == ComposeData::MEDIAL_JAMO_INDEX) {
- // If the last character was a medial jamo that we turned into Hangul,
- // we can add this character too.
- result[basePos] = (UChar)(result[basePos] + (ch - JAMO_TBASE));
-
- baseIndex = 0;
- basePos = -1;
- classesSeen = 0;
- } else {
- baseIndex = 0;
- basePos = -1;
- classesSeen = 0;
- result += ch;
- }
- }
- }
-
- /**
- * Compose starting with current input character and continuing
- * until just before the next base char.
- * <p>
- * <b>Input</b>:
- * <ul>
- * <li>underlying char iter points to first character to decompose
- * </ul>
- * <p>
- * <b>Output:</b>
- * <ul>
- * <li>returns first char of decomposition or DONE if at end
- * <li>Underlying char iter is pointing at next base char or past end
- * </ul>
- */
- UChar Normalizer::nextCompose()
- {
- UTextOffset explodePos = EMPTY; // Position in input buffer
- UTextOffset basePos = 0; // Position of last base in output string
- uint16_t baseIndex = 0; // Index of last base in "actions" array
- uint32_t classesSeen = 0; // Combining classes seen since last base
- uint16_t action;
- UChar lastBase = 0;
- bool_t chFromText = TRUE;
-
- // Compatibility explosions have lower indices; skip them if necessary
- uint16_t minExplode = (fMode & COMPAT_BIT) ? 0 : ComposeData::MAX_COMPAT;
- uint16_t minDecomp = (fMode & COMPAT_BIT) ? 0 : DecompData::MAX_COMPAT;
-
- initBuffer();
- explodeBuf.truncate(0);
-
- UChar ch = curForward();
-
- while (ch != DONE) {
- // Get the basic info for the character
- uint16_t charInfo = composeLookup(ch);
- uint16_t type = charInfo & ComposeData::TYPE_MASK;
- uint16_t index = charInfo >> ComposeData::INDEX_SHIFT;
-
- if (type == ComposeData::BASE) {
- if (buffer.size() > 0 && chFromText && explodePos == EMPTY) {
- // When we hit a base char in the source text, we can return the text
- // that's been composed so far. We'll re-process this char next time through.
- break;
- }
- classesSeen = 0;
- baseIndex = index;
- basePos = buffer.size();
- buffer += ch;
- lastBase = ch;
- }
- else if (type == ComposeData::COMBINING || type == ComposeData::NON_COMPOSING_COMBINING)
- {
- uint32_t cclass = ComposeData::typeMask[index];
-
- // We can only combine a character with the base if we haven't
- // already seen a combining character with the same canonical class.
- if (type == ComposeData::COMBINING && (classesSeen & cclass) == 0
- && (action = composeAction(baseIndex, index)) > 0)
- {
- if (action > ComposeData::MAX_COMPOSED) {
- // Pairwise explosion. Actions above this value are really
- // indices into an array that in turn contains indices
- // into the exploding string table
- // TODO: What if there are unprocessed chars in the explode buffer?
- UChar newBase = pairExplode(explodeBuf, action);
- explodePos = 0;
- buffer[basePos] = newBase;
-
- baseIndex = composeLookup(newBase) >> ComposeData::INDEX_SHIFT;
- lastBase = newBase;
- } else {
- // Normal pairwise combination. Replace the base char
- UChar newBase = (UChar) action;
- buffer[basePos] = newBase;
-
- baseIndex = composeLookup(newBase) >> ComposeData::INDEX_SHIFT;
- lastBase = newBase;
- }
- //
- // Since there are Unicode characters that cannot be combined in arbitrary
- // order, we have to re-process any combining marks that go with this
- // base character. There are only four characters in Unicode that have
- // this problem. If they are fixed in Unicode 3.0, this code can go away.
- //
- UTextOffset len = buffer.size();
- if (len - basePos > 1) {
- for (UTextOffset j = basePos+1; j < len; j++) {
- explodeBuf += buffer[j];
- }
- buffer.truncate(basePos+1);
- classesSeen = 0;
- if (explodePos == EMPTY) explodePos = 0;
- }
- } else {
- // No combination with this character
- bubbleAppend(buffer, ch, cclass);
- classesSeen |= cclass;
- }
- }
- else if (index > minExplode) {
- // Single exploding character
- explode(explodeBuf, index);
- explodePos = 0;
- }
- else if (type == ComposeData::HANGUL && minExplode == 0) {
- // If we're in compatibility mode we need to decompose Hangul to Jamo,
- // because some of the Jamo might have compatibility decompositions.
- hangulToJamo(ch, explodeBuf, minDecomp);
- explodePos = 0;
- }
- else if (type == ComposeData::INITIAL_JAMO) {
- if (buffer.size() > 0 && chFromText && explodePos == EMPTY) {
- // When we hit a base char in the source text, we can return the text
- // that's been composed so far. We'll re-process this char next time through.
- break;
- }
- classesSeen = 0;
- baseIndex = ComposeData::INITIAL_JAMO_INDEX;
- basePos = buffer.size();
- buffer += ch;
- }
- else if (type == ComposeData::MEDIAL_JAMO && classesSeen == 0
- && baseIndex == ComposeData::INITIAL_JAMO_INDEX) {
- // If the last character was an initial jamo, we can combine it with this
- // one to create a Hangul character.
- uint16_t l = buffer[basePos] - JAMO_LBASE;
- uint16_t v = ch - JAMO_VBASE;
- UChar newCh = (UChar)(HANGUL_BASE + (l*JAMO_VCOUNT + v) * JAMO_TCOUNT);
- buffer[basePos] = newCh;
-
- baseIndex = ComposeData::MEDIAL_JAMO_INDEX;
- }
- else if (type == ComposeData::FINAL_JAMO && classesSeen == 0
- && baseIndex == ComposeData::MEDIAL_JAMO_INDEX) {
- // If the last character was a medial jamo that we turned into Hangul,
- // we can add this character too.
- UChar newCh = (UChar)(buffer[basePos] + (ch - JAMO_TBASE));
- buffer[basePos] = newCh;
-
- baseIndex = 0;
- basePos = -1;
- classesSeen = 0;
- } else {
- // TODO: deal with JAMO character types
- baseIndex = 0;
- basePos = -1;
- classesSeen = 0;
- buffer += ch;
- }
-
- if (explodePos == EMPTY) {
- ch = text->next();
- chFromText = TRUE;
- } else {
- ch = explodeBuf[explodePos++];
- if (explodePos >= explodeBuf.size()) {
- explodePos = EMPTY;
- explodeBuf.truncate(0);
- }
- chFromText = FALSE;
- }
- }
- if (buffer.size() > 0) {
- bufferLimit = buffer.size() - 1;
- ch = buffer[0];
- } else {
- ch = DONE;
- bufferLimit = 0;
- }
- return ch;
- }
-
- /**
- * Compose starting with the input UChar just before the current position
- * and continuing backward until (and including) the previous base char.
- * <p>
- * <b>Input</b>:
- * <ul>
- * <li>underlying char iter points just after last char to decompose
- * </ul>
- * <p>
- * <b>Output:</b>
- * <ul>
- * <li>returns last char of resulting decomposition sequence
- * <li>underlying iter points to lowest-index char we decomposed, i.e. the base char
- * </ul>
- */
- UChar Normalizer::prevCompose()
- {
- UErrorCode status = U_ZERO_ERROR;
- initBuffer();
-
- // Slurp up characters until we hit a base char or an initial Jamo
- UChar ch;
- while ((ch = curBackward()) != DONE) {
- insert(buffer, 0, ch);
-
- // Get the basic info for the character
- uint16_t charInfo = composeLookup(ch);
- uint16_t type = charInfo & ComposeData::TYPE_MASK;
-
- if (type == ComposeData::BASE || type == ComposeData::HANGUL
- || type == ComposeData::INITIAL_JAMO || type == ComposeData::IGNORE)
- {
- break;
- }
- }
- // If there's more than one character in the buffer, compose it all at once....
- if (buffer.size() > 0) {
- // TODO: The performance of this is awful; add a way to compose
- // a UnicodeString& in place.
- UnicodeString composed;
- compose(buffer, (fMode & COMPAT_BIT), fOptions, composed, status);
- buffer.truncate(0);
- buffer += composed;
-
- if (buffer.size() > 1) {
- bufferLimit = bufferPos = buffer.size() - 1;
- ch = buffer[bufferPos];
- } else {
- ch = buffer[0];
- }
- }
- else {
- ch = DONE;
- }
-
- return ch;
- }
-
- void Normalizer::bubbleAppend(UnicodeString& target, UChar ch, uint32_t cclass) {
- UTextOffset i;
- for (i = target.size() - 1; i > 0; --i) {
- uint32_t iClass = getComposeClass(target[i]);
-
- if (iClass == 1 || iClass <= cclass) { // 1 means combining class 0
- // We've hit something we can't bubble this character past, so insert here
- break;
- }
- }
- // We need to insert just after character "i"
- insert(target, i+1, ch);
- }
-
-
- uint32_t Normalizer::getComposeClass(UChar ch) {
- uint32_t cclass = 0;
- uint16_t charInfo = composeLookup(ch);
- uint16_t type = charInfo & ComposeData::TYPE_MASK;
- if (type == ComposeData::COMBINING || type == ComposeData::NON_COMPOSING_COMBINING) {
- cclass = ComposeData::typeMask[charInfo >> ComposeData::INDEX_SHIFT];
- }
- return cclass;
- }
-
- uint16_t Normalizer::composeLookup(UChar ch) {
- return ucmp16_getu(ComposeData::lookup, ch);
- }
-
- uint16_t Normalizer::composeAction(uint16_t baseIndex, uint16_t comIndex)
- {
- return ucmp16_getu(ComposeData::actions,
- ((UChar)(baseIndex + ComposeData::MAX_BASES*comIndex)));
- }
-
- void Normalizer::explode(UnicodeString& target, uint16_t index) {
- UChar ch;
- while ((ch = ComposeData::replace[index++]) != 0)
- target += ch;
- }
-
- UChar Normalizer::pairExplode(UnicodeString& target, uint16_t action) {
- uint16_t index = ComposeData::actionIndex[action - ComposeData::MAX_COMPOSED];
- explode(target, index + 1);
- return ComposeData::replace[index]; // New base char
- }
-
- //-------------------------------------------------------------------------
- // Decompose methods
- //-------------------------------------------------------------------------
-
- void
- Normalizer::decompose(const UnicodeString& source,
- bool_t compat,
- int32_t options,
- UnicodeString& result,
- UErrorCode &status)
- {
- if (U_FAILURE(status)) {
- return;
- }
- bool_t hangul = (options & IGNORE_HANGUL) == 0;
- uint16_t limit = compat ? 0 : DecompData::MAX_COMPAT;
-
- result.truncate(0);
-
- for (UTextOffset i = 0; i < source.size(); ++i) {
- UChar ch = source[i];
-
- uint16_t offset = ucmp16_getu(DecompData::offsets, ch);
-
-
- if (offset > limit) {
- doAppend(DecompData::contents, offset, result);
- } else if (ch >= HANGUL_BASE && ch < HANGUL_LIMIT && hangul) {
- hangulToJamo(ch, result, limit);
- } else {
- result += ch;
- }
- }
- fixCanonical(result);
- }
-
- /**
- * Decompose starting with current input character and continuing
- * until just before the next base char.
- * <p>
- * <b>Input</b>:
- * <ul>
- * <li>underlying char iter points to first character to decompose
- * </ul>
- * <p>
- * <b>Output:</b>
- * <ul>
- * <li>returns first char of decomposition or DONE if at end
- * <li>Underlying char iter is pointing at next base char or past end
- * </ul>
- */
- UChar Normalizer::nextDecomp()
- {
- bool_t hangul = ((fOptions & IGNORE_HANGUL) == 0);
- UChar ch = curForward();
-
- uint16_t offset = ucmp16_getu(DecompData::offsets, ch);
-
- if (offset > minDecomp || ucmp8_get(DecompData::canonClass, ch) != DecompData::BASE)
- {
- initBuffer();
-
- if (offset > minDecomp) {
- doAppend(DecompData::contents, offset, buffer);
- } else {
- buffer += ch;
- }
- bool_t needToReorder = FALSE;
-
- // Any other combining chacters that immediately follow the decomposed
- // character must be included in the buffer too, because they're
- // conceptually part of the same logical character.
- //
- // TODO: Might these need to be decomposed too?
- // (i.e. are there non-BASE characters with decompositions?
- //
- while ((ch = text->next()) != DONE
- && ucmp8_get(DecompData::canonClass, ch) != DecompData::BASE)
- {
- needToReorder = TRUE;
- buffer += ch;
- }
-
- if (buffer.size() > 1 && needToReorder) {
- // If there is more than one combining character in the buffer,
- // put them into the canonical order.
- // But we don't need to sort if only characters are the ones that
- // resulted from decomosing the base character.
- fixCanonical(buffer);
- }
- bufferLimit = buffer.size() - 1;
- ch = buffer[0];
- } else {
- // Just use this character, but first advance to the next one
- text->next();
-
- // Do Hangul -> Jamo decomposition if necessary
- if (hangul && ch >= HANGUL_BASE && ch < HANGUL_LIMIT) {
- initBuffer();
- hangulToJamo(ch, buffer, minDecomp);
- bufferLimit = buffer.size() - 1;
- ch = buffer[0];
- }
- }
- return ch;
- }
-
-
- /**
- * Decompose starting with the input char just before the current position
- * and continuing backward until (and including) the previous base char.
- * <p>
- * <b>Input</b>:
- * <ul>
- * <li>underlying char iter points just after last char to decompose
- * </ul>
- * <p>
- * <b>Output:</b>
- * <ul>
- * <li>returns last char of resulting decomposition sequence
- * <li>underlying iter points to lowest-index char we decomposed, i.e. the base char
- * </ul>
- */
- UChar Normalizer::prevDecomp() {
- bool_t hangul = (fOptions & IGNORE_HANGUL) == 0;
-
- UChar ch = curBackward();
-
- uint16_t offset = ucmp16_getu(DecompData::offsets, ch);
-
- if (offset > minDecomp || ucmp8_get(DecompData::canonClass, ch) != DecompData::BASE)
- {
- initBuffer();
-
- // Slurp up any combining characters till we get to a base char.
- while (ch != DONE && ucmp8_get(DecompData::canonClass, ch) != DecompData::BASE) {
- insert(buffer, 0, ch);
- ch = text->previous();
- }
-
- // Now decompose this base character
- offset = ucmp16_getu(DecompData::offsets, ch);
- if (offset > minDecomp) {
- doInsert(DecompData::contents, offset, buffer, 0);
- } else {
- // This is a base character that doesn't decompose
- // and isn't involved in reordering, so throw it back
- text->next();
- }
-
- if (buffer.size() > 1) {
- // If there is more than one combining character in the buffer,
- // put them into the canonical order.
- fixCanonical(buffer);
- }
- bufferLimit = bufferPos = buffer.size() - 1;
- ch = buffer[bufferPos];
- }
- else if (hangul && ch >= HANGUL_BASE && ch < HANGUL_LIMIT) {
- initBuffer();
- hangulToJamo(ch, buffer, minDecomp);
- bufferLimit = bufferPos = buffer.size() - 1;
- ch = buffer[bufferPos];
- }
- return ch;
- }
-
- uint8_t Normalizer::getClass(UChar ch) {
- return ucmp8_get(DecompData::canonClass, ch);
- }
-
- /**
- * Fixes the sorting sequence of non-spacing characters according to
- * their combining class. The algorithm is listed on p.3-11 in the
- * Unicode Standard 2.0. The table of combining classes is on p.4-2
- * in the Unicode Standard 2.0.
- * @param result the string to fix.
- */
- void Normalizer::fixCanonical(UnicodeString& result) {
- UTextOffset i = result.size() - 1;
- uint8_t currentType = getClass(result[i]);
- uint8_t lastType;
-
- for (--i; i >= 0; --i) {
- lastType = currentType;
- currentType = getClass(result[i]);
-
- //
- // a swap is presumed to be rare (and a double-swap very rare),
- // so don't worry about efficiency here.
- //
- if (currentType > lastType && lastType != DecompData::BASE) {
- // swap characters
- UChar temp = result[i];
- result[i] = result[i+1];
- result[i+1] = temp;
-
- // if not at end, backup (one further, to compensate for for-loop)
- if (i < result.size() - 2) {
- i += 2;
- }
- // reset type, since we swapped.
- currentType = getClass(result[i]);
- }
- }
- }
-
-
- //-------------------------------------------------------------------------
- // CharacterIterator overrides
- //-------------------------------------------------------------------------
-
- /**
- * Return the current character in the normalized text.
- */
- UChar Normalizer:: current() const
- {
- // TODO: make this method const and guarantee that currentChar is always set?
- Normalizer *nonConst = (Normalizer*)this;
-
- if (currentChar == DONE) {
- switch (fMode) {
- case NO_OP:
- nonConst->currentChar = text->current();
- break;
- case COMPOSE:
- case COMPOSE_COMPAT:
- nonConst->currentChar = nonConst->nextCompose();
- break;
- case DECOMP:
- case DECOMP_COMPAT:
- nonConst->currentChar = nonConst->nextDecomp();
- break;
- }
- }
- return currentChar;
- }
-
- /**
- * Return the first character in the normalized text. This resets
- * the <tt>Normalizer's</tt> position to the beginning of the text.
- */
- UChar Normalizer::first() {
- return setIndex(text->startIndex());
- }
-
- /**
- * Return the last character in the normalized text. This resets
- * the <tt>Normalizer's</tt> position to be just before the
- * the input text corresponding to that normalized character.
- */
- UChar Normalizer::last() {
- text->setIndex(text->endIndex());
-
- currentChar = DONE; // The current char hasn't been processed
- clearBuffer(); // The buffer is empty too
- return previous();
- }
-
- /**
- * Return the next character in the normalized text and advance
- * the iteration position by one. If the end
- * of the text has already been reached, {@link #DONE} is returned.
- */
- UChar Normalizer::next() {
- if (bufferPos < bufferLimit) {
- // There are output characters left in the buffer
- currentChar = buffer[++bufferPos];
- }
- else {
- bufferLimit = bufferPos = 0; // Buffer is now out of date
- switch (fMode) {
- case NO_OP:
- currentChar = text->next();
- break;
- case COMPOSE:
- case COMPOSE_COMPAT:
- currentChar = nextCompose();
- break;
- case DECOMP:
- case DECOMP_COMPAT:
- currentChar = nextDecomp();
- break;
- }
- }
- return currentChar;
- }
-
- /**
- * Return the previous character in the normalized text and decrement
- * the iteration position by one. If the beginning
- * of the text has already been reached, {@link #DONE} is returned.
- */
- UChar Normalizer::previous()
- {
- if (bufferPos > 0) {
- // There are output characters left in the buffer
- currentChar = buffer[--bufferPos];
- }
- else {
- bufferLimit = bufferPos = 0; // Buffer is now out of date
- switch (fMode) {
- case NO_OP:
- currentChar = text->previous();
- break;
- case COMPOSE:
- case COMPOSE_COMPAT:
- currentChar = prevCompose();
- break;
- case DECOMP:
- case DECOMP_COMPAT:
- currentChar = prevDecomp();
- break;
- }
- }
- return currentChar;
- }
-
- void Normalizer::reset()
- {
- text->setIndex(text->startIndex());
- currentChar = DONE; // The current char hasn't been processed
- clearBuffer(); // The buffer is empty too
- }
-
- /**
- * Set the iteration position in the input text that is being normalized
- * and return the first normalized character at that position.
- * <p>
- * <b>Note:</b> This method sets the position in the <em>input</em> text,
- * while {@link #next} and {@link #previous} iterate through characters
- * in the normalized <em>output</em>. This means that there is not
- * necessarily a one-to-one correspondence between characters returned
- * by <tt>next</tt> and <tt>previous</tt> and the indices passed to and
- * returned from <tt>setIndex</tt> and {@link #getIndex}.
- * <p>
- * @param index the desired index in the input text.
- *
- * @return the first normalized character that is the result of iterating
- * forward starting at the given index.
- *
- * @throws IllegalArgumentException if the given index is less than
- * {@link #getBeginIndex} or greater than {@link #getEndIndex}.
- */
- UChar Normalizer::setIndex(UTextOffset index)
- {
- text->setIndex(index); // Checks range
- currentChar = DONE; // The current char hasn't been processed
- clearBuffer(); // The buffer is empty too
-
- return current();
- }
-
- /**
- * Retrieve the current iteration position in the input text that is
- * being normalized. This method is useful in applications such as
- * searching, where you need to be able to determine the position in
- * the input text that corresponds to a given normalized output character.
- * <p>
- * <b>Note:</b> This method sets the position in the <em>input</em>, while
- * {@link #next} and {@link #previous} iterate through characters in the
- * <em>output</em>. This means that there is not necessarily a one-to-one
- * correspondence between characters returned by <tt>next</tt> and
- * <tt>previous</tt> and the indices passed to and returned from
- * <tt>setIndex</tt> and {@link #getIndex}.
- *
- */
- UTextOffset Normalizer::getIndex() const {
- return text->getIndex();
- }
-
- /**
- * Retrieve the index of the start of the input text. This is the begin index
- * of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the <tt>String</tt>
- * over which this <tt>Normalizer</tt> is iterating
- */
- UTextOffset Normalizer::startIndex() const {
- return text->startIndex();
- }
-
- /**
- * Retrieve the index of the end of the input text. This is the end index
- * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
- * over which this <tt>Normalizer</tt> is iterating
- */
- UTextOffset Normalizer::endIndex() const {
- return text->endIndex();
- }
-
- //-------------------------------------------------------------------------
- // Property access methods
- //-------------------------------------------------------------------------
-
- void
- Normalizer::setMode(EMode newMode)
- {
- fMode = newMode;
- minDecomp = ((fMode & COMPAT_BIT) != 0) ? 0 : DecompData::MAX_COMPAT;
- }
-
- Normalizer::EMode
- Normalizer::getMode() const
- {
- return fMode;
- }
-
- void
- Normalizer::setOption(int32_t option,
- bool_t value)
- {
- if (value) {
- fOptions |= option;
- } else {
- fOptions &= (~option);
- }
- }
-
- bool_t
- Normalizer::getOption(int32_t option) const
- {
- return (fOptions & option) != 0;
- }
-
- /**
- * Set the input text over which this <tt>Normalizer</tt> will iterate.
- * The iteration position is set to the beginning of the input text.
- */
- void
- Normalizer::setText(const UnicodeString& newText,
- UErrorCode &status)
- {
- if (U_FAILURE(status)) {
- return;
- }
- CharacterIterator *newIter = new StringCharacterIterator(newText);
- if (newIter == NULL) {
- status = U_MEMORY_ALLOCATION_ERROR;
- return;
- }
- delete text;
- text = newIter;
- reset();
- }
-
- /**
- * Set the input text over which this <tt>Normalizer</tt> will iterate.
- * The iteration position is set to the beginning of the string.
- */
- void
- Normalizer::setText(const CharacterIterator& newText,
- UErrorCode &status)
- {
- if (U_FAILURE(status)) {
- return;
- }
- CharacterIterator *newIter = newText.clone();
- if (newIter == NULL) {
- status = U_MEMORY_ALLOCATION_ERROR;
- return;
- }
- delete text;
- text = newIter;
- reset();
- }
-
-
- /**
- * Copies the text under iteration into the UnicodeString referred to by "result".
- * @param result Receives a copy of the text under iteration.
- */
- void
- Normalizer::getText(UnicodeString& result)
- {
- text->getText(result);
- }
-
-
- //-------------------------------------------------------------------------
- // Private utility methods
- //-------------------------------------------------------------------------
-
-
- UChar Normalizer::curForward() {
- UChar ch = text->current();
- return ch;
- }
-
- UChar Normalizer::curBackward() {
- UChar ch = text->previous();
- return ch;
- }
-
- void Normalizer::doAppend(const UChar source[], uint16_t offset, UnicodeString& dest) {
- uint16_t index = offset >> STR_INDEX_SHIFT;
- uint16_t length = offset & STR_LENGTH_MASK;
-
- if (length == 0) {
- UChar ch;
- while ((ch = source[index++]) != 0x0000) {
- dest += ch;
- }
- } else {
- while (length-- > 0) {
- dest += source[index++];
- }
- }
- }
-
- void Normalizer::doInsert(const UChar source[], uint16_t offset, UnicodeString& dest, UTextOffset pos)
- {
- uint16_t index = offset >> STR_INDEX_SHIFT;
- uint16_t length = offset & STR_LENGTH_MASK;
-
- if (length == 0) {
- UChar ch;
- while ((ch = source[index++]) != 0x0000) {
- insert(dest, pos++, ch);
- }
- } else {
- while (length-- > 0) {
- insert(dest, pos++, source[index++]);
- }
- }
- }
-
- void Normalizer::initBuffer() {
- buffer.truncate(0);
- clearBuffer();
- }
-
- void Normalizer::clearBuffer() {
- bufferLimit = bufferPos = 0;
- }
-
- //-----------------------------------------------------------------------------
- // Hangul / Jamo conversion utilities for internal use
- // See section 3.10 of The Unicode Standard, v 2.0.
- //
- /**
- * Convert a single Hangul syllable into one or more Jamo characters.
- *
- * @param conjoin If TRUE, decompose Jamo into conjoining Jamo.
- */
- void Normalizer::hangulToJamo(UChar ch, UnicodeString& result, uint16_t decompLimit)
- {
- UChar sIndex = (UChar)(ch - HANGUL_BASE);
- UChar leading = (UChar)(JAMO_LBASE + sIndex / JAMO_NCOUNT);
- UChar vowel = (UChar)(JAMO_VBASE +
- (sIndex % JAMO_NCOUNT) / JAMO_TCOUNT);
- UChar trailing= (UChar)(JAMO_TBASE + (sIndex % JAMO_TCOUNT));
-
- jamoAppend(leading, decompLimit, result);
- jamoAppend(vowel, decompLimit, result);
- if (trailing != JAMO_TBASE) {
- jamoAppend(trailing, decompLimit, result);
- }
- }
-
- void Normalizer::jamoAppend(UChar ch, uint16_t decompLimit, UnicodeString& dest) {
- uint16_t offset = ucmp16_getu(DecompData::offsets, ch);
- if (offset > decompLimit) {
- doAppend(DecompData::contents, offset, dest);
- } else {
- dest += ch;
- }
- }
-
- void Normalizer::jamoToHangul(UnicodeString& buffer, UTextOffset start) {
- UTextOffset out = start;
- UTextOffset limit = buffer.size() - 1;
-
- UTextOffset in;
- uint16_t l, v, t;
-
- for (in = start; in < limit; in++) {
- UChar ch = buffer[in];
-
- if ((l = ch - JAMO_LBASE) >= 0 && l < JAMO_LCOUNT
- && (v = buffer[in+1] - JAMO_VBASE) >= 0 && v < JAMO_VCOUNT) {
- //
- // We've found a pair of Jamo characters to compose.
- // Snarf the Jamo vowel and see if there's also a trailing char
- //
- in++; // Snarf the Jamo vowel too.
-
- t = (in < limit) ? buffer.charAt(in+1) : 0;
- t -= JAMO_TBASE;
-
- if (t >= 0 && t < JAMO_TCOUNT) {
- in++; // Snarf the trailing consonant too
- } else {
- t = 0; // No trailing consonant
- }
- buffer[out++] = (UChar)((l*JAMO_VCOUNT + v) * JAMO_TCOUNT + t + HANGUL_BASE);
- } else {
- buffer[out++] = ch;
- }
- }
- while (in < buffer.size()) {
- buffer[out++] = buffer[in++];
- }
-
- buffer.truncate(out);
- }
-