AmigActive 6

home *** CD-ROM | disk | FTP | other *** search

/ AmigActive 6 / AACD06.ISO / AACD / Programming / ICU / src / icu / source / i18n / uniset.cpp < prev next >

Wrap

C/C++ Source or Header | 1999-10-26 | 40.2 KB | 1,142 lines

/* ********************************************************************** * Copyright (C) 1999 Alan Liu and others. All rights reserved. ********************************************************************** * Date Name Description * 10/20/99 alan Creation. ********************************************************************** */ #include "uniset.h" #include "parsepos.h" // N.B.: This mapping is different in ICU and Java const UnicodeString UnicodeSet::CATEGORY_NAMES( "CnLuLlLtLmLoMnMeMcNdNlNoZsZlZpCcCfCoCsPdPsPePcPoSmScSkSoPiPf"); /** * A cache mapping character category integers, as returned by * Unicode::getType(), to pairs strings. Entries are initially * zero length and are filled in on demand. */ UnicodeString* UnicodeSet::CATEGORY_PAIRS_CACHE = new UnicodeString[Unicode::GENERAL_TYPES_COUNT]; //---------------------------------------------------------------- // Debugging and testing //---------------------------------------------------------------- /** * Return the representation of this set as a list of character * ranges. Ranges are listed in ascending Unicode order. For * example, the set [a-zA-M3] is represented as "33AMaz". */ const UnicodeString& UnicodeSet::getPairs() const { return pairs; } //---------------------------------------------------------------- // Constructors &c //---------------------------------------------------------------- /** * Constructs an empty set. */ UnicodeSet::UnicodeSet() : pairs() {} /** * Constructs a set from the given pattern, optionally ignoring * white space. See the class description for the syntax of the * pattern language. * @param pattern a string specifying what characters are in the set * @param ignoreSpaces if <code>true</code>, all spaces in the * pattern are ignored, except those preceded by '\\'. Spaces are * those characters for which <code>Character.isSpaceChar()</code> * is <code>true</code>. * @exception <code>IllegalArgumentException</code> if the pattern * contains a syntax error. */ UnicodeSet::UnicodeSet(const UnicodeString& pattern, bool_t ignoreSpaces, UErrorCode& status) : pairs() { applyPattern(pattern, ignoreSpaces, status); } UnicodeSet::UnicodeSet(const UnicodeString& pattern, UErrorCode& status) : pairs() { applyPattern(pattern, status); } /** * Constructs a set from the given Unicode character category. * @param category an integer indicating the character category as * returned by <code>Character.getType()</code>. * @exception <code>IllegalArgumentException</code> if the given * category is invalid. */ UnicodeSet::UnicodeSet(int8_t category, UErrorCode& status) : pairs() { if (U_SUCCESS(status)) { if (category < 0 || category >= Unicode::GENERAL_TYPES_COUNT) { status = U_ILLEGAL_ARGUMENT_ERROR; } else { pairs = getCategoryPairs(category); } } } /** * Constructs a set that is identical to the given UnicodeSet. */ UnicodeSet::UnicodeSet(const UnicodeSet& o) : pairs(o.pairs) {} /** * Destructs the set. */ UnicodeSet::~UnicodeSet() {} /** * Assigns this object to be a copy of another. */ UnicodeSet& UnicodeSet::operator=(const UnicodeSet& o) { pairs = o.pairs; return *this; } /** * Compares the specified object with this set for equality. Returns * <tt>true</tt> if the two sets * have the same size, and every member of the specified set is * contained in this set (or equivalently, every member of this set is * contained in the specified set). * * @param o set to be compared for equality with this set. * @return <tt>true</tt> if the specified set is equal to this set. */ bool_t UnicodeSet::operator==(const UnicodeSet& o) const { return pairs == o.pairs; } /** * Returns the hash code value for this set. * * @return the hash code value for this set. * @see Object#hashCode() */ int32_t UnicodeSet::hashCode() const { return pairs.hashCode(); } //---------------------------------------------------------------- // Public API //---------------------------------------------------------------- /** * Modifies this set to represent the set specified by the given * pattern, optionally ignoring white space. See the class * description for the syntax of the pattern language. * @param pattern a string specifying what characters are in the set * @param ignoreSpaces if <code>true</code>, all spaces in the * pattern are ignored. Spaces are those characters for which * <code>Character.isSpaceChar()</code> is <code>true</code>. * Characters preceded by '\\' are escaped, losing any special * meaning they otherwise have. Spaces may be included by * escaping them. * @exception <code>IllegalArgumentException</code> if the pattern * contains a syntax error. */ void UnicodeSet::applyPattern(const UnicodeString& pattern, bool_t ignoreSpaces, UErrorCode& status) { if (U_FAILURE(status)) { return; } ParsePosition pos(0); UnicodeString* pat = (UnicodeString*) &pattern; // To ignore spaces, create a new pattern without spaces. We // have to process all '\' escapes. If '\' is encountered, // insert it and the following character (if any -- let parse // deal with any syntax errors) in the pattern. This allows // escaped spaces. if (ignoreSpaces) { pat = new UnicodeString(); for (int32_t i=0; i<pattern.length(); ++i) { UChar c = pattern.charAt(i); if (Unicode::isSpaceChar(c)) { continue; } if (c == '\\' && (i+1) < pattern.length()) { pat->append(c); c = pattern.charAt(++i); // Fall through and append the following char } pat->append(c); } } parse(pairs, *pat, pos, status); if (pos.getIndex() != pat->length()) { status = U_ILLEGAL_ARGUMENT_ERROR; } if (pat != &pattern) { delete pat; } } /** * Returns a string representation of this set. If the result of * calling this function is passed to a UnicodeSet constructor, it * will produce another set that is equal to this one. */ UnicodeString& UnicodeSet::toPattern(UnicodeString& result) const { result.remove().append((UChar)'['); // iterate through the ranges in the UnicodeSet for (int32_t i=0; i<pairs.length(); i+=2) { // for a range with the same beginning and ending point, // output that character, otherwise, output the start and // end points of the range separated by a dash result.append(pairs.charAt(i)); if (pairs.charAt(i) != pairs.charAt(i+1)) { result.append((UChar)'-').append(pairs.charAt(i+1)); } } return result.append((UChar)']'); } /** * Returns the number of elements in this set (its cardinality), * n, where <code>0 <= </code>n<code> <= 65536</code>. * * @return the number of elements in this set (its cardinality). */ int32_t UnicodeSet::size() const { int32_t n = 0; for (int32_t i=0; i<pairs.length(); i+=2) { n += pairs.charAt(i+1) - pairs.charAt(i) + 1; } return n; } /** * Returns <tt>true</tt> if this set contains no elements. * * @return <tt>true</tt> if this set contains no elements. */ bool_t UnicodeSet::isEmpty() const { return pairs.length() == 0; } /** * Returns <tt>true</tt> if this set contains the specified range * of chars. * * @return <tt>true</tt> if this set contains the specified range * of chars. */ bool_t UnicodeSet::contains(UChar first, UChar last) const { // Set i to the end of the smallest range such that its end // point >= last, or pairs.length() if no such range exists. int32_t i = 1; while (i<pairs.length() && last>pairs.charAt(i)) i+=2; return i<pairs.length() && first>=pairs.charAt(i-1); } /** * Returns <tt>true</tt> if this set contains the specified char. * * @return <tt>true</tt> if this set contains the specified char. */ bool_t UnicodeSet::contains(UChar c) const { return contains(c, c); } /** * Adds the specified range to this set if it is not already * present. If this set already contains the specified range, * the call leaves this set unchanged. If <code>last > first</code> * then an empty range is added, leaving the set unchanged. * * @param first first character, inclusive, of range to be added * to this set. * @param last last character, inclusive, of range to be added * to this set. */ void UnicodeSet::add(UChar first, UChar last) { if (first <= last) { addPair(pairs, first, last); } } /** * Adds the specified character to this set if it is not already * present. If this set already contains the specified character, * the call leaves this set unchanged. */ void UnicodeSet::add(UChar c) { add(c, c); } /** * Removes the specified range from this set if it is present. * The set will not contain the specified range once the call * returns. If <code>last > first</code> then an empty range is * removed, leaving the set unchanged. * * @param first first character, inclusive, of range to be removed * from this set. * @param last last character, inclusive, of range to be removed * from this set. */ void UnicodeSet::remove(UChar first, UChar last) { if (first <= last) { removePair(pairs, first, last); } } /** * Removes the specified character from this set if it is present. * The set will not contain the specified range once the call * returns. */ void UnicodeSet::remove(UChar c) { remove(c, c); } /** * Returns <tt>true</tt> if the specified set is a subset * of this set. * * @param c set to be checked for containment in this set. * @return <tt>true</tt> if this set contains all of the elements of the * specified set. */ bool_t UnicodeSet::containsAll(const UnicodeSet& c) const { // The specified set is a subset if all of its pairs are contained // in this set. int32_t i = 1; for (int32_t j=0; j<c.pairs.length(); j+=2) { UChar last = c.pairs.charAt(j+1); // Set i to the end of the smallest range such that its // end point >= last, or pairs.length() if no such range // exists. while (i<pairs.length() && last>pairs.charAt(i)) i+=2; if (i>pairs.length() || c.pairs.charAt(j) < pairs.charAt(i-1)) { return FALSE; } } return TRUE; } /** * Adds all of the elements in the specified set to this set if * they're not already present. This operation effectively * modifies this set so that its value is the union of the two * sets. The behavior of this operation is unspecified if the specified * collection is modified while the operation is in progress. * * @param c set whose elements are to be added to this set. * @see #add(char, char) */ void UnicodeSet::addAll(const UnicodeSet& c) { doUnion(pairs, c.pairs); } /** * Retains only the elements in this set that are contained in the * specified set. In other words, removes from this set all of * its elements that are not contained in the specified set. This * operation effectively modifies this set so that its value is * the intersection of the two sets. * * @param c set that defines which elements this set will retain. */ void UnicodeSet::retainAll(const UnicodeSet& c) { doIntersection(pairs, c.pairs); } /** * Removes from this set all of its elements that are contained in the * specified set. This operation effectively modifies this * set so that its value is the asymmetric set difference of * the two sets. * * @param c set that defines which elements will be removed from * this set. */ void UnicodeSet::removeAll(const UnicodeSet& c) { doDifference(pairs, c.pairs); } /** * Inverts this set. This operation modifies this set so that * its value is its complement. This is equivalent to the pseudo code: * <code>this = new UnicodeSet("[\u0000-\uFFFF]").removeAll(this)</code>. */ void UnicodeSet::complement() { doComplement(pairs); } /** * Removes all of the elements from this set. This set will be * empty after this call returns. */ void UnicodeSet::clear() { pairs.remove(); } //---------------------------------------------------------------- // Implementation: Pattern parsing //---------------------------------------------------------------- /** * Parses the given pattern, starting at the given position. The * character at pattern.charAt(pos.getIndex()) must be '[', or the * parse fails. Parsing continues until the corresponding closing * ']'. If a syntax error is encountered between the opening and * closing brace, the parse fails. Upon return from a U_SUCCESSful * parse, the ParsePosition is updated to point to the character * following the closing ']', and a StringBuffer containing a * pairs list for the parsed pattern is returned. This method calls * itself recursively to parse embedded subpatterns. * * @param pattern the string containing the pattern to be parsed. * The portion of the string from pos.getIndex(), which must be a * '[', to the corresponding closing ']', is parsed. * @param pos upon entry, the position at which to being parsing. * The character at pattern.charAt(pos.getIndex()) must be a '['. * Upon return from a U_SUCCESSful parse, pos.getIndex() is either * the character after the closing ']' of the parsed pattern, or * pattern.length() if the closing ']' is the last character of * the pattern string. * @return a StringBuffer containing a pairs list for the parsed * substring of <code>pattern</code> * @exception IllegalArgumentException if the parse fails. */ UnicodeString& UnicodeSet::parse(UnicodeString& pairsBuf /*result*/, const UnicodeString& pattern, ParsePosition& pos, UErrorCode& status) { if (U_FAILURE(status)) { return pairsBuf; } bool_t invert = FALSE; pairsBuf.remove(); /** * Nodes: 0 - idle, waiting for '[' * 10 - like 11, but immediately after "[" or "[^" * 11 - awaiting x, "]", "[...]", or "[:...:]" * 21 - after x * 23 - after x- * * The parsing state machine moves from node 0 through zero or more * other nodes back to node 0, in a U_SUCCESSful parse. */ int32_t node = 0; UChar first = 0; int32_t i; /** * This loop iterates over the characters in the pattern. We * start at the position specified by pos. We exit the loop * when either a matching closing ']' is seen, or we read all * characters of the pattern. */ for (i=pos.getIndex(); i<pattern.length(); ++i) { UChar c = pattern.charAt(i); /** * Handle escapes here. If a character is escaped, then * it assumes its literal value. This is true for all * characters, both special characters and characters with * no special meaning. We also interpret '\\uxxxx' Unicode * escapes here. */ bool_t isLiteral = FALSE; if (c == '\\') { ++i; if (i < pattern.length()) { c = pattern.charAt(i); isLiteral = TRUE; if (c == 'u') { if ((i+4) >= pattern.length()) { status = U_ILLEGAL_ARGUMENT_ERROR; return pairsBuf; } c = (UChar)0x0000; for (int32_t j=(++i)+4; i<j; ++i) { // [sic] // TO DO: Change this to use Unicode::digit() // when that method exists. int32_t digit = /*Unicode::*/UnicodeSet::digit(pattern.charAt(i), 16); if (digit<0) { status = U_ILLEGAL_ARGUMENT_ERROR; return pairsBuf; } c = (UChar) ((c << 4) | digit); } --i; // Move i back to last parsed character } } else { status = U_ILLEGAL_ARGUMENT_ERROR; return pairsBuf; } } /** * Within this loop, we handle each of the four * conditions: '[', ']', '-', other. The first three * characters must not be escaped. */ /** * An opening bracket indicates either the first bracket * of the entire subpattern we are parsing, in which case * we are in node 0 and move into node 10. We also check * for an immediately following '^', indicating the * complement of the following pattern. ('^' is any other * position has no special meaning.) If we are not in * node 0, '[' represents a nested subpattern that must be * recursively parsed and checked for following operators * ('&' or '|'). If two nested subpatterns follow one * another with no operator, their union is formed, just * as with any other elements that follow one another * without intervening operator. The other thing we * handle here is the syntax "[:Xx:]" or "[:X:]" that * indicates a Unicode category or supercategory. */ if (!isLiteral && c == '[') { bool_t parseOp = FALSE; UChar d = charAfter(pattern, i); // "[:...:]" represents a character category if (d == ':') { if (node == 23) { status = U_ILLEGAL_ARGUMENT_ERROR; return pairsBuf; } if (node == 21) { addPair(pairsBuf, first, first); node = 11; } i += 2; int32_t j = pattern.indexOf(":]", i); if (j < 0) { status = U_ILLEGAL_ARGUMENT_ERROR; return pairsBuf; } UnicodeString categoryName; pattern.extract(i, j-i, categoryName); UnicodeString temp; doUnion(pairsBuf, getCategoryPairs(temp, categoryName, status)); if (U_FAILURE(status)) { return pairsBuf; } i = j+1; if (node == 10) { node = 11; parseOp = TRUE; } else if (node == 0) { break; } } else { if (node == 0) { node = 10; if (d == '^') { invert = TRUE; ++i; } } else { // Nested '[' pos.setIndex(i); UnicodeString subPairs; // Pairs for the nested [] doUnion(pairsBuf, parse(subPairs, pattern, pos, status)); if (U_FAILURE(status)) { return pairsBuf; } i = pos.getIndex() - 1; // Subtract 1 to point at ']' parseOp = TRUE; } } /** * parseOp is true after "[:...:]" or a nested * "[...]". It is false only after the final closing * ']'. If parseOp is true, we look past the closing * ']' to see if we have an operator character. If * so, we parse the subsequent "[...]" recursively, * then perform the operation. We do this in a loop * until there are no more operators. Note that this * means the operators have equal precedence and are * bound left-to-right. */ if (parseOp) { for (;;) { // Is the next character an operator? UChar op = charAfter(pattern, i); if (op == '-' || op == '&') { pos.setIndex(i+2); // Add 2 to point AFTER op UnicodeString rhs; parse(rhs, pattern, pos, status); if (U_FAILURE(status)) { return pairsBuf; } if (op == '-') { doDifference(pairsBuf, rhs); } else if (op == '&') { doIntersection(pairsBuf, rhs); } i = pos.getIndex() - 1; // - 1 to point at ']' } else { break; } } } } /** * A closing bracket can only be a closing bracket for * "[...]", since the closing bracket for "[:...:]" is * taken care of when the initial "[:" is seen. When we * see a closing bracket, we then know, if we were in node * 21 (after x) or 23 (after x-) that nothing more is * coming, and we add the last character(s) we saw to the * set. Note that a trailing '-' assumes its literal * meaning, just as a leading '-' after "[" or "[^". */ else if (!isLiteral && c == ']') { if (node == 0) { status = U_ILLEGAL_ARGUMENT_ERROR; return pairsBuf; } if (node == 21 || node == 23) { addPair(pairsBuf, first, first); if (node == 23) { addPair(pairsBuf, '-', '-'); } } node = 0; break; } /** * '-' has the following interpretations: 1. Within * "[...]", between two letters, it indicates a range. * 2. Between two nested bracket patterns, "[[...]-[...]", * it indicates asymmetric difference. 3. At the start of * a bracket pattern, "[-...]", "[^-...]", it indicates * the literal character '-'. 4. At the end of a bracket * pattern, "[...-]", it indicates the literal character * '-'. * * We handle cases 1 and 3 here. Cases 2 and 4 are * handled in the ']' parsing code. */ else if (!isLiteral && c == '-') { if (node == 10) { addPair(pairsBuf, c, c); // Handle "[-...]", "[^-...]" } else if (node == 21) { node = 23; } else { status = U_ILLEGAL_ARGUMENT_ERROR; return pairsBuf; } } /** * If we fall through to this point, we have a literal * character, either one that has been escaped with a * backslash, escaped with a backslash u, or that isn't * a special '[', ']', or '-'. * * Literals can either start a range "x-...", end a range, * "...-x", or indicate a single character "x". */ else { if (node == 10 || node == 11) { first = c; node = 21; } else if (node == 21) { addPair(pairsBuf, first, first); first = c; node = 21; } else if (node == 23) { if (c < first) { status = U_ILLEGAL_ARGUMENT_ERROR; return pairsBuf; } addPair(pairsBuf, first, c); node = 11; } else { status = U_ILLEGAL_ARGUMENT_ERROR; return pairsBuf; } } } if (node != 0) { status = U_ILLEGAL_ARGUMENT_ERROR; return pairsBuf; } /** * i indexes the last character we parsed or is * pattern.length(). In the latter case, the node will not be * zero, since we have run off the end without finding a * closing ']'. Therefore, the above statement will have * thrown an exception, and we'll never get here. If we get * here, we know i < pattern.length(), and we set the * ParsePosition to the next character to be parsed. */ pos.setIndex(i+1); /** * If we saw a '^' after the initial '[' of this pattern, then * perform the complement. (Inversion after '[:' is handled * elsewhere.) */ if (invert) { doComplement(pairsBuf); } return pairsBuf; } //---------------------------------------------------------------- // Implementation: Efficient in-place union & difference //---------------------------------------------------------------- /** * Performs a union operation: adds the range 'c'-'d' to the given * pairs list. The pairs list is modified in place. The result * is normalized (in order and as short as possible). For * example, addPair("am", 'l', 'q') => "aq". addPair("ampz", 'n', * 'o') => "az". */ void UnicodeSet::addPair(UnicodeString& pairs, UChar c, UChar d) { UChar a = 0; UChar b = 0; for (int32_t i=0; i<pairs.length(); i+=2) { UChar e = pairs.charAt(i); UChar f = pairs.charAt(i+1); if (e <= (d+1) && c <= (f+1)) { // Merge with this range f = (UChar) icu_max(d, f); // Check to see if we need to merge with the // subsequent range also. This happens if we have // "abdf" and are merging in "cc". We only need to // check on the right side -- never on the left. if ((i+2) < pairs.length() && pairs.charAt(i+2) == (f+1)) { f = pairs.charAt(i+3); pairs.remove(i+2, 2); } pairs.setCharAt(i, (UChar) icu_min(c, e)); pairs.setCharAt(i+1, f); return; } else if ((b+1) < c && (d+1) < e) { // Insert before this range c, then d pairs.insert(i, d); // d gets moved to i+1 by next insert pairs.insert(i, c); return; } a = e; b = f; } // If nothing else, fall through and append this new range to // the end. pairs.append(c).append(d); } /** * Performs an asymmetric difference: removes the range 'c'-'d' * from the pairs list. The pairs list is modified in place. The * result is normalized (in order and as short as possible). For * example, removePair("am", 'l', 'q') => "ak". * removePair("ampz", 'l', 'q') => "akrz". */ void UnicodeSet::removePair(UnicodeString& pairs, UChar c, UChar d) { // Iterate over pairs until we find a pair that overlaps // with the given range. for (int32_t i=0; i<pairs.length(); i+=2) { UChar b = pairs.charAt(i+1); if (b < c) { // Range at i is entirely before the given range, // since we have a-b < c-d. No overlap yet...keep // iterating. continue; } UChar a = pairs.charAt(i); if (d < a) { // Range at i is entirely after the given range; c-d < // a-b. Since ranges are in order, nothing else will // overlap. break; } // Once we get here, we know c <= b and d >= a. // rangeEdited is set to true if we have modified the // range a-b (the range at i) in place. bool_t rangeEdited = FALSE; if (c > a) { // If c is after a and before b, then we have overlap // of this sort: a--c==b--d or a--c==d--b, where a-b // and c-d are the ranges of interest. We need to // add the range a,c-1. pairs.setCharAt(i+1, (UChar)(c-1)); // i is already a rangeEdited = TRUE; } if (d < b) { // If d is after a and before b, we overlap like this: // c--a==d--b or a--c==d--b, where a-b is the range at // i and c-d is the range being removed. We need to // add the range d+1,b. if (rangeEdited) { // Insert {d+1, b} pairs.insert(i+2, b); // b moves to i+3 by next insert: pairs.insert(i+2, (UChar)(d+1)); i += 2; } else { pairs.setCharAt(i, (UChar)(d+1)); // i+1 is already b rangeEdited = TRUE; } } if (!rangeEdited) { // If we didn't add any ranges, that means the entire // range a-b must be deleted, since we have // c--a==b--d. pairs.remove(i, 2); i -= 2; } } } //---------------------------------------------------------------- // Implementation: Fundamental operators //---------------------------------------------------------------- /** * Changes the pairs list to represent the complement of the set it * currently represents. The pairs list will be normalized (in * order and in shortest possible form) if the original pairs list * was normalized. */ void UnicodeSet::doComplement(UnicodeString& pairs) { if (pairs.length() == 0) { pairs.append((UChar)0x0000).append((UChar)0xffff); return; } // Change each end to a start and each start to an end of the // gaps between the ranges. That is, 3-7 9-12 becomes x-2 8-8 // 13-x, where 'x' represents a range that must now be fixed // up. for (int32_t i=0; i<pairs.length(); i+=2) { pairs.setCharAt(i, (UChar) (pairs.charAt(i) - 1)); pairs.setCharAt(i+1, (UChar) (pairs.charAt(i+1) + 1)); } // Fix up the initial range, either by adding a start point of // U+0000, or by deleting the range altogether, if the // original range was U+0000 - x. if (pairs.charAt(0) == (UChar)0xFFFF) { pairs.remove(0, 1); } else { pairs.insert(0, (UChar)0x0000); } // Fix up the final range, either by adding an end point of // U+FFFF, or by deleting the range altogether, if the // original range was x - U+FFFF. if (pairs.charAt(pairs.length() - 1) == (UChar)0x0000) { pairs.remove(pairs.length() - 1); } else { pairs.append((UChar)0xFFFF); } } /** * Given two pairs lists, changes the first in place to represent * the union of the two sets. */ void UnicodeSet::doUnion(UnicodeString& c1, const UnicodeString& c2) { UnicodeString result; int32_t i = 0; int32_t j = 0; // consider all the characters in both strings while (i < c1.length() && j < c2.length()) { UChar ub; // the first character in the result is the lower of the // starting characters of the two strings, and "ub" gets // set to the upper bound of that range if (c1.charAt(i) < c2.charAt(j)) { result.append(c1.charAt(i)); ub = c1.charAt(++i); } else { result.append(c2.charAt(j)); ub = c2.charAt(++j); } // for as long as one of our two pointers is pointing to a range's // end point, or i is pointing to a character that is less than // "ub" plus one (the "plus one" stitches touching ranges together)... while (i % 2 == 1 || j % 2 == 1 || (i < c1.length() && c1.charAt(i) <= ub + 1)) { // advance i to the first character that is greater than // "ub" plus one while (i < c1.length() && c1.charAt(i) <= ub + 1) ++i; // if i points to the endpoint of a range, update "ub" // to that character, or if i points to the start of // a range and the endpoint of the preceding range is // greater than "ub", update "up" to _that_ character if (i % 2 == 1) ub = c1.charAt(i); else if (i > 0 && c1.charAt(i - 1) > ub) ub = c1.charAt(i - 1); // now advance j to the first character that is greater // that "ub" plus one while (j < c2.length() && c2.charAt(j) <= ub + 1) ++j; // if j points to the endpoint of a range, update "ub" // to that character, or if j points to the start of // a range and the endpoint of the preceding range is // greater than "ub", update "up" to _that_ character if (j % 2 == 1) ub = c2.charAt(j); else if (j > 0 && c2.charAt(j - 1) > ub) ub = c2.charAt(j - 1); } // when we finally fall out of this loop, we will have stitched // together a series of ranges that overlap or touch, i and j // will both point to starting points of ranges, and "ub" will // be the endpoint of the range we're working on. Write "ub" // to the result result.append(ub); // loop back around to create the next range in the result } // we fall out to here when we've exhausted all the characters in // one of the operands. We can append all of the remaining characters // in the other operand without doing any extra work. if (i < c1.length()) result.append(c1, i, LONG_MAX); if (j < c2.length()) result.append(c2, j, LONG_MAX); c1 = result; } /** * Given two pairs lists, changes the first in place to represent * the asymmetric difference of the two sets. */ void UnicodeSet::doDifference(UnicodeString& pairs, const UnicodeString& pairs2) { UnicodeString p2(pairs2); doComplement(p2); doIntersection(pairs, p2); } /** * Given two pairs lists, changes the first in place to represent * the intersection of the two sets. */ void UnicodeSet::doIntersection(UnicodeString& c1, const UnicodeString& c2) { UnicodeString result; int32_t i = 0; int32_t j = 0; int32_t oldI; int32_t oldJ; // iterate until we've exhausted one of the operands while (i < c1.length() && j < c2.length()) { // advance j until it points to a character that is larger than // the one i points to. If this is the beginning of a one- // character range, advance j to point to the end if (i < c1.length() && i % 2 == 0) { while (j < c2.length() && c2.charAt(j) < c1.charAt(i)) ++j; if (j < c2.length() && j % 2 == 0 && c2.charAt(j) == c1.charAt(i)) ++j; } // if j points to the endpoint of a range, save the current // value of i, then advance i until it reaches a character // which is larger than the character pointed at // by j. All of the characters we've advanced over (except // the one currently pointed to by i) are added to the result oldI = i; while (j % 2 == 1 && i < c1.length() && c1.charAt(i) <= c2.charAt(j)) ++i; result.append(c1, oldI, i-oldI); // if i points to the endpoint of a range, save the current // value of j, then advance j until it reaches a character // which is larger than the character pointed at // by i. All of the characters we've advanced over (except // the one currently pointed to by i) are added to the result oldJ = j; while (i % 2 == 1 && j < c2.length() && c2.charAt(j) <= c1.charAt(i)) ++j; result.append(c2, oldJ, j-oldJ); // advance i until it points to a character larger than j // If it points at the beginning of a one-character range, // advance it to the end of that range if (j < c2.length() && j % 2 == 0) { while (i < c1.length() && c1.charAt(i) < c2.charAt(j)) ++i; if (i < c1.length() && i % 2 == 0 && c2.charAt(j) == c1.charAt(i)) ++i; } } c1 = result; } //---------------------------------------------------------------- // Implementation: Generation of pairs for Unicode categories //---------------------------------------------------------------- /** * Returns a pairs string for the given category, given its name. * The category name must be either a two-letter name, such as * "Lu", or a one letter name, such as "L". One-letter names * indicate the logical union of all two-letter names that start * with that letter. Case is significant. If the name starts * with the character '^' then the complement of the given * character set is returned. * * Although individual categories such as "Lu" are cached, we do * not currently cache single-letter categories such as "L" or * complements such as "^Lu" or "^L". It would be easy to cache * these as well in a hashtable should the need arise. */ UnicodeString& UnicodeSet::getCategoryPairs(UnicodeString& result, const UnicodeString& catName, UErrorCode& status) { if (U_FAILURE(status)) { return result; } // The temporary cat is only really needed if invert is true. // TO DO: Allocate cat on the heap only if needed. UnicodeString cat(catName); bool_t invert = (catName.length() > 1 && catName.charAt(0) == '^'); if (invert) { cat.remove(0, 1); } result.remove(); // if we have two characters, search the category map for that // code and either construct and return a UnicodeSet from the // data in the category map or throw an exception if (cat.length() == 2) { int32_t i = CATEGORY_NAMES.indexOf(cat); if (i>=0 && i%2==0) { i /= 2; result = getCategoryPairs((int8_t)i); if (!invert) { return result; } } } else if (cat.length() == 1) { // if we have one character, search the category map for // codes beginning with that letter, and union together // all of the matching sets that we find (or throw an // exception if there are no matches) for (int32_t i=0; i<Unicode::GENERAL_TYPES_COUNT; ++i) { if (CATEGORY_NAMES.charAt(2*i) == cat.charAt(0)) { const UnicodeString& pairs = getCategoryPairs((int8_t)i); if (result.length() == 0) { result = pairs; } else { doUnion(result, pairs); } } } } if (result.length() == 0) { status = U_ILLEGAL_ARGUMENT_ERROR; return result; } if (invert) { doComplement(result); } return result; } /** * Returns a pairs string for the given category. This string is * cached and returned again if this method is called again with * the same parameter. */ const UnicodeString& UnicodeSet::getCategoryPairs(int8_t cat) { // In order to tell what cache entries are empty, we assume // every category specifies at least one character. Thus // pair lists in the cache that are empty are uninitialized. if (CATEGORY_PAIRS_CACHE[cat].length() == 0) { // Walk through all Unicode characters, noting the start // and end of each range for which Character.getType(c) // returns the given category integer. Since we are // iterating in order, we can simply append the resulting // ranges to the pairs string. UnicodeString& pairs = CATEGORY_PAIRS_CACHE[cat]; int32_t first = -1; int32_t last = -2; for (int32_t i=0; i<=0xFFFF; ++i) { if (Unicode::getType((UChar)i) == cat) { if ((last+1) == i) { last = i; } else { if (first >= 0) { pairs.append((UChar)first).append((UChar)last); } first = last = i; } } } if (first >= 0) { pairs.append((UChar)first).append((UChar)last); } } return CATEGORY_PAIRS_CACHE[cat]; } //---------------------------------------------------------------- // Implementation: Utility methods //---------------------------------------------------------------- /** * Returns the character after the given position, or '\uFFFF' if * there is none. */ UChar UnicodeSet::charAfter(const UnicodeString& str, int32_t i) { return ((++i) < str.length()) ? str.charAt(i) : (UChar)0xFFFF; } /** * TEMPORARY WORKAROUND UNTIL Unicode::digit() exists. * Return the digit value of the given UChar, or -1. The radix * value is ignored for now and hardcoded as 16. */ int8_t UnicodeSet::digit(UChar c, int8_t radix) { int32_t d = Unicode::digitValue(c); if (d < 0) { if (c >= (UChar)'a' && c <= (UChar)'f') { d = c - (UChar)('a' - 10); } else if (c >= (UChar)'A' && c <= (UChar)'F') { d = c - (UChar)('A' - 10); } } return (int8_t)d; }