home *** CD-ROM | disk | FTP | other *** search
- /*
- **********************************************************************
- * Copyright (C) 1999 Alan Liu and others. All rights reserved.
- **********************************************************************
- * Date Name Description
- * 10/22/99 alan Creation.
- **********************************************************************
- */
-
- #include "rbbi.h"
- #include "rbbi_bld.h"
-
- /**
- * A token used as a character-category value to identify ignore characters.
- * handleNext() and handlePrevious() compare the result of lookupCategory()
- * against this value and skip the state-table lookup when it matches.
- */
- int8_t RuleBasedBreakIterator::IGNORE = -1;
-
- /**
- * The state number of the starting state. handleNext() and handlePrevious()
- * both initialize their local state variable to this value.
- */
- int16_t RuleBasedBreakIterator::START_STATE = 1;
-
- /**
- * The state-transition value indicating "stop". The scanning loops in
- * handleNext() and handlePrevious() exit once the state equals this value.
- */
- int16_t RuleBasedBreakIterator::STOP_STATE = 0;
-
- //=======================================================================
- // constructors
- //=======================================================================
-
- /**
-  * Constructs a RuleBasedBreakIterator according to the description
-  * provided. Normally, instead of constructing a RuleBasedBreakIterator
-  * directly, you'll use the factory methods on BreakIterator to create
-  * one indirectly from a description in the framework's resource files.
-  * You'd use this when you want special behavior not provided by the
-  * built-in iterators.
-  *
-  * NOTE(review): the earlier wording said a malformed description "throws
-  * an IllegalArgumentException". That is Java wording, and nothing in this
-  * file shows how Builder reports errors -- confirm against rbbi_bld.h.
-  *
-  * @param description The rule description. It is stored in this
-  *                    iterator's description member and also handed to
-  *                    the Builder.
-  */
- RuleBasedBreakIterator::RuleBasedBreakIterator(const UnicodeString& description) {
-     // "this" is a pointer in C++, so the member has to be reached with
-     // "->"; the qualification is needed because the parameter shadows
-     // the member of the same name.
-     this->description = description;
-
-     // the actual work is done by the Builder class
-     Builder builder;
-     builder.buildBreakIterator(*this, description);
- }
-
- //=======================================================================
- // boilerplate
- //=======================================================================
- /**
-  * Clones this iterator.
-  * @return A pointer to a RuleBasedBreakIterator created with "new" as a
-  * copy of this one, so that it has the same behavior as this iterator.
-  */
- RuleBasedBreakIterator* RuleBasedBreakIterator::clone() const {
-     RuleBasedBreakIterator* duplicate = new RuleBasedBreakIterator(*this);
-     return duplicate;
- }
-
- /**
-  * Returns true if both iterators have the same rules (equal descriptions)
-  * and iterate over the same text. The parameter type already limits the
-  * comparison to RuleBasedBreakIterators; no further class check is made.
-  * @param that The iterator to compare this one against.
-  * @return True when both the descriptions and the texts compare equal.
-  */
- bool_t RuleBasedBreakIterator::operator==(const RuleBasedBreakIterator& that) {
-     // "that" already has the right type, so its members are read directly;
-     // casting it by value would only copy-construct a temporary iterator
-     // for each comparison.
-     return description.equals(that.description)
-         && text.equals(that.text);
- }
-
- /**
-  * Returns the description used to create this iterator.
-  * @return The contents of this iterator's description member, by value.
-  */
- UnicodeString RuleBasedBreakIterator::toString() {
-     return this->description;
- }
-
- /**
-  * Compute a hashcode for this BreakIterator. The value is taken from the
-  * description alone; the text being iterated over does not contribute.
-  * @return A hash code
-  */
- int32_t RuleBasedBreakIterator::hashCode() {
-     return this->description.hashCode();
- }
-
- //=======================================================================
- // BreakIterator overrides
- //=======================================================================
- /**
-  * Sets the current iteration position to the beginning of the text.
-  * (i.e., the CharacterIterator's starting offset).
-  *
-  * NOTE(review): getText() is defined in this file as returning a
-  * CharacterIterator by value, so the repositioning below appears to act
-  * on a copy -- confirm against the declaration in rbbi.h.
-  *
-  * @return The offset of the beginning of the text.
-  */
- int32_t RuleBasedBreakIterator::first() {
-     CharacterIterator iter = getText();
-
-     // rewind, then report where the iterator ended up
-     iter.first();
-     const int32_t beginOffset = iter.getIndex();
-     return beginOffset;
- }
-
- /**
-  * Sets the current iteration position to the end of the text.
-  * (i.e., the CharacterIterator's ending offset).
-  *
-  * NOTE(review): getText() is defined in this file as returning a
-  * CharacterIterator by value, so the repositioning below appears to act
-  * on a copy -- confirm against the declaration in rbbi.h.
-  *
-  * @return The text's past-the-end offset.
-  */
- int32_t RuleBasedBreakIterator::last() {
-     CharacterIterator iter = getText();
-
-     // The original author observed that last() on the character iterator
-     // lands on the last character rather than on the past-the-end offset,
-     // so the index is set to getEndIndex() explicitly instead.
-     const int32_t endOffset = iter.getEndIndex();
-     iter.setIndex(endOffset);
-     return iter.getIndex();
- }
-
- /**
-  * Advances the iterator either forward or backward the specified number
-  * of steps. Positive counts step forward through handleNext(), negative
-  * counts step backward through previous(), and a count of zero simply
-  * reports the current position.
-  * @param n The number of steps to move. The sign indicates the direction
-  * (negative is backwards, and positive is forwards).
-  * @return The character offset of the boundary position n boundaries away
-  * from the current one.
-  */
- int32_t RuleBasedBreakIterator::next(int32_t n) {
-     int32_t position = current();
-
-     // forward steps: runs only while n is positive
-     for (; n > 0; --n) {
-         position = handleNext();
-     }
-
-     // backward steps: runs only while n is negative
-     for (; n < 0; ++n) {
-         position = previous();
-     }
-
-     return position;
- }
-
- /**
-  * Advances the iterator to the next boundary position by delegating to
-  * handleNext().
-  * @return The position of the first boundary after this one.
-  */
- int32_t RuleBasedBreakIterator::next() {
-     const int32_t boundary = handleNext();
-     return boundary;
- }
-
- /**
-  * Advances the iterator backwards, to the last boundary preceding this one.
-  * @return The position of the last boundary position preceding this one,
-  * or BreakIterator::DONE if the current position is already the beginning
-  * of the text.
-  */
- int32_t RuleBasedBreakIterator::previous() {
-     // if we're already sitting at the beginning of the text, return DONE
-     // (DONE is a class-scope constant, so it is named with "::")
-     CharacterIterator text = getText();
-     if (current() == text.getBeginIndex())
-         return BreakIterator::DONE;
-
-     // set things up. handlePrevious() will back us up to some valid
-     // break position before the current position (we back our internal
-     // iterator up one step to prevent handlePrevious() from returning
-     // the current position), but not necessarily the last one before
-     // where we started
-     int32_t start = current();
-     text.previous();
-     int32_t lastResult = handlePrevious();
-     int32_t result = lastResult;
-
-     // iterate forward from the known break position until we pass our
-     // starting point. The last break position before the starting
-     // point is our return value
-     while (result != BreakIterator::DONE && result < start) {
-         lastResult = result;
-         result = handleNext();
-     }
-
-     // set the current iteration position to be the last break position
-     // before where we started, and then return that value
-     text.setIndex(lastResult);
-     return lastResult;
- }
-
- /**
-  * Sets the iterator to refer to the first boundary position following
-  * the specified position.
-  * @param offset The position from which to begin searching for a break
-  * position.
-  * @return The position of the first break after the specified position,
-  * or BreakIterator::DONE if the offset is at or past the end of the text.
-  */
- int32_t RuleBasedBreakIterator::following(int32_t offset) {
-     // if the offset passed in is already at or past the end of the text,
-     // just return DONE (">=" so that an offset beyond the end is rejected
-     // too instead of being handed to setIndex())
-     CharacterIterator text = getText();
-     if (offset >= text.getEndIndex())
-         return BreakIterator::DONE;
-
-     // otherwise, set our internal iteration position (temporarily)
-     // to the position passed in. If this is the _beginning_ position,
-     // then we can just use next() to get our return value
-     text.setIndex(offset);
-     if (offset == text.getBeginIndex())
-         return handleNext();
-
-     // otherwise, we have to sync up first. Use handlePrevious() to back
-     // us up to a known break position before the specified position (if
-     // we can determine that the specified position is a break position,
-     // we don't back up at all). This may or may not be the last break
-     // position at or before our starting position. Advance forward
-     // from here until we've passed the starting position. The position
-     // we stop on will be the first break position after the specified one.
-     int32_t result = handlePrevious();
-     while (result != BreakIterator::DONE && result <= offset)
-         result = handleNext();
-     return result;
- }
-
- /**
-  * Sets the iterator to refer to the last boundary position before the
-  * specified position.
-  *
-  * NOTE(review): getText() is defined in this file as returning a
-  * CharacterIterator by value, so the setIndex() call below appears to act
-  * on a copy -- confirm against the declaration in rbbi.h.
-  *
-  * @param offset The position to begin searching for a break from.
-  * @return The position of the last boundary before the starting position.
-  */
- int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
-     // Move the iteration position to the caller's offset first; from
-     // there previous() performs the whole search.
-     CharacterIterator iter = getText();
-     iter.setIndex(offset);
-
-     return previous();
- }
-
- /**
-  * Returns true if the specified position is a boundary position. Unless
-  * the offset is the beginning of the text, the check goes through
-  * following(), so as a side effect the iterator is left wherever
-  * following(offset - 1) leaves it.
-  * @param offset the offset to check.
-  * @return True if "offset" is a boundary position.
-  */
- bool_t RuleBasedBreakIterator::isBoundary(int32_t offset) {
-     // the beginning of the text is always a boundary position. Compare
-     // against the text's begin index rather than a literal 0, so text
-     // whose iterator does not start at offset 0 is handled as well.
-     if (offset == getText().getBeginIndex())
-         return TRUE;
-
-     // otherwise, we can use following() on the position before the
-     // specified one and return true if the position we get back is the
-     // one the user specified
-     return following(offset - 1) == offset;
- }
-
- /**
-  * Returns the current iteration position, read from the index of the
-  * iterator that getText() returns.
-  * @return The current iteration position.
-  */
- int32_t RuleBasedBreakIterator::current() {
-     const int32_t position = getText().getIndex();
-     return position;
- }
-
- /**
- * Return a CharacterIterator over the text being analyzed. This version
- * of this method is meant to return the actual CharacterIterator we're
- * using internally. Changing the state of this iterator can have undefined
- * consequences. If you need to change it, clone it first.
- *
- * If no text has been set yet (text compares equal to 0), a
- * StringCharacterIterator over the empty string is allocated with "new"
- * and stored in text before returning.
- *
- * NOTE(review): text is tested against 0 and assigned from "new" here,
- * i.e. used like a pointer, while the declared return type is a
- * CharacterIterator by value and setText() calls text.first() on it like
- * an object. Confirm the member's real type in rbbi.h.
- *
- * @return An iterator over the text being analyzed.
- */
- CharacterIterator RuleBasedBreakIterator::getText() {
- // The iterator is initialized pointing to no text at all, so if this
- // function is called while we're in that state, we have to fudge
- // an iterator (over an empty string) to return.
- if (text == 0)
- text = new StringCharacterIterator("");
- return text;
- }
-
- /**
-  * Set the iterator to analyze a new piece of text. The new iterator is
-  * stored in the text member and then rewound with first(), which resets
-  * the current iteration position to the beginning of the text.
-  * @param newText An iterator over the text to analyze.
-  */
- void RuleBasedBreakIterator::setText(CharacterIterator newText) {
-     // adopt the caller's iterator, then rewind it
-     this->text = newText;
-     this->text.first();
- }
- //=======================================================================
- // implementation
- //=======================================================================
- /**
-  * This method is the actual implementation of the next() method. All
-  * iteration vectors through here. This method initializes the state
-  * machine to START_STATE and advances through the text character by
-  * character until we reach the end of the text or the state machine
-  * transitions to STOP_STATE. We update our return value every time the
-  * state machine passes through a possible end state.
-  * @return The offset of the boundary found, or BreakIterator::DONE if the
-  * iterator was already at the end of the text.
-  */
- int32_t RuleBasedBreakIterator::handleNext() {
-     // if we're already at the end of the text, return DONE.
-     // (DONE is a class-scope constant, so it is named with "::")
-     CharacterIterator text = getText();
-     if (text.getIndex() == text.getEndIndex())
-         return BreakIterator::DONE;
-
-     // no matter what, we always advance at least one character forward
-     int32_t result = text.getIndex() + 1;
-
-     // begin in the start state
-     int32_t state = START_STATE;
-     UChar c = text.current();
-
-     // loop until we reach the end of the text or transition to the stop state
-     while (c != CharacterIterator::DONE && state != STOP_STATE) {
-
-         // look up the current character's character category (which tells
-         // us which column in the state table to look at)
-         const int32_t category = lookupCategory(c);
-
-         // if the character isn't an ignore character, look up a state
-         // transition in the state table
-         if (category != IGNORE) {
-             state = lookupState(state, category);
-         }
-
-         // if the state we've just transitioned to is an accepting state,
-         // update our return value to be the current iteration position
-         if (endStates[state])
-             result = text.getIndex() + 1;
-         c = text.next();
-     }
-     text.setIndex(result);
-     return result;
- }
-
- /**
-  * This method backs the iterator back up to a "safe position" in the text.
-  * This is a position that we know, without any context, must be a break
-  * position. The various calling methods then iterate forward from this
-  * safe position to the appropriate position to return. (For more
-  * information, see the description of buildBackwardsStateTable() in
-  * RuleBasedBreakIterator.Builder.)
-  * @return The index the local iterator is left on after backing up.
-  */
- int32_t RuleBasedBreakIterator::handlePrevious() {
-     CharacterIterator text = getText();
-     int32_t state = START_STATE;
-     int32_t category = 0;
-     int32_t lastCategory = 0;
-     UChar c = text.current();
-
-     // loop until we reach the beginning of the text or transition to the
-     // stop state (DONE is a class-scope constant, so it is named with "::")
-     while (c != CharacterIterator::DONE && state != STOP_STATE) {
-
-         // save the last character's category and look up the current
-         // character's category
-         lastCategory = category;
-         category = lookupCategory(c);
-
-         // if the current character isn't an ignore character, look up a
-         // state transition in the backwards state table
-         if (category != IGNORE)
-             state = lookupBackwardState(state, category);
-
-         // then advance one character backwards
-         c = text.previous();
-     }
-
-     // if we didn't march off the beginning of the text, we're either one
-     // or two positions away from the real break position. (One because of
-     // the call to previous() at the end of the loop above, and another
-     // because the character that takes us into the stop state will always
-     // be the character BEFORE the break position.)
-     if (c != CharacterIterator::DONE) {
-         if (lastCategory != IGNORE)
-             text.setIndex(text.getIndex() + 2);
-         else
-             text.next();
-     }
-     return text.getIndex();
- }
-
- /**
-  * Looks up a character's category (i.e., its category for breaking
-  * purposes, not its Unicode category).
-  *
-  * NOTE(review): "UCharCategoryTable" is used here like an object although
-  * it reads like a type name; presumably it refers to the category-table
-  * member declared in rbbi.h -- confirm the intended name.
-  *
-  * @param c The character to classify.
-  * @return The value that elementAt() yields for c.
-  */
- int32_t RuleBasedBreakIterator::lookupCategory(UChar c) {
-     const int32_t category = UCharCategoryTable.elementAt(c);
-     return category;
- }
-
- /**
-  * Given a current state and a character category, looks up the
-  * next state to transition to in the state table. The table is indexed
-  * as a flat array: one row of numCategories cells per state.
-  * @param state The current state (selects the row).
-  * @param category The character category (selects the column).
-  * @return The table entry at that row and column.
-  */
- int32_t RuleBasedBreakIterator::lookupState(int32_t state, int32_t category) {
-     const int32_t cell = state * numCategories + category;
-     return stateTable[cell];
- }
-
- /**
-  * Given a current state and a character category, looks up the
-  * next state to transition to in the backwards state table. The table is
-  * indexed as a flat array: one row of numCategories cells per state.
-  * @param state The current state (selects the row).
-  * @param category The character category (selects the column).
-  * @return The table entry at that row and column.
-  */
- int32_t RuleBasedBreakIterator::lookupBackwardState(int32_t state, int32_t category) {
-     const int32_t cell = state * numCategories + category;
-     return backwardsStateTable[cell];
- }
-