home *** CD-ROM | disk | FTP | other *** search
- /**********************************************************************\
- *
- * DFA.C
- *
- * Constructs deterministic finite state automata from followpos.
- * Because of exponential space requirements on some input (and 64k
- * segmentation in PC !#?@#), a sort of lazy evaluation tecnique is used.
- * FIXTRAN first transitions are always calculated and kept in memory.
- * If more states are needed, they are computed while matching the
- * expression. Calculated transitions are stored in a cache. If some
- * transition is not in the fixed transition table and it is not found
- * from those states currently in cache then one cache state is deleted
- * and new one is added.
- *
- * Bugs: Things get conciderably slower, if more than NTRAN
- * states are needed.
- *
- * Author: Jarmo Ruuth 15-Mar-1988
- *
- * Copyright (C) 1988-90 by Jarmo Ruuth
- * May be freely copied for any non-commercial usage.
- \**********************************************************************/
-
- #include "system.h"
- #include "dfa.h"
-
- /**********************************************************************
- *
- * init_dfa
- *
- * Allocates state sets and transition table.
- */
- static bool init_dfa(void)
- {
- REG1 int i;
-
- for (i = 0; i < NTRAN; i++) {
- if ((rbuf.dstates[i] = calloc(1, sizeof(state_t))) == NULL)
- return FALSE;
- if ((rbuf.dstates[i]->set = calloc(1, rbuf.setsize)) == NULL)
- return FALSE;
- #ifdef FAST
- assert(rbuf.dstates[i] < MAXTRAN);
- #endif
- }
- if ((rbuf.tmpstate = malloc(rbuf.setsize)) == NULL)
- return FALSE;
- return TRUE;
- }
-
- /**********************************************************************
- *
- * is_accepting
- *
- * Tests if the given state set is accepting state. State is accepting
- * if right end marker position (at root-1) belongs to it.
- */
- #define is_accepting(_state_set) in_set(_state_set, rbuf.root-1)
-
- /**********************************************************************
- *
- * transition_to
- *
- * Returns the transition to the state specified by it's index.
- * In the fast version, it is the state pointer, or otherwise it
- * is index to the state.
- */
- #ifdef FAST
- #define transition_to(_index) ((tran_t)rbuf.dstates[_index])
- #else
- #define transition_to(_index) ((tran_t)(_index))
- #endif
-
- /**********************************************************************
- *
- * build_state
- *
- * First builds a new state set to the parameter 'state_set' by taking a
- * union of all those followpositions that belong to the current position
- * 'pos' and that contain 'inp'.
- * Then finds a state that has same set of states as 'state_set' and returns
- * transition to it. If there isn't equal position in dstates, returns
- * UNKNOWN. If RE_FULL is not specified and we have accepting state, returns
- * transition to the virtual ACCEPTING state.
- */
- static tran_t build_state(set_t* state_set, unsigned inp, state_t* state)
- {
- REG1 int i;
- REG2 set_t* dsset = state->set;
- REG3 node_t* tptr;
- REG4 int type;
-
- if (rbuf.ignorecase && islower(inp))
- inp = toupper(inp);
- set_clear(state_set, rbuf.setsize);
- /* build union of followpositions */
- for (i = rbuf.root; i >= 0; i--) {
- if (!in_set(dsset, i))
- continue;
- tptr = &rbuf.tree[i];
- type = tptr->type;
- if ((type == ID && tptr->val.item == inp)
- || (type == CLASS_ID && in_set(tptr->val.class, inp)))
- set_union(state_set, state_set, rbuf.follow_array[i],
- rbuf.setsize);
- }
- if (set_empty(state_set, rbuf.setsize))
- return rbuf.default_tran;
- if (!rbuf.full && is_accepting(state_set))
- return ACCEPTING;
- /* try to find state with same set of followpositions */
- for (i = rbuf.last_state; i >= 0; i--)
- if (set_equal(state_set, rbuf.dstates[i]->set, rbuf.setsize))
- return transition_to(i);
- if (is_accepting(state_set))
- return ACCEPTING;
- else
- return UNKNOWN;
- }
-
- /**********************************************************************
- *
- * add_state
- *
- * Adds new state to dstates in position 'state'. Initializes also
- * state transition table in dstates.
- */
- static void add_state(set_t* state_set, REG2 state_t* state)
- {
- REG1 int i;
- REG3 unsigned* langptr;
-
- /* set default transition */
- if (is_accepting(state_set))
- for (i = 0; i < NLANG; i++)
- state->tran[i] = ACCEPTING;
- else
- for (i = 0; i < NLANG; i++)
- state->tran[i] = rbuf.default_tran;
- state->tran[EOL] = state->tran[rbuf.eol];
- state->tran[rbuf.eol] = END_OF_LINE;
- if (rbuf.full || !is_accepting(state_set)) {
- i = rbuf.language[0];
- langptr = rbuf.language + i;
- while (i--)
- state->tran[*langptr--] = UNKNOWN;
- }
- set_copy(state->set, state_set, rbuf.setsize);
- }
-
- /**********************************************************************
- *
- * add_transition
- *
- * Adds transition to dtran. Handles also end of line transition if
- * transition isn't UNKNOWN. Returns transition, which can be changed
- * in the case of end of line transition.
- */
- static tran_t add_transition(REG2 state_t* state, REG3 int input,
- REG1 tran_t tran)
- {
- if (tran != UNKNOWN && input == rbuf.eol) {
- state->tran[EOL] = tran;
- tran = END_OF_LINE;
- }
- state->tran[input] = tran;
- if (rbuf.ignorecase && isalpha(input))
- state->tran[CHCASE(input)] = tran;
- return tran;
- }
-
- /**********************************************************************
- *
- * free_dfamem
- *
- * Frees all unnecessary memory.
- */
- static void free_dfamem(void)
- {
- d_free(&rbuf.tree[rbuf.root].firstpos);
- }
-
- #if FIXTRAN > 0 /* don't include unused code */
-
- /**********************************************************************
- *
- * unmarked
- *
- * Finds next unmarked state.
- */
- static state_t* unmarked(void)
- {
- REG1 int i;
-
- for (i = 0; i <= rbuf.last_state; i++)
- if (!rbuf.dstates[i]->marked)
- return rbuf.dstates[i];
- return NULL;
- }
-
- /**********************************************************************
- *
- * build_maxtran
- *
- * Constructs FIXTRAN first transitions to dtran.
- */
- static void build_maxtran(void)
- {
- REG1 int input;
- REG2 tran_t tran;
- REG3 int i;
- REG4 state_t* state;
- REG5 int state_cnt;
-
- state_cnt = 0;
- while (state_cnt++ < FIXTRAN && (state = unmarked()) != NULL) {
- state->marked = TRUE;
- for (i = 0; i < rbuf.language[0]; i++) {
- input = rbuf.language[i+1];
- if (input == EOL)
- continue;
- tran = build_state(rbuf.tmpstate, input, state);
- if (tran == UNKNOWN && rbuf.last_state < NTRAN-1) {
- rbuf.last_state++;
- tran = transition_to(rbuf.last_state);
- add_state(rbuf.tmpstate, state_of(tran));
- state_of(tran)->marked = FALSE;
- }
- add_transition(state, input, tran);
- }
- }
- }
-
- #endif /* FIXTRAN > 0 */
-
- /**********************************************************************
- *
- * dfa
- */
- bool dfa(void)
- {
- assert(MAXTRAN < UNKNOWN);
- if (!init_dfa()) {
- free_dfamem();
- return FALSE;
- }
- rbuf.last_state = 0;
- rbuf.default_tran = rbuf.search ? FIRST_STATE : STOPSTATE;
- /* create initial state */
- add_state(rbuf.tree[rbuf.root].firstpos, rbuf.dstates[0]);
- rbuf.dstates[0]->marked = FALSE;
- #if FIXTRAN > 0
- build_maxtran();
- #endif
- free_dfamem();
- return TRUE;
- }
-
- #ifdef TEST
- static int removed_tran_cnt = 0;
- #endif
-
- /**********************************************************************
- *
- * remove_transition
- *
- * Removes all transitions to given state.
- */
- static void remove_transition(tran_t tran)
- {
- REG3 int i;
- REG1 int j;
- REG2 tran_t* pos;
-
- #ifdef TEST
- ++removed_tran_cnt;
- #endif
- for (i = 0; i < NTRAN; i++) {
- pos = rbuf.dstates[i]->tran;
- for (j = 0; j < NLANG; j++, pos++) {
- if (*pos == tran)
- *pos = UNKNOWN;
- }
- }
- }
-
- /**********************************************************************
- *
- * free_cache_pos
- *
- * Returns free cache position. If cache is full, removes old one
- * and returns it as free position. For each state rbuf.dstates[state].marked
- * is used as a use counter for that state (i.e. how many transitions there
- * are to that state). When cache position is needed, the one with smallest
- * reference count is freed and returned. If there are more than one state with
- * the same reference count then the last one is taken.
- */
- static tran_t free_cache_pos(REG4 state_t* state)
- {
- REG1 unsigned i;
- REG2 unsigned mincnt;
- REG3 unsigned minpos;
- REG5 int minstate;
-
- if (rbuf.last_state < NTRAN-1) {
- rbuf.last_state++;
- return transition_to(rbuf.last_state);
- }
- mincnt = ~0; /* == maxint */
- minstate = max(FIXTRAN, 1); /* never remove state zero */
- /* find the removed state (the one with smallesr reference count) */
- for (i = rbuf.last_state; i >= minstate; i--) {
- if (mincnt > rbuf.dstates[i]->marked
- && rbuf.dstates[i] != state)
- {
- mincnt = rbuf.dstates[i]->marked;
- minpos = i;
- }
- }
- rbuf.dstates[minpos]->marked = 1;
- remove_transition(transition_to(minpos));
- return transition_to(minpos);
- }
-
- /**********************************************************************
- *
- * get_transition
- *
- * Gets real transition for UNKNOWN transition. This function is called
- * when matching the pattern.
- */
- tran_t get_transition(REG2 tran_t tran, REG3 unsigned input)
- {
- REG1 tran_t new_tran;
-
- new_tran = build_state(rbuf.tmpstate, input, state_of(tran));
- if (new_tran == UNKNOWN) {
- /* new state is not one of the old states */
- new_tran = free_cache_pos(state_of(tran));
- add_state(rbuf.tmpstate, state_of(new_tran));
- } else if (new_tran < MAXTRAN)
- state_of(new_tran)->marked++;
- return add_transition(state_of(tran), input, new_tran);
- }
-
- #ifdef TEST
-
- /**********************************************************************
- *
- * show_dfa_report
- */
- void show_dfa_report(void)
- {
- int i, j;
- long ntran, utran;
-
- ntran = utran = 0L;
- printf ("%u states of %u possible states used\n"
- "%u cache states deleted\n",
- rbuf.last_state+1,
- NTRAN,
- removed_tran_cnt);
- for (i = 0; i <= rbuf.last_state; i++)
- for (j = 0; j < rbuf.language[0]; j++)
- if (rbuf.dstates[i]->tran[rbuf.language[j+1]] == UNKNOWN)
- utran++;
- ntran = (rbuf.last_state+1)*rbuf.language[0];
- printf("%lu transitions of %lu possible unknown transitions (%lu%%) are unknown\n",
- utran, ntran, ntran == 0 ? 0 : utran*100/ntran);
- #ifdef __TURBOC__
- printf("%lu bytes memory free\n", (unsigned long)coreleft());
- #endif
- fflush(stdout);
- }
-
- /**********************************************************************
- *
- * print_accepting_states
- */
- void print_accepting_states(void)
- {
- REG1 int i;
-
- printf("Accepting states: ");
- for (i = 0; i <= rbuf.last_state; i++)
- if (is_accepting(rbuf.dstates[i]->set))
- printf("%d ", i);
- printf("\n");
- fflush(stdout);
- }
-
- /**********************************************************************
- *
- * char_image
- */
- char* char_image(int c)
- {
- static char s[3];
- char* s1;
-
- switch (c) {
- case REM:
- s1 = "REM";
- break;
- case EOL:
- s1 = "EOL";
- break;
- case BOL:
- s1 = "BOL";
- break;
- default:
- if (iscntrl(c)) {
- s[0] = '^';
- s[1] = '@' + c;
- s[2] = '\0';
- } else {
- s[0] = c;
- s[1] = '\0';
- }
- s1 = s;
- break;
- }
- return s1;
- }
-
- /**********************************************************************
- *
- * print_language
- */
- void print_language(void)
- {
- int i;
-
- printf(" ");
- for (i = 0; i < rbuf.language[0]; i++)
- printf("%3s ", char_image(rbuf.language[i+1]));
- printf("\n");
- fflush(stdout);
- }
-
- /**********************************************************************
- *
- * show_tran
- */
- static void show_tran(tran_t t)
- {
- int i;
-
- if (t == ACCEPTING)
- printf("%3s ", "Acc");
- else if (t == STOPSTATE)
- printf("%3s ", "Sto");
- else if (t == END_OF_LINE)
- printf("%3s ", "Eol");
- else if (t == UNKNOWN)
- printf("%3s ", "Unk");
- else {
- #ifdef FAST
- /* find index of the t */
- for (i = 0; i < NTRAN; i++)
- if (rbuf.dstates[i] == t)
- break;
- assert(i < NTRAN);
- #else
- i = (int)t;
- #endif
- printf("%3d ", i);
- }
- fflush(stdout);
- }
-
- /**********************************************************************
- *
- * print_dtran
- */
- void print_dtran(void)
- {
- int i, j;
-
- printf("Dstates for each state:\n");
- for (i = 0; i <= rbuf.last_state; i++) {
- printf("%2d:", i);
- for (j = 0; j <= rbuf.root; j++)
- if (in_set(rbuf.dstates[i]->set, j))
- printf(" %d", j);
- printf("\n");
- }
- printf("Transition table (default transition is ");
- show_tran(rbuf.default_tran);
- printf("):\n");
- print_language();
- for (i = 0; i <= rbuf.last_state; i++) {
- printf("%2d: ", i);
- for (j = 0; j < rbuf.language[0]; j++)
- show_tran(rbuf.dstates[i]->tran[rbuf.language[j+1]]);
- printf("\n");
- }
- fflush(stdout);
- }
-
- #endif /* TEST */
-