home *** CD-ROM | disk | FTP | other *** search
- /**********************************************************************\
- *
- * DFAREGEX.C
- *
- * Deterministic finite automata regular expression matcher.
- *
- * Author: Jarmo Ruuth 15-Mar-1988
- *
- * Copyright (C) 1988-90 by Jarmo Ruuth
- * May be freely copied for any non-commercial usage.
- \**********************************************************************/
-
- #include "system.h"
- #include "dfa.h"
- #include "dfaregex.h"
-
- /*
- Regular expression buffer. It is defined statically here, so
- it is not possible to compile and use several regexps at the
- same time. The reason for the static buffer is that it makes
- things slightly faster on the PC. Using dynamic buffer would
- also require adding the regbuf_t* parameter to almost all
- function calls.
- */
- regbuf_t rbuf;
-
- #ifdef TEST
- extern int debug;
- #endif
-
- /**********************************************************************
- *
- * free_rbufmem
- *
- * Releases memory held by regular expression.
- */
- static void free_rbufmem(void)
- {
- REG1 int i;
-
- for (i = 0; i < NTRAN; i++) {
- d_free(&rbuf.dstates[i]->set);
- d_free(&rbuf.dstates[i]);
- }
- if (rbuf.follow_array != NULL) {
- for (i = 0; i <= rbuf.root; i++)
- d_free(&rbuf.follow_array[i]);
- d_free(&rbuf.follow_array);
- }
- if (rbuf.tree != NULL) {
- for (i = 0; i <= rbuf.root; i++) {
- if (rbuf.tree[i].type == CLASS_ID)
- d_free(&rbuf.tree[i].val.class);
- }
- d_free(&rbuf.tree);
- }
- d_free(&rbuf.regmust);
- d_free(&rbuf.language);
- d_free(&rbuf.tmpstate);
- rbuf.mem_allocated = FALSE;
- }
-
- /**********************************************************************
- *
- * reg_comp_eol
- *
- * Compiles regular expression. It is possible to specify end of line
- * characters. If the end of line characters equal, then only one end
- * of line character is used.
- */
- char* reg_comp_eol(REG1 char* regexp, char eol1, char eol2, int type_bits)
- {
- if (regexp == NULL)
- return "No regular expression";
- rbuf.reg_error = NULL;
- rbuf.eol = eol1;
- rbuf.eol2 = eol2;
- rbuf.neol = (eol1 == eol2 ? 1 : 2);
- rbuf.ignorecase = type_bits & RE_IGNORECASE;
- rbuf.search = type_bits & RE_SEARCH;
- rbuf.full = type_bits & RE_FULL;
- if (rbuf.mem_allocated)
- free_rbufmem();
- if (create_tree(regexp) == ERROR)
- return rbuf.reg_error;
- if (!find_regmust(rbuf.tree, rbuf.root)) {
- free_rbufmem();
- return rbuf.reg_error = "Out of memory when allocating regmust";
- }
- #ifdef TEST
- if (debug)
- printf("Regmust = '%s'\n", rbuf.regmust);
- #endif
- if (!calcpos()) {
- free_rbufmem();
- return rbuf.reg_error = "Out of memory in calcpos";
- }
- if (!dfa()) {
- free_rbufmem();
- return rbuf.reg_error = "Out of memory when creating dfa";
- }
- rbuf.mem_allocated = TRUE;
- #ifdef TEST
- if (debug > 1) {
- print_accepting_states();
- print_dtran();
- }
- #endif
- return rbuf.reg_error;
- }
-
- /**********************************************************************
- *
- * reg_comp
- *
- * Compiles regular expression with predefined end of line character
- * suitable for C's null-terminating string representation.
- */
- char* reg_comp(char* regexp, int type_bits)
- {
- return reg_comp_eol(regexp, '\0', '\0', type_bits);
- }
-
- /**********************************************************************
- *
- * reg_exec
- *
- * Executes regular expression to the string 's'. Return value is NULL,
- * if no match was found and pointer to the last matching char, if match
- * was found. Note that last matching char can be end of line character.
- *
- * Parameters:
- * Strbeg points to the beginning of the line.
- * Strend points to the first character after the string. We put
- * the end of line character to the *strend (like GNU grep),
- * so it should point to somewhere valid.
- * Linecnt points to the counter, that we increment every time we see
- * an end of line character, if it does not belong to the matched
- * line. If linecnt is NULL, it is not incremented.
- */
- char* reg_exec(char* strbeg, REG5 char* strend, REG6 ulong* linecnt)
- {
- REG1 uchar* s;
- #ifdef FAST
- REG2 tran_t cur_state = FIRST_STATE;
- REG3 tran_t prev_state;
- #else
- /* use unsigned state variables, because some compilers can't
- put char variables to registers (usually tran_t is char) */
- REG2 unsigned cur_state = FIRST_STATE;
- REG3 unsigned prev_state;
- #endif
- REG4 char save_char;
-
- s = strbeg;
- if (s == NULL || strend == NULL)
- return NULL;
- save_char = *strend;
- *strend = rbuf.eol;
- if (rbuf.begline) {
- check_bol:
- /* we need separate beginning-of-line check, because
- there are no real character for that position */
- prev_state = FIRST_STATE;
- cur_state = rbuf.dstates[0]->tran[BOL];
- if (cur_state == UNKNOWN)
- cur_state = get_transition(FIRST_STATE, BOL);
- if (cur_state == STOPSTATE)
- cur_state = FIRST_STATE; /* we might match later */
- else if (cur_state == ACCEPTING)
- s++; /* now we return correct pointer */
- }
- do {
- /* the inner loop */
- for (; cur_state < MAXTRAN; ) {
- prev_state = cur_state;
- cur_state = state_of(cur_state)->tran[*s++];
- }
- if (cur_state == UNKNOWN)
- cur_state = get_transition(prev_state, *(s-1));
- if (cur_state == END_OF_LINE) {
- cur_state = state_of(prev_state)->tran[EOL];
- if (cur_state == ACCEPTING)
- break; /* return match */
- if (linecnt)
- (*linecnt)++;
- /* if there are more than one eol-char, skip them */
- if (rbuf.neol > 1 && *s == rbuf.eol2)
- s++;
- if (s > strend || cur_state == STOPSTATE)
- break; /* return NULL */
- if (rbuf.begline)
- goto check_bol;
- cur_state = FIRST_STATE;
- }
- } while (cur_state < MAXTRAN);
- *strend = save_char;
- return cur_state == ACCEPTING ? s-1 : NULL;
- }
-