home *** CD-ROM | disk | FTP | other *** search
- /*
- * Bawk regular expression compiler/interpreter
- */
- #include <stdio.h>
- #include "bawk.h"
-
- int re_compile( patbuf )
- char *patbuf; /* where to put compiled pattern */
- {
- /*
- * Compile a regular expression from current input file
- * into the given pattern buffer.
- */
- register int c, /* Current character */
- o; /* Temp */
- register char delim, /* pattern delimiter */
- *patptr, /* destination string pntr */
- *lp, /* Last pattern pointer */
- *spp; /* Save beginning of pattern */
- char *cclass(); /* Compile class routine */
-
- DBUG_ENTER("re_compile");
- lp = patptr = patbuf;
- delim = getcharacter();
-
- while ( (c = getcharacter()) != -1 && c != delim )
- {
- /*
- * STAR, PLUS and MINUS are special.
- */
- if (c == '*' || c == '+' || c == '-') {
- if (patptr == patbuf ||
- (o=patptr[-1]) == BOL ||
- o == EOL ||
- o == STAR ||
- o == PLUS ||
- o == MINUS)
- error( "illegal occurrance op", RE_ERROR );
- *patptr++ = ENDPAT;
- *patptr++ = ENDPAT;
- spp = patptr; /* Save pattern end */
- while (--patptr > lp) /* Move pattern down... */
- *patptr = patptr[-1]; /* one byte */
- *patptr = (c == '*') ? STAR :
- (c == '-') ? MINUS : PLUS;
- patptr = spp; /* Restore pattern end */
- continue;
- }
- /*
- * All the rest.
- */
- lp = patptr; /* Remember start */
- switch(c) {
-
- case '^':
- *patptr++ = BOL;
- break;
-
- case '$':
- *patptr++ = EOL;
- break;
-
- case '.':
- *patptr++ = ANY;
- break;
-
- case '[':
- patptr = cclass( patptr );
- break;
-
- case ':':
- if ( (c=getcharacter()) != -1 )
- {
- switch( tolower( c ) )
- {
-
- case 'a':
- *patptr++ = ALPHA;
- break;
-
- case 'd':
- *patptr++ = DIGIT;
- break;
-
- case 'n':
- *patptr++ = NALPHA;
- break;
-
- case ' ':
- *patptr++ = PUNCT;
- break;
-
- default:
- error( "unknown ':' type", RE_ERROR );
-
- }
- }
- else
- error( "no ':' type", RE_ERROR );
- break;
-
- case '\\':
- c = getcharacter();
-
- default:
- *patptr++ = CHAR;
- *patptr++ = c;
- }
- }
- *patptr++ = ENDPAT;
- *patptr++ = 0; /* Terminate string */
-
- DBUG_EXECUTE("re_match",re_debug(patbuf,patptr););
- DBUG_RETURN(patptr - patbuf);
- }
-
- #ifndef DBUG_OFF
- re_debug(patbuf, patptr)
- register char *patbuf, *patptr;
- {
- register char *lp;
-
- for ( lp=patbuf; lp<patptr; ++lp )
- {
- switch ( *lp )
- {
- case CHAR: DBUG_PRINT("re_match",("char ")); break;
- case BOL: DBUG_PRINT("re_match",("bol ")); break;
- case EOL: DBUG_PRINT("re_match",("eol ")); break;
- case ANY: DBUG_PRINT("re_match",("any ")); break;
- case CLASS: DBUG_PRINT("re_match",("class(%d) ",*++lp)); break;
- case NCLASS: DBUG_PRINT("re_match",("notclass(%d) ",*++lp)); break;
- case STAR: DBUG_PRINT("re_match",("star ")); break;
- case PLUS: DBUG_PRINT("re_match",("plus ")); break;
- case MINUS: DBUG_PRINT("re_match",("minus ")); break;
- case ALPHA: DBUG_PRINT("re_match",("alpha ")); break;
- case DIGIT: DBUG_PRINT("re_match",("digit ")); break;
- case NALPHA: DBUG_PRINT("re_match",("notalpha ")); break;
- case PUNCT: DBUG_PRINT("re_match",("punct ")); break;
- case RANGE: DBUG_PRINT("re_match",("range ")); break;
- case ENDPAT: DBUG_PRINT("re_match",("endpat ")); break;
- default: DBUG_PRINT("re_match",("<%c> ", *lp)); break;
- }
- }
- }
- #endif
-
- char *
- cclass( patbuf )
- register char *patbuf; /* destination pattern buffer */
- {
- /*
- * Compile a class (within [])
- */
- register char *patptr, /* destination pattern pointer */
- *cp; /* Pattern start */
- register int c, /* Current character */
- o; /* Temp */
-
- DBUG_ENTER("cclass");
- patptr = patbuf;
-
- if ( (c = getcharacter()) == -1 )
- error( "class terminates badly", RE_ERROR );
- else if ( c == '^')
- {
- /*
- * Class exclusion, for example: [^abc]
- * Swallow the "^" and set token type to class exclusion.
- */
- o = NCLASS;
- }
- else
- {
- /*
- * Normal class, for example: [abc]
- * push back the character and set token type to class
- */
- ungetcharacter( (char) c );
- o = CLASS;
- }
- *patptr++ = o;
-
- cp = patptr; /* remember where byte count is */
- *patptr++ = 0; /* and initialize byte count */
- while ( (c = getcharacter()) != -1 && c!=']' )
- {
- o = getcharacter(); /* peek at next char */
- if (c == '\\') /* Store quoted chars */
- {
- if ( o == -1) /* Gotta get something */
- error( "class terminates badly", RE_ERROR );
- *patptr++ = o;
- }
- else if ( c=='-' && (patptr-cp)>1 && o!=']' && o != -1 )
- {
- c = patptr[-1]; /* Range start */
- patptr[-1] = RANGE; /* Range signal */
- *patptr++ = c; /* Re-store start */
- *patptr++ = o; /* Store end char */
- }
- else
- {
- *patptr++ = c; /* Store normal char */
- ungetcharacter( (char) o );
- }
- }
- if (c != ']')
- error( "unterminated class", RE_ERROR );
- if ( (c = (patptr - cp)) >= 256 )
- error( "class too large", RE_ERROR );
- if ( c == 0 )
- error( "empty class", RE_ERROR );
- *cp = c; /* fill in byte count */
-
- DBUG_RETURN(patptr);
- }
-
- int match( line, pattern )
- char *line; /* line to match */
- register char *pattern; /* pattern to match */
- {
- /*
- * Match the current line (in Linebuf[]), return 1 if it does.
- */
- register char *l; /* Line pointer */
- char *pmatch();
- register char *next;
- register int matches;
-
- DBUG_ENTER("match");
- matches = 0;
- for (l = line; *l; l++)
- {
- if ( next = pmatch(line, l, pattern) )
- {
- l = next - 1;
- ++matches;
- DBUG_PRINT("match",("match found"));
- }
- }
-
- DBUG_RETURN(matches);
- }
-
- char *
- pmatch(linestart, line, pattern)
- char *linestart; /* start of line to match */
- char *line; /* (partial) line to match */
- char *pattern; /* (partial) pattern to match */
- {
- register char *l;/* Current line pointer */
- register char *p;/* Current pattern pointer */
- register char c; /* Current character */
- register char *e;/* End for STAR and PLUS match */
- register int op; /* Pattern operation */
- register int n; /* Class counter */
- char *are; /* Start of STAR match */
-
- DBUG_ENTER("pmatch");
- l = line;
-
- DBUG_PRINT("pmatch",("line: (%s)", line));
-
- p = pattern;
- while ((op = *p++) != ENDPAT) {
-
- DBUG_PRINT("pmatch",("byte[%d] = 0%o, '%c', op = 0%o",l-line, *l, *l, op));
-
- switch(op) {
-
- case CHAR:
- if ( *l++ != *p++)
- DBUG_RETURN(0);
- break;
-
- case BOL:
- if (l != linestart)
- DBUG_RETURN(0);
- break;
-
- case EOL:
- if (*l != '\0')
- DBUG_RETURN(0);
- break;
-
- case ANY:
- if (*l++ == '\0')
- DBUG_RETURN(0);
- break;
-
- case DIGIT:
- if ((c = *l++) < '0' || (c > '9'))
- DBUG_RETURN(0);
- break;
-
- case ALPHA:
- c = *l++;
- c = tolower( c );
- if (c < 'a' || c > 'z')
- DBUG_RETURN(0);
- break;
-
- case NALPHA:
- c = *l++;
- c = tolower( c );
- if (c >= 'a' && c <= 'z')
- break;
- else if (c < '0' || c > '9')
- DBUG_RETURN(0);
- break;
-
- case PUNCT:
- c = *l++;
- if (c == 0 || c > ' ')
- DBUG_RETURN(0);
- break;
-
- case CLASS:
- case NCLASS:
- c = *l++;
- n = *p++ & 0377;
- do {
- if (*p == RANGE) {
- p += 3;
- n -= 2;
- if (c >= p[-2] && c <= p[-1])
- break;
- }
- else if (c == *p++)
- break;
- } while (--n > 1);
- if ((op == CLASS) == (n <= 1))
- DBUG_RETURN(0);
- if (op == CLASS)
- p += n - 2;
- break;
-
- case MINUS:
- e = pmatch(linestart,l,p);/* Look for a match */
- while (*p++ != ENDPAT); /* Skip over pattern */
- if (e) /* Got a match? */
- l = e; /* Yes, update string */
- break; /* Always succeeds */
-
- case PLUS: /* One or more ... */
- if ((l = pmatch(linestart,l,p)) == 0)
- DBUG_RETURN(0); /* Gotta have a match */
- case STAR: /* Zero or more ... */
- are = l; /* Remember line start */
- while (*l && (e = pmatch(linestart,l,p)))
- l = e; /* Get longest match */
- while (*p++ != ENDPAT); /* Skip over pattern */
- while (l >= are) { /* Try to match rest */
- if (e = pmatch(linestart,l,p))
- DBUG_RETURN(e);
- --l; /* Nope, try earlier */
- }
- DBUG_RETURN(0); /* Nothing else worked */
-
- default:
- fprintf( stderr, "bad op code %d\n", op );
- error( "can't happen -- match", RE_ERROR );
- }
- }
- DBUG_RETURN(l);
- }
-
-