home *** CD-ROM | disk | FTP | other *** search
- #ifndef lint
- static char *sccsid="@(#)regexp.c 1.2 (LBL) 4/12/85";
- static char Version[] =
- "$Id: regexp.c,v 1.2 91/10/01 00:30:20 gvr Exp $";
- #endif
-
- /*
- * Regular expression matching routines for lgrind/tgrind/tfontedpr.
- *
- * These routines were written by Dave Presotto (I think) for vgrind.
- * Minor mods & attempts to improve performance by Van Jacobson
- * (van@@lbl-rtsg) and Chris Torek (chris@@maryland).
- *
- * Modifications.
- * --------------
- * Sep 91 George V Reilly Fixed up some bogus uses of @NIL@ and @NULL@.
- * 30 Mar 85 Van & Chris Changed @expmatch()@ to return pointer to
- * start of what was matched in addition to
- * pointer to match end. Several changes to
- * improve performance (too numerous to mention).
- * 11 Dec 84 Dave Presotto Written.
- */
- #include <stdio.h>
- #include <ctype.h>
-
- #ifdef __STDC__
- #include <stdlib.h>
- #else
- extern void srand();
- extern int rand();
- extern void *malloc();
- extern void *realloc();
- extern void free();
- #endif
-
-
- typedef int boolean;
- #define TRUE 1
- #define FALSE 0
-
- #define makelower(c) (isupper((c)) ? tolower((c)) : (c))
-
- void expconv(); /* forward declaration */
-
- int (*re_strncmp)(); /* function used by @expmatch()@ to compare
- * strings. The caller should make it point to
- * @strncmp()@ if case is significant &
- * @lc_strncmp()@ otherwise.
- */
-
-
- /* @lc_strncmp()@ --- like @strncmp()@ except that we convert the
- * first string to lower case before comparing.
- */
- int
- lc_strncmp(s1, s2, len)
- register char *s1,*s2;
- register int len;
- {
- while (len-- > 0)
- if (*s2 - makelower(*s1))
- return 1;
- else
- s2++, s1++;
-
- return 0;
- }
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- /* The following routine converts an irregular expression to
- * internal format.
- *
- * Either meta symbols (%|\a|%, %|\d|%, or %|\p|%) or character strings
- * or operations (alternation or parenthesizing) can be specified.
- * Each starts with a descriptor byte. The descriptor byte
- * has @STR@ set for strings, @META@ set for meta symbols,
- * and @OPER@ set for operations. The descriptor byte can also have
- * the @OPT@ bit set if the object defined is optional. Also @ALT@
- * can be set to indicate an alternation.
- *
- * For metasymbols, the byte following the descriptor byte identities
- * the meta symbol (containing an ASCII @'a'@, @'d'@, @'p'@, @'|'@,
- * or @'('@). For strings, the byte after the descriptor is a
- * character count for the string:
- *@
- * meta symbols := descriptor
- * symbol
- *
- * strings := descriptor
- * character count
- * the string
- *
- * operations := descriptor
- * symbol
- * character count@
- */
-
-
- /*
- * handy macros for accessing parts of match blocks
- */
- #define MSYM(A) (*(A+1)) /* symbol in a meta symbol block */
- #define MNEXT(A) (A+2) /* character following a metasymbol block */
-
- #define OSYM(A) (*(A+1)) /* symbol in an operation block */
- #define OCNT(A) (*(A+2)) /* character count */
- #define ONEXT(A) (A+3) /* next character after the operation */
- #define OPTR(A) (A+*(A+2)) /* place pointed to by the operator */
-
- #define SCNT(A) (*(A+1)) /* byte count of a string */
- #define SSTR(A) (A+2) /* address of the string */
- #define SNEXT(A) (A+2+*(A+1)) /* character following the string */
-
-
- /*
- * bit flags in the descriptor
- */
- #define OPT 1
- #define STR 2
- #define META 4
- #define ALT 8
- #define OPER 16
-
- char *ure; /* pointer current position in unconverted exp */
- char *ccre; /* pointer to current position in converted exp */
-
-
-
- /*
- * Convert an irregular expression to the internal form
- */
- char *
- convexp(re)
- char *re; /* unconverted irregular expression */
- {
- register char *cre; /* pointer to converted regular expression */
-
- /* allocate room for the converted expression */
- if (re == NULL)
- return NULL;
- if (*re == '\0')
- return NULL;
- cre = malloc(4 * strlen(re) + 3);
- ccre = cre;
- ure = re;
-
- /* start the conversion with a %|\a|% */
- *cre = META | OPT;
- MSYM(cre) = 'a';
- ccre = MNEXT(cre);
-
- /* start the conversion (it's recursive) */
- expconv();
- *ccre = 0;
- return cre;
- }
-
-
- /*
- * Actually do the conversion
- */
- static void
- expconv()
- {
- register char *cs; /* pointer to current symbol in converted exp */
- register char c; /* character being processed */
- register char *acs; /* pointer to last alternate */
- register int temp;
-
- /* let the conversion begin */
- acs = NULL;
- cs = NULL;
- while (*ure != '\0') {
- switch (c = *ure++) {
-
- case '\\':
- switch (c = *ure++) {
-
- /* escaped characters are just characters */
- default:
- if (cs == NULL || (*cs & STR) == 0) {
- cs = ccre;
- *cs = STR;
- SCNT(cs) = 1;
- ccre += 2;
- } else
- SCNT(cs)++;
- *ccre++ = c;
- break;
-
- /* normal(?) metacharacters */
- case 'a':
- case 'd':
- case 'e':
- case 'p':
- if (acs != NULL && acs != cs) {
- do {
- temp = OCNT(acs);
- OCNT(acs) = ccre - acs;
- acs -= temp;
- } while (temp != 0);
- acs = NULL;
- }
- cs = ccre;
- *cs = META;
- MSYM(cs) = c;
- ccre = MNEXT(cs);
- break;
- }
- break;
-
- /* just put the symbol in */
- case '^':
- case '$':
- if (acs != NULL && acs != cs) {
- do {
- temp = OCNT(acs);
- OCNT(acs) = ccre - acs;
- acs -= temp;
- } while (temp != 0);
- acs = NULL;
- }
- cs = ccre;
- *cs = META;
- MSYM(cs) = c;
- ccre = MNEXT(cs);
- break;
-
- /* mark the last match sequence as optional */
- case '?':
- if (cs)
- *cs = *cs | OPT;
- break;
-
- /* recurse and define a subexpression */
- case '(':
- if (acs != NULL && acs != cs) {
- do {
- temp = OCNT(acs);
- OCNT(acs) = ccre - acs;
- acs -= temp;
- } while (temp != 0);
- acs = NULL;
- }
- cs = ccre;
- *cs = OPER;
- OSYM(cs) = '(';
- ccre = ONEXT(cs);
- expconv();
- OCNT(cs) = ccre - cs; /* offset to next symbol */
- break;
-
- /* return from a recursion */
- case ')':
- if (acs != NULL) {
- do {
- temp = OCNT(acs);
- OCNT(acs) = ccre - acs;
- acs -= temp;
- } while (temp != 0);
- acs = NULL;
- }
- cs = ccre;
- *cs = META;
- MSYM(cs) = c;
- ccre = MNEXT(cs);
- return;
-
- /* Mark the last match sequence as having an alternate. */
- /* The third byte will contain an offset to jump over the */
- /* alternate match in case the first did not fail */
- case '|':
- if (acs != NULL && acs != cs)
- OCNT(ccre) = ccre - acs; /* make a back pointer */
- else
- OCNT(ccre) = 0;
- *cs |= ALT;
- cs = ccre;
- *cs = OPER;
- OSYM(cs) = '|';
- ccre = ONEXT(cs);
- acs = cs; /* remember that the pointer is to be filled */
- break;
-
- /* if it's not a metasymbol, just build a character string */
- default:
- if (cs == NULL || (*cs & STR) == 0) {
- cs = ccre;
- *cs = STR;
- SCNT(cs) = 1;
- ccre = SSTR(cs);
- } else
- SCNT(cs)++;
- *ccre++ = c;
- break;
- }
- }
- if (acs != NULL) {
- do {
- temp = OCNT(acs);
- OCNT(acs) = ccre - acs;
- acs -= temp;
- } while (temp != 0);
- acs = NULL;
- }
- return;
- } /* end of @expconv()@ */
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- /*
- * The following routine recognises an irregular expression
- * with the following special characters:
- *
- * %|\?|% means last match was optional
- * %|\a|% matches any number of characters
- * %|\d|% matches any number of spaces and tabs
- * %|\p|% matches any number of alphanumeric characters.
- * The characters matched will be copied into
- * the area pointed to by @mstring@.
- * %|\||% alternation
- * %|\( \)|% grouping used mostly for alternation and
- * optionality
- *
- * The irregular expression must be translated to internal form
- * prior to calling this routine
- *
- * The value returned is the pointer to the last character matched.
- * If @strtptr@ is non-@NULL@, a pointer to the start of the string matched
- * (excluding %|\a|% matches) will be returned at @*strtptr@.
- * If @mstring@ is non-@NULL@, the string matched by a %|\p|% will be copied
- * into @mstring@.
- */
-
- boolean _escaped; /* true if we are currently @_escaped@ */
- char *_start; /* start of string. Set in @putScp()@. */
-
-
-
- char *
- expmatch(s, re, strtptr, mstring)
- register char *s; /* string to check for a match in */
- register char *re; /* a converted irregular expression */
- char **strtptr; /* where to put ptr to start of match */
- char *mstring; /* where to put whatever matches a %|\p|% */
- {
- register char *cs; /* the current symbol */
- register char *ptr, *s1; /* temporary pointer */
- register char c; /* temporary */
- boolean matched; /* a temporary @boolean@ */
-
- /* initial conditions */
- if (strtptr)
- *strtptr = '\0';
- if (re == NULL)
- return NULL;
- cs = re;
- matched = FALSE;
-
- /* loop till expression string is exhausted (or at least pretty tired) */
- while (*cs) {
- switch (*cs & (OPER | STR | META)) {
-
- /* try to match a string */
- case STR:
- matched = !((*re_strncmp)(s, SSTR(cs), SCNT(cs)));
- if (matched) {
-
- /* hoorah: it matches */
- s += SCNT(cs);
- cs = SNEXT(cs);
- } else if (*cs & ALT) {
-
- /* alternation: skip to next expression */
- cs = SNEXT(cs);
- } else if (*cs & OPT) {
-
- /* the match is optional */
- cs = SNEXT(cs);
- matched = 1; /* indicate a successful match */
- } else {
-
- /* no match: error return */
- return NULL;
- }
- break;
-
- /* an operator: do something fancy */
- case OPER:
- switch (OSYM(cs)) {
-
- /* this is an alternation */
- case '|':
- if (matched)
-
- /* last thing in the alternation was a match: skip ahead */
- cs = OPTR(cs);
- else
-
- /* no match: keep trying */
- cs = ONEXT(cs);
- break;
-
- /* this is a grouping: recurse */
- case '(':
- ptr = expmatch(s, ONEXT(cs), strtptr, mstring);
- if (ptr != NULL) {
-
- /* the subexpression matched */
- matched = 1;
- s = ptr;
- } else if (*cs & ALT) {
-
- /* alternation: skip to next expression */
- matched = 0;
- } else if (*cs & OPT) {
-
- /* the match is optional */
- matched = 1; /* indicate a successful match */
- } else {
-
- /* no match: error return */
- return NULL;
- }
- cs = OPTR(cs);
- break;
- }
- break;
-
- /* try to match a metasymbol */
- case META:
- switch (MSYM(cs)) {
-
- /* try to match anything and remember what was matched */
- case 'p':
- /*
- * This is really the same as trying the match the
- * remaining parts of the expression to any subset
- * of the string.
- */
- s1 = s;
- do {
- ptr = expmatch(s1, MNEXT(cs), strtptr, mstring);
- if (ptr != NULL && s1 != s) {
-
- /* we have a match: remember the match */
- if (mstring) {
- strncpy(mstring, s, s1 - s);
- mstring[s1 - s] = '\0';
- }
- return ptr;
-
- } else if (ptr != NULL && (*cs & OPT)) {
-
- /* %|\p|% was optional, so no match is ok */
- return ptr;
-
- } else if (ptr != NULL) {
-
- /* not optional and we still matched */
- return NULL;
- }
- if (!isalnum(*s1) && *s1 != '_')
- return NULL;
- if (*s1 == '\\')
- _escaped = _escaped ? FALSE : TRUE;
- else
- _escaped = FALSE;
- } while (*s1++);
- return NULL;
-
- /* try to match anything */
- case 'a':
- /*
- * This is really the same as trying the match the
- * remaining parts of the expression to any subset
- * of the string.
- */
- s1 = s;
- do {
- /*
- * Hack for an important special case: if the next thing
- * in the pattern is a string, just gobble characters until
- * we find something that matches that string (this saves
- * the cost of a recursive call on @expmatch()@ while scanning
- * for the start of comments or strings). Since many
- * patterns end with a string, we also treat that as a
- * special case.
- */
- if(*(ptr = MNEXT(cs)) == STR) {
- c = *SSTR(ptr);
- while(*s1 && *s1 != c)
- s1++;
-
- if (*s1 == 0)
- return NULL;
-
- if (SNEXT(ptr) == 0 && (s1 != s || *cs & OPT)) {
- /* next item is a string, it's the last item and
- * the %|\a|% match is ok --- just loop to try & match
- * the string.
- */
- if (strtptr)
- *strtptr = s1;
-
- cs = ptr;
- s = s1;
- break;
- }
- }
- ptr = expmatch(s1, MNEXT(cs), strtptr, mstring);
- if (ptr != NULL && (s1 != s || *cs & OPT)) {
-
- /* we have a match */
- if (strtptr)
- *strtptr = s1;
-
- return ptr;
-
- } else if (ptr != NULL) {
-
- /* not optional and we still matched */
- return NULL;
- }
- if (*s1 == '\\')
- _escaped = _escaped ? FALSE : TRUE;
- else
- _escaped = FALSE;
- } while (*s1++);
- return NULL;
-
- /* fail if we are currently @_escaped@ */
- case 'e':
- if (_escaped)
- return NULL;
- cs = MNEXT(cs);
- break;
-
- /* match any number of tabs and spaces */
- case 'd':
- ptr = s;
- while (*s == ' ' || *s == '\t')
- s++;
-
- if (s != ptr || s == _start) {
-
- /* match: be happy */
- matched = 1;
- cs = MNEXT(cs);
- } else if (*s == '\n' || *s == '\0') {
-
- /* match: be happy */
- matched = 1;
- cs = MNEXT(cs);
- } else if (*cs & ALT) {
-
- /* try the next part */
- matched = 0;
- cs = MNEXT(cs);
- } else if (*cs & OPT) {
-
- /* doesn't matter */
- matched = 1;
- cs = MNEXT(cs);
- } else
-
- /* no match: error return */
- return NULL;
-
- break;
-
- /* check for end of line */
- case '$':
- if (*s == '\0' || *s == '\n') {
-
- /* match: be happy */
- s++;
- matched = 1;
- cs = MNEXT(cs);
- } else if (*cs & ALT) {
-
- /* try the next part */
- matched = 0;
- cs = MNEXT(cs);
- } else if (*cs & OPT) {
-
- /* doesn't matter */
- matched = 1;
- cs = MNEXT(cs);
- } else
-
- /* no match: error return */
- return NULL;
- break;
-
- /* check for start of line */
- case '^':
- if (s == _start) {
-
- /* match: be happy */
- matched = 1;
- cs = MNEXT(cs);
- } else if (*cs & ALT) {
-
- /* try the next part */
- matched = 0;
- cs = MNEXT(cs);
- } else if (*cs & OPT) {
-
- /* doesn't matter */
- matched = 1;
- cs = MNEXT(cs);
- } else
-
- /* no match: error return */
- return NULL;
- break;
-
- /* end of a subexpression: return success */
- case ')':
- return s;
- }
- break;
- }
- }
- return s;
- }
-