home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
OS/2 Shareware BBS: 5 Edit
/
05-Edit.zip
/
me34src.zip
/
me3
/
util
/
regex.c
< prev
next >
Wrap
C/C++ Source or Header
|
1995-01-14
|
22KB
|
695 lines
/*
* regex - Regular expression pattern matching
* and replacement
*
* By: Ozan S. Yigit (oz), Dept. of Computer Science, York University
* Mods: C Durland
*
* These routines are the PUBLIC DOMAIN equivalents of regex routines as
* found in 4.nBSD UN*X, with minor extensions.
*
* These routines are derived from various implementations found in software
* tools books, and Conroy's grep. They are NOT derived from
* licensed/restricted software. For more interesting/academic/complicated
* implementations, see Henry Spencer's regexp routines, or GNU Emacs
* pattern matching module.
*
* dfa = deterministic finite automata
* Routines:
* re_comp: compile a regular expression into a DFA.
* char *re_comp(s)
* char *s;
* returns: NULL if OK, else error string
* If s is NULL or 0 length, last compiled pattern is used.
* re_exec: execute the DFA to match a pattern.
* int re_exec(s)
* char *s;
* re_subs: substitute the matched portions in a new string.
* int re_subs(src, dst)
* char *src;
* char *dst;
* re_fail: failure routine for re_exec.
* void re_fail(msg, op)
* char *msg;
* char op;
*
* Regular Expressions:
*
* [1] char matches itself, unless it is a special
* character (metachar): . \ [ ] * + ^ $
*
* [2] . matches any character.
*
* [3] \ matches the character following it, except
* when followed by one of: ()123456789<> adnwW
* (see [7], [8] and [9])
* It is used as an escape character for all other
* meta-characters, and itself. When used in a set
* ([4]), it is treated as an ordinary character.
*
* [4] [set] matches one of the characters in the set.
* If the first character in the set is "^",
* it matches a character NOT in the set. A
* shorthand S-E is used to specify a set of
* characters S upto E, inclusive. The special
* characters "]" and "-" have no special
* meaning if they appear as the first chars
* in the set.
* Example: Matches:
* -------- -------
* [a-z] Any lowercase alpha.
*
* [^]-] Any char except ] and -.
*
* [^A-Z] Any char except uppercase alpha.
*
* [a-zA-Z] Any alpha.
*
* [a-b-c] == [a-bb-c] == [a-c]
* [a-a] == [a]
* [-abc] == [abc-] Match -, a, b, c.
*
* []] == ] Match "]"
* [-]] Match only "-]". This is a set ([-])
* and a character (]).
*
* [z-a] Nothing and is an error.
*
* [5] * any regular expression form [1] to [4], followed by
* closure char (*) matches zero or more matches of
* that form.
*
* [6] + same as [5], except it matches one or more.
*
* [7] a regular expression in the form [1] to [10], enclosed
* as \(form\) matches what form matches. The enclosure
* creates a set of tags, used for [8] and for
* pattern substution. The tagged forms are numbered
* starting from 1.
*
* [8] a \ followed by a digit 1 to 9 matches whatever a
* previously tagged regular expression ([7]) matched.
*
* [9] \< a regular expression starting with a \< construct
* \> and/or ending with a \> construct, restricts the
* pattern matching to the beginning of a word, and/or
* the end of a word. A word is defined to be a character
* string beginning and/or ending with the characters
* A-Z a-z 0-9 and _. It must also be preceded and/or
* followed by any character outside those mentioned.
*
* [10] a composite regular expression xy where x and y
* are in the form [1] to [10] matches the longest
* match of x followed by a match for y.
*
* [11] ^ a regular expression starting with a ^ character
* $ and/or ending with a $ character, restricts the
* pattern matching to the beginning of the line,
* or the end of line. [anchors] Elsewhere in the
* pattern, ^ and $ are treated as ordinary characters.
*
* Acknowledgements:
* HCR's Hugh Redelmeier has been most helpful in various stages of
* development. He convinced me to include BOW and EOW constructs,
* originally invented by Rob Pike at the University of Toronto.
* References:
* Software tools Kernighan & Plauger
* Software tools in Pascal Kernighan & Plauger
* Grep [rsx-11 C dist] David Conroy
* ed - text editor Un*x Programmer's Manual
* Advanced editing on Un*x B. W. Kernighan
* RegExp routines Henry Spencer
* Notes:
* This implementation uses a bit-set representation for character sets for
* speed and compactness. Each character is represented by one bit in a
* N-bit block. Thus, SET or NSET always takes a constant M bytes in the
* internal dfa, and re_exec does a single bit comparison to locate the
* character in the set. N is 128 for 7 bits ASCII and 256 for 8 bit
* data. Thus M is 16 or 32 bytes.
* Put CLO in front of what gets closed for ease of interpreting.
* Put END at end of what gets closed to limit recursion.
* Examples:
* pattern: foo*.*
* compile: CHR f CHR o CLO CHR o END CLO ANY END END
* matches: fo foo fooo foobar fobar foxx ...
*
* pattern: fo[ob]a[rz]
* compile: CHR f CHR o SET bitset CHR a SET bitset END
* matches: fobar fooar fobaz fooaz
*
* pattern: foo\\+
* compile: CHR f CHR o CHR o CHR \ CLO CHR \ END END
* matches: foo\ foo\\ foo\\\ ...
*
* pattern: \(foo\)[1-3]\1 (same as foo[1-3]foo)
* compile: BOT 1 CHR f CHR o CHR o EOT 1 SET bitset REF 1 END
* matches: foo1foo foo2foo foo3foo
*
* pattern: \(fo.*\)-\1
* compile: BOT 1 CHR f CHR o CLO ANY END EOT 1 CHR - REF 1 END
* matches: foo-foo fo-fo fob-fob foobar-foobar ...
*/
#include <char.h>
#include <const.h>
typedef unsigned char Char;
#define MAXDFA 768 /* amount of space for compiled RE */
#define MAXTAG 10
#define CHR 1 /* character :: CHR<character> */
#define ANY 2 /* . :: ANY */
#define SET 3 /* set: [...] :: SET bitset */
#define NSET 4 /* not set: [^...] :: SET bitset */
#define BOL 5 /* beginning of line: ^ :: BOL */
#define EOL 6 /* end of line: $ :: EOL */
#define BOT 7 /* beginning of tag: \( */
#define EOT 8 /* end of tag: \) */
#define BOW 9 /* beginning of word: \< */
#define EOW 10 /* end of word: \> */
#define REF 11 /* tag reference: \1,...,\9 */
#define CLO 12 /* closure: +, * :: CLO dfa END */
#define SPACE 13 /* ": ": match isspace() */
#define ALPHA 14 /* :a match isalpha() */
#define DIGIT 15 /* :d match isdigit() */
#define ALNUM 16 /* :n match isalnum() */
#define WORD 17 /* :w match isword() */
#define NWORD 18 /* :W match !isword() */
#define END 0
/* ******************************************************************** */
/* **************************** Bit Tables **************************** */
/* ******************************************************************** */
/*
* Bit table: a string of bits stored in an array of char
* .-----------------------------------.
* |01234567|89012345|67890123|45678901|
* `-----------------------------------'
* bits[0] [1] [2] [3]
* To find what bucket the nth bit is in (8 bits per bucket):
* bit_bucket(n) = bits[n/8]
* It might be a good idea to restrict n so it doesn't index outside its
* range (only works if number of bits is a power of 2):
* n = n & ((max_n - 1) & ~7) where max_n is a power of 2
* The ~7 is just to get rid of the lower bits that won't do anything
* anyway.
* The nth bit in the bucket is n mod 8 (ie the lower 3 bits of n (0-7) are
* the bit position):
* bit_flag(n) = 1 << (n & 7)
* To find the state of the nth bit (0 == off and !0 == on):
* bit_bucket(n) & bit_flag(n)
* To set the nth bit:
* bits[bit_bucket(n)] |= bit_flag(n)
* Notes:
* The bits are stored in a character array so that the array can be
* easily copied without worrying about alignment (ie can use a loop as
* well as memcpy()).
* This is based on two's complement math.
*/
/* The following defines are for character set size. 128 for straight
* ASCII, 256 for Euro ASCII (8 bit characters).
*/
#define MAXCHR 256 /* 128 or 256 */
#define BLKIND 0xf8 /* 0x78 or 0xf8 */
/*
* The following defines are not meant to be changeable.
* They are for readability only.
*/
#define CHRBIT 8
#define BITBLK MAXCHR/CHRBIT
#define BITIND 0x7
static Char bittab[BITBLK]; /* bit table for SET */
/* Add or check to see if a character is in the bit table (character
* set).
* Note:
* When calling these routines, make sure c is an unsigned char (or
* int) so if it has the high bit set, casting it to an int won't
* make it a large negative number.
*/
#define ISINSET(bittab,c) ((bittab)[((c) & BLKIND)>>3] & (1<<((c) & BITIND)))
#define CHSET(bittab,c) (bittab)[((c) & BLKIND)>>3] |= 1<<((c) & BITIND)
static void chset(c) register int c; { CHSET(bittab,c); }
/* ******************************************************************** */
#define SLOP 50 /* dfa overflow protection */
static Char dfa[MAXDFA + SLOP]; /* automaton */
static int
tagstk[MAXTAG], /* subpat tag stack */
pattern_compiled = FALSE; /* status of lastpat */
#define BADPAT(msg) return (*dfa = END, (Char *)msg)
#define STORE(x) (*mp++ = x) /* !!! should check for dfa overflow */
/* Compile RE to internal format & store in dfa[]
* Input:
* pat : pointer to regular expression string to compile.
* Returns:
* NULL: RE compiled OK.
* pointer to error message.
*/
Char *re_comp(pat) Char *pat;
{
register Char
*p, /* pattern pointer */
*mp = dfa, /* dfa pointer */
*lp, /* saved pointer */
*sp = dfa; /* another one */
register int
tagi = 0, /* tag stack index */
tagc = 1, /* actual tag count */
n;
if (pat == NULL || *pat == '\0')
if (pattern_compiled) return NULL;
else BADPAT("No previous regular expression");
pattern_compiled = FALSE;
*dfa = END; /* init dfa for lp and sp */
for (p = pat; *p; p++)
{
lp = mp; /* start of next dfa state */
switch(*p)
{
case '.': STORE(ANY); break; /* match any character */
case '^': /* match beginning of line */
if (p == pat) STORE(BOL);
else { STORE(CHR); STORE(*p); } /* match ^ */
break;
case '$': /* match end of line */
if (*(p+1) == '\0') STORE(EOL);
else { STORE(CHR); STORE(*p); } /* match $ */
break;
case '[': /* match a set of characters */
if (*++p == '^') { STORE(NSET); p++; } else STORE(SET);
if (*p == ']') chset(*p++); /* real bracket, match ] */
if (*p == '-') chset(*p++); /* real dash, match - */
while (*p && *p != ']')
{
if (*p == '-' && *(p+1) != '\0' && *(p+1) != ']') /* [a-z] */
{
int c1, c2;
p++;
c1 = *(p-2); /* 'a' */
c2 = *p++; /* 'z' */
if (c1 > c2) BADPAT("Empty set"); /* something like [z-a] */
/* remember that 'a' has already been put into bittab */
while (++c1 <= c2) chset(c1); /* build bit table */
}
#ifdef EXTEND
else if (*p == '\\' && *(p+1)) { p++; chset(*p++); }
#endif
else chset(*p++);
}
if (*p == '\0') BADPAT("Missing ]");
for (n = 0; n < BITBLK; bittab[n++] = '\0') STORE(bittab[n]);
break;
case '*': /* match 0 or more of preceding RE */
case '+': /* match 1 or more. Note: x+ == xx* */
if (p == pat) BADPAT("Empty closure");
lp = sp; /* previous opcode */
if (*lp == CLO) break; /* equivalence: x** == x* */
switch(*lp)
{
case BOL: case BOT: case EOT: case BOW: case EOW: case REF:
BADPAT("Illegal closure");
}
if (*p == '+') for (sp = mp; lp < sp; lp++) STORE(*lp);
STORE(END); STORE(END); sp = mp;
while (--mp > lp) *mp = mp[-1]; STORE(CLO); /* open hole for CLO */
mp = sp;
break;
case '\\': /* tags, backrefs */
switch(*++p)
{
case '\0': BADPAT("Bad quote");
case '(':
if (tagc < MAXTAG)
{ tagstk[++tagi] = tagc; STORE(BOT); STORE(tagc++); }
else BADPAT("Too many \\(\\) pairs");
break;
case ')':
if (*sp == BOT) BADPAT("Null pattern inside \\(\\)");
if (tagi > 0) { STORE(EOT); STORE(tagstk[tagi--]); }
else BADPAT("Unmatched \\)");
break;
case '<': STORE(BOW); break;
case '>':
if (*sp == BOW) BADPAT("Null pattern inside \\<\\>");
STORE(EOW);
break;
case '1': case '2': case '3': case '4': case '5': case '6':
case '7': case '8': case '9':
n = *p - '0';
if (tagi > 0 && tagstk[tagi] == n) BADPAT("Cyclical reference");
if (tagc > n) { STORE(REF); STORE(n); }
else BADPAT("Undetermined reference");
break;
case ' ': STORE(SPACE); break;
case 'a': STORE(ALPHA); break;
case 'd': STORE(DIGIT); break;
case 'n': STORE(ALNUM); break;
case 'w': STORE(WORD); break;
case 'W': STORE(NWORD); break;
#ifdef EXTEND
case 'b': STORE(CHR); STORE('\b'); break;
case 'n': STORE(CHR); STORE('\n'); break;
case 'f': STORE(CHR); STORE('\f'); break;
case 'r': STORE(CHR); STORE('\r'); break;
case 't': STORE(CHR); STORE('\t'); break;
#endif
default: STORE(CHR); STORE(*p);
}
break;
default : STORE(CHR); STORE(*p); break; /* an ordinary character */
}
sp = lp; /* start of previous state */
if (mp > dfa + MAXDFA) BADPAT("Pattern too long (braindead re_comp())");
}
if (tagi > 0) BADPAT("Unmatched \\(");
STORE(END);
pattern_compiled = TRUE;
return NULL;
}
static Char *pmatch();
Char *bopat[MAXTAG], *eopat[MAXTAG];
int re_errorcode; /* sleaze */
static Char *bol;
/* re_exec: execute dfa to find a match.
*
* special cases: (dfa[0])
* BOL
* Match only once, starting from the beginning.
* CHR
* First locate the character without calling pmatch(), and if found,
* call pmatch() for the remaining string.
* END
* re_comp() failed, poor luser did not check for it. Fail fast.
*
* If a match is found, bopat[0] and eopat[0] are set to the beginning and
* the end of the matched fragment, respectively.
*
* Input:
* lp: string to search
* SoL==TRUE if lp starts line
* move==TRUE if search the entire string for match
* Returns:
* TRUE:
* FALSE:
* ????
* Munges:
* re_errorcode: Set to FALSE, re_fail() might want to set it.
* Notes:
* If SoL is FALSE, lp[-1] MUST be at valid! A couple of REs will look
* there if they can.
*/
int re_exec(lp,SoL,move) register Char *lp; int SoL, move;
{
register Char *ap = dfa, c;
Char *ep = NULL;
re_errorcode = FALSE; /* assume no match */
bol = (SoL ? lp : NULL);
bopat[0] = bopat[1] = bopat[2] = bopat[3] = bopat[4] = bopat[5] =
bopat[6] = bopat[7] = bopat[8] = bopat[9] = NULL;
switch(*ap)
{
case END: return FALSE; /* munged automaton. fail always */
case BOL: /* anchored: match from BOL only */
if (!SoL) return FALSE; /* BoL can only be at front of dfa */
ep = pmatch(lp,++ap);
break;
case CHR: /* ordinary char: locate it fast */
if (move)
{
c = *(ap+1);
while (*lp && !ceq(*lp,c)) lp++;
if (!*lp) return FALSE; /* if EoS, fail. else fall thru */
}
default: /* regular matching all the way. */
if (!move) { ep = pmatch(lp,ap); break; }
#if 0
while (*lp)
{
if ((ep = pmatch(lp,ap))) break;
lp++;
}
#else
while (TRUE)
{
if ((ep = pmatch(lp,ap))) break;
if ('\0' == *lp++) break;
}
#endif
break;
}
if (!ep) return re_errorcode; /* only if pmatch() returns NULL */
bopat[0] = lp; eopat[0] = ep;
return TRUE;
}
/*
* pmatch: internal routine for the hard part
*
* This code is mostly snarfed from an early grep written by David Conroy.
* The backref and tag stuff, and various other mods are by oZ.
*
* special cases: (dfa[n], dfa[n+1])
* CLO ANY
* We KNOW ".*" will match ANYTHING upto the end of line. Thus, go to
* the end of line straight, without calling pmatch() recursively. As in
* the other closure cases, the remaining pattern must be matched by
* moving backwards on the string recursively, to find a match for xy (x
* is ".*" and y is the remaining pattern) where the match satisfies the
* LONGEST match for x followed by a match for y.
* CLO CHR
* Scan forward matching the single char without recursion, and at the
* point of failure, we execute the remaining dfa recursively, as
* described above.
*
* At the end of a successful match, bopat[n] and eopat[n] are set to the
* beginning and end of subpatterns matched by tagged expressions (n = 1
* to 9).
*
* Input:
* Returns:
* NULL: No match, re_fail() may have been called.
* else: pointer to end of match.
*/
extern void re_fail();
/* skip values for CLO XXX to skip past the closure */
#define ANYSKIP 2 /* CLO ANY END ... */
#define CHRSKIP 3 /* CLO CHR chr END ... */
#define SETSKIP (2 +BITBLK) /* CLO SET 16bytes END ... */
static Char *pmatch(lp,dfa) register Char *lp, *dfa;
{
register Char
*e, /* extra pointer for CLO */
*bp, *ep; /* beginning and ending of subpat */
Char *are; /* to save the line ptr */
register int op, c, n;
while ((op = *dfa++) != END)
switch(op)
{
case CHR: if (!ceq(*lp++,*dfa++)) return NULL; break;
case ANY: if (*lp++ == '\0') return NULL; break;
case SET:
c = *lp++;
if (!ISINSET(dfa,c)) return NULL; /* ISINSET(dfa,0) is FALSE */
dfa += BITBLK;
break;
case NSET:
if ((c = *lp++) == '\0' || ISINSET(dfa,c)) return NULL;
dfa += BITBLK;
break;
case BOT: bopat[*dfa++] = lp; break;
case EOT: eopat[*dfa++] = lp; break;
case EOL: if (*lp != '\0') return NULL; break;
case ALNUM: if (!isalnum(*lp++)) return NULL; break;
case ALPHA: if (!isalpha(*lp++)) return NULL; break;
case DIGIT: if (!isdigit(*lp++)) return NULL; break;
case SPACE: if (!isspace(*lp++)) return NULL; break;
case WORD: if (!isword (*lp++)) return NULL; break;
case NWORD: if (*lp == '\0' || isword(*lp++)) return NULL; break;
case BOW:
if (!(lp != bol && isword(lp[-1])) && isword(*lp)) break;
return NULL;
case EOW: /* 'w\0' is OK here */
if ((lp != bol && isword(lp[-1])) && !isword(*lp)) break;
return NULL;
case REF: /* !!! case_fold? */
n = *dfa++; bp = bopat[n]; ep = eopat[n];
while (bp < ep) if (*bp++ != *lp++) return NULL; /* !!! recurse? */
break;
case CLO:
are = lp; n = ANYSKIP;
switch(*dfa)
{
case ANY: while (*lp) lp++; break;
case ALNUM: while (isalnum(*lp)) lp++; break;
case ALPHA: while (isalpha(*lp)) lp++; break;
case DIGIT: while (isdigit(*lp)) lp++; break;
case SPACE: while (isspace(*lp)) lp++; break;
case WORD: while (isword (*lp)) lp++; break;
case NWORD: while (*lp && !isword(*lp)) lp++; break;
case CHR:
c = *(dfa+1); /* we know c!='\0' */
while (ceq(*lp,c)) lp++;
n = CHRSKIP;
break;
case SET: case NSET:
while (*lp && (e = pmatch(lp,dfa))) lp = e;
n = SETSKIP;
break;
default: re_fail("closure: bad dfa.",*dfa); return NULL;
}
dfa += n;
while (lp >= are) /* backup up till match next pattern */
{
if (e = pmatch(lp,dfa)) return e;
--lp;
}
return NULL;
default: re_fail("re_exec: bad dfa.",op); return NULL;
}
return lp;
}
/*
* re_subs: substitute the matched portions of the src in dst.
* & substitute the entire matched pattern.
* \digit substitute a subpattern, with the given
* tag number. Tags are numbered from 1 to
* 9. If the particular tagged subpattern
* does not exist, null is substituted.
* !!!Note: if the line that was used re_exec() has gone byebye
* then \digit will blow cookies since the tags point into the line.
* Input:
* src:
* dst:
* Returns:
* TRUE: everything went as expected
* FALSE: Bad src or no match.
*/
int re_subs(src,dst) register Char *src, *dst;
{
register Char c, *bp, *ep;
register int pin;
if (!bopat[0]) return FALSE;
while (c = *src++)
{
switch(c)
{
case '&': pin = 0; break;
case '\\':
c = *src++;
if (c >= '0' && c <= '9') { pin = c - '0'; break; }
default: *dst++ = c; continue;
}
if ((bp = bopat[pin]) && (ep = eopat[pin]))
{
while (*bp && bp < ep) *dst++ = *bp++;
if (bp < ep) return FALSE;
}
}
*dst = '\0';
return TRUE;
}
/* ******************************************************************** */
/* ************************* DEBUG ************************************ */
/* ******************************************************************** */
#ifdef DEBUG
/*
* symbolic - produce a symbolic dump of the dfa
*/
symbolic(s)
char *s;
{
printf("pattern: %s\n", s);
printf("dfacode:\n");
dfadump(dfa);
}
static
dfadump(dfa) Char *dfa;
{
register int n;
while (*dfa != END)
switch(*dfa++)
{
case CLO:
printf("CLOSURE");
dfadump(dfa);
switch(*dfa)
{
case CHR: n = CHRSKIP; break;
case ANY: n = ANYSKIP; break;
case SET: case NSET: n = SETSKIP; break;
}
dfa += n;
break;
case CHR: printf("\tCHR %c\n",*dfa++); break;
case ANY: printf("\tANY .\n"); break;
case BOL: printf("\tBOL -\n"); break;
case EOL: printf("\tEOL -\n"); break;
case BOT: printf("BOT: %d\n",*dfa++); break;
case EOT: printf("EOT: %d\n",*dfa++); break;
case BOW: printf("BOW\n"); break;
case EOW: printf("EOW\n"); break;
case REF: printf("REF: %d\n",*dfa++); break;
case SET:
printf("\tSET [");
for (n = 0; n < MAXCHR; n++)
if (ISINSET(dfa,n)) printf("%c",n);
printf("]\n");
dfa += BITBLK;
break;
case NSET:
printf("\tNSET [");
for (n = 0; n < MAXCHR; n++)
if (ISINSET(dfa,n)) printf("%c",n);
printf("]\n"); dfa += BITBLK;
break;
default:
printf("bad dfa. opcode %o\n", dfa[-1]);
exit(1);
break;
}
}
#endif