home *** CD-ROM | disk | FTP | other *** search
- /*
- ** regexp - a C++-ized version of Henry Spencers regexp package.
- **
- ** regexp.C regexp.C 1.7 Delta\'d: 15:39:42 9/22/92 Mike Lijewski, CNSF
- **
- **
- ** @\(#\)regexp.c 1.3 of 18 April 87
- **
- ** Copyright \(c\) 1986 by University of Toronto.
- ** Written by Henry Spencer. Not derived from licensed software.
- **
- ** Permission is granted to anyone to use this software for any
- ** purpose on any computer system, and to redistribute it freely,
- ** subject to the following restrictions:
- **
- ** 1. The author is not responsible for the consequences of use of
- ** this software, no matter how awful, even if they arise
- ** from defects in it.
- **
- ** 2. The origin of this software must not be misrepresented, either
- ** by explicit claim or by omission.
- **
- ** 3. Altered versions must be plainly marked as such, and must not
- ** be misrepresented as being the original software.
- **
- ** Beware that some of this code is subtly aware of the way operator
- ** precedence is structured in regular expressions. Serious changes in
- ** regular-expression syntax might require a total rethink.
- */
-
- #include <stdio.h>
- #include <stdlib.h>
- #include <string.h>
-
- #include "regexp.h"
-
- /*
- ** The "internal use only" fields in regexp.h are present to pass info from
- ** compile to execute that permits the execute phase to run lots faster on
- ** simple cases. They are:
- **
- ** regstart char that must begin a match; \'\0\' if none obvious
- ** reganch is the match anchored \(at beginning-of-line only\)?
- ** regmust string \(pointer into program\) that match must include, or 0
- ** regmlen length of regmust string
- **
- ** Regstart and reganch permit very fast decisions on suitable starting points
- ** for a match, cutting down the work a lot. Regmust permits fast rejection
- ** of lines that cannot possibly match. The regmust tests are costly enough
- ** that regcomp\(\) supplies a regmust only if the r.e. contains something
- ** potentially expensive \(at present, the only such thing detected is * or +
- ** at the start of the r.e., which can involve a lot of backup\). Regmlen is
- ** supplied because the test in regexec\(\) needs it and regcomp\(\) is computing
- ** it anyway.
- */
-
- /*
- ** Structure for regexp "program". This is essentially a linear encoding
- ** of a nondeterministic finite-state machine \(aka syntax charts or
- ** "railroad normal form" in parsing technology\). Each node is an opcode
- ** plus a "next" pointer, possibly plus an operand. "Next" pointers of
- ** all nodes except BRANCH implement concatenation; a "next" pointer with
- ** a BRANCH on both ends of it is connecting two alternatives. \(Here we
- ** have one of the subtle syntax dependencies: an individual BRANCH \(as
- ** opposed to a collection of them\) is never concatenated with anything
- ** because of operator precedence.\) The operand of some types of node is
- ** a literal string; for others, it is a node leading into a sub-FSM. In
- ** particular, the operand of a BRANCH node is the first node of the branch.
- ** \(NB this is *not* a tree structure: the tail of the branch connects
- ** to the thing following the set of BRANCHes.\) The opcodes are:
- */
-
- /* definition number opnd? meaning */
- const int END = 0; /* no End of program. */
- const int BOL = 1; /* no Match "" at beginning of line. */
- const int EOL = 2; /* no Match "" at end of line. */
- const int ANY = 3; /* no Match any one character. */
- const int ANYOF = 4; /* str Match any character in this string. */
- const int ANYBUT = 5; /* str Match any character not in this string. */
- const int BRANCH = 6; /* node Match this alternative, or the next... */
- const int BACK = 7; /* no Match "", "next" ptr points backward. */
- const int EXACTLY = 8; /* str Match this string. */
- const int NOTHING = 9; /* no Match empty string. */
- const int STAR = 10; /* node Match this \(simple\) thing 0 or more times. */
- const int PLUS = 11; /* node Match this \(simple\) thing 1 or more times. */
- const int OPEN = 20; /* no Mark this point in input as start of #n. */
- /* OPEN+1 is number 1, etc. */
- const int CLOSE = 30; /* no Analogous to OPEN. */
-
- /*
- ** Opcode notes:
- **
- ** BRANCH The set of branches constituting a single choice are hooked
- ** together with their "next" pointers, since precedence prevents
- ** anything being concatenated to any individual branch. The
- ** "next" pointer of the last BRANCH in a choice points to the
- ** thing following the whole choice. This is also where the
- ** final "next" pointer of each individual branch points; each
- ** branch starts with the operand node of a BRANCH node.
- **
- ** BACK Normal "next" pointers all implicitly point forward; BACK
- ** exists to make loop structures possible.
- **
- ** STAR,PLUS \'?\', and complex \'*\' and \'+\', are implemented as circular
- ** BRANCH structures using BACK. Simple cases \(one character
- ** per match\) are implemented with STAR and PLUS for speed
- ** and to minimize recursive plunges.
- **
- ** OPEN,CLOSE ...are numbered at compile time.
- */
-
- /*
- ** A node is one char of opcode followed by two chars of "next" pointer.
- ** "Next" pointers are stored as two 8-bit pieces, high order first. The
- ** value is a positive offset from the opcode of the node containing it.
- ** An operand, if any, simply follows the node. \(Note that much of the
- ** code generation knows about this implicit relationship.\)
- **
- ** Using two bytes for the "next" pointer is vast overkill for most things,
- ** but allows patterns to get big without disasters.
- */
-
- #define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
-
- const int MAGIC = 0234;
- const char *const META = "^$.[()|?+*\\";
-
- inline char OP(const char *const &p) { return *p; }
- inline char *OPERAND(char *p) { return p + 3; }
- inline int UCHARAT(const char *p) { return (int)*(unsigned char *)p; }
- inline int FAIL(const char *m) { REerror = m; return 0; }
- inline int ISMULT(char c) { return c == '*' || c == '+' || c == '?'; }
-
- // Flags to be passed up and down.
- const int HASWIDTH = 01; /* Known never to match null string. */
- const int SIMPLE = 02; /* Simple enough to be STAR/PLUS operand. */
- const int SPSTART = 04; /* Starts with * or +. */
- const int WORST = 0; /* Worst case. */
-
- // our definition of REerror
- const char *REerror;
-
- // Global work variables for regcomp\(\).
- static const char *regparse; /* Input-scan pointer. */
- static int regnpar; /* \(\) count. */
- static char regdummy;
- static char *regcode; /* Code-emit pointer; ®dummy = don\'t. */
- static long regsize; /* Code size. */
-
- /*
- ** regnext - dig the "next" pointer out of a node
- */
-
- static char *regnext(char *p)
- {
- if (p == ®dummy) return 0;
- int offset = NEXT(p);
- if (offset == 0) return 0;
- return (OP(p) == BACK) ? p-offset : p+offset;
- }
-
- /*
- ** regtail - set the next-pointer at the end of a node chain
- */
-
- static void regtail(char *p, char *val)
- {
- if (p == ®dummy) return;
-
- /* Find last node. */
- char *scan = p;
- char *temp;
- for (;;) {
- temp = regnext(scan);
- if (temp == 0) break;
- scan = temp;
- }
-
- int offset = (OP(scan) == BACK) ? scan - val : val - scan;
- *(scan+1) = (offset>>8)&0377;
- *(scan+2) = offset&0377;
- }
-
- /*
- ** regoptail - regtail on operand of first argument; nop if operandless
- */
-
- static void regoptail(char *p, char *val)
- {
- /* "Operandless" and "op != BRANCH" are synonymous in practice. */
- if (p == 0 || p == ®dummy || OP(p) != BRANCH) return;
- regtail(OPERAND(p), val);
- }
-
- /*
- ** regnode - emit a node
- */
-
- static char *regnode(char op)
- {
- char *ret = regcode;
- if (ret == ®dummy) { regsize += 3; return ret; }
- char *ptr = ret;
- *ptr++ = op;
- *ptr++ = '\0'; /* Null "next" pointer. */
- *ptr++ = '\0';
- regcode = ptr;
- return ret;
- }
-
- /*
- ** reginsert - insert an operator in front of already-emitted operand
- **
- ** Means relocating the operand.
- */
-
- static void reginsert(char op, char *opnd)
- {
- if (regcode == ®dummy) { regsize += 3; return; }
-
- char *src = regcode;
- regcode += 3;
- char *dst = regcode;
- while (src > opnd) *--dst = *--src;
-
- char *place = opnd; // Op node, where operand used to be.
- *place++ = op;
- *place++ = '\0';
- *place++ = '\0';
- }
-
- /*
- ** regc - emit \(if appropriate\) a byte of code
- */
-
- static inline void regc(char b)
- {
- if (regcode != ®dummy)
- *regcode++ = b;
- else
- regsize++;
- }
-
- // forward reference
- static char *reg(int paren, int *flagp);
-
- /*
- ** regatom - the lowest level
- **
- ** Optimization: gobbles an entire sequence of ordinary characters so that
- ** it can turn them into a single node, which is smaller to store and
- ** faster to run. Backslashed characters are exceptions, each becoming a
- ** separate node; the code is simpler that way and it\'s not worth fixing.
- */
-
- static char *regatom(int *flagp)
- {
- char *ret;
- int flags;
-
- *flagp = WORST; /* Tentatively. */
-
- switch (*regparse++) {
- case '^': ret = regnode(BOL); break;
- case '$': ret = regnode(EOL); break;
- case '.': ret = regnode(ANY); *flagp |= HASWIDTH|SIMPLE; break;
- case '[': {
- if (*regparse == '^') { /* Complement of range. */
- ret = regnode(ANYBUT);
- regparse++;
- } else
- ret = regnode(ANYOF);
- if (*regparse == ']' || *regparse == '-') regc(*regparse++);
- while (*regparse != '\0' && *regparse != ']') {
- if (*regparse == '-') {
- regparse++;
- if (*regparse == ']' || *regparse == '\0')
- regc('-');
- else {
- int theclass = UCHARAT(regparse-2)+1;
- int classend = UCHARAT(regparse);
- if (theclass > classend+1) FAIL("invalid [] range");
- for (; theclass <= classend; theclass++) regc(theclass);
- regparse++;
- }
- } else
- regc(*regparse++);
- }
- regc('\0');
- if (*regparse != ']') FAIL("unmatched []");
- regparse++;
- *flagp |= HASWIDTH|SIMPLE;
- }
- break;
- case '(':
- ret = reg(1, &flags);
- if (ret == 0) return 0;
- *flagp |= flags&(HASWIDTH|SPSTART);
- break;
- case '\0':
- case '|':
- case ')':
- FAIL("internal urp"); break; /* Supposed to be caught earlier. */
- case '?':
- case '+':
- case '*':
- FAIL("?+* follows nothing"); break;
- case '\\':
- if (*regparse == '\0') FAIL("trailing \\");
- ret = regnode(EXACTLY);
- regc(*regparse++);
- regc('\0');
- *flagp |= HASWIDTH|SIMPLE;
- break;
- default: {
- regparse--;
- int len = (int) strcspn(regparse, META);
- if (len <= 0) FAIL("internal disaster");
- char ender = *(regparse+len);
- if (len > 1 && ISMULT(ender)) len--; // Back off clear of ?+* operand
- *flagp |= HASWIDTH;
- if (len == 1) *flagp |= SIMPLE;
- ret = regnode(EXACTLY);
- while (len > 0) { regc(*regparse++); len--; }
- regc('\0');
- }
- break;
- }
- return ret;
- }
-
- /*
- ** regpiece - something followed by possible \[*+?\]
- **
- ** Note that the branching code sequences used for ? and the general cases
- ** of * and + are somewhat optimized: they use the same NOTHING node as
- ** both the endmarker for their branch list and the body of the last branch.
- ** It might seem that this node could be dispensed with entirely, but the
- ** endmarker role is not redundant.
- */
-
- static char *regpiece(int *flagp)
- {
- int flags;
- char *ret = regatom(&flags);
- if (ret == 0) return 0;
-
- char op = *regparse;
- if (!ISMULT(op)) { *flagp = flags; return ret; }
-
- if (!(flags&HASWIDTH) && op != '?') FAIL("*+ operand could be empty");
- *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
-
- char *next;
- if (op == '*' && (flags&SIMPLE))
- reginsert(STAR, ret);
- else if (op == '*') {
- /* Emit x* as \(x&|\), where & means "self". */
- reginsert(BRANCH, ret); /* Either x */
- regoptail(ret, regnode(BACK)); /* and loop */
- regoptail(ret, ret); /* back */
- regtail(ret, regnode(BRANCH)); /* or */
- regtail(ret, regnode(NOTHING)); /* null. */
- } else if (op == '+' && (flags&SIMPLE))
- reginsert(PLUS, ret);
- else if (op == '+') {
- /* Emit x+ as x\(&|\), where & means "self". */
- next = regnode(BRANCH); /* Either */
- regtail(ret, next);
- regtail(regnode(BACK), ret); /* loop back */
- regtail(next, regnode(BRANCH)); /* or */
- regtail(ret, regnode(NOTHING)); /* null. */
- } else if (op == '?') {
- /* Emit x? as \(x|\) */
- reginsert(BRANCH, ret); /* Either x */
- regtail(ret, regnode(BRANCH)); /* or */
- next = regnode(NOTHING); /* null. */
- regtail(ret, next);
- regoptail(ret, next);
- }
- regparse++;
- if (ISMULT(*regparse)) FAIL("nested *?+");
- return ret;
- }
-
- /*
- ** regbranch - one alternative of an | operator
- **
- ** Implements the concatenation operator.
- */
-
- static char *regbranch(int *flagp)
- {
- *flagp = WORST; /* Tentatively. */
- char *latest;
- int flags;
-
- char *ret = regnode(BRANCH);
- char *chain = 0;
- while (*regparse != '\0' && *regparse != '|' && *regparse != ')') {
- latest = regpiece(&flags);
- if (latest == 0) return 0;
- *flagp |= flags&HASWIDTH;
- if (chain == 0) /* First piece. */
- *flagp |= flags&SPSTART;
- else
- regtail(chain, latest);
- chain = latest;
- }
- if (chain == 0) (void) regnode(NOTHING); /* Loop ran zero times. */
- return ret;
- }
-
- /*
- ** reg - regular expression, i.e. main body or parenthesized thing
- **
- ** Caller must absorb opening parenthesis.
- **
- ** Combining parenthesis handling with the base level of regular expression
- ** is a trifle forced, but the need to tie the tails of the branches to what
- ** follows makes it hard to avoid.
- */
-
- static char *reg(int paren, int *flagp)
- {
- *flagp = HASWIDTH; /* Tentatively. */
- char *ret;
- int parno;
-
- /* Make an OPEN node, if parenthesized. */
- if (paren) {
- if (regnpar >= NSUBEXP) FAIL("too many ()");
- parno = regnpar;
- regnpar++;
- ret = regnode(OPEN+parno);
- } else
- ret = 0;
-
- /* Pick up the branches, linking them together. */
- int flags;
- char *br = regbranch(&flags);
- if (br == 0) return(0);
- if (ret != 0)
- regtail(ret, br); /* OPEN -> first. */
- else
- ret = br;
- if (!(flags&HASWIDTH)) *flagp &= ~HASWIDTH;
- *flagp |= flags&SPSTART;
- while (*regparse == '|') {
- regparse++;
- br = regbranch(&flags);
- if (br == 0) return 0;
- regtail(ret, br); /* BRANCH -> BRANCH. */
- if (!(flags&HASWIDTH)) *flagp &= ~HASWIDTH;
- *flagp |= flags&SPSTART;
- }
-
- /* Make a closing node, and hook it on the end. */
- char *ender = regnode((paren) ? CLOSE+parno : END);
- regtail(ret, ender);
-
- /* Hook the tails of the branches to the closing node. */
- for (br = ret; br != 0; br = regnext(br)) regoptail(br, ender);
-
- /* Check for proper termination. */
- if (paren && *regparse++ != ')') {
- FAIL("unmatched ()");
- } else if (!paren && *regparse != '\0') {
- if (*regparse == ')') {
- FAIL("unmatched ()");
- } else
- FAIL("junk on end"); /* "Can\'t happen". */
- /* NOTREACHED */
- }
- return ret;
- }
-
- /*
- ** regcomp - compile a regular expression into internal code
- **
- ** We can\'t allocate space until we know how big the compiled form will be,
- ** but we can\'t compile it \(and thus know how big it is\) until we\'ve got a
- ** place to put the code. So we cheat: we compile it twice, once with code
- ** generation turned off and size counting turned on, and once "for real".
- ** This also means that we don\'t allocate space until we are sure that the
- ** thing really will compile successfully, and we never have to move the
- ** code and thus invalidate pointers into it. \(Note that it has to be in
- ** one piece because free\(\) must be able to free it all.\)
- **
- ** Beware that the optimization-preparation code in here knows about some
- ** of the structure of the compiled regexp.
- */
-
- regexp *regcomp(const char *exp)
- {
- if (exp == 0) FAIL("0 argument");
-
- /* First pass: determine size, legality. */
- regparse = exp;
- regnpar = 1;
- regsize = 0L;
- regcode = ®dummy;
- regc(MAGIC);
-
- int flags;
- if (reg(0, &flags) == 0) return 0;
-
- /* Small enough for pointer-storage convention? */
- if (regsize >= 32767L) FAIL("regexp too big"); // Probably could be 65535L
-
- /* Allocate space. */
- regexp *r = (regexp *) new char[sizeof(regexp) + (unsigned)regsize];
- if (r == 0) FAIL("out of space");
-
- /* Second pass: emit code. */
- regparse = exp;
- regnpar = 1;
- regcode = r->program;
- regc(MAGIC);
- if (reg(0, &flags) == 0) return 0;
-
- /* Dig out information for optimizations. */
- r->regstart = '\0'; /* Worst-case defaults. */
- r->reganch = 0;
- r->regmust = 0;
- r->regmlen = 0;
- char *scan = r->program+1; // First BRANCH.
- if (OP(regnext(scan)) == END) { // Only one top-level choice.
- scan = OPERAND(scan);
- /* Starting-point info. */
- if (OP(scan) == EXACTLY)
- r->regstart = *OPERAND(scan);
- else if (OP(scan) == BOL)
- r->reganch++;
- /*
- * If there\'s something expensive in the r.e., find the
- * longest literal string that must appear and make it the
- * regmust. Resolve ties in favor of later strings, since
- * the regstart check works with the beginning of the r.e.
- * and avoiding duplication strengthens checking. Not a
- * strong reason, but sufficient in the absence of others.
- */
- char *longest;
- int len;
- if (flags&SPSTART) {
- longest = 0;
- len = 0;
- for (; scan != 0; scan = regnext(scan))
- if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) {
- longest = OPERAND(scan);
- len = (int) strlen(OPERAND(scan));
- }
- r->regmust = longest;
- r->regmlen = len;
- }
- }
- return r;
- }
-
- /*
- ** regexec and friends
- */
-
- /*
- ** Global work variables for regexec\(\).
- */
- static char *reginput; /* String-input pointer. */
- static char *regbol; /* Beginning of input, for ^ check. */
- static char **regstartp; /* Pointer to startp array. */
- static char **regendp; /* Ditto for endp. */
-
- #ifdef DEBUG
- int regnarrate = 0;
- void regdump();
- static char *regprop();
- #endif
-
- /*
- ** regrepeat - repeatedly match something simple, report how many
- */
-
- static int regrepeat(char *p)
- {
- int count = 0;
- char *scan = reginput;
- char *opnd = OPERAND(p);
- switch (OP(p)) {
- case ANY:
- count = (int) strlen(scan);
- scan += count;
- break;
- case EXACTLY:
- while (*opnd == *scan) { count++; scan++; }
- break;
- case ANYOF:
- while (*scan != '\0' && strchr(opnd, *scan) != 0) {
- count++;
- scan++;
- }
- break;
- case ANYBUT:
- while (*scan != '\0' && strchr(opnd, *scan) == 0) {
- count++;
- scan++;
- }
- break;
- default: /* Oh dear. Called inappropriately. */
- REerror = "internal foulup";
- count = 0; /* Best compromise. */
- break;
- }
- reginput = scan;
- return count;
- }
-
- /*
- ** regmatch - main matching routine
- **
- ** Conceptually the strategy is simple: check to see whether the current
- ** node matches, call self recursively to see whether the rest matches,
- ** and then act accordingly. In practice we make some effort to avoid
- ** recursion, in particular by going through "ordinary" nodes \(that don\'t
- ** need to know whether the rest of the match failed\) by a loop instead of
- ** by recursion.
- **
- ** 0 failure, 1 success
- */
-
- static int regmatch(char *prog)
- {
- char *scan = prog; /* Current node. */
- char *next; /* Next node. */
-
- #ifdef DEBUG
- if (scan != 0 && regnarrate) fprintf(stderr, "%s(\n", regprop(scan));
- #endif
- while (scan != 0) {
- #ifdef DEBUG
- if (regnarrate) fprintf(stderr, "%s...\n", regprop(scan));
- #endif
- next = regnext(scan);
-
- switch (OP(scan)) {
- case BOL: if (reginput != regbol) return 0; break;
- case EOL: if (*reginput != '\0') return 0; break;
- case ANY: if (*reginput == '\0') return(0); reginput++; break;
- case EXACTLY: {
- char *opnd = OPERAND(scan);
- /* Inline the first character, for speed. */
- if (*opnd != *reginput) return 0;
- int len = (int) strlen(opnd);
- if (len > 1 && strncmp(opnd, reginput, len) != 0) return 0;
- reginput += len;
- }
- break;
- case ANYOF:
- if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == 0)
- return 0;
- reginput++;
- break;
- case ANYBUT:
- if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != 0)
- return 0;
- reginput++;
- break;
- case NOTHING: break;
- case BACK: break;
- case OPEN+1:
- case OPEN+2:
- case OPEN+3:
- case OPEN+4:
- case OPEN+5:
- case OPEN+6:
- case OPEN+7:
- case OPEN+8:
- case OPEN+9: {
- int no = OP(scan) - OPEN;
- char *save = reginput;
- if (regmatch(next)) {
- /*
- ** Don\'t set startp if some later invocation of the same
- ** parentheses already has.
- */
- if (regstartp[no] == 0) regstartp[no] = save; return 1;
- } else
- return 0;
- }
- case CLOSE+1:
- case CLOSE+2:
- case CLOSE+3:
- case CLOSE+4:
- case CLOSE+5:
- case CLOSE+6:
- case CLOSE+7:
- case CLOSE+8:
- case CLOSE+9: {
- int no = OP(scan) - CLOSE;
- char *save = reginput;
- if (regmatch(next)) {
- /*
- ** Don\'t set endp if some later invocation of the same
- ** parentheses already has.
- */
- if (regendp[no] == 0) regendp[no] = save;
- return 1;
- } else
- return 0;
- }
- case BRANCH: {
- char *save;
- if (OP(next) != BRANCH) /* No choice. */
- next = OPERAND(scan); /* Avoid recursion. */
- else {
- do {
- save = reginput;
- if (regmatch(OPERAND(scan))) return(1);
- reginput = save;
- scan = regnext(scan);
- } while (scan != 0 && OP(scan) == BRANCH);
- return 0;
- }
- }
- break;
- case STAR:
- case PLUS: {
- /*
- ** Lookahead to avoid useless match attempts
- ** when we know what character comes next.
- */
- char nextch = '\0';
- if (OP(next) == EXACTLY) nextch = *OPERAND(next);
- int min = (OP(scan) == STAR) ? 0 : 1;
- char *save = reginput;
- int no = regrepeat(OPERAND(scan));
- while (no >= min) {
- /* If it could work, try it. */
- if (nextch == '\0' || *reginput == nextch)
- if (regmatch(next)) return(1);
- /* Couldn\'t or didn\'t -- back up. */
- no--;
- reginput = save + no;
- }
- return 0;
- }
- case END: return 1; /* Success! */
- default: REerror = "memory corruption"; return 0;
- }
- scan = next;
- }
-
- /*
- * We get here only if there\'s trouble -- normally "case END" is
- * the terminating point.
- */
- REerror = "corrupted pointers";
- return 0;
- }
-
- /*
- ** regtry - try match at specific point
- **
- ** 0 failure, 1 success
- */
-
- static int regtry(regexp *prog, char *string)
- {
- reginput = string;
- regstartp = prog->startp;
- regendp = prog->endp;
-
- char **sp = prog->startp;
- char **ep = prog->endp;
- for (int i = NSUBEXP; i > 0; i--) { *sp++ = 0; *ep++ = 0; }
- if (regmatch(prog->program + 1)) {
- prog->startp[0] = string;
- prog->endp[0] = reginput;
- return 1;
- } else
- return 0;
- }
-
- /*
- ** - regexec - match a regexp against a string
- **/
-
- int regexec(regexp *prog, char *string)
- {
- char *s;
-
- /* Be paranoid... */
- if (prog == 0 || string == 0) { REerror = "0 parameter"; return 0; }
-
- /* Check validity of program. */
- if (UCHARAT(prog->program) != MAGIC) {
- REerror = "corrupted program";
- return 0;
- }
-
- /* If there is a "must appear" string, look for it. */
- if (prog->regmust != 0) {
- s = string;
- while ((s = strchr(s, prog->regmust[0])) != 0) {
- if (strncmp(s, prog->regmust, prog->regmlen) == 0) break; // Found
- s++;
- }
- if (s == 0) return(0); /* Not present. */
- }
-
- /* Mark beginning of line for ^ . */
- regbol = string;
-
- /* Simplest case: anchored match need be tried only once. */
- if (prog->reganch) return(regtry(prog, string));
-
- /* Messy cases: unanchored match. */
- s = string;
- if (prog->regstart != '\0')
- /* We know what char it must start with. */
- while ((s = strchr(s, prog->regstart)) != 0) {
- if (regtry(prog, s)) return 1;
- s++;
- }
- else
- /* We don\'t -- general case. */
- do {
- if (regtry(prog, s)) return(1);
- } while (*s++ != '\0');
-
- /* Failure. */
- return 0;
- }
-
- #ifdef TEST
- int main()
- {
- char *str = "do";
- char *test1 = "do it";
- char *test2 = "dog it";
- char *test3 = "don't do it";
- regexp *rxp = regcomp(str);
- if (!rxp) printf("regcomp() failed on %s\n", str);
- if (regexec(rxp, test1)) printf("matched %s\n", test1);
- if (regexec(rxp, test2)) printf("matched %s\n", test2);
- if (regexec(rxp, test3)) printf("matched %s\n", test3);
- }
- #endif
-