home *** CD-ROM | disk | FTP | other *** search
- /* -------------------------------------------------------------------- */
- /* String++ Version 3.00 04/10/93 */
- /* */
- /* Enhanced string class for Turbo C++/Borland C++. */
- /* Copyright 1991-1993 by Carl W. Moreland */
- /* */
- /* Derived from code Copyright 1988, Jim Mischel. All rights reserved. */
- /* */
- /* regexp.cpp */
- /* -------------------------------------------------------------------- */
-
- #include <stddef.h>
- #include "regexp.h"
-
- #define TRUE -1
- #define FALSE 0
- #define ALMOST 1 // returned when a closure matches a NULL
-
- const char ENDSTR = '\0';
- const char EOL = '$';
- const char BOL = '^';
- const char NEGATE = '^';
- const char CCL = '[';
- const char NCCL = ']';
- const char CCLEND = ']';
- const char ANY = '.';
- const char DASH = '-';
- const char OR = '|';
- const char ESCAPE = '\\';
- const char LPAREN = '(';
- const char RPAREN = ')';
- const char CLOSURE = '*';
- const char POS_CLO = '+';
- const char ZERO_ONE = '?';
- const char LITCHAR = 'c';
- const char END_TERM = 'e';
- const String FS_DEFAULT = "[ \t]+";
-
- int RSTART; // start of matched substring
- int RLENGTH; // length of matched substring
- int NF; // number of fields from most current split
- RegExp FS(FS_DEFAULT); // global field separator
-
- /* -------------------------------------------------------------------- */
-
- RegExp::RegExp(const String& s)
- {
- reExpression = s;
-
- if(reExpression() != "")
- MakePattern();
- }
-
- void RegExp::operator=(const char* p)
- {
- reExpression = p;
-
- if(reExpression() != "")
- MakePattern();
- }
-
- void RegExp::operator=(const String& s)
- {
- reExpression = s;
-
- if(reExpression() != "")
- MakePattern();
- }
-
- /* -------------------------------------------------------------------- */
-
- // MakePattern() - "compile" the regular expression into rePattern
-
- const char* RegExp::MakePattern(void)
- {
- static String tmp;
-
- if(ParseExpression(tmp) == "" || reExpression != ENDSTR)
- {
- rePattern = "";
- reExpression.Reset();
- return NULL;
- }
- rePattern = tmp;
- reExpression.Reset();
- return rePattern;
- }
-
- // ParseExpression() - Parse and translate an expression. Returns a pointer
- // to the compiled expression, or NULL on error.
-
- const char* RegExp::ParseExpression(String& expr)
- {
- String term;
- expr = "";
-
- if(ParseTerm(term) == "") // get the first term
- return(NULL);
-
- while(reExpression == OR) // parse all subsequent terms
- {
- expr += OR;
- expr += term;
- expr += END_TERM;
- reExpression++;
-
- if(ParseTerm(term) == "")
- return (NULL);
- }
- expr += term;
- expr += END_TERM;
-
- return expr;
- }
-
- // ParseTerm() - parse and translate a term. Returns a pointer to the
- // compiled term or NULL on error.
-
- const char* RegExp::ParseTerm(String& term)
- {
- String factor;
- term = "";
-
- if(reExpression == BOL)
- {
- term += reExpression[0];
- reExpression++;
- }
- do
- {
- if(ParseFactor(factor) == "")
- return(NULL);
-
- term += factor;
- }
- while(IsFactor()); // parse all factors of this term
-
- return term;
- }
-
- // IsFactor() - returns TRUE for a valid factor character
-
- int RegExp::IsFactor(void)
- {
- static char* nfac_chars = "^|)]+?*";
-
- return (strchr(nfac_chars, reExpression[0]) == NULL) ? TRUE : FALSE;
- }
-
- // ParseFactor() - parse and translate a factor. Returns a pointer to the
- // compiled factor or NULL on error.
-
- const char* RegExp::ParseFactor(String& factor)
- {
- String tmp;
- factor = "";
-
- switch(reExpression[0])
- {
- case LPAREN: // ( - parenthesised expression
- reExpression++;
- ParseExpression(tmp);
- factor += tmp;
- if(reExpression != RPAREN)
- return(NULL);
- reExpression++;
- break;
-
- case CCL: // [ - character class; Ex: [0-9]
- reExpression++;
- ParseCCL(tmp);
- factor += tmp;
- if(reExpression != CCLEND)
- return(NULL);
- reExpression++;
- break;
-
- case ANY: // .
- case EOL: // $
- factor += reExpression[0];
- reExpression++;
- break;
-
- case ESCAPE : // \
- reExpression++;
- factor += LITCHAR;
- factor += ParseEscape();
- reExpression++;
- break;
-
- case CLOSURE: // *
- case POS_CLO: // +
- case ZERO_ONE: // ?
- case NEGATE: // ^
- case CCLEND: // ]
- case RPAREN: // )
- case OR: // |
- return(NULL); // not valid characters
-
- default: // literal character
- factor += LITCHAR;
- factor += reExpression[0];
- reExpression++;
- break;
- }
-
- if(reExpression == CLOSURE || // check for closure
- reExpression == ZERO_ONE ||
- reExpression == POS_CLO)
- {
- if(ParseClosure(factor) == FALSE)
- return(NULL);
- }
- return factor;
- }
-
- // ParseEscape() - returns ASCII value of character(s) following ESCAPE
-
- char RegExp::ParseEscape(void)
- {
- static unsigned char ch;
-
- switch(reExpression[0])
- {
- case 'b': reExpression++; return ('\b'); // backspace
- case 't': reExpression++; return ('\t'); // tab
- case 'f': reExpression++; return ('\f'); // formfeed
- case 'n': reExpression++; return ('\n'); // linefeed
- case 'r': reExpression++; return ('\r'); // carriage return
-
- case '0': // 0-7 is octal constant
- case '1':
- case '2':
- case '3':
- case '4':
- case '5':
- case '6':
- case '7':
- {
- ch = reExpression[0] - '0';
- reExpression++;
- if(reExpression >= '0' && reExpression < '8')
- {
- ch <<= 3;
- ch += reExpression[0] - '0';
- reExpression++;
- }
- if(reExpression >= '0' && reExpression < '8')
- {
- ch <<= 3;
- ch += reExpression[0] - '0';
- reExpression++;
- }
- return(ch);
- }
- default: // otherwise, just that char
- return(reExpression[0]);
- }
- }
-
- // ParseClosure() - place closure character and size before the factor
- // in the compiled string.
-
- int RegExp::ParseClosure(String& factor)
- {
- if(factor.Len() > 253)
- return FALSE; // closure expression too large
-
- factor.Insert(0, " ");
- factor[0] = reExpression[0];
- factor[1] = (char)(factor.Len()-2);
-
- reExpression++;
-
- return TRUE;
- }
-
- // ParseCCL() - parse and translate a character class. Return pointer to the
- // compiled class or NULL on error.
-
- const char* RegExp::ParseCCL(String& ccl)
- {
- int first = TRUE;
- int len;
-
- ccl = "[ ";
-
- if(reExpression == NEGATE) // if first character is NEGATE (^)
- {
- ccl[0] = NCCL; // then we have a negated
- reExpression++; // character class
- }
-
- // parse all characters up to the closing bracket or end of string marker
-
- while(reExpression != CCLEND && reExpression != ENDSTR)
- {
- if(reExpression == DASH && first == FALSE) // DASH, check for range
- {
- reExpression++;
- if(reExpression == NCCL)
- ccl += DASH; // not range, literal DASH
- else
- {
- ParseDASH(ccl, reExpression);
- reExpression++;
- }
- }
- else
- {
- if(reExpression == ESCAPE)
- {
- reExpression++;
- ccl += ParseEscape();
- reExpression++;
- }
- else
- {
- ccl += reExpression[0];
- reExpression++;
- }
- }
- first = FALSE;
- }
-
- len = ccl.Len()-2;
-
- if(len > 255)
- return NULL; // character class too large
- else
- {
- ccl[1] = (char)len; // store CCL length at ccl[1]
- return ccl;
- }
- }
-
- // ParseDASH() - fill in range characters.
-
- const char* RegExp::ParseDASH(String& ccl, char ch)
- {
- static char c;
-
- for(c = ccl[ccl.Len() - 1] + 1; c <= ch; c++)
- ccl += c;
-
- return ccl;
- }
-
- /* -------------------------------------------------------------------- */
-
- // Match() - Return the position of the first character of the left-most
- // longest substring of s that Matches pattern, or NULL if no match is
- // found. Sets RSTART and RLENGTH.
-
- int RegExp::Match(const char* s, const char* pattern)
- {
- char* c = (char*)s;
- reLastChar = (char*)s;
-
- while(*c != ENDSTR)
- {
- if(MatchTerm(int(c-(char*)s), c, (char*)pattern) != FALSE)
- {
- RSTART = int(c-(char*)s);
- RLENGTH = int(reLastChar - c);
- return RSTART;
- }
- c++;
- }
- RSTART = RLENGTH = 0;
- return -1;
- }
-
- // MatchTerm() - Match a compiled term. Returns TRUE, FALSE, or ALMOST.
-
- int RegExp::MatchTerm(int offset, char* s, char* pattern)
- {
- reLastChar = s;
-
- if(*pattern == ENDSTR)
- return FALSE;
-
- do
- {
- switch(*pattern)
- {
- case BOL: // ^ - match beginning of line
- if(offset != 0)
- return FALSE;
- pattern++;
- break;
-
- case LITCHAR: // c - match literal character
- if(*s++ != *++pattern)
- return FALSE;
- pattern++;
- break;
-
- case END_TERM: // e - skip end-of-term character
- pattern++;
- break;
-
- case ANY: // . - match any character
- if(*s++ == ENDSTR) // except end of string
- return FALSE;
- pattern++;
- break;
-
- case OR:
- return MatchOR(offset, s, pattern);
-
- case CCL: // [ - character class
- case NCCL: // ]
- if(*s == ENDSTR)
- return FALSE;
- if(!MatchCCL(*s++, pattern++))
- return FALSE;
- pattern += pattern[0] + 1;
- break;
-
- case EOL: // $ - match end of string
- if(*s != ENDSTR)
- return FALSE;
- pattern++;
- break;
-
- case ZERO_ONE: // ?
- return Match_0_1(offset, s, pattern);
-
- case CLOSURE: // *
- case POS_CLO: // +
- {
- char ClosurePattern[1024];
-
- strncpy(ClosurePattern, pattern+2, *(pattern+1));
- ClosurePattern[*(pattern+1)] = ENDSTR;
- return MatchClosure(offset, s, pattern, ClosurePattern);
- }
-
- default:
-
- // If we get to this point, then something has gone very wrong.
- // Most likely, someone has tried to match with an invalid
- // compiled pattern.
-
- return FALSE;
- }
- reLastChar = s;
- } while(*pattern != ENDSTR);
-
- return TRUE;
- }
-
- // MatchOR() - Handles selection processing.
-
- int RegExp::MatchOR(int offset, char* s, char* pattern)
- {
- char tmp_pattern[1024];
- char *t1, *t2, *junk;
-
- // The first case is build into tmp_pattern. Second case is already there.
- // Both patterns are searched to determine the longest matched substring.
-
- tmp_pattern[0] = ENDSTR;
- pattern++;
- junk = (char*)SkipTerm(pattern);
-
- strncat(tmp_pattern, pattern, int(junk-pattern));
- strcat(tmp_pattern, SkipTerm(junk));
- t1 = (MatchTerm(offset, s, tmp_pattern) != FALSE) ? reLastChar : NULL;
-
- // The second pattern need not be searched if the first pattern results
- // in a match through to the end of the string, since the longest possible
- // match has already been found.
-
- if(t1 == NULL || *reLastChar != ENDSTR)
- {
- t2 = (MatchTerm(offset, s, junk) != FALSE) ? reLastChar : NULL;
-
- // determine which matched the longest substring
-
- if(t1 != NULL && (t2 == NULL || t1 > t2))
- reLastChar = t1;
- }
- return (t1 == NULL && t2 == NULL) ? FALSE : TRUE;
- }
-
- // SkipTerm() - Skip over the current term and return a pointer to the
- // next term in the pattern.
-
- const char* RegExp::SkipTerm(char* pattern)
- {
- register int nterm = 1;
-
- while(nterm > 0)
- {
- switch(*pattern)
- {
- case OR:
- nterm++;
- break;
-
- case CCL:
- case NCCL:
- case CLOSURE:
- case ZERO_ONE:
- case POS_CLO:
- pattern++;
- pattern += pattern[0];
- break;
-
- case END_TERM:
- nterm--;
- break;
-
- case LITCHAR:
- pattern++;
- break;
- }
- pattern++;
- }
- return pattern;
- }
-
- // Match_0_1() - Match the ZERO_ONE operator. First, this routine attempts
- // to match the entire pattern with the input string. If that fails, it
- // skips over the closure pattern and attempts to match the rest of the
- // pattern.
-
- int RegExp::Match_0_1(int offset, char* s, char* pattern)
- {
- char* old_s = s;
-
- if(MatchTerm(offset, s, pattern+2) == TRUE)
- return TRUE;
- else if(MatchTerm(offset, old_s, pattern+2+*(pattern+1)) == FALSE)
- return FALSE;
- else
- return ALMOST;
- }
-
- // MatchClosure() - Match CLOSURE and POS_CLO. Match as many of the
- // closure patterns as possible, then attempt to match the remaining
- // pattern with what's left of the input string. Backtrack until we've
- // either matched the remaing pattern or we arrive back at where we
- // started.
-
- int RegExp::MatchClosure(int offset, char* s,
- char* pattern, char* ClosurePattern)
- {
- char* old_s = s;
-
- if(MatchTerm(offset, s, ClosurePattern) == TRUE)
- {
- old_s = reLastChar;
- if(MatchClosure(offset, old_s, pattern, ClosurePattern) != FALSE)
- return TRUE;
- else
- return MatchTerm(offset, old_s, pattern+2+*(pattern+1));
- }
- else if(*pattern != CLOSURE) // POS_CLO requires at least one match
- return FALSE;
- else if(MatchTerm(offset, old_s, pattern+2+*(pattern+1)) == TRUE)
- return ALMOST;
- else
- return FALSE;
- }
-
- // MatchCCL() - Match a character class or negated character class
-
- int RegExp::MatchCCL(char c, char* pattern)
- {
- register int x;
- char ccl = *pattern++;
-
- for(x = *pattern; x > 0; x--)
- {
- if(c == pattern[x])
- return(ccl == CCL);
- }
- return(ccl != CCL);
- }
-
- /* -------------------------------------------------------------------- */
-
- // Return the position of the first instance of t in s, or -1 if no match.
-
- int index(const String& s, const RegExp& t)
- {
- return match(s, (RegExp)t);
- }
-
- // sub() - Substitute 'to' for the leftmost longest substring of str
- // matched by the regular expression 'from'. Return number of substitutions
- // made.
-
- int sub(RegExp& from, const String& to, String& str)
- {
- if(match(str, from) == -1)
- return 0;
-
- str.Replace(RSTART, RLENGTH, to);
- return 1;
- }
-
- // gsub() - Substitute 'to' globally for all substrings in str matched by
- // the regular expression 'from'. Return number of substitutions made.
-
- int gsub(RegExp& from, const String& to, String& str, unsigned count)
- {
- int n = 0;
- ParseString tmp = str;
-
- while(match(tmp, from) != -1)
- {
- tmp.Replace(RSTART, RLENGTH, to);
- tmp += RSTART+RLENGTH;
- if(++n == count)
- break;
- }
- tmp.Reset();
- str = tmp;
-
- return n;
- }
-
- // split() - split the string s into fields in the array a on field
- // separator fs. Returns number of fields. Also sets the global variable
- // NF.
-
- int split(const String& s, String*& array, const RegExp& fs)
- {
- String *tmp;
- ParseString ps = s;
- int rstart = RSTART; // save RSTART and RLENGTH
- int rlength = RLENGTH;
- NF = 1;
-
- while(match(ps, (RegExp)fs) != -1)
- {
- ps += RSTART+RLENGTH;
- NF++;
- }
-
- tmp = new String[NF];
- ps.Reset();
- NF = 0;
-
- while(match(ps, (RegExp)fs) != -1)
- {
- tmp[NF++] = ps(0, RSTART);
- ps += RSTART+RLENGTH;
- }
- array = tmp;
-
- RSTART = rstart; // restore globals
- RLENGTH = rlength;
- return (NF);
- }
-
- ostream& operator<<(ostream& strm, const RegExp& re)
- {
- return strm << re.reExpression();
- }
-
-