home *** CD-ROM | disk | FTP | other *** search
- /* $Header: E:\SRC\UUPC\RN\RCS/SEARCH.C 1.1 1992/11/21 06:14:58 ahd Exp $
- *
- * $Log: SEARCH.C $
- * Revision 1.1 1992/11/21 06:14:58 ahd
- * Initial
- *
- *
- * Rev 1.0 18 Nov 1990 0:21:58
- * Initial revision.
- * Revision 4.3.2.2 90/03/22 23:05:31 sob
- * Fixes provided by Wayne Davison <drivax!davison>
- *
- * Revision 4.3.2.1 90/03/17 17:46:29 sob
- * Added changes to insure that null search strings won't result in core dumps
- * on non-VAX computers.
- *
- * Revision 4.3 85/05/01 11:50:16 lwall
- * Baseline for release with 4.3bsd.
- *
- */
-
- /* string search routines */
-
- /* Copyright (c) 1981,1980 James Gosling */
-
- /* Modified Aug. 12, 1981 by Tom London to include regular expressions
- as in ed. RE stuff hacked over by jag to correct a few major problems,
- mainly dealing with searching within the buffer rather than copying
- each line to a separate array. Newlines can now appear in RE's */
-
- /* Ripped to shreds and glued back together to make a search package,
- * July 6, 1984, by Larry Wall. (If it doesn't work, it's probably my fault.)
- * Changes include:
- * Buffer, window, and mlisp stuff gone.
- * Translation tables reduced to 1 table.
- * Expression buffer is now dynamically allocated.
- * Character classes now implemented with a bitmap.
- */
-
- #include <stdlib.h>
-
- #include "EXTERN.h"
- #include "common.h"
- #include "util.h"
- #include "INTERN.h"
- #include "search.h"
-
- #ifndef BITSPERBYTE
- #define BITSPERBYTE 8
- #endif
-
- #define BMAPSIZ (127 / BITSPERBYTE + 1)
-
- /* meta characters in the "compiled" form of a regular expression */
- #define CBRA 2 /* \( -- begin bracket */
- #define CCHR 4 /* a vanilla character */
- #define CDOT 6 /* . -- match anything
- * except a newline */
- #define CCL 8 /* [...] -- character
- * class */
- #define NCCL 10 /* [^...] -- negated
- * character class */
- #define CDOL 12 /* $ -- matches the end
- * of a line */
- #define CEND 14 /* The end of the
- * pattern */
- #define CKET 16 /* \) -- close bracket */
- #define CBACK 18 /* \N -- backreference
- * to the Nth bracketed
- * string */
- #define CIRC 20 /* ^ matches the
- * beginning of a line */
-
- #define WORD 32 /* matches word
- * character \w */
- #define NWORD 34 /* matches non-word
- * characer \W */
- #define WBOUND 36 /* matches word boundary
- * \b */
- #define NWBOUND 38 /* matches non-(word
- * boundary) \B */
-
- #define STAR 01 /* * -- Kleene star,
- * repeats the previous
- * REas many times as
- * possible; the value
- * ORs with the other
- * operator types */
-
- #define ASCSIZ 0200
- typedef char TRANSTABLE[ASCSIZ];
-
- static TRANSTABLE trans = {
- 0000, 0001, 0002, 0003, 0004, 0005, 0006, 0007,
- 0010, 0011, 0012, 0013, 0014, 0015, 0016, 0017,
- 0020, 0021, 0022, 0023, 0024, 0025, 0026, 0027,
- 0030, 0031, 0032, 0033, 0034, 0035, 0036, 0037,
- 0040, 0041, 0042, 0043, 0044, 0045, 0046, 0047,
- 0050, 0051, 0052, 0053, 0054, 0055, 0056, 0057,
- 0060, 0061, 0062, 0063, 0064, 0065, 0066, 0067,
- 0070, 0071, 0072, 0073, 0074, 0075, 0076, 0077,
- 0100, 0101, 0102, 0103, 0104, 0105, 0106, 0107,
- 0110, 0111, 0112, 0113, 0114, 0115, 0116, 0117,
- 0120, 0121, 0122, 0123, 0124, 0125, 0126, 0127,
- 0130, 0131, 0132, 0133, 0134, 0135, 0136, 0137,
- 0140, 0141, 0142, 0143, 0144, 0145, 0146, 0147,
- 0150, 0151, 0152, 0153, 0154, 0155, 0156, 0157,
- 0160, 0161, 0162, 0163, 0164, 0165, 0166, 0167,
- 0170, 0171, 0172, 0173, 0174, 0175, 0176, 0177,
- };
- static bool folding = FALSE;
-
- static int err;
- static char *FirstCharacter;
-
- void
- search_init()
- {
-
- #ifdef UNDEF
- register int i;
-
- for (i = 0; i < ASCSIZ; i++)
- trans[i] = i;
- #else
- ;
- #endif
- }
-
- void
- init_compex(compex)
- register COMPEX *compex;
- {
- /* the following must start off zeroed */
-
- compex->eblen = 0;
- compex->brastr = Nullch;
- }
-
- void
- free_compex(compex)
- register COMPEX *compex;
- {
- if (compex->eblen)
- {
- free(compex->expbuf);
- compex->eblen = 0;
- }
- if (compex->brastr)
- {
- free(compex->brastr);
- compex->brastr = Nullch;
- }
- }
-
- static char *gbr_str = Nullch;
- static int gbr_siz = 0;
-
- char *
- getbracket(compex, n)
- register COMPEX *compex;
- int n;
- {
- int length = compex->braelist[n] - compex->braslist[n];
-
- if (!compex->nbra || n > compex->nbra || !compex->braelist[n] || length < 0)
- return nullstr;
- growstr(&gbr_str, &gbr_siz, length + 1);
- safecpy(gbr_str, compex->braslist[n], length + 1);
- return gbr_str;
- }
-
- void
- case_fold(which)
- int which;
- {
- register int i;
-
- if (which != folding)
- {
- if (which)
- {
- for (i = 'A'; i <= 'Z'; i++)
- trans[i] = tolower(i);
- }
- else
- {
- for (i = 'A'; i <= 'Z'; i++)
- trans[i] = i;
- }
- folding = which;
- }
- }
-
- /* Compile the given regular expression into a [secret] internal format */
-
- char *
- compile(compex, strp, RE, fold)
- register COMPEX *compex;
- register char *strp;
- int RE;
- int fold;
- {
- register int c;
- register char *ep;
- char *lastep;
- char bracket[NBRA],
- *bracketp;
- char **alt = compex->alternatives;
- char *retmes = "Badly formed search string";
-
- case_fold(compex->do_folding = fold);
- if (!compex->eblen)
- {
- compex->expbuf = safemalloc(84);
- compex->eblen = 80;
- }
- ep = compex->expbuf; /* point at expression
- * buffer */
- *alt++ = ep; /* first alternative
- * starts here */
- bracketp = bracket; /* first bracket goes
- * here */
- if (*strp == 0)
- { /* nothing to compile? */
- if (*ep == 0) /* nothing there yet? */
- return "Null search string";
- return Nullch; /* just keep old
- * expression */
- }
- compex->nbra = 0; /* no brackets yet */
- lastep = 0;
- for (;;)
- {
- if (ep - compex->expbuf >= compex->eblen)
- grow_eb(compex);
- c = *strp++; /* fetch next char of
- * pattern */
- if (c == 0)
- { /* end of pattern? */
- if (bracketp != bracket)
- { /* balanced brackets? */
-
- #ifdef VERBOSE
- retmes = "Unbalanced parens";
- #endif
-
- goto cerror;
- }
- *ep++ = CEND; /* terminate expression */
- *alt++ = 0; /* terminal alternative
- * list */
- /* compex->eblen = ep - compex->expbuf + 1; compex->expbuf =
- * saferealloc(compex->expbuf,compex->eblen+4); */
- return Nullch; /* return success */
- }
- if (c != '*')
- lastep = ep;
- if (!RE)
- { /* just a normal search
- * string? */
- *ep++ = CCHR; /* everything is a
- * normal char */
- *ep++ = c;
- }
- else /* it is a regular
- * expression */
- switch (c)
- {
-
- case '\\': /* meta something */
- switch (c = *strp++)
- {
- case '(':
- if (compex->nbra >= NBRA)
- {
-
- #ifdef VERBOSE
- retmes = "Too many parens";
- #endif
-
- goto cerror;
- }
- *bracketp++ = ++compex->nbra;
- *ep++ = CBRA;
- *ep++ = compex->nbra;
- break;
- case '|':
- if (bracketp > bracket)
- {
-
- #ifdef VERBOSE
- retmes = "No \\| in parens"; /* Alas! */
- #endif
-
- goto cerror;
- }
- *ep++ = CEND;
- *alt++ = ep;
- break;
- case ')':
- if (bracketp <= bracket)
- {
-
- #ifdef VERBOSE
- retmes = "Unmatched right paren";
- #endif
-
- goto cerror;
- }
- *ep++ = CKET;
- *ep++ = *--bracketp;
- break;
- case 'w':
- *ep++ = WORD;
- break;
- case 'W':
- *ep++ = NWORD;
- break;
- case 'b':
- *ep++ = WBOUND;
- break;
- case 'B':
- *ep++ = NWBOUND;
- break;
- case '0':
- case '1':
- case '2':
- case '3':
- case '4':
- case '5':
- case '6':
- case '7':
- case '8':
- case '9':
- *ep++ = CBACK;
- *ep++ = c - '0';
- break;
- default:
- *ep++ = CCHR;
- if (c == '\0')
- goto cerror;
- *ep++ = c;
- break;
- }
- break;
- case '.':
- *ep++ = CDOT;
- continue;
-
- case '*':
- if (lastep == 0 || *lastep == CBRA || *lastep == CKET
- || *lastep == CIRC
- || (*lastep & STAR) || *lastep > NWORD)
- goto defchar;
- *lastep |= STAR;
- continue;
-
- case '^':
- if (ep != compex->expbuf && ep[-1] != CEND)
- goto defchar;
- *ep++ = CIRC;
- continue;
-
- case '$':
- if (*strp != 0 && (*strp != '\\' || strp[1] != '|'))
- goto defchar;
- *ep++ = CDOL;
- continue;
-
- case '[':
- { /* character class */
- register int i;
-
- if (ep - compex->expbuf >= compex->eblen - BMAPSIZ)
- grow_eb(compex); /* reserve bitmap */
- for (i = BMAPSIZ; i; --i)
- ep[i] = 0;
-
- if ((c = *strp++) == '^')
- {
- c = *strp++;
- *ep++ = NCCL; /* negated */
- }
- else
- *ep++ = CCL; /* normal */
-
- i = 0; /* remember oldchar */
- do
- {
- if (c == '\0')
- {
-
- #ifdef VERBOSE
- retmes = "Missing ]";
- #endif
-
- goto cerror;
- }
- if (*strp == '-' && *(++strp))
- i = *strp++;
- else
- i = c;
- while (c <= i)
- {
- ep[c / BITSPERBYTE] |= 1 << (c % BITSPERBYTE);
- if (fold && isalpha(c))
- ep[(c ^ 32) / BITSPERBYTE] |=
- 1 << ((c ^ 32) % BITSPERBYTE);
- /* set the other bit too */
- c++;
- }
- } while ((c = *strp++) != ']');
- ep += BMAPSIZ;
- continue;
- }
-
- defchar:
- default:
- *ep++ = CCHR;
- *ep++ = c;
- }
- }
- cerror:
- compex->expbuf[0] = 0;
- compex->nbra = 0;
- return retmes;
- }
-
- void
- grow_eb(compex)
- register COMPEX *compex;
- {
- compex->eblen += 80;
- compex->expbuf = saferealloc(compex->expbuf, (MEM_SIZE) compex->eblen + 4);
- }
-
- char *
- execute(compex, addr)
- register COMPEX *compex;
- char *addr;
- {
- register char *p1 = addr;
- register char *trt = trans;
- register int c;
-
- if (addr == Nullch || compex->expbuf == Nullch)
- return Nullch;
- if (compex->nbra)
- { /* any brackets? */
- for (c = 0; c <= compex->nbra; c++)
- compex->braslist[c] = compex->braelist[c] = Nullch;
- if (compex->brastr)
- free(compex->brastr);
- compex->brastr = savestr(p1); /* in case p1 is not
- * static */
- p1 = compex->brastr; /* ! */
- }
- case_fold(compex->do_folding); /* make sure table is
- * correct */
- FirstCharacter = p1; /* for ^ tests */
- if (compex->expbuf[0] == CCHR && !compex->alternatives[1])
- {
- c = trt[compex->expbuf[1]]; /* fast check for first
- * character */
- do
- {
- if (trt[*p1] == c && advance(compex, p1, compex->expbuf))
- return p1;
- p1++;
- } while (*p1 && !err);
- return Nullch;
- }
- else
- { /* regular algorithm */
- do
- {
- register char **alt = compex->alternatives;
-
- while (*alt)
- {
- if (advance(compex, p1, *alt++))
- return p1;
- }
- p1++;
- } while (*p1 && !err);
- return Nullch;
- }
- }
-
- /* advance the match of the regular expression starting at ep along the
- string lp, simulates an NDFSA */
- bool
- advance(compex, lp, ep)
- register COMPEX *compex;
- register char *ep;
- register char *lp;
- {
- register char *curlp;
- register char *trt = trans;
- register int i;
-
- while ((*ep & STAR) || *lp || *ep == CIRC || *ep == CKET)
- switch (*ep++)
- {
-
- case CCHR:
- if (trt[*ep++] != trt[*lp])
- return FALSE;
- lp++;
- continue;
-
- case CDOT:
- if (*lp == '\n')
- return FALSE;
- lp++;
- continue;
-
- case CDOL:
- if (!*lp || *lp == '\n')
- continue;
- return FALSE;
-
- case CIRC:
- if (lp == FirstCharacter || lp[-1] == '\n')
- continue;
- return FALSE;
-
- case WORD:
- if (isalnum(*lp))
- {
- lp++;
- continue;
- }
- return FALSE;
-
- case NWORD:
- if (!isalnum(*lp))
- {
- lp++;
- continue;
- }
- return FALSE;
-
- case WBOUND:
- if ((lp == FirstCharacter || !isalnum(lp[-1])) !=
- (!*lp || !isalnum(*lp)))
- continue;
- return FALSE;
-
- case NWBOUND:
- if ((lp == FirstCharacter || !isalnum(lp[-1])) ==
- (!*lp || !isalnum(*lp)))
- continue;
- return FALSE;
-
- case CEND:
- return TRUE;
-
- case CCL:
- if (cclass(ep, *lp, 1))
- {
- ep += BMAPSIZ;
- lp++;
- continue;
- }
- return FALSE;
-
- case NCCL:
- if (cclass(ep, *lp, 0))
- {
- ep += BMAPSIZ;
- lp++;
- continue;
- }
- return FALSE;
-
- case CBRA:
- compex->braslist[*ep++] = lp;
- continue;
-
- case CKET:
- i = *ep++;
- compex->braelist[i] = lp;
- compex->braelist[0] = lp;
- compex->braslist[0] = compex->braslist[i];
- continue;
-
- case CBACK:
- if (compex->braelist[i = *ep++] == 0)
- {
- fputs("bad braces\n", stdout) FLUSH;
- err = TRUE;
- return FALSE;
- }
- if (backref(compex, i, lp))
- {
- lp += compex->braelist[i] - compex->braslist[i];
- continue;
- }
- return FALSE;
-
- case CBACK | STAR:
- if (compex->braelist[i = *ep++] == 0)
- {
- fputs("bad braces\n", stdout) FLUSH;
- err = TRUE;
- return FALSE;
- }
- curlp = lp;
- while (backref(compex, i, lp))
- {
- lp += compex->braelist[i] - compex->braslist[i];
- }
- while (lp >= curlp)
- {
- if (advance(compex, lp, ep))
- return TRUE;
- lp -= compex->braelist[i] - compex->braslist[i];
- }
- continue;
-
- case CDOT | STAR:
- curlp = lp;
- while (*lp++ && lp[-1] != '\n');
- goto star;
-
- case WORD | STAR:
- curlp = lp;
- while (*lp++ && isalnum(lp[-1]));
- goto star;
-
- case NWORD | STAR:
- curlp = lp;
- while (*lp++ && !isalnum(lp[-1]));
- goto star;
-
- case CCHR | STAR:
- curlp = lp;
- while (*lp++ && trt[lp[-1]] == trt[*ep]);
- ep++;
- goto star;
-
- case CCL | STAR:
- case NCCL | STAR:
- curlp = lp;
- while (*lp++ && cclass(ep, lp[-1], ep[-1] == (CCL | STAR)));
- ep += BMAPSIZ;
- goto star;
-
- star:
- do
- {
- lp--;
- if (advance(compex, lp, ep))
- return TRUE;
- } while (lp > curlp);
- return FALSE;
-
- default:
- fputs("Badly compiled pattern\n", stdout) FLUSH;
- err = TRUE;
- return -1;
- }
- if (*ep == CEND || *ep == CDOL)
- {
- return TRUE;
- }
- return FALSE;
- }
-
- bool
- backref(compex, i, lp)
- register COMPEX *compex;
- register int i;
- register char *lp;
- {
- register char *bp;
-
- bp = compex->braslist[i];
- while (*lp && *bp == *lp)
- {
- bp++;
- lp++;
- if (bp >= compex->braelist[i])
- return TRUE;
- }
- return FALSE;
- }
-
- bool
- cclass(set, c, af)
- register char *set;
- register int c;
- {
- c &= 0177;
-
- #if BITSPERBYTE == 8
- if (set[c >> 3] & 1 << (c & 7))
- #else
- if (set[c / BITSPERBYTE] & 1 << (c % BITSPERBYTE))
- #endif
-
- return af;
- return !af;
- }
-