home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
ftp.barnyard.co.uk
/
2015.02.ftp.barnyard.co.uk.tar
/
ftp.barnyard.co.uk
/
cpm
/
walnut-creek-CDROM
/
CPM
/
BDSC
/
BDSC-3
/
PAT.CQ
/
PAT.C
Wrap
Text File
|
2000-06-30
|
11KB
|
428 lines
/* Pattern Search Utility
*
* This utility will search for regular expressions in text streams.
* First a function is called to "compile" the search pattern into a form
* easier to search with. Then a function is called with the text to be
* searched; it returns a pointer to the first line within the text which
* matches the pattern (or 0 if none did). This second function can be
* called repeatedly, with successive pointers to find all lines which
* match the pattern. The second function also returns a pointer to the
* memory location just past the matched string, making repeated searches
* easier.
* For example:
*
* comppat("*one*",'\n',FALSE);
* sptr = srchpat(textbuf,bufsz,endptr);
*
* will set sptr to the first line in textbuf containing the string
* "one" anywhere in it, and endptr to the point just past the matched
* line; or else it will return zero if there was no matching line.
* Although the line must match the pattern exactly, searching for
* a pattern anywhere in the line can be done by prefixing and postfixing
* the pattern with "*". Also, comppat() can be told (via the third
* parameter) to ignore case when comparing letters.
*
* Characters in the regular expressions have the following meanings:
*
* <char> - unless one of the ones below, match exactly <char>.
* ? - match any single character.
* * - match zero or more of any character.
* [<char_set>] - match any of the characters in the set.
* [-<char_set>] - match any of the characters not in the set.
* <char_set> ::= <char> match <char>
* <char1>-<char2> match any char in the
* range <char1>-<char2>
* "\" can be used to quote characters (i.e., prevent them from being
* interpretted specially).
*
*
* Initial coding 8/18/82 by Harry R. Chesley.
*
*
* Copyright 1982 by Harry R. Chesley.
* Unlimited permission granted for non-profit use only.
*/
#include "pat.h"
/* comppat(pat,eolchar,anycase)
*
* Function: Compile the regular expression pattern given in pat
* into a series of pattern elements; append an element which matches
* the eolchar. If anycase is TRUE, ignore case on letters.
*
* Algorithm: Parse pat for regular expression special characters,
* constructing a compiled version in cpat. The following transformations
* are made to get from the original pattern to the compiled series of
* elements:
* <char> - if anycase FALSE, pel_type = MONEC, pel_c = <char>;
* if anycase TRUE, pel_type = MSET, set = [<char>,
* other case of <char>].
* ? - pel_type = MONEA.
* * - pel_type = MMANY.
* [...] - pel_type = MSET, set = indicated characters.
* [-...] - pel_type = MSET, set = all but indicated characters.
*
* Also, eolchar is not allowed in any but the last element.
*
* Comments: This routine compiles into the global cpat, and also
* sets cptop and eolch. Therefore, it is not reentrant, and it is not
* possible to be using more than one pattern at a time.
*/
comppat(pat,eolchar,anycase)
char *pat;
char eolchar;
int anycase;
{
int negflg; /* Used in [...] processing to remember leading -. */
char c1, c2, ct; /* Used in [...] for <char>-<char> processing. */
eolch = eolchar; /* Remember end-of-line char. */
cptop = cpat; /* Start with nothing. */
/* Go thru pattern 'til end of string. */
while (*pat != 0) {
/* Never allow the EOL character in the search pattern. */
if (*pat == eolch) {
pat++;
continue;
};
/* Look for special characters. */
switch (*pat++) {
/* Zero or more of anything. */
case '*':
cptop->pel_type = MMANY; /* Zero or more. */
break;
/* One of anything. */
case '?':
cptop->pel_type = MONEA; /* One of anything.*/
break;
/* Character sets. */
case '[':
cptop->pel_type = MSET; /* Any in set. */
clrall(cptop); /* Start with nothing. */
/* Check for "any but". */
if (*pat == '-') {
pat++;
negflg = TRUE;
} else negflg = FALSE;
/* Figure the set. */
while ((*pat != ']') && (*pat != 0)) {
if (*pat == '\\')
if (*(++pat) == 0) break;
/* Check for <char>-<char>. */
if (*(pat+1) == '-') {
if (*(pat+2) != 0) {
if ((c1 = *pat) >
(c2 = *(pat+2))) {
ct = c1;
c1 = c2;
c2 = ct;
};
do if (anycase)
setnoc(cptop,c1);
else
setmem(cptop,c1);
while (c1++ < c2);
pat += 2;
} else pat++;
} else {
if (anycase)
setnoc(cptop,*pat);
else setmem(cptop,*pat);
};
pat++;
};
if (negflg) negset(cptop);
if (*pat == ']') pat++;
/* Never match the EOL char. */
clrmem(cptop,eolch);
break;
/* Quote. */
case '\\':
if (*pat != 0) pat++;
/* Fall thru to match single processing. */
/* Anything else: match only it. */
default:
if (anycase) {
cptop->pel_type = MSET;
clrall(cptop);
setnoc(cptop,*(pat-1));
} else {
cptop->pel_type = MONEC;
cptop->pel_c = *(pat-1);
};
break;
};
cptop++; /* Next element. */
};
/* Last of all, match EOL. */
cptop->pel_type = MONEC;
cptop->pel_c = eolchar;
#ifdef DEBUG
/* Print out the pattern we just compiled. */
prtpat();
#endif
}
/* srchpat(strng,sz,eosptr)
*
* Function: Using the previously compiled pattern, search the string
* strng (of size sz) for a line exactly matching the pattern. Return a
* pointer to that line, or 0. On a non-zero return, return in eosptr a
* pointer to the next character after the matched string.
*
* Algorithm: Repeatedly call match() on each successive line until a
* line matching the pattern is found, or we run out of data.
*
* Comments: See comments on comppat().
* The eosptr return value is passed from match by side-effect.
*/
char *srchpat(strng,sz,eosptr)
register char *strng;
register unsigned sz;
char **eosptr;
{
/* While we've still got something to search thru. */
while (sz != 0) {
/* If this one matches, return it. */
if (match(strng,sz,cpat)) {
*eosptr = nextstr;
return(strng);
};
/* Otherwise, find the next line, and try it. */
while (*strng != eolch) {
strng++; sz--;
if (sz == 0) return(0);
};
strng++; sz--; /* Skip EOL. */
};
return(0);
}
/* match(sptr,sz,cpptr)
*
* Function: Return TRUE if the string sptr (of size sz) exactly
* matches the compiled search pattern cpptr. If returning TRUE, also
* return the next character past the match in nextstr.
*
* Algorithm: Match only the first element (shortest string first),
* recursively calling ourself to match the remainder of the string.
*
* Comments: This function is used by srchpat() only. The user should
* never call it directly.
* nextstr is a side-effect return, which is not generally a nice idea.
* However, match() is the most crucial routine in this package with regard
* to execution-time efficiency, and passing another parameter thru the whole
* recursive search would severly slow things down.
* The recursive depth of this routine is bounded by the maximum size
* of the search pattern. I.e., with a max pattern size of 100, this routine
* will never call itself more than 100 times (or is it 101 times?).
*/
match(sptr,sz,cpptr)
register char *sptr;
register unsigned sz;
register struct pel *cpptr;
{
struct pel *cpp1; /* cpptr + 1. */
/* If there's nothing left of the string, we can't match it. */
if (sz == 0) return(FALSE);
/* Calculate next cpptr for later use. */
cpp1 = cpptr+1;
/* Switch on type of element. */
switch (cpptr->pel_type) {
/* Match one exactly. */
case MONEC:
if (*sptr != cpptr->pel_c) return(FALSE);
break;
/* Match any one. */
case MONEA:
if (*sptr == eolch) return(FALSE);
break;
/* Match any in set. */
case MSET:
if (! inset(cpptr,*sptr)) return(FALSE);
break;
/* Match zero or more. */
case MMANY:
/* Try matching 0, 1, 2, ... */
do {
if (match(sptr,sz,cpp1)) return(TRUE);
if (*sptr++ == eolch) break;
} while (--sz);
return(FALSE);
/* We'd better not get here! */
default:
#ifdef DEBUG
printf("Illegal pel_type in match().\n");
#endif
exit(1);
};
/* It matched so see if we're at the end or the rest matches. */
if (cpptr == cptop) {
nextstr = sptr+1; /* Return next char past line. */
return(TRUE);
};
/* See if the rest matches. */
if (match(sptr+1,sz-1,cpp1)) return(TRUE);
/* No match. */
return(FALSE);
}
/* setall(sptr), clrall(sptr), negset(sptr), setmem(sptr,mem),
* setnoc(sptr,mem)
*
* Function: Set operations: set all bits, clear all, complement all,
* clear one member bit, set one member bit, and set one member bit and it's
* other case equivalent if a letter.
*
* Algorithm: Obvious.
*
* Comments: One other set operation (inset) is define as a macro
* for speed.
*/
setall(sptr)
struct pel *sptr;
{
int i;
int *iptr;
for (i = SETMAX, iptr = sptr->pel_set; i-- > 0; *iptr++ = 0xFFFF);
}
clrall(sptr)
struct pel *sptr;
{
int i;
int *iptr;
for (i = SETMAX, iptr = sptr->pel_set; i-- > 0; *iptr++ = 0);
}
negset(sptr)
struct pel *sptr;
{
int i;
int *iptr;
for (i = SETMAX, iptr = sptr->pel_set; i-- > 0;
*iptr = ~*iptr, iptr++);
}
clrmem(sptr,mem)
struct pel *sptr;
char mem;
{
sptr->pel_set[(mem>>4)&7] &= ~(1<<(mem&15));
}
setmem(sptr,mem)
struct pel *sptr;
char mem;
{
sptr->pel_set[(mem>>4)&7] |= (1<<(mem&15));
}
setnoc(sptr,mem)
struct pel *sptr;
char mem;
{
if ((mem >= 'a') && (mem <= 'z')) setmem(sptr,mem+('A'-'a'));
else if ((mem >= 'A') && (mem <= 'Z')) setmem(sptr,mem+('a'-'A'));
setmem(sptr,mem);
}
/* prtpat()
*
* Function: Print the current pattern is a readable form.
*
* Algorithm: Obvious to the casual observer.
*
* Comments: This is for debugging purposes only, and is only compiled
* or called if DEBUG is defined.
*/
#ifdef DEBUG
prtpat()
{
struct pel *cpptr;
char c;
for (cpptr = cpat; cpptr <= cptop; cpptr++) {
printf("** Prtpat: ");
switch (cpptr->pel_type) {
case MMANY:
printf("*");
break;
case MONEA:
printf("?");
break;
case MONEC:
if (cpptr->pel_c < ' ')
printf("\\%3.3o",cpptr->pel_c);
else printf("%c",cpptr->pel_c);
break;
case MSET:
printf("[");
for (c = 0; c <= 128; c++ )
if (inset(cpptr,c))
if (c < ' ')
printf(" \\%3.3o",c);
else printf(" %c",c);
printf("]");
break;
default:
printf("Illegal pel_type in prtpat!\n");
break;
};
printf("\n");
};
}
#endif