home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
Power-Programmierung
/
CD1.mdf
/
magazine
/
drdobbs
/
c_spec
/
sources
/
match.c
< prev
next >
Wrap
C/C++ Source or Header
|
1986-02-20
|
18KB
|
799 lines
#include <stdio.h>
#include <ctype.h>
/* MATCH.C (c) Copyright 1985, Allen I. Holub,
* All Rights Reserved
*
* The pattern matching routines used by grep.
*
* Revision History:
*
* 10/20/84 fixed bug in amatch
* 10/29/84 fixed bug in dodash
* 11/03/84 explicitly declared stoupper() as returning char *
* 11/21/84 dodash and makepat:
* - test error returns from malloc() calls.
* - changed dodash to use a bit map
* 11/24/85 Added the + operator to makepat. Added the match()
* subroutine.
*/
#define NUL 0x00 /* ^@ */
#define CR 0x0d /* ^M */
#define SUB 0x1a /* ^Z */
#define CPMEOF SUB
#define getpat(pat) makepat(pat,0)
#define max(a,b) ((a) > (b) ? (a) : (b))
/* HEX_L(c) converts c (a hex digit in ASCII) to a number.
* HEX_H(c) does the same but puts the result in the high nibble.
*/
#define HEX_L(c) ( isdigit(c) ? (c)-'0' : ((toupper(c))-'A')+10)
#define HEX_H(c) ( HEX_L(c) << 4 )
/* Definitions of meta-characters used in pattern matching routines.
* LITCHAR & NCCL are only used as token identifiers; all the others
* are also both token identifiers and the actual symbol used in
* the regular expression
*/
#define BOL '^'
#define EOL '$'
#define ANY '.'
#define LITCHAR 'L'
#define ESCAPE '\\'
#define CCL '[' /* Character class: [...] */
#define CCLEND ']'
#define NEGATE '^'
#define NCCL '!' /* Negative character class [^...] */
#define PCLOSE '+' /* 1 or more times */
#define CLOSURE '*' /* 0 or more times */
#define OR_SYM '|'
/*
* Tokens are used to hold pattern templates. (see makepat() in
* tools.h
*/
typedef struct token
{
char tok;
char lchar;
char *bitmap;
struct token *next;
}TOKEN;
#define TOKSIZE sizeof(TOKEN)
/*----------------------------------------------------------------------*/
static char *amatch( lin, pat, boln )
char *lin, *boln;
TOKEN *pat;
{
/* Scans through the pattern template looking for a match
* with lin. Each element of lin is compared with the template
* until either a mis-match is found or the end of the template
* is reached. In the former case a 0 is returned; in the latter,
* a pointer into lin (pointing to the last character in the
* matched pattern) is returned.
*
* "lin" is a pointer to the line being searched.
* "pat" is a pointer to a template made by makepat().
* "boln" is a pointer into "lin" which points at the
* character at the beginning of line.
*/
register char *bocl, *rval, *strstart;
if (pat == 0)
return( (char *)0 );
strstart = lin;
while ( pat )
{
if (pat->tok == CLOSURE && pat->next)
{
/*
* Process a closure:
* First skip over the closure token to the
* object to be repeated. This object can be
* a character class.
*/
pat = pat->next;
/* Now match as many occurrences of the
* closure pattern as possible.
*/
bocl = lin;
while ( *lin && omatch(&lin, pat, boln) )
;
/* 'Lin' now points to the character that made
* made us fail. Now go on to process the
* rest of the string. A problem here is
* a character following the closure which
* could have been in the closure.
* For example, in the pattern "[a-z]*t" (which
* matches any lower-case word ending in a t),
* the final 't' will be sucked up in the while
* loop. So, if the match fails, we back up a
* notch and try to match the rest of the
* string again, repeating this process
* recursively until we get back to the
* beginning of the closure. The recursion
* goes, at most, two levels deep.
*/
if (pat = pat->next)
{
while ( bocl <= lin )
{
if (rval = amatch(lin, pat, boln) )
{
/* success */
return(rval);
}
else
--lin;
}
return( (char *)0 ); /* match failed */
}
}
else if ( omatch(&lin, pat, boln) )
pat = pat->next;
else
return( (char *) 0);
}
/*
* Note that omatch() advances lin to point at the next
* character to be matched. Consequently, when we reach
* the end of the template, lin will be pointing at the
* character following the last character matched.
* The exceptions are templates containing only a
* BOLN or EOLN token. In these cases omatch doesn't
* advance.
*
* So, decrement lin to make it point at the end of the
* matched string. Then, check to make sure that we haven't
* decremented past the beginning of the string.
*
* A philosophical point should be mentioned here. Is $
* a position or a character? (Ie. does $ mean the EOL
* character itself or does it mean the character at the end of
* the line.) I decided here to make it mean the former, in
* order to make the behavior of amatch() consistent. If you
* give amatch the pattern ^$ (match all lines consisting only
* of an end of line) then, since something has to be returned,
* a pointer to the end of line character itself is returned.
*
* The --lin is done outside the return statement because max()
* is often a macro (which has side-effects).
*/
--lin;
return( (char *) max(strstart, lin) );
}
/* ---------------------------------------------------------------------- */
static setbit( c, field )
int c;
char field[];
{
/* Set a bit in the bit ASCII bit map corresponding to the
* character c. Field must be at least 16 bytes long.
*/
field[ (c & 0x7f) >> 3 ] |= 1 << (c & 0x07) ;
}
/* ---------------------------------------------------------------------- */
static testbit( c, field )
int c;
char *field;
{
/* See if the bit corresponding to c in field is set.
*/
return ( field[ (c & 0x7f)>>3 ] & (1 << (c & 0x07)) );
}
/* ---------------------------------------------------------------------- */
static char *dodash( delim, src, map )
int delim;
char *src, *map;
{
/* Expand the set pointed to by "*src" into the bitmap "map."
* Stop at delim or end of string. Update *src to point
* at terminator. A set can have one element {x} or several
* elements ( {abcdefghijklmnopqrstuvwxyz} and {a-z}
* are equivalent ). Note that the dash notation is expanded
* as sequential numbers. This means (since we are using the
* ASCII character set) that a-Z will contain the entire alphabet
* plus the symbols: [\]^_`
*
* The character classes are stored in a 16 byte wide bit field
* where each bit corresponds to an ASCII character.
*/
register int first, last;
char *start;
start = src;
while( *src && *src != delim )
{
if( *src != '-')
setbit( esc( &src ), map );
else if( src == start || *(src+1) == delim )
setbit( '-', map );
else
{
src++;
if( *src < *(src - 2) )
{
first = *src;
last = *(src-2);
}
else
{
first = *(src - 2);
last = *src;
}
while( ++first <= last )
setbit( first, map );
src++;
}
}
return( src );
}
/* ---------------------------------------------------------------------- */
static int esc(s)
char **s;
{
/* Map escape sequences into their equivalent symbols. Returns the
* Correct ASCII character. c is the character following the \.
*/
register int rval;
if( **s != ESCAPE )
rval = *( (*s)++ );
else
{
(*s)++;
switch( toupper(**s) )
{
case '\0': rval = ESCAPE; break;
case 'B': rval = '\b' ; break;
case 'F': rval = '\f' ; break;
case 'N': rval = '\n' ; break;
case 'R': rval = '\r' ; break;
case 'S': rval = ' ' ; break;
case 'T': rval = '\t' ; break;
case 'E': rval = '\033'; break;
default: rval = **s ; break;
case 'X': (*s)++;
rval = HEX_H( **s );
(*s)++;
rval |= HEX_L( **s );
break;
}
(*s)++;
}
return (rval);
}
/* ---------------------------------------------------------------------- */
static TOKEN *addnode( headp, ntok )
TOKEN **headp, *ntok;
{
/* Called by makepat, adds one node to the pattern template.
*/
static TOKEN *tail, **nextlast;
if( *headp == NULL ) /* Chain is empty */
{
nextlast = headp;
*headp = tail = ntok ;
}
else if( ntok->tok != CLOSURE ) /* Insert at end of list */
{
tail->next = ntok;
nextlast = &(tail->next) ;
tail = ntok;
}
else /* Insert before end */
{
ntok->next = *nextlast ;
*nextlast = ntok ;
}
tail->next = NULL ;
#ifdef DEBUG
pr_tok( *headp );
#endif
return( tail );
}
/* ---------------------------------------------------------------------- */
TOKEN *makepat(arg, delim)
char *arg;
int delim;
{
/* Make a pattern template from the string pointed to by arg.
* Stop when delim or '\000' or '\n' is found in arg.
* Return a pointer to the pattern template.
*
* The pattern templates used here are somewhat different
* than those used in the book; each token is a structure
* of the form TOKEN (see tools.h). A token consists of
* an identifier, a pointer to a string, a literal
* character and a pointer to another token. This last is 0 if
* there is no subsequent token.
*
* The one strangeness here is caused (again) by CLOSURE which
* has to be put in front of the previous token. To make this
* insertion a little easier, the 'next' field of the last
* token in the chain (the one pointed to by 'tail') is made
* to point at the previous node. When we are finished,
* tail->next is set to 0.
*/
TOKEN *head = NULL, *anothertok ;
register TOKEN *ntok;
register TOKEN *tail;
/* Check for characters that aren't legal at the beginning
* of a template.
*/
if(!*arg || *arg==delim || *arg=='\n'|| *arg==CLOSURE || *arg==PCLOSE)
return( NULL );
for(; *arg && *arg != delim && *arg != '\n' ; arg++ )
{
if( !(ntok = (TOKEN *) malloc(TOKSIZE)) )
{
fprintf(stderr, "Out of memory\n");
goto fatal_error;
}
switch( *arg )
{
case ANY:
ntok->tok = ANY;
break;
case BOL:
if (head != NULL)
goto fatal_error;
ntok->tok = BOL;
break;
case EOL:
if( *(arg+1) == delim || *(arg+1) == '\0'
|| *(arg+1) == '\n' )
ntok->tok = EOL;
else
goto fatal_error;
break;
case CCL:
if( *(arg+1) == NEGATE )
{
ntok->tok = NCCL;
arg += 2;
}
else
{
ntok->tok = CCL;
arg++;
}
if( ntok->bitmap = (char *) calloc( 16, 1 ) )
arg = dodash(CCLEND, arg, ntok->bitmap );
else
{
fprintf(stderr,"Out of memory\n");
goto fatal_error;
}
break;
case PCLOSE:
if( !(anothertok = (TOKEN *) malloc(TOKSIZE)) )
{
fprintf(stderr,"Out of memory\n");
goto fatal_error;
}
memcpy( anothertok, tail, sizeof(TOKEN) );
tail = addnode( &head, anothertok );
/* Fall through to closure case */
case CLOSURE:
switch( tail->tok )
{
case BOL:
case EOL:
case PCLOSE:
case CLOSURE: goto fatal_error;
default: ntok->tok = CLOSURE;
}
break;
default:
ntok->tok = LITCHAR;
ntok->lchar = esc( &arg );
--arg; /* esc advances us past the character */
}
tail = addnode( &head, ntok );
}
return (head);
fatal_error:
unmakepat( head );
return( NULL );
}
/* ---------------------------------------------------------------------- */
char *matchs(line, pat, ret_endp)
char *line;
TOKEN *pat;
int ret_endp;
{
/*
* Compares line and pattern. Line is a character string while
* pat is a pattern template made by getpat().
* Returns:
* 1. A zero if no match was found.
* 2. A pointer the last character
* satisfying the match if ret_endp is non-zero.
* 3. A pointer to the beginning of the matched string
* if ret_endp is 0;
*
* For example:
*
* matchs ("1234567890", getpat("4[0-9]*7"), 0);
*
* will return a pointer to the '4', while
*
* matchs ("1234567890", getpat("4[0-9]*7"), 1);
*
* will return a pointer to the '7'. Note that success
* is returned when BOL or EOL (by itself ) is being requested
* on a null string. In this case a pointer to the null is
* returned.
*/
char *bptr, *rval = (char *)0;
if( !*line )
{
if((pat->tok == EOL) ||
(pat->tok == BOL && (!pat->next || pat->next->tok == EOL)))
rval = line;
}
else
{
bptr = line;
while (*line)
{
if ( (rval = amatch(line, pat, bptr)) == 0 )
line++;
else
{
rval = ret_endp ? rval : line ;
break;
}
}
}
return (rval);
}
/* ---------------------------------------------------------------------- */
static int omatch( linp, pat, boln )
char **linp, *boln;
TOKEN *pat;
{
/* Match one pattern element, pointed at by pat, with the
* character at **linp. Return non-zero on match.
* Otherwise, return 0. *Linp is advanced to skip over the
* matched character; it is not advanced on failure. The
* amount of the advance is 0 for patterns that match null
* strings, 1 otherwise. "boln" should point at the position
* that will match a BOL token.
*/
register int advance;
advance = -1;
if( **linp == '\0' )
{
/* This clause takes care of EOL matching a terminating
* null. The case of EOL matching a '\n' is handled
* in the following else clause.
*/
if( pat->tok == EOL )
advance = 0;
}
else
{
switch ( pat->tok )
{
case LITCHAR:
if ( **linp == pat->lchar )
advance = 1;
break;
case BOL:
if ( *linp == boln )
advance = 0;
break;
case ANY:
if ( **linp != '\n' )
advance = 1;
break;
case EOL:
if ( **linp == '\n' )
advance = 0;
break;
case CCL:
if( testbit( **linp, pat->bitmap ) )
advance = 1;
break;
case NCCL:
if( !testbit( **linp, pat->bitmap) )
advance = 1;
break;
default:
printf("omatch: can't happen\n");
}
}
if( advance > 0 )
*linp += advance;
return( ++advance );
}
/* --------------------------------------------------------------- */
unmakepat(head)
TOKEN *head;
{
/* Free up the memory used for the token string */
register TOKEN *old_head;
while (head)
{
switch (head->tok)
{
case CCL:
case NCCL:
free( head->bitmap );
/* no break, fall through to default */
default:
old_head = head;
head = head->next;
free( old_head );
break;
}
}
}
/*----------------------------------------------------------------------*/
char *match( look_for, in_this_string)
char *look_for, *in_this_string;
{
/* Return a pointer to the characters matching look_for
* if it's in the string, else return 0.
*/
register TOKEN *p;
register char *rval;
rval = matchs( in_this_string, p = getpat(look_for), 0 );
unmakepat( p );
return( rval );
}
/* ---------------------------------------------------------------------- */
#ifdef DEBUG
pr_tok( head )
TOKEN *head;
{
/* Print out the pattern template (linked list of TOKENs)
* pointed to by head. This is a useful debugging aid. Note
* that pr_tok() just scans along the linked list, terminating
* on a null pointer; so, you can't use pr_tok from inside
* makepat() because tail->next points to the previous
* node instead of being null. Note that NEGATE and OR_SYM
* are not listed because they won't occur in a template.
*/
register char *str;
register int i;
for (; head ; head = head->next )
{
switch (head->tok)
{
case BOL:
str = "BOL";
break;
case EOL:
str = "EOL";
break;
case ANY:
str = "ANY";
break;
case LITCHAR:
str = "LITCHAR";
break;
case ESCAPE:
str = "ESCAPE";
break;
case CCL:
str = "CCL";
break;
case CCLEND:
str = "CCLEND";
break;
case NCCL:
str = "NCCL";
break;
case CLOSURE:
str = "CLOSURE";
break;
default:
str = "**** unknown ****";
}
printf("%-8s at: 0x%x, ", str, head);
if (head->tok == CCL || head->tok == NCCL)
{
printf("string (at 0x%x) =<", head->bitmap );
for( i = 0; i < 0x7f ; i++)
if( testbit(i, head->bitmap) )
putchar(i);
printf(">, ");
}
else if (head->tok == LITCHAR)
printf("lchar = %c, ", head->lchar);
printf("next = 0x%x\n", head->next);
}
putchar('\n');
}
/*----------------------------------------------------------------------*/
main()
{
static char ibuf[80], search_this[80], for_this[80];
register char *p, *start, *end;
register TOKEN *tp;
while(1)
{
printf("----------------------------------------------\n");
printf("Str to search (CR=<%s>): ", search_this );
gets(ibuf);
if( *ibuf )
strcpy(search_this, ibuf);
printf("Pattern to search for : ");
gets(for_this);
printf("\nlooking for <%s> in <%s>\n", for_this, search_this);
if( p = match( for_this, search_this ) )
{
printf("FOUND at <%s>\n\n", p );
tp = getpat(for_this);
start = matchs(search_this , tp, 0 );
end = matchs(search_this , tp, 1 );
printf("tp = 0x%x, start = 0x%x, end = 0x%x\n",
tp, start, end );
unmakepat( tp );
if( !*search_this )
printf("search_this (at 0x%x) is empty\n",
search_this );
else
{
putchar('\n');
for( p = search_this ; *p ; p++ )
{
if( p == start )
putchar('<');
putchar( *p );
if( p == end )
puts(">");
}
}
}
else
printf("NOT FOUND\n\n");
}
}
#endif