home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
Shareware Overload
/
ShartewareOverload.cdr
/
progm
/
cpgms.zip
/
FGREP.C
< prev
next >
Wrap
C/C++ Source or Header
|
1985-11-17
|
24KB
|
948 lines
/* FGREP.C - Search File(s) for Fixed Pattern(s)
*
* Version 1.01 February 11th, 1985
*
* Modifications:
* V1.00 (84/12/01) - beta test release
* V1.01 (85/01/01) - added -P option
* - improved command line validation
* V1.02 (85/01/06) - modified "run_fsa()" and "bd_move()"
* V1.03 (85/02/11) - added -S option
*
* Copyright 1985: Ian Ashdown
* by Heart Software
* 1089 West 21st Street
* North Vancouver, B.C V7P 2C6
*
* This program may be copied for personal, non-commercial use
* only, provided that the above copyright notice is included in
* all copies of the source code. Copying for any other use
* without previously obtaining the written permission of the
* author is prohibited.
*
* Machine readable versions of this program may be purchased for
* $35.00 (U.S.) from byHeart Software. Supported disk formats
* are CP/M 8" SSSD and PC-DOS (v2.x) 5-1/4" DSDD.
*
* Notes:
*
* The algorithm used in this program constructs a deterministic
* finite state automaton (FSA) for pattern matching from the sub-
* strings, then uses the FSA to process the text string in one
* pass. The time taken to construct the FSA is proportional to
* the sum of the lengths of the substrings. The number of
* state transitions made by the FSA in processing the text
* string is independent of the number of substrings.
*
* Algorithm Source:
*
* "Efficient String Matching: An Aid to Bibliographic Search"
* Alfred V. Aho & Margaret J. Corasick
* Communications of the ACM
* pp. 333-340, Vol. 18 No. 6 (June '75)
*
* USAGE: fgrep [-vclnhyefxps] [strings] <files>
*
* where:
*
* -v All lines but those matched are printed.
* -c Only a count of the matching lines is printed.
* -l The names of the files with matching lines are
* listed (once), separated by newlines.
* -n Each line is preceded by its line number in the
* file.
* -h Do not print filename headers with output lines.
* -y All characters in the file are mapped to upper
* case before matching. (This is the default if the
* string is given in the command line under CP/M,
* as CP/M maps everything on the command line to
* upper case. Use the -f option if you need both
* lower and upper case.) Not a true UNIX "fgrep"
* option (normally available under "grep" only),
* but too useful to leave out.
* -e <string>. Same as a string argument, but useful
* when the string begins with a '-'.
* -f <file>. The strings (separated by newlines) are
* taken from a file. If several strings are listed
* in the file, then a match is flagged if any of
* the strings are matched. If -f is given, any
* following argument on the command line is taken
* to be a filename.
* -x Only lines matched in their entirety are printed.
* -p Each matched line is preceded by the matching
* substring(s). Not a UNIX "fgrep" option, but too
* useful to leave out.
* -s No output is produced, only status. Used when
* "fgrep" is run as a process that returns a
* status value to its parent process. Under CP/M, a
* non-zero value returned by "exit()" may terminate
* a submit file that initiated the program,
* although this is implementation-dependant.
*
* DIAGNOSTICS:
*
* Exit status is 0 if any matches are found, 1 if none, 2 for
* error condition.
*
* BUGS:
*
* The following UNIX-specific optiono is not supported:
*
* -b Each line is preceded by the block number in
* which it was found.
*
*
* Lines are limited to 256 characters.
*
*/
/*** Definitions ***/
#define TRUE -1
#define FALSE 0
#define MAX_LINE 257 /* Maximum number of characters */
/* per line plus NULL delimiter */
/* #define CP/M */ /* Comment out for compilation */
/* under UNIX */
#define MSDOS 1 /* 1 if running under MSDOS, else 0 */
#define DESMET 1 /* 1 if using the DeSmet C compiler, else 0 */
#define CMD_ERR 0 /* Error Codes */
#define OPT_ERR 1
#define INP_ERR 2
#define STR_ERR 3
#define MEM_ERR 4
/*** TYPEDEFS ***/
typedef int BOOL; /* BOOLEAN FLAG */
/*** Data Structures ***/
char *documentation[] = {
"fgrep search a set of files for one or more strings",
" fgrep [flags] [string] file_list [>file]",
"",
"Flags are single characters preceeded by '-':",
" -v All lines but those matched are sent to STDOUT.",
" -c Only a count of the matching lines is output.",
" -l Only names of the files with matching lines are output.",
" -n Each line is preceeded by its line number.",
" -h Do not output filename headers with output lines.",
" -y The input file is mapped to upper case prior to matching.",
" -e <string>. Same as string argument, but useful when the ",
" -f <file>. The strings (separated by newlines) are taken ",
" from a file. A match is flagged if any string is matched. ",
" The next argument is taken to be a filename. ",
" -x Only lines matched in their entirety are output.",
" -s No output is produced, only ERRORLEVEL. Used in BAT files.",
" ",
" ERRORLEVEL is set to zero if any matches are found, one if none are ",
" found, and two for error conditions.",
" ",
"The file_list is a list of files (wildcards are acceptable).",
"",
"Output is to stdout. To print add >PRN: to your command.",
0 };
/* Queue element */
typedef struct queue
{
struct state_el *s_ptr;
struct queue *next_elq;
} QUEUE;
/* Transition element */
typedef struct transition
{
char lchar; /* Transition character */
struct state_el *nextst_ptr; /* Transition state pointer */
struct transition *next_el;
} TRANSITION;
/* FSA state element */
typedef struct state_el
{
TRANSITION *go_ls; /* Pointer to head of "go" list */
TRANSITION *mv_ls; /* Pointer to head of "move" list */
struct state_el *fail_state; /* "failure" transition state */
char *out_str; /* Terminal state message (if any) */
} FSA;
/*** Gloval Variables and Structures ***/
/* Dummy "failure" state */
FSA FAIL_STATE;
/* Define a separate data structure for State 0 of the FSA to
* speed processing of the input while the FSA is in that state.
* Since the Aho-Corasick algorithm only defines "go" transitions
* for this state (one for each valid input character) and no
* "failure" transitions or output messages, only an array of
* "go" transition state numbers is needed. The array is accessed
* directly, using the input character as the index.
*/
FSA *MZ[128]; /* State 0 of FSA */
BOOL vflag = FALSE, /* Command-line option flags */
cflag = FALSE,
lflag = FALSE,
nflag = FALSE,
hflag = FALSE,
yflag = FALSE,
eflag = FALSE,
fflag = FALSE,
xflag = FALSE,
pflag = FALSE,
dflag = FALSE,
sflag = FALSE;
/*** Include Files ***/
#include <stdio.h>
#if DESMET
#else
#include <ctype.h>
#endif
/*** Main Body of Program ***/
int main(argc, argv)
int argc;
char **argv;
{
char *temp, *p, *fexpnd();
BOOL match_flag = FALSE, proc_file();
void bd_go(), bd_move(), error();
/* Check for minimum number of command line arguments */
if (argc < 2)
error(CMD_ERR, NULL);
if (argc == 2 && argv[1][0] == '?' && argv[1][1] == 0) {
help(documentation);
return;
}
/* Parse the command line for user-selected options */
while (--argc && (*++argv)[0] == '-' && eflag == FALSE)
for (temp = argv[0]+1; *temp != '\0'; temp++)
switch(toupper(*temp))
{
case '?':
help(documentation);
break;
case 'V':
vflag = TRUE;
break;
case 'C':
cflag = TRUE;
break;
case 'L':
lflag = TRUE;
break;
case 'N':
nflag = TRUE;
break;
case 'H':
hflag = TRUE;
break;
case 'Y':
yflag = TRUE;
break;
case 'E':
eflag = TRUE;
break;
case 'F':
fflag = TRUE;
break;
case 'X':
xflag = TRUE;
break;
case 'P':
pflag = TRUE;
break;
case 'D':
dflag = TRUE;
break;
case 'S':
sflag = TRUE;
break;
default:
error(OPT_ERR, NULL);
}
/* "pflag" can only be TRUE if the following flags are FALSE */
if (vflag == TRUE || cflag == TRUE || lflag == TRUE ||
xflag == TRUE || sflag == TRUE)
pflag = FALSE;
/* Check for string (or string file) argument */
if (!argc)
error(CMD_ERR, NULL);
/* show available space */
if (dflag)
printf(" Available Memory = %u \n", (unsigned)(_showsp() - _memory()));
/* Build the "go" transitions */
bd_go(*argv++);
argc--;
/* Build the "failure" and "move" transitions */
bd_move();
/* Process each of the input files if not "stdin". */
if (argc < 2)
hflag = TRUE;
if (!argc)
{
if (proc_file(NULL, FALSE) == TRUE && match_flag == FALSE)
match_flag = TRUE;
}
else
while (argc--) {
#if MSDOS
while (p = fexpnd(*argv)) /* expand wildcards, etc */
#else
if (p = *argv)
#endif
if (proc_file(p, TRUE) == TRUE && match_flag == FALSE)
match_flag = TRUE;
argv++;
}
/* Return status to the parent process. Status is zero if any
* matches are found, 1 if none. */
if (match_flag == TRUE)
exit(0);
else
exit(1);
} /* end of main */
/*** Functions and Procedures ***/
/* PROC_FILE() - Run the FSA on the input file "in_file". Returns
* TRUE if a match was found, FALSE otherwise. */
BOOL proc_file(in_file, prt_flag)
char *in_file;
BOOL prt_flag;
{
char buffer[MAX_LINE], /* input string buffer */
*nl, *index(), *stoupper(), *fgets();
long line_cnt = 0L, /* line counter */
mtch_cnt = 0L; /* matched line counter */
BOOL mtch_flag, /* Matched line flag */
run_fsa();
FILE *in_fd, *fopen();
void error();
if (in_file != NULL) /* A file was specified as the input */
{
if (!(in_fd = fopen(in_file, "r")))
error(INP_ERR, in_file);
}
else
in_fd = stdin;
/* Read in a line at a time for processing */
while (fgets(buffer, MAX_LINE, in_fd))
{
if (nl = index(buffer, '\n')) /* remove newline */
*nl = '\0';
#if DESMET
if (nl = index(buffer, '\r')) /* remove carriage return */
*nl = '\0';
#endif
#ifdef CP/M
if (fflag == FALSE || yflag == TRUE)
stoupper(buffer);
#else
if (yflag == TRUE)
stoupper(buffer);
#endif
line_cnt++;
if ((mtch_flag = run_fsa(buffer)) == TRUE)
mtch_cnt++; /* incr matched line counter */
if (cflag == FALSE && lflag == FALSE && sflag == FALSE &&
((mtch_flag == TRUE && vflag == FALSE) ||
(mtch_flag == FALSE && vflag == TRUE)))
{
if (hflag == FALSE && prt_flag == TRUE)
printf("%s: ", in_file);
if (nflag == TRUE)
printf("%05ld: ", line_cnt);
puts(buffer);
#if DESMET
puts("\n");
#endif
}
}
if (lflag == TRUE && mtch_cnt > 0)
printf("%s\n", in_file);
else if (cflag == TRUE && sflag == FALSE)
printf("%ld\n", mtch_cnt);
if (in_file != NULL)
fclose(in_fd);
if (mtch_cnt) /* match found? */
return TRUE; /* ..yes */
else
return FALSE; /* ..no */
} /* end of proc_file */
/* RUN_FSA() - Run the finite state automaton with string "str"
* as input. Return TRUE if match, FALSE otherwise.
*/
BOOL run_fsa(str)
register char *str;
{
register FSA *st_ptr;
char *message = NULL;
BOOL msg_flag = FALSE;
FSA *go(), *move();
st_ptr = NULL; /* Initialize FSA */
if (xflag == FALSE)
{
/* Process next input character in the string */
while (*str)
{
st_ptr = move(st_ptr, *str);
/* print terminal state message and update FSA */
if (st_ptr == 0 && message)
{
printf("\n--> %s ", message);
message = NULL;
st_ptr = move(st_ptr, *str);
}
str++;
if (st_ptr)
if (st_ptr->out_str) /* Terminal state */
if (pflag == TRUE)
{
/* Save terminal state message */
message = st_ptr->out_str;
msg_flag = TRUE;
}
else
return TRUE;
}
if (message) /* Print any remaining terminal state message */
printf("\n--> %s ", message);
return msg_flag;
}
else /* match exact lines only */
{
while(*str)
{
st_ptr = go(st_ptr, *str++);
if (!st_ptr || st_ptr == &FAIL_STATE)
return FALSE; /* line not matched */
}
return TRUE;
}
} /* end of run_fsa */
/* GO() - Process "litchar" and return a pointer to the FSA's
* corresponding "go" transition state. If the character
* is not in the FSA state's "go" transition list, then
* return a pointer to FAIL_STATE.
*/
FSA *go(st_ptr, litchar)
FSA *st_ptr;
register char litchar;
{
register TRANSITION *current;
/* If state 0, then access separate state 0 data structure of
* the FSA. Note that there are no failure states defined for
* any input to FSA state 0.
*/
if (!st_ptr)
return MZ[litchar];
else
{
/* Point to the head of the linked list of "go" transitions
* associated with the state.
*/
current = st_ptr->go_ls;
/* Transverse the list looking for a match to othe input
* character.
*/
while (current)
{
if (current->lchar == litchar)
break;
current = current->next_el;
}
/* Null value for "current" indicates end of list was reached
* without having found match to input character.
*/
return current ? current->nextst_ptr : &FAIL_STATE;
}
} /* end of go */
/* MOVE() - Process "litchar" and return a pointer to the FSA's
* corresponding "move" transition state.
*/
FSA *move(st_ptr, litchar)
FSA *st_ptr;
register char litchar;
{
register TRANSITION *current;
/* If state 0, then access separate state data structure of
* the FSA.
*/
if (!st_ptr)
return MZ[litchar];
else
{
/* Point to the head of the linked list of "move" transitions
* associated with the state.
*/
current = st_ptr->mv_ls;
/* Transverse the list looking for a match to the input
* character.
*/
while (current)
{
if (current->lchar == litchar)
break;
current = current->next_el;
}
/* Null value for "current" indicates end of list was reached
* without having found match to input character. The
* returned pointer is then to state 0.
*/
return current ? current->nextst_ptr : NULL;
}
} /* end of move() */
/* BD_GO() - Build the "go" transitons for each state from the
* command line arguments.
*/
void bd_go(str)
char *str;
{
register char litchar;
char *nl,
buffer[MAX_LINE], /* input string buffer */
*stoupper(), *fgets(), *index();
FILE *str_fd, *fopen();
void error(), enter();
/* Initialize FSA state 0 "go" transition array so that every
* invocation of "go()" with "state" = 0 initially returns a
* pointer to FAIL_STATE.
*/
for (litchar = 1; litchar <= 127; litchar++)
MZ[litchar] = &FAIL_STATE;
/* If the -f option was selected, get the newline-separated
* strings from the file "str" one at a time and enter them
* into the FSA. Otherwise, enter the string "str" into the
* FSA.
*/
if (fflag == TRUE)
{
if (!(str_fd = fopen(str, "r")))
error(STR_ERR, str);
while (fgets(buffer, MAX_LINE, str_fd))
{
if (nl = index(buffer, '\n')) /* remove the newline */
*nl = '\0';
#if DESMET
if (nl = index(buffer, '\r')) /* remove carriage return */
*nl = '\0';
#endif
if (yflag == TRUE)
stoupper(buffer);
enter(buffer);
}
fclose(str_fd);
}
else
{
if (yflag == TRUE)
stoupper(str);
enter(str);
}
/* For every input character that does not lead to a defined
* "go" transition from FSA state 0, set the corresponding
* element in the state 0 "go" trasitions array to indicate
* a "go" transtion to state 0.
*/
for (litchar = 1; litchar <= 127; litchar++)
if (MZ[litchar] == &FAIL_STATE)
MZ[litchar] = NULL;
} /* end of bd_go */
/* ENTER() - Enter a string into the FSA by running each
* character in thrun through the current partially-
* built FSA. When a failure occurs, add the remainder
* of the string to the FSA as one new state per
* character. (Note the '\0' can never be a valid
* character - C uses it to terminate a string.)
*/
void enter(str)
char *str;
{
FSA *s, *go(), *create();
TRANSITION *current, *insert();
char *strsave();
register char *temp;
register FSA *st_ptr = NULL; /* Start in FSA State 0 */
register FSA *nextst_ptr;
void error();
/* Run each character in turn through partially-built FSA until
* a failure occurs.
*/
temp = str;
while ((s = go(st_ptr, *temp)) != &FAIL_STATE)
{
temp++;
st_ptr = s;
}
/* Process the remainder of the string */
while (*temp)
{
/* If a new state, then create a new state and insert
* transitiono character and "go" transition in current
* state. (Note special case of FSA state 0.)
*/
if (!st_ptr)
nextst_ptr = MZ[*temp++] = create();
else if (!(current = st_ptr->go_ls))
{
nextst_ptr = create();
st_ptr->go_ls = insert(nextst_ptr, *temp++);
}
else
{
/* ... or it was the character that the FSA returned a
* "failure" for. Find the tail of the current state's list
* of "go" transitions, create a new state and append it
* to the current state's "go" list.
*/
while (current->next_el)
current = current->next_el;
nextst_ptr = create();
current->next_el = insert(nextst_ptr, *temp++);
}
st_ptr = nextst_ptr;
}
/* Make string terminal state's output message */
st_ptr->out_str = strsave(str);
} /* end of enter() */
/* INSERT() - Create a new "go" transition and return a pointer
* to it.
*/
TRANSITION *insert(st_ptr, litchar)
FSA *st_ptr;
char litchar;
{
TRANSITION *current;
char *malloc();
void error();
if (!(current = (TRANSITION *)malloc(sizeof(TRANSITION))))
error(MEM_ERR, NULL);
current->lchar = litchar;
current->nextst_ptr = st_ptr;
current->next_el = NULL;
return current;
} /* end of insert() */
/* CREATE() - Create an FSA state and return a pointer to it. */
FSA *create()
{
FSA *st_ptr;
char *malloc();
void error();
if (!(st_ptr = (FSA *)malloc(sizeof(FSA))))
error(MEM_ERR, NULL);
st_ptr->go_ls = st_ptr->mv_ls = NULL;
st_ptr->fail_state = NULL;
st_ptr->out_str = NULL;
return st_ptr;
} /* end of create() */
/* BD_MOVE - Build the "failure" and "move" transitions for
* each state from the "go" transitions.
*/
void bd_move()
{
register char litchar;
register FSA *r, /* temp FSA state pointers */
*s, *t;
FSA *go(), *move();
TRANSITION *current, *insert();
QUEUE *first, /* pointer to head of queue */
*last; /* pointer to tail of queue */
void add_queue(), delete_queue();
if (dflag)
printf("BD_MOVE entry\n");
last = first = NULL; /* initialize the queue of FSA states */
/* For each input character with a "go" transition out of FSA
* state 0, add a pointer to the "go" state to the queue. Note
* that this will also serve as the "move" transition list for
* state 0.
*/
for (litchar = 1; litchar <= 127; litchar++)
if (s = go(NULL, litchar))
add_queue(&first, &last, s);
/* While there are still state pointers in the queue, do ... */
while (first)
{
/* remove state "r" pointer from the head of the queue */
r = first->s_ptr;
delete_queue(&first);
/* Skip (terminal state with no "go" transitions */
if (!r->go_ls)
continue;
/* Make "move" transition list for terminal state same as its
* "go" transition list.
*/
if (r->out_str)
r->mv_ls = r->go_ls;
/* For every input to state "r" that has a "go" transition to
* state "s", do ... */
for (litchar = 1; litchar <= 127; litchar++)
{
if ((s = go(r, litchar)) != &FAIL_STATE)
{
/* If a defined "go" transition exists for state "r" on
* input "litchar", add a pointer to state "s" to the
* end of the queue.
*/
add_queue(&first, &last, s);
/* Calculate the "failure" transition of state "s" using
* the following algorithm.
*/
t = r->fail_state;
while (go(t, litchar) == &FAIL_STATE)
t = t->fail_state;
s->fail_state = go(t, litchar);
}
else
{
/* ... otherwise set the pointer to state "s" to a
* pointer to the precalculated "move" transition of
* state "r"'s failure state on input "litchar".
*/
s = move(r->fail_state, litchar);
}
/* Add state "s" as the "move" transition for state "r" on
* input "litchar" only if it is not state 0 and "r" is not
* a terminal state.
*/
if (dflag)
printf("BD_MOVE ready to build move(s,a)\n");
if (s && !r->out_str) {
if (dflag)
printf("BD_MOVE about to insert move(s,a)\n");
if (!r->mv_ls) /* 1st instance of the list? */
current = r->mv_ls = insert(s, litchar);
else
current = current->next_el = insert(s, litchar);
}
}
}
if (dflag)
printf("BD_MOVE exit\n");
} /* end of bd_move() */
/* ADD_QUEUE - Add an instance to the tail of a queue. */
void add_queue(head_ptr, tail_ptr, st_ptr)
QUEUE **head_ptr, **tail_ptr;
FSA *st_ptr;
{
QUEUE *pq;
char *malloc();
void error();
/* Allocate the necessary memory and set the variables */
if (dflag)
printf("ADD_QUEUE h=%x t=%x s=%x\n", head_ptr, tail_ptr, st_ptr);
if (!(pq = (QUEUE *)malloc(sizeof(QUEUE))))
error(MEM_ERR, NULL);
pq->s_ptr = st_ptr;
pq->next_elq = NULL;
if (!*head_ptr) /* 1st instance of the queue? */
*tail_ptr = *head_ptr = pq;
else /* no, just another one */
*tail_ptr = (*tail_ptr)->next_elq = pq;
} /* end of add_queue() */
/* DELETE_QUEUE() - Delete an instance from the head of queue */
void delete_queue(head_ptr)
QUEUE **head_ptr;
{
if (dflag)
printf("del_QUEUE h=%x \n", head_ptr);
*head_ptr = (*head_ptr)->next_elq;
} /* end of delete_queue() */
/* STRSAVE() - Save a string somewhere in memory. */
char *strsave(str)
char *str;
{
int strlen();
char *p, *malloc();
void error();
if (p = malloc(strlen(str) + 1))
strcpy(p, str);
else
error(MEM_ERR, NULL);
return p;
} /* end of strsave() */
/* STOUPPER() - Map entire string pointed to by :str: to upper
* case.
*/
char *stoupper(str)
register char *str;
{
register char *temp;
temp = str;
while (*temp)
*temp++ = toupper(*temp);
return str;
} /* end of stoupper() */
/* ERROR() - Error reporting. Returns an exit status ofo 2 to the
* parent process.
*/
void error(n, str)
int n;
char *str;
{
fprintf(stderr, "\007\n*** ERROR - ");
switch(n)
{
case CMD_ERR:
fprintf(stderr, "Illegal command line");
break;
case OPT_ERR:
fprintf(stderr, "Illegal command line option");
break;
case INP_ERR:
fprintf(stderr, "Can't open input file %s", str);
break;
case STR_ERR:
fprintf(stderr, "Can't open string file %s", str);
break;
case MEM_ERR:
fprintf(stderr, "Out of memory");
break;
default:
break;
}
fprintf(stderr, " ***\n\nUsage: fgrep [-vclnhyefxps]");
fprintf(stderr, " [strings] <files>\n");
exit(2);
} /* end of error() */
/* end of fgrep.c */