home *** CD-ROM | disk | FTP | other *** search
- /* A variation on Hume & Sunday's Tuned Boyer-Moore algorithm
- * for searching for a literal string pattern in a text.
- *
- * References:
- * Hume, A., D.M. Sunday. "Fast String Searching."
- * _Software--Practice and Experience_. Vol. 21.
- * No. 11. p. 1221-48. November 1991.
- *
- * Copyright (c) 1994 Christopher J. Kane.
- *
- * Version 1.2 (March 22, 1994)
- */
-
- #include <misckit/MiscTBMK.h>
- #include <ctype.h>
- extern void *calloc(unsigned int, unsigned int);
- extern void free(void *);
-
- #define TBMKMAGIC 0x54424D4B
-
- struct Misc_tbmk_pattern {
- unsigned int magic;
- signed int *pattern;
- signed int skip[256];
- signed int patlen;
- signed int direction;
- signed int jump;
- };
-
- Misc_TBMKpattern
- Misc_TBMKpattern_alloc(const unsigned char *pattern,
- unsigned int pattern_length,
- unsigned char reverse,
- unsigned char nocases)
- {
- Misc_TBMKpattern new;
- unsigned char *tmppat;
- int i;
-
- if (pattern_length==0) {
- return 0;
- }
- new = (Misc_TBMKpattern)calloc(1, sizeof(struct Misc_tbmk_pattern));
- if (new==0) {
- return 0;
- }
- new->pattern = (int *)calloc(pattern_length, sizeof(int));
- if (new->pattern==0) {
- free(new);
- return 0;
- }
- tmppat = (unsigned char *)calloc(pattern_length, sizeof(char));
- if (tmppat==0) {
- free(new->pattern);
- free(new);
- return 0;
- }
- new->magic = TBMKMAGIC;
- for (i = pattern_length; i--;) {
- tmppat[i] = nocases?tolower(pattern[i]):pattern[i];
- }
- new->patlen = reverse?-pattern_length:pattern_length;
- new->direction = reverse?-1:1;
- for (i = 256; i--;) {
- new->skip[i] = new->patlen;
- }
- if (reverse) {
- for (i = pattern_length-1; i>=0; i--) {
- if (nocases) {
- new->skip[tolower(tmppat[i])] = -i;
- new->skip[toupper(tmppat[i])] = -i;
- } else {
- new->skip[tmppat[i]] = -i;
- }
- }
- for (i = 1; i<pattern_length && tmppat[i]!=tmppat[0]; i++);
- new->jump = -i;
- for (i = 0; i<pattern_length; i++) {
- new->pattern[pattern_length-1-i] = new->skip[pattern[i]];
- }
- } else {
- for (i = 0; i<pattern_length; i++) {
- if (nocases) {
- new->skip[tolower(tmppat[i])] = pattern_length-1-i;
- new->skip[toupper(tmppat[i])] = pattern_length-1-i;
- } else {
- new->skip[tmppat[i]] = pattern_length-1-i;
- }
- }
- for (i = pattern_length-2; i>=0 && tmppat[i]!=tmppat[pattern_length-1]; i--);
- new->jump = pattern_length-1-i;
- for (i = 0; i<pattern_length; i++) {
- new->pattern[i] = new->skip[pattern[i]];
- }
- }
- free(tmppat);
- return new;
- }
-
- void
- Misc_TBMKpattern_free(Misc_TBMKpattern *pattern)
- {
- if (pattern!=0 && *pattern!=0 && (*pattern)->magic==TBMKMAGIC) {
- if ((*pattern)->pattern!=0) {
- free((*pattern)->pattern);
- }
- free(*pattern);
- *pattern = 0;
- }
- }
-
- #define UNROLL 4
-
- int
- Misc_TBMKsearch_memory(const Misc_TBMKpattern pattern,
- const unsigned char *text,
- int text_extent,
- unsigned char all_matches,
- int *match_position)
- {
- const unsigned char *cp, *tmpcp, *lb, *flb, *ub, *fub;
- unsigned int nmatch, i;
-
- if (!pattern || pattern->magic!=TBMKMAGIC || !text ||
- (pattern->direction<0 && 0<text_extent) ||
- (0<pattern->direction && text_extent<0)) {
- return -1; /* it's bad, bad */
- }
- cp = text+pattern->patlen-pattern->direction;
- if (0<pattern->direction) { /* calculate upper and lower bounds for search */
- ub = text+text_extent-1;
- fub = ub-UNROLL*pattern->patlen;
- lb = flb = cp;
- } else {
- lb = text+text_extent+1;
- flb = lb+UNROLL*pattern->patlen;
- ub = fub = cp;
- }
- nmatch = 0;
- skip_loop:
- if (ub<cp || cp<lb) { /* search is out of bounds; done */
- return nmatch;
- }
- while (pattern->skip[*cp]) { /* not match of last character in pattern */
- if (flb<cp && cp<fub) {
- cp += pattern->skip[*cp]; /* skip or align to next possible match */
- cp += pattern->skip[*cp]; /* skip or align to next possible match */
- cp += pattern->skip[*cp]; /* skip or align to next possible match */
- cp += pattern->skip[*cp]; /* skip or align to next possible match */
- }
- cp += pattern->skip[*cp]; /* skip or align to next possible match */
- if (ub<cp || cp<lb) { /* search is out of bounds; done */
- return nmatch;
- }
- } /* fall out of skip loop, possible match */
- tmpcp = cp-pattern->patlen+pattern->direction;
- for (i = 0; tmpcp!=cp; i++) { /* compare pattern with text */
- if (pattern->skip[*tmpcp]!=pattern->pattern[i]) {
- cp += pattern->jump;
- goto skip_loop; /* mismatch, go back to fast skip loop */
- }
- tmpcp += pattern->direction; /* move to next character */
- }
- *match_position = cp-text; /* match found */
- if (0<pattern->direction) {
- *match_position -= (pattern->patlen-1);
- }
- nmatch++;
- if (all_matches) {
- cp += pattern->patlen; /* move current position beyond match */
- goto skip_loop; /* go back to fast skip loop */
- }
- return nmatch;
- }
-
- #if defined(NeXT)
-
- #define STREAM_MAGIC 0xbead0369
- #define Tell(S) ((S)->buf_ptr-(S)->buf_base+(S)->offset)
- #define Getc(S) ((S)->buf_left>0?(*(S)->buf_ptr):_NXStreamFillBuffer(S))
-
- /* Seek() can handle NX_FROMSTART and NX_FROMCURRENT only. */
- #define Seek(S, O, M) ({int _off=(O)+((M)==1?Tell(S):0); \
- if ((S)->eof<Tell(S)) {(S)->eof = Tell(S);} \
- if (_off<0 || (S)->eof<_off) {NX_RAISE(NX_illegalSeek, (S), 0);} \
- (*(S)->functions->seek)((S), _off);})
-
- int
- Misc_TBMKsearch_stream(const Misc_TBMKpattern pattern,
- NXStream *stream,
- int text_extent,
- unsigned char all_matches,
- int *match_position)
- {
- unsigned int nmatch, i, writable;
- int skip, current, start;
-
- if (!pattern || pattern->magic!=TBMKMAGIC ||
- !stream || stream->magic_number!=STREAM_MAGIC ||
- (pattern->direction<0 && 0<text_extent) ||
- (0<pattern->direction && text_extent<0) ||
- !(stream->flags&NX_READFLAG && stream->flags&NX_CANSEEK)) {
- return -1;
- }
- if ((0<pattern->direction && text_extent<pattern->patlen) ||
- (pattern->direction<0 && text_extent>pattern->patlen)) {
- return 0;
- }
- writable = stream->flags&NX_WRITEFLAG;
- stream->flags &= ~NX_WRITEFLAG;
- NX_DURING
- start = Tell(stream);
- Seek(stream, pattern->patlen-pattern->direction, NX_FROMCURRENT);
- text_extent -= pattern->patlen;
- nmatch = 0;
- skip_loop:
- do { /* fast skip loop */
- skip = pattern->skip[Getc(stream)];
- Seek(stream, skip, NX_FROMCURRENT);
- text_extent -= skip;
- skip = pattern->skip[Getc(stream)];
- Seek(stream, skip, NX_FROMCURRENT);
- text_extent -= skip;
- if ((0<pattern->direction && text_extent<0) ||
- (pattern->direction<0 && text_extent>0)) {
- if (writable) {
- stream->flags |= NX_WRITEFLAG;
- }
- return nmatch;
- }
- } while (skip);
- current = Tell(stream);
- Seek(stream, -pattern->patlen+pattern->direction, NX_FROMCURRENT);
- for (i = 0; Tell(stream)!=current; i++) { /* compare pattern with text */
- if (pattern->skip[Getc(stream)]!=pattern->pattern[i]) {
- Seek(stream, current+pattern->jump, NX_FROMSTART);
- text_extent -= pattern->jump;
- goto skip_loop; /* mismatch, go back to fast skip loop */
- }
- Seek(stream, pattern->direction, NX_FROMCURRENT);
- }
- *match_position = current-start; /* match found */
- if (0<pattern->direction) {
- *match_position -= (pattern->patlen-1);
- }
- nmatch++;
- if (!all_matches) {
- if (writable) {
- stream->flags |= NX_WRITEFLAG;
- }
- return nmatch;
- }
- Seek(stream, pattern->patlen, NX_FROMCURRENT);
- text_extent -= pattern->patlen;
- goto skip_loop; /* go back to fast skip loop */
- NX_HANDLER
- if (writable) {
- stream->flags |= NX_WRITEFLAG;
- }
- switch (NXLocalHandler.code) {
- case NX_illegalSeek:
- return nmatch;
- default:
- NX_RERAISE();
- }
- NX_ENDHANDLER
- return nmatch; // added by Don to remove warning on HPPA beta compiler.
- // we should never get here, so I don't think this is a problem...
- }
-
- #endif /* defined(NeXT) */
-