home *** CD-ROM | disk | FTP | other *** search
- /*
- * Copyright 1990 Karl Lehenbauer, Mark Diekhans,
- * Peter da Silva and Jordan Henderson.
- *
- * Permission to use, copy, modify, and distribute this
- * software and its documentation for any purpose and without
- * fee is hereby granted, provided that the above copyright
- * notice appear in all copies. Karl Lehenbauer, Mark Diekhans,
- * Peter da Silva and Jordan Henderson make no representations
- * about the suitability of this software for any purpose.
- * It is provided "as is" without express or implied warranty.
- */
-
- /*
- *
- * Hacked to run under Sys V by karl, Sat Sep 8 09:25:55 CDT 1990
- *
- * Hacked to have a separate routine to precompile the pattern delta table
- * and such by karl, too
- *
- * From: chris@mimsy.umd.edu (Chris Torek)
- * Newsgroups: alt.sources
- * Subject: [comp.lang.c] Re: strstr sources: a summary of responses
- * Keywords: ANSI lib, strstr
- * Message-ID: <1990Aug27.191251.21720@math.lsa.umich.edu>
- * Date: 27 Aug 90 19:12:51 GMT
- * Organization: University of Michigan, Department of Mathematics
- * X-Original-Newsgroups: comp.lang.c
- *
- * Archive-name: torek-boyer-moore/27-Aug-90
- * Original-posting-by: chris@mimsy.umd.edu (Chris Torek)
- * Original-subject: Re: strstr sources: a summary of responses
- * Reposted-by: emv@math.lsa.umich.edu (Edward Vielmetti)
- *
- * In article <1158@umvlsi.ecs.umass.edu> srivasta@umvlsi.ecs.umass.edu
- * (Manoj Srivastava) posts a Boyer-Moore based strstr() by John Lacey.
- *
- * >And finally, an implementation of the Boyer-Moore algorithm. I
- * >am not sure offhand, but while the worst case complexity remains
- * >O(N+M), the avg case behavior is O(N/M) ???
- *
- * (where N is the length of the string being searched and M is the length
- * of the substring, aka pattern). Yes, it is N/M.
- *
- * Next, the code:
- *
- * > strstr
- * [some stuff deleted]
- * >int ch, d[SCHAR_MAX + 1];
- * [more stuff deleted]
- * > i += d[text[i]];
- * [still more deleted]
- *
- * If `char' is signed, and a string containing a negative character value
- * is passed to strstr(), this code can attempt to fetch, e.g., d[-40]. I
- * found the code overly difficult to read, so I rewrote it. I also wrote
- * a long comment describing the algorithm. This has been only lightly
- * tested, but is simple enough that it should work. Note that this
- * version handles strings with embedded NULL characters as well.
- *
- *
- */
-
- #ifndef NOSCCSID
- static char *sccs_id = "%M% %I% %H%";
- #endif
-
- #ifdef M_XENIX
- # define XENIX
- # define XENIX286 1
- #endif
-
- #if defined(XENIX) || defined(BSD)
- # define CHAR_MAX 255
- #else
- # include <limits.h>
- #endif
-
- #if XENIX286
- typedef int size_t;
- #endif
-
- #include <sys/types.h>
- #include <stdio.h>
-
- typedef char *PTR; /* `char *' for old compilers */
- #define ckalloc malloc
- char *malloc();
-
- struct compiled_search_struct {
- size_t patlen;
- size_t deltaspace[CHAR_MAX + 1];
- };
-
- /*
- * Boyer-Moore search: input is `text' (a string) and its length,
- * and a `pattern' (another string) and its length.
- *
- * The linear setup cost of this function is approximately 256 + patlen.
- * Afterwards, however, the average cost is O(textlen/patlen), and the
- * worst case is O(textlen+patlen).
- *
- * The Boyer-Moore algorithm works by observing that, for each position
- * in the text, if the character there does *not* occur somewhere in the
- * search pattern, no comparisons including that character will match.
- * That is, given the text "hello world..." and the pattern "goodbye", the
- * `w' in `world' means that none of `hello w', `ello wo', `llo wor',
- * `lo worl', `o world', ` world.', and `world..' can match. In fact,
- * exactly patlen strings are certain not to match. We can discover this
- * simply by looking at the patlen'th character. Furthermore, even if
- * the text character does occur, it may be that it rules out some number
- * of other matches. Again, we can discover this by doing the match
- * `backwards'.
- *
- * We set up a table of deltas for each possible character, with
- * delta[character] being patlen for characters not in the pattern,
- * less for characters in the pattern, growing progressively smaller
- * as we near the end of the pattern. Matching then works as follows:
- *
- * 0 1 2 3
- * 01234567890123456789012345678901234567
- * "Here is the string being searched into" (text)
- * ------ (pos = [0..5])
- * "string" (pat)
- * 654321- (deltas)
- *
- * (the delta for `-' will be derived below).
- *
- * Positions 0..5 end with `i', which is not the `g' we want. `i' does
- * appear in `string', but two characters before the end. We skip
- * forward so as to make the `i's match up:
- *
- * "Here is the string being searched into" (text)
- * "string" (pos = [2..7])
- *
- * Next we find that ` ' and `g' do not match. Since ` ' does not appear
- * in the pattern at all, we can skip forward 6:
- *
- * "Here is the string being searched into" (text)
- * "string" (pos = [8..13])
- *
- * Comparing `t' vs `g', we again find no match, and so we obtain the
- * delta for `t', which is 4. We skip to position 17:
- *
- * "Here is the string being searched into" (text)
- * "string" (pos = [12..17])
- *
- * It thus takes only four steps to move the search point forward to the
- * match, in this case.
- *
- * If the pattern has a recurring character, we must set the delta for
- * that character to the distance of the one closest to the end:
- *
- * "befuddle the cat" (text)
- * "fuddle" (pos = [0..5])
- * 654321- (delta)
- *
- * We want the next search to line the `d's up like this:
- *
- * "befuddle the cat" (text)
- * "fuddle" (pos = [2..7])
- *
- * and not like this:
- *
- * "befuddle the cat" (text)
- * "fuddle" (pos = [3..8])
- *
- * so we take the smaller delta for d, i.e., 2.
- *
- * The last task is computing the delta we have noted above as `-':
- *
- * "candlesticks" (text)
- * "hand" (pos = [0..3])
- * 4321- (delta)
- *
- * Here the `d' in `hand' matches the `d' in `candlesticks', but the
- * strings differ. Since there are no other `d's in `hand', we know
- * that none of (cand,andl,ndle,dles) can match, and thus we want this
- * delta to be 4 (the length of the pattern). But if we had, e.g.:
- *
- * "candlesticks" (text)
- * "deed" (pos = [0..3])
- * 4321- (delta)
- *
- * then we should advance to line up the other `d':
- *
- * "candlesticks" (text)
- * "deed" (pos = [3..6])
- *
- * As this suggests, the delta should be that for the `d' nearest the
- * end, but not including the end. This is easily managed by setting up
- * a delta table as follows:
- *
- * for int:c in [0..255] { delta[c] = patlen; };
- * for int:x in [0..patlen-1) { delta[pat[x]] = patlen - (x + 1); };
- *
- * delta[pat[patlen-1]] is never written, so the last letter inherits the
- * delta from an earlier iteration or from the previous loop.
- *
- * NB: the nonsense with `deltaspace' below exists merely because gcc
- * does a horrible job of common subexpression elimination (it does not
- * notice that the array is at a constant stack address).
- */
-
- char *
- boyer_moore_search_compile(pat, patlen)
- char *pat;
- int patlen;
- {
- register unsigned char *p, *t;
- register size_t i, p1, j, *delta;
- struct compiled_search_struct *cp;
- int alloc_len;
-
- /* algorithm fails if pattern is empty */
- if ((p1 = patlen) == 0)
- return (NULL);
-
- alloc_len = sizeof(struct compiled_search_struct) + patlen + 1;
- cp = (struct compiled_search_struct *)ckalloc(alloc_len);
-
- strncpy((char *)cp+sizeof(struct compiled_search_struct), pat, patlen);
- *((char *)cp+alloc_len-1) = '\0';
-
- /* set up deltas */
- delta = cp->deltaspace;
-
- for (i = 0; i <= CHAR_MAX; i++)
- delta[i] = p1;
-
- for (p = (unsigned char *)pat, i = p1; --i > 0;)
- delta[*p++] = i;
-
- cp->patlen = patlen;
- return((char*) cp);
- }
-
-
- char *
- boyer_moore_search_execute(text, textlen, compPtr, patLenP)
- char *text;
- size_t textlen;
- char *compPtr;
- unsigned *patLenP;
- {
- register unsigned char *p, *t;
- struct compiled_search_struct *csp =
- (struct compiled_search_struct*) compPtr;
- register size_t i, p1, j, *delta = csp->deltaspace;
- char *pat;
- size_t patlen;
-
- *patLenP = p1 = patlen = csp->patlen;
- /* code below fails (whenever i is unsigned) if pattern too long */
- if (p1 > textlen)
- return (NULL);
-
- pat = (char *)csp + sizeof(struct compiled_search_struct);
- /*
- * From now on, we want patlen - 1.
- * In the loop below, p points to the end of the pattern,
- * t points to the end of the text to be tested against the
- * pattern, and i counts the amount of text remaining, not
- * including the part to be tested.
- */
- p1--;
- p = (unsigned char *)pat + p1;
- t = (unsigned char *)text + p1;
- i = textlen - patlen;
- for (;;) {
- if (*p == *t && memcmp((PTR)(p - p1), (PTR)(t - p1), p1) == 0)
- return ((char *)t - p1);
- j = delta[*t];
- if (i < j)
- break;
- i -= j;
- t += j;
- }
- return (NULL);
- }
-
-