home *** CD-ROM | disk | FTP | other *** search
- /* Copyright (c) 1991 Sun Wu and Udi Manber. All Rights Reserved. */
- #include <stdio.h>
- #define MAXSYM 256
- #define MAXMEMBER 8192
- #define CHARTYPE unsigned char
- #define MaxError 20
- #define MAXPATT 256
- #define MAXLINE 1024
- #define MaxCan 2048
- #define BLOCKSIZE 8192
- #define MAX_SHIFT_2 4096
- #define ON 1
- #define LOG_ASCII 8
- #define LOG_DNA 3
- #define MAXMEMBER_1 65536
- #define LONG_EXAC 20
- #define LONG_APPX 24
- #define W_DELIM 2
-
- extern COUNT, FNAME, SILENT, FILENAMEONLY, num_of_matched;
- extern DNA ; /* DNA flag is set in checksg when pattern is DNA pattern and
- p_size > 16 */
- extern WORDBOUND, WHOLELINE, NOUPPER;
- extern unsigned char CurrentFileName[], Progname[];
- extern unsigned Mask[];
- extern unsigned endposition;
-
- unsigned char BSize; /* log_c m */
- unsigned char char_map[MAXSYM];
-
-
- /* data area */
- int shift_1;
- CHARTYPE SHIFT[MAXSYM];
- CHARTYPE MEMBER[MAXMEMBER];
- CHARTYPE pat[MAXPATT];
- unsigned Hashmask;
- char MEMBER_1[MAXMEMBER_1];
- CHARTYPE TR[MAXSYM];
-
- char_tr(pat, m)
- unsigned char *pat;
- int *m;
- {
- int i;
- unsigned char temp[MAXPATT];
- for(i=0; i<MAXSYM; i++) TR[i] = i;
- if(NOUPPER) {
- for(i='A'; i<= 'Z'; i++) TR[i] = i + 'a' - 'A';
- }
- if(WORDBOUND) { /* SUN: To be added to be more complete */
- TR['\n'] = TR['\t'] = TR[' '] = TR[','] = TR[';'] = TR[':'] =
- TR['!'] = TR['"'] = TR['?'] = TR['<'] = TR['>'] =
- TR['='] = TR['['] = TR[']'] = TR['\''] =
- TR['('] = TR[')'] = TR['$'] = TR['/'] = TR['\\'] = W_DELIM;
- }
- if(WHOLELINE) {
- memcpy(temp, pat, *m);
- pat[0] = '\n';
- memcpy(pat+1, temp, *m);
- pat[*m+1] = '\n';
- pat[*m+2] = 0;
- *m = *m + 2;
- }
- }
-
- sgrep(pat, m, fd, D)
- CHARTYPE *pat; int fd, m, D;
- {
- CHARTYPE text[BLOCKSIZE+2*MAXLINE+MAXPATT]; /* input text stream */
- int offset = 2*MAXLINE;
- int buf_end, num_read, i, start, end, residue = 0;
- if(pat[0] == '^' || pat[0] == '$') pat[0] = '\n';
- if(pat[m-1] == '^' || pat[m-1] == '$') pat[m-1] = '\n';
- char_tr(pat, &m); /* will change pat, and m if WHOLELINE is ON */
- text[offset-1] = '\n'; /* initial case */
- for(i=0; i < MAXLINE; i++) text[i] = 0; /* security zone */
- start = offset;
- if(WHOLELINE) start--;
- if(m >= MAXPATT) {
- fprintf(stderr, "%s: pattern too long\n", Progname);
- exit(2);
- }
- if(D == 0) {
- if(m > LONG_EXAC) m_preprocess(pat);
- else prep_bm(pat, m);
- }
- else if (DNA) prep4(pat, m);
- else if(m >= LONG_APPX) am_preprocess(pat);
- else {
- prep(pat, m, D);
- initmask(pat, Mask, m, 0, &endposition);
- }
- for(i=1; i<=m; i++) text[BLOCKSIZE+offset+i] = pat[m-1];
- /* to make sure the skip loop in bm() won't go out of bound */
- while( (num_read = read(fd, text+offset, BLOCKSIZE)) > 0)
- {
- buf_end = end = offset + num_read -1 ;
- if(num_read < BLOCKSIZE) if(text[end] != '\n') text[++end] = '\n';
- while(text[end] != '\n' && end > offset) end--;
- residue = buf_end - end + 1 ;
- text[start-1] = '\n';
- if(D==0) {
- if(m > LONG_EXAC) monkey(pat, m, text+start, text+end);
- else bm(pat, m, text+start, text+end);
- }
- else {
- if(DNA) monkey4( pat, m, text+start, text+end, D );
- else {
- if(m >= LONG_APPX) a_monkey(pat, m, text+start, text+end, D);
- else agrep(pat, m, text+start, text+end, D);
- }
- }
- if(FILENAMEONLY && num_of_matched) {
- printf("%s\n", CurrentFileName);
- return; }
- start = offset - residue ;
- if(start < MAXLINE) {
- start = MAXLINE;
- }
- strncpy(text+start, text+end, residue);
- start++;
- } /* end of while(num_read = ... */
- return;
- } /* end sgrep */
-
- /* SUN: bm assumes that the content of text[n]...text[n+m-1] is
- pat[m-1] such that the skip loop is guaranteed to terminated */
-
- bm(pat, m, text, textend)
- CHARTYPE *text, *textend, *pat; int m;
- {
- register int shift;
- register int m1, j, d1;
-
- /*
- printf("%d\t", textend - text);
- printf("%c, %c", *text, *textend);
- */
-
- d1 = shift_1; /* at least 1 */
- m1 = m - 1;
- shift = 0;
- while (text <= textend) {
- shift = SHIFT[*(text += shift)];
- while(shift) {
- shift = SHIFT[*(text += shift)];
- shift = SHIFT[*(text += shift)];
- shift = SHIFT[*(text += shift)];
- }
- j = 0;
- while(TR[pat[m1 - j]] == TR[*(text - j)]) {
- if(++j == m) break; /* if statement can be
- saved, but for safty ... */
- }
- if (j == m ) {
- if(text > textend) return;
- if(WORDBOUND) {
- if(TR[*(text+1)] != W_DELIM) goto CONT;
- if(TR[*(text-m)] != W_DELIM) goto CONT;
- }
- num_of_matched++;
- if(FILENAMEONLY) return;
- if(!(COUNT)) {
- if(FNAME) printf("%s: ", CurrentFileName);
- while(*(--text) != '\n');
- while(*(++text) != '\n') putchar(*(text));
- putchar(*text);
- }
- else { while(*text != '\n') text++; }
- CONT:
- shift = 1;
- }
- else shift = d1;
- }
- return;
- }
-
-
- /* initmask() initializes the mask table for the pattern */
- /* endposition is a mask for the endposition of the pattern */
- /* endposition will contain k mask bits if the pattern contains k fragments */
- initmask(pattern, Mask, m, D, endposition)
- CHARTYPE *pattern; unsigned *Mask; register int m, D; unsigned *endposition;
- {
- register unsigned Bit1, c;
- register int i, j, frag_num;
-
- Bit1 = 1 << 31; /* the first bit of Bit1 is 1, others 0. */
- frag_num = D+1; *endposition = 0;
- for (i = 0; i < frag_num; i++) *endposition = *endposition | (Bit1 >> i);
- *endposition = *endposition >> (m - frag_num);
- for(i = 0; i < m; i++)
- if (pattern[i] == '^' || pattern[i] == '$') {
- pattern[i] = '\n';
- }
- for(i = 0; i < MAXSYM; i++) Mask[i] = ~0;
- for(i = 0; i < m; i++) /* initialize the mask table */
- { c = pattern[i];
- for ( j = 0; j < m; j++)
- if( c == pattern[j] )
- Mask[c] = Mask[c] & ~( Bit1 >> j ) ;
- }
- }
-
- prep(Pattern, M, D) /* preprocessing for partitioning_bm */
- CHARTYPE *Pattern; /* can be fine-tuned to choose a better partition */
- register int M, D;
- {
- register int i, j, k, p, shift;
- register unsigned m;
- unsigned hash, b_size = 3;
- m = M/(D+1);
- p = M - m*(D+1);
- for (i = 0; i < MAXSYM; i++) SHIFT[i] = m;
- for (i = M-1; i>=p ; i--) {
- shift = (M-1-i)%m;
- hash = Pattern[i];
- if(SHIFT[hash] > shift) SHIFT[hash] = shift;
- }
- #ifdef DEBUG
- for(i=0; i<M; i++) printf(" %d,", SHIFT[Pattern[i]]);
- printf("\n");
- #endif
- shift_1 = m;
- for(i=0; i<D+1; i++) {
- j = M-1 - m*i;
- for(k=1; k<m; k++) {
- for(p=0; p<D+1; p++)
- if(Pattern[j-k] == Pattern[M-1-m*p])
- if(k < shift_1) shift_1 = k;
- }
- }
- #ifdef DEBUG
- printf("\nshift_1 = %d", shift_1);
- #endif
- if(shift_1 == 0) shift_1 = 1;
- for(i=0; i<MAXMEMBER; i++) MEMBER[i] = 0;
- if (m < 3) b_size = m;
- for(i=0; i<D+1; i++) {
- j = M-1 - m*i;
- hash = 0;
- for(k=0; k<b_size; k++) {
- hash = (hash << 2) + Pattern[j-k];
- }
- #ifdef DEBUG
- printf(" hash = %d,", hash);
- #endif
- MEMBER[hash] = 1;
- }
- }
-
-
- agrep( pat, M, text, textend, D )
- int M, D ; register CHARTYPE *text, *textend, *pat;
- {
- register int i;
- register int m = M/(D+1);
- register CHARTYPE *textstart;
- register int shift, HASH;
- int j=0, k, m1, d1;
- int n, cdx;
- int Candidate[MaxCan][2], round, lastend=0;
- unsigned R1[MaxError+1], R2[MaxError+1];
- register unsigned int r1, endpos, c;
- unsigned currentpos;
- unsigned Bit1;
- unsigned r_newline;
-
- Candidate[0][0] = Candidate[0][1] = 0;
- d1 = shift_1;
- cdx = 0;
- if(m < 3) r1 = m;
- else r1 = 3;
- textstart = text;
- shift = m-1;
- while (text < textend) {
- shift = SHIFT[*(text += shift)];
- while(shift) {
- shift = SHIFT[*(text += shift)];
- shift = SHIFT[*(text += shift)];
- }
- j = 1; HASH = *text;
- while(j < r1) { HASH = (HASH << 2) + *(text-j);
- j++; }
- if (MEMBER[HASH]) {
- i = text - textstart;
- if((i - M - D - 10) > Candidate[cdx][1]) {
- Candidate[++cdx][0] = i-M-D-2;
- Candidate[cdx][1] = i+M+D; }
- else Candidate[cdx][1] = i+M+D;
- shift = d1;
- }
- else shift = d1;
- }
-
-
- text = textstart;
- n = textend - textstart;
- r_newline = '\n';
- /* for those candidate areas, find the D-error matches */
- if(Candidate[1][0] < 0) Candidate[1][0] = 0;
- endpos = endposition; /* the mask table and the endposition */
- Bit1 = (1 << 31);
- for(round = 0; round <= cdx; round++)
- { i = Candidate[round][0] ;
- if(Candidate[round][1] > n) Candidate[round][1] = n;
- if(i < 0) i = 0;
- #ifdef DEBUG
- printf("round: %d, start=%d, end=%d, ", round, i, Candidate[round][1]);
- #endif
- R1[0] = R2[0] = ~0;
- R1[1] = R2[1] = ~Bit1;
- for(k = 1; k <= D; k++) R1[k] = R2[k] = (R1[k-1] >> 1) & R1[k-1];
- while (i < Candidate[round][1])
- {
- c = text[i++];
- if(c == r_newline) {
- for(k = 0 ; k <= D; k++) R1[k] = R2[k] = (~0 );
- }
- r1 = Mask[c];
- R1[0] = (R2[0] >> 1) | r1;
- for(k=1; k<=D; k++)
- R1[k] = ((R2[k] >> 1) | r1) & R2[k-1] & ((R1[k-1] & R2[k-1]) >> 1);
- if((R1[D] & endpos) == 0) {
- num_of_matched++;
- if(FILENAMEONLY) { return; }
- currentpos = i;
- if(i <= lastend) i = lastend;
- else {
- s_output(text, ¤tpos);
- i = currentpos;
- }
- lastend = i;
- for(k=0; k<=D; k++) R1[k] = R2[k] = ~0;
- }
- c = text[i++];
- if(c == r_newline) {
- for(k = 0 ; k <= D; k++) R1[k] = R2[k] = (~0 );
- }
- r1 = Mask[c];
- R2[0] = (R1[0] >> 1) | r1;
- for(k = 1; k <= D; k++)
- R2[k] = ((R1[k] >> 1) | r1) & R1[k-1] & ((R1[k-1] & R2[k-1]) >> 1);
- if((R2[D] & endpos) == 0) { currentpos = i;
- num_of_matched++;
- if(FILENAMEONLY) { return; }
- if(i <= lastend) i = lastend;
- else {
- s_output(text, ¤tpos);
- i = currentpos;
- }
- lastend = i;
- for(k=0; k<=D; k++) R1[k] = R2[k] = ~0;
- }
- }
- }
- return;
- }
-
- s_output (text, i)
- int *i; CHARTYPE *text;
- {
- int kk, bp;
- if(SILENT) return;
- if(COUNT) {
- while(text[*i] != '\n') *i = *i + 1;
- return;
- }
- if(FNAME == ON) printf("%s: ", CurrentFileName);
- bp = *i;
- while(text[--bp] != '\n');
- while(text[++bp] != '\n') putchar(text[bp]);
- putchar('\n');
- *i = bp;
- }
-
-
- prep_bm(Pattern, m)
- unsigned char *Pattern;
- register m;
- {
- int i, j;
- unsigned hash;
- unsigned char lastc;
- for (i = 0; i < MAXSYM; i++) SHIFT[i] = m;
- for (i = m-1; i>=0; i--) {
- hash = TR[Pattern[i]];
- if(SHIFT[hash] >= m - 1) SHIFT[hash] = m-1-i;
- }
- shift_1 = m-1;
- lastc = TR[Pattern[m-1]];
- for (i= m-2; i>=0; i--) {
- if(TR[Pattern[i]] == lastc )
- { shift_1 = m-1 - i; i = -1; }
- }
- if(shift_1 == 0) shift_1 = 1;
- if(NOUPPER) for(i='A'; i<='Z'; i++) SHIFT[i] = SHIFT[i + 'a' - 'A'];
- #ifdef DEBUG
- for(i='a'; i<='z'; i++) printf("%c: %d", i, SHIFT[i]); printf("\n");
- for(i='A'; i<='Z'; i++) printf("%c: %d", i, SHIFT[i]); printf("\n");
- #endif
- }
-
-
- /* a_monkey() the approximate monkey move */
-
- a_monkey( pat, m, text, textend, D )
- register int m, D ; register CHARTYPE *text, *textend, *pat;
- {
- register CHARTYPE *oldtext;
- register unsigned hash, i, hashmask, suffix_error;
- register int m1 = m-1-D, j, pos;
-
- hashmask = Hashmask;
- oldtext = text;
- while (text < textend) {
- text = text+m1;
- suffix_error = 0;
- while(suffix_error <= D) {
- hash = *text--;
- while(MEMBER_1[hash]) {
- hash = ((hash << LOG_ASCII) + *(text--)) & hashmask;
- }
- suffix_error++;
- }
- if(text <= oldtext) {
- if((pos = verify(m, 2*m+D, D, pat, oldtext)) > 0) {
- text = oldtext+pos;
- if(text > textend) return;
- num_of_matched++;
- if(FILENAMEONLY) return;
- if(!(COUNT)) {
- if(FNAME) printf("%s: ", CurrentFileName);
- while(*(--text) != '\n');
- while(*(++text) != '\n') putchar(*text);
- printf("\n");
- }
- else {
- while(*text != '\n') text++;
- }
- }
- else {
- text = oldtext + m;
- }
- }
- oldtext = text;
- }
- }
-
- /* monkey uses two characters for delta_1 shifting */
-
- CHARTYPE SHIFT_2[MAX_SHIFT_2];
-
- monkey( pat, m, text, textend )
- register int m ; register CHARTYPE *text, *textend, *pat;
- {
- register unsigned hash, i;
- register CHARTYPE shift;
- register int m1, j;
- register unsigned r_newline;
-
- r_newline = '\n';
-
- m1 = m - 1;
- text = text+m1;
- while (text < textend) {
- hash = *text;
- hash = (hash << 3) + *(text-1);
- shift = SHIFT_2[hash];
- while(shift) {
- text = text + shift;
- hash = (*text << 3) + *(text-1);
- shift = SHIFT_2[hash];
- }
- j = 0;
- while(TR[pat[m1 - j]] == TR[*(text - j)]) { if(++j == m) break; }
- if (j == m ) {
- if(text >= textend) return;
- num_of_matched++;
- if(FILENAMEONLY) return;
- if(COUNT) {
- while (*text != r_newline) text++;
- text--;
- }
- else {
- if(FNAME) printf("%s: ", CurrentFileName);
- while(*(--text) != r_newline);
- while(*(++text) != r_newline) putchar(*text);
- printf("\n");
- text--;
- }
- }
- text++;
- }
- }
-
- am_preprocess(Pattern)
- CHARTYPE *Pattern;
- {
- int i, j, m;
- unsigned hash;
- m = strlen(Pattern);
- for (i = 1, Hashmask = 1 ; i<16 ; i++) Hashmask = (Hashmask << 1) + 1 ;
- for (i = 0; i < MAXMEMBER_1; i++) MEMBER_1[i] = 0;
- for (i = m-1; i>=0; i--) {
- MEMBER_1[Pattern[i]] = 1;
- }
- for (i = m-1; i > 0; i--) {
- MEMBER_1[(Pattern[i] << LOG_ASCII) + Pattern[i-1]] = 1;
- }
- }
-
-
- verify(m, n, D, pat, text)
- register int m, n, D;
- CHARTYPE *pat, *text;
- {
- int A[MAXPATT], B[MAXPATT];
- register int last = D;
- register int cost = 0;
- register int k, i, c;
- register int m1 = m+1;
- CHARTYPE *textend = text+n;
- CHARTYPE *textbegin = text;
-
- for (i = 0; i <= m1; i++) A[i] = B[i] = i;
- while (text < textend)
- {
- for (k = 1; k <= last; k++)
- {
- cost = B[k-1]+1;
- if (pat[k-1] != *text)
- { if (B[k]+1 < cost) cost = B[k]+1;
- if (A[k-1]+1 < cost) cost = A[k-1]+1; }
- else cost = cost -1;
- A[k] = cost;
- }
- if(pat[last] == *text++) { A[last+1] = B[last]; last++; }
- if(A[last] < D) A[last+1] = A[last++]+1;
- while (A[last] > D) last = last - 1;
- if(last >= m) return(text - textbegin - 1);
- if(*text == '\n') {
- last = D;
- for(c = 0; c<=m1; c++) A[c] = B[c] = c;
- }
- for (k = 1; k <= last; k++)
- {
- cost = A[k-1]+1;
- if (pat[k-1] != *text)
- { if (A[k]+1 < cost) cost = A[k]+1;
- if (B[k-1]+1 < cost) cost = B[k-1]+1; }
- else cost = cost -1;
- B[k] = cost;
- }
- if(pat[last] == *text++) { B[last+1] = A[last]; last++; }
- if(B[last] < D) B[last+1] = B[last++]+1;
- while (B[last] > D) last = last -1;
- if(last >= m) return(text - textbegin - 1);
- if(*text == '\n') {
- last = D;
- for(c = 0; c<=m1; c++) A[c] = B[c] = c;
- }
- }
- return(0);
- }
-
- /* preprocessing for monkey() */
-
- m_preprocess(Pattern)
- CHARTYPE *Pattern;
- {
- int i, j, m;
- unsigned hash;
- m = strlen(Pattern);
- for (i = 0; i < MAX_SHIFT_2; i++) SHIFT_2[i] = m;
- for (i = m-1; i>=1; i--) {
- hash = Pattern[i];
- hash = hash << 3;
- for (j = 0; j< MAXSYM; j++) {
- if(SHIFT_2[hash+j] == m) SHIFT_2[hash+j] = m-1;
- }
- hash = hash + Pattern[i-1];
- if(SHIFT_2[hash] >= m - 1) SHIFT_2[hash] = m-1-i;
- }
- shift_1 = m-1;
- for (i= m-2; i>=0; i--) {
- if(Pattern[i] == Pattern[m-1] )
- { shift_1 = m-1 - i; i = -1; }
- }
- if(shift_1 == 0) shift_1 = 1;
- SHIFT_2[0] = 0;
- }
-
- /* monkey4() the approximate monkey move */
-
- char *MEMBER_D;
-
- monkey4( pat, m, text, textend, D )
- register int m, D ; register unsigned char *text, *pat, *textend;
- {
- register unsigned char *oldtext;
- register unsigned hash, i, hashmask, suffix_error;
- register int m1=m-1-D, j, pos;
-
- hashmask = Hashmask;
- oldtext = text ;
- while (text < textend) {
- text = text + m1;
- suffix_error = 0;
- while(suffix_error <= D) {
- hash = char_map[*text--];
- hash = ((hash << LOG_DNA) + char_map[*(text--)]) & hashmask;
- while(MEMBER_D[hash]) {
- hash = ((hash << LOG_DNA) + char_map[*(text--)]) & hashmask;
- }
- suffix_error++;
- }
- if(text <= oldtext) {
- if((pos = verify(m, 2*m+D, D, pat, oldtext)) > 0) {
- text = oldtext+pos;
- if(text > textend) return;
- num_of_matched++;
- if(FILENAMEONLY) return;
- if(!(COUNT)) {
- if(FNAME) printf("%s:", CurrentFileName);
- while(*(--text) != '\n');
- while(*(++text) != '\n') putchar(*text);
- printf("\n");
- text++;
- }
- else {
- while(*text != '\n') text++;
- text++;
- }
- }
- else text = oldtext + m;
- }
- oldtext = text;
- }
- }
-
- prep4(Pattern, m)
- char *Pattern; int m;
- {
- int i, j, k;
- unsigned hash;
-
- for(i=0; i< MAXSYM; i++) char_map[i] = 0;
- char_map['a'] = char_map['A'] = 4;
- char_map['g'] = char_map['g'] = 1;
- char_map['t'] = char_map['t'] = 2;
- char_map['c'] = char_map['c'] = 3;
- char_map['n'] = char_map['n'] = 5;
-
- BSize = blog(4, m);
- for (i = 1, Hashmask = 1 ; i<BSize*LOG_DNA; i++) Hashmask = (Hashmask << 1) + 1 ;
- MEMBER_D = (char *) malloc((Hashmask+1) * sizeof(char));
- #ifdef DEBUG
- printf("BSize = %d", BSize);
- #endif
- for (i=0; i <= Hashmask; i++) MEMBER_D[i] = 0;
- for (j=0; j < BSize; j++) {
- for(i=m-1; i >= j; i--) {
- hash = 0;
- for(k=0; k <= j; k++)
- hash = (hash << LOG_DNA) +char_map[Pattern[i-k]];
- #ifdef DEBUG
- printf("< %d >, ", hash);
- #endif
- MEMBER_D[hash] = 1;
- }
- }
- }
-
- blog(base, m )
- int base, m;
- {
- int i, exp;
- exp = base;
- m = m + m/2;
- for (i = 1; exp < m; i++) exp = exp * base;
- return(i);
- }
-
-