home *** CD-ROM | disk | FTP | other *** search
- /* vi: set tabstop=4 : */
-
- /*
- * mksp - make soundex dictionary
- * Version 1.3, December 1986
- *
- * If <soundexfile.{dir, pag}> do not exist, try to create them
- * If they do exist and are not empty, then words will be added
- * from the standard input
- * Only when words are being added to an existing database are duplicate words
- * ignored
- * Valid words (words beginning with an alphabetic) are stored as given but
- * comparisons for duplicates is case independent.
- * Non-alphabetic characters are ignored in computing the soundex
- *
- * Permission is given to copy or distribute this program provided you
- * do not remove this header or make money off of the program.
- *
- * Please send comments and suggestions to:
- * Barry Brachman
- * Dept. of Computer Science
- * Univ. of British Columbia
- * Vancouver, B.C. V6T 1W5
- *
- * .. {ihnp4!alberta, uw-beaver}!ubc-vision!ubc-cs!brachman
- * brachman@cs.ubc.cdn
- * brachman%ubc.csnet@csnet-relay.arpa
- * brachman@ubc.csnet
- */
-
- #include <ctype.h>
- #include <errno.h>
- #include <sys/types.h>
- #include <sys/file.h>
- #include <sys/stat.h>
- #include <stdio.h>
-
- #ifdef NEWDBM
- #include <ndbm.h>
- #include <sys/file.h>
- #else !NEWDBM
- #include <dbm.h>
- #endif NEWDBM
-
- #include "sp.h"
-
- #define NEW_DICT 0
- #define OLD_DICT 1
-
- #define VFREQ 1000 /* frequency for verbose messages */
-
- #define streq(X, Y) (!strcmp(X, Y))
- #define strneq(X, Y, N) (!strncmp(X, Y, N))
-
- #define USAGE "Usage: mksp -tad [-v[#]] <soundexfile>"
-
- int map[26][667];
-
- datum FETCH(), FIRSTKEY(), NEXTKEY();
-
- key_t keyvec[KEYSIZE];
- key_t *key = keyvec;
-
- /*
- * Soundex codes
- * The program depends upon the numbers zero through six being used
- * but this can easily be changed
- */
- char soundex_code_map[26] = {
- /*** A B C D E F G H I J K L M N O P ***/
- 0, 1, 2, 3, 0, 1, 2, 0, 0, 2, 2, 4, 5, 5, 0, 1,
-
- /*** Q R S T U V W X Y Z ***/
- 2, 6, 2, 3, 0, 1, 0, 2, 0, 2
- };
-
- int digit_part;
-
- int aflag, dflag, tflag, vflag;
-
- char *mk_word();
-
- main(argc, argv)
- int argc;
- char **argv;
- {
- register int i;
- char *file;
-
- if (argc != 3 && argc != 4) {
- fprintf(stderr, "%s\n", USAGE);
- exit(1);
- }
- aflag = dflag = tflag = vflag = 0;
- file = (char *) NULL;
-
- for (i = 1; i < argc; i++) {
- if (streq(argv[i], "-a"))
- aflag = 1;
- else if (streq(argv[i], "-d"))
- dflag = 1;
- else if (streq(argv[i], "-t"))
- tflag = 1;
- else if (strneq(argv[i], "-v", 2)) {
- if (isdigit(argv[i][2]))
- vflag = atoi(&argv[i][2]);
- else
- vflag = 1;
- }
- else if (file == (char *) NULL)
- file = argv[i];
- else {
- fprintf(stderr, "%s\n", USAGE);
- exit(1);
- }
- }
-
- if (file == (char *) NULL || (tflag + aflag + dflag) != 1) {
- fprintf(stderr, "%s\n", USAGE);
- exit(1);
- }
-
- if (aflag) {
- addwords(file);
- if (vflag > 1) {
- register int j, total;
- int m, max;
-
- fprintf(stderr, "Counters:\n");
- for (i = 0; i < 26; i++) {
- total = max = map[i][0];
- for (j = 1; j < 667; j++) {
- total += (m = map[i][j]);
- if (m > max)
- max = m;
- }
- if (max > 0)
- fprintf(stderr, "%c: max %d total %d\n", 'a'+i, max, total);
- }
- }
- }
- else if (dflag)
- deletewords(file);
- else if (tflag)
- prcontents(file);
- exit(0);
- }
-
- /*
- * Add words read from stdin to the database
- * The key is the 3 digit soundex code for the word plus a disambiguating
- * counter. Different counter values are used for words with the same soundex
- * code. The maximum counter value is MAXCOUNT. If the counter overflows then
- * we lose, but given at least 8 bits this seems unlikely.
- */
- addwords(name)
- char *name;
- {
- register int c, count, delete, duplicate, i, len;
- register int *p;
- int ch, s, status;
- char wcopy[MAXWORDLEN + 2], word[MAXWORDLEN + 2];
- datum dbm_key, dbm_content;
-
- status = setup(name);
- if (DBMINIT(name, O_RDWR) == -1) {
- fprintf(stderr, "mksp: Can't initialize\n");
- exit(1);
- }
-
- for (i = 0; i < 26; i++)
- for (c = 0; c < 667; c++)
- map[i][c] = 0;
-
- dbm_key.dptr = (char *) key;
- dbm_key.dsize = KEYSIZE;
-
- count = 0;
-
- while (fgets(word, sizeof(word), stdin) != (char *) NULL) {
- len = strlen(word);
- if (word[len - 1] != '\n') {
- fprintf(stderr, "mksp: Word too long: %s", word);
- while ((ch = getchar()) != '\n') /* flush rest of line */
- putc(ch, stderr);
- putc('\n', stderr);
- continue;
- }
- word[--len] = '\0';
- if (len > MAXWORDLEN) {
- fprintf(stderr, "mksp: Word too long: %s\n", word);
- continue;
- }
-
- if ((s = soundex(word, 3)) == BAD_WORD) {
- if (vflag)
- fprintf(stderr, "Ignoring bad word: %s\n", word);
- continue;
- }
- ch = (isupper(word[0]) ? tolower(word[0]) : word[0]) - 'a';
- p = &(map[ch][digit_part]);
-
- /*
- * If words are being added to an old dictionary,
- * check for duplication and watch for a deleted entry
- * The reason for only checking for duplicates in old dictionaries is
- * that usually when you're creating a new dictionary the words are
- * already sorted and unique and the creation of a large dictionary is
- * slow enough already.
- */
- duplicate = 0;
- delete = -1; /* an 'impossible' counter */
- if (status == OLD_DICT) {
- c = 0;
- while (1) {
- mk_key(key, s, c);
- dbm_content = FETCH(dbm_key);
- if (dbm_content.dptr == 0)
- break;
-
- if (!IS_DELETED(dbm_content)) {
- char *str;
-
- str = mk_word(dbm_content.dptr, dbm_content.dsize, s);
- if (strmatch(word, str) == 0) {
- duplicate = 1;
- if (vflag)
- fprintf(stderr, "duplicate: %s\n", word);
- break;
- }
- }
- else if (delete < 0) /* choose delete nearest front */
- delete = c;
-
- if (++c > MAXCOUNT) {
- fprintf(stderr, "mksp: Counter overflow\n");
- fprintf(stderr, "soundex: %c%d\n", ch+'a', b10(digit_part));
- exit(1);
- }
- }
- if (duplicate)
- continue;
- *p = c;
- }
- if (*p > MAXCOUNT) {
- fprintf(stderr, "mksp: Counter overflow\n");
- fprintf(stderr, "soundex: %c%d\n", ch+'a', b10(digit_part));
- exit(1);
- }
- mk_key(key, s, *p);
- *p = *p + 1;
- strcpy(wcopy, word);
- if (len == 1) {
- if (isupper(wcopy[0]))
- wcopy[0] |= UPPER_CHAR;
- wcopy[0] |= SINGLE_CHAR;
- dbm_content.dptr = wcopy;
- }
- else {
- if (isupper(wcopy[1]))
- wcopy[1] = wcopy[1] - 'A' + 26;
- else if (islower(wcopy[1]))
- wcopy[1] = wcopy[1] - 'a';
- else if ((wcopy[1] = tospchar(wcopy[1])) == '\0') {
- fprintf(stderr, "Bogus second char: can't happen!\n");
- exit(1);
- }
- if (isupper(wcopy[0]))
- wcopy[1] = wcopy[1] | UPPER_CHAR;
- dbm_content.dptr = wcopy + 1;
- len--;
- }
- dbm_content.dsize = len; /* null not stored */
- if (delete < 0) {
- if (STORE(dbm_key, dbm_content) == -1) {
- fprintf(stderr, "mksp: Can't store\n");
- exit(1);
- }
- }
- else {
- if (vflag)
- fprintf(stderr, "reusing: %s\n", word);
- mk_key(key, s, delete);
- if (REPLACE(dbm_key, dbm_content) == -1) {
- fprintf(stderr, "mksp: Can't replace\n");
- exit(1);
- }
- }
- count++;
- if (vflag > 1)
- fprintf(stderr, "%5d: %s(%d)\n", count, word, ex_count(key));
- if (vflag && (count % VFREQ) == 0)
- fprintf(stderr, "%5d: %s\n", count, word);
- }
- if (vflag)
- fprintf(stderr, "%d words\n", count);
- DBMCLOSE();
- }
-
- /*
- * Print out everything
- */
- prcontents(name)
- char *name;
- {
- register int s;
- datum dbm_key, dbm_content;
-
- if (DBMINIT(name, O_RDONLY) == -1)
- exit(1);
-
- dbm_key = FIRSTKEY();
- while (dbm_key.dptr != NULL) {
- dbm_content = FETCH(dbm_key);
- if (dbm_content.dptr == 0)
- break; /* ??? */
-
- if (vflag)
- printf("%3d. ", ex_count((key_t *) dbm_key.dptr));
- if (IS_DELETED(dbm_content)) {
- if (vflag)
- printf("(deleted)\n");
- }
- else {
- s = ex_soundex((key_t *) dbm_key.dptr);
- printf("%s\n", mk_word(dbm_content.dptr, dbm_content.dsize, s));
- }
- dbm_key = NEXTKEY(dbm_key);
- }
- DBMCLOSE();
- }
-
- /*
- * When words are deleted they must be marked as such rather than deleted
- * using DELETE(). This is because the sequence of counters must remain
- * continuous. If DELETE() is used then any entries with the same soundex
- * but with a larger counter value would not be accessible. This approach
- * does cost some extra space but if an addition is made to the chain then
- * a deleted counter slot will be reused. Also, the storage used by the word
- * should be made available to dbm. This could be improved somewhat
- * by actually using DELETE() on the last entry of the chain.
- */
- deletewords(name)
- char *name;
- {
- register int c, ch, len, s;
- register char *p;
- char word[MAXWORDLEN + 2];
- datum dbm_key, dbm_content;
-
- if (DBMINIT(name, O_RDWR) == -1)
- exit(1);
-
- while (fgets(word, sizeof(word), stdin) != (char *) NULL) {
- len = strlen(word);
- if (word[len - 1] != '\n') {
- fprintf(stderr, "mksp: Word too long: %s", word);
- while ((ch = getchar()) != '\n') /* flush rest of line */
- putc(ch, stderr);
- putc('\n', stderr);
- continue;
- }
- word[--len] = '\0';
- if (len > MAXWORDLEN) {
- fprintf(stderr, "mksp: Word too long: %s\n", word);
- continue;
- }
-
- if ((s = soundex(word, 3)) == BAD_WORD) {
- if (vflag)
- fprintf(stderr, "Bad word: %s\n", word);
- continue;
- }
-
- c = 0;
- while (1) {
- dbm_key.dptr = (char *) key;
- dbm_key.dsize = KEYSIZE;
- mk_key(key, s, c);
- dbm_content = FETCH(dbm_key);
- if (dbm_content.dptr == NULL) {
- if (vflag)
- fprintf(stderr, "Not found: %s\n", word);
- break;
- }
-
- if (!IS_DELETED(dbm_content)) {
- p = mk_word(dbm_content.dptr, dbm_content.dsize, s);
- if (strmatch(word, p) == 0) {
- /*
- * Aside:
- * Since dptr points to static storage it must be reset
- * if we want to retain the old content (content.dptr=word)
- * This took a while to determine...
- * Anyhow, since there is no need to store the old word
- * we free up the space
- */
- dbm_content.dptr = "";
- dbm_content.dsize = 0;
- if (REPLACE(dbm_key, dbm_content) == -1)
- fprintf(stderr, "mksp: delete of '%s' failed\n", word);
- else if (vflag) {
- if (vflag > 1)
- fprintf(stderr, "%d. %s ", c, p);
- fprintf(stderr, "deleted\n");
- }
- break;
- }
- else if (vflag > 1)
- fprintf(stderr, "%d. %s\n", c, p);
- }
- else if (vflag > 1)
- fprintf(stderr, "%d. (deleted)\n", c);
-
- if (++c > MAXCOUNT) {
- ch = isupper(word[0]) ? tolower(word[0]) : word[0];
- fprintf(stderr, "mksp: Counter overflow\n");
- fprintf(stderr, "soundex: %c%d\n", ch, b10(digit_part));
- exit(1);
- }
- }
- }
- DBMCLOSE();
- }
-
- /*
- * Setup the dictionary files if necessary
- */
- setup(name)
- char *name;
- {
- register int s1, s2;
-
- s1 = check_dict(name, ".dir");
- s2 = check_dict(name, ".pag");
- if (s1 == NEW_DICT && s2 == NEW_DICT)
- return(NEW_DICT);
- return(OLD_DICT);
- }
-
- /*
- * Check if a dictionary file exists:
- * - if not, try to create it
- * - if so, see if it is empty
- * Return NEW_DICT if an empty file exists,
- * OLD_DICT if a non-empty file exists
- * Default mode for new files is 0666
- */
- check_dict(name, ext)
- char *name, *ext;
- {
- register int len, s;
- char *filename;
- struct stat statbuf;
- char *malloc();
- extern int errno;
-
- len = strlen(name) + strlen(ext) + 1;
- filename = (char *) malloc((unsigned) len);
- if (filename == (char *) NULL) {
- fprintf(stderr, "mksp: Can't malloc '%s.%s'\n", name, ext);
- exit(1);
- }
- sprintf(filename, "%s%s", name, ext);
- if (stat(filename, &statbuf) == -1) {
- if (errno != ENOENT) {
- perror("mksp");
- exit(1);
- }
- if (creat(filename, 0666) == -1) {
- perror("mksp");
- exit(1);
- }
- s = NEW_DICT;
- }
- else {
- if (statbuf.st_size == 0)
- s = NEW_DICT;
- else
- s = OLD_DICT;
- }
- return(s);
- }
-
- /*
- * Compute an 'n' digit Soundex code for 'word'
- * As a side effect, leave the digit part of the soundex in digit_part
- *
- * Since the soundex can be considered a base 7 number, if 'n' is:
- * 3 require 9 (10 if base 10) bits for digits
- * 4 require 12 (13) bits
- * 5 require 15 (17) bits
- * 6 require 17 (20) bits
- *
- * The three slightly different versions of this routine should be coalesced.
- */
- soundex(word, n)
- register char *word;
- int n;
- {
- register int c, soundex_length, previous_code;
- register char *p, *w;
- char wcopy[MAXWORDLEN + 2];
-
- if (!IS_VALID(word))
- return(-1);
-
- strcpy(wcopy, word);
- p = w = wcopy;
-
- while (*p != '\0') {
- if (isupper(*p))
- *p = tolower(*p);
- p++;
- }
-
- digit_part = 0;
- soundex_length = 0;
- previous_code = soundex_code_map[*w - 'a'];
- for (p = w + 1; *p != '\0' && soundex_length < n; p++) {
- if (!isalpha(*p))
- continue;
- c = soundex_code_map[*p - 'a'];
- if (c == 0 || previous_code == c) {
- previous_code = c;
- continue;
- }
- digit_part = digit_part * 7 + c;
- previous_code = c;
- soundex_length++;
- }
- while (soundex_length++ < n)
- digit_part *= 7;
- return((digit_part << 5) + *w - 'a');
- }
-
- b10(n)
- int n;
- {
- register int b10, s;
-
- for (b10 = 0, s = 1; n != 0; n /= 7) {
- b10 += (n % 7) * s;
- s *= 10;
- }
- return(b10);
- }
-
-