home *** CD-ROM | disk | FTP | other *** search
- /*
- * TEA database builder
- *
- * Version 0.90
- *
- * Usage: TEABUILD <ascii_input_file>
- */
-
- #include <stdio.h>
- #include <string.h>
- #include <ctype.h>
- #include <stdlib.h>
-
/* Minimal boolean type; True is simply the logical negation of False. */
typedef int Boolean;
#define False (0)
#define True (!False)

/* Size of the general-purpose text buffers (input lines, file names). */
#define BUFMAX 1024

/* Bounds on the length of an entry, counted in letters only. */
#define LENMIN 1
#define LENMAX 40

/*
 * Largest number of unchanged letters that may be skipped before a
 * letter has to be written to the output regardless.
 */
#define OFFSETMAX 0x7

/* Output files are named "<OUTPUT_PREFIX>.<entry length>". */
#define OUTPUT_PREFIX "words"

/*
 * Codes for the non-letter "modifiers".  In ASCII these are the four
 * characters that directly follow 'z', so every modifier compares
 * greater than or equal to CODE_APOSTROPHE and every lower case letter
 * compares less than it.
 */
#define CODE_SPACE '~'
#define CODE_HYPHEN '}'
#define CODE_CAP '|'
#define CODE_APOSTROPHE '{'
-
- /*
- * Cache of the encode state for all the different entry lengths required.
- * The program can work with fewer file handles than the number of output
- * files it is writing, if necessary.
- */
- typedef struct _encodeState {
- struct _encodeState *next;
- struct _encodeState *prev;
- FILE *fp;
- Boolean created;
- char frame[LENMAX+1];
- int offset;
- } EncodeState;
-
- EncodeState encodeStateCache[LENMAX];
- struct _encodeState *encodeStateCacheHead;
-
/*
 * Output a single encoded character or modifier.
 *
 * fp      stream to write to
 * code    character to encode; it is stored relative to 'a', so it is
 *         expected to be 'a' or greater
 * offset  value stored in the low three bits of the byte
 *
 * The byte written is  offset | ((code - 'a') << 3).  Any write failure
 * is fatal: a message is printed and the program exits with status 1.
 */
static void output(FILE *fp, int code, int offset)
{
    int encoded = offset | ((code - 'a') << 3);

    if (fputc(encoded, fp) == EOF)
    {
        fprintf(stderr, "Error writing output file\n");
        exit(1);
    }
}
-
- static void openall()
- {
- /*
- * Initialise all the encode state structures
- */
- EncodeState *esp;
-
- encodeStateCacheHead = NULL;
-
- for (esp = encodeStateCache; esp < (encodeStateCache+LENMAX); esp++)
- {
- esp->next = NULL;
- esp->prev = NULL;
- esp->fp = NULL;
- esp->created = False;
- memset(esp->frame, '\0', sizeof(esp->frame));
- esp->offset = 0;
- }
- }
-
- static EncodeState *encache(len)
- int len;
- {
- /*
- * Ensure the output file associated with entries of length 'len'
- * is open and return a pointer to its encode state.
- */
- EncodeState *esp = &encodeStateCache[len-1], *espOld;
- char *type;
- char outputname[BUFMAX];
-
- if (esp->fp == NULL)
- {
- /* file must be (re-)opened */
-
- /* get output file name */
- sprintf(outputname, "%s.%d", OUTPUT_PREFIX, len);
-
- /* get output file open mode */
- type = esp->created ? "ab" : "wb";
- esp->created = True;
-
- /* try opening the file; on error, close the file handle
- for the least recently used entry length and try again */
- while ((esp->fp = fopen(outputname, type)) == NULL)
- {
- if (encodeStateCacheHead == NULL)
- {
- fprintf(stderr, "Couldn't open \"%s\"\n", outputname);
- exit(1);
- }
-
- espOld = encodeStateCacheHead->prev;
- fclose(espOld->fp);
-
- if (espOld == encodeStateCacheHead)
- {
- /* only entry in cache */
- encodeStateCacheHead = NULL;
- espOld->next = NULL;
- espOld->prev = NULL;
- esp->fp = NULL;
- }
- else
- {
- /* other entries in cache */
- espOld->prev->next = espOld->next;
- espOld->next->prev = espOld->prev;
- espOld->next = NULL;
- espOld->prev = NULL;
- espOld->fp = NULL;
- }
- }
-
- /* add the entry to the front of the cache */
- if (encodeStateCacheHead == NULL)
- {
- esp->next = esp;
- esp->prev = esp;
- }
- else
- {
- esp->next = encodeStateCacheHead;
- esp->prev = encodeStateCacheHead->prev;
- }
- encodeStateCacheHead = esp;
-
- esp->next->prev = esp;
- esp->prev->next = esp;
- }
- else if (esp != encodeStateCacheHead)
- {
- /* remove entry from current position */
- esp->prev->next = esp->next;
- esp->next->prev = esp->prev;
-
- /* add entry at front of cache */
- esp->next = encodeStateCacheHead;
- esp->prev = encodeStateCacheHead->prev;
- encodeStateCacheHead = esp;
-
- esp->next->prev = esp;
- esp->prev->next = esp;
- }
-
- return(esp);
- }
-
- static void closeall()
- {
- /*
- * Flush the encode states through. A special case must be dealt
- * with where the alphabetic characters of the last word are identical
- * to those of the penultimate word and the last word has no
- * "modifiers" (space, hyphen, apostrophe, shift). In this case,
- * a "redundant" character must be output to ensure the last word
- * is unambigously represented in the output file.
- */
- EncodeState *esp;
- int i;
-
- for (i = 0; i < LENMAX; i++)
- {
- if ( encodeStateCache[i].created
- && encodeStateCache[i].offset > i)
- {
- /* last entry must be flushed through */
- esp = encache(i+1);
- output(esp->fp, esp->frame[0], 0);
- }
- }
-
- /* close any open files */
- if ((esp = encodeStateCacheHead) != NULL)
- {
- do
- {
- fclose(esp->fp);
- esp = esp->next;
- } while (esp != encodeStateCacheHead);
- }
- }
-
- static void encode(entry)
- char *entry;
- {
- /*
- * Add a single entry to the appropriate output file
- */
- char word[BUFMAX], *wp;
- int cursor, wordlength;
- EncodeState *esp;
-
- /* convert the word into lower case + modifiers */
- wp = word;
- wordlength = 0;
- for ( ; *entry != '\0'; entry++)
- {
- if (*entry == ' ')
- {
- *wp++ = CODE_SPACE;
- }
- else if (*entry == '\'')
- {
- *wp++ = CODE_APOSTROPHE;
- }
- else if (*entry == '-')
- {
- *wp++ = CODE_HYPHEN;
- }
- else if (isalpha(*entry))
- {
- if (isupper(*entry))
- {
- *wp++ = CODE_CAP;
- *wp++ = tolower(*entry);
- }
- else
- {
- *wp++ = *entry;
- }
- wordlength++;
- }
- }
- *wp = '\0';
-
- if (wordlength <= LENMAX)
- {
- /* ensure the handle to the output file is available */
- esp = encache(wordlength);
-
- cursor = 0;
- for (wp = word; *wp != '\0'; wp++)
- {
- /* check if a modifier needs to be output */
- if (*wp >= CODE_APOSTROPHE)
- {
- if (cursor < wordlength)
- {
- output(esp->fp, *wp, esp->offset);
- esp->offset = 0;
- }
- }
- /* check if a letter need to be output */
- else if ( (*wp != esp->frame[cursor])
- || (esp->offset == OFFSETMAX) )
- {
- output(esp->fp, *wp, esp->offset);
- esp->offset = 0;
-
- esp->frame[cursor++] = *wp;
- }
- else
- {
- esp->offset++;
- cursor++;
- }
- }
- }
- else
- {
- fprintf(stderr, "Skipping long entry\n");
- }
- }
-
- main(argc, argv)
- int argc;
- char *argv[];
- {
- char linebuf[BUFMAX], *lbp;
- FILE *ifp;
-
- /* check usage */
- if (argc != 2)
- {
- fprintf(stderr, "Usage: TEABUILD <ascii_input_file>\n");
- exit(1);
- }
-
- /* open ASCII input file */
- if ((ifp = fopen(argv[1], "r")) == NULL)
- {
- fprintf(stderr, "Couldn't open \"%s\" for input\n", argv[1]);
- exit(1);
- }
-
- /* initialise the encode state cache */
- openall();
-
- /* do a single pass of the input file, presenting each line
- as an entry to be encoded */
- while (fgets(linebuf, sizeof(linebuf), ifp) != NULL)
- {
- if ((lbp = strchr(linebuf, '\n')) == NULL)
- {
- fprintf(stderr, "Skipping long entry\n");
- }
- else
- {
- /* delete the newline */
- *lbp = '\0';
- encode(linebuf);
- }
- }
-
- /* flush the encode state cache */
- closeall();
-
- return(0);
- }
-