home *** CD-ROM | disk | FTP | other *** search
- /* WIDE AREA INFORMATION SERVER SOFTWARE:
- No guarantees or restrictions. See the readme file for the full standard
- disclaimer.
-
- Brewster@think.com
- */
-
- /* This file defines the files of an inverted file index.
- *
- * This structure is designed to be flexible rather than particularly
- * optimized for speed or space.
- * Thus this organization can support:
- * boolean, proximity, weights, and relevance feedback.
- *
- * Ported directly from the Lisp version 1.2 of the search engine.
- *
- * -brewster 6/90
- */
-
-
- /* ==================== */
- /* === Change Log === */
- /*Created 12/4/89 Brewster full lisp version
- *split from ir-engine 1/11/90 brewster
- *
- *added memory indexing for efficiency
- *added variable index block sizes
- *5/90 ported to C
- *5/90 split from irbuild.c
- *7/90 declared truename() a static function - HWM
- *7/90 changed filename table and headline table to be null
- * terminated in the file rather than \newline.
- * compatibility problems between systems (sigh).
- * -brewster
- *7/90 added field to document table for WAIStation
- * -brewster
- *3/91 took out utilities and created futil.c -brewster
- *3/91 took out the inverted file and created irinv.c -brewster
- */
- /* ==================== */
-
- /* ==================== */
- /* To Do list
- *
- * Implement a filename hashtable so that we can test quickly when
- * a file has been indexed.
- * Free up all memory when we can.
- * Implement logarithmic merging
- *
- */
-
- /* change log
- * 7/90 fixed: BUG: when adding words to the word disk hashtable, watch out
- * for the end of the file and wrap. If it is full, error out.
- */
-
- /* A specification for this is called ir-engine.text in microsoft word. */
-
- #include <string.h> /* for memset() */
-
- #include "cutil.h"
- #include "irfiles.h"
- #include "panic.h"
- #include "ustubs.h" /* for strstr */
- #include "futil.h"
- #include "sockets.h"
- #include "version.h"
-
- #define PRINT_AS_INDEXING false /* also defined in irtfiles.c and irhash.c */
-
- /* ------------------------------- */
- #define DOC_TAB_HEADER_SIZE 2
- #define DOC_TAB_MAXIMUM_ENTRIES 8192
- #define DOC_TAB_ENTRY_FILENAME_ID_SIZE 3
- #define DOC_TAB_ENTRY_START_CHAR_SIZE 4
- #define DOC_TAB_ENTRY_END_CHAR_SIZE 4
- #define DOC_TAB_ENTRY_HEADLINE_ID_SIZE 3
- #define DOC_TAB_ENTRY_DOC_LENGTH_SIZE 4
- #define DOC_TAB_ENTRY_NUM_LINES_SIZE 3
- #define DOC_TAB_ENTRY_DATE_SIZE 4
- #define DOC_TAB_ELEMENT_SIZE 25 /* sum of above sizes */
-
- #define DICTIONARY_ENTRY_SIZE 29 /* sum of MAX_WORD_LENGTH, 1 ('\0'),
- NEXT_INDEX_BLOCK_SIZE and
- NUMBER_OF_OCCURANCES_SIZE */
-
-
- #define FILENAME_TABLE_HEADER_SIZE 4
- #define HEADLINE_TABLE_HEADER_SIZE 4
-
- #define FILE_WRITE_DATE_SIZE 4
- #define NUMBER_OF_OCCURANCES_SIZE 4
- #define DOCUMENT_SCORE_LIMIT_SIZE 1
- #define DOCUMENT_SCORE_LIMIT 255 /* this is computed from DOCUMENT_SCORE_LIMIT_SIZE */
-
- /*============================
- === Database support ===
- ============================*/
-
- database*
- openDatabase(name,initialize,for_search)
- char* name;
- boolean initialize;
- boolean for_search;
- {
- /* open a database (open all its files), and return an opaque object.
- return NULL if there is an error
- */
- char file[MAX_FILE_NAME_LEN];
- char open_mode[4];
- database* db = (database*)s_malloc((size_t)sizeof(database));
-
- if (for_search == true)
- strncpy(open_mode,"rb",3); /* read only for searching */
- else
- strncpy(open_mode,"r+b",4); /* read/write for building */
-
- if (db == NULL)
- { waislog(WLOG_HIGH, WLOG_ERROR,
- "can't make a database, out of memory\n");
- return(NULL);
- }
-
- db->database_file = s_strdup(name);
- if (db->database_file == NULL)
- {
- waislog(WLOG_HIGH, WLOG_ERROR,
- "can't make a database, out of memory\n");
- disposeDatabase(db);
- return(NULL);
- }
-
- if(initialize == true){
- initialize_index_files(db);
- }
- else {
- db->dictionary_stream =
- s_fopen(dictionary_filename(file, db),open_mode);
- if (db->dictionary_stream == NULL)
- { waislog(WLOG_HIGH, WLOG_ERROR,"can't open the word hash file %s\n", file);
- disposeDatabase(db);
- return(NULL);
- }
-
- db->filename_table_stream =
- s_fopen(filename_table_filename(file, db),open_mode);
- if (db->filename_table_stream == NULL)
- { waislog(WLOG_HIGH, WLOG_ERROR,"can't open the filename file %s\n", file);
- disposeDatabase(db);
- return(NULL);
- }
-
- db->headline_table_stream =
- s_fopen(headline_table_filename(file, db),open_mode);
- if (db->headline_table_stream == NULL)
- { waislog(WLOG_HIGH, WLOG_ERROR,"can't open the headline file %s\n", file);
- disposeDatabase(db);
- return(NULL);
- }
-
- db->document_table_stream =
- s_fopen(document_table_filename(file, db),open_mode);
- if (db->document_table_stream == NULL)
- { waislog(WLOG_HIGH, WLOG_ERROR,"can't open the document id file %s\n", file);
- disposeDatabase(db);
- return(NULL);
- }
-
- /* initialize the allocated entries variable */
- s_fseek(db->document_table_stream, 0L, SEEK_END);
- db->doc_table_allocated_entries =
- (ftell(db->document_table_stream) - DOC_TAB_HEADER_SIZE)
- / DOC_TAB_ELEMENT_SIZE;
- if(for_search){
- db->index_stream = s_fopen(index_filename(file, db),open_mode);
- if (db->index_stream == NULL)
- { waislog(WLOG_HIGH, WLOG_ERROR,"can't open the inverted index file %s\n", file);
- disposeDatabase(db);
- return(NULL);
- }
- }
- else{
- /* this is now done separately to be able to control the size: */
- /* init_word_memory_hashtable(HASHTABLE_INITIAL_SIZE, db); */
- }
- }
-
- if(db->dictionary_stream != NULL){
- db->dictionary_size =
- file_length(db->dictionary_stream) / DICTIONARY_ELEMENT_SIZE;
- }
- db->index_file_number = 0;
- return(db);
- }
-
-
- void
- closeDatabase(db)
- database* db;
- /* close a database and all its files. Do not dispose of the structure. */
- {
- if (db == NULL)
- return;
- close_dictionary_file(db);
- if (db->dictionary_stream != NULL)
- s_fclose(db->dictionary_stream);
- if (db->filename_table_stream != NULL)
- s_fclose(db->filename_table_stream);
- if (db->headline_table_stream != NULL)
- s_fclose(db->headline_table_stream);
- if (db->document_table_stream != NULL)
- s_fclose(db->document_table_stream);
- if (db->index_stream != NULL)
- s_fclose(db->index_stream);
- }
-
- void
- disposeDatabase(db)
- database* db;
- {
- closeDatabase(db);
- s_free(db->database_file);
- s_free(db);
- }
-
- /* ==================================== */
- /* === Initialization of the files === */
- /* ==================================== */
-
- #define BLOCK_SIZE 16384 /* size of blocks of zeros to write to a file */
-
- static FILE* initialize_file _AP((long size,char* filename,boolean zero_it));
-
- static FILE* initialize_file(size,filename,zero_it)
- long size;
- char* filename;
- boolean zero_it;
- /* initializes a file by opening a new stream, making it the right
- * size and returning the stream.
- */
- {
- FILE* file = NULL;
- long i;
-
- #ifdef ANSI_LIKE
- remove(filename);
- #endif
-
- file = s_fopen(filename, "wb");
- if(NULL == file){
- panic("The file %s could not be opened\n", filename);
- }
-
- if(zero_it){
- if(size >= BLOCK_SIZE){ /* then write big blocks of zeros */
- char* zeros = NULL;
- zeros = (char*)s_malloc((size_t)BLOCK_SIZE);
- if(NULL == zeros){
- panic("Could not allocate a large block of Zeros\n");
- }
- memset(zeros, 0, BLOCK_SIZE);
- while(size >= BLOCK_SIZE){
- /* then write big blocks of zeros */
- if(BLOCK_SIZE != fwrite(zeros, 1, BLOCK_SIZE, file))
- panic("Write failed");
- size = size - BLOCK_SIZE;
- }
- s_free(zeros);
- }
- for(i = 0; i < size; i++){ /* clean up the rest */
- putc('\0', file);
- }
- }
- else{ /* dont zero it */
- grow_file(file, size);
- }
-
- #ifdef THINK_C
- /* set the mac file type to INDX */
- setFileType(filename, WAIS_INDEX_FILE_TYPE, CREATOR);
- #endif /* THINK_C */
-
- s_fclose(file);
- file = s_fopen(filename, "r+b"); /* open it in read/write */
- if(NULL == file){
- panic("Error in initialization, can not reopen %s.\n", filename);
- }
- return(file);
- }
-
- void initialize_index_files (db)
- database* db;
- /* This creates new index files, deleting any old ones. */
- {
- char file[MAX_FILENAME_LEN];
-
- /* cprintf(PRINT_AS_INDEXING, "initializing index files: %s\n", db->database_file); */
-
- remove(dictionary_filename(file, db)); /* remove the old one */
-
- remove(index_filename(file, db)); /* remove the old one */
- db->index_stream = NULL;
-
- db->doc_table_allocated_entries = 1; /* the 0th is the null pointer */
- db->document_table_stream =
- initialize_file((DOC_TAB_HEADER_SIZE + DOC_TAB_ELEMENT_SIZE),
- document_table_filename(file, db), TRUE);
- db->filename_table_stream =
- initialize_file(FILENAME_TABLE_HEADER_SIZE,
- filename_table_filename(file, db), TRUE);
- db->headline_table_stream =
- initialize_file(HEADLINE_TABLE_HEADER_SIZE,
- headline_table_filename(file, db), TRUE);
- }
-
- /* ========================= */
- /* === Dictionary File === */
- /* ========================= */
-
- /* The dictionary file is a 1 deep tree of blocks.
- The header of the file says how long the header block is.
- The "header block" is a set of pointers to the heads of
- the blocks in the dictionary.
-
- A dictionary block is a list of word and pointer pairs. The words
- are padded to a fixed length so that it is a fixed length record.
- The pointers are pointers into the inverted file (except in the header
- block where they are pointers into the dictionary file).
- */
-
- /* SEARCHING DICTIONARY FILES */
-
- /* top level function:
- long look_up_word_in_dictionary(char *word,database* db)
- */
-
- unsigned char *dictionary_header_block = NULL; /* the dictionary header.
- loaded once */
-
- long number_of_dictionary_blocks = 0; /* also the length of the dictionary
- header block */
-
- unsigned char *dictionary_block = NULL; /* this is one of the dict blocks */
-
- void close_dictionary_file(db)
- database *db;
- {
- if(dictionary_header_block) s_free(dictionary_header_block);
- dictionary_header_block = NULL;
- }
-
-
- static long fread_from_stream _AP((FILE* stream,unsigned char* buf,
- long nbytes));
-
/* Reads exactly nbytes bytes from stream into buf, looping over short
 * reads.
 *
 * Returns nbytes on success (0 when nbytes <= 0),
 *         -1 if the stream reports a read error,
 *         -2 if end of file is reached before nbytes bytes were read.
 *
 * fread() returns a size_t item count and never -1, so the old test for
 * -1 could not fire and read errors were reported as end of file; the
 * stream's error indicator is consulted instead.  Both failure values
 * remain negative, which is all the callers in this file test for.
 */
static long fread_from_stream(stream,buf,nbytes)
FILE *stream;
unsigned char *buf;
long nbytes;
{
  long total_read = 0;

  while(total_read < nbytes){
    size_t got = fread(buf + total_read, sizeof(char),
                       (size_t)(nbytes - total_read), stream);
    if(0 == got){
      if(ferror(stream))
        return(-1);             /* read error */
      return(-2);               /* eof before the request was satisfied */
    }
    total_read += (long)got;
  }
  return(total_read);
}
-
- char *dictionary_block_word(i,block)
- long i;
- unsigned char *block;
- /* returns the word field in the ith dictionary block entry */
- {
- return((char *)(block + (i * DICTIONARY_ENTRY_SIZE)));
- }
-
- long dictionary_block_position(i,block)
- long i;
- unsigned char *block;
- /* returns the position field in the ith dictionary block entry */
- {
- /* printf("dictionary_block_position %ld\n",
- read_bytes_from_memory
- (NEXT_INDEX_BLOCK_SIZE,
- block + (i * DICTIONARY_ENTRY_SIZE) +
- MAX_WORD_LENGTH + 1)); */
- return(read_bytes_from_memory
- (NEXT_INDEX_BLOCK_SIZE,
- block + (i * DICTIONARY_ENTRY_SIZE) +
- MAX_WORD_LENGTH + 1));
- }
-
- long dictionary_block_word_occurances(i,block)
- long i;
- unsigned char *block;
- /* returns the occurances field in the ith dictionary block entry */
- {
- return(read_bytes_from_memory
- (NEXT_INDEX_BLOCK_SIZE,
- block + (i * DICTIONARY_ENTRY_SIZE) +
- MAX_WORD_LENGTH + 1 + NEXT_INDEX_BLOCK_SIZE));
- }
-
- static long find_pointer_in_block _AP((char* word,unsigned char* block,
- long block_length));
-
/* Linear search for `word` in a dictionary block of block_length entries
 * (entries, not bytes).  Scanning stops at the first empty entry.
 *
 * Returns the position field of the matching entry if the word is
 *           present exactly,
 *         minus the position field of the entry just before where the
 *           word would go if it is not present,
 *         0 if the word sorts before the first entry or the block is
 *           empty (this overlap between "error" and "nothing" is a known
 *           wart of the design).
 *
 * This could be a binary search.
 */
static long find_pointer_in_block(word,block,block_length)
char *word;
unsigned char *block;
long block_length; /* in entries */
{
  long slot = 0;

  while(slot < block_length){
    char *candidate = dictionary_block_word(slot, block);
    long order;

    if('\0' == candidate[0])
      break;                    /* ran into the unused tail of the block */

    order = strcmp(candidate, word);
    if(0 == order)
      return(dictionary_block_position(slot, block));
    if(order > 0)
      break;                    /* passed the place where word would be */
    slot++;
  }

  /* not found: slot is the index of the first entry that is not smaller
     than word (or one past the last used entry) */
  if(0 == slot)
    return(0);
  return(- dictionary_block_position(slot - 1, block));
}
-
- unsigned char *read_dictionary_block(block,position,length,stream)
- unsigned char *block;
- long position;
- long length;
- FILE *stream;
- /* reads the dictionary block from the disk and returns it.
- block is the place to put it, if it is NULL, then it is malloc'ed.
- position is the position in the dictionary file to start reading.
- length is th enumber of entries (not bytes) in the block.
- stream is the dictionary stream.
-
- it returns NULL if it loses.
- */
-
- {
- if(NULL == block)
- block = (unsigned char *)s_malloc((size_t)(length * DICTIONARY_ENTRY_SIZE));
- s_fseek(stream, position, SEEK_SET);
- if(0 > fread_from_stream(stream, block, (length * DICTIONARY_ENTRY_SIZE))){
- waislog(WLOG_HIGH, WLOG_ERROR,
- "Could not read the dictionary block %ld, length %ld",
- block, length);
- return(NULL);
- }
- return(block);
- }
-
- long
- look_up_word_in_dictionary(word,db)
- char *word;
- database* db;
- /* looks up the word in the dictionary file. Returns the pointer
- into the inverted file or negative number if not found,
- or 0 if error.
- */
- {
- FILE *stream = db->dictionary_stream;
- long dictionary_block_pos;
-
- if(NULL == dictionary_header_block)
- {
- s_fseek(stream, 0L, SEEK_SET);
- number_of_dictionary_blocks = read_bytes(DICTIONARY_HEADER_SIZE,stream);
- dictionary_header_block =
- read_dictionary_block(dictionary_header_block,DICTIONARY_HEADER_SIZE,
- number_of_dictionary_blocks,stream);
- if(NULL == dictionary_header_block)
- { waislog(WLOG_HIGH, WLOG_ERROR,
- "Could not read dictionary header block in db %s.",
- db->database_file);
- return(0);
- }
- }
-
- dictionary_block_pos =
- find_pointer_in_block(word, dictionary_header_block,
- number_of_dictionary_blocks);
- if(0 == dictionary_block_pos)
- { /* waislog(WLOG_HIGH, WLOG_ERROR, "Could not find pointer for word '%s' (location %ld) in block in db %s!",
- word, word, db->database_file); */
- return(-1); /* not an error, necessarily if the word is before the first entry */
- }
-
- dictionary_block =
- read_dictionary_block(dictionary_block,ABS(dictionary_block_pos),
- DICTIONARY_BLOCK_SIZE,stream);
- if(NULL == dictionary_block)
- { waislog(WLOG_HIGH, WLOG_ERROR,
- "Could not read dictionary block %ld in db %s",
- ABS(dictionary_block_pos),
- db->database_file);
- return(0);
- }
-
- return(find_pointer_in_block(word, dictionary_block,DICTIONARY_BLOCK_SIZE));
- }
-
-
- /* BUILDING DICTIONARY FILES */
-
-
- long number_of_dictionary_entries; /* number allocated */
-
- char *block_of_zeros = NULL;
-
- static void write_zeros_to_stream _AP((long n_bytes,FILE* stream));
-
- static void write_zeros_to_stream(n_bytes,stream)
- long n_bytes;
- FILE *stream;
- /* writes zeros to a file quickly */
- {
- long i;
- if(n_bytes >= BLOCK_SIZE){ /* then write big blocks of zeros */
- if(NULL == block_of_zeros){
- block_of_zeros = (char*)s_malloc((size_t)BLOCK_SIZE);
- memset(block_of_zeros, 0, BLOCK_SIZE);
- }
- while(n_bytes >= BLOCK_SIZE){
- /* then write big blocks of zeros */
- if(BLOCK_SIZE !=
- fwrite(block_of_zeros, sizeof(char), BLOCK_SIZE, stream))
- panic("Write failed");
- n_bytes -= BLOCK_SIZE;
- }
- }
- for(i = 0; i < n_bytes; i++){ /* clean up the rest */
- putc('\0', stream);
- }
- }
-
- /* returns 0 if successful */
- long init_dict_file_for_writing(db)
- database *db;
- {
- char filename[MAX_FILENAME_LEN];
- long number_of_blocks;
-
- /*printf("init_dict_file_for_writing\n");*/
- if (db->dictionary_stream == NULL)
- db->dictionary_stream = s_fopen(dictionary_filename(filename, db), "w+b");
-
- number_of_blocks = (db->number_of_words / DICTIONARY_BLOCK_SIZE);
- number_of_blocks += /* must be done on separate lines for XENIX */
- ((0 == (db->number_of_words % DICTIONARY_BLOCK_SIZE)) ? 0 : 1);
- /*printf("number of words = %ld, block size = %ld, number-of blocks = %ld\n",db->number_of_words,DICTIONARY_BLOCK_SIZE,number_of_blocks);*/
-
- init_dict_file_detailed(db->dictionary_stream,number_of_blocks);
- return(0);
- }
-
- void
- init_dict_file_detailed(dictionary_stream,number_of_blocks)
- FILE* dictionary_stream;
- long number_of_blocks;
- {
- /* create space for the table in the front of the file */
- write_zeros_to_stream(DICTIONARY_HEADER_SIZE +
- DICTIONARY_ENTRY_SIZE * number_of_blocks,
- dictionary_stream);
- /* write the number of blocks */
- s_fseek(dictionary_stream, 0L, SEEK_SET);
- write_bytes(number_of_blocks, DICTIONARY_HEADER_SIZE, dictionary_stream);
-
- fseek(dictionary_stream, 0L, SEEK_END);
- number_of_dictionary_entries = 0;
- }
-
- long add_word_to_dictionary(word,position,number_of_occurances,db)
- char *word;
- long position;
- long number_of_occurances;
- database *db;
- /* Puts a word into the dictionary file. */
- {
- /* assumes the streamg has been initialized, and it is positioned
- at the end */
- FILE *stream = db->dictionary_stream;
- char padded_word[MAX_WORD_LENGTH + 1];
-
- memset(padded_word, 0, MAX_WORD_LENGTH + 1); /* clear the word */
- strcpy(padded_word, word);
-
- if(0 == (number_of_dictionary_entries % DICTIONARY_BLOCK_SIZE)){
- /* then add an entry in the header */
- long original_position = s_ftell(stream);
- long header_entry = number_of_dictionary_entries / DICTIONARY_BLOCK_SIZE;
- /* printf("Adding header entry %ld %s original pos %ld\n",
- header_entry, padded_word, original_position); */
- fseek(stream, DICTIONARY_HEADER_SIZE +
- (header_entry * DICTIONARY_ENTRY_SIZE), SEEK_SET);
- if((MAX_WORD_LENGTH + 1) !=
- fwrite(padded_word, sizeof(char), MAX_WORD_LENGTH + 1, stream))
- panic("Write failed");
- write_bytes(original_position, NEXT_INDEX_BLOCK_SIZE, stream);
- write_bytes(0L, NUMBER_OF_OCCURANCES_SIZE, stream);
- fseek(stream, original_position, SEEK_SET); /* go back to the end */
- /* zero the next block */
- write_zeros_to_stream(DICTIONARY_ENTRY_SIZE * DICTIONARY_BLOCK_SIZE,
- stream);
- fseek(stream, original_position, SEEK_SET);
- }
- /* write the word */
- if((MAX_WORD_LENGTH + 1) !=
- fwrite(padded_word, sizeof(char), MAX_WORD_LENGTH + 1, stream))
- panic("Write failed");
- write_bytes(position, NEXT_INDEX_BLOCK_SIZE, stream);
- write_bytes(number_of_occurances, NUMBER_OF_OCCURANCES_SIZE, stream);
- number_of_dictionary_entries++;
- return(0);
- }
-
/* Prints the used entries of a dictionary block (word, position and
 * occurrence count) to stdout, stopping at the first empty entry or after
 * `size` entries.  Debugging aid.
 */
void print_dictionary_block(block,size)
unsigned char *block;
long size;
{
  long entry = 0;

  while(entry < size){
    char *entry_word = dictionary_block_word(entry, block);

    if('\0' == entry_word[0])
      break;                    /* rest of the block is unused */

    printf("Entry %3ld: %21s %7ld %7ld\n", entry, entry_word,
           dictionary_block_position(entry, block),
           dictionary_block_word_occurances(entry, block));
    entry++;
  }
}
-
- void print_dictionary _AP((database* db));
-
- void print_dictionary(db)
- database *db;
- {
- /* prints the contents of a dictionary */
- FILE *stream = db->dictionary_stream;
- long i;
- long new_number_of_dictionary_blocks;
-
- if(NULL == stream)
- panic("dictionary stream is not open");
- s_fseek(stream, 0L, SEEK_SET);
- new_number_of_dictionary_blocks = read_bytes(DICTIONARY_HEADER_SIZE, stream);
- if(new_number_of_dictionary_blocks > number_of_dictionary_blocks)
- dictionary_header_block = NULL;
- number_of_dictionary_blocks = new_number_of_dictionary_blocks;
- printf("Number of dictionary blocks %ld\n", number_of_dictionary_blocks);
- if(NULL == (dictionary_header_block =
- read_dictionary_block(dictionary_header_block,
- DICTIONARY_HEADER_SIZE,
- number_of_dictionary_blocks,
- stream)))
- panic("Could not read dictionary header block");
- printf("The Dictionary Header Block:\n");
- print_dictionary_block(dictionary_header_block, number_of_dictionary_blocks);
- for(i = 0; i < number_of_dictionary_blocks; i++){
- long pos = dictionary_block_position(i, dictionary_header_block);
- if(NULL == (dictionary_block =
- read_dictionary_block(dictionary_block,
- pos, DICTIONARY_BLOCK_SIZE, stream)))
- panic("Could not read dictionary block %ld", pos);
- printf("\n\nDictionary block %ld (position %ld):\n", i, pos);
- print_dictionary_block(dictionary_block, DICTIONARY_BLOCK_SIZE);
- }
- fseek(stream, 0L, SEEK_END);
- }
-
- #ifdef testing
- /* dictionary testing code */
-
- static void check_dictionary_entry _AP((char* word,long expected_position,
- database* db));
-
- static void check_dictionary_entry(word,expected_position,db)
- char *word;
- long expected_position;
- database *db;
- {
- if(expected_position != look_up_word_in_dictionary(word, db)) {
- waislog(WLOG_HIGH, WLOG_ERROR,
- "%s should be %ld is %ld in db %s\n",
- word, expected_position,
- look_up_word_in_dictionary(word, db),
- db->database_file);
- }
- }
-
- static void test_dictionary _AP((database* db));
-
- static void test_dictionary(db)
- database *db;
- /* this is just an trivial test */
- {
-
- db->number_of_words = 3;
- init_dict_file_for_writing(db);
- add_word_to_dictionary("aardvark", 123L, 0l, db);
- add_word_to_dictionary("house", 234L, 0L, db);
- add_word_to_dictionary("mary", 345L, 0L, db);
- fflush(db->dictionary_stream);
- print_dictionary(db);
- check_dictionary_entry("aardvark", 123L, db);
- check_dictionary_entry("house", 234L, db);
- check_dictionary_entry("mary", 345L, db);
- check_dictionary_entry("food", -123L, db);
- check_dictionary_entry("zebra", -345L, db);
- check_dictionary_entry("aaarf", 0L, db);
- }
- #endif /* def testing */
-
-
- /*========================*
- *=== Document Table ===*
- *========================*/
-
- boolean
- read_document_table_entry(doc_entry,number,db)
- document_table_entry* doc_entry;
- long number;
- database* db;
- /* returns a document_table_entry on the stack */
- {
- long position;
- FILE *stream = db->document_table_stream;
-
- position = (DOC_TAB_HEADER_SIZE +
- ((long)number * (long)DOC_TAB_ELEMENT_SIZE));
-
- if (0 != fseek(stream, position, SEEK_SET))
- {
- waislog(WLOG_HIGH, WLOG_ERROR,
- "fseek failed into the document table to position %ld in db %s",
- position,
- db->database_file);
- return(false);
- }
-
- doc_entry->filename_id = read_bytes(DOC_TAB_ENTRY_FILENAME_ID_SIZE,
- stream);
- doc_entry->headline_id = read_bytes(DOC_TAB_ENTRY_HEADLINE_ID_SIZE,
- stream);
- doc_entry->start_character =
- read_bytes(DOC_TAB_ENTRY_START_CHAR_SIZE, stream);
- doc_entry->end_character =
- read_bytes(DOC_TAB_ENTRY_END_CHAR_SIZE, stream);
- doc_entry->document_length =
- read_bytes(DOC_TAB_ENTRY_DOC_LENGTH_SIZE, stream);
- doc_entry->number_of_lines =
- read_bytes(DOC_TAB_ENTRY_NUM_LINES_SIZE, stream);
- doc_entry->date =
- read_bytes(DOC_TAB_ENTRY_DATE_SIZE, stream);
- if (doc_entry->date == EOF) {
- waislog(WLOG_HIGH, WLOG_ERROR,
- "Reading from the Document Table got an error in db %s",
- db->database_file);
- return(false);
- }
-
- return(true);
- }
-
- #ifdef testing
-
- static boolean check_document_id _AP((long doc_id,database* db));
-
- static boolean
- check_document_id(doc_id,db)
- long doc_id;
- database* db;
- /* returns true if that is a valid doc_id (corresponds to a file
- that has not been deleted */
- {
- long position;
- FILE *stream = db->document_table_stream;
- long filename_id;
- char filename[MAX_FILE_NAME_LEN];
-
- position = (DOC_TAB_HEADER_SIZE +
- ((long)doc_id * (long)DOC_TAB_ELEMENT_SIZE));
-
- if (0 != fseek(stream, position, SEEK_SET)) {
- waislog(WLOG_HIGH, WLOG_ERROR,
- "fseek failed into the document table to position %ld in db %s",
- position,
- db->database_file);
- return(false);
- }
-
- filename_id = read_bytes(DOC_TAB_ENTRY_FILENAME_ID_SIZE, stream);
- /* probe the file. Is there a faster way? */
- return(probe_file(read_filename_table_entry(filename_id, filename,
- NULL,
- db)));
- }
- #endif
-
- long write_document_table_entry(doc_table_entry, db)
- document_table_entry* doc_table_entry;
- database* db;
- {
- /* returns the document_id */
- s_fseek(db->document_table_stream,
- (DOC_TAB_HEADER_SIZE +
- (db->doc_table_allocated_entries *
- DOC_TAB_ELEMENT_SIZE)),
- SEEK_SET);
- /* write the pieces */
- write_bytes(doc_table_entry->filename_id,
- DOC_TAB_ENTRY_FILENAME_ID_SIZE,
- db->document_table_stream);
- write_bytes(doc_table_entry->headline_id,
- DOC_TAB_ENTRY_HEADLINE_ID_SIZE,
- db->document_table_stream);
- write_bytes(doc_table_entry->start_character,
- DOC_TAB_ENTRY_START_CHAR_SIZE,
- db->document_table_stream);
- write_bytes(doc_table_entry->end_character,
- DOC_TAB_ENTRY_END_CHAR_SIZE,
- db->document_table_stream);
- write_bytes(doc_table_entry->document_length,
- DOC_TAB_ENTRY_DOC_LENGTH_SIZE,
- db->document_table_stream);
- /* printf("Writing %ld lines\n", document_table_entry->number_of_lines); */
- write_bytes(doc_table_entry->number_of_lines,
- DOC_TAB_ENTRY_NUM_LINES_SIZE,
- db->document_table_stream);
- write_bytes(doc_table_entry->date,
- DOC_TAB_ENTRY_DATE_SIZE,
- db->document_table_stream);
- db->doc_table_allocated_entries++;
- return(db->doc_table_allocated_entries);
- }
-
- long next_document_id(db)
- database* db;
- {
- return(db->doc_table_allocated_entries);
- }
-
-
- /*========================*
- *=== Filename table ===*
- *========================*/
-
- #ifndef MAXPATHLEN /* think_c does not define it for instance */
- #define MAXPATHLEN 2000
- #endif /* MAXPATHLEN */
-
- static char *read_filename_table_stream _AP((long position,
- char* filename,
- char* type,
- time_t* file_write_date,
- FILE *stream));
-
- static char *read_filename_table_stream(position,filename,type,
- file_write_date, stream)
- long position;
- char* filename;
- char* type;
- time_t* file_write_date;
- FILE *stream;
- {
- /* Returns the filename array after side effecting it,
- * or NULL if an error.
- * The type of the file is put in the argument "type". This will
- * not be longer than MAX_FILE_NAME_LEN.
- *
- * if type is NULL then ignore it,
- * if file_write_date is NULL then ignore it,
- * If position is -1, then it does not seek.
- *
- * Leave the file positioned at the start of the next entry.
- */
- long file_write_date_internal;
- char type_internal[MAX_TYPE_LEN];
-
- if(NULL == stream)
- return(NULL);
-
- if(NULL == type) /* this means we do not care, so set up a dummy */
- type = type_internal;
-
- filename[0] = '\0'; /* init to the empty string */
- if(NULL != type)
- type[0] = '\0'; /* init to the empty string */
-
- if(position != -1){
- if (0 != fseek(stream, position, SEEK_SET)){
- waislog(WLOG_HIGH, WLOG_ERROR, "fseek failed into the filename index to position %ld",
- position);
- return(NULL);
- }
- }
- if(false == read_string_from_file(stream, filename, MAX_FILE_NAME_LEN)){
- return(NULL);
- }
- else{
- file_write_date_internal = read_bytes(FILE_WRITE_DATE_SIZE, stream);
- if(file_write_date){
- *file_write_date = (time_t)file_write_date_internal;
- }
- if(false == read_string_from_file(stream, type, MAX_TYPE_LEN)){
- return(NULL);
- }
- }
- return(filename);
- }
-
- char *read_filename_table_entry(position,filename,type,file_write_date,db)
- long position;
- char* filename;
- char* type;
- time_t* file_write_date;
- database* db;
- {
- /* Returns the filename array after side effecting it,
- * or NULL if an error.
- * The type of the file is put in the argument "type". This will
- * not be longer than MAX_FILE_NAME_LEN.
- *
- * if type is NULL then ignore it,
- * if file_write_date is NULL then ignore it,
- * If position is -1, then it does not seek.
- *
- * Leave the file positioned at the start of the next entry.
- */
- FILE *stream = db->filename_table_stream;
- return(read_filename_table_stream(position,filename,type,
- file_write_date,stream));
- }
-
- long write_filename_table_entry(filename,type,db)
- char* filename;
- char *type;
- database* db;
- {
- /* writes the filename (NULL terminated),
- followed by 4 bytes of creation date,
- followed by the file type (NULL terminated),
- Returns the postion of the filename
- */
- long free_position;
- char full_path[MAXPATHLEN];
- s_fseek(db->filename_table_stream, 0L, SEEK_END);
- free_position = ftell(db->filename_table_stream);
- /* add the filename to the hashtable not done yet XXX
- (setf (gethash filename *filename_table_hashtable*)
- (file_write_date filename))
- */
- fprintf(db->filename_table_stream, "%s", truename(filename, full_path));
- fputc(0, db->filename_table_stream);
- if(FILE_WRITE_DATE_SIZE != sizeof(time_t)){ /* check if these are the same */
- panic("We have a problem with the file_write_date_size\n");
- }
- write_bytes((long)file_write_date(filename),
- FILE_WRITE_DATE_SIZE, db->filename_table_stream);
- /* fwrite(type, sizeof(char), strlen(type) + 1, db->filename_table_stream);*/
- fprintf(db->filename_table_stream, "%s",type);
- fputc(0,db->filename_table_stream);
- return(free_position);
- }
-
- /* functions to figure out if the file is in the index already */
-
- static boolean filename_in_filename_stream _AP((char *filename, char *type,
- time_t *file_write_date,
- FILE *stream));
-
- static boolean filename_in_filename_stream(filename, type,
- file_write_date, stream)
- char *filename;
- char *type;
- time_t *file_write_date;
- FILE *stream;
- /* returns true if it is there (and side effects type and
- file_write_date).
- leaves the stream at the end of the file.
- If type or file_write_date is NULL, then it is a dont care.
- type, if it is an array, should be MAX_FILENAME_LEN long at least.
- */
- {
- /* this is slow because it loops through the whole file every time.
- this might want to be optimized by making a hashtable. */
- char next_filename[MAX_FILENAME_LEN];
-
- s_fseek(stream, FILENAME_TABLE_HEADER_SIZE, SEEK_SET);
- while(!feof(stream)){
- char new_type[MAX_FILENAME_LEN];
- if(NULL ==
- read_filename_table_stream(-1, next_filename, new_type,
- file_write_date, stream))
- return(false);
- if(0 == strcmp(next_filename, filename))
- return(true);
- }
- }
-
- boolean filename_in_database(filename,type,file_write_date,db)
- char *filename;
- char *type;
- time_t *file_write_date;
- database *db;
- {
- return(filename_in_filename_stream(filename, type, file_write_date,
- db->filename_table_stream));
- }
-
- /* this caches the last filename that was found to be in the filename file,
- this way repeated attempts to figure out if a file is there will be fast.
- This is the case when retrieving successive blocks of a file. */
- char last_filename_found_in_file[MAX_FILE_NAME_LEN];
- char last_filename_file[MAX_FILE_NAME_LEN];
-
- boolean filename_in_filename_file(filename,type,file_write_date, filename_file)
- char *filename;
- char *type;
- time_t *file_write_date;
- char *filename_file;
- {
- if(NULL == filename)
- return(false);
-
- if(0 == strcmp(last_filename_found_in_file, filename) &&
- 0 == strcmp(last_filename_file, filename_file))
- return(true);
- else
- { FILE *stream = s_fopen(filename_file, "r");
- boolean answer;
-
- if(NULL == stream)
- { s_fclose(stream);
- return(false);
- }
- answer =
- filename_in_filename_stream(filename,type,file_write_date, stream);
- if(answer == true)
- { /* record it in the cache */
- strncpy(last_filename_file, filename_file, MAX_FILE_NAME_LEN);
- strncpy(last_filename_found_in_file, filename, MAX_FILE_NAME_LEN);
- }
- s_fclose(stream);
- return(answer);
- }
- }
-
-
- /*========================*
- *=== Headline Table ===*
- *========================*/
-
- char *read_headline_table_entry(position,db)
- long position;
- database* db;
- /* returns the headline array after side effecting it. Beware that
- * the next call to this function will overwrite the the headline_array
- */
- {
- /* this is the headline that gets returned */
- static char headline_array[MAX_HEADLINE_LEN];
- FILE *stream = db->headline_table_stream;
- headline_array[0] = '\0'; /* init to the empty string */
-
- if (0 != fseek(stream, position, SEEK_SET)) {
- waislog(WLOG_HIGH, WLOG_ERROR,
- "fseek failed into the headline index to position %ld in db %s",
- position, db->database_file);
- return(headline_array);
- }
- if(false == read_string_from_file(db->headline_table_stream,
- headline_array, MAX_FILE_NAME_LEN)){
- waislog(WLOG_HIGH, WLOG_ERROR,
- "headline table is corrupt at %ld in db %s",
- position, db->database_file);
- }
- return(headline_array);
- }
-
- /* writes the string to the file followed by a NULL.
- * The returned number is the position in the file to start reading.
- */
- long write_headline_table_entry(headline,db)
- char* headline;
- database* db;
- {
- /* writes the headline followed by a newline.
- Returns the postion of the headline.
- */
- long free_position;
- s_fseek(db->headline_table_stream, 0L, SEEK_END);
- free_position = ftell(db->headline_table_stream);
-
- fprintf(db->headline_table_stream, "%s", headline);
- fputc(0, db->headline_table_stream);
- return(free_position);
- }
-
-
- /* =================== */
- /* === Source file === */
- /* =================== */
-
- /* the source file is an ascii file for describing a source.
- it is defined in ../doc/source.txt */
-
- /* Registers the src structure with the directory of servers.
- Return true if successful */
- boolean register_src_structure(filename)
- char *filename;
- {
- char string[200];
- long answer;
- /* register the server with the directory of servers */
- printf("Sending source struture to the directory of servers...");
- fflush(stdout);
- sprintf(string,
- "cat %s | mail wais-directory-of-servers@quake.think.com\n",
- filename);
- answer = system(string);
- printf("Done.\n");
- return((answer == 0)?true:false);
- }
-
-
- /* Writes a source structure to a file.
- If the export_database arg is set, then the tcp_port is used in the
- tcp-port slot.
- Returns true if successful. */
- boolean write_src_structure(filename, database_name, typename,
- filenames, count, export_database, tcp_port)
- char *filename;
- char *database_name;
- char *typename;
- char **filenames;
- long count;
- boolean export_database;
- long tcp_port;
- {
- long i;
- char hostname[120];
- struct hostent *h;
-
- #ifndef THINK_C
- #ifndef M_XENIX
-
- FILE *source_stream = s_fopen(filename, "w");
-
- fprintf(source_stream, "(:source \n");
- fprintf(source_stream, " :version 3 \n");
- if(export_database){
- gethostname(hostname, 120);
- h = gethostbyname(hostname);
- if (h != NULL &&
- h->h_addr_list != NULL &&
- h->h_addr_list[0] != NULL) {
- fprintf(source_stream,
- " :ip-address \"%d.%d.%d.%d\"\n",
- (unsigned char)h->h_addr_list[0][0],
- (unsigned char)h->h_addr_list[0][1],
- (unsigned char)h->h_addr_list[0][2],
- (unsigned char)h->h_addr_list[0][3] );
- }
- fprintf(source_stream, " :ip-name \"%s\"\n", hostname );
- fprintf(source_stream, " :tcp-port %ld\n", tcp_port);
- }
- fprintf(source_stream, " :database-name \"%s\"\n", database_name);
- fprintf(source_stream, " :cost 0.00 \n");
- fprintf(source_stream, " :cost-unit :free \n");
- fprintf(source_stream, " :maintainer \"%s\"\n",
- current_user_name());
- fprintf(source_stream, " :description \"Server created with %s on %s by %s\n",
- VERSION, printable_time(), current_user_name());
- if(count > 0){
- fprintf(source_stream, "The files of type %s used in the index were:\n",
- typename);
- for(i = 0; i < count; i++){
- char full_path[MAX_FILENAME_LEN + 1];
- fprintf(source_stream, " %s\n", truename(filenames[i], full_path));
- }
- }
- fprintf(source_stream, "\"\n");
- fprintf(source_stream, ")\n");
- s_fclose(source_stream);
-
- #endif /* ndef M_XENIX */
- #endif /* ndef THINK_C */
-
- return(true);
- }
-
-
-
-
- /*****************************/
- /*** Database support ***/
- /*****************************/
-
- char* dictionary_filename(destination,db)
- char* destination;
- database* db;
- {
- strncpy(destination, db->database_file,MAX_FILE_NAME_LEN);
- s_strncat(destination,dictionary_ext,MAX_FILE_NAME_LEN,MAX_FILE_NAME_LEN);
- return(destination);
- }
-
- char* document_table_filename(destination,db)
- char* destination;
- database* db;
- {
- strncpy(destination, db->database_file,MAX_FILE_NAME_LEN);
- s_strncat(destination,document_table_ext,MAX_FILE_NAME_LEN,MAX_FILE_NAME_LEN);
- return(destination);
- }
-
- char* filename_table_filename(destination,db)
- char* destination;
- database* db;
- {
- strncpy(destination, db->database_file,MAX_FILE_NAME_LEN);
- s_strncat(destination,filename_table_ext,MAX_FILE_NAME_LEN,MAX_FILE_NAME_LEN);
- return(destination);
- }
-
- char* headline_table_filename(destination,db)
- char* destination;
- database* db;
- {
- strncpy(destination, db->database_file,MAX_FILE_NAME_LEN);
- s_strncat(destination,headline_table_ext,MAX_FILE_NAME_LEN,MAX_FILE_NAME_LEN);
- return(destination);
- }
- char* index_filename(destination,db)
- char* destination;
- database* db;
- {
- strncpy(destination, db->database_file,MAX_FILE_NAME_LEN);
- s_strncat(destination,index_ext,MAX_FILE_NAME_LEN,MAX_FILE_NAME_LEN);
- return(destination);
- }
-
- char* index_filename_with_version(version,destination,db)
- long version;
- char* destination;
- database* db;
- {
- sprintf(destination, "%s%s%ld", db->database_file,
- index_ext, version);
- return(destination);
- }
-
-
- char* source_filename(destination,db)
- char* destination;
- database* db;
- {
- strncpy(destination, db->database_file,MAX_FILE_NAME_LEN);
- s_strncat(destination,source_ext,MAX_FILE_NAME_LEN,MAX_FILE_NAME_LEN);
- return(destination);
- }
-
- char*
- get_doc(destination, document_id, db, headline)
- char* destination;
- long document_id;
- database* db;
- boolean headline;
- {
- document_table_entry doc_entry;
- char filename[MAX_FILE_NAME_LEN], type[100];
-
- if (read_document_table_entry(&doc_entry, document_id, db)
- == true){
- read_filename_table_entry(doc_entry.filename_id,
- filename,
- type,
- NULL,
- db);
- if (headline == TRUE)
- sprintf(destination, "%d %d %s, \"%s\"",
- doc_entry.start_character, doc_entry.end_character,
- filename,
- read_headline_table_entry(doc_entry.headline_id,db));
- else
- sprintf(destination, "%d %d %s",
- doc_entry.start_character, doc_entry.end_character,
- filename);
- return(s_strdup(type));
- }
- else return NULL;
- }
-
- long next_doc(destination, docID, db)
- char* destination;
- char* docID;
- database* db;
- {
- long i, start, end;
- char doc[MAX_FILE_NAME_LEN+50], fn[MAX_FILE_NAME_LEN];
- char *type, *loc;
-
- for(i = 0; i < db->doc_table_allocated_entries; i++) {
- if (get_doc(doc, i, db, FALSE) != NULL) {
- if (strcmp(doc, docID) == 0) {
- type = get_doc(doc, i+1, db, TRUE);
- sscanf(doc, "%d %d %s", &start, &end, fn);
- if((loc = strstr(doc, ",")) == NULL) return -1;
- fn[loc-doc] = 0;
- sprintf(destination, "%s, %s", doc, type);
- if( end != 0)
- return(end-start);
- else {
- /* whole file, find file length from the file */
- long size;
- FILE* file = NULL;
- if (((file = s_fopen(fn, "r")) != NULL) &&
- (fseek(file, 0L, SEEK_END) == 0) &&
- ((size = ftell(file)) != -1)) {
- s_fclose(file);
- return(size); /* we are done, bytes is set */
- }
- else {
- s_fclose(file);
- return(-1); /* something went wrong with the file */
- }
- }
- }
- }
- }
- return -1;
- }
-
- long previous_doc(destination, docID, db)
- char* destination;
- char* docID;
- database* db;
- {
- long i, start, end;
- char doc[MAX_FILE_NAME_LEN+50], fn[MAX_FILE_NAME_LEN];
- char *type, *loc;
-
- for(i = 0; i < db->doc_table_allocated_entries; i++) {
- if (get_doc(doc, i, db, FALSE) != NULL) {
- if (strcmp(doc, docID) == 0) {
- if (i != 0) {
- type = get_doc(doc, i-1, db, TRUE);
- sscanf(doc, "%d %d %s", &start, &end, fn);
- if((loc = strstr(doc, ",")) == NULL) return -1;
- fn[loc-doc] = 0;
- sprintf(destination, "%s, %s", doc, type);
- if( end != 0)
- return(end-start);
- else {
- /* whole file, find file length from the file */
- long size;
- FILE* file = NULL;
- if (((file = s_fopen(fn, "r")) != NULL) &&
- (fseek(file, 0L, SEEK_END) == 0) &&
- ((size = ftell(file)) != -1)) {
- s_fclose(file);
- return(size); /* we are done, bytes is set */
- }
- else {
- s_fclose(file);
- return(-1); /* something went wrong with the file */
- }
- }
- }
- }
- }
- }
- return(-1);
- }
-
- long next_docid(docID, db)
- char* docID;
- database* db;
- {
- long i;
- char doc[MAX_FILE_NAME_LEN+50];
-
- for(i = 0; i < db->doc_table_allocated_entries; i++) {
- if (get_doc(doc, i, db, FALSE) != NULL) {
- if (strcmp(doc, docID) == 0) {
- return (i+1);
- }
- }
- }
- return -1;
- }
-
- long previous_docid(docID, db)
- char* docID;
- database* db;
- {
- long i;
- char doc[MAX_FILE_NAME_LEN+50];
-
- for(i = 0; i < db->doc_table_allocated_entries; i++) {
- if (get_doc(doc, i, db, FALSE) != NULL) {
- if (strcmp(doc, docID) == 0) {
- return (i-1);
- }
- }
- }
- return -1;
- }
-