/*
 * Archive provenance (retained from the listing page this copy came from):
 *   Collection: Education Sampler 1992 [NeXTSTEP]
 *   Path:       Education_1992_Sampler.iso / Programming / Source / WAIS / ir / hutil.h
 *   Listing:    C/C++ Source or Header | 1992-02-02 | 2.7 KB | 73 lines
 */

/* WIDE AREA INFORMATION SERVER SOFTWARE: No guarantees or restrictions. See the readme file for the full standard disclaimer. Brewster@think.com */ /* Hash table utilities */ #ifndef HUTIL_H #define HUTIL_H #include "irlex.h" /* for MAX_WORD_LENGTH */ /* this is the size of the memory word hashtable. * It will be grown if needed. */ #define HASHTABLE_INITIAL_SIZE 65536L /* the amount of memory for word occurances (bytes) */ #define WORD_MEMORY_INIT_BLOCK_SIZE 10 /* this is the maximum number of occurances that will be stored in the * disk table. The number of occurances will reflect the total number in * all files. The theory is that if a word is very common, then it * is not very useful in descriminating between files. Also, if it * is very common, then it takes up alot of space. * Maybe this should be dependent on the number of documents indexed. * Therefore if a word is in every document, then it probably does not mean * much. * In increasing this, it may not keep all the references in the * inverted file because the max length of an index block is governed * by a size that can be represented in INDEX_BLOCK_SIZE_SIZE bytes. */ #define MAX_OCCURANCES 20000L #define STOP_WORD_FLAG 0x40000000 /* this is a flag to be put in the number_of_occurances field of a word_entry so that it is always greater than the limit and no words will be collected. */ typedef struct word_entry{ char word[MAX_WORD_LENGTH + 1]; /* NULL when empty. Must be the first slot */ /* should this be the total weight? 
*/ long hash_code; long number_of_occurances; /* total for the whole db */ char* memory_ptr; /* what will go into the next block */ char* current_memory_ptr; /* the fill ptr into memory_ptr */ long memory_size; /* the size of memory_ptr */ long current_doc_id; /* the last document-id in memory_ptr * this will change a page pointer eventually */ } word_entry; typedef struct word_memory_hashtable{ long size; /* number of elements that can be in the contents */ long word_entry_block_size; /* the maximum number of entries before flushing */ long number_of_entries; /* number of elements that are in the * contents. */ word_entry** contents; /* pointer to the word hashtable memory */ word_entry *word_entry_block; /* pointer block of entries */ long number_of_words_indexed; /* total number of words indexed */ long flush_after_n_words; /* number of words that should be accumulated * before flushing. This should be dynamically * handled rather than this way. */ double growth_factor; /* amount to grow when growing */ double grow_when_this_full; /* fraction of full that triggers growth */ } word_memory_hashtable; #endif /* ndef HUTIL_H */