No guarantees or restrictions. See the readme file for the full standard
disclaimer.
Brewster@think.com
*/
/* Hash table utilities */
#ifndef HUTIL_H
#define HUTIL_H
#include "irlex.h" /* for MAX_WORD_LENGTH */
/* Initial size of the in-memory word hashtable.
 * The table will be grown if needed.
 */
#define HASHTABLE_INITIAL_SIZE 65536L
/* The initial amount of memory, in bytes, set aside for a word's
 * occurrences.
 */
#define WORD_MEMORY_INIT_BLOCK_SIZE 10
/* The maximum number of occurrences of a word that will be stored in the
 * disk table.  The number of occurrences reflects the total across all
 * files.  The theory is that if a word is very common, it is not very
 * useful for discriminating between files; a very common word also takes
 * up a lot of space.
 * Maybe this should depend on the number of documents indexed: if a word
 * is in every document, then it probably does not mean much.
 * Note: if this value is increased, the inverted file may not keep all
 * the references, because the maximum length of an index block is limited
 * to a size that can be represented in INDEX_BLOCK_SIZE_SIZE bytes.
 */
#define MAX_OCCURANCES 20000L
#define STOP_WORD_FLAG 0x40000000 /* Flag to be put in the number_of_occurances
                                   * field of a word_entry so that the count is
                                   * always greater than the limit (presumably
                                   * MAX_OCCURANCES -- confirm against the
                                   * users of this flag) and the word is not
                                   * collected. */
/*
 * One entry of the in-memory word hashtable: the word itself plus the
 * bookkeeping used while its occurrences are accumulated in memory.
 */
typedef struct word_entry {
  /* The word text.  This member must stay first in the struct.
   * The original note says "NULL when empty"; since this is an array it
   * cannot be a null pointer, so that presumably means an empty string
   * (word[0] == '\0') -- TODO confirm against the code that clears entries.
   */
  char word[MAX_WORD_LENGTH + 1];

  /* Open question carried over from the original author:
   * should this be the total weight?
   */
  long hash_code;

  /* Total number of occurrences for the whole database. */
  long number_of_occurances;

  /* Buffer holding what will go into the next block. */
  char *memory_ptr;

  /* Fill pointer into memory_ptr. */
  char *current_memory_ptr;

  /* Size of memory_ptr. */
  long memory_size;

  /* The last document id in memory_ptr.
   * This is expected to change to a page pointer eventually.
   */
  long current_doc_id;
} word_entry;
typedef struct word_memory_hashtable{
long size; /* number of elements that can be in the contents */
long word_entry_block_size; /* the maximum number of entries before flushing */
long number_of_entries; /* number of elements that are in the
* contents.
*/
word_entry** contents; /* pointer to the word hashtable memory */
word_entry *word_entry_block; /* pointer block of entries */
long number_of_words_indexed; /* total number of words indexed */
long flush_after_n_words; /* number of words that should be accumulated
* before flushing. This should be dynamically
* handled rather than this way.
*/
double growth_factor; /* amount to grow when growing */
double grow_when_this_full; /* fraction of full that triggers growth */