home *** CD-ROM | disk | FTP | other *** search
- Newsgroups: comp.lang.c
- Path: sparky!uunet!haven.umd.edu!darwin.sura.net!europa.asd.contel.com!howland.reston.ans.net!usc!cs.utexas.edu!torn!newshub.ccs.yorku.ca!newshub.ccs.yorku.ca!oz
- From: oz@ursa.sis.yorku.ca (Ozan Yigit)
- Subject: Re: Need good hashing functions
- In-Reply-To: torek@horse.ee.lbl.gov's message of 10 Dec 1992 18: 31:03 GMT
- Message-ID: <OZ.92Dec14115806@ursa.sis.yorku.ca>
- Sender: news@newshub.ccs.yorku.ca (USENET News System)
- Organization: York U. Student Information Systems Project
- References: <1992Dec9.213041.16259@netcom.com> <1992Dec10.113732.4889@si.hhs.nl>
- <27911@dog.ee.lbl.gov>
- Date: Mon, 14 Dec 1992 16:58:06 GMT
- Lines: 319
-
- Chris Torek writes:
- [most of the interesting hashing article elided]
-
- Incidentally, the same expansion technique can be used for any hash
- method: it simply corresponds to moving all the entries from the old
- hash table to the new, larger one. This particular version has the
- advantage of being compact.
-
- Alternatively, one can use an algorithm that expands without re-hashing
- the entire table. Here is one based on the Larson article in CACM, that
- can be used instead of the inferior sV hsearch. [please share any fixes
- and improvements to this code]
-
- enjoy... oz
- ---
- With diligence, it is possible to | electric: oz@sis.yorku.ca
- make anything run slowly. -T.Duff | ph:[416] 736 2100 x 33976
- ----- snap here ----
- /*
- * Dynamic hashing, after CACM April 1988 pp 446-457, by Per-Ake Larson.
- * Coded into C, with minor code improvements, and with hsearch(3) interface,
- * by ejp@ausmelb.oz, Jul 26, 1988: 13:16;
- * also, hcreate/hdestroy routines added to simulate hsearch(3).
- *
- * These routines simulate hsearch(3) and family, with the important
- * difference that the hash table is dynamic - can grow indefinitely
- * beyond its original size (as supplied to hcreate()).
- *
- * Performance appears to be comparable to that of hsearch(3).
- * The 'source-code' options referred to in hsearch(3)'s 'man' page
- * are not implemented; otherwise functionality is identical.
- *
- * Compilation controls:
- * DEBUG controls some informative traces, mainly for debugging.
- * HASH_STATISTICS causes hashaccesses and hashcollisions to be maintained;
- * when combined with DEBUG, these are displayed by hdestroy().
- */
-
- #include <stdio.h>
- #include <assert.h>
- #include <string.h>
- #include <search.h>
-
- /* Constants */
- # define SEGSIZE 256 /* should be power of 2 for speed */
- # define DIRSIZE 256 /* should be power of 2 for speed */
- # define PRIME1 37
- # define PRIME2 1048583
- # define DEFMAXLD 5 /* default maximum load factor */
-
- /* local data templates */
-
- /*
- * The user only sees the first two fields, as we pretend to pass
- * back only a pointer to ENTRY. {S}he doesn't know what else is in here.
- */
- typedef struct element {
- char *ekey;
- char *edata;
- struct element *next; /* secret from user */
- } element_t, *segment_t;
-
- typedef struct {
- unsigned short p; /* next bucket to be split */
- unsigned short maxp; /* upper bound on p during expansion */
- unsigned long keycount; /* current # keys */
- unsigned short segcount; /* current # segments */
- unsigned short minload; /* minimum load factor */
- unsigned short maxload; /* maximum load factor */
- segment_t * dir[DIRSIZE];
- } hashtbl;
-
- typedef unsigned long addr_t;
-
- /* external routines */
- extern char *calloc();
-
- /* Entry points */
- extern int hcreate();
- extern void hdestroy();
- extern ENTRY *hsearch();
-
- /* Internal routines */
- static addr_t hash();
- static void expandtbl();
-
- /* Local data */
- static hashtbl *tbl = NULL;
- #if HASH_STATISTICS
- static long hashaccesses, hashcollisions, hashexpansions;
- #endif
-
- int
- hcreate(count)
- unsigned count;
- {
- register unsigned i;
-
- /*
- * Adjust count to be nearest higher power of 2, minimum SEGSIZE,
- * then convert into segments.
- */
- for (i = SEGSIZE; i < count; i <<= 1)
- ;
-
- count = i / SEGSIZE;
-
- tbl = (hashtbl *)calloc(1, sizeof(hashtbl));
- if (tbl == NULL)
- return 0;
-
- tbl->segcount = 0;
- tbl->p = 0;
- tbl->keycount = 0;
-
- /*
- * Allocate initial 'i' segments of buckets
- */
- for (i = 0; i < count; i++) {
- tbl->dir[i] = (segment_t *)calloc(SEGSIZE, sizeof(segment_t));
- if (tbl->dir[i] == NULL) {
- hdestroy();
- return 0;
- }
- tbl->segcount++;
- }
- tbl->maxp = count * SEGSIZE;
- tbl->minload = 1;
- tbl->maxload = DEFMAXLD;
-
- # if DEBUG
- fprintf(stderr, "hcreate: table %x count %d maxp %d segcount %d\n",
- tbl, count, tbl->maxp, tbl->segcount);
- # endif
- # if HASH_STATISTICS
- hashaccesses = hashcollisions = hashexpansions = 0;
- # endif
- return 1;
- }
-
- void
- hdestroy()
- {
- int i, j;
- segment_t * s;
- element_t * p, *q;
-
- if (tbl != NULL) {
- for (i = 0; i < tbl->segcount; i++) {
- /* test probably unnecessary */
- s = tbl->dir[i];
- if (s != NULL)
- for (j = 0; j < SEGSIZE; j++) {
- p = s[j];
- while (p != NULL) {
- q = p->next;
- free((char *)p);
- p = q;
- }
- free(tbl->dir[i]);
- }
- }
- # if HASH_STATISTICS && DEBUG
- fprintf(stderr, "hdestroy: accesses %ld collisions %ld\n",
- hashaccesses, hashcollisions);
- fprintf(stderr, "hdestroy: expansions %ld\n", hashexpansions);
- fprintf(stderr, "keys %ld maxp %d segcount %d\n",
- tbl->keycount, tbl->maxp, tbl->segcount);
- # endif
- free(tbl);
- tbl = NULL;
- }
- }
-
- ENTRY *
- hsearch(item, action)
- ENTRY item;
- ACTION action; /* FIND/ENTER */
- {
- register addr_t h;
- addr_t segidx, segdir;
- segment_t * currseg;
- segment_t * p, q;
-
- assert(tbl != NULL); /* Kinder really than return(NULL); */
-
- # if HASH_STATISTICS
- hashaccesses++;
- # endif
-
- h = hash(item.key);
- segdir = h / SEGSIZE;
- segidx = h % SEGSIZE;
- /*
- * valid segment_t ensured by hash()
- */
- currseg = tbl->dir[segdir];
- assert(currseg != NULL); /* bad failure if tripped */
- p = &currseg[segidx];
- q = *p;
- /*
- * Follow collision chain
- */
- while (q != NULL && strcmp(q->ekey, item.key) != 0) {
- p = &q->next;
- q = *p;
- # if HASH_STATISTICS
- hashcollisions++;
- # endif
- }
-
- if (q != NULL || /* found */
- action == FIND || /* not found, search only */
- (q = (element_t *)calloc(1, sizeof(element_t))) == NULL) /* not found, no room */
- return (ENTRY *)q;
- *p = q; /* link into chain */
- /*
- * Initialize new element
- */
- q->ekey = item.key;
- q->edata = item.data;
- /* cleared by calloc(3) q->next = NULL; */
-
- /* tbl over-full? */
- if (++tbl->keycount / (tbl->segcount * SEGSIZE) > tbl->maxload)
- expandtbl(); /* doesn't affect q */
- return (ENTRY *)q;
- }
-
- /*
- * Internal routines
- */
-
- static addr_t
- hash(ekey)
- char *ekey;
- {
- register unsigned char *k = (unsigned char *) ekey;
- register addr_t h, address;
-
- h = 0;
- /*
- * Convert string to integer
- */
- while (*k != '\0')
- h = h * PRIME1 ^ (*k++ - ' ');
- h %= PRIME2;
- address = h % tbl->maxp;
- if (address < tbl->p)
- address = h % (2*tbl->maxp);
- return address;
- }
-
- static void
- expandtbl()
- {
- register addr_t newaddr;
- addr_t oldsegidx, newsegidx;
- addr_t oldsegdir, newsegdir;
- segment_t * oldseg, *newseg;
- element_t * curr, **prev, **lastnew;
-
- #ifdef HASH_STATISTICS
- hashexpansions++;
- #endif
- if (tbl->maxp + tbl->p < DIRSIZE * SEGSIZE) {
- /*
- * Locate the bucket to be split
- */
- oldsegdir = tbl->p / SEGSIZE;
- oldseg = tbl->dir[oldsegdir];
- oldsegidx = tbl->p % SEGSIZE;
- /*
- * Expand address space; if necessary create a new segment_t
- */
- newaddr = tbl->maxp + tbl->p;
- newsegdir = newaddr / SEGSIZE;
- newsegidx = newaddr % SEGSIZE;
- if (newsegidx == 0)
- tbl->dir[newsegdir] =
- (segment_t *)calloc(SEGSIZE, sizeof(segment_t));
- newseg = tbl->dir[newsegdir];
- /*
- * Adjust state variables
- */
- tbl->p++;
- if (tbl->p == tbl->maxp) {
- tbl->maxp <<= 1; /* tbl->maxp *= 2 */
- tbl->p = 0;
- }
- tbl->segcount++;
- /*
- * Relocate records to the new bucket
- */
- prev = &oldseg[oldsegidx];
- curr = *prev;
- lastnew = &newseg[newsegidx];
- *lastnew = NULL;
- while (curr != NULL)
- if (hash(curr->ekey) == newaddr) {
- /* Attach it to the end of the new chain */
- *lastnew = curr;
- /*
- * Remove it from old chain
- */
- *prev = curr->next;
- lastnew = &curr->next;
- curr = curr->next;
- *lastnew = NULL;
- } else {
- /*
- * leave it on the old chain
- */
- prev = &curr->next;
- curr = curr->next;
- }
- }
- }
-
-