NetNews Usenet Archive 1992 #30

home *** CD-ROM | disk | FTP | other *** search

/ NetNews Usenet Archive 1992 #30 / NN_1992_30.iso / spool / comp / lang / c / 18323 < prev next >

Wrap

Text File | 1992-12-14 | 8.0 KB | 333 lines

Newsgroups: comp.lang.c Path: sparky!uunet!haven.umd.edu!darwin.sura.net!europa.asd.contel.com!howland.reston.ans.net!usc!cs.utexas.edu!torn!newshub.ccs.yorku.ca!newshub.ccs.yorku.ca!oz From: oz@ursa.sis.yorku.ca (Ozan Yigit) Subject: Re: Need good hashing functions In-Reply-To: torek@horse.ee.lbl.gov's message of 10 Dec 1992 18: 31:03 GMT Message-ID: <OZ.92Dec14115806@ursa.sis.yorku.ca> Sender: news@newshub.ccs.yorku.ca (USENET News System) Organization: York U. Student Information Systems Project References: <1992Dec9.213041.16259@netcom.com> <1992Dec10.113732.4889@si.hhs.nl> <27911@dog.ee.lbl.gov> Date: Mon, 14 Dec 1992 16:58:06 GMT Lines: 319 Chris Torek writes: [most of the interesting hashing article elided] Incidentally, the same expansion technique can be used for any hash method: it simply corresponds to moving all the entries from the old hash table to the new, larger one. This particular version has the advantage of being compact. Alternatively, one can use an algorithm that expands without re-hashing the entire table. Here is one based on the Larson article in CACM, that can be used instead of the inferior sV hsearch. [please share any fixes and improvements to this code] enjoy... oz --- With diligence, it is possible to | electric: oz@sis.yorku.ca make anything run slowly. -T.Duff | ph:[416] 736 2100 x 33976 ----- snap here ---- /* * Dynamic hashing, after CACM April 1988 pp 446-457, by Per-Ake Larson. * Coded into C, with minor code improvements, and with hsearch(3) interface, * by ejp@ausmelb.oz, Jul 26, 1988: 13:16; * also, hcreate/hdestroy routines added to simulate hsearch(3). * * These routines simulate hsearch(3) and family, with the important * difference that the hash table is dynamic - can grow indefinitely * beyond its original size (as supplied to hcreate()). * * Performance appears to be comparable to that of hsearch(3). * The 'source-code' options referred to in hsearch(3)'s 'man' page * are not implemented; otherwise functionality is identical. * * Compilation controls: * DEBUG controls some informative traces, mainly for debugging. * HASH_STATISTICS causes hashaccesses and hashcollisions to be maintained; * when combined with DEBUG, these are displayed by hdestroy(). */ #include <stdio.h> #include <assert.h> #include <string.h> #include <search.h> /* Constants */ # define SEGSIZE 256 /* should be power of 2 for speed */ # define DIRSIZE 256 /* should be power of 2 for speed */ # define PRIME1 37 # define PRIME2 1048583 # define DEFMAXLD 5 /* default maximum load factor */ /* local data templates */ /* * The user only sees the first two fields, as we pretend to pass * back only a pointer to ENTRY. {S}he doesn't know what else is in here. */ typedef struct element { char *ekey; char *edata; struct element *next; /* secret from user */ } element_t, *segment_t; typedef struct { unsigned short p; /* next bucket to be split */ unsigned short maxp; /* upper bound on p during expansion */ unsigned long keycount; /* current # keys */ unsigned short segcount; /* current # segments */ unsigned short minload; /* minimum load factor */ unsigned short maxload; /* maximum load factor */ segment_t * dir[DIRSIZE]; } hashtbl; typedef unsigned long addr_t; /* external routines */ extern char *calloc(); /* Entry points */ extern int hcreate(); extern void hdestroy(); extern ENTRY *hsearch(); /* Internal routines */ static addr_t hash(); static void expandtbl(); /* Local data */ static hashtbl *tbl = NULL; #if HASH_STATISTICS static long hashaccesses, hashcollisions, hashexpansions; #endif int hcreate(count) unsigned count; { register unsigned i; /* * Adjust count to be nearest higher power of 2, minimum SEGSIZE, * then convert into segments. */ for (i = SEGSIZE; i < count; i <<= 1) ; count = i / SEGSIZE; tbl = (hashtbl *)calloc(1, sizeof(hashtbl)); if (tbl == NULL) return 0; tbl->segcount = 0; tbl->p = 0; tbl->keycount = 0; /* * Allocate initial 'i' segments of buckets */ for (i = 0; i < count; i++) { tbl->dir[i] = (segment_t *)calloc(SEGSIZE, sizeof(segment_t)); if (tbl->dir[i] == NULL) { hdestroy(); return 0; } tbl->segcount++; } tbl->maxp = count * SEGSIZE; tbl->minload = 1; tbl->maxload = DEFMAXLD; # if DEBUG fprintf(stderr, "hcreate: table %x count %d maxp %d segcount %d\n", tbl, count, tbl->maxp, tbl->segcount); # endif # if HASH_STATISTICS hashaccesses = hashcollisions = hashexpansions = 0; # endif return 1; } void hdestroy() { int i, j; segment_t * s; element_t * p, *q; if (tbl != NULL) { for (i = 0; i < tbl->segcount; i++) { /* test probably unnecessary */ s = tbl->dir[i]; if (s != NULL) for (j = 0; j < SEGSIZE; j++) { p = s[j]; while (p != NULL) { q = p->next; free((char *)p); p = q; } free(tbl->dir[i]); } } # if HASH_STATISTICS && DEBUG fprintf(stderr, "hdestroy: accesses %ld collisions %ld\n", hashaccesses, hashcollisions); fprintf(stderr, "hdestroy: expansions %ld\n", hashexpansions); fprintf(stderr, "keys %ld maxp %d segcount %d\n", tbl->keycount, tbl->maxp, tbl->segcount); # endif free(tbl); tbl = NULL; } } ENTRY * hsearch(item, action) ENTRY item; ACTION action; /* FIND/ENTER */ { register addr_t h; addr_t segidx, segdir; segment_t * currseg; segment_t * p, q; assert(tbl != NULL); /* Kinder really than return(NULL); */ # if HASH_STATISTICS hashaccesses++; # endif h = hash(item.key); segdir = h / SEGSIZE; segidx = h % SEGSIZE; /* * valid segment_t ensured by hash() */ currseg = tbl->dir[segdir]; assert(currseg != NULL); /* bad failure if tripped */ p = &currseg[segidx]; q = *p; /* * Follow collision chain */ while (q != NULL && strcmp(q->ekey, item.key) != 0) { p = &q->next; q = *p; # if HASH_STATISTICS hashcollisions++; # endif } if (q != NULL || /* found */ action == FIND || /* not found, search only */ (q = (element_t *)calloc(1, sizeof(element_t))) == NULL) /* not found, no room */ return (ENTRY *)q; *p = q; /* link into chain */ /* * Initialize new element */ q->ekey = item.key; q->edata = item.data; /* cleared by calloc(3) q->next = NULL; */ /* tbl over-full? */ if (++tbl->keycount / (tbl->segcount * SEGSIZE) > tbl->maxload) expandtbl(); /* doesn't affect q */ return (ENTRY *)q; } /* * Internal routines */ static addr_t hash(ekey) char *ekey; { register unsigned char *k = (unsigned char *) ekey; register addr_t h, address; h = 0; /* * Convert string to integer */ while (*k != '\0') h = h * PRIME1 ^ (*k++ - ' '); h %= PRIME2; address = h % tbl->maxp; if (address < tbl->p) address = h % (2*tbl->maxp); return address; } static void expandtbl() { register addr_t newaddr; addr_t oldsegidx, newsegidx; addr_t oldsegdir, newsegdir; segment_t * oldseg, *newseg; element_t * curr, **prev, **lastnew; #ifdef HASH_STATISTICS hashexpansions++; #endif if (tbl->maxp + tbl->p < DIRSIZE * SEGSIZE) { /* * Locate the bucket to be split */ oldsegdir = tbl->p / SEGSIZE; oldseg = tbl->dir[oldsegdir]; oldsegidx = tbl->p % SEGSIZE; /* * Expand address space; if necessary create a new segment_t */ newaddr = tbl->maxp + tbl->p; newsegdir = newaddr / SEGSIZE; newsegidx = newaddr % SEGSIZE; if (newsegidx == 0) tbl->dir[newsegdir] = (segment_t *)calloc(SEGSIZE, sizeof(segment_t)); newseg = tbl->dir[newsegdir]; /* * Adjust state variables */ tbl->p++; if (tbl->p == tbl->maxp) { tbl->maxp <<= 1; /* tbl->maxp *= 2 */ tbl->p = 0; } tbl->segcount++; /* * Relocate records to the new bucket */ prev = &oldseg[oldsegidx]; curr = *prev; lastnew = &newseg[newsegidx]; *lastnew = NULL; while (curr != NULL) if (hash(curr->ekey) == newaddr) { /* Attach it to the end of the new chain */ *lastnew = curr; /* * Remove it from old chain */ *prev = curr->next; lastnew = &curr->next; curr = curr->next; *lastnew = NULL; } else { /* * leave it on the old chain */ prev = &curr->next; curr = curr->next; } } }