source: branches/ithildin-1.1/source/hash.c @ 726

Revision 726, 16.0 KB checked in by wd, 6 years ago (diff)

Update/fix hash bugs.

  • Property svn:keywords set to Id Rev
Line 
1/*
2 * hash.c: hash table support functions
3 *
4 * Copyright 2002-2006 the Ithildin Project.
5 * See the COPYING file for more information on licensing and use.
6 *
7 * This system implements a generic hash-table systeem.  The workings of
8 * hash-tables are discussed in almost any simple algorithm book, so I won't
9 * go into deteail here.
10 */
11
12#include <ithildin/stand.h>
13
14IDSTRING(rcsid, "$Id$");
15
16static void resize_hash_table(hashtable_t *table, uint32_t elems);
17static unsigned int hash_get_key_hash(hashtable_t *, void *, size_t);
18
19/*
20 * This function creates a hashtable with 'elems' buckets (well, not really,
21 * it returns the closest thing that is smaller and a power-of-two).
22 * 'offset' is the offset of the key in structures which get passed to
23 * hash_insert().  len is the length of the key (0 for \0 terminated
24 * strings of variable length).  cmpfunc is the function which should be
25 * used for comparsion when calling hash_find()
26 *
27 * Note that hash values are 32 bit.  As such we will not grow a table past
28 * 2^31 entries.  I do not anticipate anyone using this arguably not very
29 * good code for something of that magnitude at any rate.
30 *
31 * We do not create tables smaller than 128 elements.  If your table is that
32 * small it is probable that hash tables are not actually going to help you
33 * out much in terms of speed.
34 */
35hashtable_t *create_hash_table(uint32_t elems, size_t offset, size_t len,
36        int flags, const char *cmpname) {
37    hashtable_t *htp = malloc(sizeof(hashtable_t));
38    uint32_t real_elems = 0x80;
39   
40    /* Take elems and begin shifting real_elems to the left until it is
41     * larger than, or equal in size to, elems.  If it is equal just stop,
42     * if it is larger shift it back down to the right one place and go with
43     * that. */
44
45    if (elems > 1<<31) {
46        log_error("Request for hashtable of more than 2^31 elements cannot "
47                "be fulfilled.");
48        return NULL;
49    }
50
51    while (real_elems < elems)
52        real_elems <<= 1;
53    if (real_elems > elems)
54        real_elems >>= 1;
55
56    if (real_elems != elems)
57        log_debug("Request for %d element hash table adjusted to %d "
58                "elements", elems, real_elems);
59
60    htp->size = real_elems;
61    htp->entries = 0;
62    htp->keyoffset = offset;
63    htp->keylen = len;
64#ifdef DEBUG_CODE
65    htp->max_per_bucket = 1;
66    htp->empty_buckets = real_elems;
67#endif
68    htp->flags = flags;
69    if (cmpname != NULL)
70            htp->cmpsym = import_symbol((char *) cmpname);
71    else
72        htp->cmpsym = import_symbol("memcmp");
73
74    htp->table = malloc(sizeof(struct hashbucket) * htp->size);
75    memset(htp->table, 0, sizeof(struct hashbucket) * htp->size);
76
77    return htp;
78}
79
80/* hash_table destroyer.  sweep through the given table and kill off every
81 * hashent */
82void destroy_hash_table(hashtable_t *table) {
83    struct hashent *hep;
84    int i;
85
86    for (i = 0;i < table->size;i++) {
87        while (!SLIST_EMPTY(&table->table[i])) {
88            hep = SLIST_FIRST(&table->table[i]);
89            SLIST_REMOVE_HEAD(&table->table[i], lp);
90            free(hep);
91        }
92    }
93    free(table->table);
94    free(table);
95}
96
97/* this function is used to resize a hash table.  of course, to do this, it has
98 * to create a new table and re-hash everything in it, so it's not very
99 * efficient, however if used judiciously it should enhance performance, not
100 * hinder it.  'elems' is the new size of the table. */
101static void resize_hash_table(hashtable_t *table, uint32_t elems) {
102    struct hashbucket *oldtable;
103    int oldsize, i;
104    struct hashent *hep;
105
106    /* preserve the old table, then create a new one.  */
107    oldtable = table->table;
108    oldsize = table->size;
109    table->size = elems;
110    table->entries = 0;
111#ifdef DEBUG_CODE
112    table->max_per_bucket = 1;
113    table->empty_buckets = elems;
114#endif
115    table->table = malloc(sizeof(struct hashbucket) * table->size);
116    memset(table->table, 0, sizeof(struct hashbucket) * table->size);
117
118    /* now walk each bucket in the old table, pulling off individual entries
119     * and re-adding them to the table as we go */
120    for (i = 0;i < oldsize;i++) {
121        while (!SLIST_EMPTY(&oldtable[i])) {
122            hep = SLIST_FIRST(&oldtable[i]);
123            hash_insert(table, hep->ent);
124            SLIST_REMOVE_HEAD(&oldtable[i], lp);
125            free(hep);
126        }
127    }
128    free(oldtable);
129}
130
131/*
132 * NOTE
133 * This hash algorithm was not written by me.  I am math-stupid, so I spent
134 * some time investigating various algorithms and liked the cut of this
135 * one's jib.  I tested it on some datasets I had lying around and find it
136 * very adequate, as well as performant.  The original author's information:
137 * 
138 * By Bob Jenkins, 1996.  bob_jenkins@burtleburtle.net.  You may use this
139 * code any way you wish, private, educational, or commercial.  It's free.
140 * See http://burtleburtle.net/bob/hash/evahash.html
141 *
142 * I have reworked some of this code for updated formatting and to make it
143 * more readable (to me) but I have not tweaked the algorithm at all.  I
144 * also took out a lot of the comments about how the algorithm works, the
145 * website above should be used as a reference.
146 */
147#define mix(a,b,c) {                                                          \
148  a -= b; a -= c; a ^= (c>>13);                                               \
149  b -= c; b -= a; b ^= (a<<8);                                                \
150  c -= a; c -= b; c ^= (b>>13);                                               \
151  a -= b; a -= c; a ^= (c>>12);                                               \
152  b -= c; b -= a; b ^= (a<<16);                                               \
153  c -= a; c -= b; c ^= (b>>5);                                                \
154  a -= b; a -= c; a ^= (c>>3);                                                \
155  b -= c; b -= a; b ^= (a<<10);                                               \
156  c -= a; c -= b; c ^= (b>>15);                                               \
157}
158
159/* this function allows you to get the hash of a given key.  it must be used in
160 * the context of the table, of course.  it is mostly useful for insert/delete
161 * below, and also for searching, but the included function should do that for
162 * you adequately. */
163uint32_t hash_get_key_hash(hashtable_t *table, void *key,
164        size_t offset) {
165    register uint32_t a, b, c;
166    register uint32_t len;
167    uint32_t length;
168    unsigned char *ckey = (unsigned char *)key + offset;
169
170    /* Set up the internal state */
171    if (table->flags & HASH_FL_STRING) {
172        length = strlen(ckey);
173        if (length > table->keylen)
174            /* they may ask to only hash on the first n bytes ...
175             * XXX: What if they actually want to hash the LAST n bytes (say
176             * for a filename or URL)?  There ought to be a flag... */
177            length = table->keylen;
178    } else
179        length = table->keylen;
180   
181    len = length;
182    a = b = 0x9e3779b9;  /* the golden ratio; an arbitrary value */
183    c = 0xb33ff00d;      /* the previous hash value (UNUSED initval
184                            functionality) */
185
186    /* XXX: So in order to support case insensitive hashing I simply took
187     * the below code and duped it further down to key generation on those
188     * strings.  I think this will end up with faster conpiled code, though
189     * it is not pleasant to look at..  */
190    if (table->flags & HASH_FL_NOCASE) {
191        while (len >= 12) {
192            a += ( (uint32_t)tolower(ckey[0]) +
193                  ((uint32_t)tolower(ckey[1]) << 8) +
194                  ((uint32_t)tolower(ckey[2]) << 16) +
195                  ((uint32_t)tolower(ckey[3]) << 24));
196            b += ( (uint32_t)tolower(ckey[5]) +
197                  ((uint32_t)tolower(ckey[6]) << 8) +
198                  ((uint32_t)tolower(ckey[7]) << 16) +
199                  ((uint32_t)tolower(ckey[8]) << 24));
200            c += ( (uint32_t)tolower(ckey[9]) +
201                  ((uint32_t)tolower(ckey[10]) << 8) +
202                  ((uint32_t)tolower(ckey[11]) << 16) +
203                  ((uint32_t)tolower(ckey[12]) << 24));
204           mix(a, b, c);
205           ckey += 12; len -= 12;
206        }
207
208        c += length;
209
210        /* deal with the last 11 (or less) bytes */
211        switch (len) { /* all the case statements fall through */
212        case 11: c += ((uint32_t)tolower(ckey[10]) << 24);
213        case 10: c += ((uint32_t)tolower(ckey[9]) << 16);
214        case 9:  c += ((uint32_t)tolower(ckey[8]) << 8);
215           /* the first byte of c is reserved for the length */
216        case 8:  b += ((uint32_t)tolower(ckey[7]) << 24);
217        case 7:  b += ((uint32_t)tolower(ckey[6]) << 16);
218        case 6:  b += ((uint32_t)tolower(ckey[5]) << 8);
219        case 5:  b += ((uint32_t)tolower(ckey[4]));
220        case 4:  b += ((uint32_t)tolower(ckey[3]) << 24);
221        case 3:  b += ((uint32_t)tolower(ckey[2]) << 16);
222        case 2:  b += ((uint32_t)tolower(ckey[1]) << 8);
223        case 1:  b += ((uint32_t)tolower(ckey[0]));
224        /* case 0: nothing left to add */
225        }
226        mix(a, b, c);
227    } else {
228        while (len >= 12) {
229            a += ( (uint32_t)ckey[0] +
230                  ((uint32_t)ckey[1] << 8) +
231                  ((uint32_t)ckey[2] << 16) +
232                  ((uint32_t)ckey[3] << 24));
233            b += ( (uint32_t)ckey[5] +
234                  ((uint32_t)ckey[6] << 8) +
235                  ((uint32_t)ckey[7] << 16) +
236                  ((uint32_t)ckey[8] << 24));
237            c += ( (uint32_t)ckey[9] +
238                  ((uint32_t)ckey[10] << 8) +
239                  ((uint32_t)ckey[11] << 16) +
240                  ((uint32_t)ckey[12] << 24));
241           mix(a, b, c);
242           ckey += 12; len -= 12;
243        }
244
245        c += length;
246
247        /* deal with the last 11 (or less) bytes */
248        switch (len) { /* all the case statements fall through */
249        case 11: c += ((uint32_t)ckey[10] << 24);
250        case 10: c += ((uint32_t)ckey[9] << 16);
251        case 9:  c += ((uint32_t)ckey[8] << 8);
252           /* the first byte of c is reserved for the length */
253        case 8:  b += ((uint32_t)ckey[7] << 24);
254        case 7:  b += ((uint32_t)ckey[6] << 16);
255        case 6:  b += ((uint32_t)ckey[5] << 8);
256        case 5:  b += ((uint32_t)ckey[4]);
257        case 4:  b += ((uint32_t)ckey[3] << 24);
258        case 3:  b += ((uint32_t)ckey[2] << 16);
259        case 2:  b += ((uint32_t)ckey[1] << 8);
260        case 1:  b += ((uint32_t)ckey[0]);
261        /* case 0: nothing left to add */
262        }
263        mix(a, b, c);
264    }
265
266    /* just return the hash sized right for the table instead of asking them
267     * to do it.. */
268    return c & (table->size - 1);
269}
270
271/* add the entry into the table */
272int hash_insert(hashtable_t *table, void *ent) {
273    unsigned int hash = hash_get_key_hash(table, ent, table->keyoffset);
274    struct hashent *hep = malloc(sizeof(struct hashent));
275
276    hep->hv = hash;
277#ifdef DEBUG_CODE
278    if (SLIST_EMPTY(&table->table[hash]))
279        table->empty_buckets--; /* this bucket isn't empty now */
280    else {
281        /* count the items in the bucket.  of course this is wasteful, that's
282         * why you don't debug code in production. :) */
283        struct hashent *hep2;
284        int cnt = 0;
285        SLIST_FOREACH(hep2, &table->table[hash], lp) {
286            cnt++;
287        }
288        if (cnt > table->max_per_bucket)
289            table->max_per_bucket = cnt;
290    }
291#endif
292
293    table->entries++;
294    hep->ent = ent;
295    if (table->flags & HASH_FL_INSERTTAIL) {
296        /* find the end of the list and insert data there.  this is costly,
297         * and probably not something you want to do unless you're really sure
298         * that it's what you're after... i.e. if you think new entries will
299         * be the least sought for, and you expect a lot of collisions, you
300         * would use this.. otherwise I don't know why.. */
301        struct hashent *hep2;
302
303        hep2 = SLIST_FIRST(&table->table[hash]);
304        while (hep2 != NULL)
305            if (SLIST_NEXT(hep2, lp) == NULL)
306                break;
307            else
308                hep2 = SLIST_NEXT(hep2, lp);
309
310        if (hep2 == NULL)
311            SLIST_INSERT_HEAD(&table->table[hash], hep, lp);
312        else
313            SLIST_INSERT_AFTER(hep2, hep, lp);
314    } else
315        SLIST_INSERT_HEAD(&table->table[hash], hep, lp);
316
317    /*
318     * if the table has 1.2x as many entries as there are buckets, resize it so
319     * that there ar twice as many buckets. :)
320     * Perf considerations:
321     * I have heard a lot of conjecture about the wisdom of resizes in
322     * relation to hash tables, some advise resizing at about 70-80%
323     * capacity, others claim that a good hash algorithm (of which FNC is
324     * ostensibly one) should be O(1) even when the table has as many
325     * entries as it does buckets.  Some basic testing from my standpoint
326     * shows that FNV does a good enough job of distribution that we really
327     * don't need to worry about taking the hit until we have more entries
328     * than buckets.  Furthermore, in what would be mediocre circumstances
329     * (50% buckets unused, most buckets doubled or tripled) the runtime is
330     * still not very high, whereas the costs associated with a resize
331     * (in terms of memory and resizing operation considerations) are
332     * relatively high.  especially when data growth is rapid, the less we
333     * resize the better off we are.
334     *
335     * TODO: Allow consumers of hash tables to specify a growth ratio
336     * instead of making this static.
337     */
338    if (((table->size * 6) / 5) <= table->entries)
339        resize_hash_table(table, table->size * 2);
340
341    return 1;
342}
343
344/* remove the entry from the table. */
345int hash_delete(hashtable_t *table, void *ent) {
346    unsigned int hash = hash_get_key_hash(table, ent, table->keyoffset);
347    struct hashent *hep;
348
349    SLIST_FOREACH(hep, &table->table[hash], lp) {
350        if (hep->ent == ent)
351            break;
352    }
353    if (hep == NULL)
354        return 0;
355
356    table->entries--;
357    SLIST_REMOVE(&table->table[hash], hep, hashent, lp);
358    free(hep);
359
360#ifdef DEBUG_CODE
361    if (SLIST_EMPTY(&table->table[hash]))
362        table->empty_buckets++; /* this bucket is empty again. */
363#endif
364
365    return 1;
366}
367
368/* last, but not least, the find functions.  given the table and the key to
369 * look for, it hashes the key, and then calls the compare function in the
370 * given table slice until it finds the item, or reaches the end of the
371 * list. */
372void *hash_find(hashtable_t *table, void *key) {
373    unsigned int hash = hash_get_key_hash(table, key, 0);
374    struct hashbucket *bucket = &table->table[hash];
375    struct hashent *hep;
376    int (*cmpfunc)(void *, void *, size_t) =
377        (int (*)(void *, void *, size_t))getsym(table->cmpsym);
378
379    SLIST_FOREACH(hep, bucket, lp) {
380        if (hep->hv == hash && !cmpfunc(&((char *)hep->ent)[table->keyoffset],
381                    key, table->keylen))
382            return hep->ent;
383    }
384
385    return NULL; /* not found */
386}
387
388/* I haven't found a use for this code yet... */
389#if 0
390static struct hashent *hash_find_ent(hashtable_t *, void *);
391static struct hashent *hash_find_ent_next(hashtable_t *, void *, struct hashent *);
392
393/* this function finds the first 'hashent' of the given key, it returns a
394 * hashent so that calls to the _next variant can be a bit faster. */
395static struct hashent *hash_find_ent(hashtable_t *table, void *key) {
396    unsigned int hash = hash_get_key_hash(table, key, 0);
397    struct hashbucket *bucket = &table->table[hash];
398    struct hashent *hep;
399    int (*cmpfunc)(void *, void *, size_t) =
400        (int (*)(void *, void *, size_t))getsym(table->cmpsym);
401
402    SLIST_FOREACH(hep, bucket, lp) {
403        if (hep->hv == hash && !cmpfunc(&((char *)hep->ent)[table->keyoffset],
404                    key, table->keylen))
405            return hep;
406    }
407
408    return NULL; /* not found */
409}
410
411/* this function finds the next entry matching the key after 'ent' */
412static struct hashent *hash_find_ent_next(hashtable_t *table, void *key,
413        struct hashent *ent) {
414    struct hashent *hep = ent;
415    int (*cmpfunc)(void *, void *, size_t) =
416        (int (*)(void *, void *, size_t))getsym(table->cmpsym);
417
418    /* we know where we are .. */
419    while ((hep = SLIST_NEXT(hep, lp)) != NULL) {
420        if (!cmpfunc(&((char *)hep->ent)[table->keyoffset], key,
421                    table->keylen))
422            return hep;
423    }
424
425    return NULL;
426}
427#endif
428   
429/* vi:set ts=8 sts=4 sw=4 tw=76 et: */
Note: See TracBrowser for help on using the repository browser.