Changeset 724 for trunk


Ignore:
Timestamp:
04/29/06 10:30:52 (6 years ago)
Author:
wd
Message:
  • Add mising Makefile.am to the base of 1.2 (whoops!)
  • Update the hashing code to remove use of perl's hasher and use a public domain variant which should provide better distribution.
Location:
trunk/ithildin
Files:
1 added
2 edited

Legend:

Unmodified
Added
Removed
  • trunk/ithildin/include/ithildin/hash.h

    r655 r724  
    1616struct hashent { 
    1717    void    *ent; 
    18     unsigned int hv;            /* full hash value (i.e. no modulus).  makes for 
     18    unsigned int hv;        /* full hash value (i.e. no modulus).  makes for 
    1919                               quick compares. */ 
    2020    SLIST_ENTRY(hashent) lp; 
     
    2222 
    2323struct hashtable { 
    24     int            size;            /* thte size of the hash table */ 
     24    int     size;           /* thte size of the hash table */ 
    2525#define hashtable_size(x) ((x)->size) 
    26     int            entries;            /* number of entries in the hash table */ 
     26    int     entries;        /* number of entries in the hash table */ 
    2727#define hashtable_count(x) ((x)->entries) 
    28     struct hashbucket *table;        /* our table */ 
    29     size_t  keyoffset;            /* this stores the offset of the key from the 
     28    struct hashbucket *table;   /* our table */ 
     29    size_t  keyoffset;      /* this stores the offset of the key from the 
    3030                               given structure */ 
    31     size_t  keylen;            /* the length of the key.  this CANNOT be 0.  If 
     31    size_t  keylen;         /* the length of the key.  this CANNOT be 0.  If 
    3232                               you want to use variable length strings use the 
    3333                               flag below. */ 
    3434 
    3535    /* these are useful for debugging the state of the hash system */ 
    36 #ifdef DEBUG 
    37     int            max_per_bucket; /* most entries in a single bucket */ 
    38     int            empty_buckets;  /* number of empty buckets */ 
     36#ifdef DEBUG_CODE 
     37    int            max_per_bucket;  /* most entries in a single bucket */ 
     38    int            empty_buckets;   /* number of empty buckets */ 
    3939#endif 
    40 #define HASH_FL_NOCASE 0x1        /* ignore case (tolower before hash) */ 
    41 #define HASH_FL_STRING 0x2        /* key is a nul-terminated string, treat len 
     40#define HASH_FL_NOCASE 0x1      /* ignore case (tolower before hash) */ 
     41#define HASH_FL_STRING 0x2      /* key is a nul-terminated string, treat len 
    4242                                   as a maximum length to hash */ 
    43 #define HASH_FL_INSERTHEAD 0x4        /* insert values at the head of their 
     43#define HASH_FL_INSERTHEAD 0x4  /* insert values at the head of their 
    4444                                   respective buckets (default) */ 
    45 #define HASH_FL_INSERTTAIL 0x8        /* insert values at the tail of their bucket */ 
     45#define HASH_FL_INSERTTAIL 0x8  /* insert values at the tail of their bucket */ 
    4646    int            flags; 
    4747    /* the symbol for our comparison function, used in hash_find_ent().  This 
     
    5454}; 
    5555 
    56 extern struct heap *hashent_heap; /* allocated for us elsewhere */ 
    57  
    5856/* table management functions */ 
    5957hashtable_t *create_hash_table(int, size_t, size_t, int, const char *); 
    6058void destroy_hash_table(hashtable_t *); 
    6159void resize_hash_table(hashtable_t *table, int elems); 
    62 unsigned int hash_get_key_hash(hashtable_t *, void *, size_t); 
    6360int hash_insert(hashtable_t *, void *); 
    6461int hash_delete(hashtable_t *, void *); 
    65 #define hash_find_bucket(tbl, key) \ 
    66 (&tbl->table[hash_get_key_hash(tbl, key, 0) % table->size]) 
    6762void *hash_find(hashtable_t *, void *); 
    68 struct hashent *hash_find_ent(hashtable_t *, void *); 
    69 struct hashent *hash_find_ent_next(hashtable_t *, void *, struct hashent *); 
    7063#endif 
    7164/* vi:set ts=8 sts=4 sw=4 tw=76 et: */ 
  • trunk/ithildin/source/hash.c

    r655 r724  
    22 * hash.c: hash table support functions 
    33 *  
    4  * Copyright 2002 the Ithildin Project. 
     4 * Copyright 2002-2006 the Ithildin Project. 
    55 * See the COPYING file for more information on licensing and use. 
    66 *  
    77 * This system implements a generic hash-table systeem.  The workings of 
    8  * hash-tables are discussed in almost any simple C algorithm book, so I won't 
     8 * hash-tables are discussed in almost any simple algorithm book, so I won't 
    99 * go into deteail here. 
    1010 */ 
     
    1414IDSTRING(rcsid, "$Id$"); 
    1515 
    16 /* this function creates a hashtable with 'elems' buckets (elems can be any 
    17  * size and remain efficient).  'offset' is the offset of the key from 
    18  * structures being added (this should be obtained with the 'offsetof()' 
    19  * function).  len is the length of the key, and flags are any flags for the 
    20  * table (see hash.h).  cmpfunc is the function which should be used for 
    21  * comparison when calling 'hash_find' */ 
     16/* 
     17 * This function creates a hashtable with 'elems' buckets (well, not really, 
     18 * it returns the closest thing that is smaller and a power-of-two). 
     19 * 'offset' is the offset of the key in structures which get passed to 
     20 * hash_insert().  len is the length of the key (0 for \0 terminated 
     21 * strings of variable length).  cmpfunc is the function which should be 
     22 * used for comparsion when calling hash_find() 
     23 * 
     24 * Note that hash values are 32 bit.  As such we will not grow a table past 
     25 * 2^31 entries.  I do not anticipate anyone using this arguably not very 
     26 * good code for something of that magnitude at any rate. 
     27 * 
     28 * We do not create tables smaller than 128 elements.  If your table is that 
     29 * small it is probable that hash tables are not actually going to help you 
     30 * out much in terms of speed. 
     31 */ 
    2232hashtable_t *create_hash_table(int elems, size_t offset, size_t len, 
    2333        int flags, const char *cmpname) { 
    2434    hashtable_t *htp = malloc(sizeof(hashtable_t)); 
    25  
    26     htp->size = elems; 
     35    u_int32_t real_elems = 0x80; 
     36     
     37    /* Take elems and begin shifting real_elems to the left until it is 
     38     * larger than, or equal in size to, elems.  If it is equal just stop, 
     39     * if it is larger shift it back down to the right one place and go with 
     40     * that. */ 
     41 
     42    if (elems > 1<<31) { 
     43        log_error("Request for hashtable of more than 2^31 elements cannot " 
     44                "be fulfilled."); 
     45        return NULL; 
     46    } 
     47 
     48    while (real_elems < elems) 
     49        real_elems <<= 1; 
     50    if (real_elems > elems) 
     51        real_elems >>= 1; 
     52 
     53    htp->size = real_elems; 
    2754    htp->entries = 0; 
    2855    htp->keyoffset = offset; 
    2956    htp->keylen = len; 
    30 #ifdef DEBUG 
     57#ifdef DEBUG_CODE 
    3158    htp->max_per_bucket = 1; 
    3259    htp->empty_buckets = elems; 
     
    75102    table->size = elems; 
    76103    table->entries = 0; 
    77 #ifdef DEBUG 
     104#ifdef DEBUG_CODE 
    78105    table->max_per_bucket = 1; 
    79106    table->empty_buckets = elems; 
     
    95122} 
    96123 
     124static unsigned int hash_get_key_hash(hashtable_t *, void *, size_t); 
     125 
     126/* 
     127 * NOTE 
     128 * This hash algorithm was not written by me.  I am math-stupid, so I spent 
     129 * some time investigating various algorithms and liked the cut of this 
     130 * one's jib.  I tested it on some datasets I had lying around and find it 
     131 * very adequate, as well as performant.  The original author's information: 
     132 *   
     133 * By Bob Jenkins, 1996.  bob_jenkins@burtleburtle.net.  You may use this 
     134 * code any way you wish, private, educational, or commercial.  It's free. 
     135 * See http://burtleburtle.net/bob/hash/evahash.html 
     136 * 
     137 * I have reworked some of this code for updated formatting and to make it 
     138 * more readable (to me) but I have not tweaked the algorithm at all.  I 
     139 * also took out a lot of the comments about how the algorithm works, the 
     140 * website above should be used as a reference. 
     141 */ 
     142#define mix(a,b,c) {                                                          \ 
     143  a -= b; a -= c; a ^= (c>>13);                                               \ 
     144  b -= c; b -= a; b ^= (a<<8);                                                \ 
     145  c -= a; c -= b; c ^= (b>>13);                                               \ 
     146  a -= b; a -= c; a ^= (c>>12);                                               \ 
     147  b -= c; b -= a; b ^= (a<<16);                                               \ 
     148  c -= a; c -= b; c ^= (b>>5);                                                \ 
     149  a -= b; a -= c; a ^= (c>>3);                                                \ 
     150  b -= c; b -= a; b ^= (a<<10);                                               \ 
     151  c -= a; c -= b; c ^= (b>>15);                                               \ 
     152} 
     153 
    97154/* this function allows you to get the hash of a given key.  it must be used in 
    98155 * the context of the table, of course.  it is mostly useful for insert/delete 
    99156 * below, and also for searching, but the included function should do that for 
    100157 * you adequately. */ 
    101 unsigned int hash_get_key_hash(hashtable_t *table, void *key, 
     158uint32_t hash_get_key_hash(hashtable_t *table, void *key, 
    102159        size_t offset) { 
    103     char *rkey = (char *)key + offset; 
    104     unsigned int len = table->keylen; 
    105     unsigned int hash = 0; 
    106  
     160    register uint32_t a, b, c; 
     161    register uint32_t len; 
     162    uint32_t length; 
     163    unsigned char *ckey = (unsigned char *)key + offset; 
     164 
     165    /* Set up the internal state */ 
    107166    if (table->flags & HASH_FL_STRING) { 
    108         len = strlen(rkey); 
    109         if (len > table->keylen) 
    110             len = table->keylen; 
    111     } 
    112     /* I borrowed this algorithm from perl5.  Kudos to Larry Wall & co. */ 
    113     if (table->flags & HASH_FL_NOCASE) 
    114         while (len--) 
    115             hash = hash * 33 + tolower(*rkey++); 
    116     else 
    117         while (len--) 
    118             hash = hash * 33 + *rkey++; 
    119  
    120     return hash + (hash >> 5); 
     167        length = strlen(ckey); 
     168        if (length > table->keylen) 
     169            /* they may ask to only hash on the first n bytes ...  
     170             * XXX: What if they actually want to hash the LAST n bytes (say 
     171             * for a filename or URL)?  There ought to be a flag... */ 
     172            length = table->keylen; 
     173    } else 
     174        length = table->keylen; 
     175    
     176    len = length; 
     177    a = b = 0x9e3779b9;  /* the golden ratio; an arbitrary value */ 
     178    c = 0xb33ff00d;      /* the previous hash value (UNUSED initval 
     179                            functionality) */ 
     180 
     181    /* XXX: So in order to support case insensitive hashing I simply took 
     182     * the below code and duped it further down to key generation on those 
     183     * strings.  I think this will end up with faster conpiled code, though 
     184     * it is not pleasant to look at..  */ 
     185    if (table->flags & HASH_FL_NOCASE) { 
     186        while (len >= 12) { 
     187            a += ( (uint32_t)tolower(ckey[0]) + 
     188                  ((uint32_t)tolower(ckey[1]) << 8) + 
     189                  ((uint32_t)tolower(ckey[2]) << 16) + 
     190                  ((uint32_t)tolower(ckey[3]) << 24)); 
     191            b += ( (uint32_t)tolower(ckey[5]) + 
     192                  ((uint32_t)tolower(ckey[6]) << 8) + 
     193                  ((uint32_t)tolower(ckey[7]) << 16) + 
     194                  ((uint32_t)tolower(ckey[8]) << 24)); 
     195            c += ( (uint32_t)tolower(ckey[9]) + 
     196                  ((uint32_t)tolower(ckey[10]) << 8) + 
     197                  ((uint32_t)tolower(ckey[11]) << 16) + 
     198                  ((uint32_t)tolower(ckey[12]) << 24)); 
     199           mix(a, b, c); 
     200           ckey += 12; len -= 12; 
     201        } 
     202 
     203        c += length; 
     204 
     205        /* deal with the last 11 (or less) bytes */ 
     206        switch (len) { /* all the case statements fall through */ 
     207        case 11: c += ((uint32_t)tolower(ckey[10]) << 24); 
     208        case 10: c += ((uint32_t)tolower(ckey[9]) << 16); 
     209        case 9:  c += ((uint32_t)tolower(ckey[8]) << 8); 
     210           /* the first byte of c is reserved for the length */ 
     211        case 8:  b += ((uint32_t)tolower(ckey[7]) << 24); 
     212        case 7:  b += ((uint32_t)tolower(ckey[6]) << 16); 
     213        case 6:  b += ((uint32_t)tolower(ckey[5]) << 8); 
     214        case 5:  b += ((uint32_t)tolower(ckey[4])); 
     215        case 4:  b += ((uint32_t)tolower(ckey[3]) << 24); 
     216        case 3:  b += ((uint32_t)tolower(ckey[2]) << 16); 
     217        case 2:  b += ((uint32_t)tolower(ckey[1]) << 8); 
     218        case 1:  b += ((uint32_t)tolower(ckey[0])); 
     219        /* case 0: nothing left to add */ 
     220        } 
     221        mix(a, b, c); 
     222    } else { 
     223        while (len >= 12) { 
     224            a += ( (uint32_t)ckey[0] + 
     225                  ((uint32_t)ckey[1] << 8) + 
     226                  ((uint32_t)ckey[2] << 16) + 
     227                  ((uint32_t)ckey[3] << 24)); 
     228            b += ( (uint32_t)ckey[5] + 
     229                  ((uint32_t)ckey[6] << 8) + 
     230                  ((uint32_t)ckey[7] << 16) + 
     231                  ((uint32_t)ckey[8] << 24)); 
     232            c += ( (uint32_t)ckey[9] + 
     233                  ((uint32_t)ckey[10] << 8) + 
     234                  ((uint32_t)ckey[11] << 16) + 
     235                  ((uint32_t)ckey[12] << 24)); 
     236           mix(a, b, c); 
     237           ckey += 12; len -= 12; 
     238        } 
     239 
     240        c += length; 
     241 
     242        /* deal with the last 11 (or less) bytes */ 
     243        switch (len) { /* all the case statements fall through */ 
     244        case 11: c += ((uint32_t)ckey[10] << 24); 
     245        case 10: c += ((uint32_t)ckey[9] << 16); 
     246        case 9:  c += ((uint32_t)ckey[8] << 8); 
     247           /* the first byte of c is reserved for the length */ 
     248        case 8:  b += ((uint32_t)ckey[7] << 24); 
     249        case 7:  b += ((uint32_t)ckey[6] << 16); 
     250        case 6:  b += ((uint32_t)ckey[5] << 8); 
     251        case 5:  b += ((uint32_t)ckey[4]); 
     252        case 4:  b += ((uint32_t)ckey[3] << 24); 
     253        case 3:  b += ((uint32_t)ckey[2] << 16); 
     254        case 2:  b += ((uint32_t)ckey[1] << 8); 
     255        case 1:  b += ((uint32_t)ckey[0]); 
     256        /* case 0: nothing left to add */ 
     257        } 
     258        mix(a, b, c); 
     259    } 
     260 
     261    /* just return the hash sized right for the table instead of asking them 
     262     * to do it.. */ 
     263    return c & (table->size - 1); 
    121264} 
    122265 
     
    127270 
    128271    hep->hv = hash; 
    129     hash %= table->size; 
    130 #ifdef DEBUG 
     272#ifdef DEBUG_CODE 
    131273    if (SLIST_EMPTY(&table->table[hash])) 
    132274        table->empty_buckets--; /* this bucket isn't empty now */ 
    133275    else { 
    134         /* count the items in the bucket.  of course this is expensive, that's 
     276        /* count the items in the bucket.  of course this is wasteful, that's 
    135277         * why you don't debug code in production. :) */ 
    136278        struct hashent *hep2; 
     
    149291        /* find the end of the list and insert data there.  this is costly, 
    150292         * and probably not something you want to do unless you're really sure 
    151          * that it's what you're after. */ 
     293         * that it's what you're after... i.e. if you think new entries will 
     294         * be the least sought for, and you expect a lot of collisions, you 
     295         * would use this.. otherwise I don't know why.. */ 
    152296        struct hashent *hep2; 
    153297 
     
    166310        SLIST_INSERT_HEAD(&table->table[hash], hep, lp); 
    167311 
    168     /* if the table has 1.2x as many entries as there are buckets, resize it so 
    169      * that there ar twice as many buckets. :) */ 
    170     if (((table->size * 6) / 5)  <= table->entries) 
     312    /* 
     313     * if the table has 1.2x as many entries as there are buckets, resize it so 
     314     * that there ar twice as many buckets. :) 
     315     * Perf considerations: 
     316     * I have heard a lot of conjecture about the wisdom of resizes in 
     317     * relation to hash tables, some advise resizing at about 70-80% 
     318     * capacity, others claim that a good hash algorithm (of which FNC is 
     319     * ostensibly one) should be O(1) even when the table has as many 
     320     * entries as it does buckets.  Some basic testing from my standpoint 
     321     * shows that FNV does a good enough job of distribution that we really 
     322     * don't need to worry about taking the hit until we have more entries 
     323     * than buckets.  Furthermore, in what would be mediocre circumstances 
     324     * (50% buckets unused, most buckets doubled or tripled) the runtime is 
     325     * still not very high, whereas the costs associated with a resize 
     326     * (in terms of memory and resizing operation considerations) are 
     327     * relatively high.  especially when data growth is rapid, the less we 
     328     * resize the better off we are. 
     329     * 
     330     * TODO: Allow consumers of hash tables to specify a growth ratio 
     331     * instead of making this static. 
     332     */ 
     333    if (((table->size * 6) / 5) <= table->entries) 
    171334        resize_hash_table(table, table->size * 2); 
    172335 
     
    176339/* remove the entry from the table. */ 
    177340int hash_delete(hashtable_t *table, void *ent) { 
    178     unsigned int hash = hash_get_key_hash(table, ent, table->keyoffset) % 
    179         table->size; 
     341    unsigned int hash = hash_get_key_hash(table, ent, table->keyoffset); 
    180342    struct hashent *hep; 
    181343 
     
    191353    free(hep); 
    192354 
    193 #ifdef DEBUG 
     355#ifdef DEBUG_CODE 
    194356    if (SLIST_EMPTY(&table->table[hash])) 
    195357        table->empty_buckets++; /* this bucket is empty again. */ 
     
    205367void *hash_find(hashtable_t *table, void *key) { 
    206368    unsigned int hash = hash_get_key_hash(table, key, 0); 
    207     struct hashbucket *bucket = &table->table[hash % table->size]; 
     369    struct hashbucket *bucket = &table->table[hash]; 
    208370    struct hashent *hep; 
    209371    int (*cmpfunc)(void *, void *, size_t) = 
     
    219381} 
    220382 
     383/* I haven't found a use for this code yet... */ 
     384#if 0 
     385static struct hashent *hash_find_ent(hashtable_t *, void *); 
     386static struct hashent *hash_find_ent_next(hashtable_t *, void *, struct hashent *); 
     387 
    221388/* this function finds the first 'hashent' of the given key, it returns a 
    222389 * hashent so that calls to the _next variant can be a bit faster. */ 
    223 struct hashent *hash_find_ent(hashtable_t *table, void *key) { 
     390static struct hashent *hash_find_ent(hashtable_t *table, void *key) { 
    224391    unsigned int hash = hash_get_key_hash(table, key, 0); 
    225     struct hashbucket *bucket = &table->table[hash % table->size]; 
     392    struct hashbucket *bucket = &table->table[hash]; 
    226393    struct hashent *hep; 
    227394    int (*cmpfunc)(void *, void *, size_t) = 
     
    238405 
    239406/* this function finds the next entry matching the key after 'ent' */ 
    240 struct hashent *hash_find_ent_next(hashtable_t *table, void *key, 
     407static struct hashent *hash_find_ent_next(hashtable_t *table, void *key, 
    241408        struct hashent *ent) { 
    242409    struct hashent *hep = ent; 
     
    253420    return NULL; 
    254421} 
     422#endif 
    255423     
    256424/* vi:set ts=8 sts=4 sw=4 tw=76 et: */ 
Note: See TracChangeset for help on using the changeset viewer.