- Timestamp:
- 04/29/06 10:30:52 (6 years ago)
- Location:
- trunk/ithildin
- Files:
-
- 1 added
- 2 edited
-
Makefile.am (added)
-
include/ithildin/hash.h (modified) (3 diffs)
-
source/hash.c (modified) (13 diffs)
Legend:
- Unmodified
- Added
- Removed
-
trunk/ithildin/include/ithildin/hash.h
r655 r724 16 16 struct hashent { 17 17 void *ent; 18 unsigned int hv; /* full hash value (i.e. no modulus). makes for18 unsigned int hv; /* full hash value (i.e. no modulus). makes for 19 19 quick compares. */ 20 20 SLIST_ENTRY(hashent) lp; … … 22 22 23 23 struct hashtable { 24 int size;/* thte size of the hash table */24 int size; /* thte size of the hash table */ 25 25 #define hashtable_size(x) ((x)->size) 26 int entries;/* number of entries in the hash table */26 int entries; /* number of entries in the hash table */ 27 27 #define hashtable_count(x) ((x)->entries) 28 struct hashbucket *table; /* our table */29 size_t keyoffset; /* this stores the offset of the key from the28 struct hashbucket *table; /* our table */ 29 size_t keyoffset; /* this stores the offset of the key from the 30 30 given structure */ 31 size_t keylen; /* the length of the key. this CANNOT be 0. If31 size_t keylen; /* the length of the key. this CANNOT be 0. If 32 32 you want to use variable length strings use the 33 33 flag below. */ 34 34 35 35 /* these are useful for debugging the state of the hash system */ 36 #ifdef DEBUG 37 int max_per_bucket; /* most entries in a single bucket */38 int empty_buckets; /* number of empty buckets */36 #ifdef DEBUG_CODE 37 int max_per_bucket; /* most entries in a single bucket */ 38 int empty_buckets; /* number of empty buckets */ 39 39 #endif 40 #define HASH_FL_NOCASE 0x1 /* ignore case (tolower before hash) */41 #define HASH_FL_STRING 0x2 /* key is a nul-terminated string, treat len40 #define HASH_FL_NOCASE 0x1 /* ignore case (tolower before hash) */ 41 #define HASH_FL_STRING 0x2 /* key is a nul-terminated string, treat len 42 42 as a maximum length to hash */ 43 #define HASH_FL_INSERTHEAD 0x4 /* insert values at the head of their43 #define HASH_FL_INSERTHEAD 0x4 /* insert values at the head of their 44 44 respective buckets (default) */ 45 #define HASH_FL_INSERTTAIL 0x8 /* insert values at the tail of their bucket */45 #define HASH_FL_INSERTTAIL 0x8 /* insert values at the tail of their bucket */ 46 46 int flags; 47 47 /* the symbol for our comparison function, used in hash_find_ent(). This … … 54 54 }; 55 55 56 extern struct heap *hashent_heap; /* allocated for us elsewhere */57 58 56 /* table management functions */ 59 57 hashtable_t *create_hash_table(int, size_t, size_t, int, const char *); 60 58 void destroy_hash_table(hashtable_t *); 61 59 void resize_hash_table(hashtable_t *table, int elems); 62 unsigned int hash_get_key_hash(hashtable_t *, void *, size_t);63 60 int hash_insert(hashtable_t *, void *); 64 61 int hash_delete(hashtable_t *, void *); 65 #define hash_find_bucket(tbl, key) \66 (&tbl->table[hash_get_key_hash(tbl, key, 0) % table->size])67 62 void *hash_find(hashtable_t *, void *); 68 struct hashent *hash_find_ent(hashtable_t *, void *);69 struct hashent *hash_find_ent_next(hashtable_t *, void *, struct hashent *);70 63 #endif 71 64 /* vi:set ts=8 sts=4 sw=4 tw=76 et: */ -
trunk/ithildin/source/hash.c
r655 r724 2 2 * hash.c: hash table support functions 3 3 * 4 * Copyright 2002 the Ithildin Project.4 * Copyright 2002-2006 the Ithildin Project. 5 5 * See the COPYING file for more information on licensing and use. 6 6 * 7 7 * This system implements a generic hash-table systeem. The workings of 8 * hash-tables are discussed in almost any simple Calgorithm book, so I won't8 * hash-tables are discussed in almost any simple algorithm book, so I won't 9 9 * go into deteail here. 10 10 */ … … 14 14 IDSTRING(rcsid, "$Id$"); 15 15 16 /* this function creates a hashtable with 'elems' buckets (elems can be any 17 * size and remain efficient). 'offset' is the offset of the key from 18 * structures being added (this should be obtained with the 'offsetof()' 19 * function). len is the length of the key, and flags are any flags for the 20 * table (see hash.h). cmpfunc is the function which should be used for 21 * comparison when calling 'hash_find' */ 16 /* 17 * This function creates a hashtable with 'elems' buckets (well, not really, 18 * it returns the closest thing that is smaller and a power-of-two). 19 * 'offset' is the offset of the key in structures which get passed to 20 * hash_insert(). len is the length of the key (0 for \0 terminated 21 * strings of variable length). cmpfunc is the function which should be 22 * used for comparsion when calling hash_find() 23 * 24 * Note that hash values are 32 bit. As such we will not grow a table past 25 * 2^31 entries. I do not anticipate anyone using this arguably not very 26 * good code for something of that magnitude at any rate. 27 * 28 * We do not create tables smaller than 128 elements. If your table is that 29 * small it is probable that hash tables are not actually going to help you 30 * out much in terms of speed. 31 */ 22 32 hashtable_t *create_hash_table(int elems, size_t offset, size_t len, 23 33 int flags, const char *cmpname) { 24 34 hashtable_t *htp = malloc(sizeof(hashtable_t)); 25 26 htp->size = elems; 35 u_int32_t real_elems = 0x80; 36 37 /* Take elems and begin shifting real_elems to the left until it is 38 * larger than, or equal in size to, elems. If it is equal just stop, 39 * if it is larger shift it back down to the right one place and go with 40 * that. */ 41 42 if (elems > 1<<31) { 43 log_error("Request for hashtable of more than 2^31 elements cannot " 44 "be fulfilled."); 45 return NULL; 46 } 47 48 while (real_elems < elems) 49 real_elems <<= 1; 50 if (real_elems > elems) 51 real_elems >>= 1; 52 53 htp->size = real_elems; 27 54 htp->entries = 0; 28 55 htp->keyoffset = offset; 29 56 htp->keylen = len; 30 #ifdef DEBUG 57 #ifdef DEBUG_CODE 31 58 htp->max_per_bucket = 1; 32 59 htp->empty_buckets = elems; … … 75 102 table->size = elems; 76 103 table->entries = 0; 77 #ifdef DEBUG 104 #ifdef DEBUG_CODE 78 105 table->max_per_bucket = 1; 79 106 table->empty_buckets = elems; … … 95 122 } 96 123 124 static unsigned int hash_get_key_hash(hashtable_t *, void *, size_t); 125 126 /* 127 * NOTE 128 * This hash algorithm was not written by me. I am math-stupid, so I spent 129 * some time investigating various algorithms and liked the cut of this 130 * one's jib. I tested it on some datasets I had lying around and find it 131 * very adequate, as well as performant. The original author's information: 132 * 133 * By Bob Jenkins, 1996. bob_jenkins@burtleburtle.net. You may use this 134 * code any way you wish, private, educational, or commercial. It's free. 135 * See http://burtleburtle.net/bob/hash/evahash.html 136 * 137 * I have reworked some of this code for updated formatting and to make it 138 * more readable (to me) but I have not tweaked the algorithm at all. I 139 * also took out a lot of the comments about how the algorithm works, the 140 * website above should be used as a reference. 141 */ 142 #define mix(a,b,c) { \ 143 a -= b; a -= c; a ^= (c>>13); \ 144 b -= c; b -= a; b ^= (a<<8); \ 145 c -= a; c -= b; c ^= (b>>13); \ 146 a -= b; a -= c; a ^= (c>>12); \ 147 b -= c; b -= a; b ^= (a<<16); \ 148 c -= a; c -= b; c ^= (b>>5); \ 149 a -= b; a -= c; a ^= (c>>3); \ 150 b -= c; b -= a; b ^= (a<<10); \ 151 c -= a; c -= b; c ^= (b>>15); \ 152 } 153 97 154 /* this function allows you to get the hash of a given key. it must be used in 98 155 * the context of the table, of course. it is mostly useful for insert/delete 99 156 * below, and also for searching, but the included function should do that for 100 157 * you adequately. */ 101 u nsigned int hash_get_key_hash(hashtable_t *table, void *key,158 uint32_t hash_get_key_hash(hashtable_t *table, void *key, 102 159 size_t offset) { 103 char *rkey = (char *)key + offset; 104 unsigned int len = table->keylen; 105 unsigned int hash = 0; 106 160 register uint32_t a, b, c; 161 register uint32_t len; 162 uint32_t length; 163 unsigned char *ckey = (unsigned char *)key + offset; 164 165 /* Set up the internal state */ 107 166 if (table->flags & HASH_FL_STRING) { 108 len = strlen(rkey); 109 if (len > table->keylen) 110 len = table->keylen; 111 } 112 /* I borrowed this algorithm from perl5. Kudos to Larry Wall & co. */ 113 if (table->flags & HASH_FL_NOCASE) 114 while (len--) 115 hash = hash * 33 + tolower(*rkey++); 116 else 117 while (len--) 118 hash = hash * 33 + *rkey++; 119 120 return hash + (hash >> 5); 167 length = strlen(ckey); 168 if (length > table->keylen) 169 /* they may ask to only hash on the first n bytes ... 170 * XXX: What if they actually want to hash the LAST n bytes (say 171 * for a filename or URL)? There ought to be a flag... */ 172 length = table->keylen; 173 } else 174 length = table->keylen; 175 176 len = length; 177 a = b = 0x9e3779b9; /* the golden ratio; an arbitrary value */ 178 c = 0xb33ff00d; /* the previous hash value (UNUSED initval 179 functionality) */ 180 181 /* XXX: So in order to support case insensitive hashing I simply took 182 * the below code and duped it further down to key generation on those 183 * strings. I think this will end up with faster conpiled code, though 184 * it is not pleasant to look at.. */ 185 if (table->flags & HASH_FL_NOCASE) { 186 while (len >= 12) { 187 a += ( (uint32_t)tolower(ckey[0]) + 188 ((uint32_t)tolower(ckey[1]) << 8) + 189 ((uint32_t)tolower(ckey[2]) << 16) + 190 ((uint32_t)tolower(ckey[3]) << 24)); 191 b += ( (uint32_t)tolower(ckey[5]) + 192 ((uint32_t)tolower(ckey[6]) << 8) + 193 ((uint32_t)tolower(ckey[7]) << 16) + 194 ((uint32_t)tolower(ckey[8]) << 24)); 195 c += ( (uint32_t)tolower(ckey[9]) + 196 ((uint32_t)tolower(ckey[10]) << 8) + 197 ((uint32_t)tolower(ckey[11]) << 16) + 198 ((uint32_t)tolower(ckey[12]) << 24)); 199 mix(a, b, c); 200 ckey += 12; len -= 12; 201 } 202 203 c += length; 204 205 /* deal with the last 11 (or less) bytes */ 206 switch (len) { /* all the case statements fall through */ 207 case 11: c += ((uint32_t)tolower(ckey[10]) << 24); 208 case 10: c += ((uint32_t)tolower(ckey[9]) << 16); 209 case 9: c += ((uint32_t)tolower(ckey[8]) << 8); 210 /* the first byte of c is reserved for the length */ 211 case 8: b += ((uint32_t)tolower(ckey[7]) << 24); 212 case 7: b += ((uint32_t)tolower(ckey[6]) << 16); 213 case 6: b += ((uint32_t)tolower(ckey[5]) << 8); 214 case 5: b += ((uint32_t)tolower(ckey[4])); 215 case 4: b += ((uint32_t)tolower(ckey[3]) << 24); 216 case 3: b += ((uint32_t)tolower(ckey[2]) << 16); 217 case 2: b += ((uint32_t)tolower(ckey[1]) << 8); 218 case 1: b += ((uint32_t)tolower(ckey[0])); 219 /* case 0: nothing left to add */ 220 } 221 mix(a, b, c); 222 } else { 223 while (len >= 12) { 224 a += ( (uint32_t)ckey[0] + 225 ((uint32_t)ckey[1] << 8) + 226 ((uint32_t)ckey[2] << 16) + 227 ((uint32_t)ckey[3] << 24)); 228 b += ( (uint32_t)ckey[5] + 229 ((uint32_t)ckey[6] << 8) + 230 ((uint32_t)ckey[7] << 16) + 231 ((uint32_t)ckey[8] << 24)); 232 c += ( (uint32_t)ckey[9] + 233 ((uint32_t)ckey[10] << 8) + 234 ((uint32_t)ckey[11] << 16) + 235 ((uint32_t)ckey[12] << 24)); 236 mix(a, b, c); 237 ckey += 12; len -= 12; 238 } 239 240 c += length; 241 242 /* deal with the last 11 (or less) bytes */ 243 switch (len) { /* all the case statements fall through */ 244 case 11: c += ((uint32_t)ckey[10] << 24); 245 case 10: c += ((uint32_t)ckey[9] << 16); 246 case 9: c += ((uint32_t)ckey[8] << 8); 247 /* the first byte of c is reserved for the length */ 248 case 8: b += ((uint32_t)ckey[7] << 24); 249 case 7: b += ((uint32_t)ckey[6] << 16); 250 case 6: b += ((uint32_t)ckey[5] << 8); 251 case 5: b += ((uint32_t)ckey[4]); 252 case 4: b += ((uint32_t)ckey[3] << 24); 253 case 3: b += ((uint32_t)ckey[2] << 16); 254 case 2: b += ((uint32_t)ckey[1] << 8); 255 case 1: b += ((uint32_t)ckey[0]); 256 /* case 0: nothing left to add */ 257 } 258 mix(a, b, c); 259 } 260 261 /* just return the hash sized right for the table instead of asking them 262 * to do it.. */ 263 return c & (table->size - 1); 121 264 } 122 265 … … 127 270 128 271 hep->hv = hash; 129 hash %= table->size; 130 #ifdef DEBUG 272 #ifdef DEBUG_CODE 131 273 if (SLIST_EMPTY(&table->table[hash])) 132 274 table->empty_buckets--; /* this bucket isn't empty now */ 133 275 else { 134 /* count the items in the bucket. of course this is expensive, that's276 /* count the items in the bucket. of course this is wasteful, that's 135 277 * why you don't debug code in production. :) */ 136 278 struct hashent *hep2; … … 149 291 /* find the end of the list and insert data there. this is costly, 150 292 * and probably not something you want to do unless you're really sure 151 * that it's what you're after. */ 293 * that it's what you're after... i.e. if you think new entries will 294 * be the least sought for, and you expect a lot of collisions, you 295 * would use this.. otherwise I don't know why.. */ 152 296 struct hashent *hep2; 153 297 … … 166 310 SLIST_INSERT_HEAD(&table->table[hash], hep, lp); 167 311 168 /* if the table has 1.2x as many entries as there are buckets, resize it so 169 * that there ar twice as many buckets. :) */ 170 if (((table->size * 6) / 5) <= table->entries) 312 /* 313 * if the table has 1.2x as many entries as there are buckets, resize it so 314 * that there ar twice as many buckets. :) 315 * Perf considerations: 316 * I have heard a lot of conjecture about the wisdom of resizes in 317 * relation to hash tables, some advise resizing at about 70-80% 318 * capacity, others claim that a good hash algorithm (of which FNC is 319 * ostensibly one) should be O(1) even when the table has as many 320 * entries as it does buckets. Some basic testing from my standpoint 321 * shows that FNV does a good enough job of distribution that we really 322 * don't need to worry about taking the hit until we have more entries 323 * than buckets. Furthermore, in what would be mediocre circumstances 324 * (50% buckets unused, most buckets doubled or tripled) the runtime is 325 * still not very high, whereas the costs associated with a resize 326 * (in terms of memory and resizing operation considerations) are 327 * relatively high. especially when data growth is rapid, the less we 328 * resize the better off we are. 329 * 330 * TODO: Allow consumers of hash tables to specify a growth ratio 331 * instead of making this static. 332 */ 333 if (((table->size * 6) / 5) <= table->entries) 171 334 resize_hash_table(table, table->size * 2); 172 335 … … 176 339 /* remove the entry from the table. */ 177 340 int hash_delete(hashtable_t *table, void *ent) { 178 unsigned int hash = hash_get_key_hash(table, ent, table->keyoffset) % 179 table->size; 341 unsigned int hash = hash_get_key_hash(table, ent, table->keyoffset); 180 342 struct hashent *hep; 181 343 … … 191 353 free(hep); 192 354 193 #ifdef DEBUG 355 #ifdef DEBUG_CODE 194 356 if (SLIST_EMPTY(&table->table[hash])) 195 357 table->empty_buckets++; /* this bucket is empty again. */ … … 205 367 void *hash_find(hashtable_t *table, void *key) { 206 368 unsigned int hash = hash_get_key_hash(table, key, 0); 207 struct hashbucket *bucket = &table->table[hash % table->size];369 struct hashbucket *bucket = &table->table[hash]; 208 370 struct hashent *hep; 209 371 int (*cmpfunc)(void *, void *, size_t) = … … 219 381 } 220 382 383 /* I haven't found a use for this code yet... */ 384 #if 0 385 static struct hashent *hash_find_ent(hashtable_t *, void *); 386 static struct hashent *hash_find_ent_next(hashtable_t *, void *, struct hashent *); 387 221 388 /* this function finds the first 'hashent' of the given key, it returns a 222 389 * hashent so that calls to the _next variant can be a bit faster. */ 223 st ruct hashent *hash_find_ent(hashtable_t *table, void *key) {390 static struct hashent *hash_find_ent(hashtable_t *table, void *key) { 224 391 unsigned int hash = hash_get_key_hash(table, key, 0); 225 struct hashbucket *bucket = &table->table[hash % table->size];392 struct hashbucket *bucket = &table->table[hash]; 226 393 struct hashent *hep; 227 394 int (*cmpfunc)(void *, void *, size_t) = … … 238 405 239 406 /* this function finds the next entry matching the key after 'ent' */ 240 st ruct hashent *hash_find_ent_next(hashtable_t *table, void *key,407 static struct hashent *hash_find_ent_next(hashtable_t *table, void *key, 241 408 struct hashent *ent) { 242 409 struct hashent *hep = ent; … … 253 420 return NULL; 254 421 } 422 #endif 255 423 256 424 /* vi:set ts=8 sts=4 sw=4 tw=76 et: */
Note: See TracChangeset
for help on using the changeset viewer.
