Mercurial > ~dholland > hg > ag > index.cgi
view help2html/stringdict.c @ 0:13d2b8934445
Import AnaGram (near-)release tree into Mercurial.
author | David A. Holland |
---|---|
date | Sat, 22 Dec 2007 17:52:45 -0500 |
parents | |
children |
line wrap: on
line source
#include <string.h> #include <stdlib.h> #include <stdint.h> #include "must.h" #include "array.h" #include "stringdict.h" struct stringdict_string { size_t sds_len; uint32_t sds_hash; unsigned sds_index; /* indexes sd_strings */ }; struct stringdict_bucket { struct array sdb_strings; /* array of stringdict_string* */ }; struct stringdict { struct array sd_hashbuckets; /* array of stringdict_bucket* */ struct array sd_strings; /* array of char* */ unsigned sd_totnum; }; //////////////////////////////////////////////////////////// /* * Fletcher's check-sum routines for AnaGram * * Freely adapted from routines published in Dr. Dobbs Journal * May 1992, p. 64 * * Revised as portable C in June 2006. Rewritten again in 2007 for no * good reason. * * The various copies of this in AG should probably be unified. */ static uint32_t stringdict_hash(const char *s, size_t len) { uint16_t x1, x2, a; size_t i; x1 = (uint16_t) (len >> 16); x2 = (uint16_t) (len); if (x1==0) { x1++; } if (x2==0) { x2++; } for (i=0; i<len; i+=2) { if (i==len-1) { a = (unsigned char)s[i]; /* don't run off the end of the array */ } else { a = (unsigned char)s[i] + ((uint16_t)(unsigned char)s[i+1] << 8); } x1 += a; if (x1 < a) { x1++; } x2 += x1; if (x2 < x1) { x2++; } } x1 ^= 0xffff; x2 ^= 0xffff; return ((uint32_t)x2)*65535U + x1; } static void stringdict_setbuckets(struct stringdict *sd, unsigned newbucks) { /* * Because the number of buckets is always a power of 2, and we use the * low bits of the hash code to choose the bucket, and we always grow by * doubling (not, say, quadrupling) when we grow we know that any string * either stays put or moves to bucket i+n0, where n0 is the old number of * buckets. Further, we can check hash&n0 to see if the string must move. * * (n0 == oldbucks) */ unsigned oldbucks, i, j; unsigned numseen; struct stringdict_bucket *sdb, *sdb2; struct stringdict_string *sds; oldbucks = array_num(&sd->sd_hashbuckets); assert(newbucks == oldbucks*2 || (oldbucks == 0 && sd->sd_totnum==0)); assert((newbucks & (newbucks-1)) == 0); // power of 2 // grow the array array_setsize(&sd->sd_hashbuckets, newbucks); // allocate empty buckets in the new half of the array for (i=oldbucks; i<newbucks; i++) { sdb = must_malloc(sizeof(*sdb)); array_init(&sdb->sdb_strings); array_set(&sd->sd_hashbuckets, i, sdb); } // transfer any strings that need to be transferred numseen = 0; for (i=0; i<oldbucks; i++) { sdb = array_get(&sd->sd_hashbuckets, i); sdb2 = array_get(&sd->sd_hashbuckets, i+oldbucks); for (j=0; j<array_num(&sdb->sdb_strings); j++) { sds = array_get(&sdb->sdb_strings, j); if (sds->sds_hash & oldbucks) { array_set(&sdb->sdb_strings, j, NULL); array_add(&sdb2->sdb_strings, sds); } numseen++; } array_nonulls(&sdb->sdb_strings); } assert(numseen == sd->sd_totnum); } struct stringdict *stringdict_create(void) { struct stringdict *sd; sd = must_malloc(sizeof(*sd)); array_init(&sd->sd_hashbuckets); array_init(&sd->sd_strings); sd->sd_totnum = 0; // number of buckets is always a power of 2 stringdict_setbuckets(sd, 16); return sd; } void stringdict_destroy(struct stringdict *sd) { struct stringdict_bucket *sdb; struct stringdict_string *sds; char *str; unsigned i, j; for (i=0; i<array_num(&sd->sd_hashbuckets); i++) { sdb = array_get(&sd->sd_hashbuckets, i); for (j=0; j<array_num(&sdb->sdb_strings); j++) { sds = array_get(&sdb->sdb_strings, j); free(sds); } array_cleanup(&sdb->sdb_strings); } array_cleanup(&sd->sd_hashbuckets); for (i=0; i<array_num(&sd->sd_strings); i++) { str = array_get(&sd->sd_strings, i); free(str); } array_cleanup(&sd->sd_strings); free(sd); } void stringdict_reset(struct stringdict *sd) { struct stringdict_bucket *sdb; struct stringdict_string *sds; char *str; unsigned i, j; for (i=0; i<array_num(&sd->sd_hashbuckets); i++) { sdb = array_get(&sd->sd_hashbuckets, i); for (j=0; j<array_num(&sdb->sdb_strings); j++) { sds = array_get(&sdb->sdb_strings, j); free(sds); } array_setsize(&sdb->sdb_strings, 0); } /* could reset the bucket count here, but it seems better not to */ for (i=0; i<array_num(&sd->sd_strings); i++) { str = array_get(&sd->sd_strings, i); free(str); } array_setsize(&sd->sd_strings, 0); } unsigned stringdict_internlen(struct stringdict *sd, const char *s, size_t len) { uint32_t hash; unsigned j; struct stringdict_bucket *sdb; struct stringdict_string *sds; char *str; hash = stringdict_hash(s, len); // number of buckets is always a power of 2 sdb = array_get(&sd->sd_hashbuckets, hash & (array_num(&sd->sd_hashbuckets) - 1)); for (j=0; j<array_num(&sdb->sdb_strings); j++) { sds = array_get(&sdb->sdb_strings, j); if (len == sds->sds_len && hash == sds->sds_hash) { str = array_get(&sd->sd_strings, sds->sds_index); if (!memcmp(s, str, len)) { /* found it */ return sds->sds_index; } } } str = must_malloc(len+1); memcpy(str, s, len); str[len] = 0; sds = must_malloc(sizeof(*sds)); sds->sds_hash = hash; sds->sds_len = len; sds->sds_index = array_add(&sd->sd_strings, str); array_add(&sdb->sdb_strings, sds); sd->sd_totnum++; if (sd->sd_totnum / array_num(&sd->sd_hashbuckets) > 12) { stringdict_setbuckets(sd, array_num(&sd->sd_hashbuckets) * 2); } return sds->sds_index; } unsigned stringdict_intern(struct stringdict *sd, const char *s) { return stringdict_internlen(sd, s, strlen(s)); } unsigned stringdict_count(const struct stringdict *sd) { return sd->sd_totnum; } const char *stringdict_getbynum(const struct stringdict *sd, unsigned index) { return array_get(&sd->sd_strings, index); } static struct stringdict_string * stringdict_find(const struct stringdict *sd, const char *key, size_t keylen) { uint32_t hash; unsigned j; struct stringdict_bucket *sdb; struct stringdict_string *sds; char *str; hash = stringdict_hash(key, keylen); // number of buckets is always a power of 2 sdb = array_get(&sd->sd_hashbuckets, hash & (array_num(&sd->sd_hashbuckets) - 1)); for (j=0; j<array_num(&sdb->sdb_strings); j++) { sds = array_get(&sdb->sdb_strings, j); if (keylen == sds->sds_len && hash == sds->sds_hash) { str = array_get(&sd->sd_strings, sds->sds_index); if (!memcmp(key, str, keylen)) { /* found it */ return sds; } } } return NULL; } unsigned stringdict_findbyname(const struct stringdict *sd, const char *str) { struct stringdict_string *sds; sds = stringdict_find(sd, str, strlen(str)); return sds ? sds->sds_index : (unsigned)-1; } int stringdict_exists(const struct stringdict *sd, const char *str) { struct stringdict_string *sds; sds = stringdict_find(sd, str, strlen(str)); return sds != NULL; }