Mercurial > ~dholland > hg > ag > index.cgi
diff help2html/stringdict.c @ 0:13d2b8934445
Import AnaGram (near-)release tree into Mercurial.
author | David A. Holland |
---|---|
date | Sat, 22 Dec 2007 17:52:45 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/help2html/stringdict.c Sat Dec 22 17:52:45 2007 -0500 @@ -0,0 +1,272 @@ +#include <string.h> +#include <stdlib.h> +#include <stdint.h> + +#include "must.h" +#include "array.h" +#include "stringdict.h" + +struct stringdict_string { + size_t sds_len; + uint32_t sds_hash; + unsigned sds_index; /* indexes sd_strings */ +}; + +struct stringdict_bucket { + struct array sdb_strings; /* array of stringdict_string* */ +}; + +struct stringdict { + struct array sd_hashbuckets; /* array of stringdict_bucket* */ + struct array sd_strings; /* array of char* */ + unsigned sd_totnum; +}; + +//////////////////////////////////////////////////////////// + +/* + * Fletcher's check-sum routines for AnaGram + * + * Freely adapted from routines published in Dr. Dobbs Journal + * May 1992, p. 64 + * + * Revised as portable C in June 2006. Rewritten again in 2007 for no + * good reason. + * + * The various copies of this in AG should probably be unified. + */ +static uint32_t stringdict_hash(const char *s, size_t len) { + uint16_t x1, x2, a; + size_t i; + + x1 = (uint16_t) (len >> 16); + x2 = (uint16_t) (len); + if (x1==0) { + x1++; + } + if (x2==0) { + x2++; + } + + for (i=0; i<len; i+=2) { + if (i==len-1) { + a = (unsigned char)s[i]; + /* don't run off the end of the array */ + } + else { + a = (unsigned char)s[i] + ((uint16_t)(unsigned char)s[i+1] << 8); + } + x1 += a; + if (x1 < a) { + x1++; + } + x2 += x1; + if (x2 < x1) { + x2++; + } + } + + x1 ^= 0xffff; + x2 ^= 0xffff; + return ((uint32_t)x2)*65535U + x1; +} + +static void stringdict_setbuckets(struct stringdict *sd, unsigned newbucks) { + /* + * Because the number of buckets is always a power of 2, and we use the + * low bits of the hash code to choose the bucket, and we always grow by + * doubling (not, say, quadrupling) when we grow we know that any string + * either stays put or moves to bucket i+n0, where n0 is the old number of + * buckets. Further, we can check hash&n0 to see if the string must move. + * + * (n0 == oldbucks) + */ + unsigned oldbucks, i, j; + unsigned numseen; + struct stringdict_bucket *sdb, *sdb2; + struct stringdict_string *sds; + + oldbucks = array_num(&sd->sd_hashbuckets); + assert(newbucks == oldbucks*2 || (oldbucks == 0 && sd->sd_totnum==0)); + assert((newbucks & (newbucks-1)) == 0); // power of 2 + + // grow the array + array_setsize(&sd->sd_hashbuckets, newbucks); + + // allocate empty buckets in the new half of the array + for (i=oldbucks; i<newbucks; i++) { + sdb = must_malloc(sizeof(*sdb)); + array_init(&sdb->sdb_strings); + array_set(&sd->sd_hashbuckets, i, sdb); + } + + // transfer any strings that need to be transferred + numseen = 0; + for (i=0; i<oldbucks; i++) { + sdb = array_get(&sd->sd_hashbuckets, i); + sdb2 = array_get(&sd->sd_hashbuckets, i+oldbucks); + for (j=0; j<array_num(&sdb->sdb_strings); j++) { + sds = array_get(&sdb->sdb_strings, j); + if (sds->sds_hash & oldbucks) { + array_set(&sdb->sdb_strings, j, NULL); + array_add(&sdb2->sdb_strings, sds); + } + numseen++; + } + array_nonulls(&sdb->sdb_strings); + } + assert(numseen == sd->sd_totnum); +} + +struct stringdict *stringdict_create(void) { + struct stringdict *sd; + sd = must_malloc(sizeof(*sd)); + array_init(&sd->sd_hashbuckets); + array_init(&sd->sd_strings); + sd->sd_totnum = 0; + // number of buckets is always a power of 2 + stringdict_setbuckets(sd, 16); + return sd; +} + +void stringdict_destroy(struct stringdict *sd) { + struct stringdict_bucket *sdb; + struct stringdict_string *sds; + char *str; + unsigned i, j; + + for (i=0; i<array_num(&sd->sd_hashbuckets); i++) { + sdb = array_get(&sd->sd_hashbuckets, i); + for (j=0; j<array_num(&sdb->sdb_strings); j++) { + sds = array_get(&sdb->sdb_strings, j); + free(sds); + } + array_cleanup(&sdb->sdb_strings); + } + array_cleanup(&sd->sd_hashbuckets); + + for (i=0; i<array_num(&sd->sd_strings); i++) { + str = array_get(&sd->sd_strings, i); + free(str); + } + array_cleanup(&sd->sd_strings); + + free(sd); +} + +void stringdict_reset(struct stringdict *sd) { + struct stringdict_bucket *sdb; + struct stringdict_string *sds; + char *str; + unsigned i, j; + + for (i=0; i<array_num(&sd->sd_hashbuckets); i++) { + sdb = array_get(&sd->sd_hashbuckets, i); + for (j=0; j<array_num(&sdb->sdb_strings); j++) { + sds = array_get(&sdb->sdb_strings, j); + free(sds); + } + array_setsize(&sdb->sdb_strings, 0); + } + /* could reset the bucket count here, but it seems better not to */ + + for (i=0; i<array_num(&sd->sd_strings); i++) { + str = array_get(&sd->sd_strings, i); + free(str); + } + array_setsize(&sd->sd_strings, 0); +} + +unsigned stringdict_internlen(struct stringdict *sd, + const char *s, size_t len) { + uint32_t hash; + unsigned j; + struct stringdict_bucket *sdb; + struct stringdict_string *sds; + char *str; + + hash = stringdict_hash(s, len); + // number of buckets is always a power of 2 + sdb = array_get(&sd->sd_hashbuckets, + hash & (array_num(&sd->sd_hashbuckets) - 1)); + for (j=0; j<array_num(&sdb->sdb_strings); j++) { + sds = array_get(&sdb->sdb_strings, j); + if (len == sds->sds_len && hash == sds->sds_hash) { + str = array_get(&sd->sd_strings, sds->sds_index); + if (!memcmp(s, str, len)) { + /* found it */ + return sds->sds_index; + } + } + } + + str = must_malloc(len+1); + memcpy(str, s, len); + str[len] = 0; + + sds = must_malloc(sizeof(*sds)); + sds->sds_hash = hash; + sds->sds_len = len; + sds->sds_index = array_add(&sd->sd_strings, str); + + array_add(&sdb->sdb_strings, sds); + + sd->sd_totnum++; + if (sd->sd_totnum / array_num(&sd->sd_hashbuckets) > 12) { + stringdict_setbuckets(sd, array_num(&sd->sd_hashbuckets) * 2); + } + + return sds->sds_index; +} + +unsigned stringdict_intern(struct stringdict *sd, const char *s) { + return stringdict_internlen(sd, s, strlen(s)); +} + +unsigned stringdict_count(const struct stringdict *sd) { + return sd->sd_totnum; +} + +const char *stringdict_getbynum(const struct stringdict *sd, unsigned index) { + return array_get(&sd->sd_strings, index); +} + +static struct stringdict_string * +stringdict_find(const struct stringdict *sd, const char *key, size_t keylen) { + uint32_t hash; + unsigned j; + struct stringdict_bucket *sdb; + struct stringdict_string *sds; + char *str; + + hash = stringdict_hash(key, keylen); + // number of buckets is always a power of 2 + sdb = array_get(&sd->sd_hashbuckets, + hash & (array_num(&sd->sd_hashbuckets) - 1)); + for (j=0; j<array_num(&sdb->sdb_strings); j++) { + sds = array_get(&sdb->sdb_strings, j); + if (keylen == sds->sds_len && hash == sds->sds_hash) { + str = array_get(&sd->sd_strings, sds->sds_index); + if (!memcmp(key, str, keylen)) { + /* found it */ + return sds; + } + } + } + + return NULL; +} + + +unsigned stringdict_findbyname(const struct stringdict *sd, const char *str) { + struct stringdict_string *sds; + sds = stringdict_find(sd, str, strlen(str)); + return sds ? sds->sds_index : (unsigned)-1; +} + +int stringdict_exists(const struct stringdict *sd, const char *str) { + struct stringdict_string *sds; + sds = stringdict_find(sd, str, strlen(str)); + return sds != NULL; +} +