AnaGram interim repo (temporary): help2html/stringdict.c comparison

comparison help2html/stringdict.c @ 0:13d2b8934445

Import AnaGram (near-)release tree into Mercurial.

author	David A. Holland
date	Sat, 22 Dec 2007 17:52:45 -0500
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:13d2b8934445
+#include <string.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include "must.h"
+#include "array.h"
+#include "stringdict.h"
+struct stringdict_string {
+size_t sds_len;
+uint32_t sds_hash;
+unsigned sds_index; /* indexes sd_strings */
+};
+struct stringdict_bucket {
+struct array sdb_strings;	/* array of stringdict_string* */
+};
+struct stringdict {
+struct array sd_hashbuckets;	/* array of stringdict_bucket* */
+struct array sd_strings;	/* array of char* */
+unsigned sd_totnum;
+};
+////////////////////////////////////////////////////////////
+/*
+* Fletcher's check-sum routines for AnaGram
+*
+* Freely adapted from routines published in Dr. Dobbs Journal
+* May 1992, p. 64
+*
+* Revised as portable C in June 2006. Rewritten again in 2007 for no
+* good reason.
+*
+* The various copies of this in AG should probably be unified.
+*/
+static uint32_t stringdict_hash(const char *s, size_t len) {
+uint16_t x1, x2, a;
+size_t i;
+x1 = (uint16_t) (len >> 16);
+x2 = (uint16_t) (len);
+if (x1==0) {
+x1++;
+}
+if (x2==0) {
+x2++;
+}
+for (i=0; i<len; i+=2) {
+if (i==len-1) {
+	 a = (unsigned char)s[i];
+	 /* don't run off the end of the array */
+}
+else {
+	 a = (unsigned char)s[i] + ((uint16_t)(unsigned char)s[i+1] << 8);
+}
+x1 += a;
+if (x1 < a) {
+	 x1++;
+}
+x2 += x1;
+if (x2 < x1) {
+	 x2++;
+}
+}
+x1 ^= 0xffff;
+x2 ^= 0xffff;
+return ((uint32_t)x2)*65535U + x1;
+}
+static void stringdict_setbuckets(struct stringdict *sd, unsigned newbucks) {
+/*
+* Because the number of buckets is always a power of 2, and we use the
+* low bits of the hash code to choose the bucket, and we always grow by
+* doubling (not, say, quadrupling) when we grow we know that any string
+* either stays put or moves to bucket i+n0, where n0 is the old number of
+* buckets. Further, we can check hash&n0 to see if the string must move.
+*
+* (n0 == oldbucks)
+*/
+unsigned oldbucks, i, j;
+unsigned numseen;
+struct stringdict_bucket *sdb, *sdb2;
+struct stringdict_string *sds;
+oldbucks = array_num(&sd->sd_hashbuckets);
+assert(newbucks == oldbucks*2 || (oldbucks == 0 && sd->sd_totnum==0));
+assert((newbucks & (newbucks-1)) == 0); // power of 2
+// grow the array
+array_setsize(&sd->sd_hashbuckets, newbucks);
+// allocate empty buckets in the new half of the array
+for (i=oldbucks; i<newbucks; i++) {
+sdb = must_malloc(sizeof(*sdb));
+array_init(&sdb->sdb_strings);
+array_set(&sd->sd_hashbuckets, i, sdb);
+}
+// transfer any strings that need to be transferred
+numseen = 0;
+for (i=0; i<oldbucks; i++) {
+sdb = array_get(&sd->sd_hashbuckets, i);
+sdb2 = array_get(&sd->sd_hashbuckets, i+oldbucks);
+for (j=0; j<array_num(&sdb->sdb_strings); j++) {
+	 sds = array_get(&sdb->sdb_strings, j);
+	 if (sds->sds_hash & oldbucks) {
+	    array_set(&sdb->sdb_strings, j, NULL);
+	    array_add(&sdb2->sdb_strings, sds);
+	 }
+	 numseen++;
+}
+array_nonulls(&sdb->sdb_strings);
+}
+assert(numseen == sd->sd_totnum);
+}
+struct stringdict *stringdict_create(void) {
+struct stringdict *sd;
+sd = must_malloc(sizeof(*sd));
+array_init(&sd->sd_hashbuckets);
+array_init(&sd->sd_strings);
+sd->sd_totnum = 0;
+// number of buckets is always a power of 2
+stringdict_setbuckets(sd, 16);
+return sd;
+}
+void stringdict_destroy(struct stringdict *sd) {
+struct stringdict_bucket *sdb;
+struct stringdict_string *sds;
+char *str;
+unsigned i, j;
+for (i=0; i<array_num(&sd->sd_hashbuckets); i++) {
+sdb = array_get(&sd->sd_hashbuckets, i);
+for (j=0; j<array_num(&sdb->sdb_strings); j++) {
+sds = array_get(&sdb->sdb_strings, j);
+free(sds);
+}
+array_cleanup(&sdb->sdb_strings);
+}
+array_cleanup(&sd->sd_hashbuckets);
+for (i=0; i<array_num(&sd->sd_strings); i++) {
+str = array_get(&sd->sd_strings, i);
+free(str);
+}
+array_cleanup(&sd->sd_strings);
+free(sd);
+}
+void stringdict_reset(struct stringdict *sd) {
+struct stringdict_bucket *sdb;
+struct stringdict_string *sds;
+char *str;
+unsigned i, j;
+for (i=0; i<array_num(&sd->sd_hashbuckets); i++) {
+sdb = array_get(&sd->sd_hashbuckets, i);
+for (j=0; j<array_num(&sdb->sdb_strings); j++) {
+sds = array_get(&sdb->sdb_strings, j);
+free(sds);
+}
+array_setsize(&sdb->sdb_strings, 0);
+}
+/* could reset the bucket count here, but it seems better not to */
+for (i=0; i<array_num(&sd->sd_strings); i++) {
+str = array_get(&sd->sd_strings, i);
+free(str);
+}
+array_setsize(&sd->sd_strings, 0);
+}
+unsigned stringdict_internlen(struct stringdict *sd,
+			      const char *s, size_t len) {
+uint32_t hash;
+unsigned j;
+struct stringdict_bucket *sdb;
+struct stringdict_string *sds;
+char *str;
+hash = stringdict_hash(s, len);
+// number of buckets is always a power of 2
+sdb = array_get(&sd->sd_hashbuckets,
+		   hash & (array_num(&sd->sd_hashbuckets) - 1));
+for (j=0; j<array_num(&sdb->sdb_strings); j++) {
+sds = array_get(&sdb->sdb_strings, j);
+if (len == sds->sds_len && hash == sds->sds_hash) {
+	str = array_get(&sd->sd_strings, sds->sds_index);
+	if (!memcmp(s, str, len)) {
+	  /* found it */
+	  return sds->sds_index;
+	}
+}
+}
+str = must_malloc(len+1);
+memcpy(str, s, len);
+str[len] = 0;
+sds = must_malloc(sizeof(*sds));
+sds->sds_hash = hash;
+sds->sds_len = len;
+sds->sds_index = array_add(&sd->sd_strings, str);
+array_add(&sdb->sdb_strings, sds);
+sd->sd_totnum++;
+if (sd->sd_totnum / array_num(&sd->sd_hashbuckets) > 12) {
+stringdict_setbuckets(sd, array_num(&sd->sd_hashbuckets) * 2);
+}
+return sds->sds_index;
+}
+unsigned stringdict_intern(struct stringdict *sd, const char *s) {
+return stringdict_internlen(sd, s, strlen(s));
+}
+unsigned stringdict_count(const struct stringdict *sd) {
+return sd->sd_totnum;
+}
+const char *stringdict_getbynum(const struct stringdict *sd, unsigned index) {
+return array_get(&sd->sd_strings, index);
+}
+static struct stringdict_string *
+stringdict_find(const struct stringdict *sd, const char *key, size_t keylen) {
+uint32_t hash;
+unsigned j;
+struct stringdict_bucket *sdb;
+struct stringdict_string *sds;
+char *str;
+hash = stringdict_hash(key, keylen);
+// number of buckets is always a power of 2
+sdb = array_get(&sd->sd_hashbuckets,
+		   hash & (array_num(&sd->sd_hashbuckets) - 1));
+for (j=0; j<array_num(&sdb->sdb_strings); j++) {
+sds = array_get(&sdb->sdb_strings, j);
+if (keylen == sds->sds_len && hash == sds->sds_hash) {
+	str = array_get(&sd->sd_strings, sds->sds_index);
+	if (!memcmp(key, str, keylen)) {
+	  /* found it */
+	  return sds;
+	}
+}
+}
+return NULL;
+}
+unsigned stringdict_findbyname(const struct stringdict *sd, const char *str) {
+struct stringdict_string *sds;
+sds = stringdict_find(sd, str, strlen(str));
+return sds ? sds->sds_index : (unsigned)-1;
+}
+int stringdict_exists(const struct stringdict *sd, const char *str) {
+struct stringdict_string *sds;
+sds = stringdict_find(sd, str, strlen(str));
+return sds != NULL;
+}

Mercurial > ~dholland > hg > ag > index.cgi

comparison help2html/stringdict.c @ 0:13d2b8934445