diff help2html/stringdict.c @ 0:13d2b8934445

Import AnaGram (near-)release tree into Mercurial.
author David A. Holland
date Sat, 22 Dec 2007 17:52:45 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/help2html/stringdict.c	Sat Dec 22 17:52:45 2007 -0500
@@ -0,0 +1,272 @@
+#include <string.h>
+#include <stdlib.h>
+#include <stdint.h>
+
+#include "must.h"
+#include "array.h"
+#include "stringdict.h"
+
+struct stringdict_string {
+  size_t sds_len;
+  uint32_t sds_hash;
+  unsigned sds_index; /* indexes sd_strings */
+};
+
+struct stringdict_bucket {
+  struct array sdb_strings;	/* array of stringdict_string* */
+};
+
+struct stringdict {
+  struct array sd_hashbuckets;	/* array of stringdict_bucket* */
+  struct array sd_strings;	/* array of char* */
+  unsigned sd_totnum;
+};
+
+////////////////////////////////////////////////////////////
+
+/*
+ * Fletcher's check-sum routines for AnaGram
+ * 
+ * Freely adapted from routines published in Dr. Dobbs Journal
+ * May 1992, p. 64
+ *
+ * Revised as portable C in June 2006. Rewritten again in 2007 for no
+ * good reason.
+ *
+ * The various copies of this in AG should probably be unified.
+ */
+static uint32_t stringdict_hash(const char *s, size_t len) {
+   uint16_t x1, x2, a;
+   size_t i;
+
+   x1 = (uint16_t) (len >> 16);
+   x2 = (uint16_t) (len);
+   if (x1==0) {
+      x1++;
+   }
+   if (x2==0) {
+      x2++;
+   }
+
+   for (i=0; i<len; i+=2) {
+      if (i==len-1) {
+	 a = (unsigned char)s[i];
+	 /* don't run off the end of the array */
+      }
+      else {
+	 a = (unsigned char)s[i] + ((uint16_t)(unsigned char)s[i+1] << 8);
+      }
+      x1 += a;
+      if (x1 < a) {
+	 x1++;
+      }
+      x2 += x1;
+      if (x2 < x1) {
+	 x2++;
+      }
+   }
+
+   x1 ^= 0xffff;
+   x2 ^= 0xffff;
+   return ((uint32_t)x2)*65535U + x1;
+}
+
+static void stringdict_setbuckets(struct stringdict *sd, unsigned newbucks) {
+   /*
+    * Because the number of buckets is always a power of 2, and we use the
+    * low bits of the hash code to choose the bucket, and we always grow by
+    * doubling (not, say, quadrupling) when we grow we know that any string
+    * either stays put or moves to bucket i+n0, where n0 is the old number of
+    * buckets. Further, we can check hash&n0 to see if the string must move.
+    *
+    * (n0 == oldbucks)
+    */
+   unsigned oldbucks, i, j;
+   unsigned numseen;
+   struct stringdict_bucket *sdb, *sdb2;
+   struct stringdict_string *sds;
+
+   oldbucks = array_num(&sd->sd_hashbuckets);
+   assert(newbucks == oldbucks*2 || (oldbucks == 0 && sd->sd_totnum==0));
+   assert((newbucks & (newbucks-1)) == 0); // power of 2
+
+   // grow the array
+   array_setsize(&sd->sd_hashbuckets, newbucks);
+
+   // allocate empty buckets in the new half of the array
+   for (i=oldbucks; i<newbucks; i++) {
+      sdb = must_malloc(sizeof(*sdb));
+      array_init(&sdb->sdb_strings);
+      array_set(&sd->sd_hashbuckets, i, sdb);
+   }
+
+   // transfer any strings that need to be transferred
+   numseen = 0;
+   for (i=0; i<oldbucks; i++) {
+      sdb = array_get(&sd->sd_hashbuckets, i);
+      sdb2 = array_get(&sd->sd_hashbuckets, i+oldbucks);
+      for (j=0; j<array_num(&sdb->sdb_strings); j++) {
+	 sds = array_get(&sdb->sdb_strings, j);
+	 if (sds->sds_hash & oldbucks) {
+	    array_set(&sdb->sdb_strings, j, NULL);
+	    array_add(&sdb2->sdb_strings, sds);
+	 }
+	 numseen++;
+      }
+      array_nonulls(&sdb->sdb_strings);
+   }
+   assert(numseen == sd->sd_totnum);
+}
+
+struct stringdict *stringdict_create(void) {
+   struct stringdict *sd;
+   sd = must_malloc(sizeof(*sd));
+   array_init(&sd->sd_hashbuckets);
+   array_init(&sd->sd_strings);
+   sd->sd_totnum = 0;
+   // number of buckets is always a power of 2
+   stringdict_setbuckets(sd, 16);
+   return sd;
+}
+
+void stringdict_destroy(struct stringdict *sd) {
+   struct stringdict_bucket *sdb;
+   struct stringdict_string *sds;
+   char *str;
+   unsigned i, j;
+
+   for (i=0; i<array_num(&sd->sd_hashbuckets); i++) {
+      sdb = array_get(&sd->sd_hashbuckets, i);
+      for (j=0; j<array_num(&sdb->sdb_strings); j++) {
+         sds = array_get(&sdb->sdb_strings, j);
+         free(sds);
+      }
+      array_cleanup(&sdb->sdb_strings);
+   }
+   array_cleanup(&sd->sd_hashbuckets);
+
+   for (i=0; i<array_num(&sd->sd_strings); i++) {
+     str = array_get(&sd->sd_strings, i);
+     free(str);
+   }
+   array_cleanup(&sd->sd_strings);
+
+   free(sd);
+}
+
+void stringdict_reset(struct stringdict *sd) {
+   struct stringdict_bucket *sdb;
+   struct stringdict_string *sds;
+   char *str;
+   unsigned i, j;
+
+   for (i=0; i<array_num(&sd->sd_hashbuckets); i++) {
+      sdb = array_get(&sd->sd_hashbuckets, i);
+      for (j=0; j<array_num(&sdb->sdb_strings); j++) {
+         sds = array_get(&sdb->sdb_strings, j);
+         free(sds);
+      }
+      array_setsize(&sdb->sdb_strings, 0);
+   }
+   /* could reset the bucket count here, but it seems better not to */
+
+   for (i=0; i<array_num(&sd->sd_strings); i++) {
+     str = array_get(&sd->sd_strings, i);
+     free(str);
+   }
+   array_setsize(&sd->sd_strings, 0);
+}
+
+unsigned stringdict_internlen(struct stringdict *sd, 
+			      const char *s, size_t len) {
+   uint32_t hash;
+   unsigned j;
+   struct stringdict_bucket *sdb;
+   struct stringdict_string *sds;
+   char *str;
+
+   hash = stringdict_hash(s, len);
+   // number of buckets is always a power of 2
+   sdb = array_get(&sd->sd_hashbuckets, 
+		   hash & (array_num(&sd->sd_hashbuckets) - 1));
+   for (j=0; j<array_num(&sdb->sdb_strings); j++) {
+      sds = array_get(&sdb->sdb_strings, j);
+      if (len == sds->sds_len && hash == sds->sds_hash) {
+	str = array_get(&sd->sd_strings, sds->sds_index);
+	if (!memcmp(s, str, len)) {
+	  /* found it */
+	  return sds->sds_index;
+	}
+      }
+   }
+
+   str = must_malloc(len+1);
+   memcpy(str, s, len);
+   str[len] = 0;
+   
+   sds = must_malloc(sizeof(*sds));
+   sds->sds_hash = hash;
+   sds->sds_len = len;
+   sds->sds_index = array_add(&sd->sd_strings, str);
+
+   array_add(&sdb->sdb_strings, sds);
+
+   sd->sd_totnum++;
+   if (sd->sd_totnum / array_num(&sd->sd_hashbuckets) > 12) {
+      stringdict_setbuckets(sd, array_num(&sd->sd_hashbuckets) * 2);
+   }
+
+   return sds->sds_index;
+}
+
+unsigned stringdict_intern(struct stringdict *sd, const char *s) {
+   return stringdict_internlen(sd, s, strlen(s));
+}
+
+unsigned stringdict_count(const struct stringdict *sd) {
+  return sd->sd_totnum;
+}
+
+const char *stringdict_getbynum(const struct stringdict *sd, unsigned index) {
+  return array_get(&sd->sd_strings, index);
+}
+
+static struct stringdict_string *
+stringdict_find(const struct stringdict *sd, const char *key, size_t keylen) {
+   uint32_t hash;
+   unsigned j;
+   struct stringdict_bucket *sdb;
+   struct stringdict_string *sds;
+   char *str;
+
+   hash = stringdict_hash(key, keylen);
+   // number of buckets is always a power of 2
+   sdb = array_get(&sd->sd_hashbuckets, 
+		   hash & (array_num(&sd->sd_hashbuckets) - 1));
+   for (j=0; j<array_num(&sdb->sdb_strings); j++) {
+      sds = array_get(&sdb->sdb_strings, j);
+      if (keylen == sds->sds_len && hash == sds->sds_hash) {
+	str = array_get(&sd->sd_strings, sds->sds_index);
+	if (!memcmp(key, str, keylen)) {
+	  /* found it */
+	  return sds;
+	}
+      }
+   }
+
+   return NULL;
+}
+
+
+unsigned stringdict_findbyname(const struct stringdict *sd, const char *str) {
+  struct stringdict_string *sds;
+  sds = stringdict_find(sd, str, strlen(str));
+  return sds ? sds->sds_index : (unsigned)-1;
+}
+
+int stringdict_exists(const struct stringdict *sd, const char *str) {
+  struct stringdict_string *sds;
+  sds = stringdict_find(sd, str, strlen(str));
+  return sds != NULL;
+}
+