Mercurial > ~dholland > hg > ag > index.cgi

#include <string.h>
#include <stdlib.h>
#include <stdint.h>

#include "must.h"
#include "array.h"
#include "stringdict.h"

struct stringdict_string {
  size_t sds_len;
  uint32_t sds_hash;
  unsigned sds_index; /* indexes sd_strings */
};

struct stringdict_bucket {
  struct array sdb_strings;	/* array of stringdict_string* */
};

struct stringdict {
  struct array sd_hashbuckets;	/* array of stringdict_bucket* */
  struct array sd_strings;	/* array of char* */
  unsigned sd_totnum;
};

////////////////////////////////////////////////////////////

/*
 * Fletcher's check-sum routines for AnaGram
 *
 * Freely adapted from routines published in Dr. Dobbs Journal
 * May 1992, p. 64
 *
 * Revised as portable C in June 2006. Rewritten again in 2007 for no
 * good reason.
 *
 * The various copies of this in AG should probably be unified.
 */
static uint32_t stringdict_hash(const char *s, size_t len) {
   uint16_t x1, x2, a;
   size_t i;

   x1 = (uint16_t) (len >> 16);
   x2 = (uint16_t) (len);
   if (x1==0) {
      x1++;
   }
   if (x2==0) {
      x2++;
   }

   for (i=0; i<len; i+=2) {
      if (i==len-1) {
	 a = (unsigned char)s[i];
	 /* don't run off the end of the array */
      }
      else {
	 a = (unsigned char)s[i] + ((uint16_t)(unsigned char)s[i+1] << 8);
      }
      x1 += a;
      if (x1 < a) {
	 x1++;
      }
      x2 += x1;
      if (x2 < x1) {
	 x2++;
      }
   }

   x1 ^= 0xffff;
   x2 ^= 0xffff;
   return ((uint32_t)x2)*65535U + x1;
}

static void stringdict_setbuckets(struct stringdict *sd, unsigned newbucks) {
   /*
    * Because the number of buckets is always a power of 2, and we use the
    * low bits of the hash code to choose the bucket, and we always grow by
    * doubling (not, say, quadrupling) when we grow we know that any string
    * either stays put or moves to bucket i+n0, where n0 is the old number of
    * buckets. Further, we can check hash&n0 to see if the string must move.
    *
    * (n0 == oldbucks)
    */
   unsigned oldbucks, i, j;
   unsigned numseen;
   struct stringdict_bucket *sdb, *sdb2;
   struct stringdict_string *sds;

   oldbucks = array_num(&sd->sd_hashbuckets);
   assert(newbucks == oldbucks*2 || (oldbucks == 0 && sd->sd_totnum==0));
   assert((newbucks & (newbucks-1)) == 0); // power of 2

   // grow the array
   array_setsize(&sd->sd_hashbuckets, newbucks);

   // allocate empty buckets in the new half of the array
   for (i=oldbucks; i<newbucks; i++) {
      sdb = must_malloc(sizeof(*sdb));
      array_init(&sdb->sdb_strings);
      array_set(&sd->sd_hashbuckets, i, sdb);
   }

   // transfer any strings that need to be transferred
   numseen = 0;
   for (i=0; i<oldbucks; i++) {
      sdb = array_get(&sd->sd_hashbuckets, i);
      sdb2 = array_get(&sd->sd_hashbuckets, i+oldbucks);
      for (j=0; j<array_num(&sdb->sdb_strings); j++) {
	 sds = array_get(&sdb->sdb_strings, j);
	 if (sds->sds_hash & oldbucks) {
	    array_set(&sdb->sdb_strings, j, NULL);
	    array_add(&sdb2->sdb_strings, sds);
	 }
	 numseen++;
      }
      array_nonulls(&sdb->sdb_strings);
   }
   assert(numseen == sd->sd_totnum);
}

struct stringdict *stringdict_create(void) {
   struct stringdict *sd;
   sd = must_malloc(sizeof(*sd));
   array_init(&sd->sd_hashbuckets);
   array_init(&sd->sd_strings);
   sd->sd_totnum = 0;
   // number of buckets is always a power of 2
   stringdict_setbuckets(sd, 16);
   return sd;
}

void stringdict_destroy(struct stringdict *sd) {
   struct stringdict_bucket *sdb;
   struct stringdict_string *sds;
   char *str;
   unsigned i, j;

   for (i=0; i<array_num(&sd->sd_hashbuckets); i++) {
      sdb = array_get(&sd->sd_hashbuckets, i);
      for (j=0; j<array_num(&sdb->sdb_strings); j++) {
         sds = array_get(&sdb->sdb_strings, j);
         free(sds);
      }
      array_cleanup(&sdb->sdb_strings);
   }
   array_cleanup(&sd->sd_hashbuckets);

   for (i=0; i<array_num(&sd->sd_strings); i++) {
     str = array_get(&sd->sd_strings, i);
     free(str);
   }
   array_cleanup(&sd->sd_strings);

   free(sd);
}

void stringdict_reset(struct stringdict *sd) {
   struct stringdict_bucket *sdb;
   struct stringdict_string *sds;
   char *str;
   unsigned i, j;

   for (i=0; i<array_num(&sd->sd_hashbuckets); i++) {
      sdb = array_get(&sd->sd_hashbuckets, i);
      for (j=0; j<array_num(&sdb->sdb_strings); j++) {
         sds = array_get(&sdb->sdb_strings, j);
         free(sds);
      }
      array_setsize(&sdb->sdb_strings, 0);
   }
   /* could reset the bucket count here, but it seems better not to */

   for (i=0; i<array_num(&sd->sd_strings); i++) {
     str = array_get(&sd->sd_strings, i);
     free(str);
   }
   array_setsize(&sd->sd_strings, 0);
}

unsigned stringdict_internlen(struct stringdict *sd,
			      const char *s, size_t len) {
   uint32_t hash;
   unsigned j;
   struct stringdict_bucket *sdb;
   struct stringdict_string *sds;
   char *str;

   hash = stringdict_hash(s, len);
   // number of buckets is always a power of 2
   sdb = array_get(&sd->sd_hashbuckets,
		   hash & (array_num(&sd->sd_hashbuckets) - 1));
   for (j=0; j<array_num(&sdb->sdb_strings); j++) {
      sds = array_get(&sdb->sdb_strings, j);
      if (len == sds->sds_len && hash == sds->sds_hash) {
	str = array_get(&sd->sd_strings, sds->sds_index);
	if (!memcmp(s, str, len)) {
	  /* found it */
	  return sds->sds_index;
	}
      }
   }

   str = must_malloc(len+1);
   memcpy(str, s, len);
   str[len] = 0;

   sds = must_malloc(sizeof(*sds));
   sds->sds_hash = hash;
   sds->sds_len = len;
   sds->sds_index = array_add(&sd->sd_strings, str);

   array_add(&sdb->sdb_strings, sds);

   sd->sd_totnum++;
   if (sd->sd_totnum / array_num(&sd->sd_hashbuckets) > 12) {
      stringdict_setbuckets(sd, array_num(&sd->sd_hashbuckets) * 2);
   }

   return sds->sds_index;
}

unsigned stringdict_intern(struct stringdict *sd, const char *s) {
   return stringdict_internlen(sd, s, strlen(s));
}

unsigned stringdict_count(const struct stringdict *sd) {
  return sd->sd_totnum;
}

const char *stringdict_getbynum(const struct stringdict *sd, unsigned index) {
  return array_get(&sd->sd_strings, index);
}

static struct stringdict_string *
stringdict_find(const struct stringdict *sd, const char *key, size_t keylen) {
   uint32_t hash;
   unsigned j;
   struct stringdict_bucket *sdb;
   struct stringdict_string *sds;
   char *str;

   hash = stringdict_hash(key, keylen);
   // number of buckets is always a power of 2
   sdb = array_get(&sd->sd_hashbuckets,
		   hash & (array_num(&sd->sd_hashbuckets) - 1));
   for (j=0; j<array_num(&sdb->sdb_strings); j++) {
      sds = array_get(&sdb->sdb_strings, j);
      if (keylen == sds->sds_len && hash == sds->sds_hash) {
	str = array_get(&sd->sd_strings, sds->sds_index);
	if (!memcmp(key, str, keylen)) {
	  /* found it */
	  return sds;
	}
      }
   }

   return NULL;
}


unsigned stringdict_findbyname(const struct stringdict *sd, const char *str) {
  struct stringdict_string *sds;
  sds = stringdict_find(sd, str, strlen(str));
  return sds ? sds->sds_index : (unsigned)-1;
}

int stringdict_exists(const struct stringdict *sd, const char *str) {
  struct stringdict_string *sds;
  sds = stringdict_find(sd, str, strlen(str));
  return sds != NULL;
}
author	David A. Holland
date	Mon, 30 May 2022 23:51:43 -0400
parents	13d2b8934445
children