view help2html/stringdict.c @ 21:1c9dac05d040

Add lint-style FALLTHROUGH annotations to fallthrough cases. (in the parse engine and thus the output code) Document this, because the old output causes warnings with gcc10.
author David A. Holland
date Mon, 13 Jun 2022 00:04:38 -0400
parents 13d2b8934445
children
line wrap: on
line source

#include <string.h>
#include <stdlib.h>
#include <stdint.h>

#include "must.h"
#include "array.h"
#include "stringdict.h"

struct stringdict_string {
  size_t sds_len;
  uint32_t sds_hash;
  unsigned sds_index; /* indexes sd_strings */
};

struct stringdict_bucket {
  struct array sdb_strings;	/* array of stringdict_string* */
};

struct stringdict {
  struct array sd_hashbuckets;	/* array of stringdict_bucket* */
  struct array sd_strings;	/* array of char* */
  unsigned sd_totnum;
};

////////////////////////////////////////////////////////////

/*
 * Fletcher's check-sum routines for AnaGram
 * 
 * Freely adapted from routines published in Dr. Dobbs Journal
 * May 1992, p. 64
 *
 * Revised as portable C in June 2006. Rewritten again in 2007 for no
 * good reason.
 *
 * The various copies of this in AG should probably be unified.
 */
static uint32_t stringdict_hash(const char *s, size_t len) {
   uint16_t x1, x2, a;
   size_t i;

   x1 = (uint16_t) (len >> 16);
   x2 = (uint16_t) (len);
   if (x1==0) {
      x1++;
   }
   if (x2==0) {
      x2++;
   }

   for (i=0; i<len; i+=2) {
      if (i==len-1) {
	 a = (unsigned char)s[i];
	 /* don't run off the end of the array */
      }
      else {
	 a = (unsigned char)s[i] + ((uint16_t)(unsigned char)s[i+1] << 8);
      }
      x1 += a;
      if (x1 < a) {
	 x1++;
      }
      x2 += x1;
      if (x2 < x1) {
	 x2++;
      }
   }

   x1 ^= 0xffff;
   x2 ^= 0xffff;
   return ((uint32_t)x2)*65535U + x1;
}

static void stringdict_setbuckets(struct stringdict *sd, unsigned newbucks) {
   /*
    * Because the number of buckets is always a power of 2, and we use the
    * low bits of the hash code to choose the bucket, and we always grow by
    * doubling (not, say, quadrupling) when we grow we know that any string
    * either stays put or moves to bucket i+n0, where n0 is the old number of
    * buckets. Further, we can check hash&n0 to see if the string must move.
    *
    * (n0 == oldbucks)
    */
   unsigned oldbucks, i, j;
   unsigned numseen;
   struct stringdict_bucket *sdb, *sdb2;
   struct stringdict_string *sds;

   oldbucks = array_num(&sd->sd_hashbuckets);
   assert(newbucks == oldbucks*2 || (oldbucks == 0 && sd->sd_totnum==0));
   assert((newbucks & (newbucks-1)) == 0); // power of 2

   // grow the array
   array_setsize(&sd->sd_hashbuckets, newbucks);

   // allocate empty buckets in the new half of the array
   for (i=oldbucks; i<newbucks; i++) {
      sdb = must_malloc(sizeof(*sdb));
      array_init(&sdb->sdb_strings);
      array_set(&sd->sd_hashbuckets, i, sdb);
   }

   // transfer any strings that need to be transferred
   numseen = 0;
   for (i=0; i<oldbucks; i++) {
      sdb = array_get(&sd->sd_hashbuckets, i);
      sdb2 = array_get(&sd->sd_hashbuckets, i+oldbucks);
      for (j=0; j<array_num(&sdb->sdb_strings); j++) {
	 sds = array_get(&sdb->sdb_strings, j);
	 if (sds->sds_hash & oldbucks) {
	    array_set(&sdb->sdb_strings, j, NULL);
	    array_add(&sdb2->sdb_strings, sds);
	 }
	 numseen++;
      }
      array_nonulls(&sdb->sdb_strings);
   }
   assert(numseen == sd->sd_totnum);
}

struct stringdict *stringdict_create(void) {
   struct stringdict *sd;
   sd = must_malloc(sizeof(*sd));
   array_init(&sd->sd_hashbuckets);
   array_init(&sd->sd_strings);
   sd->sd_totnum = 0;
   // number of buckets is always a power of 2
   stringdict_setbuckets(sd, 16);
   return sd;
}

void stringdict_destroy(struct stringdict *sd) {
   struct stringdict_bucket *sdb;
   struct stringdict_string *sds;
   char *str;
   unsigned i, j;

   for (i=0; i<array_num(&sd->sd_hashbuckets); i++) {
      sdb = array_get(&sd->sd_hashbuckets, i);
      for (j=0; j<array_num(&sdb->sdb_strings); j++) {
         sds = array_get(&sdb->sdb_strings, j);
         free(sds);
      }
      array_cleanup(&sdb->sdb_strings);
   }
   array_cleanup(&sd->sd_hashbuckets);

   for (i=0; i<array_num(&sd->sd_strings); i++) {
     str = array_get(&sd->sd_strings, i);
     free(str);
   }
   array_cleanup(&sd->sd_strings);

   free(sd);
}

void stringdict_reset(struct stringdict *sd) {
   struct stringdict_bucket *sdb;
   struct stringdict_string *sds;
   char *str;
   unsigned i, j;

   for (i=0; i<array_num(&sd->sd_hashbuckets); i++) {
      sdb = array_get(&sd->sd_hashbuckets, i);
      for (j=0; j<array_num(&sdb->sdb_strings); j++) {
         sds = array_get(&sdb->sdb_strings, j);
         free(sds);
      }
      array_setsize(&sdb->sdb_strings, 0);
   }
   /* could reset the bucket count here, but it seems better not to */

   for (i=0; i<array_num(&sd->sd_strings); i++) {
     str = array_get(&sd->sd_strings, i);
     free(str);
   }
   array_setsize(&sd->sd_strings, 0);
}

unsigned stringdict_internlen(struct stringdict *sd, 
			      const char *s, size_t len) {
   uint32_t hash;
   unsigned j;
   struct stringdict_bucket *sdb;
   struct stringdict_string *sds;
   char *str;

   hash = stringdict_hash(s, len);
   // number of buckets is always a power of 2
   sdb = array_get(&sd->sd_hashbuckets, 
		   hash & (array_num(&sd->sd_hashbuckets) - 1));
   for (j=0; j<array_num(&sdb->sdb_strings); j++) {
      sds = array_get(&sdb->sdb_strings, j);
      if (len == sds->sds_len && hash == sds->sds_hash) {
	str = array_get(&sd->sd_strings, sds->sds_index);
	if (!memcmp(s, str, len)) {
	  /* found it */
	  return sds->sds_index;
	}
      }
   }

   str = must_malloc(len+1);
   memcpy(str, s, len);
   str[len] = 0;
   
   sds = must_malloc(sizeof(*sds));
   sds->sds_hash = hash;
   sds->sds_len = len;
   sds->sds_index = array_add(&sd->sd_strings, str);

   array_add(&sdb->sdb_strings, sds);

   sd->sd_totnum++;
   if (sd->sd_totnum / array_num(&sd->sd_hashbuckets) > 12) {
      stringdict_setbuckets(sd, array_num(&sd->sd_hashbuckets) * 2);
   }

   return sds->sds_index;
}

unsigned stringdict_intern(struct stringdict *sd, const char *s) {
   return stringdict_internlen(sd, s, strlen(s));
}

unsigned stringdict_count(const struct stringdict *sd) {
  return sd->sd_totnum;
}

const char *stringdict_getbynum(const struct stringdict *sd, unsigned index) {
  return array_get(&sd->sd_strings, index);
}

static struct stringdict_string *
stringdict_find(const struct stringdict *sd, const char *key, size_t keylen) {
   uint32_t hash;
   unsigned j;
   struct stringdict_bucket *sdb;
   struct stringdict_string *sds;
   char *str;

   hash = stringdict_hash(key, keylen);
   // number of buckets is always a power of 2
   sdb = array_get(&sd->sd_hashbuckets, 
		   hash & (array_num(&sd->sd_hashbuckets) - 1));
   for (j=0; j<array_num(&sdb->sdb_strings); j++) {
      sds = array_get(&sdb->sdb_strings, j);
      if (keylen == sds->sds_len && hash == sds->sds_hash) {
	str = array_get(&sd->sd_strings, sds->sds_index);
	if (!memcmp(key, str, keylen)) {
	  /* found it */
	  return sds;
	}
      }
   }

   return NULL;
}


unsigned stringdict_findbyname(const struct stringdict *sd, const char *str) {
  struct stringdict_string *sds;
  sds = stringdict_find(sd, str, strlen(str));
  return sds ? sds->sds_index : (unsigned)-1;
}

int stringdict_exists(const struct stringdict *sd, const char *str) {
  struct stringdict_string *sds;
  sds = stringdict_find(sd, str, strlen(str));
  return sds != NULL;
}