comparison help2html/stringdict.c @ 0:13d2b8934445

Import AnaGram (near-)release tree into Mercurial.
author David A. Holland
date Sat, 22 Dec 2007 17:52:45 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:13d2b8934445
1 #include <string.h>
2 #include <stdlib.h>
3 #include <stdint.h>
4
5 #include "must.h"
6 #include "array.h"
7 #include "stringdict.h"
8
9 struct stringdict_string {
10 size_t sds_len;
11 uint32_t sds_hash;
12 unsigned sds_index; /* indexes sd_strings */
13 };
14
15 struct stringdict_bucket {
16 struct array sdb_strings; /* array of stringdict_string* */
17 };
18
19 struct stringdict {
20 struct array sd_hashbuckets; /* array of stringdict_bucket* */
21 struct array sd_strings; /* array of char* */
22 unsigned sd_totnum;
23 };
24
25 ////////////////////////////////////////////////////////////
26
27 /*
28 * Fletcher's check-sum routines for AnaGram
29 *
30 * Freely adapted from routines published in Dr. Dobbs Journal
31 * May 1992, p. 64
32 *
33 * Revised as portable C in June 2006. Rewritten again in 2007 for no
34 * good reason.
35 *
36 * The various copies of this in AG should probably be unified.
37 */
38 static uint32_t stringdict_hash(const char *s, size_t len) {
39 uint16_t x1, x2, a;
40 size_t i;
41
42 x1 = (uint16_t) (len >> 16);
43 x2 = (uint16_t) (len);
44 if (x1==0) {
45 x1++;
46 }
47 if (x2==0) {
48 x2++;
49 }
50
51 for (i=0; i<len; i+=2) {
52 if (i==len-1) {
53 a = (unsigned char)s[i];
54 /* don't run off the end of the array */
55 }
56 else {
57 a = (unsigned char)s[i] + ((uint16_t)(unsigned char)s[i+1] << 8);
58 }
59 x1 += a;
60 if (x1 < a) {
61 x1++;
62 }
63 x2 += x1;
64 if (x2 < x1) {
65 x2++;
66 }
67 }
68
69 x1 ^= 0xffff;
70 x2 ^= 0xffff;
71 return ((uint32_t)x2)*65535U + x1;
72 }
73
74 static void stringdict_setbuckets(struct stringdict *sd, unsigned newbucks) {
75 /*
76 * Because the number of buckets is always a power of 2, and we use the
77 * low bits of the hash code to choose the bucket, and we always grow by
78 * doubling (not, say, quadrupling) when we grow we know that any string
79 * either stays put or moves to bucket i+n0, where n0 is the old number of
80 * buckets. Further, we can check hash&n0 to see if the string must move.
81 *
82 * (n0 == oldbucks)
83 */
84 unsigned oldbucks, i, j;
85 unsigned numseen;
86 struct stringdict_bucket *sdb, *sdb2;
87 struct stringdict_string *sds;
88
89 oldbucks = array_num(&sd->sd_hashbuckets);
90 assert(newbucks == oldbucks*2 || (oldbucks == 0 && sd->sd_totnum==0));
91 assert((newbucks & (newbucks-1)) == 0); // power of 2
92
93 // grow the array
94 array_setsize(&sd->sd_hashbuckets, newbucks);
95
96 // allocate empty buckets in the new half of the array
97 for (i=oldbucks; i<newbucks; i++) {
98 sdb = must_malloc(sizeof(*sdb));
99 array_init(&sdb->sdb_strings);
100 array_set(&sd->sd_hashbuckets, i, sdb);
101 }
102
103 // transfer any strings that need to be transferred
104 numseen = 0;
105 for (i=0; i<oldbucks; i++) {
106 sdb = array_get(&sd->sd_hashbuckets, i);
107 sdb2 = array_get(&sd->sd_hashbuckets, i+oldbucks);
108 for (j=0; j<array_num(&sdb->sdb_strings); j++) {
109 sds = array_get(&sdb->sdb_strings, j);
110 if (sds->sds_hash & oldbucks) {
111 array_set(&sdb->sdb_strings, j, NULL);
112 array_add(&sdb2->sdb_strings, sds);
113 }
114 numseen++;
115 }
116 array_nonulls(&sdb->sdb_strings);
117 }
118 assert(numseen == sd->sd_totnum);
119 }
120
121 struct stringdict *stringdict_create(void) {
122 struct stringdict *sd;
123 sd = must_malloc(sizeof(*sd));
124 array_init(&sd->sd_hashbuckets);
125 array_init(&sd->sd_strings);
126 sd->sd_totnum = 0;
127 // number of buckets is always a power of 2
128 stringdict_setbuckets(sd, 16);
129 return sd;
130 }
131
132 void stringdict_destroy(struct stringdict *sd) {
133 struct stringdict_bucket *sdb;
134 struct stringdict_string *sds;
135 char *str;
136 unsigned i, j;
137
138 for (i=0; i<array_num(&sd->sd_hashbuckets); i++) {
139 sdb = array_get(&sd->sd_hashbuckets, i);
140 for (j=0; j<array_num(&sdb->sdb_strings); j++) {
141 sds = array_get(&sdb->sdb_strings, j);
142 free(sds);
143 }
144 array_cleanup(&sdb->sdb_strings);
145 }
146 array_cleanup(&sd->sd_hashbuckets);
147
148 for (i=0; i<array_num(&sd->sd_strings); i++) {
149 str = array_get(&sd->sd_strings, i);
150 free(str);
151 }
152 array_cleanup(&sd->sd_strings);
153
154 free(sd);
155 }
156
157 void stringdict_reset(struct stringdict *sd) {
158 struct stringdict_bucket *sdb;
159 struct stringdict_string *sds;
160 char *str;
161 unsigned i, j;
162
163 for (i=0; i<array_num(&sd->sd_hashbuckets); i++) {
164 sdb = array_get(&sd->sd_hashbuckets, i);
165 for (j=0; j<array_num(&sdb->sdb_strings); j++) {
166 sds = array_get(&sdb->sdb_strings, j);
167 free(sds);
168 }
169 array_setsize(&sdb->sdb_strings, 0);
170 }
171 /* could reset the bucket count here, but it seems better not to */
172
173 for (i=0; i<array_num(&sd->sd_strings); i++) {
174 str = array_get(&sd->sd_strings, i);
175 free(str);
176 }
177 array_setsize(&sd->sd_strings, 0);
178 }
179
180 unsigned stringdict_internlen(struct stringdict *sd,
181 const char *s, size_t len) {
182 uint32_t hash;
183 unsigned j;
184 struct stringdict_bucket *sdb;
185 struct stringdict_string *sds;
186 char *str;
187
188 hash = stringdict_hash(s, len);
189 // number of buckets is always a power of 2
190 sdb = array_get(&sd->sd_hashbuckets,
191 hash & (array_num(&sd->sd_hashbuckets) - 1));
192 for (j=0; j<array_num(&sdb->sdb_strings); j++) {
193 sds = array_get(&sdb->sdb_strings, j);
194 if (len == sds->sds_len && hash == sds->sds_hash) {
195 str = array_get(&sd->sd_strings, sds->sds_index);
196 if (!memcmp(s, str, len)) {
197 /* found it */
198 return sds->sds_index;
199 }
200 }
201 }
202
203 str = must_malloc(len+1);
204 memcpy(str, s, len);
205 str[len] = 0;
206
207 sds = must_malloc(sizeof(*sds));
208 sds->sds_hash = hash;
209 sds->sds_len = len;
210 sds->sds_index = array_add(&sd->sd_strings, str);
211
212 array_add(&sdb->sdb_strings, sds);
213
214 sd->sd_totnum++;
215 if (sd->sd_totnum / array_num(&sd->sd_hashbuckets) > 12) {
216 stringdict_setbuckets(sd, array_num(&sd->sd_hashbuckets) * 2);
217 }
218
219 return sds->sds_index;
220 }
221
222 unsigned stringdict_intern(struct stringdict *sd, const char *s) {
223 return stringdict_internlen(sd, s, strlen(s));
224 }
225
226 unsigned stringdict_count(const struct stringdict *sd) {
227 return sd->sd_totnum;
228 }
229
230 const char *stringdict_getbynum(const struct stringdict *sd, unsigned index) {
231 return array_get(&sd->sd_strings, index);
232 }
233
234 static struct stringdict_string *
235 stringdict_find(const struct stringdict *sd, const char *key, size_t keylen) {
236 uint32_t hash;
237 unsigned j;
238 struct stringdict_bucket *sdb;
239 struct stringdict_string *sds;
240 char *str;
241
242 hash = stringdict_hash(key, keylen);
243 // number of buckets is always a power of 2
244 sdb = array_get(&sd->sd_hashbuckets,
245 hash & (array_num(&sd->sd_hashbuckets) - 1));
246 for (j=0; j<array_num(&sdb->sdb_strings); j++) {
247 sds = array_get(&sdb->sdb_strings, j);
248 if (keylen == sds->sds_len && hash == sds->sds_hash) {
249 str = array_get(&sd->sd_strings, sds->sds_index);
250 if (!memcmp(key, str, keylen)) {
251 /* found it */
252 return sds;
253 }
254 }
255 }
256
257 return NULL;
258 }
259
260
261 unsigned stringdict_findbyname(const struct stringdict *sd, const char *str) {
262 struct stringdict_string *sds;
263 sds = stringdict_find(sd, str, strlen(str));
264 return sds ? sds->sds_index : (unsigned)-1;
265 }
266
267 int stringdict_exists(const struct stringdict *sd, const char *str) {
268 struct stringdict_string *sds;
269 sds = stringdict_find(sd, str, strlen(str));
270 return sds != NULL;
271 }
272