Mercurial > ~dholland > hg > ag > index.cgi
comparison help2html/stringdict.c @ 0:13d2b8934445
Import AnaGram (near-)release tree into Mercurial.
author | David A. Holland |
---|---|
date | Sat, 22 Dec 2007 17:52:45 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:13d2b8934445 |
---|---|
1 #include <string.h> | |
2 #include <stdlib.h> | |
3 #include <stdint.h> | |
4 | |
5 #include "must.h" | |
6 #include "array.h" | |
7 #include "stringdict.h" | |
8 | |
9 struct stringdict_string { | |
10 size_t sds_len; | |
11 uint32_t sds_hash; | |
12 unsigned sds_index; /* indexes sd_strings */ | |
13 }; | |
14 | |
15 struct stringdict_bucket { | |
16 struct array sdb_strings; /* array of stringdict_string* */ | |
17 }; | |
18 | |
19 struct stringdict { | |
20 struct array sd_hashbuckets; /* array of stringdict_bucket* */ | |
21 struct array sd_strings; /* array of char* */ | |
22 unsigned sd_totnum; | |
23 }; | |
24 | |
25 //////////////////////////////////////////////////////////// | |
26 | |
27 /* | |
28 * Fletcher's check-sum routines for AnaGram | |
29 * | |
30 * Freely adapted from routines published in Dr. Dobbs Journal | |
31 * May 1992, p. 64 | |
32 * | |
33 * Revised as portable C in June 2006. Rewritten again in 2007 for no | |
34 * good reason. | |
35 * | |
36 * The various copies of this in AG should probably be unified. | |
37 */ | |
38 static uint32_t stringdict_hash(const char *s, size_t len) { | |
39 uint16_t x1, x2, a; | |
40 size_t i; | |
41 | |
42 x1 = (uint16_t) (len >> 16); | |
43 x2 = (uint16_t) (len); | |
44 if (x1==0) { | |
45 x1++; | |
46 } | |
47 if (x2==0) { | |
48 x2++; | |
49 } | |
50 | |
51 for (i=0; i<len; i+=2) { | |
52 if (i==len-1) { | |
53 a = (unsigned char)s[i]; | |
54 /* don't run off the end of the array */ | |
55 } | |
56 else { | |
57 a = (unsigned char)s[i] + ((uint16_t)(unsigned char)s[i+1] << 8); | |
58 } | |
59 x1 += a; | |
60 if (x1 < a) { | |
61 x1++; | |
62 } | |
63 x2 += x1; | |
64 if (x2 < x1) { | |
65 x2++; | |
66 } | |
67 } | |
68 | |
69 x1 ^= 0xffff; | |
70 x2 ^= 0xffff; | |
71 return ((uint32_t)x2)*65535U + x1; | |
72 } | |
73 | |
74 static void stringdict_setbuckets(struct stringdict *sd, unsigned newbucks) { | |
75 /* | |
76 * Because the number of buckets is always a power of 2, and we use the | |
77 * low bits of the hash code to choose the bucket, and we always grow by | |
78 * doubling (not, say, quadrupling) when we grow we know that any string | |
79 * either stays put or moves to bucket i+n0, where n0 is the old number of | |
80 * buckets. Further, we can check hash&n0 to see if the string must move. | |
81 * | |
82 * (n0 == oldbucks) | |
83 */ | |
84 unsigned oldbucks, i, j; | |
85 unsigned numseen; | |
86 struct stringdict_bucket *sdb, *sdb2; | |
87 struct stringdict_string *sds; | |
88 | |
89 oldbucks = array_num(&sd->sd_hashbuckets); | |
90 assert(newbucks == oldbucks*2 || (oldbucks == 0 && sd->sd_totnum==0)); | |
91 assert((newbucks & (newbucks-1)) == 0); // power of 2 | |
92 | |
93 // grow the array | |
94 array_setsize(&sd->sd_hashbuckets, newbucks); | |
95 | |
96 // allocate empty buckets in the new half of the array | |
97 for (i=oldbucks; i<newbucks; i++) { | |
98 sdb = must_malloc(sizeof(*sdb)); | |
99 array_init(&sdb->sdb_strings); | |
100 array_set(&sd->sd_hashbuckets, i, sdb); | |
101 } | |
102 | |
103 // transfer any strings that need to be transferred | |
104 numseen = 0; | |
105 for (i=0; i<oldbucks; i++) { | |
106 sdb = array_get(&sd->sd_hashbuckets, i); | |
107 sdb2 = array_get(&sd->sd_hashbuckets, i+oldbucks); | |
108 for (j=0; j<array_num(&sdb->sdb_strings); j++) { | |
109 sds = array_get(&sdb->sdb_strings, j); | |
110 if (sds->sds_hash & oldbucks) { | |
111 array_set(&sdb->sdb_strings, j, NULL); | |
112 array_add(&sdb2->sdb_strings, sds); | |
113 } | |
114 numseen++; | |
115 } | |
116 array_nonulls(&sdb->sdb_strings); | |
117 } | |
118 assert(numseen == sd->sd_totnum); | |
119 } | |
120 | |
121 struct stringdict *stringdict_create(void) { | |
122 struct stringdict *sd; | |
123 sd = must_malloc(sizeof(*sd)); | |
124 array_init(&sd->sd_hashbuckets); | |
125 array_init(&sd->sd_strings); | |
126 sd->sd_totnum = 0; | |
127 // number of buckets is always a power of 2 | |
128 stringdict_setbuckets(sd, 16); | |
129 return sd; | |
130 } | |
131 | |
132 void stringdict_destroy(struct stringdict *sd) { | |
133 struct stringdict_bucket *sdb; | |
134 struct stringdict_string *sds; | |
135 char *str; | |
136 unsigned i, j; | |
137 | |
138 for (i=0; i<array_num(&sd->sd_hashbuckets); i++) { | |
139 sdb = array_get(&sd->sd_hashbuckets, i); | |
140 for (j=0; j<array_num(&sdb->sdb_strings); j++) { | |
141 sds = array_get(&sdb->sdb_strings, j); | |
142 free(sds); | |
143 } | |
144 array_cleanup(&sdb->sdb_strings); | |
145 } | |
146 array_cleanup(&sd->sd_hashbuckets); | |
147 | |
148 for (i=0; i<array_num(&sd->sd_strings); i++) { | |
149 str = array_get(&sd->sd_strings, i); | |
150 free(str); | |
151 } | |
152 array_cleanup(&sd->sd_strings); | |
153 | |
154 free(sd); | |
155 } | |
156 | |
157 void stringdict_reset(struct stringdict *sd) { | |
158 struct stringdict_bucket *sdb; | |
159 struct stringdict_string *sds; | |
160 char *str; | |
161 unsigned i, j; | |
162 | |
163 for (i=0; i<array_num(&sd->sd_hashbuckets); i++) { | |
164 sdb = array_get(&sd->sd_hashbuckets, i); | |
165 for (j=0; j<array_num(&sdb->sdb_strings); j++) { | |
166 sds = array_get(&sdb->sdb_strings, j); | |
167 free(sds); | |
168 } | |
169 array_setsize(&sdb->sdb_strings, 0); | |
170 } | |
171 /* could reset the bucket count here, but it seems better not to */ | |
172 | |
173 for (i=0; i<array_num(&sd->sd_strings); i++) { | |
174 str = array_get(&sd->sd_strings, i); | |
175 free(str); | |
176 } | |
177 array_setsize(&sd->sd_strings, 0); | |
178 } | |
179 | |
180 unsigned stringdict_internlen(struct stringdict *sd, | |
181 const char *s, size_t len) { | |
182 uint32_t hash; | |
183 unsigned j; | |
184 struct stringdict_bucket *sdb; | |
185 struct stringdict_string *sds; | |
186 char *str; | |
187 | |
188 hash = stringdict_hash(s, len); | |
189 // number of buckets is always a power of 2 | |
190 sdb = array_get(&sd->sd_hashbuckets, | |
191 hash & (array_num(&sd->sd_hashbuckets) - 1)); | |
192 for (j=0; j<array_num(&sdb->sdb_strings); j++) { | |
193 sds = array_get(&sdb->sdb_strings, j); | |
194 if (len == sds->sds_len && hash == sds->sds_hash) { | |
195 str = array_get(&sd->sd_strings, sds->sds_index); | |
196 if (!memcmp(s, str, len)) { | |
197 /* found it */ | |
198 return sds->sds_index; | |
199 } | |
200 } | |
201 } | |
202 | |
203 str = must_malloc(len+1); | |
204 memcpy(str, s, len); | |
205 str[len] = 0; | |
206 | |
207 sds = must_malloc(sizeof(*sds)); | |
208 sds->sds_hash = hash; | |
209 sds->sds_len = len; | |
210 sds->sds_index = array_add(&sd->sd_strings, str); | |
211 | |
212 array_add(&sdb->sdb_strings, sds); | |
213 | |
214 sd->sd_totnum++; | |
215 if (sd->sd_totnum / array_num(&sd->sd_hashbuckets) > 12) { | |
216 stringdict_setbuckets(sd, array_num(&sd->sd_hashbuckets) * 2); | |
217 } | |
218 | |
219 return sds->sds_index; | |
220 } | |
221 | |
222 unsigned stringdict_intern(struct stringdict *sd, const char *s) { | |
223 return stringdict_internlen(sd, s, strlen(s)); | |
224 } | |
225 | |
226 unsigned stringdict_count(const struct stringdict *sd) { | |
227 return sd->sd_totnum; | |
228 } | |
229 | |
230 const char *stringdict_getbynum(const struct stringdict *sd, unsigned index) { | |
231 return array_get(&sd->sd_strings, index); | |
232 } | |
233 | |
234 static struct stringdict_string * | |
235 stringdict_find(const struct stringdict *sd, const char *key, size_t keylen) { | |
236 uint32_t hash; | |
237 unsigned j; | |
238 struct stringdict_bucket *sdb; | |
239 struct stringdict_string *sds; | |
240 char *str; | |
241 | |
242 hash = stringdict_hash(key, keylen); | |
243 // number of buckets is always a power of 2 | |
244 sdb = array_get(&sd->sd_hashbuckets, | |
245 hash & (array_num(&sd->sd_hashbuckets) - 1)); | |
246 for (j=0; j<array_num(&sdb->sdb_strings); j++) { | |
247 sds = array_get(&sdb->sdb_strings, j); | |
248 if (keylen == sds->sds_len && hash == sds->sds_hash) { | |
249 str = array_get(&sd->sd_strings, sds->sds_index); | |
250 if (!memcmp(key, str, keylen)) { | |
251 /* found it */ | |
252 return sds; | |
253 } | |
254 } | |
255 } | |
256 | |
257 return NULL; | |
258 } | |
259 | |
260 | |
261 unsigned stringdict_findbyname(const struct stringdict *sd, const char *str) { | |
262 struct stringdict_string *sds; | |
263 sds = stringdict_find(sd, str, strlen(str)); | |
264 return sds ? sds->sds_index : (unsigned)-1; | |
265 } | |
266 | |
267 int stringdict_exists(const struct stringdict *sd, const char *str) { | |
268 struct stringdict_string *sds; | |
269 sds = stringdict_find(sd, str, strlen(str)); | |
270 return sds != NULL; | |
271 } | |
272 |