Mercurial > ~dholland > hg > ag > index.cgi
comparison anagram/agcore/keyword.cpp @ 0:13d2b8934445
Import AnaGram (near-)release tree into Mercurial.
author | David A. Holland |
---|---|
date | Sat, 22 Dec 2007 17:52:45 -0500 |
parents | |
children | 607e3be6bad8 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:13d2b8934445 |
---|---|
1 /* | |
2 * AnaGram, A System for Syntax Directed Programming | |
3 * Copyright 1993-2002 Parsifal Software. All Rights Reserved. | |
4 * See the file COPYING for license and usage terms. | |
5 * | |
6 * keyword.cpp | |
7 */ | |
8 | |
9 #include "arrays.h" | |
10 #include "bitmap.h" | |
11 #include "bpe3.h" | |
12 #include "csexp.h" | |
13 #include "dict.h" | |
14 #include "ftpar.h" | |
15 #include "keyword.h" | |
16 #include "myalloc.h" | |
17 #include "q1glbl.h" | |
18 #include "stacks.h" | |
19 #include "tsd.h" | |
20 #include "ut.h" | |
21 | |
22 //#define INCLUDE_LOGGING | |
23 #include "log.h" | |
24 | |
25 | |
26 static tsd *key_work_table; | |
27 static int n_ends = 0; | |
28 | |
29 | |
30 static void tokensTo(unsigned sn, AgStack<int> &tokenStack) { | |
31 LOGSECTION("tokensTo"); | |
32 AgBalancedTree<int> tree; | |
33 while (sn) { | |
34 tree.insert(sn); | |
35 state_number_map *sp = &map_state_number[sn]; | |
36 unsigned *list = lstptr(*sp, previous_states); | |
37 int n = sp->n_previous_states; | |
38 LOGV(sn) LCV(sp->n_previous_states); | |
39 while (n--) { | |
40 LOGV(n) LCV(*list) LCV(map_state_number[*list].char_token); | |
41 while (tree.includes(*list)) { | |
42 list++; | |
43 } | |
44 int k = map_state_number[*list].n_actions; | |
45 while (k--) { | |
46 if (lstptr(map_state_number[*list], p_actions)[k] != sn) { | |
47 continue; | |
48 } | |
49 if (lstptr(map_state_number[*list], a_actions)[k] != pe_go_to) { | |
50 continue; | |
51 } | |
52 break; | |
53 } | |
54 assert(k >= 0); | |
55 tokenStack.push(lstptr(map_state_number[*list], t_actions)[k]); | |
56 sn = *list; | |
57 LOGV(n) LCV(sn); | |
58 break; | |
59 } | |
60 assert(n >= 0); | |
61 } | |
62 } | |
63 | |
64 /* | |
65 * Each time it is necessary for the parser to get a new token, it | |
66 * begins by checking the key_word_index table indexed by the current | |
67 * state. If the entry is zero, there are no keywords to be identified. | |
68 * If the entry is non_zero, the entry identifies a location in the | |
69 * key_word identification table. The id_key_word function is then | |
70 * invoked to determine whether there is a keyword present. | |
71 * | |
72 * The id_key_word function is a table driven, finite state automaton. | |
73 * The actions are as follows: | |
74 * jump to next state | |
75 * The string in hand is not itself a key, but matches the | |
76 * beginning of one or more keys. | |
77 * set key and jump to next state | |
78 * The string in hand might be a key, but it also matches the | |
79 * beginning of at least one other key. Save the token number and read | |
80 * pointer for this key in case we do not get a complete match on | |
81 * the longer keys. | |
82 * accept key | |
83 * The string in hand is a key, and there are no others which match | |
84 * partially. | |
85 * end key | |
86 * The string in hand is the beginning of a key, there are no other | |
87 * matches, and it is only necessary to see if we get a match for the | |
88 * remainder of the key. | |
89 * no match key | |
90 * The string in hand does not match any key but there is a | |
91 * substring which matched. Go return it. | |
92 */ | |
93 | |
94 typedef enum { | |
95 accept_key, | |
96 set_key, | |
97 jmp_key, | |
98 end_key, | |
99 no_match_key, | |
100 cf_accept_key, | |
101 cf_set_key, | |
102 cf_end_key | |
103 } key_words; | |
104 | |
105 | |
106 static int build_key_table_state(char *sp[], int *tkn, unsigned ns) { | |
107 int n; | |
108 int nw = key_work_table->nt; | |
109 unsigned i = 0, j, k; | |
110 int ch, act, parm = 0, jmp = 0; | |
111 | |
112 check_tsd(key_work_table); | |
113 while (i < ns) { | |
114 ch = *sp[i]++; | |
115 parm = jmp = 0; | |
116 if (ch == 0) { | |
117 continue; | |
118 } | |
119 for (j = i+1, k = 0; j < ns && *sp[j] == ch;) { | |
120 sp[j++]++; | |
121 k++; | |
122 } | |
123 if (*sp[i] == 0) { | |
124 parm = tkn[i]; | |
125 if (k == 0) { | |
126 act = accept_key; | |
127 } | |
128 else { | |
129 act = set_key; | |
130 jmp = build_key_table_state(&sp[i+1], &tkn[i+1], k); | |
131 } | |
132 } | |
133 else if (k == 0) { | |
134 act = end_key; | |
135 parm = tkn[i]; | |
136 jmp = n_ends; | |
137 ass(sp[i]); | |
138 acs(0); | |
139 n_ends += strlen(sp[i]) + 1; | |
140 } | |
141 else { | |
142 act = jmp_key; | |
143 jmp = build_key_table_state(&sp[i], &tkn[i], k+1); | |
144 } | |
145 at(key_work_table, ch, act, parm, jmp); | |
146 i = j; | |
147 } | |
148 n = key_table->nt; | |
149 k = nw; | |
150 while (k < key_work_table->nt) { | |
151 xtxf(key_work_table, k, &ch, &act, &parm, &jmp); | |
152 at(key_table, ch, act, parm, jmp); | |
153 k++; | |
154 } | |
155 at(key_table, 255, no_match_key, 0, 0); | |
156 key_work_table->nt = nw; | |
157 return n; | |
158 } | |
159 | |
160 static int build_key_table(char *sp[], int tkn[], int ns) { | |
161 int i, j; | |
162 char *p; | |
163 int t; | |
164 LOGSECTION("build_key_table"); | |
165 | |
166 for (i = 0; i < ns; i++) { | |
167 for (j = i+1; j < ns; j++) { | |
168 if (strcmp(sp[j], sp[i]) < 0) { | |
169 p = sp[i]; | |
170 sp[i] = sp[j]; | |
171 sp[j] = p; | |
172 t = tkn[i]; | |
173 tkn[i] = tkn[j]; | |
174 tkn[j] = t; | |
175 } | |
176 } | |
177 } | |
178 return build_key_table_state(sp, tkn, ns); | |
179 } | |
180 | |
181 void build_key_tables(void) { | |
182 unsigned i; | |
183 unsigned nki = key_list_dict->nsx; | |
184 //int *ki; | |
185 extern char *key_ends; | |
186 extern unsigned n_key_ends; | |
187 extern unsigned nstates; | |
188 | |
189 LOGSECTION("build_key_tables"); | |
190 | |
191 if (Keyword::count() <= 1) { | |
192 return; | |
193 } | |
194 //ki = local_array(nki, int); | |
195 LocalArray<int> ki(nki); | |
196 memset(ki, 0, nki*sizeof(*ki)); | |
197 | |
198 key_table = init_tsd(4); | |
199 key_work_table = init_tsd(4); | |
200 at(key_table, 0, 0, 0, 0); | |
201 ics(); | |
202 n_ends = 0; | |
203 ki[0] = 0; | |
204 | |
205 for (i = 1; i < key_list_dict->nsx; i++) { | |
206 int *lb = dict_str(key_list_dict, i); | |
207 int ns = *lb++ - 1; | |
208 char **strings = ALLOCATE(ns, char *); | |
209 int *tokens = ALLOCATE(ns, int); | |
210 int j; | |
211 | |
212 for (j = 0; j < ns; j++) { | |
213 Keyword key = map_token_number[lb[j]].key; | |
214 strings[j] = key->string.pointer(); | |
215 tokens[j] = lb[j]; | |
216 } | |
217 ki[i] = build_key_table(strings, tokens, ns); | |
218 DEALLOCATE(strings); | |
219 DEALLOCATE(tokens); | |
220 } | |
221 delete_tsd(key_work_table); | |
222 n_key_ends = tis(); | |
223 key_ends = build_string(); | |
224 for (i = 0; i < nstates; i++) { | |
225 state_number_map *sp = &map_state_number[i]; | |
226 int k = sp->key_list; | |
227 | |
228 if (k == 0) { | |
229 continue; | |
230 } | |
231 sp->key_index = (unsigned short) ki[k]; | |
232 } | |
233 } | |
234 | |
235 void check_key_reserve(void) { | |
236 int n_keys = Keyword::count(); | |
237 int k = distinguishSets.size(); | |
238 | |
239 if (n_keys <= 1 || k == 0) { | |
240 return; | |
241 } | |
242 while (k--) { | |
243 int pt = distinguishSets[k]; | |
244 CharBitmap bitmap = map_parse_tree[pt].expression->bitmap(); | |
245 for (Each<Keyword> keyword; keyword.loopNotFinished(); keyword.getNext()) { | |
246 KeywordDescriptor &keywordDescriptor(keyword); | |
247 //unsigned char *str = (unsigned char *) keyword->string.pointer(); | |
248 unsigned char *str = (unsigned char *) keywordDescriptor.string.pointer(); | |
249 //if (keyword->reserve) continue; | |
250 if (keywordDescriptor.reserve) { | |
251 continue; | |
252 } | |
253 while (*str && bitmap.getBit(*str)) { | |
254 str++; | |
255 } | |
256 if (*str) { | |
257 continue; | |
258 } | |
259 //keyword->reserve = pt; | |
260 keywordDescriptor.reserve = pt; | |
261 } | |
262 } | |
263 } | |
264 | |
265 | |
266 void append_key(int kn) { | |
267 unsigned char *s = (unsigned char *) Keyword(kn)->string.pointer(); | |
268 while (*s) { | |
269 append_ascii_char(*s++); | |
270 } | |
271 } | |
272 | |
273 //int keyword_problem(unsigned sn,const unsigned char *p, int key){ | |
274 //int keyword_problem(unsigned sn,const unsigned char *p, Keyword key){ | |
275 //int keyword_problem(unsigned sn,const AgArray<int> input, Keyword key){ | |
276 int keyword_problem(unsigned sn, const AgArray<int> input, Keyword) { | |
277 LOGSECTION("keyword_problem"); | |
278 | |
279 AgStack<int> tokens; | |
280 | |
281 // Get a path to this state | |
282 tokensTo(sn, tokens); | |
283 | |
284 #ifdef INCLUDE_LOGGING | |
285 { | |
286 for (int i = 0; i < tokens.size(); i++) { | |
287 LOGV(tokens[i]); | |
288 } | |
289 } | |
290 #endif | |
291 | |
292 FtParser parser; | |
293 | |
294 // Advance to current state | |
295 while (tokens.size()) { | |
296 parser.parseToken(tokens.pop()); | |
297 } | |
298 | |
299 /* | |
300 LOGV(key); | |
301 // Disable keyword under test | |
302 parser.ignoreKeyword(key); | |
303 parser.parse((const char *) p); | |
304 traceCounts.discardData(); | |
305 if (parser.processState < FtParser::syntaxError) { | |
306 return 1; | |
307 } | |
308 return -1; | |
309 */ | |
310 | |
311 int n = input.size(); | |
312 int flag = 1; | |
313 for (int i = 0; flag == 1 && i < n; i++) { | |
314 parser.parseToken(input[i]); | |
315 if (parser.processState > FtParser::running) { | |
316 flag = -1; | |
317 } | |
318 } | |
319 traceCounts.discardData(); | |
320 return flag; | |
321 } | |
322 | |
323 Keyword::Keyword() | |
324 : KeyedObjectRegister<KeywordDescriptor>() | |
325 {} | |
326 | |
327 Keyword::Keyword(unsigned x) | |
328 : KeyedObjectRegister<KeywordDescriptor>(x) | |
329 {} | |
330 | |
331 Keyword::Keyword(const Keyword &t) | |
332 : KeyedObjectRegister<KeywordDescriptor>(t) | |
333 {} | |
334 | |
335 Keyword::Keyword(const char *x) | |
336 : KeyedObjectRegister<KeywordDescriptor>(KeywordDescriptor(x)) | |
337 {} | |
338 | |
339 void Keyword::reset() { | |
340 LOGSECTION("Keyword::reset"); | |
341 KeyedObjectRegister<KeywordDescriptor>::reset(); | |
342 Keyword(""); | |
343 AgString string = Keyword((unsigned) 0)->string; | |
344 LOGV(string.pointer()); | |
345 LOGV(descriptorList.size()) LCV(Keyword((unsigned) 0)->string); | |
346 LOGS("reset complete"); | |
347 } | |
348 | |
349 Keyword Keyword::sorted(int x) { | |
350 LOGSECTION("Keyword::sorted"); | |
351 LOGV(x); | |
352 Keyword keyword = tree[x]; | |
353 LOGV(keyword) LCV(keyword->string); | |
354 return keyword; | |
355 } | |
356 | |
357 Keyword Keyword::create() { | |
358 LOGSECTION("Keyword::create"); | |
359 tss(); | |
360 LOGV(string_base); | |
361 Keyword keyword(string_base); | |
362 rcs(); | |
363 LOGV(keyword); | |
364 return keyword; | |
365 } | |
366 | |
367 Keyword add_string_dict(const char *s, Keyword::Dict &d) { | |
368 return d.add_string(s); | |
369 } | |
370 | |
371 //Keyword::Map map_token_name; | |
372 | |
373 KeywordDescriptor &Keyword::Map::operator [] (unsigned x) { | |
374 LOGSECTION("Keyword::Map::operator []"); | |
375 LOGV(x) LCV(descriptorList.size()); | |
376 assert(x < Keyword::descriptorList.size()); | |
377 return Keyword::descriptorList[x]; | |
378 } | |
379 | |
380 Keyword Keyword::Dict::add_string(const char *s) { | |
381 return Keyword(s); | |
382 } | |
383 | |
384 KeywordDescriptor::KeywordDescriptor() | |
385 : reserve(0) | |
386 { | |
387 // Nothing to do | |
388 } | |
389 | |
390 KeywordDescriptor::KeywordDescriptor(const char *s) | |
391 : string(s), reserve(0) | |
392 { | |
393 // Nothing to do | |
394 } | |
395 | |
396 int KeywordDescriptor::operator < (const KeywordDescriptor &d) const { | |
397 return string < d.string; | |
398 } | |
399 | |
400 const int Keyword::first = 1; |