comparison anagram/agcore/keyword.cpp @ 0:13d2b8934445

Import AnaGram (near-)release tree into Mercurial.
author David A. Holland
date Sat, 22 Dec 2007 17:52:45 -0500
parents
children 607e3be6bad8
comparison
equal deleted inserted replaced
-1:000000000000 0:13d2b8934445
1 /*
2 * AnaGram, A System for Syntax Directed Programming
3 * Copyright 1993-2002 Parsifal Software. All Rights Reserved.
4 * See the file COPYING for license and usage terms.
5 *
6 * keyword.cpp
7 */
8
9 #include "arrays.h"
10 #include "bitmap.h"
11 #include "bpe3.h"
12 #include "csexp.h"
13 #include "dict.h"
14 #include "ftpar.h"
15 #include "keyword.h"
16 #include "myalloc.h"
17 #include "q1glbl.h"
18 #include "stacks.h"
19 #include "tsd.h"
20 #include "ut.h"
21
22 //#define INCLUDE_LOGGING
23 #include "log.h"
24
25
26 static tsd *key_work_table;
27 static int n_ends = 0;
28
29
30 static void tokensTo(unsigned sn, AgStack<int> &tokenStack) {
31 LOGSECTION("tokensTo");
32 AgBalancedTree<int> tree;
33 while (sn) {
34 tree.insert(sn);
35 state_number_map *sp = &map_state_number[sn];
36 unsigned *list = lstptr(*sp, previous_states);
37 int n = sp->n_previous_states;
38 LOGV(sn) LCV(sp->n_previous_states);
39 while (n--) {
40 LOGV(n) LCV(*list) LCV(map_state_number[*list].char_token);
41 while (tree.includes(*list)) {
42 list++;
43 }
44 int k = map_state_number[*list].n_actions;
45 while (k--) {
46 if (lstptr(map_state_number[*list], p_actions)[k] != sn) {
47 continue;
48 }
49 if (lstptr(map_state_number[*list], a_actions)[k] != pe_go_to) {
50 continue;
51 }
52 break;
53 }
54 assert(k >= 0);
55 tokenStack.push(lstptr(map_state_number[*list], t_actions)[k]);
56 sn = *list;
57 LOGV(n) LCV(sn);
58 break;
59 }
60 assert(n >= 0);
61 }
62 }
63
64 /*
65 * Each time it is necessary for the parser to get a new token, it
66 * begins by checking the key_word_index table indexed by the current
67 * state. If the entry is zero, there are no keywords to be identified.
68 * If the entry is non_zero, the entry identifies a location in the
69 * key_word identification table. The id_key_word function is then
70 * invoked to determine whether there is a keyword present.
71 *
72 * The id_key_word function is a table driven, finite state automaton.
73 * The actions are as follows:
74 * jump to next state
75 * The string in hand is not itself a key, but matches the
76 * beginning of one or more keys.
77 * set key and jump to next state
78 * The string in hand might be a key, but it also matches the
79 * beginning of at least one other key. Save the token number and read
80 * pointer for this key in case we do not get a complete match on
81 * the longer keys.
82 * accept key
83 * The string in hand is a key, and there are no others which match
84 * partially.
85 * end key
86 * The string in hand is the beginning of a key, there are no other
87 * matches, and it is only necessary to see if we get a match for the
88 * remainder of the key.
89 * no match key
90 * The string in hand does not match any key but there is a
91 * substring which matched. Go return it.
92 */
93
94 typedef enum {
95 accept_key,
96 set_key,
97 jmp_key,
98 end_key,
99 no_match_key,
100 cf_accept_key,
101 cf_set_key,
102 cf_end_key
103 } key_words;
104
105
106 static int build_key_table_state(char *sp[], int *tkn, unsigned ns) {
107 int n;
108 int nw = key_work_table->nt;
109 unsigned i = 0, j, k;
110 int ch, act, parm = 0, jmp = 0;
111
112 check_tsd(key_work_table);
113 while (i < ns) {
114 ch = *sp[i]++;
115 parm = jmp = 0;
116 if (ch == 0) {
117 continue;
118 }
119 for (j = i+1, k = 0; j < ns && *sp[j] == ch;) {
120 sp[j++]++;
121 k++;
122 }
123 if (*sp[i] == 0) {
124 parm = tkn[i];
125 if (k == 0) {
126 act = accept_key;
127 }
128 else {
129 act = set_key;
130 jmp = build_key_table_state(&sp[i+1], &tkn[i+1], k);
131 }
132 }
133 else if (k == 0) {
134 act = end_key;
135 parm = tkn[i];
136 jmp = n_ends;
137 ass(sp[i]);
138 acs(0);
139 n_ends += strlen(sp[i]) + 1;
140 }
141 else {
142 act = jmp_key;
143 jmp = build_key_table_state(&sp[i], &tkn[i], k+1);
144 }
145 at(key_work_table, ch, act, parm, jmp);
146 i = j;
147 }
148 n = key_table->nt;
149 k = nw;
150 while (k < key_work_table->nt) {
151 xtxf(key_work_table, k, &ch, &act, &parm, &jmp);
152 at(key_table, ch, act, parm, jmp);
153 k++;
154 }
155 at(key_table, 255, no_match_key, 0, 0);
156 key_work_table->nt = nw;
157 return n;
158 }
159
160 static int build_key_table(char *sp[], int tkn[], int ns) {
161 int i, j;
162 char *p;
163 int t;
164 LOGSECTION("build_key_table");
165
166 for (i = 0; i < ns; i++) {
167 for (j = i+1; j < ns; j++) {
168 if (strcmp(sp[j], sp[i]) < 0) {
169 p = sp[i];
170 sp[i] = sp[j];
171 sp[j] = p;
172 t = tkn[i];
173 tkn[i] = tkn[j];
174 tkn[j] = t;
175 }
176 }
177 }
178 return build_key_table_state(sp, tkn, ns);
179 }
180
181 void build_key_tables(void) {
182 unsigned i;
183 unsigned nki = key_list_dict->nsx;
184 //int *ki;
185 extern char *key_ends;
186 extern unsigned n_key_ends;
187 extern unsigned nstates;
188
189 LOGSECTION("build_key_tables");
190
191 if (Keyword::count() <= 1) {
192 return;
193 }
194 //ki = local_array(nki, int);
195 LocalArray<int> ki(nki);
196 memset(ki, 0, nki*sizeof(*ki));
197
198 key_table = init_tsd(4);
199 key_work_table = init_tsd(4);
200 at(key_table, 0, 0, 0, 0);
201 ics();
202 n_ends = 0;
203 ki[0] = 0;
204
205 for (i = 1; i < key_list_dict->nsx; i++) {
206 int *lb = dict_str(key_list_dict, i);
207 int ns = *lb++ - 1;
208 char **strings = ALLOCATE(ns, char *);
209 int *tokens = ALLOCATE(ns, int);
210 int j;
211
212 for (j = 0; j < ns; j++) {
213 Keyword key = map_token_number[lb[j]].key;
214 strings[j] = key->string.pointer();
215 tokens[j] = lb[j];
216 }
217 ki[i] = build_key_table(strings, tokens, ns);
218 DEALLOCATE(strings);
219 DEALLOCATE(tokens);
220 }
221 delete_tsd(key_work_table);
222 n_key_ends = tis();
223 key_ends = build_string();
224 for (i = 0; i < nstates; i++) {
225 state_number_map *sp = &map_state_number[i];
226 int k = sp->key_list;
227
228 if (k == 0) {
229 continue;
230 }
231 sp->key_index = (unsigned short) ki[k];
232 }
233 }
234
235 void check_key_reserve(void) {
236 int n_keys = Keyword::count();
237 int k = distinguishSets.size();
238
239 if (n_keys <= 1 || k == 0) {
240 return;
241 }
242 while (k--) {
243 int pt = distinguishSets[k];
244 CharBitmap bitmap = map_parse_tree[pt].expression->bitmap();
245 for (Each<Keyword> keyword; keyword.loopNotFinished(); keyword.getNext()) {
246 KeywordDescriptor &keywordDescriptor(keyword);
247 //unsigned char *str = (unsigned char *) keyword->string.pointer();
248 unsigned char *str = (unsigned char *) keywordDescriptor.string.pointer();
249 //if (keyword->reserve) continue;
250 if (keywordDescriptor.reserve) {
251 continue;
252 }
253 while (*str && bitmap.getBit(*str)) {
254 str++;
255 }
256 if (*str) {
257 continue;
258 }
259 //keyword->reserve = pt;
260 keywordDescriptor.reserve = pt;
261 }
262 }
263 }
264
265
266 void append_key(int kn) {
267 unsigned char *s = (unsigned char *) Keyword(kn)->string.pointer();
268 while (*s) {
269 append_ascii_char(*s++);
270 }
271 }
272
273 //int keyword_problem(unsigned sn,const unsigned char *p, int key){
274 //int keyword_problem(unsigned sn,const unsigned char *p, Keyword key){
275 //int keyword_problem(unsigned sn,const AgArray<int> input, Keyword key){
276 int keyword_problem(unsigned sn, const AgArray<int> input, Keyword) {
277 LOGSECTION("keyword_problem");
278
279 AgStack<int> tokens;
280
281 // Get a path to this state
282 tokensTo(sn, tokens);
283
284 #ifdef INCLUDE_LOGGING
285 {
286 for (int i = 0; i < tokens.size(); i++) {
287 LOGV(tokens[i]);
288 }
289 }
290 #endif
291
292 FtParser parser;
293
294 // Advance to current state
295 while (tokens.size()) {
296 parser.parseToken(tokens.pop());
297 }
298
299 /*
300 LOGV(key);
301 // Disable keyword under test
302 parser.ignoreKeyword(key);
303 parser.parse((const char *) p);
304 traceCounts.discardData();
305 if (parser.processState < FtParser::syntaxError) {
306 return 1;
307 }
308 return -1;
309 */
310
311 int n = input.size();
312 int flag = 1;
313 for (int i = 0; flag == 1 && i < n; i++) {
314 parser.parseToken(input[i]);
315 if (parser.processState > FtParser::running) {
316 flag = -1;
317 }
318 }
319 traceCounts.discardData();
320 return flag;
321 }
322
323 Keyword::Keyword()
324 : KeyedObjectRegister<KeywordDescriptor>()
325 {}
326
327 Keyword::Keyword(unsigned x)
328 : KeyedObjectRegister<KeywordDescriptor>(x)
329 {}
330
331 Keyword::Keyword(const Keyword &t)
332 : KeyedObjectRegister<KeywordDescriptor>(t)
333 {}
334
335 Keyword::Keyword(const char *x)
336 : KeyedObjectRegister<KeywordDescriptor>(KeywordDescriptor(x))
337 {}
338
339 void Keyword::reset() {
340 LOGSECTION("Keyword::reset");
341 KeyedObjectRegister<KeywordDescriptor>::reset();
342 Keyword("");
343 AgString string = Keyword((unsigned) 0)->string;
344 LOGV(string.pointer());
345 LOGV(descriptorList.size()) LCV(Keyword((unsigned) 0)->string);
346 LOGS("reset complete");
347 }
348
349 Keyword Keyword::sorted(int x) {
350 LOGSECTION("Keyword::sorted");
351 LOGV(x);
352 Keyword keyword = tree[x];
353 LOGV(keyword) LCV(keyword->string);
354 return keyword;
355 }
356
357 Keyword Keyword::create() {
358 LOGSECTION("Keyword::create");
359 tss();
360 LOGV(string_base);
361 Keyword keyword(string_base);
362 rcs();
363 LOGV(keyword);
364 return keyword;
365 }
366
367 Keyword add_string_dict(const char *s, Keyword::Dict &d) {
368 return d.add_string(s);
369 }
370
371 //Keyword::Map map_token_name;
372
373 KeywordDescriptor &Keyword::Map::operator [] (unsigned x) {
374 LOGSECTION("Keyword::Map::operator []");
375 LOGV(x) LCV(descriptorList.size());
376 assert(x < Keyword::descriptorList.size());
377 return Keyword::descriptorList[x];
378 }
379
380 Keyword Keyword::Dict::add_string(const char *s) {
381 return Keyword(s);
382 }
383
384 KeywordDescriptor::KeywordDescriptor()
385 : reserve(0)
386 {
387 // Nothing to do
388 }
389
390 KeywordDescriptor::KeywordDescriptor(const char *s)
391 : string(s), reserve(0)
392 {
393 // Nothing to do
394 }
395
396 int KeywordDescriptor::operator < (const KeywordDescriptor &d) const {
397 return string < d.string;
398 }
399
400 const int Keyword::first = 1;