/*
 * AnaGram, A System for Syntax Directed Programming
 * Copyright 1993-1999 Parsifal Software. All Rights Reserved.
 * See the file COPYING for license and usage terms.
 *
 * lexeme.cpp - lexeme analysis
 */

#include "arrays.h"
#include "config.h"
#include "data.h"
#include "dict.h"
#include "keyword.h"
#include "lexeme.h"
#include "q1glbl.h"
#include "q5.h"
#include "rpk.h"
#include "rule.h"
#include "stacks.h"
#include "token.h"
#include "tree.h"
#include "tsd.h"

//#define INCLUDE_LOGGING
#include "log.h"


#define FIX3

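// disregardList holds the token numbers of the grammar's disregard
// (noise) tokens; disregard_token receives the number of the single
// noise token that build_noise_token() synthesizes from them.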
AgStack<int> disregardList;
unsigned disregard_token;


// Find and mark the lexical rules.
static void find_lexical_rules(void) {
  LOGSECTION("find_lexical_rules");
  unsigned ku;
  int k;
  iws();
  LOGS("disregard list tokens");
  // First, all the disregard tokens
  for (ku = disregardList.size(); ku--;) {
    //xws(disregard_list[ku]);
    xws(disregardList[ku]);
    //Token(disregardList[ku])->disregard = 1;
    LOGV(disregardList[ku]);
  }
  // Then all the lexemes
  LOGS("lexemes");
  for (ku = 0; ku++ < ntkns;) if (map_token_number[ku].lexeme) {
    xws(ku);
    LOGV(ku);
  }
  // rules produced by disregard tokens and lexemes are
  // lexical rules. Any rule produced by a token found
  // in a lexical rule is also a lexical rule.
  // This loop, in other words, implements a closure
  LOGS("lexical rules");
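  // The list primitives used here (iws/xws/tis/rws) appear to manage
  // a scratch list: xws() adds a token only if it is not already
  // present, and tis() returns the current length, so the loop below
  // keeps running as the closure appends new tokens and visits each
  // token exactly once.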
  for (k = 0; k < tis(); k++) {
    int tn = list_base[k];
    int *bnf = bnf_table->sb;
    int nbnf = bnf_table->nt;
    while (nbnf--) {
      int t = *bnf++, f = *bnf++, n;

      if (t != tn) {
        continue;
      }
      Rule rule(f);
      n = rule->length();
      rule->lexical = 1;
      LOGV(rule) LCV(rule->lexical);
      while (n--) {
        Token token = rule.token(n);
        if (token->non_terminal_flag) {
          xws(token);
          LOGV(token);
        }
      }
    }
  }
  rws();
}

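// Build a single noise token covering all the disregard tokens. With
// exactly one disregard token it is wrapped directly; with several, a
// one-element rule is stacked for each alternative and the results are
// combined. (vp_6, vp_form3, and vp_4 are virtual-production helpers
// defined elsewhere; the precise constructions they perform are not
// visible in this file.)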
static void build_noise_token(void) {
  LOGSECTION("build_noise_token");
  LOGV(disregardList.size());
  if (disregardList.size() == 1) {
    disregard_token = vp_6(disregardList[0]);
    Token token = disregardList[0];
    token->disregard = 1;
  }
  else {
    int n = disregardList.size();
    //int *lb = disregard_list;
    iws();
    int i;
    for (i = 0; i < n; i++) {
      ruleElementStack
        .push(AgStack<RuleElement>())
        .top()
        .push(RuleElement(disregardList[i],0));
      aws(vp_form3(0));
    }
    disregard_token = vp_4();
  }
  extern Token vpRepeatToken;
  Token disregard = Token(disregard_token);
  disregard->disregard = 1;
  vpRepeatToken->disregard = 1;
  LOGV(disregard);
  LOGV((int) vpRepeatToken);
}

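// Return nonzero if tn is one of the declared disregard tokens.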
static int in_disregard_list(int tn) {
  LOGSECTION("in_disregard_list");
  int n = disregardList.size();
  while (n--) {
    if (tn == disregardList[n]) {
      return 1;
    }
  }
  return 0;
}

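// Substitute token number nt for tn as the left-hand side of every
// rule that tn produces. bnf_table is stored as (token, rule) pairs,
// hence the stride of two; prim_tkn is updated whenever the rule's
// primary token was tn.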
static void subs_bnf(int tn, int nt) {
  int *p = bnf_table->sb;
  int n = bnf_table->nt;
  for (; n--; p += 2) {
    if (*p != tn) {
      continue;
    }
    *p = nt;
    Rule rule(p[1]);
    if ((int)rule->prim_tkn == tn) {
      rule->prim_tkn = nt;
    }
  }
}

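// Clone a token that is used in both lexical and non-lexical context.
// The clone (the "pure" token) takes over the original's productions,
// keyword binding, character-set or partition data, and parse tree
// entry. The original is then redefined by a single new rule,
//   token -> pureToken disregard_token
// so that in non-lexical context the token absorbs trailing noise,
// while lexical rules can be rewritten to use the pure clone directly.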
static int alias(Token token) {
  LOGSECTION("alias");

  Token pureToken = Token::create();
  LOGV(token) LCV(pureToken);
  map_token_number[pureToken] = map_token_number[token];
  LOGV(token) LCV(token->value_type) LCV(token->immediate_action);
  LOGV(pureToken) LCV(pureToken->value_type) LCV(pureToken->immediate_action);
  if (token->key) {
    Keyword keyword = token->key;
    keyword->token_number = pureToken;
  }
  pureToken->pure = 1;
  LOGV(token->non_terminal_flag) LCV(token->token_set_id);
  if (token->non_terminal_flag) {
    LOGS("Substituting") LCV(token) LCV(pureToken);
    subs_bnf(token,pureToken);
  }
  token->junky = 1;
  if (token->token_set_id) {
    LOGV(token->token_set_id);
    pureToken->token_set_id = token->token_set_id;
    token->token_set_id = 0;
    int n = part_dict->nsx;
    while (n--) if (map_part_number[n].token_number == (unsigned) token) {
      map_part_number[n].token_number = pureToken;
      LOGV(n);
      break;
    }
    for (Each<Rule> rule; rule.loopNotFinished(); rule.getNext()) {
      if ((int) rule->prim_tkn == token) {
        rule->prim_tkn = pureToken;
        LOGV(rule);
      }
    }
    for (unsigned i = 0; i < n_chars; i++) {
      if (map_char_number[i].token_number == (unsigned) token) {
        map_char_number[i].token_number = pureToken;
      }
    }
  }
  else if (token->part_number) {
    LOGV(token->part_number);
    pureToken->part_number = token->part_number;
    map_part_number[token->part_number].token_number = pureToken;
    token->part_number = 0;
    for (Each<Rule> rule; rule.loopNotFinished(); rule.getNext()) {
      if ((int) rule->prim_tkn == token) {
        rule->prim_tkn = pureToken;
        LOGV(rule);
      }
    }
    for (unsigned i = 0; i < n_chars; i++) {
      if (map_char_number[i].token_number == (unsigned) token) {
        map_char_number[i].token_number = pureToken;
      }
    }
  }
  Rule rule = makeRule(pureToken, disregard_token);
  at(bnf_table, (int)token, (int)rule);
  token->non_terminal_flag = 1;
  rule->prim_tkn = token;
  ParseTree parseTree = token->parse_tree;
  if (parseTree) {
    parseTree->token_number = pureToken;
  }
  LOGV((int) token) LCV(token->value_type);
  LOGV((int) pureToken) LCV(pureToken->value_type);
  return pureToken;
}

#ifdef NOT_FIX3

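// Collect the numbers of all rules that the given token produces,
// i.e. every rule paired with this token in bnf_table.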
static AgStack<int> findRules(Token token) {
  AgStack<int> rules;
  int *p = bnf_table->sb;
  int n = bnf_table->nt;
  for (; n--; p += 2) {
    if (*p != (int) token) continue;
    rules.push(p[1]);
  }
  return rules;
}
#endif

/*
Scan rules and mark token usage as lexical, non-lexical, or both.
Then, for each token that has both lexical and non-lexical usage,
make a clone.
*/

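// Overview of the passes below:
//   1. find_lexical_rules() closes over the rules reachable from
//      disregard tokens and lexemes and flags them lexical.
//   2. build_noise_token() synthesizes disregard_token.
//   3. Every token appearing in a lexical rule is flagged lexical.
//   4. Scanning the non-lexical rules, each eligible token is cloned
//      via alias(), recording the pure clone in newTokenNumber.
//   5. Finally, tokens inside lexical rules are replaced by their
//      pure clones.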
void set_lexemes(void) {
  int nf = nforms;
  nInputRules = nforms;
  LOGSECTION("set_lexemes");
  LOGV(nforms);

  disregard_token = 0;
  if (disregardList.size() == 0) return;
  LocalArray<int> newTokenNumber(ntkns+1);
#ifdef NOT_FIX3
  int maxTokenNumber = ntkns;
#endif
  memset(newTokenNumber, 0, (ntkns+1)*sizeof(*newTokenNumber));
  Each<Rule> rule;
#ifdef INCLUDE_LOGGING
  for (rule.restart(); (int) rule <= nf; rule.getNext()) {
    LOGV(rule) LCV(rule->lexical);
  }
#endif
  find_lexical_rules();
#ifdef INCLUDE_LOGGING
  for (rule.restart(); (int) rule <= nf; rule.getNext()) {
    LOGV(rule) LCV(rule->lexical);
  }
#endif
  build_noise_token();
//#ifdef FIX3
  // mark lexical tokens
  for (rule.restart(); (int) rule <= nf; rule.getNext()) {
    int n = rule->length();
    LOGV(rule) LCV(rule->lexical);
    if (n == 0 || !rule->lexical) {
      continue;
    }
    while (n--) {
      Token token = rule.token(n);
      token->lexical = 1;
    }
  }
//#endif

  // Scan rules which are _not_ lexical
  for (rule.restart(); (int) rule <= nf; rule.getNext()) {
    int n = rule->length();
    LOGV(rule) LCV(rule->lexical);
    if (n == 0 || rule->lexical) {
      continue;
    }
    LOGSECTION("Scanning non-lexical rule");
    LOGV(rule);
    while (n--) {
      Token token = rule.token(n);
      LOGV(token) LCV(token->token_set_id) LCV(token->non_terminal_flag);
      LOGV(token->lexeme) LCV(token->lexical) LCV(in_disregard_list(token));
      LOGV(token->disregard);
      if (newTokenNumber[token] ||
          in_disregard_list(token) ||
          (token->non_terminal_flag && token->token_set_id) ||
          token->disregard ||
          (int) token == eof_token ||
          (int) token == error_token) {
        continue;
      }
      if (token->non_terminal_flag && !token->lexeme) {
        continue;
      }
      // newTokenNumber is the pure token
      newTokenNumber[token] = alias(token);
      LOGV(token) LCV(newTokenNumber[token]);
    }
  }
#ifdef FIX3
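  // Second scan, enabled by FIX3: identical to the loop above except
  // that the final skip test checks token->lexical rather than
  // token->lexeme, so it also aliases nonterminals that were only
  // flagged lexical in the marking pass.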
  for (rule.restart(); (int) rule <= nf; rule.getNext()) {
    int n = rule->length();
    LOGV(rule) LCV(rule->lexical);
    if (n == 0 || rule->lexical) {
      continue;
    }
    LOGSECTION("Scanning non-lexical rule");
    LOGV(rule);
    while (n--) {
      Token token = rule.token(n);
      LOGV(token) LCV(token->token_set_id) LCV(token->non_terminal_flag);
      LOGV(token->lexeme) LCV(token->lexical) LCV(in_disregard_list(token));
      LOGV(token->disregard);
      if (newTokenNumber[token] ||
          in_disregard_list(token) ||
          (token->non_terminal_flag && token->token_set_id) ||
          token->disregard ||
          (int) token == eof_token ||
          (int) token == error_token) {
        continue;
      }
      if (token->non_terminal_flag && !token->lexical) {
        continue;
      }
      // newTokenNumber is the pure token
      newTokenNumber[token] = alias(token);
      LOGV(token) LCV(newTokenNumber[token]);
    }
  }
#endif
#ifdef NOT_FIX3
  for (rule.restart(); (int) rule <= nf; rule.getNext()) {
    int n = rule->length();
    if (n == 0 || rule->lexical) {
      continue;
    }
    while (n-- > 0) {
      Token token = rule.token(n);
      if ((int) token >= maxTokenNumber || newTokenNumber[token]) continue;
      if (newTokenNumber[token] ||
          in_disregard_list(token) ||
          (int) token == eof_token ||
          (token->non_terminal_flag && token->token_set_id) ||
          (int) token == error_token) {
        continue;
      }
      if (token->non_terminal_flag && !token->lexical) {
        continue;
      }
      AgStack<int> ruleList = findRules(token);
      Token newToken = Token::create();
      map_token_number[newToken] = map_token_number[token];
      subs_bnf(token, newToken);
      newTokenNumber[token] = newToken;
      newToken->pure = 1;
      int i;
      for (i = 0; i < ruleList.size(); i++) {
        Rule oldRule = ruleList[i];
        Rule newRule = Rule::create();
        map_form_number[newRule] = map_form_number[oldRule];
        int k = oldRule->elementList.size();
        newRule->elementList = AgArray<RuleElement>(k);
        while (k--) {
          newRule->elementList[k] = oldRule->elementList[k];
        }
        newRule->lexical = 0;
        at(bnf_table,(int)token,(int)newRule);
        token->non_terminal_flag = 1;
        newRule->prim_tkn = token;
      }
    }
  }
#endif
  LOGS("alias loop complete");
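  // Finally, rewrite the lexical rules themselves to refer to the
  // pure clones directly, so that noise is not absorbed inside a
  // lexical construct.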
  for (rule.restart(); rule.loopNotFinished(); rule.getNext()) {
    int n = rule->length();
    LOGV(rule) LCV(rule->lexical);
    if (n == 0) continue;
    if (!rule->lexical) continue;
    LOGSECTION("Substitution loop");
    while (n-- > 0) {
      Token token = rule.token(n);
      if (newTokenNumber[token] == 0) {
        continue;
      }
      rule.token(n) = newTokenNumber[token];
      LOGV(token) LCV(newTokenNumber[token]);
    }
  }
  LOGS("Rule loop complete");
  nforms_base = nforms;
}