comparison anagram/agcore/lexeme.cpp @ 0:13d2b8934445

Import AnaGram (near-)release tree into Mercurial.
author David A. Holland
date Sat, 22 Dec 2007 17:52:45 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:13d2b8934445
1 /*
2 * AnaGram, A System for Syntax Directed Programming
3 * Copyright 1993-1999 Parsifal Software. All Rights Reserved.
4 * See the file COPYING for license and usage terms.
5 *
6 * lexeme.cpp - lexeme analysis
7 */
8
9 #include "arrays.h"
10 #include "config.h"
11 #include "data.h"
12 #include "dict.h"
13 #include "keyword.h"
14 #include "lexeme.h"
15 #include "q1glbl.h"
16 #include "q5.h"
17 #include "rpk.h"
18 #include "rule.h"
19 #include "stacks.h"
20 #include "token.h"
21 #include "tree.h"
22 #include "tsd.h"
23
24 //#define INCLUDE_LOGGING
25 #include "log.h"
26
27
28 #define FIX3
29
// Token numbers of the grammar's "disregard" tokens (ignorable input
// such as whitespace/comments).  Populated elsewhere — presumably from
// the grammar's disregard declarations; consumed by the lexical
// analysis passes in this file.
AgStack<int> disregardList;
// Token number of the synthesized "noise" token built by
// build_noise_token(); set_lexemes() resets it to 0 and leaves it 0
// when there are no disregard tokens.
unsigned disregard_token;
32
33
// Find and mark the lexical rules.
//
// Seeds a global worklist (iws()/xws()/rws() manage it; list_base/tis()
// expose its contents — presumably "init/exclusive-add/release working
// set" as used elsewhere in AnaGram; confirm against stacks.h) with
// every disregard token and every token flagged as a lexeme, then runs
// a closure: each rule produced by a worklist token is marked lexical,
// and every nonterminal appearing in such a rule is added to the
// worklist in turn.  The closure works because the loop bound tis() is
// re-evaluated every iteration, so entries appended by xws() inside the
// loop are themselves processed.
static void find_lexical_rules(void) {
  LOGSECTION("find_lexical_rules");
  unsigned ku;
  int k;
  iws();
  LOGS("disregard list tokens");
  // First, all the disregard tokens
  for (ku = disregardList.size(); ku--;) {
    //xws(disregard_list[ku]);
    xws(disregardList[ku]);
    //Token(disregardList[ku])->disregard = 1;
    LOGV(disregardList[ku]);
  }
  // Then all the lexemes
  LOGS("lexemes");
  // NOTE: token numbers start at 1; ku++ < ntkns visits 1..ntkns.
  for (ku = 0; ku++ < ntkns;) if (map_token_number[ku].lexeme) {
    xws(ku);
    LOGV(ku);
  }
  // rules produced by disregard tokens and lexemes are
  // lexical rules. Any rule produced by a token found
  // in a lexical rule is also a lexical rule.
  // This loop, in other words, implements a closure
  LOGS("lexical rules");
  for (k = 0; k < tis(); k++) {
    int tn = list_base[k];
    // bnf_table holds (token, rule) pairs: token produces rule.
    int *bnf = bnf_table->sb;
    int nbnf = bnf_table->nt;
    while (nbnf--) {
      int t = *bnf++, f = *bnf++, n;

      if (t != tn) {
        continue;
      }
      Rule rule(f);
      n = rule->length();
      rule->lexical = 1;
      LOGV(rule) LCV(rule->lexical);
      // Queue every nonterminal on the rule's right-hand side so its
      // rules are marked lexical too.
      while (n--) {
        Token token = rule.token(n);
        if (token->non_terminal_flag) {
          xws(token);
          LOGV(token);
        }
      }
    }
  }
  rws();
}
84
85 static void build_noise_token(void) {
86 LOGSECTION("build_noise_token");
87 LOGV(disregardList.size());
88 if (disregardList.size() == 1) {
89 disregard_token = vp_6(disregardList[0]);
90 Token token = disregardList[0];
91 token->disregard = 1;
92 }
93 else {
94 int n = disregardList.size();;
95 //int *lb = disregard_list;
96 iws();
97 int i;
98 for (i = 0; i < n; i++) {
99 ruleElementStack
100 .push(AgStack<RuleElement>())
101 .top()
102 .push(RuleElement(disregardList[i],0));
103 aws(vp_form3(0));
104 }
105 disregard_token = vp_4();
106 }
107 extern Token vpRepeatToken;
108 Token disregard = Token(disregard_token);
109 disregard->disregard = 1;
110 vpRepeatToken->disregard = 1;
111 LOGV(disregard);
112 LOGV((int) vpRepeatToken);
113 }
114
115 static int in_disregard_list(int tn) {
116 LOGSECTION("in_disregard_list");
117 int n = disregardList.size();
118 while (n--) {
119 if (tn == disregardList[n]) {
120 return 1;
121 }
122 }
123 return 0;
124 }
125
126 static void subs_bnf(int tn, int nt) {
127 int *p = bnf_table->sb;
128 int n = bnf_table->nt;
129 for (; n--; p += 2) {
130 if (*p != tn) {
131 continue;
132 }
133 *p = nt;
134 Rule rule(p[1]);
135 if ((int)rule->prim_tkn == tn) {
136 rule->prim_tkn = nt;
137 }
138 }
139 }
140
// Create a "pure" alias of token and re-point the grammar's metadata
// at it, so the original token can act as a wrapper that also absorbs
// trailing disregard (noise) input.
//
// Steps visible here:
//  - clone the token's map entry into a fresh token, mark it pure;
//  - redirect the token's keyword entry (if any) to the pure token;
//  - if the token is a nonterminal, substitute the pure token for it
//    throughout the bnf table;
//  - move token-set or partition bookkeeping (part_dict,
//    map_part_number, map_char_number, rule prim_tkn fields) from the
//    original to the pure token;
//  - finally give the original token the single rule
//    "pureToken disregard_token" (makeRule + at(bnf_table, ...)), so
//    parsing the original = parse the pure token, then skip noise;
//  - re-point the token's parse tree (if any) at the pure token.
//
// Returns the new (pure) token number.
static int alias(Token token) {
  LOGSECTION("alias");

  Token pureToken = Token::create();
  LOGV(token) LCV(pureToken);
  // Copy all token attributes to the alias.
  map_token_number[pureToken] = map_token_number[token];
  LOGV(token) LCV(token->value_type) LCV(token->immediate_action);
  LOGV(pureToken) LCV(pureToken->value_type) LCV(token->immediate_action);
  if (token->key) {
    Keyword keyword =token->key;
    keyword->token_number = pureToken;
  }
  pureToken->pure = 1;
  LOGV(token->non_terminal_flag) LCV(token->token_set_id);
  if (token->non_terminal_flag) {
    LOGS("Substituting") LCV(token) LCV(pureToken);
    subs_bnf(token,pureToken);
  }
  token->junky = 1;
  if (token->token_set_id) {
    // Token is defined by a token set: transfer the set and its
    // partition/char-map references to the pure token.
    LOGV(token->token_set_id);
    pureToken->token_set_id = token->token_set_id;
    token->token_set_id = 0;
    int n = part_dict->nsx;
    // Only the first matching partition entry is re-pointed (break).
    while (n--) if (map_part_number[n].token_number == (unsigned) token) {
      map_part_number[n].token_number = pureToken;
      LOGV(n);
      break;
    }
    for (Each<Rule> rule; rule.loopNotFinished(); rule.getNext()) {
      if ((int) rule->prim_tkn == token) {
        rule->prim_tkn = pureToken;
        LOGV(rule);
      }
    }
    for (unsigned i = 0; i < n_chars; i++) {
      if (map_char_number[i].token_number == (unsigned) token) {
        map_char_number[i].token_number = pureToken;
      }
    }
  }
  else if (token->part_number) {
    // Token is defined by a character partition: same transfer, keyed
    // by part_number instead of token_set_id.
    LOGV(token->part_number);
    pureToken->part_number = token->part_number;
    map_part_number[token->part_number].token_number = pureToken;
    token->part_number = 0;
    for (Each<Rule> rule; rule.loopNotFinished(); rule.getNext()) {
      if ((int) rule->prim_tkn == token) {
        rule->prim_tkn = pureToken;
        LOGV(rule);
      }
    }
    for (unsigned i = 0; i < n_chars; i++) {
      if (map_char_number[i].token_number == (unsigned) token) {
        map_char_number[i].token_number = pureToken;
      }
    }
  }
  // The original token now derives: pureToken followed by the noise
  // token, making it a nonterminal wrapper around the pure alias.
  Rule rule = makeRule(pureToken, disregard_token);
  at(bnf_table, (int)token, (int)rule);
  token->non_terminal_flag = 1;
  rule->prim_tkn = token;
  ParseTree parseTree = token->parse_tree;
  if (parseTree) {
    parseTree->token_number = pureToken;
  }
  LOGV((int) token) LCV(token->value_type);
  LOGV((int) pureToken) LCV(pureToken->value_type);
  return pureToken;
}
211
212 #ifdef NOT_FIX3
213
214 static AgStack<int> findRules(Token token) {
215 AgStack<int> rules;
216 int *p = bnf_table->sb;
217 int n = bnf_table->nt;
218 for (; n--; p += 2) {
219 if (*p != (int) token) continue;
220 rules.push(p[1]);
221 }
222 return rules;
223 }
224 #endif
225
/*
 * Scan rules and mark token usage as lexical, non-lexical, or both.
 * Then, for each token that has both lexical and non-lexical usage,
 * make a clone.
 */
231
// Driver for lexeme analysis.  Marks lexical rules and tokens, builds
// the noise (disregard) token, aliases tokens that are used both
// lexically and non-lexically, and finally substitutes the pure
// aliases into the lexical rules.  No-op when there are no disregard
// tokens.  Side effects only: updates global token/rule tables,
// nInputRules and nforms_base.
void set_lexemes(void) {
  int nf = nforms;      // only the original input rules are scanned;
                        // alias() may append more below nf stays fixed
  nInputRules = nforms;
  LOGSECTION("set_lexemes");
  LOGV(nforms);

  disregard_token = 0;
  if (disregardList.size() == 0) return;
  // newTokenNumber[t] = pure alias of token t, or 0 if none (yet).
  LocalArray<int> newTokenNumber(ntkns+1);
#ifdef NOT_FIX3
  int maxTokenNumber = ntkns;
#endif
  memset(newTokenNumber, 0, (ntkns+1)*sizeof(*newTokenNumber));
  Each<Rule> rule;
#ifdef INCLUDE_LOGGING
  for (rule.restart(); (int) rule <= nf; rule.getNext()) {
    LOGV(rule) LCV(rule->lexical);
  }
#endif
  find_lexical_rules();
#ifdef INCLUDE_LOGGING
  for (rule.restart(); (int) rule <= nf; rule.getNext()) {
    LOGV(rule) LCV(rule->lexical);
  }
#endif
  build_noise_token();
//#ifdef FIX3
  // mark lexical tokens: every token appearing in a lexical rule.
  for (rule.restart(); (int) rule <= nf; rule.getNext()) {
    int n = rule->length();
    LOGV(rule) LCV(rule->lexical);
    if (n == 0 || !rule->lexical) {
      continue;
    }
    while (n--) {
      Token token = rule.token(n);
      token->lexical = 1;
    }
  }
//#endif

  // Scan rules which are _not_ lexical: any lexeme token that also
  // appears here has both usages and gets a pure alias via alias().
  for (rule.restart(); (int) rule <= nf; rule.getNext()) {
    int n = rule->length();
    LOGV(rule) LCV(rule->lexical);
    if (n == 0 || rule->lexical) {
      continue;
    }
    LOGSECTION("Scanning non-lexical rule");
    LOGV(rule);
    while (n--) {
      Token token = rule.token(n);
      LOGV(token) LCV(token->token_set_id) LCV(token->non_terminal_flag);
      LOGV(token->lexeme) LCV(token->lexical) LCV(in_disregard_list(token));
      LOGV(token->disregard);
      // Skip tokens already aliased, disregard tokens, set-defined
      // nonterminals, and the special eof/error tokens.
      if (newTokenNumber[token] ||
          in_disregard_list(token) ||
          (token->non_terminal_flag && token->token_set_id) ||
          token->disregard ||
          (int) token == eof_token ||
          (int) token == error_token) {
        continue;
      }
      // Nonterminals are aliased only if explicitly flagged as lexemes.
      if (token->non_terminal_flag && !token->lexeme) {
        continue;
      }
      // newTokenNumber is the pure token
      newTokenNumber[token] = alias(token);
      LOGV(token) LCV(newTokenNumber[token]);
    }
  }
#ifdef FIX3
  // Second pass (FIX3): same scan, but the nonterminal filter keys on
  // the 'lexical' mark instead of the 'lexeme' flag, catching tokens
  // marked lexical by the closure above.
  for (rule.restart(); (int) rule <= nf; rule.getNext()) {
    int n = rule->length();
    LOGV(rule) LCV(rule->lexical);
    if (n == 0 || rule->lexical) {
      continue;
    }
    LOGSECTION("Scanning non-lexical rule");
    LOGV(rule);
    while (n--) {
      Token token = rule.token(n);
      LOGV(token) LCV(token->token_set_id) LCV(token->non_terminal_flag);
      LOGV(token->lexeme) LCV(token->lexical) LCV(in_disregard_list(token));
      LOGV(token->disregard);
      if (newTokenNumber[token] ||
          in_disregard_list(token) ||
          (token->non_terminal_flag && token->token_set_id) ||
          token->disregard ||
          (int) token == eof_token ||
          (int) token == error_token) {
        continue;
      }
      if (token->non_terminal_flag && !token->lexical) {
        continue;
      }
      // newTokenNumber is the pure token
      newTokenNumber[token] = alias(token);
      LOGV(token) LCV(newTokenNumber[token]);
    }
  }
#endif
#ifdef NOT_FIX3
  // Alternative (disabled) strategy: instead of alias(), clone every
  // rule produced by a dual-use token so the original keeps lexical
  // copies and the clone gets the non-lexical ones.
  for (rule.restart(); (int) rule <= nf; rule.getNext()) {
    int n = rule->length();
    if (n == 0 || rule->lexical) {
      continue;
    }
    while (n-- > 0) {
      Token token = rule.token(n);
      if ((int) token >= maxTokenNumber || newTokenNumber[token]) continue;
      if (newTokenNumber[token] ||
          in_disregard_list(token) ||
          (int) token == eof_token ||
          (token->non_terminal_flag && token->token_set_id) ||
          (int) token == error_token) {
        continue;
      }
      if (token->non_terminal_flag && !token->lexical) {
        continue;
      }
      AgStack<int> ruleList = findRules(token);
      Token newToken = Token::create();
      map_token_number[newToken] = map_token_number[token];
      subs_bnf(token, newToken);
      newTokenNumber[token] = newToken;
      newToken->pure = 1;
      int i;
      for (i = 0; i < ruleList.size(); i++) {
        // Deep-copy each rule of the original token as a non-lexical
        // rule produced by the original token.
        Rule oldRule = ruleList[i];
        Rule newRule = Rule::create();
        map_form_number[newRule] = map_form_number[oldRule];
        int k = oldRule->elementList.size();
        newRule->elementList = AgArray<RuleElement>(k);
        while (k--) {
          newRule->elementList[k] = oldRule->elementList[k];
        }
        newRule->lexical = 0;
        at(bnf_table,(int)token,(int)newRule);
        token->non_terminal_flag = 1;
        newRule->prim_tkn = token;
      }
    }
  }
#endif
  LOGS("alias loop complete");
  // Substitution pass: inside lexical rules, replace each aliased
  // token with its pure alias.  Note this iterates ALL rules
  // (loopNotFinished), including any created by alias() above.
  for (rule.restart(); rule.loopNotFinished(); rule.getNext()) {
    int n = rule->length();
    LOGV(rule) LCV(rule->lexical);
    if (n == 0) continue;
    if (!rule->lexical) continue;
    LOGSECTION("Substitution loop");
    while (n-- > 0) {
      Token token = rule.token(n);
      if (newTokenNumber[token] == 0) {
        continue;
      }
      rule.token(n) = newTokenNumber[token];
      LOGV(token) LCV(newTokenNumber[token]);
    }
  }
  LOGS("Rule loop complete");
  nforms_base = nforms;
}
396
397