diff anagram/agcore/lexeme.cpp @ 0:13d2b8934445
Import AnaGram (near-)release tree into Mercurial.
author    David A. Holland
date      Sat, 22 Dec 2007 17:52:45 -0500
parents
children
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/anagram/agcore/lexeme.cpp	Sat Dec 22 17:52:45 2007 -0500
@@ -0,0 +1,397 @@
+/*
+ * AnaGram, A System for Syntax Directed Programming
+ * Copyright 1993-1999 Parsifal Software. All Rights Reserved.
+ * See the file COPYING for license and usage terms.
+ *
+ * lexeme.cpp - lexeme analysis
+ */
+
+#include "arrays.h"
+#include "config.h"
+#include "data.h"
+#include "dict.h"
+#include "keyword.h"
+#include "lexeme.h"
+#include "q1glbl.h"
+#include "q5.h"
+#include "rpk.h"
+#include "rule.h"
+#include "stacks.h"
+#include "token.h"
+#include "tree.h"
+#include "tsd.h"
+
+//#define INCLUDE_LOGGING
+#include "log.h"
+
+
+#define FIX3
+
+AgStack<int> disregardList;
+unsigned disregard_token;
+
+
+// Find and mark the lexical rules.
+static void find_lexical_rules(void) {
+  LOGSECTION("find_lexical_rules");
+  unsigned ku;
+  int k;
+  iws();
+  LOGS("disregard list tokens");
+  // First, all the disregard tokens
+  for (ku = disregardList.size(); ku--;) {
+    //xws(disregard_list[ku]);
+    xws(disregardList[ku]);
+    //Token(disregardList[ku])->disregard = 1;
+    LOGV(disregardList[ku]);
+  }
+  // Then all the lexemes
+  LOGS("lexemes");
+  for (ku = 0; ku++ < ntkns;) if (map_token_number[ku].lexeme) {
+    xws(ku);
+    LOGV(ku);
+  }
+  // Rules produced by disregard tokens and lexemes are
+  // lexical rules. Any rule produced by a token found
+  // in a lexical rule is also a lexical rule.
+  // This loop, in other words, implements a closure.
+  LOGS("lexical rules");
+  for (k = 0; k < tis(); k++) {
+    int tn = list_base[k];
+    int *bnf = bnf_table->sb;
+    int nbnf = bnf_table->nt;
+    while (nbnf--) {
+      int t = *bnf++, f = *bnf++, n;
+
+      if (t != tn) {
+        continue;
+      }
+      Rule rule(f);
+      n = rule->length();
+      rule->lexical = 1;
+      LOGV(rule) LCV(rule->lexical);
+      while (n--) {
+        Token token = rule.token(n);
+        if (token->non_terminal_flag) {
+          xws(token);
+          LOGV(token);
+        }
+      }
+    }
+  }
+  rws();
+}
+
+static void build_noise_token(void) {
+  LOGSECTION("build_noise_token");
+  LOGV(disregardList.size());
+  if (disregardList.size() == 1) {
+    disregard_token = vp_6(disregardList[0]);
+    Token token = disregardList[0];
+    token->disregard = 1;
+  }
+  else {
+    int n = disregardList.size();
+    //int *lb = disregard_list;
+    iws();
+    int i;
+    for (i = 0; i < n; i++) {
+      ruleElementStack
+        .push(AgStack<RuleElement>())
+        .top()
+        .push(RuleElement(disregardList[i], 0));
+      aws(vp_form3(0));
+    }
+    disregard_token = vp_4();
+  }
+  extern Token vpRepeatToken;
+  Token disregard = Token(disregard_token);
+  disregard->disregard = 1;
+  vpRepeatToken->disregard = 1;
+  LOGV(disregard);
+  LOGV((int) vpRepeatToken);
+}
+
+static int in_disregard_list(int tn) {
+  LOGSECTION("in_disregard_list");
+  int n = disregardList.size();
+  while (n--) {
+    if (tn == disregardList[n]) {
+      return 1;
+    }
+  }
+  return 0;
+}
+
+static void subs_bnf(int tn, int nt) {
+  int *p = bnf_table->sb;
+  int n = bnf_table->nt;
+  for (; n--; p += 2) {
+    if (*p != tn) {
+      continue;
+    }
+    *p = nt;
+    Rule rule(p[1]);
+    if ((int)rule->prim_tkn == tn) {
+      rule->prim_tkn = nt;
+    }
+  }
+}
+
+static int alias(Token token) {
+  LOGSECTION("alias");
+
+  Token pureToken = Token::create();
+  LOGV(token) LCV(pureToken);
+  map_token_number[pureToken] = map_token_number[token];
+  LOGV(token) LCV(token->value_type) LCV(token->immediate_action);
+  LOGV(pureToken) LCV(pureToken->value_type) LCV(token->immediate_action);
+  if (token->key) {
+    Keyword keyword = token->key;
+    keyword->token_number = pureToken;
+  }
+  pureToken->pure = 1;
+  LOGV(token->non_terminal_flag) LCV(token->token_set_id);
+  if (token->non_terminal_flag) {
+    LOGS("Substituting") LCV(token) LCV(pureToken);
+    subs_bnf(token, pureToken);
+  }
+  token->junky = 1;
+  if (token->token_set_id) {
+    LOGV(token->token_set_id);
+    pureToken->token_set_id = token->token_set_id;
+    token->token_set_id = 0;
+    int n = part_dict->nsx;
+    while (n--) if (map_part_number[n].token_number == (unsigned) token) {
+      map_part_number[n].token_number = pureToken;
+      LOGV(n);
+      break;
+    }
+    for (Each<Rule> rule; rule.loopNotFinished(); rule.getNext()) {
+      if ((int) rule->prim_tkn == token) {
+        rule->prim_tkn = pureToken;
+        LOGV(rule);
+      }
+    }
+    for (unsigned i = 0; i < n_chars; i++) {
+      if (map_char_number[i].token_number == (unsigned) token) {
+        map_char_number[i].token_number = pureToken;
+      }
+    }
+  }
+  else if (token->part_number) {
+    LOGV(token->part_number);
+    pureToken->part_number = token->part_number;
+    map_part_number[token->part_number].token_number = pureToken;
+    token->part_number = 0;
+    for (Each<Rule> rule; rule.loopNotFinished(); rule.getNext()) {
+      if ((int) rule->prim_tkn == token) {
+        rule->prim_tkn = pureToken;
+        LOGV(rule);
+      }
+    }
+    for (unsigned i = 0; i < n_chars; i++) {
+      if (map_char_number[i].token_number == (unsigned) token) {
+        map_char_number[i].token_number = pureToken;
+      }
+    }
+  }
+  Rule rule = makeRule(pureToken, disregard_token);
+  at(bnf_table, (int)token, (int)rule);
+  token->non_terminal_flag = 1;
+  rule->prim_tkn = token;
+  ParseTree parseTree = token->parse_tree;
+  if (parseTree) {
+    parseTree->token_number = pureToken;
+  }
+  LOGV((int) token) LCV(token->value_type);
+  LOGV((int) pureToken) LCV(pureToken->value_type);
+  return pureToken;
+}
+
+#ifdef NOT_FIX3
+
+static AgStack<int> findRules(Token token) {
+  AgStack<int> rules;
+  int *p = bnf_table->sb;
+  int n = bnf_table->nt;
+  for (; n--; p += 2) {
+    if (*p != (int) token) continue;
+    rules.push(p[1]);
+  }
+  return rules;
+}
+#endif
+
+/*
+ * Scan rules and mark token usage as lexical, non-lexical, or both.
+ * Then, for each token that has both lexical and non-lexical usage,
+ * make a clone.
+ */
+
+void set_lexemes(void) {
+  int nf = nforms;
+  nInputRules = nforms;
+  LOGSECTION("set_lexemes");
+  LOGV(nforms);
+
+  disregard_token = 0;
+  if (disregardList.size() == 0) return;
+  LocalArray<int> newTokenNumber(ntkns + 1);
+#ifdef NOT_FIX3
+  int maxTokenNumber = ntkns;
+#endif
+  memset(newTokenNumber, 0, (ntkns + 1)*sizeof(*newTokenNumber));
+  Each<Rule> rule;
+#ifdef INCLUDE_LOGGING
+  for (rule.restart(); (int) rule <= nf; rule.getNext()) {
+    LOGV(rule) LCV(rule->lexical);
+  }
+#endif
+  find_lexical_rules();
+#ifdef INCLUDE_LOGGING
+  for (rule.restart(); (int) rule <= nf; rule.getNext()) {
+    LOGV(rule) LCV(rule->lexical);
+  }
+#endif
+  build_noise_token();
+//#ifdef FIX3
+  // mark lexical tokens
+  for (rule.restart(); (int) rule <= nf; rule.getNext()) {
+    int n = rule->length();
+    LOGV(rule) LCV(rule->lexical);
+    if (n == 0 || !rule->lexical) {
+      continue;
+    }
+    while (n--) {
+      Token token = rule.token(n);
+      token->lexical = 1;
+    }
+  }
+//#endif
+
+  // Scan rules which are _not_ lexical
+  for (rule.restart(); (int) rule <= nf; rule.getNext()) {
+    int n = rule->length();
+    LOGV(rule) LCV(rule->lexical);
+    if (n == 0 || rule->lexical) {
+      continue;
+    }
+    LOGSECTION("Scanning non-lexical rule");
+    LOGV(rule);
+    while (n--) {
+      Token token = rule.token(n);
+      LOGV(token) LCV(token->token_set_id) LCV(token->non_terminal_flag);
+      LOGV(token->lexeme) LCV(token->lexical) LCV(in_disregard_list(token));
+      LOGV(token->disregard);
+      if (newTokenNumber[token] ||
+          in_disregard_list(token) ||
+          (token->non_terminal_flag && token->token_set_id) ||
+          token->disregard ||
+          (int) token == eof_token ||
+          (int) token == error_token) {
+        continue;
+      }
+      if (token->non_terminal_flag && !token->lexeme) {
+        continue;
+      }
+      // newTokenNumber is the pure token
+      newTokenNumber[token] = alias(token);
+      LOGV(token) LCV(newTokenNumber[token]);
+    }
+  }
+#ifdef FIX3
+  for (rule.restart(); (int) rule <= nf; rule.getNext()) {
+    int n = rule->length();
+    LOGV(rule) LCV(rule->lexical);
+    if (n == 0 || rule->lexical) {
+      continue;
+    }
+    LOGSECTION("Scanning non-lexical rule");
+    LOGV(rule);
+    while (n--) {
+      Token token = rule.token(n);
+      LOGV(token) LCV(token->token_set_id) LCV(token->non_terminal_flag);
+      LOGV(token->lexeme) LCV(token->lexical) LCV(in_disregard_list(token));
+      LOGV(token->disregard);
+      if (newTokenNumber[token] ||
+          in_disregard_list(token) ||
+          (token->non_terminal_flag && token->token_set_id) ||
+          token->disregard ||
+          (int) token == eof_token ||
+          (int) token == error_token) {
+        continue;
+      }
+      if (token->non_terminal_flag && !token->lexical) {
+        continue;
+      }
+      // newTokenNumber is the pure token
+      newTokenNumber[token] = alias(token);
+      LOGV(token) LCV(newTokenNumber[token]);
+    }
+  }
+#endif
+#ifdef NOT_FIX3
+  for (rule.restart(); (int) rule <= nf; rule.getNext()) {
+    int n = rule->length();
+    if (n == 0 || rule->lexical) {
+      continue;
+    }
+    while (n-- > 0) {
+      Token token = rule.token(n);
+      if ((int) token >= maxTokenNumber || newTokenNumber[token]) continue;
+      if (newTokenNumber[token] ||
+          in_disregard_list(token) ||
+          (int) token == eof_token ||
+          (token->non_terminal_flag && token->token_set_id) ||
+          (int) token == error_token) {
+        continue;
+      }
+      if (token->non_terminal_flag && !token->lexical) {
+        continue;
+      }
+      AgStack<int> ruleList = findRules(token);
+      Token newToken = Token::create();
+      map_token_number[newToken] = map_token_number[token];
+      subs_bnf(token, newToken);
+      newTokenNumber[token] = newToken;
+      newToken->pure = 1;
+      int i;
+      for (i = 0; i < ruleList.size(); i++) {
+        Rule oldRule = ruleList[i];
+        Rule newRule = Rule::create();
+        map_form_number[newRule] = map_form_number[oldRule];
+        int k = oldRule->elementList.size();
+        newRule->elementList = AgArray<RuleElement>(k);
+        while (k--) {
+          newRule->elementList[k] = oldRule->elementList[k];
+        }
+        newRule->lexical = 0;
+        at(bnf_table, (int)token, (int)newRule);
+        token->non_terminal_flag = 1;
+        newRule->prim_tkn = token;
+      }
+    }
+  }
+#endif
+  LOGS("alias loop complete");
+  for (rule.restart(); rule.loopNotFinished(); rule.getNext()) {
+    int n = rule->length();
+    LOGV(rule) LCV(rule->lexical);
+    if (n == 0) continue;
+    if (!rule->lexical) continue;
+    LOGSECTION("Substitution loop");
+    while (n-- > 0) {
+      Token token = rule.token(n);
+      if (newTokenNumber[token] == 0) {
+        continue;
+      }
+      rule.token(n) = newTokenNumber[token];
+      LOGV(token) LCV(newTokenNumber[token]);
+    }
+  }
+  LOGS("Rule loop complete");
+  nforms_base = nforms;
+}
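
Editor's note: the worklist idiom in find_lexical_rules (the iws()/xws()/tis()/rws()
calls) is compact but opaque out of context. Assuming those helpers implement a
deduplicating work stack, which is what the "for (k = 0; k < tis(); k++)" scan
over a growing list suggests, the closure it computes looks roughly like the
self-contained sketch below. BnfEntry and markLexicalClosure are hypothetical
stand-ins, not AnaGram types.

    #include <set>
    #include <vector>

    // Hypothetical stand-in for one bnf_table entry: `token` produces `rule`.
    struct BnfEntry { int token; int rule; };

    // Sketch of the closure: seed the work set with the disregard tokens and
    // lexemes, then repeatedly mark every rule produced by a token in the set
    // as lexical and add that rule's nonterminals back into the set.
    std::set<int> markLexicalClosure(const std::vector<BnfEntry> &bnf,
                                     const std::vector<std::vector<int> > &ruleTokens,
                                     const std::vector<bool> &isNonTerminal,
                                     std::vector<bool> &ruleIsLexical,
                                     const std::vector<int> &seed) {
      std::set<int> lexical(seed.begin(), seed.end());
      std::vector<int> work(seed.begin(), seed.end()); // analogous to iws()/xws()
      while (!work.empty()) {
        int tn = work.back();
        work.pop_back();
        for (const BnfEntry &e : bnf) {       // scan bnf_table for rules of tn
          if (e.token != tn) continue;
          ruleIsLexical[e.rule] = true;       // rule->lexical = 1
          for (int t : ruleTokens[e.rule]) {  // descend into the rule body
            if (isNonTerminal[t] && lexical.insert(t).second) {
              work.push_back(t);              // newly seen nonterminal: enqueue
            }
          }
        }
      }
      return lexical;
    }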
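The doc comment on set_lexemes summarizes the cloning scheme; if I read alias()
correctly, the transformation is: a new "pure" token takes over the original
token's defining rules (subs_bnf), and the original token is redefined as "pure
followed by the disregard token", so noise is absorbed after a lexeme only in
non-lexical contexts. A miniature sketch under those assumptions, with
hypothetical MiniGrammar/aliasToken types rather than AnaGram's Token and Rule:

    #include <vector>

    // Minimal stand-ins; AnaGram's Token and Rule objects are far richer.
    struct MiniRule { int lhs; std::vector<int> rhs; };
    struct MiniGrammar {
      int nextToken = 0;
      std::vector<MiniRule> rules;
      int newToken() { return nextToken++; }
    };

    // Sketch of alias(): split a token with both lexical and non-lexical uses.
    int aliasToken(MiniGrammar &g, int token, int disregardToken) {
      int pure = g.newToken();
      for (MiniRule &r : g.rules) {
        if (r.lhs == token) r.lhs = pure;  // subs_bnf: rules now define `pure`
      }
      // token -> pure disregard  (makeRule + at(bnf_table, ...))
      g.rules.push_back(MiniRule{token, {pure, disregardToken}});
      return pure;                         // caller records newTokenNumber[token]
    }

The final substitution loop in set_lexemes then rewrites lexical rules to use
newTokenNumber[token] directly, so nothing inside a lexeme ever consumes the
disregard token.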