/*
 * AnaGram, A System for Syntax Directed Programming
 * Copyright 1993-1999 Parsifal Software. All Rights Reserved.
 * See the file COPYING for license and usage terms.
 *
 * lexeme.cpp - lexeme analysis
 */

#include "arrays.h"
#include "config.h"
#include "data.h"
#include "dict.h"
#include "keyword.h"
#include "lexeme.h"
#include "q1glbl.h"
#include "q5.h"
#include "rpk.h"
#include "rule.h"
#include "stacks.h"
#include "token.h"
#include "tree.h"
#include "tsd.h"

//#define INCLUDE_LOGGING
#include "log.h"


#define FIX3

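// disregardList holds the token numbers of the grammar's disregard
// (noise) tokens; disregard_token receives the number of the single
// noise token that build_noise_token() synthesizes from them.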
AgStack<int> disregardList;
unsigned disregard_token;


// Find and mark the lexical rules.
static void find_lexical_rules(void) {
  LOGSECTION("find_lexical_rules");
  unsigned ku;
  int k;
  iws();
  LOGS("disregard list tokens");
  // First, all the disregard tokens
  for (ku = disregardList.size(); ku--;) {
    //xws(disregard_list[ku]);
    xws(disregardList[ku]);
    //Token(disregardList[ku])->disregard = 1;
    LOGV(disregardList[ku]);
  }
  // Then all the lexemes
  LOGS("lexemes");
  for (ku = 0; ku++ < ntkns;) if (map_token_number[ku].lexeme) {
    xws(ku);
    LOGV(ku);
  }
  // rules produced by disregard tokens and lexemes are
  // lexical rules. Any rule produced by a token found
  // in a lexical rule is also a lexical rule.
  // This loop, in other words, implements a closure
  LOGS("lexical rules");
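  // The list primitives used here (iws/xws/tis/rws) appear to manage
  // a scratch list: xws() adds a token only if it is not already
  // present, and tis() returns the current length, so the loop below
  // keeps running as the closure appends new tokens and visits each
  // token exactly once.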
  for (k = 0; k < tis(); k++) {
    int tn = list_base[k];
    int *bnf = bnf_table->sb;
    int nbnf = bnf_table->nt;
    while (nbnf--) {
      int t = *bnf++, f = *bnf++, n;

      if (t != tn) {
        continue;
      }
      Rule rule(f);
      n = rule->length();
      rule->lexical = 1;
      LOGV(rule) LCV(rule->lexical);
      while (n--) {
        Token token = rule.token(n);
        if (token->non_terminal_flag) {
          xws(token);
          LOGV(token);
        }
      }
    }
  }
  rws();
}

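// Build a single noise token covering all the disregard tokens. With
// exactly one disregard token it is wrapped directly; with several, a
// one-element rule is stacked for each alternative and the results are
// combined. (vp_6, vp_form3, and vp_4 are virtual-production helpers
// defined elsewhere; the precise constructions they perform are not
// visible in this file.)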
static void build_noise_token(void) {
  LOGSECTION("build_noise_token");
  LOGV(disregardList.size());
  if (disregardList.size() == 1) {
    disregard_token = vp_6(disregardList[0]);
    Token token = disregardList[0];
    token->disregard = 1;
  }
  else {
    int n = disregardList.size();
    //int *lb = disregard_list;
    iws();
    int i;
    for (i = 0; i < n; i++) {
      ruleElementStack
        .push(AgStack<RuleElement>())
        .top()
        .push(RuleElement(disregardList[i],0));
      aws(vp_form3(0));
    }
    disregard_token = vp_4();
  }
  extern Token vpRepeatToken;
  Token disregard = Token(disregard_token);
  disregard->disregard = 1;
  vpRepeatToken->disregard = 1;
  LOGV(disregard);
  LOGV((int) vpRepeatToken);
}

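// Return nonzero if tn is one of the declared disregard tokens.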
static int in_disregard_list(int tn) {
  LOGSECTION("in_disregard_list");
  int n = disregardList.size();
  while (n--) {
    if (tn == disregardList[n]) {
      return 1;
    }
  }
  return 0;
}

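// Substitute token number nt for tn as the left-hand side of every
// rule that tn produces. bnf_table is stored as (token, rule) pairs,
// hence the stride of two; prim_tkn is updated whenever the rule's
// primary token was tn.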
static void subs_bnf(int tn, int nt) {
  int *p = bnf_table->sb;
  int n = bnf_table->nt;
  for (; n--; p += 2) {
    if (*p != tn) {
      continue;
    }
    *p = nt;
    Rule rule(p[1]);
    if ((int)rule->prim_tkn == tn) {
      rule->prim_tkn = nt;
    }
  }
}

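// Clone a token that is used in both lexical and non-lexical context.
// The clone (the "pure" token) takes over the original's productions,
// keyword binding, character-set or partition data, and parse tree
// entry. The original is then redefined by a single new rule,
//   token -> pureToken disregard_token
// so that in non-lexical context the token absorbs trailing noise,
// while lexical rules can be rewritten to use the pure clone directly.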
static int alias(Token token) {
  LOGSECTION("alias");

  Token pureToken = Token::create();
  LOGV(token) LCV(pureToken);
  map_token_number[pureToken] = map_token_number[token];
  LOGV(token) LCV(token->value_type) LCV(token->immediate_action);
  LOGV(pureToken) LCV(pureToken->value_type) LCV(pureToken->immediate_action);
  if (token->key) {
    Keyword keyword = token->key;
    keyword->token_number = pureToken;
  }
  pureToken->pure = 1;
  LOGV(token->non_terminal_flag) LCV(token->token_set_id);
  if (token->non_terminal_flag) {
    LOGS("Substituting") LCV(token) LCV(pureToken);
    subs_bnf(token,pureToken);
  }
  token->junky = 1;
  if (token->token_set_id) {
    LOGV(token->token_set_id);
    pureToken->token_set_id = token->token_set_id;
    token->token_set_id = 0;
    int n = part_dict->nsx;
    while (n--) if (map_part_number[n].token_number == (unsigned) token) {
      map_part_number[n].token_number = pureToken;
      LOGV(n);
      break;
    }
    for (Each<Rule> rule; rule.loopNotFinished(); rule.getNext()) {
      if ((int) rule->prim_tkn == token) {
        rule->prim_tkn = pureToken;
        LOGV(rule);
      }
    }
    for (unsigned i = 0; i < n_chars; i++) {
      if (map_char_number[i].token_number == (unsigned) token) {
        map_char_number[i].token_number = pureToken;
      }
    }
  }
  else if (token->part_number) {
    LOGV(token->part_number);
    pureToken->part_number = token->part_number;
    map_part_number[token->part_number].token_number = pureToken;
    token->part_number = 0;
    for (Each<Rule> rule; rule.loopNotFinished(); rule.getNext()) {
      if ((int) rule->prim_tkn == token) {
        rule->prim_tkn = pureToken;
        LOGV(rule);
      }
    }
    for (unsigned i = 0; i < n_chars; i++) {
      if (map_char_number[i].token_number == (unsigned) token) {
        map_char_number[i].token_number = pureToken;
      }
    }
  }
  Rule rule = makeRule(pureToken, disregard_token);
  at(bnf_table, (int)token, (int)rule);
  token->non_terminal_flag = 1;
  rule->prim_tkn = token;
  ParseTree parseTree = token->parse_tree;
  if (parseTree) {
    parseTree->token_number = pureToken;
  }
  LOGV((int) token) LCV(token->value_type);
  LOGV((int) pureToken) LCV(pureToken->value_type);
  return pureToken;
}

#ifdef NOT_FIX3

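// Collect the numbers of all rules that the given token produces,
// i.e. every rule paired with this token in bnf_table.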
static AgStack<int> findRules(Token token) {
  AgStack<int> rules;
  int *p = bnf_table->sb;
  int n = bnf_table->nt;
  for (; n--; p += 2) {
    if (*p != (int) token) continue;
    rules.push(p[1]);
  }
  return rules;
}
#endif

/*
Scan rules and mark token usage as lexical, non-lexical, or both.
Then, for each token that has both lexical and non-lexical usage,
make a clone.
*/

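// Overview of the passes below:
//   1. find_lexical_rules() closes over the rules reachable from
//      disregard tokens and lexemes and flags them lexical.
//   2. build_noise_token() synthesizes disregard_token.
//   3. Every token appearing in a lexical rule is flagged lexical.
//   4. Scanning the non-lexical rules, each eligible token is cloned
//      via alias(), recording the pure clone in newTokenNumber.
//   5. Finally, tokens inside lexical rules are replaced by their
//      pure clones.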
void set_lexemes(void) {
  int nf = nforms;
  nInputRules = nforms;
  LOGSECTION("set_lexemes");
  LOGV(nforms);

  disregard_token = 0;
  if (disregardList.size() == 0) return;
  LocalArray<int> newTokenNumber(ntkns+1);
#ifdef NOT_FIX3
  int maxTokenNumber = ntkns;
#endif
  memset(newTokenNumber, 0, (ntkns+1)*sizeof(*newTokenNumber));
  Each<Rule> rule;
#ifdef INCLUDE_LOGGING
  for (rule.restart(); (int) rule <= nf; rule.getNext()) {
    LOGV(rule) LCV(rule->lexical);
  }
#endif
  find_lexical_rules();
#ifdef INCLUDE_LOGGING
  for (rule.restart(); (int) rule <= nf; rule.getNext()) {
    LOGV(rule) LCV(rule->lexical);
  }
#endif
  build_noise_token();
//#ifdef FIX3
  // mark lexical tokens
  for (rule.restart(); (int) rule <= nf; rule.getNext()) {
    int n = rule->length();
    LOGV(rule) LCV(rule->lexical);
    if (n == 0 || !rule->lexical) {
      continue;
    }
    while (n--) {
      Token token = rule.token(n);
      token->lexical = 1;
    }
  }
//#endif

  // Scan rules which are _not_ lexical
  for (rule.restart(); (int) rule <= nf; rule.getNext()) {
    int n = rule->length();
    LOGV(rule) LCV(rule->lexical);
    if (n == 0 || rule->lexical) {
      continue;
    }
    LOGSECTION("Scanning non-lexical rule");
    LOGV(rule);
    while (n--) {
      Token token = rule.token(n);
      LOGV(token) LCV(token->token_set_id) LCV(token->non_terminal_flag);
      LOGV(token->lexeme) LCV(token->lexical) LCV(in_disregard_list(token));
      LOGV(token->disregard);
      if (newTokenNumber[token] ||
          in_disregard_list(token) ||
          (token->non_terminal_flag && token->token_set_id) ||
          token->disregard ||
          (int) token == eof_token ||
          (int) token == error_token) {
        continue;
      }
      if (token->non_terminal_flag && !token->lexeme) {
        continue;
      }
      // newTokenNumber is the pure token
      newTokenNumber[token] = alias(token);
      LOGV(token) LCV(newTokenNumber[token]);
    }
  }
#ifdef FIX3
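  // Second scan, enabled by FIX3: identical to the loop above except
  // that the final skip test checks token->lexical rather than
  // token->lexeme, so it also aliases nonterminals that were only
  // flagged lexical in the marking pass.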
  for (rule.restart(); (int) rule <= nf; rule.getNext()) {
    int n = rule->length();
    LOGV(rule) LCV(rule->lexical);
    if (n == 0 || rule->lexical) {
      continue;
    }
    LOGSECTION("Scanning non-lexical rule");
    LOGV(rule);
    while (n--) {
      Token token = rule.token(n);
      LOGV(token) LCV(token->token_set_id) LCV(token->non_terminal_flag);
      LOGV(token->lexeme) LCV(token->lexical) LCV(in_disregard_list(token));
      LOGV(token->disregard);
      if (newTokenNumber[token] ||
          in_disregard_list(token) ||
          (token->non_terminal_flag && token->token_set_id) ||
          token->disregard ||
          (int) token == eof_token ||
          (int) token == error_token) {
        continue;
      }
      if (token->non_terminal_flag && !token->lexical) {
        continue;
      }
      // newTokenNumber is the pure token
      newTokenNumber[token] = alias(token);
      LOGV(token) LCV(newTokenNumber[token]);
    }
  }
#endif
#ifdef NOT_FIX3
  for (rule.restart(); (int) rule <= nf; rule.getNext()) {
    int n = rule->length();
    if (n == 0 || rule->lexical) {
      continue;
    }
    while (n-- > 0) {
      Token token = rule.token(n);
      if ((int) token >= maxTokenNumber || newTokenNumber[token]) continue;
      if (newTokenNumber[token] ||
          in_disregard_list(token) ||
          (int) token == eof_token ||
          (token->non_terminal_flag && token->token_set_id) ||
          (int) token == error_token) {
        continue;
      }
      if (token->non_terminal_flag && !token->lexical) {
        continue;
      }
      AgStack<int> ruleList = findRules(token);
      Token newToken = Token::create();
      map_token_number[newToken] = map_token_number[token];
      subs_bnf(token, newToken);
      newTokenNumber[token] = newToken;
      newToken->pure = 1;
      int i;
      for (i = 0; i < ruleList.size(); i++) {
        Rule oldRule = ruleList[i];
        Rule newRule = Rule::create();
        map_form_number[newRule] = map_form_number[oldRule];
        int k = oldRule->elementList.size();
        newRule->elementList = AgArray<RuleElement>(k);
        while (k--) {
          newRule->elementList[k] = oldRule->elementList[k];
        }
        newRule->lexical = 0;
        at(bnf_table,(int)token,(int)newRule);
        token->non_terminal_flag = 1;
        newRule->prim_tkn = token;
      }
    }
  }
#endif
  LOGS("alias loop complete");
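  // Finally, rewrite the lexical rules themselves to refer to the
  // pure clones directly, so that noise is not absorbed inside a
  // lexical construct.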
  for (rule.restart(); rule.loopNotFinished(); rule.getNext()) {
    int n = rule->length();
    LOGV(rule) LCV(rule->lexical);
    if (n == 0) continue;
    if (!rule->lexical) continue;
    LOGSECTION("Substitution loop");
    while (n-- > 0) {
      Token token = rule.token(n);
      if (newTokenNumber[token] == 0) {
        continue;
      }
      rule.token(n) = newTokenNumber[token];
      LOGV(token) LCV(newTokenNumber[token]);
    }
  }
  LOGS("Rule loop complete");
  nforms_base = nforms;
}