diff anagram/agcore/lexeme.cpp @ 0:13d2b8934445

Import AnaGram (near-)release tree into Mercurial.
author David A. Holland
date Sat, 22 Dec 2007 17:52:45 -0500
parents
children
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/anagram/agcore/lexeme.cpp	Sat Dec 22 17:52:45 2007 -0500
@@ -0,0 +1,397 @@
+/*
+ * AnaGram, A System for Syntax Directed Programming
+ * Copyright 1993-1999 Parsifal Software. All Rights Reserved.
+ * See the file COPYING for license and usage terms.
+ *
+ * lexeme.cpp - lexeme analysis
+ */
+
+#include "arrays.h"
+#include "config.h"
+#include "data.h"
+#include "dict.h"
+#include "keyword.h"
+#include "lexeme.h"
+#include "q1glbl.h"
+#include "q5.h"
+#include "rpk.h"
+#include "rule.h"
+#include "stacks.h"
+#include "token.h"
+#include "tree.h"
+#include "tsd.h"
+
+//#define INCLUDE_LOGGING
+#include "log.h"
+
+
+#define FIX3
+
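+// Tokens the grammar marks as "disregard" (noise such as white space
+// or comments), and the token number synthesized to stand for them.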
+AgStack<int> disregardList;
+unsigned disregard_token;
+
+
+// Find and mark the lexical rules.
+static void find_lexical_rules(void) {
+  LOGSECTION("find_lexical_rules");
+  unsigned ku;
+  int k;
+  iws();
+  LOGS("disregard list tokens");
+  // First, all the disregard tokens
+  for (ku = disregardList.size(); ku--;) {
+    //xws(disregard_list[ku]);
+    xws(disregardList[ku]);
+    //Token(disregardList[ku])->disregard = 1;
+    LOGV(disregardList[ku]);
+  }
+  // Then all the lexemes
+  LOGS("lexemes");
+  for (ku = 1; ku <= ntkns; ku++) if (map_token_number[ku].lexeme) {
+    xws(ku);
+    LOGV(ku);
+  }
+  // Rules produced by disregard tokens and lexemes are
+  // lexical rules. Any rule produced by a token found
+  // in a lexical rule is also a lexical rule.
+  // This loop, in other words, implements a closure.
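+  // For instance (hypothetical grammar): if the lexeme 'number'
+  // produces 'digit string', then 'digit string' is marked, and any
+  // rules 'digit string' produces become lexical in turn, and so on
+  // until no new tokens are added.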
+  LOGS("lexical rules");
+  for (k = 0; k < tis(); k++) {
+    int tn = list_base[k];
+    int *bnf = bnf_table->sb;
+    int nbnf = bnf_table->nt;
+    while (nbnf--) {
+      int t = *bnf++, f = *bnf++, n;
+
+      if (t != tn) {
+	continue;
+      }
+      Rule rule(f);
+      n = rule->length();
+      rule->lexical = 1;
+      LOGV(rule) LCV(rule->lexical);
+      while (n--) {
+        Token token = rule.token(n);
+        if (token->non_terminal_flag) {
+          xws(token);
+          LOGV(token);
+        }
+      }
+    }
+  }
+  rws();
+}
+
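+// Build the single "noise" token representing disregarded input. A
+// lone disregard token is wrapped directly (vp_6); several are first
+// combined into one virtual production (vp_4), presumably as an
+// alternation of the choices.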
+static void build_noise_token(void) {
+  LOGSECTION("build_noise_token");
+  LOGV(disregardList.size());
+  if (disregardList.size() == 1) {
+    disregard_token = vp_6(disregardList[0]);
+    Token token = disregardList[0];
+    token->disregard = 1;
+  }
+  else {
+    int n = disregardList.size();
+    //int *lb = disregard_list;
+    iws();
+    int i;
+    for (i = 0; i < n; i++) {
+      ruleElementStack
+          .push(AgStack<RuleElement>())
+          .top()
+          .push(RuleElement(disregardList[i],0));
+      aws(vp_form3(0));
+    }
+    disregard_token = vp_4();
+  }
+  extern Token vpRepeatToken;
+  Token disregard = Token(disregard_token);
+  disregard->disregard = 1;
+  vpRepeatToken->disregard = 1;
+  LOGV(disregard);
+  LOGV((int) vpRepeatToken);
+}
+
+static int in_disregard_list(int tn) {
+  LOGSECTION("in_disregard_list");
+  int n = disregardList.size();
+  while (n--) {
+    if (tn == disregardList[n]) {
+      return 1;
+    }
+  }
+  return 0;
+}
+
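+// Transfer all of tn's productions to nt: every bnf_table entry that
+// names tn as the producing token is rewritten to name nt, and each
+// affected rule's primary token is updated to match.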
+static void subs_bnf(int tn, int nt) {
+  int *p = bnf_table->sb;
+  int n = bnf_table->nt;
+  for (; n--; p += 2) {
+    if (*p != tn) {
+      continue;
+    }
+    *p = nt;
+    Rule rule(p[1]);
+    if ((int)rule->prim_tkn == tn) {
+      rule->prim_tkn = nt;
+    }
+  }
+}
+
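+// Clone 'token' into a "pure" token that takes over its productions,
+// keyword, token set, and character-map entries. The original token
+// is left as a nonterminal with the single rule
+//   token -> pure token, disregard token
+// so that noise is skipped after it in non-lexical contexts.
+// Returns the pure token number.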
+static int alias(Token token) {
+  LOGSECTION("alias");
+
+  Token pureToken = Token::create();
+  LOGV(token) LCV(pureToken);
+  map_token_number[pureToken] = map_token_number[token];
+  LOGV(token) LCV(token->value_type) LCV(token->immediate_action);
+  LOGV(pureToken) LCV(pureToken->value_type) LCV(pureToken->immediate_action);
+  if (token->key) {
+    Keyword keyword = token->key;
+    keyword->token_number = pureToken;
+  }
+  pureToken->pure = 1;
+  LOGV(token->non_terminal_flag) LCV(token->token_set_id);
+  if (token->non_terminal_flag) {
+    LOGS("Substituting") LCV(token) LCV(pureToken);
+    subs_bnf(token,pureToken);
+  }
+  token->junky = 1;
+  if (token->token_set_id) {
+    LOGV(token->token_set_id);
+    pureToken->token_set_id = token->token_set_id;
+    token->token_set_id = 0;
+    int n = part_dict->nsx;
+    while (n--) if (map_part_number[n].token_number == (unsigned) token) {
+      map_part_number[n].token_number = pureToken;
+      LOGV(n);
+      break;
+    }
+    for (Each<Rule> rule; rule.loopNotFinished(); rule.getNext()) {
+      if ((int) rule->prim_tkn == token) {
+        rule->prim_tkn = pureToken;
+        LOGV(rule);
+      }
+    }
+    for (unsigned i = 0; i < n_chars; i++) {
+      if (map_char_number[i].token_number == (unsigned) token) {
+        map_char_number[i].token_number = pureToken;
+      }
+    }
+  }
+  else if (token->part_number) {
+    LOGV(token->part_number);
+    pureToken->part_number = token->part_number;
+    map_part_number[token->part_number].token_number = pureToken;
+    token->part_number = 0;
+    for (Each<Rule> rule; rule.loopNotFinished(); rule.getNext()) {
+      if ((int) rule->prim_tkn == token) {
+        rule->prim_tkn = pureToken;
+        LOGV(rule);
+      }
+    }
+    for (unsigned i = 0; i < n_chars; i++) {
+      if (map_char_number[i].token_number == (unsigned) token) {
+        map_char_number[i].token_number = pureToken;
+      }
+    }
+  }
+  Rule rule = makeRule(pureToken, disregard_token);
+  at(bnf_table, (int)token, (int)rule);
+  token->non_terminal_flag = 1;
+  rule->prim_tkn = token;
+  ParseTree parseTree = token->parse_tree;
+  if (parseTree) {
+    parseTree->token_number = pureToken;
+  }
+  LOGV((int) token) LCV(token->value_type);
+  LOGV((int) pureToken) LCV(pureToken->value_type);
+  return pureToken;
+}
+
+#ifdef NOT_FIX3
+
+static AgStack<int> findRules(Token token) {
+  AgStack<int> rules;
+  int *p = bnf_table->sb;
+  int n = bnf_table->nt;
+  for (; n--; p += 2) {
+    if (*p != (int) token) continue;
+    rules.push(p[1]);
+  }
+  return rules;
+}
+#endif
+
+/*
+ Scan rules and mark token usage as lexical, non-lexical, or both.
+ Then, for each token that has both lexical and non-lexical usage,
+ make a clone.
+*/
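+// A hypothetical illustration (invented token names): given
+//
+//   name (lexeme)  -> letter, {letter}
+//   statement      -> name, '=', value
+//
+// 'name' is used lexically (inside lexical rules) and non-lexically
+// (in 'statement'), so alias() splits it: a pure clone keeps the
+// original productions and replaces 'name' inside lexical rules,
+// while 'name' itself becomes a nonterminal with the single rule
+// name -> pure name, disregard token, so noise is skipped after
+// 'name' only in non-lexical contexts.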
+
+void set_lexemes(void) {
+  int nf = nforms;
+  nInputRules = nforms;
+  LOGSECTION("set_lexemes");
+  LOGV(nforms);
+
+  disregard_token = 0;
+  if (disregardList.size() == 0) return;
+  LocalArray<int> newTokenNumber(ntkns+1);
+#ifdef NOT_FIX3
+  int maxTokenNumber = ntkns;
+#endif
+  memset(newTokenNumber, 0, (ntkns+1)*sizeof(*newTokenNumber));
+  Each<Rule> rule;
+#ifdef INCLUDE_LOGGING
+  for (rule.restart(); (int) rule <= nf; rule.getNext()) {
+    LOGV(rule) LCV(rule->lexical);
+  }
+#endif
+  find_lexical_rules();
+#ifdef INCLUDE_LOGGING
+  for (rule.restart(); (int) rule <= nf; rule.getNext()) {
+    LOGV(rule) LCV(rule->lexical);
+  }
+#endif
+  build_noise_token();
+//#ifdef FIX3
+  // mark lexical tokens
+  for (rule.restart(); (int) rule <= nf; rule.getNext()) {
+    int n = rule->length();
+    LOGV(rule) LCV(rule->lexical);
+    if (n == 0 || !rule->lexical) {
+      continue;
+    }
+    while (n--) {
+      Token token = rule.token(n);
+      token->lexical = 1;
+    }
+  }
+//#endif
+
+  // Scan rules which are _not_ lexical
+  for (rule.restart(); (int) rule <= nf; rule.getNext()) {
+    int n = rule->length();
+    LOGV(rule) LCV(rule->lexical);
+    if (n == 0 || rule->lexical) {
+      continue;
+    }
+    LOGSECTION("Scanning non-lexical rule");
+    LOGV(rule);
+    while (n--) {
+      Token token = rule.token(n);
+      LOGV(token) LCV(token->token_set_id) LCV(token->non_terminal_flag);
+      LOGV(token->lexeme) LCV(token->lexical) LCV(in_disregard_list(token));
+      LOGV(token->disregard);
+      if (newTokenNumber[token] ||
+	  in_disregard_list(token) ||
+	  (token->non_terminal_flag && token->token_set_id) ||
+	  token->disregard ||
+	  (int) token == eof_token ||
+	  (int) token == error_token) {
+	continue;
+      }
+      if (token->non_terminal_flag && !token->lexeme) {
+	continue;
+      }
+      // newTokenNumber is the pure token
+      newTokenNumber[token] = alias(token);
+      LOGV(token) LCV(newTokenNumber[token]);
+    }
+  }
+#ifdef FIX3
+  for (rule.restart(); (int) rule <= nf; rule.getNext()) {
+    int n = rule->length();
+    LOGV(rule) LCV(rule->lexical);
+    if (n == 0 || rule->lexical) {
+      continue;
+    }
+    LOGSECTION("Scanning non-lexical rule");
+    LOGV(rule);
+    while (n--) {
+      Token token = rule.token(n);
+      LOGV(token) LCV(token->token_set_id) LCV(token->non_terminal_flag);
+      LOGV(token->lexeme) LCV(token->lexical) LCV(in_disregard_list(token));
+      LOGV(token->disregard);
+      if (newTokenNumber[token] ||
+	  in_disregard_list(token) ||
+	  (token->non_terminal_flag && token->token_set_id) ||
+	  token->disregard ||
+	  (int) token == eof_token ||
+	  (int) token == error_token) {
+	continue;
+      }
+      if (token->non_terminal_flag && !token->lexical) {
+	continue;
+      }
+      // newTokenNumber is the pure token
+      newTokenNumber[token] = alias(token);
+      LOGV(token) LCV(newTokenNumber[token]);
+    }
+  }
+#endif
+#ifdef NOT_FIX3
+  for (rule.restart(); (int) rule <= nf; rule.getNext()) {
+    int n = rule->length();
+    if (n == 0 || rule->lexical) {
+      continue;
+    }
+    while (n-- > 0) {
+      Token token = rule.token(n);
+      if ((int) token >= maxTokenNumber || newTokenNumber[token]) continue;
+      if (newTokenNumber[token] ||
+	  in_disregard_list(token) ||
+	  (int) token == eof_token ||
+	  (token->non_terminal_flag && token->token_set_id) ||
+	  (int) token == error_token) {
+	continue;
+      }
+      if (token->non_terminal_flag && !token->lexical) {
+	continue;
+      }
+      AgStack<int> ruleList = findRules(token);
+      Token newToken = Token::create();
+      map_token_number[newToken] = map_token_number[token];
+      subs_bnf(token, newToken);
+      newTokenNumber[token] = newToken;
+      newToken->pure = 1;
+      int i;
+      for (i = 0; i < ruleList.size(); i++) {
+        Rule oldRule = ruleList[i];
+        Rule newRule = Rule::create();
+        map_form_number[newRule] = map_form_number[oldRule];
+        int k = oldRule->elementList.size();
+        newRule->elementList = AgArray<RuleElement>(k);
+        while (k--) {
+	  newRule->elementList[k] = oldRule->elementList[k];
+	}
+        newRule->lexical = 0;
+        at(bnf_table,(int)token,(int)newRule);
+        token->non_terminal_flag = 1;
+        newRule->prim_tkn = token;
+      }
+    }
+  }
+#endif
+  LOGS("alias loop complete");
+  for (rule.restart(); rule.loopNotFinished(); rule.getNext()) {
+    int n = rule->length();
+    LOGV(rule) LCV(rule->lexical);
+    if (n == 0) continue;
+    if (!rule->lexical) continue;
+    LOGSECTION("Substitution loop");
+    while (n-- > 0) {
+      Token token = rule.token(n);
+      if (newTokenNumber[token] == 0) {
+	continue;
+      }
+      rule.token(n) = newTokenNumber[token];
+      LOGV(token) LCV(newTokenNumber[token]);
+    }
+  }
+  LOGS("Rule loop complete");
+  nforms_base = nforms;
+}
+
+