view anagram/agcore/lexeme.cpp @ 0:13d2b8934445

Import AnaGram (near-)release tree into Mercurial.
author David A. Holland
date Sat, 22 Dec 2007 17:52:45 -0500
parents
children
line wrap: on
line source

/*
 * AnaGram, A System for Syntax Directed Programming
 * Copyright 1993-1999 Parsifal Software. All Rights Reserved.
 * See the file COPYING for license and usage terms.
 *
 * lexeme.cpp - lexeme analysis
 */

#include "arrays.h"
#include "config.h"
#include "data.h"
#include "dict.h"
#include "keyword.h"
#include "lexeme.h"
#include "q1glbl.h"
#include "q5.h"
#include "rpk.h"
#include "rule.h"
#include "stacks.h"
#include "token.h"
#include "tree.h"
#include "tsd.h"

//#define INCLUDE_LOGGING
#include "log.h"


#define FIX3

AgStack<int> disregardList;
unsigned disregard_token;


// Find and mark the lexical rules.
static void find_lexical_rules(void) {
  LOGSECTION("find_lexical_rules");
  unsigned ku;
  int k;
  iws();
  LOGS("disregard list tokens");
  // First, all the disregard tokens
  for (ku = disregardList.size(); ku--;) {
    //xws(disregard_list[ku]);
    xws(disregardList[ku]);
    //Token(disregardList[ku])->disregard = 1;
    LOGV(disregardList[ku]);
  }
  // Then all the lexemes
  LOGS("lexemes");
  for (ku = 0; ku++ < ntkns;) if (map_token_number[ku].lexeme) {
    xws(ku);
    LOGV(ku);
  }
  // rules produced by disregard tokens and lexemes are
  // lexical rules. Any rule produced by a token found
  // in a lexical rule is also a lexical rule.
  // This loop, in other words, implements a closure
  LOGS("lexical rules");
  for (k = 0; k < tis(); k++) {
    int tn = list_base[k];
    int *bnf = bnf_table->sb;
    int nbnf = bnf_table->nt;
    while (nbnf--) {
      int t = *bnf++, f = *bnf++, n;

      if (t != tn) {
	continue;
      }
      Rule rule(f);
      n = rule->length();
      rule->lexical = 1;
      LOGV(rule) LCV(rule->lexical);
      while (n--) {
        Token token = rule.token(n);
        if (token->non_terminal_flag) {
          xws(token);
          LOGV(token);
        }
      }
    }
  }
  rws();
}

static void build_noise_token(void) {
  LOGSECTION("build_noise_token");
  LOGV(disregardList.size());
  if (disregardList.size() == 1) {
    disregard_token = vp_6(disregardList[0]);
    Token token = disregardList[0];
    token->disregard = 1;
  }
  else {
    int n = disregardList.size();;
    //int *lb = disregard_list;
    iws();
    int i;
    for (i = 0; i < n; i++) {
      ruleElementStack
          .push(AgStack<RuleElement>())
          .top()
          .push(RuleElement(disregardList[i],0));
      aws(vp_form3(0));
    }
    disregard_token = vp_4();
  }
  extern Token vpRepeatToken;
  Token disregard = Token(disregard_token);
  disregard->disregard = 1;
  vpRepeatToken->disregard = 1;
  LOGV(disregard);
  LOGV((int) vpRepeatToken);
}

static int in_disregard_list(int tn) {
  LOGSECTION("in_disregard_list");
  int n = disregardList.size();
  while (n--) {
    if (tn == disregardList[n]) {
      return 1;
    }
  }
  return 0;
}

static void subs_bnf(int tn, int nt) {
  int *p = bnf_table->sb;
  int n = bnf_table->nt;
  for (; n--; p += 2) {
    if (*p != tn) {
      continue;
    }
    *p = nt;
    Rule rule(p[1]);
    if ((int)rule->prim_tkn == tn) {
      rule->prim_tkn = nt;
    }
  }
}

static int alias(Token token) {
  LOGSECTION("alias");

  Token pureToken = Token::create();
  LOGV(token) LCV(pureToken);
  map_token_number[pureToken] = map_token_number[token];
  LOGV(token) LCV(token->value_type) LCV(token->immediate_action);
  LOGV(pureToken) LCV(pureToken->value_type) LCV(token->immediate_action);
  if (token->key) {
    Keyword keyword =token->key;
    keyword->token_number = pureToken;
  }
  pureToken->pure = 1;
  LOGV(token->non_terminal_flag) LCV(token->token_set_id);
  if (token->non_terminal_flag) {
    LOGS("Substituting") LCV(token) LCV(pureToken);
    subs_bnf(token,pureToken);
  }
  token->junky = 1;
  if (token->token_set_id) {
    LOGV(token->token_set_id);
    pureToken->token_set_id = token->token_set_id;
    token->token_set_id = 0;
    int n = part_dict->nsx;
    while (n--) if (map_part_number[n].token_number == (unsigned) token) {
      map_part_number[n].token_number = pureToken;
      LOGV(n);
      break;
    }
    for (Each<Rule> rule; rule.loopNotFinished(); rule.getNext()) {
      if ((int) rule->prim_tkn == token) {
        rule->prim_tkn = pureToken;
        LOGV(rule);
      }
    }
    for (unsigned i = 0; i < n_chars; i++) {
      if (map_char_number[i].token_number == (unsigned) token) {
        map_char_number[i].token_number = pureToken;
      }
    }
  }
  else if (token->part_number) {
    LOGV(token->part_number);
    pureToken->part_number = token->part_number;
    map_part_number[token->part_number].token_number = pureToken;
    token->part_number = 0;
    for (Each<Rule> rule; rule.loopNotFinished(); rule.getNext()) {
      if ((int) rule->prim_tkn == token) {
        rule->prim_tkn = pureToken;
        LOGV(rule);
      }
    }
    for (unsigned i = 0; i < n_chars; i++) {
      if (map_char_number[i].token_number == (unsigned) token) {
        map_char_number[i].token_number = pureToken;
      }
    }
  }
  Rule rule = makeRule(pureToken, disregard_token);
  at(bnf_table, (int)token, (int)rule);
  token->non_terminal_flag = 1;
  rule->prim_tkn = token;
  ParseTree parseTree = token->parse_tree;
  if (parseTree) {
    parseTree->token_number = pureToken;
  }
  LOGV((int) token) LCV(token->value_type);
  LOGV((int) pureToken) LCV(pureToken->value_type);
  return pureToken;
}

#ifdef NOT_FIX3

static AgStack<int> findRules(Token token) {
  AgStack<int> rules;
  int *p = bnf_table->sb;
  int n = bnf_table->nt;
  for (; n--; p += 2) {
    if (*p != (int) token) continue;
    rules.push(p[1]);
  }
  return rules;
}
#endif

/*
scan rules, and and mark token usage as lexical, non-lexical or both
 then, for each token that has both lexical and non lexical usage,
 make a clone
*/

void set_lexemes(void) {
  int nf = nforms;
  nInputRules = nforms;
  LOGSECTION("set_lexemes");
  LOGV(nforms);

  disregard_token = 0;
  if (disregardList.size() == 0) return;
  LocalArray<int> newTokenNumber(ntkns+1);
#ifdef NOT_FIX3
  int maxTokenNumber = ntkns;
#endif
  memset(newTokenNumber, 0, (ntkns+1)*sizeof(*newTokenNumber));
  Each<Rule> rule;
#ifdef INCLUDE_LOGGING
  for (rule.restart(); (int) rule <= nf; rule.getNext()) {
    LOGV(rule) LCV(rule->lexical);
  }
#endif
  find_lexical_rules();
#ifdef INCLUDE_LOGGING
  for (rule.restart(); (int) rule <= nf; rule.getNext()) {
    LOGV(rule) LCV(rule->lexical);
  }
#endif
  build_noise_token();
//#ifdef FIX3
  // mark lexical tokens
  for (rule.restart(); (int) rule <= nf; rule.getNext()) {
    int n = rule->length();
    LOGV(rule) LCV(rule->lexical);
    if (n == 0 || !rule->lexical) {
      continue;
    }
    while (n--) {
      Token token = rule.token(n);
      token->lexical = 1;
    }
  }
//#endif

  // Scan rules which are _not_ lexical
  for (rule.restart(); (int) rule <= nf; rule.getNext()) {
    int n = rule->length();
    LOGV(rule) LCV(rule->lexical);
    if (n == 0 || rule->lexical) {
      continue;
    }
    LOGSECTION("Scanning non-lexical rule");
    LOGV(rule);
    while (n--) {
      Token token = rule.token(n);
      LOGV(token) LCV(token->token_set_id) LCV(token->non_terminal_flag);
      LOGV(token->lexeme) LCV(token->lexical) LCV(in_disregard_list(token));
      LOGV(token->disregard);
      if (newTokenNumber[token] ||
	  in_disregard_list(token) ||
	  (token->non_terminal_flag && token->token_set_id) ||
	  token->disregard ||
	  (int) token == eof_token ||
	  (int) token == error_token) {
	continue;
      }
      if (token->non_terminal_flag && !token->lexeme) {
	continue;
      }
      // newTokenNumber is the pure token
      newTokenNumber[token] = alias(token);
      LOGV(token) LCV(newTokenNumber[token]);
    }
  }
#ifdef FIX3
  for (rule.restart(); (int) rule <= nf; rule.getNext()) {
    int n = rule->length();
    LOGV(rule) LCV(rule->lexical);
    if (n == 0 || rule->lexical) {
      continue;
    }
    LOGSECTION("Scanning non-lexical rule");
    LOGV(rule);
    while (n--) {
      Token token = rule.token(n);
      LOGV(token) LCV(token->token_set_id) LCV(token->non_terminal_flag);
      LOGV(token->lexeme) LCV(token->lexical) LCV(in_disregard_list(token));
      LOGV(token->disregard);
      if (newTokenNumber[token] ||
	  in_disregard_list(token) ||
	  (token->non_terminal_flag && token->token_set_id) ||
	  token->disregard ||
	  (int) token == eof_token ||
	  (int) token == error_token) {
	continue;
      }
      if (token->non_terminal_flag && !token->lexical) {
	continue;
      }
      // newTokenNumber is the pure token
      newTokenNumber[token] = alias(token);
      LOGV(token) LCV(newTokenNumber[token]);
    }
  }
#endif
#ifdef NOT_FIX3
  for (rule.restart(); (int) rule <= nf; rule.getNext()) {
    int n = rule->length();
    if (n == 0 || rule->lexical) {
      continue;
    }
    while (n-- > 0) {
      Token token = rule.token(n);
      if ((int) token >= maxTokenNumber || newTokenNumber[token]) continue;
      if (newTokenNumber[token] ||
	  in_disregard_list(token) ||
	  (int) token == eof_token ||
	  (token->non_terminal_flag && token->token_set_id) ||
	  (int) token == error_token) {
	continue;
      }
      if (token->non_terminal_flag && !token->lexical) {
	continue;
      }
      AgStack<int> ruleList = findRules(token);
      Token newToken = Token::create();
      map_token_number[newToken] = map_token_number[token];
      subs_bnf(token, newToken);
      newTokenNumber[token] = newToken;
      newToken->pure = 1;
      int i;
      for (i = 0; i < ruleList.size(); i++) {
        Rule oldRule = ruleList[i];
        Rule newRule = Rule::create();
        map_form_number[newRule] = map_form_number[oldRule];
        int k = oldRule->elementList.size();
        newRule->elementList = AgArray<RuleElement>(k);
        while (k--) {
	  newRule->elementList[k] = oldRule->elementList[k];
	}
        newRule->lexical = 0;
        at(bnf_table,(int)token,(int)newRule);
        token->non_terminal_flag = 1;
        newRule->prim_tkn = token;
      }
    }
  }
#endif
  LOGS("alias loop complete");
  for (rule.restart(); rule.loopNotFinished(); rule.getNext()) {
    int n = rule->length();
    LOGV(rule) LCV(rule->lexical);
    if (n == 0) continue;
    if (!rule->lexical)  continue;
    LOGSECTION("Substitution loop");
    while (n-- > 0) {
      Token token = rule.token(n);
      if (newTokenNumber[token] == 0) {
	continue;
      }
      rule.token(n) = newTokenNumber[token];
      LOGV(token) LCV(newTokenNumber[token]);
    }
  }
  LOGS("Rule loop complete");
  nforms_base = nforms;
}