view anagram/agcore/lexeme.cpp @ 21:1c9dac05d040

Add lint-style FALLTHROUGH annotations to fallthrough cases (in the parse engine, and thus in the output code). Document this, because the old output causes warnings with gcc 10.
author David A. Holland
date Mon, 13 Jun 2022 00:04:38 -0400
parents 13d2b8934445
children

/*
 * AnaGram, A System for Syntax Directed Programming
 * Copyright 1993-1999 Parsifal Software. All Rights Reserved.
 * See the file COPYING for license and usage terms.
 *
 * lexeme.cpp - lexeme analysis
 */

#include "arrays.h"
#include "config.h"
#include "data.h"
#include "dict.h"
#include "keyword.h"
#include "lexeme.h"
#include "q1glbl.h"
#include "q5.h"
#include "rpk.h"
#include "rule.h"
#include "stacks.h"
#include "token.h"
#include "tree.h"
#include "tsd.h"

//#define INCLUDE_LOGGING
#include "log.h"


#define FIX3

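// disregardList collects the token numbers of the tokens marked as
// disregard (noise) tokens; disregard_token is the single noise token
// synthesized from them by build_noise_token(), or 0 if there are none.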
AgStack<int> disregardList;
unsigned disregard_token;


// Find and mark the lexical rules.
static void find_lexical_rules(void) {
  LOGSECTION("find_lexical_rules");
  unsigned ku;
  int k;
  iws();
  LOGS("disregard list tokens");
  // First, all the disregard tokens
  for (ku = disregardList.size(); ku--;) {
    //xws(disregard_list[ku]);
    xws(disregardList[ku]);
    //Token(disregardList[ku])->disregard = 1;
    LOGV(disregardList[ku]);
  }
  // Then all the lexemes
  LOGS("lexemes");
  for (ku = 1; ku <= ntkns; ku++) if (map_token_number[ku].lexeme) {
    xws(ku);
    LOGV(ku);
  }
  // Rules produced by disregard tokens and lexemes are lexical rules.
  // Any rule produced by a token found in a lexical rule is also a
  // lexical rule. This loop, in other words, implements a closure.
  LOGS("lexical rules");
  for (k = 0; k < tis(); k++) {
    int tn = list_base[k];
    int *bnf = bnf_table->sb;
    int nbnf = bnf_table->nt;
    while (nbnf--) {
      int t = *bnf++, f = *bnf++, n;

      if (t != tn) {
	continue;
      }
      Rule rule(f);
      n = rule->length();
      rule->lexical = 1;
      LOGV(rule) LCV(rule->lexical);
      while (n--) {
        Token token = rule.token(n);
        if (token->non_terminal_flag) {
          xws(token);
          LOGV(token);
        }
      }
    }
  }
  rws();
}

static void build_noise_token(void) {
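  // Build the single noise token used for disregard processing. With
  // exactly one disregard token it is obtained directly from vp_6();
  // otherwise a virtual production covering the whole list is assembled
  // and vp_4() supplies the token. The resulting token, and the repeat
  // token, are flagged as disregard.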
  LOGSECTION("build_noise_token");
  LOGV(disregardList.size());
  if (disregardList.size() == 1) {
    disregard_token = vp_6(disregardList[0]);
    Token token = disregardList[0];
    token->disregard = 1;
  }
  else {
    int n = disregardList.size();
    //int *lb = disregard_list;
    iws();
    int i;
    for (i = 0; i < n; i++) {
      ruleElementStack
          .push(AgStack<RuleElement>())
          .top()
          .push(RuleElement(disregardList[i],0));
      aws(vp_form3(0));
    }
    disregard_token = vp_4();
  }
  extern Token vpRepeatToken;
  Token disregard = Token(disregard_token);
  disregard->disregard = 1;
  vpRepeatToken->disregard = 1;
  LOGV(disregard);
  LOGV((int) vpRepeatToken);
}

static int in_disregard_list(int tn) {
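  // Return nonzero if token number tn appears in disregardList.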
  LOGSECTION("in_disregard_list");
  int n = disregardList.size();
  while (n--) {
    if (tn == disregardList[n]) {
      return 1;
    }
  }
  return 0;
}

static void subs_bnf(int tn, int nt) {
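  // Wherever token tn appears as the producing token in the bnf table,
  // substitute token nt, and update the affected rule's primary token
  // to match.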
  int *p = bnf_table->sb;
  int n = bnf_table->nt;
  for (; n--; p += 2) {
    if (*p != tn) {
      continue;
    }
    *p = nt;
    Rule rule(p[1]);
    if ((int)rule->prim_tkn == tn) {
      rule->prim_tkn = nt;
    }
  }
}

static int alias(Token token) {
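  // Create a "pure" clone of the given token and transfer its definition
  // (bnf entries, token set or partition data, keyword binding, parse
  // tree reference) to the clone. The original token is then redefined
  // by a single rule, pureToken followed by the disregard token, and the
  // clone's token number is returned.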
  LOGSECTION("alias");

  Token pureToken = Token::create();
  LOGV(token) LCV(pureToken);
  map_token_number[pureToken] = map_token_number[token];
  LOGV(token) LCV(token->value_type) LCV(token->immediate_action);
  LOGV(pureToken) LCV(pureToken->value_type) LCV(token->immediate_action);
  if (token->key) {
    Keyword keyword = token->key;
    keyword->token_number = pureToken;
  }
  pureToken->pure = 1;
  LOGV(token->non_terminal_flag) LCV(token->token_set_id);
  if (token->non_terminal_flag) {
    LOGS("Substituting") LCV(token) LCV(pureToken);
    subs_bnf(token,pureToken);
  }
  token->junky = 1;
  if (token->token_set_id) {
    LOGV(token->token_set_id);
    pureToken->token_set_id = token->token_set_id;
    token->token_set_id = 0;
    int n = part_dict->nsx;
    while (n--) if (map_part_number[n].token_number == (unsigned) token) {
      map_part_number[n].token_number = pureToken;
      LOGV(n);
      break;
    }
    for (Each<Rule> rule; rule.loopNotFinished(); rule.getNext()) {
      if ((int) rule->prim_tkn == token) {
        rule->prim_tkn = pureToken;
        LOGV(rule);
      }
    }
    for (unsigned i = 0; i < n_chars; i++) {
      if (map_char_number[i].token_number == (unsigned) token) {
        map_char_number[i].token_number = pureToken;
      }
    }
  }
  else if (token->part_number) {
    LOGV(token->part_number);
    pureToken->part_number = token->part_number;
    map_part_number[token->part_number].token_number = pureToken;
    token->part_number = 0;
    for (Each<Rule> rule; rule.loopNotFinished(); rule.getNext()) {
      if ((int) rule->prim_tkn == token) {
        rule->prim_tkn = pureToken;
        LOGV(rule);
      }
    }
    for (unsigned i = 0; i < n_chars; i++) {
      if (map_char_number[i].token_number == (unsigned) token) {
        map_char_number[i].token_number = pureToken;
      }
    }
  }
  Rule rule = makeRule(pureToken, disregard_token);
  at(bnf_table, (int)token, (int)rule);
  token->non_terminal_flag = 1;
  rule->prim_tkn = token;
  ParseTree parseTree = token->parse_tree;
  if (parseTree) {
    parseTree->token_number = pureToken;
  }
  LOGV((int) token) LCV(token->value_type);
  LOGV((int) pureToken) LCV(pureToken->value_type);
  return pureToken;
}

#ifdef NOT_FIX3
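// Helper for the superseded (pre-FIX3) path below: collect the rules
// produced by the given token.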

static AgStack<int> findRules(Token token) {
  AgStack<int> rules;
  int *p = bnf_table->sb;
  int n = bnf_table->nt;
  for (; n--; p += 2) {
    if (*p != (int) token) continue;
    rules.push(p[1]);
  }
  return rules;
}
#endif

/*
Scan the rules and mark token usage as lexical, non-lexical, or both;
then, for each token that has both lexical and non-lexical usage,
make a clone.
*/

void set_lexemes(void) {
  int nf = nforms;
  nInputRules = nforms;
  LOGSECTION("set_lexemes");
  LOGV(nforms);

  disregard_token = 0;
  if (disregardList.size() == 0) return;
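  // newTokenNumber[t] records the pure alias created for token t, or 0
  // if token t has not been aliased.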
  LocalArray<int> newTokenNumber(ntkns+1);
#ifdef NOT_FIX3
  int maxTokenNumber = ntkns;
#endif
  memset(newTokenNumber, 0, (ntkns+1)*sizeof(*newTokenNumber));
  Each<Rule> rule;
#ifdef INCLUDE_LOGGING
  for (rule.restart(); (int) rule <= nf; rule.getNext()) {
    LOGV(rule) LCV(rule->lexical);
  }
#endif
  find_lexical_rules();
#ifdef INCLUDE_LOGGING
  for (rule.restart(); (int) rule <= nf; rule.getNext()) {
    LOGV(rule) LCV(rule->lexical);
  }
#endif
  build_noise_token();
//#ifdef FIX3
  // mark lexical tokens
  for (rule.restart(); (int) rule <= nf; rule.getNext()) {
    int n = rule->length();
    LOGV(rule) LCV(rule->lexical);
    if (n == 0 || !rule->lexical) {
      continue;
    }
    while (n--) {
      Token token = rule.token(n);
      token->lexical = 1;
    }
  }
//#endif

  // Scan rules which are _not_ lexical
  for (rule.restart(); (int) rule <= nf; rule.getNext()) {
    int n = rule->length();
    LOGV(rule) LCV(rule->lexical);
    if (n == 0 || rule->lexical) {
      continue;
    }
    LOGSECTION("Scanning non-lexical rule");
    LOGV(rule);
    while (n--) {
      Token token = rule.token(n);
      LOGV(token) LCV(token->token_set_id) LCV(token->non_terminal_flag);
      LOGV(token->lexeme) LCV(token->lexical) LCV(in_disregard_list(token));
      LOGV(token->disregard);
      if (newTokenNumber[token] ||
	  in_disregard_list(token) ||
	  (token->non_terminal_flag && token->token_set_id) ||
	  token->disregard ||
	  (int) token == eof_token ||
	  (int) token == error_token) {
	continue;
      }
      if (token->non_terminal_flag && !token->lexeme) {
	continue;
      }
      // newTokenNumber is the pure token
      newTokenNumber[token] = alias(token);
      LOGV(token) LCV(newTokenNumber[token]);
    }
  }
#ifdef FIX3
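  // FIX3: a second pass over the non-lexical rules, identical to the
  // loop above except that a nonterminal token is aliased when it is
  // marked lexical (used in a lexical rule) rather than when it is
  // marked as a lexeme.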
  for (rule.restart(); (int) rule <= nf; rule.getNext()) {
    int n = rule->length();
    LOGV(rule) LCV(rule->lexical);
    if (n == 0 || rule->lexical) {
      continue;
    }
    LOGSECTION("Scanning non-lexical rule");
    LOGV(rule);
    while (n--) {
      Token token = rule.token(n);
      LOGV(token) LCV(token->token_set_id) LCV(token->non_terminal_flag);
      LOGV(token->lexeme) LCV(token->lexical) LCV(in_disregard_list(token));
      LOGV(token->disregard);
      if (newTokenNumber[token] ||
	  in_disregard_list(token) ||
	  (token->non_terminal_flag && token->token_set_id) ||
	  token->disregard ||
	  (int) token == eof_token ||
	  (int) token == error_token) {
	continue;
      }
      if (token->non_terminal_flag && !token->lexical) {
	continue;
      }
      // newTokenNumber is the pure token
      newTokenNumber[token] = alias(token);
      LOGV(token) LCV(newTokenNumber[token]);
    }
  }
#endif
#ifdef NOT_FIX3
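  // Superseded (pre-FIX3) aliasing scheme: create a new token, move the
  // existing productions onto it, and give the original token fresh
  // non-lexical copies of those rules.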
  for (rule.restart(); (int) rule <= nf; rule.getNext()) {
    int n = rule->length();
    if (n == 0 || rule->lexical) {
      continue;
    }
    while (n-- > 0) {
      Token token = rule.token(n);
      if ((int) token >= maxTokenNumber || newTokenNumber[token]) continue;
      if (newTokenNumber[token] ||
	  in_disregard_list(token) ||
	  (int) token == eof_token ||
	  (token->non_terminal_flag && token->token_set_id) ||
	  (int) token == error_token) {
	continue;
      }
      if (token->non_terminal_flag && !token->lexical) {
	continue;
      }
      AgStack<int> ruleList = findRules(token);
      Token newToken = Token::create();
      map_token_number[newToken] = map_token_number[token];
      subs_bnf(token, newToken);
      newTokenNumber[token] = newToken;
      newToken->pure = 1;
      int i;
      for (i = 0; i < ruleList.size(); i++) {
        Rule oldRule = ruleList[i];
        Rule newRule = Rule::create();
        map_form_number[newRule] = map_form_number[oldRule];
        int k = oldRule->elementList.size();
        newRule->elementList = AgArray<RuleElement>(k);
        while (k--) {
	  newRule->elementList[k] = oldRule->elementList[k];
	}
        newRule->lexical = 0;
        at(bnf_table,(int)token,(int)newRule);
        token->non_terminal_flag = 1;
        newRule->prim_tkn = token;
      }
    }
  }
#endif
  LOGS("alias loop complete");
  for (rule.restart(); rule.loopNotFinished(); rule.getNext()) {
    int n = rule->length();
    LOGV(rule) LCV(rule->lexical);
    if (n == 0) continue;
    if (!rule->lexical)  continue;
    LOGSECTION("Substitution loop");
    while (n-- > 0) {
      Token token = rule.token(n);
      if (newTokenNumber[token] == 0) {
	continue;
      }
      rule.token(n) = newTokenNumber[token];
      LOGV(token) LCV(newTokenNumber[token]);
    }
  }
  LOGS("Rule loop complete");
  nforms_base = nforms;
}