view anagram/agcore/cs.cpp @ 0:13d2b8934445

Import AnaGram (near-)release tree into Mercurial.
author David A. Holland
date Sat, 22 Dec 2007 17:52:45 -0500
parents
children
line wrap: on
line source

/*
 * AnaGram, A System for Syntax Directed Programming
 * Copyright 1993-2002 Parsifal Software. All Rights Reserved.
 * See the file COPYING for license and usage terms.
 *
 * cs.cpp
 */

#include "arrays.h"
#include "assert.h"
#include "bpu.h"
#include "config.h"
#include "cs.h"
#include "csexp.h"
#include "dict.h"
#include "error.h"
#include "myalloc.h"
#include "q1glbl.h"
#include "rpk.h"
#include "rule.h"
#include "stacks.h"
#include "symbol.h"
#include "token.h"
#include "tree.h"
#include "tsd.h"
#include "ut.h"

//#define INCLUDE_LOGGING
#include "log.h"


static int char_set_error;

static void build_partition_sets(list_dict *d);
static void partition_set(int ts, list_dict *d, int *ps, char *fa);


void set_universe(void) {
  //unsigned tn;

  LOGSECTION("set_universe");
  LOGV(ntkns);
  LOGV(nforms);
  //min_char_number = 0;
  max_char_number = -1;
/*
  for (int ptIndex = 0; ptIndex++ < ntrees;) {
    map_parse_tree[ptIndex].expression->checkRange();
  }
*/
  for (ParseTree pt; pt.next(); ) {
    pt->expression->checkRange();
  }
  CharSetExpression::checkMinimum(min_char_number);
  if (max_char_number > 0x10000) {
    max_char_number = 0x10000;
  }
  LOGV(min_char_number) LCV(max_char_number);

  //for (tn = 0; tn++ < ntkns; )
  for (Each<Token> token; token.loopNotFinished(); token.getNext()) {
    //token_number_map *tp = &map_token_number[tn];
    //Token token(tn);
    //int pt = tp->parse_tree;
    ParseTree tree = token->parse_tree;
    //parse_tree_map *ptp;
    //int alias, name;

    //if (pt) continue;
    if (tree.isNotNull()) {
      continue;
    }
    Symbol alias = token->token_name;
    if (alias.isNull()) {
      continue;
    }
    //pt = map_token_name[alias].parse_tree;
    tree = alias->parse_tree;
    if (tree.isNull()) {
      continue;
    }
    //ptp = &map_parse_tree[pt];

    CharSetExpression *exp = tree->expression;
    if (exp && exp->nameNode()) {
      Symbol name = ((NamedCharSet *)exp)->name;
      Token referencedToken = name->token_number;
      if (referencedToken.isNull()) {
	continue;
      }
      shell_production(token, referencedToken);
      tree->token_number = token;
    }
  }

  // if (error_token == 0) {
  //   error_token = find_token_number("error");
  // }
  if (grammar_token == 0) {
    grammar_token = find_token_number("grammar");
  }
  if (eof_token == 0) {
    eof_token = find_token_number("eof");
    if (eof_token == 0) {
      //int id = identify_string("eof", tkn_dict);
      Symbol id("eof");
      if (id && (auto_resynch || error_token)) {
	eof_token = id_token(id);
      }
    }
  }
  nforms_base = nforms;
  if (min_char_number > max_char_number) {
    return;
  }
  if (max_char_number < 255) {
    max_char_number = 255;
  }
  n_chars = max_char_number - min_char_number + 1;
  check_size(map_char_number, n_chars, n_chars);
  LOGV(ntkns);
  LOGV(nforms);
}

static unsigned ntc_size;

void build_sets(void) {
  int k;
  unsigned i;
  int id;
  char ts[100];
  int *ntc;
  unsigned ntk = ntkns;

  if (charRangeDiagnostic || min_char_number > max_char_number) {
    return;
  }

  LOGSECTION("build_sets");
  LOGV(ntkns) LCV(nforms) LCV(error_token);
  /*
   * count the number of named tokens that point to a given parse tree.
   * If there is only one such named token, it can be assigned to the
   * parse tree. Otherwise, the parse tree needs its own token, and
   * the named tokens have to have shells created.
   */
  //ntc_size = ntrees+1;
  ntc_size = ParseTree::count();
  ZALLOCATE_ST(ntc, ntc_size);
  LOGV(ntc_size);
  ParseTree tree;
  LOGS("Scanning trees");
  for (tree = ParseTree(); tree.next(); ) {
    int name = tree->token_name;
    if (name && tree->expression->char_node()) {
      int value = ((IndividualCode *)tree->expression)->value;
      assert(value >= min_char_number);
      LOGV(value) LCV(min_char_number) LCV(name);
      map_char_number[value - min_char_number].token_name = name;
    }
  }
  LOGS("finished tree loop");
  for (i = 0; i++ < ntkns; ) {
    token_number_map *tp = &map_token_number[i];
    int name = tp->token_name;
    unsigned pt;

    if (tp->non_terminal_flag) {
      continue;
    }
    if (name == 0) {
      continue;
    }
    pt = map_token_name[name].parse_tree;
    LOGV(pt) LCV(ParseTree(pt)->token_number);
    LOGV((int) map_parse_tree[pt].expression);
    if (map_parse_tree[pt].token_number) {
      continue;
    }
    assert(pt < ntc_size);
    ntc[pt]++;
  }
  LOGS("finished token loop");

  /*
   * Now check those trees that have more than one named token. Create
   * a specific token for the tree.
   */

  for (tree = ParseTree(); tree.next();) {
    int tn;

    if (ntc[tree] <= 1) {
      ntc[tree] = 0;
      continue;
    }
    tn = tree->token_number;
    if (tn == 0) {
      if (tree->expression->nameNode()) {
        int name = ((NamedCharSet *)tree->expression)->name;
        //LOGV(netAllocations());
        tn = id_token(name);
        Token(tn)->value_type = default_input_type;
      }
      else {
	tn = form_element_1(tree->expression);
      }
    }
    ntc[tree] = tn;
  }
  LOGS("trees scanned again");

  /*
   * Now check the named terminals again and either make up the
   * tree-token links or build shell productions
   */

  for (i = 0; i++ < ntk;) {
    token_number_map *tp = &map_token_number[i];
    int name = tp->token_name;
    int pt;
    int tn;
    parse_tree_map *ptp;

    if (tp->non_terminal_flag) {
      continue;
    }
    if (name == 0) {
      continue;
    }
    pt = map_token_name[name].parse_tree;
    if (pt == 0) {
      continue;
    }
    ptp = &map_parse_tree[pt];
    tn = ptp->token_number;
    if (tn == 0) {
      tn = ntc[pt];
    }
    if (tn == 0) {
      ptp->token_number = i;
      continue;
    }
    shell_production(i, tn);
  }
  DEALLOCATE(ntc);
  LOGS("tokens scanned again");
  nforms_base = nforms;

  LOGV(ntkns) LCV(nforms);
  LOGV(error_token);
  for (i = 0; i++ < ntkns;) {
    token_number_map *tp = &map_token_number[i];
    int nn;
    if (tp->non_terminal_flag ||
        (int) i == error_token ||
        tp->key ||
        tp->token_set_id) {
      continue;
    }
    nn = tp->parse_tree;
    LOGV(i) LCV(nn);
    if (nn == 0 && (k = tp->token_name) != 0) {
        nn = map_token_name[k].parse_tree;
    }
    LOGV(nn);
    if (nn == 0) {
      char_set_error = 5;
    }
    else {
      tp->token_set_id = build_char_set(nn);
    }
    if (char_set_error == 0) {
      continue;
    }

    id = tp->token_name;
    if (id) {
      sprintf(ts," = %s", Symbol(id)->string.pointer());
    }
    else {
      ts[0] = 0;
    }
    switch (char_set_error) {
    case 1:
      ssprintf("Error defining T%03d: ",i);
      break;
    case 5:
      ssprintf("Undefined token T%03d: ",i);
      break;
    }
    atkn(i);
    log_error();
  }
  build_partition_sets(char_set_dict);
  LOGV(ntkns);
  LOGV(nforms);
}

static void build_partition_sets(list_dict *d) {
  LOGSECTION("build_partition_sets");
  int i;
  unsigned j, n;
  int *db;
  unsigned *ndx;
  int *lb;
  int m;
  //int *ps;
  int ts;
  unsigned ntk;
  //char *fa;
  unsigned *fl;
  int nf;
  unsigned cn;
  int *list;
  unsigned pn,nc;
  unsigned tn;

  if (max_char_number == -1 && min_char_number == 0) {
    return;
  }
  //ps = local_array(n_chars, int);
  LocalArray<int> ps(n_chars);
  n = d->nsx;
  db = d->text;
  ndx = d->ndx;
  //set = local_array(n, int *);
  LocalArray<int *> set(n);
  //k = local_array(n, int);
  LocalArray<int> k(n);
  //l = local_array(n, int);
  LocalArray<int> l(n);
  for (i = 1; (unsigned) i < n; i++) {
    set[i] = (db + ndx[i]);
    l[i] = *(set[i]++) - 1;
    k[i] = 0;
  }

  /*
   * For each character in the character range, scan all the character
   * sets. Make a list of the sets to which this particular character
   * belongs. This list represents a unique partition of the character
   * range. Use a dictionary to give each unique partition an
   * identifying number. Map each character to the partition to which
   * it belongs.
   */

  LOGS("Scanning characters");
  for (i = min_char_number; i <= max_char_number; i++) {
    int ix = i - min_char_number;
    iws();
    for (j = 1; j < n; j++) {
      while (k[j] < l[j] && set[j][k[j]] < i) {
	k[j]++;
      }
      if (k[j] == l[j] || set[j][k[j]] > i) {
	continue;
      }
      k[j]++;
      aws(j);
    }
    lb = list_base;
    m = rws();
    map_char_number[ix].part_number = ps[ix] = add_list_dict(lb,m,part_dict);
    LOGV(ix) LCV(ps[ix]);
  }

  /*
   * Scan each partition. For each partition, make a list of all the
   * characters that belong to this partition.
   */

  LOGS("Scanning partitions");
  m = part_dict->nsx;
  check_size(map_part_number, m, m);
  for (i = 0; (unsigned) i < part_dict->nsx; i++) {
    iws();
    for (j = 0; j < (unsigned) n_chars; j++) {
      if ((unsigned) i == map_char_number[j].part_number) {
	aws(j + min_char_number);
      }
    }
    map_part_number[i].chars_index = store_list(chars_list);
    fl = lstptr(map_part_number[i],chars);
    m = map_part_number[i].size = rws();
#ifdef INCLUDE_LOGGING
    if ((unsigned)(*fl - min_char_number) > 
	(max_char_number - min_char_number)) {
      LOGV(*fl) LCV(min_char_number) LCV(max_char_number);
    }
#endif
    assert((unsigned)(*fl - min_char_number) <= 
	   (unsigned) (max_char_number - min_char_number));
    if (m == 1 
	&& (j = map_char_number[*fl - min_char_number].token_number) != 0) {
      map_part_number[i].token_number = j;
    }
    LOGV(*fl) LCV(min_char_number) LCV(i) LCV(j);
  }

  m = part_dict->nsx;
  //fa = local_array(m, char);
  LocalArray<char> fa(m);
  m = char_set_dict->nsx;
  check_size(map_char_set,m,m);
  for (i = 1; i<m; i++) {
    char_set_map *csp = &map_char_set[i];
    partition_set(i, char_set_dict, ps, fa);
    csp->part_index = store_list(part_list);
    csp->nparts = rws();
  }


  ntk = ntkns+1;
  //assert(ok_index(map_token_number, ntkns));
  m = part_dict->nsx;

  LOGS("Scanning tokens");
  for (i = 1; (unsigned) i < ntk; i++) {
    part_number_map *ppn;
    unsigned tn;
    //int ctn = i;
    Token ctn(i);
    unsigned fn;
    ts = map_token_number[i].token_set_id;
    if (ts == 0) {
      continue;
    }
    nf = map_char_set[ts].nparts;
    if (nf != 1) {
      continue;
    }
    fl = lstptr(map_char_set[ts], part);
    ppn = &map_part_number[pn = *fl];
    assert(ok_index(map_part_number, pn));
    list = (int *) lstptr(*ppn, chars);
    //assert (pn > 0 && pn < m);
    assert(pn < (unsigned) m);
    nc = ppn->size;
    tn = ppn->token_number;
    if (tn == 0) {
      while (nc--) {
        cn = *list++ - min_char_number;
        assert(cn < n_chars);
        map_char_number[cn].token_number = i;
        LOGV(cn) LCV(i);
      }
      assert(pn < part_dict->nsx);
      ppn->token_number = i;
      map_token_number[i].part_number = pn;
      continue;
    }
    if (tn < ntk) {
      //ntkns++;
      //check_size(map_token_number, ntkns, ntkns);
      //ctn = ntkns;
      ctn = Token::create();
      ctn->value_type = default_input_type;
      //pf1x(ctn);
      //fn = form2();
      fn = makeRule(ctn);
      at(bnf_table, tn, fn);
      Rule(fn)->prim_tkn = tn;
      ppn->token_number = ctn;
      Token(tn)->operatorCandidate = ctn->operatorCandidate = 1;
      while (nc--) {
        cn = *list++ - min_char_number;
        assert(cn < n_chars);
        map_char_number[cn].token_number = ctn;
        LOGV(cn) LCV(ctn);
      }
      map_token_number[ctn].part_number = pn;
      map_token_number[ctn].value_type = default_input_type;
      map_token_number[tn].value_type = default_input_type;
      map_token_number[tn].part_number = 0;
      map_token_number[tn].non_terminal_flag = 1;
      tn = ctn;
    }
    //pf1x(tn);
    //fn = form2();
    fn = makeRule(tn);
    LOGV(i) LCV(fn) LCV(tn);
    at(bnf_table, i, fn);
    Rule(fn)->prim_tkn = i;
    map_token_number[i].non_terminal_flag = 1;
    ctn = tn;
  }
  for (i = 1; (unsigned) i < ntk; i++) {
    token_number_map *mti = &map_token_number[i];
    ts = mti->token_set_id;
    if (ts == 0) {
      continue;
    }
    nf = map_char_set[ts].nparts;
    if (nf <= 1) {
      continue;
    }
    fl = lstptr(map_char_set[ts], part);
    iws();
    j = 0;
    while (j < (unsigned) nf) {

      pn = fl[j];
      assert (pn > 0 && pn < (unsigned) m);
      tn = map_part_number[pn].token_number;
      if (tn == 0) {
        //ntkns++;
        //check_size(map_token_number,ntkns,ntkns);
	list = (int *) lstptr(map_part_number[pn], chars);
        nc = map_part_number[pn].size;
        tn = Token::create();
        while (nc--) {
          cn = *list++ - min_char_number;
          assert(cn < n_chars);
          map_char_number[cn].token_number = tn;
          LOGV(cn) LCV(tn);
        }
        //tn = ntkns;
        assert(pn < part_dict->nsx);
        map_part_number[pn].token_number = tn;
        map_token_number[tn].part_number = pn;
        map_token_number[tn].value_type = default_input_type;
	map_token_number[tn].operatorCandidate = 1;
      }
      //pf1x(tn);
      //aws(form2());
      aws(makeRule(tn));
      j++;
    }
    fl = (unsigned *) list_base;
    nf = rws();
    map_token_number[i].non_terminal_flag = 1;
    for (j = 0; j < (unsigned) nf; j++) {
      LOGV(i) LCV(fl[j]);
      at(bnf_table, i, fl[j]);
      //assert(ok_index(map_form_number, fl[j]));
      Rule rule(fl[j]);
      rule->prim_tkn = i;
    }
  }
}

static void partition_set(int ts, list_dict *d, int *ps, char *fa) {
  int *list;
  int n, np, i;

  np = part_dict->nsx;
  memset(fa, 0, np);
  list = d->text + d->ndx[ts];
  n = *(list++) - 1;
  for (i = 0; i<n; i++) {
    fa[ps[list[i] - min_char_number]] = 1;
  }
  iws();
  for (i = 0; i < np; i++) {
    if (fa[i]) {
      aws(i);
    }
  }
}

int build_char_set(int nn) {
  LOGSECTION("build_char_set");
  //char *v;
  //int i, k;
  int cs = map_parse_tree[nn].char_set;
  LOGV(cs) LCV(nn);

  if (cs) {
    return cs;
  }
  char_set_error = 0;
  LOGV(n_chars);

  //int *list = local_array(n_chars, int);
  LocalArray<int> list(n_chars);
  CharBitmap bitmap = map_parse_tree[nn].expression->bitmap();
  if (!case_sensitive) {
    bitmap.blurCase();
  }
  int count = bitmap.list(list);
  LOGV(count);
  cs = add_list_dict(list, count, char_set_dict);
  LOGV(cs);
  check_size(map_char_set, cs, cs);
  map_char_set[cs].parse_tree = nn;
  map_parse_tree[nn].char_set = cs;
  return cs;
}

static void log_undef_sym(int tn) {
  ssprintf("Undefined symbol: %s", Symbol(tn)->string.pointer());
  log_error();
}

int getParseTree(int name) {
  LOGSECTION("getParseTree");
  int pt = map_token_name[name].parse_tree;
  LOGV(pt);
  if (pt) {
    return pt;
  }

  int tn = map_token_name[name].token_number;
  if (tn) {
    pt = map_token_number[tn].parse_tree;
  }
  LOGV(tn) LCV(pt);
  if (pt) {
    return pt;
  }

  log_undef_sym(name);
  return 0;
}