diff anagram/agcore/cs.cpp @ 0:13d2b8934445

Import AnaGram (near-)release tree into Mercurial.
author David A. Holland
date Sat, 22 Dec 2007 17:52:45 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/anagram/agcore/cs.cpp	Sat Dec 22 17:52:45 2007 -0500
@@ -0,0 +1,615 @@
+/*
+ * AnaGram, A System for Syntax Directed Programming
+ * Copyright 1993-2002 Parsifal Software. All Rights Reserved.
+ * See the file COPYING for license and usage terms.
+ *
+ * cs.cpp
+ */
+
+#include "arrays.h"
+#include "assert.h"
+#include "bpu.h"
+#include "config.h"
+#include "cs.h"
+#include "csexp.h"
+#include "dict.h"
+#include "error.h"
+#include "myalloc.h"
+#include "q1glbl.h"
+#include "rpk.h"
+#include "rule.h"
+#include "stacks.h"
+#include "symbol.h"
+#include "token.h"
+#include "tree.h"
+#include "tsd.h"
+#include "ut.h"
+
+//#define INCLUDE_LOGGING
+#include "log.h"
+
+
+static int char_set_error;
+
+static void build_partition_sets(list_dict *d);
+static void partition_set(int ts, list_dict *d, int *ps, char *fa);
+
+
+/*
+ * set_universe: establish the range of character codes used by the
+ * grammar and link alias tokens to named character sets.
+ *
+ * Walks every parse tree (checkRange) so the trees can widen the
+ * min_char_number/max_char_number bounds, clamps the maximum to
+ * 0x10000, resolves tokens whose only definition is an alias naming
+ * another character set (emitting a shell production), locates the
+ * distinguished "grammar" and "eof" tokens, and finally sizes the
+ * map_char_number table to n_chars entries.
+ */
+void set_universe(void) {
+  //unsigned tn;
+
+  LOGSECTION("set_universe");
+  LOGV(ntkns);
+  LOGV(nforms);
+  //min_char_number = 0;
+  max_char_number = -1;
+/*
+  for (int ptIndex = 0; ptIndex++ < ntrees;) {
+    map_parse_tree[ptIndex].expression->checkRange();
+  }
+*/
+  // Each expression's checkRange() updates the global character bounds.
+  for (ParseTree pt; pt.next(); ) {
+    pt->expression->checkRange();
+  }
+  CharSetExpression::checkMinimum(min_char_number);
+  // Cap the universe: code points past 0x10000 are not tracked.
+  if (max_char_number > 0x10000) {
+    max_char_number = 0x10000;
+  }
+  LOGV(min_char_number) LCV(max_char_number);
+
+  //for (tn = 0; tn++ < ntkns; )
+  for (Each<Token> token; token.loopNotFinished(); token.getNext()) {
+    //token_number_map *tp = &map_token_number[tn];
+    //Token token(tn);
+    //int pt = tp->parse_tree;
+    ParseTree tree = token->parse_tree;
+    //parse_tree_map *ptp;
+    //int alias, name;
+
+    //if (pt) continue;
+    // Only tokens with no parse tree of their own, but with a named
+    // alias whose tree exists, are of interest below.
+    if (tree.isNotNull()) {
+      continue;
+    }
+    Symbol alias = token->token_name;
+    if (alias.isNull()) {
+      continue;
+    }
+    //pt = map_token_name[alias].parse_tree;
+    tree = alias->parse_tree;
+    if (tree.isNull()) {
+      continue;
+    }
+    //ptp = &map_parse_tree[pt];
+
+    CharSetExpression *exp = tree->expression;
+    if (exp && exp->nameNode()) {
+      // The alias's tree is itself a reference to another named set.
+      // If that set already owns a token, produce a shell production
+      // (token -> referencedToken) and claim the tree for this token.
+      Symbol name = ((NamedCharSet *)exp)->name;
+      Token referencedToken = name->token_number;
+      if (referencedToken.isNull()) {
+	continue;
+      }
+      shell_production(token, referencedToken);
+      tree->token_number = token;
+    }
+  }
+
+  // if (error_token == 0) {
+  //   error_token = find_token_number("error");
+  // }
+  if (grammar_token == 0) {
+    grammar_token = find_token_number("grammar");
+  }
+  if (eof_token == 0) {
+    eof_token = find_token_number("eof");
+    if (eof_token == 0) {
+      //int id = identify_string("eof", tkn_dict);
+      Symbol id("eof");
+      // Synthesize an eof token only when error recovery is in play
+      // (auto resynchronization enabled or an error token defined).
+      if (id && (auto_resynch || error_token)) {
+	eof_token = id_token(id);
+      }
+    }
+  }
+  nforms_base = nforms;
+  // Empty universe (no character literals at all): nothing to map.
+  if (min_char_number > max_char_number) {
+    return;
+  }
+  // Always cover at least the full 8-bit range.
+  if (max_char_number < 255) {
+    max_char_number = 255;
+  }
+  n_chars = max_char_number - min_char_number + 1;
+  check_size(map_char_number, n_chars, n_chars);
+  LOGV(ntkns);
+  LOGV(nforms);
+}
+
+// Number of entries allocated for the ntc[] scratch array in build_sets().
+static unsigned ntc_size;
+
+/*
+ * build_sets: associate character-set tokens with parse trees and
+ * build the character-set table.
+ *
+ * Passes:
+ *   1. Record token names for single-character trees in map_char_number.
+ *   2. Count (in ntc[]) the named terminals referring to each tree.
+ *   3. Trees with more than one named token get a token of their own.
+ *   4. Named terminals are linked to their tree's token, or get shell
+ *      productions when the tree's token differs.
+ *   5. Every remaining terminal gets a token_set_id via build_char_set();
+ *      failures are reported through char_set_error / log_error().
+ * Finishes by calling build_partition_sets() on the full dictionary.
+ */
+void build_sets(void) {
+  int k;
+  unsigned i;
+  int id;
+  char ts[100];
+  int *ntc;
+  unsigned ntk = ntkns;
+
+  if (charRangeDiagnostic || min_char_number > max_char_number) {
+    return;
+  }
+
+  LOGSECTION("build_sets");
+  LOGV(ntkns) LCV(nforms) LCV(error_token);
+  /*
+   * count the number of named tokens that point to a given parse tree.
+   * If there is only one such named token, it can be assigned to the
+   * parse tree. Otherwise, the parse tree needs its own token, and
+   * the named tokens have to have shells created.
+   */
+  //ntc_size = ntrees+1;
+  ntc_size = ParseTree::count();
+  ZALLOCATE_ST(ntc, ntc_size);
+  LOGV(ntc_size);
+  ParseTree tree;
+  LOGS("Scanning trees");
+  for (tree = ParseTree(); tree.next(); ) {
+    int name = tree->token_name;
+    if (name && tree->expression->char_node()) {
+      // Single-character tree: remember its name against the character.
+      int value = ((IndividualCode *)tree->expression)->value;
+      assert(value >= min_char_number);
+      LOGV(value) LCV(min_char_number) LCV(name);
+      map_char_number[value - min_char_number].token_name = name;
+    }
+  }
+  LOGS("finished tree loop");
+  for (i = 0; i++ < ntkns; ) {
+    token_number_map *tp = &map_token_number[i];
+    int name = tp->token_name;
+    unsigned pt;
+
+    if (tp->non_terminal_flag) {
+      continue;
+    }
+    if (name == 0) {
+      continue;
+    }
+    pt = map_token_name[name].parse_tree;
+    LOGV(pt) LCV(ParseTree(pt)->token_number);
+    LOGV((int) map_parse_tree[pt].expression);
+    if (map_parse_tree[pt].token_number) {
+      continue;
+    }
+    assert(pt < ntc_size);
+    ntc[pt]++;
+  }
+  LOGS("finished token loop");
+
+  /*
+   * Now check those trees that have more than one named token. Create
+   * a specific token for the tree.
+   */
+
+  for (tree = ParseTree(); tree.next();) {
+    int tn;
+
+    if (ntc[tree] <= 1) {
+      ntc[tree] = 0;
+      continue;
+    }
+    tn = tree->token_number;
+    if (tn == 0) {
+      if (tree->expression->nameNode()) {
+        int name = ((NamedCharSet *)tree->expression)->name;
+        //LOGV(netAllocations());
+        tn = id_token(name);
+        Token(tn)->value_type = default_input_type;
+      }
+      else {
+	tn = form_element_1(tree->expression);
+      }
+    }
+    // ntc[] now holds the token number for shared trees (0 otherwise).
+    ntc[tree] = tn;
+  }
+  LOGS("trees scanned again");
+
+  /*
+   * Now check the named terminals again and either make up the
+   * tree-token links or build shell productions
+   */
+
+  for (i = 0; i++ < ntk;) {
+    token_number_map *tp = &map_token_number[i];
+    int name = tp->token_name;
+    int pt;
+    int tn;
+    parse_tree_map *ptp;
+
+    if (tp->non_terminal_flag) {
+      continue;
+    }
+    if (name == 0) {
+      continue;
+    }
+    pt = map_token_name[name].parse_tree;
+    if (pt == 0) {
+      continue;
+    }
+    ptp = &map_parse_tree[pt];
+    tn = ptp->token_number;
+    if (tn == 0) {
+      tn = ntc[pt];
+    }
+    if (tn == 0) {
+      // Sole named token for this tree: link the tree directly to it.
+      ptp->token_number = i;
+      continue;
+    }
+    // Tree already owned by another token: produce shell i -> tn.
+    shell_production(i, tn);
+  }
+  DEALLOCATE(ntc);
+  LOGS("tokens scanned again");
+  nforms_base = nforms;
+
+  LOGV(ntkns) LCV(nforms);
+  LOGV(error_token);
+  // Final pass: give every plain terminal a character-set id, and
+  // diagnose terminals that have no definition at all.
+  for (i = 0; i++ < ntkns;) {
+    token_number_map *tp = &map_token_number[i];
+    int nn;
+    if (tp->non_terminal_flag ||
+        (int) i == error_token ||
+        tp->key ||
+        tp->token_set_id) {
+      continue;
+    }
+    nn = tp->parse_tree;
+    LOGV(i) LCV(nn);
+    if (nn == 0 && (k = tp->token_name) != 0) {
+        nn = map_token_name[k].parse_tree;
+    }
+    LOGV(nn);
+    if (nn == 0) {
+      char_set_error = 5;          // no parse tree anywhere: undefined token
+    }
+    else {
+      tp->token_set_id = build_char_set(nn);
+    }
+    if (char_set_error == 0) {
+      continue;
+    }
+
+    // Build " = <name>" suffix for the diagnostic when the token is named.
+    id = tp->token_name;
+    if (id) {
+      sprintf(ts," = %s", Symbol(id)->string.pointer());
+    }
+    else {
+      ts[0] = 0;
+    }
+    switch (char_set_error) {
+    case 1:
+      ssprintf("Error defining T%03d: ",i);
+      break;
+    case 5:
+      ssprintf("Undefined token T%03d: ",i);
+      break;
+    }
+    atkn(i);
+    log_error();
+  }
+  build_partition_sets(char_set_dict);
+  LOGV(ntkns);
+  LOGV(nforms);
+}
+
+/*
+ * build_partition_sets: partition the character range into equivalence
+ * classes and assign tokens to partitions.
+ *
+ * Two characters land in the same partition exactly when they belong to
+ * the same collection of character sets in dictionary d.  Partitions are
+ * interned in part_dict; each character maps to its partition via
+ * map_char_number[].part_number, and each character set is re-expressed
+ * as a list of partitions (map_char_set[].part_index/nparts).  Tokens
+ * whose set is a single partition are tied to it directly (creating a
+ * replacement token when the partition is already claimed); tokens whose
+ * set spans several partitions become non-terminals with one rule per
+ * partition, entered into bnf_table.
+ *
+ * Uses the iws()/aws()/rws() workspace stack to accumulate lists.
+ */
+static void build_partition_sets(list_dict *d) {
+  LOGSECTION("build_partition_sets");
+  int i;
+  unsigned j, n;
+  int *db;
+  unsigned *ndx;
+  int *lb;
+  int m;
+  //int *ps;
+  int ts;
+  unsigned ntk;
+  //char *fa;
+  unsigned *fl;
+  int nf;
+  unsigned cn;
+  int *list;
+  unsigned pn,nc;
+  unsigned tn;
+
+  if (max_char_number == -1 && min_char_number == 0) {
+    return;
+  }
+  //ps = local_array(n_chars, int);
+  LocalArray<int> ps(n_chars);
+  n = d->nsx;
+  db = d->text;
+  ndx = d->ndx;
+  //set = local_array(n, int *);
+  LocalArray<int *> set(n);
+  //k = local_array(n, int);
+  LocalArray<int> k(n);
+  //l = local_array(n, int);
+  LocalArray<int> l(n);
+  // set[i]: cursor into set i's sorted character list; l[i]: its length;
+  // k[i]: current scan position within it.
+  for (i = 1; (unsigned) i < n; i++) {
+    set[i] = (db + ndx[i]);
+    l[i] = *(set[i]++) - 1;
+    k[i] = 0;
+  }
+
+  /*
+   * For each character in the character range, scan all the character
+   * sets. Make a list of the sets to which this particular character
+   * belongs. This list represents a unique partition of the character
+   * range. Use a dictionary to give each unique partition an
+   * identifying number. Map each character to the partition to which
+   * it belongs.
+   */
+
+  LOGS("Scanning characters");
+  for (i = min_char_number; i <= max_char_number; i++) {
+    int ix = i - min_char_number;
+    iws();
+    for (j = 1; j < n; j++) {
+      // Advance set j's cursor past characters below i (lists are sorted).
+      while (k[j] < l[j] && set[j][k[j]] < i) {
+	k[j]++;
+      }
+      if (k[j] == l[j] || set[j][k[j]] > i) {
+	continue;
+      }
+      k[j]++;
+      aws(j);   // character i belongs to set j
+    }
+    lb = list_base;
+    m = rws();
+    map_char_number[ix].part_number = ps[ix] = add_list_dict(lb,m,part_dict);
+    LOGV(ix) LCV(ps[ix]);
+  }
+
+  /*
+   * Scan each partition. For each partition, make a list of all the
+   * characters that belong to this partition.
+   */
+
+  LOGS("Scanning partitions");
+  m = part_dict->nsx;
+  check_size(map_part_number, m, m);
+  for (i = 0; (unsigned) i < part_dict->nsx; i++) {
+    iws();
+    for (j = 0; j < (unsigned) n_chars; j++) {
+      if ((unsigned) i == map_char_number[j].part_number) {
+	aws(j + min_char_number);
+      }
+    }
+    map_part_number[i].chars_index = store_list(chars_list);
+    fl = lstptr(map_part_number[i],chars);
+    m = map_part_number[i].size = rws();
+#ifdef INCLUDE_LOGGING
+    if ((unsigned)(*fl - min_char_number) > 
+	(max_char_number - min_char_number)) {
+      LOGV(*fl) LCV(min_char_number) LCV(max_char_number);
+    }
+#endif
+    assert((unsigned)(*fl - min_char_number) <= 
+	   (unsigned) (max_char_number - min_char_number));
+    // A one-character partition inherits that character's token, if any.
+    // NOTE(review): when m != 1, j may be left over from the loop above
+    // in the trailing LOGV -- harmless, logging only; confirm.
+    if (m == 1 
+	&& (j = map_char_number[*fl - min_char_number].token_number) != 0) {
+      map_part_number[i].token_number = j;
+    }
+    LOGV(*fl) LCV(min_char_number) LCV(i) LCV(j);
+  }
+
+  // Re-express every character set as a list of partition numbers.
+  m = part_dict->nsx;
+  //fa = local_array(m, char);
+  LocalArray<char> fa(m);
+  m = char_set_dict->nsx;
+  check_size(map_char_set,m,m);
+  for (i = 1; i<m; i++) {
+    char_set_map *csp = &map_char_set[i];
+    partition_set(i, char_set_dict, ps, fa);
+    csp->part_index = store_list(part_list);
+    csp->nparts = rws();
+  }
+
+
+  ntk = ntkns+1;
+  //assert(ok_index(map_token_number, ntkns));
+  m = part_dict->nsx;
+
+  // Pass 1 over tokens: sets consisting of exactly one partition.
+  LOGS("Scanning tokens");
+  for (i = 1; (unsigned) i < ntk; i++) {
+    part_number_map *ppn;
+    unsigned tn;
+    //int ctn = i;
+    Token ctn(i);
+    unsigned fn;
+    ts = map_token_number[i].token_set_id;
+    if (ts == 0) {
+      continue;
+    }
+    nf = map_char_set[ts].nparts;
+    if (nf != 1) {
+      continue;
+    }
+    fl = lstptr(map_char_set[ts], part);
+    ppn = &map_part_number[pn = *fl];
+    assert(ok_index(map_part_number, pn));
+    list = (int *) lstptr(*ppn, chars);
+    //assert (pn > 0 && pn < m);
+    assert(pn < (unsigned) m);
+    nc = ppn->size;
+    tn = ppn->token_number;
+    if (tn == 0) {
+      // Unclaimed partition: token i owns it outright.
+      while (nc--) {
+        cn = *list++ - min_char_number;
+        assert(cn < n_chars);
+        map_char_number[cn].token_number = i;
+        LOGV(cn) LCV(i);
+      }
+      assert(pn < part_dict->nsx);
+      ppn->token_number = i;
+      map_token_number[i].part_number = pn;
+      continue;
+    }
+    if (tn < ntk) {
+      // Partition already claimed by a pre-existing token tn: create a
+      // fresh token ctn for the partition, give tn a rule deriving ctn,
+      // and demote tn to a non-terminal.
+      //ntkns++;
+      //check_size(map_token_number, ntkns, ntkns);
+      //ctn = ntkns;
+      ctn = Token::create();
+      ctn->value_type = default_input_type;
+      //pf1x(ctn);
+      //fn = form2();
+      fn = makeRule(ctn);
+      at(bnf_table, tn, fn);
+      Rule(fn)->prim_tkn = tn;
+      ppn->token_number = ctn;
+      Token(tn)->operatorCandidate = ctn->operatorCandidate = 1;
+      while (nc--) {
+        cn = *list++ - min_char_number;
+        assert(cn < n_chars);
+        map_char_number[cn].token_number = ctn;
+        LOGV(cn) LCV(ctn);
+      }
+      map_token_number[ctn].part_number = pn;
+      map_token_number[ctn].value_type = default_input_type;
+      map_token_number[tn].value_type = default_input_type;
+      map_token_number[tn].part_number = 0;
+      map_token_number[tn].non_terminal_flag = 1;
+      tn = ctn;
+    }
+    // Token i becomes a non-terminal deriving the partition's token.
+    //pf1x(tn);
+    //fn = form2();
+    fn = makeRule(tn);
+    LOGV(i) LCV(fn) LCV(tn);
+    at(bnf_table, i, fn);
+    Rule(fn)->prim_tkn = i;
+    map_token_number[i].non_terminal_flag = 1;
+    ctn = tn;
+  }
+  // Pass 2 over tokens: sets spanning multiple partitions become
+  // non-terminals with one rule per partition.
+  for (i = 1; (unsigned) i < ntk; i++) {
+    token_number_map *mti = &map_token_number[i];
+    ts = mti->token_set_id;
+    if (ts == 0) {
+      continue;
+    }
+    nf = map_char_set[ts].nparts;
+    if (nf <= 1) {
+      continue;
+    }
+    fl = lstptr(map_char_set[ts], part);
+    iws();
+    j = 0;
+    while (j < (unsigned) nf) {
+
+      pn = fl[j];
+      assert (pn > 0 && pn < (unsigned) m);
+      tn = map_part_number[pn].token_number;
+      if (tn == 0) {
+        // Unclaimed partition: mint a token for it now.
+        //ntkns++;
+        //check_size(map_token_number,ntkns,ntkns);
+	list = (int *) lstptr(map_part_number[pn], chars);
+        nc = map_part_number[pn].size;
+        tn = Token::create();
+        while (nc--) {
+          cn = *list++ - min_char_number;
+          assert(cn < n_chars);
+          map_char_number[cn].token_number = tn;
+          LOGV(cn) LCV(tn);
+        }
+        //tn = ntkns;
+        assert(pn < part_dict->nsx);
+        map_part_number[pn].token_number = tn;
+        map_token_number[tn].part_number = pn;
+        map_token_number[tn].value_type = default_input_type;
+	map_token_number[tn].operatorCandidate = 1;
+      }
+      //pf1x(tn);
+      //aws(form2());
+      aws(makeRule(tn));
+      j++;
+    }
+    fl = (unsigned *) list_base;
+    nf = rws();
+    map_token_number[i].non_terminal_flag = 1;
+    for (j = 0; j < (unsigned) nf; j++) {
+      LOGV(i) LCV(fl[j]);
+      at(bnf_table, i, fl[j]);
+      //assert(ok_index(map_form_number, fl[j]));
+      Rule rule(fl[j]);
+      rule->prim_tkn = i;
+    }
+  }
+}
+
+/*
+ * partition_set: leave, on the workspace stack, the sorted list of
+ * partition numbers covered by character set ts in dictionary d.
+ *
+ * ps maps (char - min_char_number) -> partition number; fa is a
+ * caller-supplied scratch flag array of at least part_dict->nsx bytes,
+ * cleared and used here to deduplicate partitions.  The caller reads
+ * the result via store_list()/rws().
+ */
+static void partition_set(int ts, list_dict *d, int *ps, char *fa) {
+  int *list;
+  int n, np, i;
+
+  np = part_dict->nsx;
+  memset(fa, 0, np);
+  // First entry of the dictionary list is its length + 1.
+  list = d->text + d->ndx[ts];
+  n = *(list++) - 1;
+  for (i = 0; i<n; i++) {
+    fa[ps[list[i] - min_char_number]] = 1;
+  }
+  iws();
+  for (i = 0; i < np; i++) {
+    if (fa[i]) {
+      aws(i);
+    }
+  }
+}
+
+/*
+ * build_char_set: return the character-set id for parse tree nn,
+ * computing and memoizing it on first request.
+ *
+ * Expands the tree's expression to a character bitmap (folding case
+ * when the grammar is not case sensitive), interns the resulting
+ * character list in char_set_dict, and cross-links the set and the
+ * tree.  Clears char_set_error; the caller inspects it afterwards.
+ */
+int build_char_set(int nn) {
+  LOGSECTION("build_char_set");
+  //char *v;
+  //int i, k;
+  int cs = map_parse_tree[nn].char_set;
+  LOGV(cs) LCV(nn);
+
+  // Already computed for this tree: return the cached id.
+  if (cs) {
+    return cs;
+  }
+  char_set_error = 0;
+  LOGV(n_chars);
+
+  //int *list = local_array(n_chars, int);
+  LocalArray<int> list(n_chars);
+  CharBitmap bitmap = map_parse_tree[nn].expression->bitmap();
+  if (!case_sensitive) {
+    bitmap.blurCase();
+  }
+  int count = bitmap.list(list);
+  LOGV(count);
+  cs = add_list_dict(list, count, char_set_dict);
+  LOGV(cs);
+  check_size(map_char_set, cs, cs);
+  map_char_set[cs].parse_tree = nn;
+  map_parse_tree[nn].char_set = cs;
+  return cs;
+}
+
+// Report symbol tn as undefined through the standard error channel.
+static void log_undef_sym(int tn) {
+  ssprintf("Undefined symbol: %s", Symbol(tn)->string.pointer());
+  log_error();
+}
+
+/*
+ * getParseTree: find the parse tree for symbol `name`.
+ *
+ * Looks first at the symbol's own parse_tree link, then indirectly via
+ * the symbol's token number.  Returns the tree index, or 0 after
+ * logging an "Undefined symbol" diagnostic when neither path yields one.
+ */
+int getParseTree(int name) {
+  LOGSECTION("getParseTree");
+  int pt = map_token_name[name].parse_tree;
+  LOGV(pt);
+  if (pt) {
+    return pt;
+  }
+
+  // No direct tree: try the tree attached to the symbol's token.
+  int tn = map_token_name[name].token_number;
+  if (tn) {
+    pt = map_token_number[tn].parse_tree;
+  }
+  LOGV(tn) LCV(pt);
+  if (pt) {
+    return pt;
+  }
+
+  log_undef_sym(name);
+  return 0;
+}