diff anagram/agcore/keyword.cpp @ 0:13d2b8934445

Import AnaGram (near-)release tree into Mercurial.
author David A. Holland
date Sat, 22 Dec 2007 17:52:45 -0500
parents
children 607e3be6bad8
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/anagram/agcore/keyword.cpp	Sat Dec 22 17:52:45 2007 -0500
@@ -0,0 +1,400 @@
+/*
+ * AnaGram, A System for Syntax Directed Programming
+ * Copyright 1993-2002 Parsifal Software. All Rights Reserved.
+ * See the file COPYING for license and usage terms.
+ *
+ * keyword.cpp
+ */
+
+#include "arrays.h"
+#include "bitmap.h"
+#include "bpe3.h"
+#include "csexp.h"
+#include "dict.h"
+#include "ftpar.h"
+#include "keyword.h"
+#include "myalloc.h"
+#include "q1glbl.h"
+#include "stacks.h"
+#include "tsd.h"
+#include "ut.h"
+
+//#define INCLUDE_LOGGING
+#include "log.h"
+
+
+static tsd *key_work_table;
+static int n_ends = 0;
+
+
+static void tokensTo(unsigned sn, AgStack<int> &tokenStack) {
+  LOGSECTION("tokensTo");
+  AgBalancedTree<int> tree;
+  while (sn) {
+    tree.insert(sn);
+    state_number_map *sp = &map_state_number[sn];
+    unsigned *list = lstptr(*sp, previous_states);
+    int n = sp->n_previous_states;
+    LOGV(sn) LCV(sp->n_previous_states);
+    while (n--) {
+      LOGV(n) LCV(*list) LCV(map_state_number[*list].char_token);
+      while (tree.includes(*list)) {
+	list++;
+      }
+      int k = map_state_number[*list].n_actions;
+      while (k--) {
+        if (lstptr(map_state_number[*list], p_actions)[k] != sn) {
+	  continue;
+	}
+        if (lstptr(map_state_number[*list], a_actions)[k] != pe_go_to) {
+	  continue;
+	}
+        break;
+      }
+      assert(k >= 0);
+      tokenStack.push(lstptr(map_state_number[*list], t_actions)[k]);
+      sn = *list;
+      LOGV(n) LCV(sn);
+      break;
+    }
+    assert(n >= 0);
+  }
+}
+
+/*
+ * Each time it is necessary for the parser to get a new token, it
+ * begins by checking the key_word_index table indexed by the current
+ * state. If the entry is zero, there are no keywords to be identified.
+ * If the entry is non_zero, the entry identifies a location in the
+ * key_word identification table. The id_key_word function is then
+ * invoked to determine whether there is a keyword present.
+ *
+ * The id_key_word function is a table driven, finite state automaton.
+ * The actions are as follows:
+ *    jump to next state
+ *      The string in hand is not itself a key, but matches the
+ *      beginning of one or more keys.
+ *    set key and jump to next state
+ *      The string in hand might be a key, but it also matches the
+ *      beginning of at least one other key. Save the token number and read
+ *      pointer for this key in case we do not get a complete match on
+ *      the longer keys.
+ *    accept key
+ *      The string in hand is a key, and there are no others which match
+ *      partially.
+ *    end key
+ *      The string in hand is the beginning of a key, there are no other
+ *      matches, and it is only necessary to see if we get a match for the
+ *      remainder of the key.
+ *    no match key
+ *      The string in hand does not match any key but there is a
+ *      substring which matched. Go return it.
+ */
+
+typedef enum {
+  accept_key,
+  set_key,
+  jmp_key,
+  end_key,
+  no_match_key,
+  cf_accept_key,
+  cf_set_key,
+  cf_end_key
+} key_words;
+
+
+static int build_key_table_state(char *sp[], int *tkn, unsigned ns) {
+  int n;
+  int nw = key_work_table->nt;
+  unsigned i = 0, j, k;
+  int ch, act, parm = 0, jmp = 0;
+
+  check_tsd(key_work_table);
+  while (i < ns) {
+    ch = *sp[i]++;
+    parm = jmp = 0;
+    if (ch == 0) {
+      continue;
+    }
+    for (j = i+1, k = 0; j < ns && *sp[j] == ch;) {
+      sp[j++]++;
+      k++;
+    }
+    if (*sp[i] == 0) {
+      parm = tkn[i];
+      if (k == 0) {
+         act = accept_key;
+      }
+      else {
+        act = set_key;
+        jmp = build_key_table_state(&sp[i+1], &tkn[i+1], k);
+      }
+    }
+    else if (k == 0) {
+      act = end_key;
+      parm = tkn[i];
+      jmp = n_ends;
+      ass(sp[i]);
+      acs(0);
+      n_ends += strlen(sp[i]) + 1;
+    }
+    else {
+      act = jmp_key;
+      jmp = build_key_table_state(&sp[i], &tkn[i], k+1);
+    }
+    at(key_work_table, ch, act, parm, jmp);
+    i = j;
+  }
+  n = key_table->nt;
+  k = nw;
+  while (k < key_work_table->nt) {
+    xtxf(key_work_table, k, &ch, &act, &parm, &jmp);
+    at(key_table, ch, act, parm, jmp);
+    k++;
+  }
+  at(key_table, 255, no_match_key, 0, 0);
+  key_work_table->nt = nw;
+  return n;
+}
+
+static int build_key_table(char *sp[], int tkn[], int ns) {
+  int i, j;
+  char *p;
+  int t;
+  LOGSECTION("build_key_table");
+
+  for (i = 0; i < ns; i++) {
+    for (j = i+1; j < ns; j++) {
+      if (strcmp(sp[j], sp[i]) < 0) {
+        p = sp[i];
+        sp[i] = sp[j];
+        sp[j] = p;
+        t = tkn[i];
+        tkn[i] = tkn[j];
+        tkn[j] = t;
+      }
+    }
+  }
+  return build_key_table_state(sp, tkn, ns);
+}
+
+void build_key_tables(void) {
+  unsigned i;
+  unsigned nki = key_list_dict->nsx;
+  //int *ki;
+  extern char *key_ends;
+  extern unsigned n_key_ends;
+  extern unsigned nstates;
+
+  LOGSECTION("build_key_tables");
+
+  if (Keyword::count() <= 1) {
+    return;
+  }
+  //ki = local_array(nki, int);
+  LocalArray<int> ki(nki);
+  memset(ki, 0, nki*sizeof(*ki));
+
+  key_table = init_tsd(4);
+  key_work_table = init_tsd(4);
+  at(key_table, 0, 0, 0, 0);
+  ics();
+  n_ends = 0;
+  ki[0] = 0;
+
+  for (i = 1; i < key_list_dict->nsx; i++) {
+    int *lb = dict_str(key_list_dict, i);
+    int ns = *lb++ - 1;
+    char **strings = ALLOCATE(ns, char *);
+    int *tokens = ALLOCATE(ns, int);
+    int j;
+
+    for (j = 0; j < ns; j++) {
+      Keyword key = map_token_number[lb[j]].key;
+      strings[j] = key->string.pointer();
+      tokens[j] = lb[j];
+    }
+    ki[i] = build_key_table(strings, tokens, ns);
+    DEALLOCATE(strings);
+    DEALLOCATE(tokens);
+  }
+  delete_tsd(key_work_table);
+  n_key_ends = tis();
+  key_ends = build_string();
+  for (i = 0; i < nstates; i++) {
+    state_number_map *sp = &map_state_number[i];
+    int k = sp->key_list;
+
+    if (k == 0) {
+      continue;
+    }
+    sp->key_index = (unsigned short) ki[k];
+  }
+}
+
+void check_key_reserve(void) {
+  int n_keys = Keyword::count();
+  int k = distinguishSets.size();
+
+  if (n_keys <= 1 || k == 0) {
+    return;
+  }
+  while (k--) {
+    int pt = distinguishSets[k];
+    CharBitmap bitmap = map_parse_tree[pt].expression->bitmap();
+    for (Each<Keyword> keyword; keyword.loopNotFinished(); keyword.getNext()) {
+      KeywordDescriptor &keywordDescriptor(keyword);
+      //unsigned char *str = (unsigned char *) keyword->string.pointer();
+      unsigned char *str = (unsigned char *) keywordDescriptor.string.pointer();
+      //if (keyword->reserve) continue;
+      if (keywordDescriptor.reserve) {
+	continue;
+      }
+      while (*str && bitmap.getBit(*str)) {
+	str++;
+      }
+      if (*str) {
+	continue;
+      }
+      //keyword->reserve = pt;
+      keywordDescriptor.reserve = pt;
+    }
+  }
+}
+
+
+void append_key(int kn) {
+  unsigned char *s = (unsigned char *) Keyword(kn)->string.pointer();
+  while (*s) {
+    append_ascii_char(*s++);
+  }
+}
+
+//int keyword_problem(unsigned sn,const unsigned char *p, int key){
+//int keyword_problem(unsigned sn,const unsigned char *p, Keyword key){
+//int keyword_problem(unsigned sn,const AgArray<int> input, Keyword key){
+int keyword_problem(unsigned sn, const AgArray<int> input, Keyword) {
+  LOGSECTION("keyword_problem");
+
+  AgStack<int> tokens;
+
+  // Get a path to this state
+  tokensTo(sn, tokens);
+
+#ifdef INCLUDE_LOGGING
+  {
+    for (int i = 0; i < tokens.size(); i++) {
+      LOGV(tokens[i]);
+    }
+  }
+#endif
+
+  FtParser parser;
+
+  // Advance to current state
+  while (tokens.size()) {
+    parser.parseToken(tokens.pop());
+  }
+
+/*
+  LOGV(key);
+  // Disable keyword under test
+  parser.ignoreKeyword(key);
+  parser.parse((const char *) p);
+  traceCounts.discardData();
+  if (parser.processState < FtParser::syntaxError) {
+    return 1;
+  }
+  return -1;
+*/
+
+  int n = input.size();
+  int flag = 1;
+  for (int i = 0; flag == 1 && i < n; i++) {
+    parser.parseToken(input[i]);
+    if (parser.processState > FtParser::running) {
+      flag = -1;
+    }
+  }
+  traceCounts.discardData();
+  return flag;
+}
+
+Keyword::Keyword()
+  : KeyedObjectRegister<KeywordDescriptor>()
+{}
+
+Keyword::Keyword(unsigned x)
+  : KeyedObjectRegister<KeywordDescriptor>(x)
+{}
+
+Keyword::Keyword(const Keyword &t)
+  : KeyedObjectRegister<KeywordDescriptor>(t)
+{}
+
+Keyword::Keyword(const char *x)
+  : KeyedObjectRegister<KeywordDescriptor>(KeywordDescriptor(x))
+{}
+
+void Keyword::reset() {
+  LOGSECTION("Keyword::reset");
+  KeyedObjectRegister<KeywordDescriptor>::reset();
+  Keyword("");
+  AgString string = Keyword((unsigned) 0)->string;
+  LOGV(string.pointer());
+  LOGV(descriptorList.size()) LCV(Keyword((unsigned) 0)->string);
+  LOGS("reset complete");
+}
+
+Keyword Keyword::sorted(int x) {
+  LOGSECTION("Keyword::sorted");
+  LOGV(x);
+  Keyword keyword = tree[x];
+  LOGV(keyword) LCV(keyword->string);
+  return keyword;
+}
+
+Keyword Keyword::create() {
+  LOGSECTION("Keyword::create");
+  tss();
+  LOGV(string_base);
+  Keyword keyword(string_base);
+  rcs();
+  LOGV(keyword);
+  return keyword;
+}
+
+Keyword add_string_dict(const char *s, Keyword::Dict &d) {
+  return d.add_string(s);
+}
+
+//Keyword::Map map_token_name;
+
+KeywordDescriptor &Keyword::Map::operator [] (unsigned x) {
+  LOGSECTION("Keyword::Map::operator []");
+  LOGV(x) LCV(descriptorList.size());
+  assert(x < Keyword::descriptorList.size());
+  return Keyword::descriptorList[x];
+}
+
+Keyword Keyword::Dict::add_string(const char *s) {
+  return Keyword(s);
+}
+
+KeywordDescriptor::KeywordDescriptor()
+  : reserve(0)
+{
+  // Nothing to do
+}
+
+KeywordDescriptor::KeywordDescriptor(const char *s)
+  : string(s), reserve(0)
+{
+  // Nothing to do
+}
+
+int KeywordDescriptor::operator < (const KeywordDescriptor &d) const {
+  return string < d.string;
+}
+
+const int Keyword::first = 1;