view anagram/agcore/keyword.cpp @ 17:12171da8943f

Don't refer to CVS.
author David A. Holland
date Tue, 31 May 2022 01:56:37 -0400
parents 607e3be6bad8
children
line wrap: on
line source

/*
 * AnaGram, A System for Syntax Directed Programming
 * Copyright 1993-2002 Parsifal Software. All Rights Reserved.
 * See the file COPYING for license and usage terms.
 *
 * keyword.cpp
 */

#include "arrays.h"
#include "bitmap.h"
#include "bpe3.h"
#include "csexp.h"
#include "dict.h"
#include "ftpar.h"
#include "keyword.h"
#include "myalloc.h"
#include "q1glbl.h"
#include "stacks.h"
#include "tsd.h"
#include "ut.h"

//#define INCLUDE_LOGGING
#include "log.h"


static tsd *key_work_table;
static int n_ends = 0;


static void tokensTo(unsigned sn, AgStack<int> &tokenStack) {
  LOGSECTION("tokensTo");
  AgBalancedTree<int> tree;
  while (sn) {
    tree.insert(sn);
    state_number_map *sp = &map_state_number[sn];
    unsigned *list = lstptr(*sp, previous_states);
    int n = sp->n_previous_states;
    LOGV(sn) LCV(sp->n_previous_states);
    while (n--) {
      LOGV(n) LCV(*list) LCV(map_state_number[*list].char_token);
      while (tree.includes(*list)) {
	list++;
      }
      int k = map_state_number[*list].n_actions;
      while (k--) {
        if (lstptr(map_state_number[*list], p_actions)[k] != sn) {
	  continue;
	}
        if (lstptr(map_state_number[*list], a_actions)[k] != pe_go_to) {
	  continue;
	}
        break;
      }
      assert(k >= 0);
      tokenStack.push(lstptr(map_state_number[*list], t_actions)[k]);
      sn = *list;
      LOGV(n) LCV(sn);
      break;
    }
    assert(n >= 0);
  }
}

/*
 * Each time it is necessary for the parser to get a new token, it
 * begins by checking the key_word_index table indexed by the current
 * state. If the entry is zero, there are no keywords to be identified.
 * If the entry is non_zero, the entry identifies a location in the
 * key_word identification table. The id_key_word function is then
 * invoked to determine whether there is a keyword present.
 *
 * The id_key_word function is a table driven, finite state automaton.
 * The actions are as follows:
 *    jump to next state
 *      The string in hand is not itself a key, but matches the
 *      beginning of one or more keys.
 *    set key and jump to next state
 *      The string in hand might be a key, but it also matches the
 *      beginning of at least one other key. Save the token number and read
 *      pointer for this key in case we do not get a complete match on
 *      the longer keys.
 *    accept key
 *      The string in hand is a key, and there are no others which match
 *      partially.
 *    end key
 *      The string in hand is the beginning of a key, there are no other
 *      matches, and it is only necessary to see if we get a match for the
 *      remainder of the key.
 *    no match key
 *      The string in hand does not match any key but there is a
 *      substring which matched. Go return it.
 */

typedef enum {
  accept_key,
  set_key,
  jmp_key,
  end_key,
  no_match_key,
  cf_accept_key,
  cf_set_key,
  cf_end_key
} key_words;


static int build_key_table_state(char *sp[], int *tkn, unsigned ns) {
  int n;
  int nw = key_work_table->nt;
  unsigned i = 0, j, k;
  int ch, act, parm = 0, jmp = 0;

  check_tsd(key_work_table);
  while (i < ns) {
    ch = *sp[i]++;
    parm = jmp = 0;
    if (ch == 0) {
      continue;
    }
    for (j = i+1, k = 0; j < ns && *sp[j] == ch;) {
      sp[j++]++;
      k++;
    }
    if (*sp[i] == 0) {
      parm = tkn[i];
      if (k == 0) {
         act = accept_key;
      }
      else {
        act = set_key;
        jmp = build_key_table_state(&sp[i+1], &tkn[i+1], k);
      }
    }
    else if (k == 0) {
      act = end_key;
      parm = tkn[i];
      jmp = n_ends;
      ass(sp[i]);
      acs(0);
      n_ends += strlen(sp[i]) + 1;
    }
    else {
      act = jmp_key;
      jmp = build_key_table_state(&sp[i], &tkn[i], k+1);
    }
    at(key_work_table, ch, act, parm, jmp);
    i = j;
  }
  n = key_table->nt;
  k = nw;
  while (k < key_work_table->nt) {
    xtxf(key_work_table, k, &ch, &act, &parm, &jmp);
    at(key_table, ch, act, parm, jmp);
    k++;
  }
  at(key_table, 255, no_match_key, 0, 0);
  key_work_table->nt = nw;
  return n;
}

static int build_key_table(char *sp[], int tkn[], int ns) {
  int i, j;
  char *p;
  int t;
  LOGSECTION("build_key_table");

  for (i = 0; i < ns; i++) {
    for (j = i+1; j < ns; j++) {
      if (strcmp(sp[j], sp[i]) < 0) {
        p = sp[i];
        sp[i] = sp[j];
        sp[j] = p;
        t = tkn[i];
        tkn[i] = tkn[j];
        tkn[j] = t;
      }
    }
  }
  return build_key_table_state(sp, tkn, ns);
}

void build_key_tables(void) {
  unsigned i;
  unsigned nki = key_list_dict->nsx;
  //int *ki;
  extern char *key_ends;
  extern unsigned n_key_ends;
  extern unsigned nstates;

  LOGSECTION("build_key_tables");

  if (Keyword::count() <= 1) {
    return;
  }
  //ki = local_array(nki, int);
  LocalArray<int> ki(nki);
  memset(ki, 0, nki*sizeof(*ki));

  key_table = init_tsd(4);
  key_work_table = init_tsd(4);
  at(key_table, 0, 0, 0, 0);
  ics();
  n_ends = 0;
  ki[0] = 0;

  for (i = 1; i < key_list_dict->nsx; i++) {
    int *lb = dict_str(key_list_dict, i);
    int ns = *lb++ - 1;
    char **strings = ALLOCATE(ns, char *);
    int *tokens = ALLOCATE(ns, int);
    int j;

    for (j = 0; j < ns; j++) {
      Keyword key = map_token_number[lb[j]].key;
      strings[j] = key->string.pointer();
      tokens[j] = lb[j];
    }
    ki[i] = build_key_table(strings, tokens, ns);
    DEALLOCATE(strings);
    DEALLOCATE(tokens);
  }
  delete_tsd(key_work_table);
  n_key_ends = tis();
  key_ends = build_string();
  for (i = 0; i < nstates; i++) {
    state_number_map *sp = &map_state_number[i];
    int k = sp->key_list;

    if (k == 0) {
      continue;
    }
    sp->key_index = (unsigned short) ki[k];
  }
}

void check_key_reserve(void) {
  int n_keys = Keyword::count();
  int k = distinguishSets.size();

  if (n_keys <= 1 || k == 0) {
    return;
  }
  while (k--) {
    int pt = distinguishSets[k];
    CharBitmap bitmap = map_parse_tree[pt].expression->bitmap();
    for (Each<Keyword> keyword; keyword.loopNotFinished(); keyword.getNext()) {
      KeywordDescriptor &keywordDescriptor(keyword);
      //unsigned char *str = (unsigned char *) keyword->string.pointer();
      unsigned char *str = (unsigned char *) keywordDescriptor.string.pointer();
      //if (keyword->reserve) continue;
      if (keywordDescriptor.reserve) {
	continue;
      }
      while (*str && bitmap.getBit(*str)) {
	str++;
      }
      if (*str) {
	continue;
      }
      //keyword->reserve = pt;
      keywordDescriptor.reserve = pt;
    }
  }
}


void append_key(int kn) {
  unsigned char *s = (unsigned char *) Keyword(kn)->string.pointer();
  while (*s) {
    append_ascii_char(*s++);
  }
}

//int keyword_problem(unsigned sn,const unsigned char *p, int key){
//int keyword_problem(unsigned sn,const unsigned char *p, Keyword key){
//int keyword_problem(unsigned sn,const AgArray<int> input, Keyword key){
int keyword_problem(unsigned sn, const AgArray<int> input, Keyword) {
  LOGSECTION("keyword_problem");

  AgStack<int> tokens;

  // Get a path to this state
  tokensTo(sn, tokens);

#ifdef INCLUDE_LOGGING
  {
    for (int i = 0; i < tokens.size(); i++) {
      LOGV(tokens[i]);
    }
  }
#endif

  FtParser parser;

  // Advance to current state
  while (tokens.size()) {
    parser.parseToken(tokens.pop());
  }

/*
  LOGV(key);
  // Disable keyword under test
  parser.ignoreKeyword(key);
  parser.parse((const char *) p);
  traceCounts.discardData();
  if (parser.processState < FtParser::syntaxError) {
    return 1;
  }
  return -1;
*/

  int n = input.size();
  int flag = 1;
  for (int i = 0; flag == 1 && i < n; i++) {
    parser.parseToken(input[i]);
    if (parser.processState > FtParser::running) {
      flag = -1;
    }
  }
  traceCounts.discardData();
  return flag;
}

Keyword::Keyword()
  : KeyedObjectRegister<KeywordDescriptor>()
{}

Keyword::Keyword(unsigned x)
  : KeyedObjectRegister<KeywordDescriptor>(x)
{}

Keyword::Keyword(const Keyword &t)
  : KeyedObjectRegister<KeywordDescriptor>(t)
{}

void Keyword::operator = (const Keyword &t) {
  KeyedObjectRegister<KeywordDescriptor>::operator = (t);
}

Keyword::Keyword(const char *x)
  : KeyedObjectRegister<KeywordDescriptor>(KeywordDescriptor(x))
{}

void Keyword::reset() {
  LOGSECTION("Keyword::reset");
  KeyedObjectRegister<KeywordDescriptor>::reset();
  Keyword("");
  AgString string = Keyword((unsigned) 0)->string;
  LOGV(string.pointer());
  LOGV(descriptorList.size()) LCV(Keyword((unsigned) 0)->string);
  LOGS("reset complete");
}

Keyword Keyword::sorted(int x) {
  LOGSECTION("Keyword::sorted");
  LOGV(x);
  Keyword keyword = tree[x];
  LOGV(keyword) LCV(keyword->string);
  return keyword;
}

Keyword Keyword::create() {
  LOGSECTION("Keyword::create");
  tss();
  LOGV(string_base);
  Keyword keyword(string_base);
  rcs();
  LOGV(keyword);
  return keyword;
}

Keyword add_string_dict(const char *s, Keyword::Dict &d) {
  return d.add_string(s);
}

//Keyword::Map map_token_name;

KeywordDescriptor &Keyword::Map::operator [] (unsigned x) {
  LOGSECTION("Keyword::Map::operator []");
  LOGV(x) LCV(descriptorList.size());
  assert(x < Keyword::descriptorList.size());
  return Keyword::descriptorList[x];
}

Keyword Keyword::Dict::add_string(const char *s) {
  return Keyword(s);
}

KeywordDescriptor::KeywordDescriptor()
  : reserve(0)
{
  // Nothing to do
}

KeywordDescriptor::KeywordDescriptor(const char *s)
  : string(s), reserve(0)
{
  // Nothing to do
}

int KeywordDescriptor::operator < (const KeywordDescriptor &d) const {
  return string < d.string;
}

const int Keyword::first = 1;