Mercurial > ~dholland > hg > ag > index.cgi
diff anagram/agcore/keyword.cpp @ 0:13d2b8934445
Import AnaGram (near-)release tree into Mercurial.
author | David A. Holland |
---|---|
date | Sat, 22 Dec 2007 17:52:45 -0500 |
parents | |
children | 607e3be6bad8 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/anagram/agcore/keyword.cpp Sat Dec 22 17:52:45 2007 -0500 @@ -0,0 +1,400 @@ +/* + * AnaGram, A System for Syntax Directed Programming + * Copyright 1993-2002 Parsifal Software. All Rights Reserved. + * See the file COPYING for license and usage terms. + * + * keyword.cpp + */ + +#include "arrays.h" +#include "bitmap.h" +#include "bpe3.h" +#include "csexp.h" +#include "dict.h" +#include "ftpar.h" +#include "keyword.h" +#include "myalloc.h" +#include "q1glbl.h" +#include "stacks.h" +#include "tsd.h" +#include "ut.h" + +//#define INCLUDE_LOGGING +#include "log.h" + + +static tsd *key_work_table; +static int n_ends = 0; + + +static void tokensTo(unsigned sn, AgStack<int> &tokenStack) { + LOGSECTION("tokensTo"); + AgBalancedTree<int> tree; + while (sn) { + tree.insert(sn); + state_number_map *sp = &map_state_number[sn]; + unsigned *list = lstptr(*sp, previous_states); + int n = sp->n_previous_states; + LOGV(sn) LCV(sp->n_previous_states); + while (n--) { + LOGV(n) LCV(*list) LCV(map_state_number[*list].char_token); + while (tree.includes(*list)) { + list++; + } + int k = map_state_number[*list].n_actions; + while (k--) { + if (lstptr(map_state_number[*list], p_actions)[k] != sn) { + continue; + } + if (lstptr(map_state_number[*list], a_actions)[k] != pe_go_to) { + continue; + } + break; + } + assert(k >= 0); + tokenStack.push(lstptr(map_state_number[*list], t_actions)[k]); + sn = *list; + LOGV(n) LCV(sn); + break; + } + assert(n >= 0); + } +} + +/* + * Each time it is necessary for the parser to get a new token, it + * begins by checking the key_word_index table indexed by the current + * state. If the entry is zero, there are no keywords to be identified. + * If the entry is non_zero, the entry identifies a location in the + * key_word identification table. The id_key_word function is then + * invoked to determine whether there is a keyword present. + * + * The id_key_word function is a table driven, finite state automaton. + * The actions are as follows: + * jump to next state + * The string in hand is not itself a key, but matches the + * beginning of one or more keys. + * set key and jump to next state + * The string in hand might be a key, but it also matches the + * beginning of at least one other key. Save the token number and read + * pointer for this key in case we do not get a complete match on + * the longer keys. + * accept key + * The string in hand is a key, and there are no others which match + * partially. + * end key + * The string in hand is the beginning of a key, there are no other + * matches, and it is only necessary to see if we get a match for the + * remainder of the key. + * no match key + * The string in hand does not match any key but there is a + * substring which matched. Go return it. + */ + +typedef enum { + accept_key, + set_key, + jmp_key, + end_key, + no_match_key, + cf_accept_key, + cf_set_key, + cf_end_key +} key_words; + + +static int build_key_table_state(char *sp[], int *tkn, unsigned ns) { + int n; + int nw = key_work_table->nt; + unsigned i = 0, j, k; + int ch, act, parm = 0, jmp = 0; + + check_tsd(key_work_table); + while (i < ns) { + ch = *sp[i]++; + parm = jmp = 0; + if (ch == 0) { + continue; + } + for (j = i+1, k = 0; j < ns && *sp[j] == ch;) { + sp[j++]++; + k++; + } + if (*sp[i] == 0) { + parm = tkn[i]; + if (k == 0) { + act = accept_key; + } + else { + act = set_key; + jmp = build_key_table_state(&sp[i+1], &tkn[i+1], k); + } + } + else if (k == 0) { + act = end_key; + parm = tkn[i]; + jmp = n_ends; + ass(sp[i]); + acs(0); + n_ends += strlen(sp[i]) + 1; + } + else { + act = jmp_key; + jmp = build_key_table_state(&sp[i], &tkn[i], k+1); + } + at(key_work_table, ch, act, parm, jmp); + i = j; + } + n = key_table->nt; + k = nw; + while (k < key_work_table->nt) { + xtxf(key_work_table, k, &ch, &act, &parm, &jmp); + at(key_table, ch, act, parm, jmp); + k++; + } + at(key_table, 255, no_match_key, 0, 0); + key_work_table->nt = nw; + return n; +} + +static int build_key_table(char *sp[], int tkn[], int ns) { + int i, j; + char *p; + int t; + LOGSECTION("build_key_table"); + + for (i = 0; i < ns; i++) { + for (j = i+1; j < ns; j++) { + if (strcmp(sp[j], sp[i]) < 0) { + p = sp[i]; + sp[i] = sp[j]; + sp[j] = p; + t = tkn[i]; + tkn[i] = tkn[j]; + tkn[j] = t; + } + } + } + return build_key_table_state(sp, tkn, ns); +} + +void build_key_tables(void) { + unsigned i; + unsigned nki = key_list_dict->nsx; + //int *ki; + extern char *key_ends; + extern unsigned n_key_ends; + extern unsigned nstates; + + LOGSECTION("build_key_tables"); + + if (Keyword::count() <= 1) { + return; + } + //ki = local_array(nki, int); + LocalArray<int> ki(nki); + memset(ki, 0, nki*sizeof(*ki)); + + key_table = init_tsd(4); + key_work_table = init_tsd(4); + at(key_table, 0, 0, 0, 0); + ics(); + n_ends = 0; + ki[0] = 0; + + for (i = 1; i < key_list_dict->nsx; i++) { + int *lb = dict_str(key_list_dict, i); + int ns = *lb++ - 1; + char **strings = ALLOCATE(ns, char *); + int *tokens = ALLOCATE(ns, int); + int j; + + for (j = 0; j < ns; j++) { + Keyword key = map_token_number[lb[j]].key; + strings[j] = key->string.pointer(); + tokens[j] = lb[j]; + } + ki[i] = build_key_table(strings, tokens, ns); + DEALLOCATE(strings); + DEALLOCATE(tokens); + } + delete_tsd(key_work_table); + n_key_ends = tis(); + key_ends = build_string(); + for (i = 0; i < nstates; i++) { + state_number_map *sp = &map_state_number[i]; + int k = sp->key_list; + + if (k == 0) { + continue; + } + sp->key_index = (unsigned short) ki[k]; + } +} + +void check_key_reserve(void) { + int n_keys = Keyword::count(); + int k = distinguishSets.size(); + + if (n_keys <= 1 || k == 0) { + return; + } + while (k--) { + int pt = distinguishSets[k]; + CharBitmap bitmap = map_parse_tree[pt].expression->bitmap(); + for (Each<Keyword> keyword; keyword.loopNotFinished(); keyword.getNext()) { + KeywordDescriptor &keywordDescriptor(keyword); + //unsigned char *str = (unsigned char *) keyword->string.pointer(); + unsigned char *str = (unsigned char *) keywordDescriptor.string.pointer(); + //if (keyword->reserve) continue; + if (keywordDescriptor.reserve) { + continue; + } + while (*str && bitmap.getBit(*str)) { + str++; + } + if (*str) { + continue; + } + //keyword->reserve = pt; + keywordDescriptor.reserve = pt; + } + } +} + + +void append_key(int kn) { + unsigned char *s = (unsigned char *) Keyword(kn)->string.pointer(); + while (*s) { + append_ascii_char(*s++); + } +} + +//int keyword_problem(unsigned sn,const unsigned char *p, int key){ +//int keyword_problem(unsigned sn,const unsigned char *p, Keyword key){ +//int keyword_problem(unsigned sn,const AgArray<int> input, Keyword key){ +int keyword_problem(unsigned sn, const AgArray<int> input, Keyword) { + LOGSECTION("keyword_problem"); + + AgStack<int> tokens; + + // Get a path to this state + tokensTo(sn, tokens); + +#ifdef INCLUDE_LOGGING + { + for (int i = 0; i < tokens.size(); i++) { + LOGV(tokens[i]); + } + } +#endif + + FtParser parser; + + // Advance to current state + while (tokens.size()) { + parser.parseToken(tokens.pop()); + } + +/* + LOGV(key); + // Disable keyword under test + parser.ignoreKeyword(key); + parser.parse((const char *) p); + traceCounts.discardData(); + if (parser.processState < FtParser::syntaxError) { + return 1; + } + return -1; +*/ + + int n = input.size(); + int flag = 1; + for (int i = 0; flag == 1 && i < n; i++) { + parser.parseToken(input[i]); + if (parser.processState > FtParser::running) { + flag = -1; + } + } + traceCounts.discardData(); + return flag; +} + +Keyword::Keyword() + : KeyedObjectRegister<KeywordDescriptor>() +{} + +Keyword::Keyword(unsigned x) + : KeyedObjectRegister<KeywordDescriptor>(x) +{} + +Keyword::Keyword(const Keyword &t) + : KeyedObjectRegister<KeywordDescriptor>(t) +{} + +Keyword::Keyword(const char *x) + : KeyedObjectRegister<KeywordDescriptor>(KeywordDescriptor(x)) +{} + +void Keyword::reset() { + LOGSECTION("Keyword::reset"); + KeyedObjectRegister<KeywordDescriptor>::reset(); + Keyword(""); + AgString string = Keyword((unsigned) 0)->string; + LOGV(string.pointer()); + LOGV(descriptorList.size()) LCV(Keyword((unsigned) 0)->string); + LOGS("reset complete"); +} + +Keyword Keyword::sorted(int x) { + LOGSECTION("Keyword::sorted"); + LOGV(x); + Keyword keyword = tree[x]; + LOGV(keyword) LCV(keyword->string); + return keyword; +} + +Keyword Keyword::create() { + LOGSECTION("Keyword::create"); + tss(); + LOGV(string_base); + Keyword keyword(string_base); + rcs(); + LOGV(keyword); + return keyword; +} + +Keyword add_string_dict(const char *s, Keyword::Dict &d) { + return d.add_string(s); +} + +//Keyword::Map map_token_name; + +KeywordDescriptor &Keyword::Map::operator [] (unsigned x) { + LOGSECTION("Keyword::Map::operator []"); + LOGV(x) LCV(descriptorList.size()); + assert(x < Keyword::descriptorList.size()); + return Keyword::descriptorList[x]; +} + +Keyword Keyword::Dict::add_string(const char *s) { + return Keyword(s); +} + +KeywordDescriptor::KeywordDescriptor() + : reserve(0) +{ + // Nothing to do +} + +KeywordDescriptor::KeywordDescriptor(const char *s) + : string(s), reserve(0) +{ + // Nothing to do +} + +int KeywordDescriptor::operator < (const KeywordDescriptor &d) const { + return string < d.string; +} + +const int Keyword::first = 1;