view anagram/agcore/dict.cpp @ 0:13d2b8934445

Import AnaGram (near-)release tree into Mercurial.
author David A. Holland
date Sat, 22 Dec 2007 17:52:45 -0500
parents
children
line wrap: on
line source

/*
 * AnaGram, A System for Syntax Directed Programming
 * Copyright 1993-2000 Parsifal Software. All Rights Reserved.
 * See the file COPYING for license and usage terms.
 *
 * dict.cpp - old dictionary module
 */

#include <stdarg.h>
#include "port.h"

#include "arrays.h"
#include "assert.h"
#include "dict.h"
#include "minmax.h"
#include "myalloc.h"
#include "stacks.h"

//#define INCLUDE_LOGGING
#include "log.h"


#define HASH_TABLE_MAX 0x1000000

#define EVEN_UP(x) (x) /* + ((x)%2) */

static int find_str_entry(const char *s, string_dict *d);
static int find_lst_entry(const int *s, list_dict *d);
static int find_tpl_entry(const int *s, tuple_dict *d);


#if 0 /* unused */
void sort_string_dict(string_dict *d){
  unsigned i;
  //unsigned short k;
  unsigned k;

  if (d->nss >= d->nsx) return;
  REALLOCATE_ST(d->st,d->nsx);
{
  unsigned *dest = d->st + d->nss;
  unsigned *src = d->ndx + d->nss;
  //k = (unsigned short) (d->nsx - d->nss);
  k = (d->nsx - d->nss);
  while (k--) dest[k] = src[k];
}

  /* 10/1/06 note: these two funcs are/were defined in merge.cpp */
  sort_strings(d->text,d->st+d->nss,d->nsx-d->nss);
  merge_strings(d->text,d->st, d->nss, d->nsx-d->nss);
  REALLOCATE_ST(d->pt,d->nsx);
  d->nss = d->nsx;
  for (i = 0; i< d->nss; i++) {
    k = d->st[i]-2;
    //k = *((unsigned short *)(d->text+k));
		memcpy(&k, d->text+k, sizeof(unsigned short));
    d->pt[k] = i;
  }
}
#endif /* 0 */

list_dict *reset_list_dict(list_dict *d) {
  int k;
  ok_ptr(d);
  if (d->nxs) {
    ok_ptr(d->ndx);
  }
  if (d->nts) {
    ok_ptr(d->text);
  }
  if (d->nhs) {
    ok_ptr(d->ht);
  }
  d->nsx = 0;
  d->ntc = 0;
/*
  k = sizeof(short)*d->nhs;
  if (k) {
    size_ok(d->ht, k, __FILE__, __LINE__);
  }
  memset(d->ht, 0xff, k);
*/
  k = sizeof(unsigned)*d->nhs;
  if (k) {
    size_ok(d->ht, k, __FILE__, __LINE__);
  }
  memset(d->ht, 0xff, k);
  return NULL;
}

tuple_dict *reset_tuple_dict(tuple_dict *d) {
  unsigned k;
  ok_ptr(d);
  if (d->nxs) {
    ok_ptr(d->ndx);
  }
  if (d->nts) {
    ok_ptr(d->text);
  }
  if (d->nhs) {
    ok_ptr(d->ht);
  }
  d->nsx = 0;
  d->ntc = 0;
/*
  k = sizeof(short)*d->nhs;
  if (k) {
    size_ok(d->ht, k, __FILE__, __LINE__);
  }
  memset(d->ht, 0xff, k);
*/
  k = sizeof(unsigned)*d->nhs;
  if (k) {
    size_ok(d->ht, k, __FILE__, __LINE__);
  }
  memset(d->ht, 0xff, k);
  return NULL;
}

static void hash_string_dict(string_dict *d){
  unsigned i,x;
  const char *s;

  //memset(d->ht, 0xff, sizeof(short)*d->nhs);
  memset(d->ht, 0xff, sizeof(unsigned)*d->nhs);
  for (i = 0; i < d->nsx; i++) {
    s = &d->text[d->ndx[i]];
    x = find_str_entry(s,d);
    //d->ht[x] = (unsigned short) i;
    d->ht[x] = i;
  }
}

static void hash_list_dict(list_dict *d){
  unsigned i,x;
  const int *s;

  //memset(d->ht, 0xff, sizeof(short)*d->nhs);
  memset(d->ht, 0xff, sizeof(unsigned)*d->nhs);
  for (i = 0; i < d->nsx; i++) {
    s = &d->text[d->ndx[i]];
    x = find_lst_entry(s,d);
    //d->ht[x] = (unsigned short) i;
    d->ht[x] = i;
  }
}

static void hash_tpl_dict(tuple_dict *d){
  unsigned i,x;
  const int *s;
  int n;

  //memset(d->ht, 0xff, sizeof(short)*d->nhs);
  memset(d->ht, 0xff, sizeof(unsigned)*d->nhs);
  s = d->text;
  n = d->tw;
  for (i = 0; i < d->nsx; i++) {
    x = find_tpl_entry(s,d);
    //d->ht[x] = (unsigned short) i;
    d->ht[x] = i;
    s += n;
  }
}

list_dict *delete_list_dict(list_dict *da) {
  list_dict d;

  if (da == NULL) {
    return NULL;
  }
  d = *da;
  DEALLOCATE(da);
  if (d.ndx != NULL) DEALLOCATE(d.ndx);
  if (d.text != NULL) DEALLOCATE(d.text);
  if (d.ht != NULL) DEALLOCATE(d.ht);
  if (d.st != NULL) DEALLOCATE(d.st);
  if (d.pt != NULL) DEALLOCATE(d.pt);
  return NULL;
}

tuple_dict *delete_tuple_dict(tuple_dict *da) {
  tuple_dict d;

  if (da == NULL) {
    return NULL;
  }
  d = *da;
  DEALLOCATE(da);
  if (d.ndx != NULL) DEALLOCATE(d.ndx);
  if (d.text != NULL) DEALLOCATE(d.text);
  if (d.ht != NULL) DEALLOCATE(d.ht);
  if (d.st != NULL) DEALLOCATE(d.st);
  if (d.pt != NULL) DEALLOCATE(d.pt);
  return NULL;
}

void empty_tuple_dict(tuple_dict *da) {
  tuple_dict d;

  if (da == NULL) {
    return;
  }
  d = *da;
  if (d.ndx != NULL) DEALLOCATE(d.ndx);
  if (d.text != NULL) DEALLOCATE(d.text);
  if (d.ht != NULL) DEALLOCATE(d.ht);
  if (d.st != NULL) DEALLOCATE(d.st);
  if (d.pt != NULL) DEALLOCATE(d.pt);
  memset(&d, 0, sizeof(d));
  d.tw = da->tw;
  *da = d;
}

string_dict *delete_string_dict(string_dict *da) {
  string_dict d;

  if (da == NULL) {
    return NULL;
  }
  d = *da;
  DEALLOCATE(da);
  if (d.ndx != NULL) DEALLOCATE(d.ndx);
  if (d.text != NULL) DEALLOCATE(d.text);
  if (d.ht != NULL) DEALLOCATE(d.ht);
  if (d.st != NULL) DEALLOCATE(d.st);
  if (d.pt != NULL) DEALLOCATE(d.pt);
  return NULL;
}

int identify_string(const char *s, string_dict *d) {
  unsigned he, x;

  check_stack;
  he = find_str_entry(s, d);
  if ((x = d->ht[he]) >= d->nsx) {
    x = 0;
  }
  return x;
}


int add_string_dict(const char *s,string_dict *d){
  int len;
  int he;
  unsigned x;
  long nhs, nsx;

  check_stack;
  len = strlen(s)+3;
  if (d->ntc+len > d->nts) {
    unsigned long k = d->ntc+len;
    k += k/2;
    k = max(200UL, min((unsigned long) MAX_BYTES,k));
    LOGSECTION("add_string_dict::resize");
    LOGV(k) LCV(d->ntc + len);
    assert(k >= d->ntc + len);
    REALLOCATE_ST(d->text, (unsigned) k);
    d->nts = (unsigned) k;
  }
  if (d->nsx +1 > d->nxs) {
    unsigned long k = d->nsx+1;
    k += k/2;
    k = max(100UL, min((unsigned long) MAX_INTS, k));
    LOGSECTION("add_string_dict::resize");
    LOGV(k) LCV(d->nsx + 1);
    assert(k >= d->nsx + 1);
    REALLOCATE_ST(d->ndx, (unsigned) k);
    d->nxs = (unsigned) k;
  }
  if (10*(nsx = d->nsx) >= 8*(nhs = d->nhs) && nhs < HASH_TABLE_MAX) {
    nhs = max(nhs, 256L);
    while (10*nsx > 8*nhs && nhs < HASH_TABLE_MAX) {
      nhs *= 2;
    }
    LOGSECTION("add_string_dict::resize");
    LOGV(nhs) LCV(d->nsx);
    assert((unsigned) nhs > d->nsx);
    REALLOCATE_ST(d->ht,(unsigned) nhs);
    d->nhs = (unsigned) nhs;
    hash_string_dict(d);
  }
  s = strcpy(d->text+d->ntc+2,s);
  he = find_str_entry(s, d);
  if ((x = d->ht[he]) >= d->nsx) {
    //*((unsigned short *)(d->text+d->ntc)) = (unsigned short) d->nsx;
    unsigned short length = d->nsx;
    //memcpy(d->text+d->ntc, &d->nsx, sizeof(unsigned short));
    memcpy(d->text+d->ntc, &length, sizeof(unsigned short));
    d->ntc += 2;
    x = d->nsx++;
    //d->ndx[d->ht[he] = (unsigned short) x] = d->ntc;
    d->ndx[d->ht[he] = x] = d->ntc;
    d->ntc += EVEN_UP(len-2);
  }
  return x;
}
/*
void close_list_dict(list_dict *d){
  unsigned len;

  if (d == NULL) return;
  len = 2*(d->ntc/d->nsx) + d->ntc;
  if (len < d->nts) {
    REALLOCATE_ST(d->text,len);
    d->nts = len;
  }
  if (d->nsx + 1< d->nxs) {
    unsigned k = d->nsx + 1;
    REALLOCATE_ST(d->ndx,k);
    d->nxs = k;
  }
}
*/
int list_in_dict(int *s, int n, list_dict *d){
  int k = d->nsx;
  check_stack;
  return k != add_list_dict(s, n, d);
}

int add_list_dict(const int *s,int n,list_dict *d){
  LOGSECTION("add_list_dict");
  unsigned len;
  int he;
  unsigned x;
  long nhs, nsx;
  int *sc;

  check_stack;
  len = n+3;
  if (d->ntc+len > d->nts) {
    unsigned long k = d->ntc + len;
    k += k/2;
    k = min((unsigned long)MAX_INTS, max(200UL, k));
    LOGSECTION("add_list_dict::resize");
    LOGV(k) LCV(d->ntc + len);
    assert(k >= d->ntc + len);
    REALLOCATE_ST(d->text,(unsigned) k);
    d->nts = (unsigned) k;
  }
  if (d->nsx + 1 > d->nxs) {
    unsigned long k = d->nsx + 1;
    k += k/2;
    k = min((unsigned long) MAX_INTS, max(200UL,k));
    LOGSECTION("add_list_dict::resize");
    LOGV(k) LCV(d->nsx + 1);
    assert(k >= d->nsx+1);
    REALLOCATE_ST(d->ndx, (unsigned) k);
    d->nxs = (unsigned) k;
  }
  if (10*(nsx = d->nsx) >= 8*(nhs = d->nhs) && nhs < HASH_TABLE_MAX) {
    nhs = max(nhs,256L);
    while (10*nsx > 8*nhs && nhs < HASH_TABLE_MAX) {
      nhs *= 2;
    }
    LOGSECTION("add_list_dict::resize");
    LOGV(nhs) LCV(d->nsx);
    assert((unsigned)nhs > d->nsx);
    REALLOCATE_ST(d->ht, (unsigned) nhs);
    d->nhs = (unsigned) nhs;
    hash_list_dict(d);
  }
  sc = (int *) memcpy(d->text+d->ntc+2, s, n*sizeof(int));
  *--sc = n+1;
  he = find_lst_entry(sc,d);
  if ((x = d->ht[he]) >= d->nsx) {
    //*((unsigned short *)(d->text+d->ntc)) = (unsigned short) d->nsx;
    //memcpy(d->text+d->ntc, &d->nsx, sizeof(unsigned short));
    //memcpy(d->text+d->ntc, &d->nsx, sizeof(unsigned));
    d->text[d->ntc] = d->nsx;
    //d->ndx[d->ht[he] = (unsigned short) (x = d->nsx++)] = ++d->ntc;
    d->ndx[d->ht[he] = (x = d->nsx++)] = ++d->ntc;
    d->ntc += n+1;
  }
  return x;
}

void extract_tuple_dict(tuple_dict *d, unsigned k, ...) {
  va_list ap;
  const int *p;
  unsigned n;

  assert(k < d->nsx);
  n = d->tw;
  p = d->text + n*k;
  va_start(ap,k);
  while (n--) {
    *va_arg(ap,int*) = *p++;
  }
  va_end(ap);
}

int insert_tuple_dict(tuple_dict *d,...){
  int he;
  va_list s;
  unsigned n;
  long nsx,nhs;
  //int *tpl;
  unsigned i;

  check_stack;
  va_start(s, d);
  n = d->tw;
  //tpl = local_array(n,int);
  LocalArray<int> tpl(n);
  for (i = 0; i < n; i++) tpl[i] = va_arg(s,int);
  va_end(s);
  if (d->ntc+n > d->nts) {
    unsigned long k = d->ntc + n;
    k += k/2;
    k = min((unsigned long) MAX_INTS, max(200UL,k));
    LOGSECTION("insert_tuple_dict::resize");
    LOGV(k) LCV(d->ntc + n);
    assert(k >= d->ntc + n);
    REALLOCATE_ST(d->text, (unsigned) k);
    d->nts = (unsigned) k;
  }
  if (10*(nsx = d->nsx) >= 8*(nhs = d->nhs) && nhs < HASH_TABLE_MAX) {
    nhs = max(nhs, 256L);
    while (10*nsx > 8*nhs && nhs < HASH_TABLE_MAX) {
      nhs *= 2;
    }
    LOGSECTION("insert_tuple_dict::resize");
    LOGV(nhs) LCV(d->nsx);
    assert((unsigned)nhs > d->nsx);
    REALLOCATE_ST(d->ht, (unsigned) nhs);
    d->nhs = (unsigned) nhs;
    hash_tpl_dict(d);
  }
  he = find_tpl_entry(tpl, d);
  if ((unsigned)(d->ht[he]) >= d->nsx) {
    //d->ht[he] = (unsigned short) d->nsx++;
    d->ht[he] = d->nsx++;
    memcpy(d->text+d->ntc, tpl, n*sizeof(int));
    d->ntc += n;
    return 0;
  }
  return 1;
}


int add_tuple_dict_new(tuple_dict *d, ...) {
  int he;
  unsigned x;
  va_list s;
  unsigned n;
  long nsx, nhs;
  //int *tpl;
  unsigned i;

  check_stack;
  va_start(s, d);
  n = d->tw;
  //tpl = local_array(n, int);
  LocalArray<int> tpl(n);
  for (i = 0; i < n; i++) tpl[i] = va_arg(s, int);
  va_end(s);
  if (d->ntc+n > d->nts) {
    unsigned long k = d->ntc + n;

    k += k/2;
    k = min((unsigned long) MAX_INTS, max(200UL,k));
    LOGSECTION("add_tuple_dict_new::resize");
    LOGV(k) LCV(d->ntc + n);
    assert(k >= d->ntc + n);
    REALLOCATE_ST(d->text, (unsigned)k);
    d->nts = (unsigned) k;
  }
  if (10*(nsx = d->nsx) >= 8*(nhs = d->nhs) && nhs < HASH_TABLE_MAX) {
    nhs = max(nhs, 256L);
    while (10*nsx > 8*nhs && nhs < HASH_TABLE_MAX) {
      nhs *= 2;
    }
    LOGSECTION("add_tuple_dict_new::resize");
    LOGV(nhs) LCV(d->nsx);
    assert((unsigned) nhs > d->nsx);
    REALLOCATE_ST(d->ht, (unsigned) nhs);
    d->nhs = (unsigned) nhs;
    hash_tpl_dict(d);
  }
  he = find_tpl_entry(tpl, d);
  if ((x = d->ht[he]) >= d->nsx) {
    //d->ht[he] = (unsigned short) (x = d->nsx++);
    d->ht[he] = (x = d->nsx++);
    memcpy(d->text+d->ntc, tpl, n*sizeof(int));
    d->ntc += n;
  }
  return x;
}

/*
int identify_tuple(tuple_dict *d, ...) {
  int he;
  unsigned x;
  va_list s;
  int n = d->tw;
  int *tpl, i;

  check_stack;
  va_start(s,d);
  tpl = local_array(n, int);
  for (i = 0; i < n; i++) {
    tpl[i] = va_arg(s, int);
  }
  va_end(s);
  he = find_tpl_entry(tpl,d);
  if ((x = d->ht[he]) >= d->nsx) {
    return 0;
  }
  return x;
}
*/

/*
 * Use a variant of Fletcher's checksum
 * See DDJ, May, 1992, p32.
 */

static int nstr = 0, nstrx = 0;
static int nlst = 0, nlstx = 0;
static int ntpl = 0, ntplx = 0;

static int find_str_entry(const char *s, string_dict *d) {
  unsigned x,m;
  int he,dx;
  const char *sp;

  sp = s;
  if (d->ignore_case) {
    unsigned char k1 = 0, k2 = 0, c;
    while ((c = *sp++) != 0) {
      c &= 0337;
      k1 += c;
      if (k1) k1 ^= (unsigned char) 0xff;
      k2 += k1;
      if (k2) k2 ^= (unsigned char) 0xff;
    }
    he = 255*k1 + k2;
    dx = 255*k2 + k1;
    dx += dx + 1;

    he += dx << 16;
    dx += he << 16;

    m = d->nhs - 1;
    while ((x = d->ht[he &= m]) < d->nsx
	   && 0 != stricmp(s,&d->text[d->ndx[x]])) {
      he += dx;
    }
    return he;
  }


  {
    unsigned char k1 = 0, k2 = 0, c;
    while((c = *sp++) != 0) {
      k1 += c;
      if (k1) k1 ^= (unsigned char) 0xff;
      k2 += k1;
      if (k2) k2 ^= (unsigned char) 0xff;
    }
    he = 255*k1 + k2;
    dx = 255*k2 + k1;
  }
  dx += dx + 1;

  he += dx << 16;
  dx += he << 16;

  m = d->nhs - 1;
  nstr++;
  while ((x = d->ht[he &= m]) < d->nsx && 0 != strcmp(s,&d->text[d->ndx[x]])) {
    he += dx;
    nstrx++;
  }
  return he;
}

static int find_lst_entry(const int *s, list_dict *d){
  unsigned x,m;
  int he,dx;
  int n;
  const char *sp;
  unsigned char k1 = 0, k2 = 0;

  //sp = (const char *) s;
  n = m = *s;
  m *= sizeof(int);
  sp = (const char *) s;
  while (m--) {
    k1 += *sp++;
    if (k1) k1 ^= (unsigned char) 0xff;
    k2 += k1;
    if (k2) k2 ^= (unsigned char) 0xff;
  }
  he = 255*k1 + k2;
  dx = 255*k2 + k1;
  dx += dx + 1;

  he += dx << 16;
  dx += he << 16;

  m = d->nhs-1;
  n *= sizeof(int);
  nlst++;
  while ((x = d->ht[he &= m]) < d->nsx &&
	 0 != memcmp(s,&d->text[d->ndx[x]], n))  {
    he += dx;
    nlstx++;
  }
  return he;
}

static int find_tpl_entry(const int *s, tuple_dict *d){
  unsigned x,m;
  int he,dx;
  int n,nb;
  unsigned nsx;
  //unsigned short *ht;
  unsigned *ht;
  const int *text;
  const char *sp;
  unsigned char k1 = 0, k2 = 0;

  n = d->tw;
  sp = (const char *) s;
  m = n*sizeof(int);
  while (m--) {
    k1 += *sp++;
    if (k1) k1 ^= (unsigned char) 0xff;
    k2 += k1;
    if (k2) k2 ^= (unsigned char) 0xff;
  }
  he = 255*k1 + k2;
  dx = 255*k2 + k1;
  dx += dx + 1;

  he += dx << 16;
  dx += he << 16;

  nb = n*sizeof(int);
  m = d->nhs-1;
  ht = d->ht;
  nsx = d->nsx;
  text = d->text;
  ntpl++;
  while ((x = ht[he &= m]) < nsx &&
	 0 != memcmp(s,&text[n*x],nb))  {
    he += dx;
    ntplx++;
  }
  return he;
}

#if 0 /* not used */
string_dict *restore_string_dict(char *text, int ntc, int ne, int ic,
				 int byte_swap_required) {
  string_dict *d;
  int i, k;
  unsigned *ndx;

  ZALLOCATE_ST(d, 1);
  d->text = text;
  d->ntc = ntc;
  d->nts = (ntc+1) & ~1;
  d->ignore_case = ic;
  d->nxs = d-> nsx = ne;
  ndx = (unsigned *) ALLOCATE_ST(d->ndx, ne);
  for (i = k = 0; i < ne; i++) {
    //unsigned short x = *((unsigned short *)(d->text + k));
    unsigned short x;
    memcpy(&x,d->text + k, sizeof(unsigned short));
    if (byte_swap_required) {
      x = (unsigned short) ((x << 8) | (unsigned char) (x >> 8));
    }
    k += 2;
    ndx[x] = k;
    k += 1 + strlen(d->text + k);
  }
  i = 1;
  while (8*i < 10*ne) i *= 2;
  ALLOCATE_ST(d->ht, i);
  d->nhs = i;
  hash_string_dict(d);
  return d;
}
#endif /* 0 */

string_dict *null_str_dict(void){
  string_dict *d;

  ZALLOCATE_ST(d, 1);
  add_string_dict("", d);
/*  d->ignore_case = ignore_case; */
  return d;
}

list_dict *null_list_dict(void){
  list_dict *d;

  ZALLOCATE_ST(d, 1);
  return d;
}

tuple_dict *null_tuple_dict(int tw){
  tuple_dict *d;

  ZALLOCATE_ST(d, 1);
  d->tw = tw;
  return d;
}