view tests/agcl/parsifal/xmlp8.syn @ 18:562c313f14f4

some minor updates for 2022
author David A. Holland
date Tue, 31 May 2022 02:03:50 -0400
parents 13d2b8934445
children
line wrap: on
line source

{/*
 * XML Syntax
 * Transcribed from
 *    Extensible Markup Language (XML) 1.0
 *    W3C Recommendation 10-February-1998
 *    http://www.w3.org/TR/1998/REC-xml-19980210
 *
 * Transcription Copyright © Parsifal Software, 1999.
 *
 * Revision 1 fixed definition of [4] NameChar
 *
 * Revision 2, begun 1/24/00
 *
 * Fixed problem with mixed content declaration [51]
 * Fixed missing S in [52]
 * Removed unnecessary "sticky" statement
*/

#include "xmldef8.h"
#include <stdio.h>

}


[
  // Parser configuration. The "extend pcb" section below adds the state and
  // the member functions used by the reduction procedures in this grammar;
  // the member functions are defined in the embedded C++ at the end of the
  // file.
  reserve keywords {"--", "?>", "]]>"}
  test file mask = "*.xml"
  parser file name = "#.cpp"
  parser name = parse
  reentrant parser
  //line numbers
  event driven
  context type = Context
  extend pcb {
    // Names of the currently open elements: pushed by startTag(), popped by
    // endTag().
    AgStack<AgString> tagStack;
    // Attribute maps of the currently open elements, kept in step with
    // tagStack.
    AgStack<Symtab> mapStack;
    // Characters accumulated by textChar() and spaceChar() respectively.
    AgString textStack;
    AgString spaceStack;
    // Kind of the most recent event; set by the reduction procedures and by
    // the member functions below.
    enum TokenType {
      none,
      spaceType,
      textType,
      commentType,
      startType,
      endType,
      emptyType,
      entityRefType,
      //charRefType,
      errorType
    } tokenType;
    // Set by startTag(), emptyTag() and endTag().
    AgString currentTagName;
    // Set when an EntityRef is reduced.
    AgString currentEntityName;
    //int currentCharRef;
    // Set when a Comment is reduced.
    AgString commentString;
    // Attribute map of the tag most recently reported.
    Symtab currentMap;
    // Warnings collected by checkAttributes(), warnAttval() and
    // registerAttributes().
    AgStack<Warning> warningList;
    // Attribute definitions per element name: filled by registerAttributes(),
    // consulted by checkAttributes().
    AgBaseMapString<Element> elementMap;
    // Set to 1 when a doctypedecl is reduced; checkAttributes() does nothing
    // while it is 0.
    int dtdPresent;

    parse_pcb_struct();
    // Declared here but not defined in this file.
    parse(char *input, AgString *);
    void spaceChar(int);
    void textChar(int);
    void startTag(const Context &, const AgString &, AttvalList &);
    void emptyTag(const Context &, const AgString &, AttvalList &);
    void endTag(const AgString &);
    void registerAttributes(const AgString &name, const AgBaseStack<Attribute> &AttributeList);
    void checkAttributes(const Context &, const AgString &name, Symtab &map);
    int warnAttval(const Context &context, const char *, const AttvalPair &p);
    static void normalize(AgString &s);
  }
  // Class types used as token values in this grammar: Attribute (AttvalPair),
  // Name and AttValue (AgString), AttributeList (AttvalList), AttDef
  // (Attribute), AttDefs (AttributeList) and DefaultDecl (Default).
  wrapper {AttvalPair,
      AgString,
      AttvalList,
      Attribute,
      AttributeList,
      Default}
]

// Document

// [1]
// A document is a prolog, exactly one root element, any number of trailing
// Misc items, and then end of input.
document $
  -> prolog, element, Misc?..., eof

// For completeness
// End of input is represented by the character code -1.
eof = -1


// Character Range

// [2]
// The full definition in the Recommendation is
//   Char = 0x9 + 0xA + 0xD + 0x20..0xd7ff + 0xe000..0xfffd + 0x10000..0x10ffff
// Only 8 bit characters are accepted pro tem. Carriage return (0xD) is part
// of Char; without it a CR inside a comment or processing instruction (for
// example in a file with CRLF line endings) is a syntax error.
Char = 0x9 + 0xA + 0xD + 0x20..0xff


// White Space
// [3]
// White space characters: space, tab, carriage return and line feed.
SpaceChar = 0x20 + 0x9 + 0xd + 0xa

// A run of one or more white space characters.
S
 -> SpaceChar...


// Names and Tokens

// [4]
// Characters allowed after the first character of a name. CombiningChar is
// not defined in this grammar and is left out.
NameChar = Letter + Digit + '.' + '-' + '_' + ':' + Extender  // + CombiningChar

// [5]
// A name starts with a letter, '_' or ':' and continues with NameChars.
// The token value is the name text, built up one character at a time.
(AgString) Name
 -> Letter + '_' + ':':c                         =AgString().concat(c);
 -> Name:s, NameChar:c                           =s.concat(c);

// [6]
// One or more names separated by white space.
Names
 -> Name
 -> Names, S, Name

// [7]
// A name token is one or more NameChars, with no restriction on the first.
Nmtoken
 -> NameChar...

// [8]
// One or more name tokens separated by white space.
Nmtokens
 -> Nmtoken, [S, Nmtoken]...


// Literals

// [9]
// A quoted entity value. Inside the quotes '%' and '&' may appear only as
// the start of a PEReference or Reference, and the enclosing quote character
// may not appear at all. No value is collected.
EntityValue
 -> '"', [~(eof + '%' + '&' + '"') | PEReference | Reference]..., '"'
 -> '\'', [~(eof + '%' + '&' + '\'') | PEReference | Reference]..., '\''

// [10]
// A quoted attribute value. The token value is the text between the quotes
// as assembled by the AttValString rules below.
// XML 1.0 forbids '<' and '&' as literal characters in an attribute value;
// '%' is ordinary text there (e.g. width="100%"), unlike in EntityValue [9].
(AgString) AttValue
 -> '"', dq AttValString:s, '"'                        =s;
 -> '\'', sq AttValString:s, '\''                      =s;

// Body of a double-quoted value. Every white space character is stored as a
// single space, a character reference contributes its character, and an
// entity reference is recognized but contributes nothing to the value.
(AgString) dq AttValString
 ->                                                    =AgString();
 -> dq AttValString:s, ~(eof + '<' + '&' + '"' + SpaceChar):c      =s.concat(c);
 -> dq AttValString:s, SpaceChar                       =s.concat(' ');
 //-> dq AttValString:s, CharRef                         =s;
 -> dq AttValString:s, CharRef:c                       =s.concat(c);
 -> dq AttValString:s, EntityRef                       =s;

// Body of a single-quoted value; same treatment as the double-quoted form.
(AgString) sq AttValString
 ->                                                    =AgString();
 -> sq AttValString:s, ~(eof + '<' + '&' + '\'' + SpaceChar):c     =s.concat(c);
 -> sq AttValString:s, SpaceChar                       =s.concat(' ');
 -> sq AttValString:s, CharRef:c                         =s.concat(c);
 -> sq AttValString:s, EntityRef                       =s;

// [11]
// A quoted system identifier: any characters other than the enclosing quote.
SystemLiteral
 -> '"', ~(eof + '"')?..., '"'
 -> '\'', ~(eof + '\'')?..., '\''

// [12]
// A quoted public identifier made of PubidChars. The single-quoted form
// removes the apostrophe from the set so that it can end the literal.
PubidLiteral
 -> '"', PubidChar?..., '"'
 -> '\'', PubidChar-'\''?..., '\''

// [13]
// Characters allowed in a public identifier:
//   #x20 | #xD | #xA | [a-zA-Z0-9] | [-'()+,./:=?;!*#@$_%]
// '=' belongs to the set defined by the Recommendation.
PubidChar = 0x20 + 0xd + 0xa + 'a-z' + 'A-Z' + '0-9' +
            '-' + '\'' + '(' + ')' + '+' + ',' + '.' +
            '/' + ':' + '=' + '?' + ';' + '!' + '*' + '#' +
            '@' + '$' + '_' + '%'


// Character Data

// [14]
// A single character of character data: anything except end of input, '<'
// and '&'. It is consumed one character at a time by "content unit".
CharData = ~(eof + '<' + '&')         // Note that the iteration is in the usage
  // Note that use of keyword "]]>" will take care of CDATA problem


// Comments

// [15]
// A comment. On reduction the event type is set to commentType and the
// comment body is stored in the control block's commentString.
Comment
 -> "<!--", comment text:t, "--", '>'  =PCB.tokenType = Pcb::commentType, PCB.commentString = t;

// Body of a comment, collected one Char at a time (possibly empty).
(AgString) comment text
 ->                        =AgString();
 ->comment text:s, Char:c  =s.concat(c);


// Processing Instructions

// [16]
// A processing instruction: a target, optionally followed by white space
// and instruction text. Nothing is collected or reported.
PI
 -> "<?", PITarget, [S, [Char-SpaceChar, Char?...]], "?>"

// [17]
// The target is any Name; the reserved target "xml" is not rejected here.
PITarget
 -> Name                               // Name lookup mechanism should reject "xml"


// CDATA Sections

// [18]
// A CDATA section: start marker, raw character data, end marker.
CDSect
 -> CDStart, CData, CDEnd

// [19]
CDStart
 -> "<![CDATA["

// [20]
// Section contents. Non-space characters go to textChar(); white space goes
// to spaceChar(), with a CR LF pair reported as a single '\n'.
CData
 -> [Char - SpaceChar:c =PCB.textChar(c); |
    "\r\n"       =PCB.spaceChar('\n'); |
     SpaceChar:c =PCB.spaceChar(c);]...
 // Keyword recognition logic overrides character recognition

// [21]
CDEnd
 -> "]]>"


// Prolog

// [22]
// Optional XML declaration, Misc items, and an optional document type
// declaration followed by more Misc items.
prolog
 -> XMLDecl?, Misc?..., [doctypedecl, Misc?...]

// [23]
// The optional parts after VersionInfo are spelled out in VersionDecl rather
// than written as a chain of optional items as in the commented-out rule.
XMLDecl
// -> "<?xml", VersionInfo, EncodingDecl?, SDDecl?, S?, "?>"
 -> "<?xml", S, VersionInfo, VersionDecl, "?>"

 // Everything between VersionInfo and "?>": the allowed combinations of
 // encoding declaration and standalone declaration with their white space.
 VersionDecl
  -> S?
  -> S, EncodingDecl, S?
  -> S, EncodingDecl, S, SDDecl, S?
  -> S, SDDecl, S?


// [24]
// version = 'num' or version = "num"
VersionInfo
 -> "version", Eq, {'\'', VersionNum, '\'' | '"', VersionNum, '"'}

// [25]
// An equals sign with optional white space on either side.
Eq
 -> S?, '=', S?

// [26]
// One or more characters from the version number alphabet.
VersionNum
 -> 'a-z' + 'A-Z' + '0-9' + '_' + '.' + ':' + '-'...

// [27]
// Misc uses a single SpaceChar instead of S because Misc is itself used in
// iterations (Misc?...).
Misc
// -> Comment | PI | S
 -> Comment | PI | SpaceChar                    // Avoid double recursion


// Document Type Definition

// [28]
// Document type declaration: a name, an optional external identifier and an
// optional internal subset in square brackets. Reducing it sets dtdPresent,
// which enables attribute checking in checkAttributes().
doctypedecl
 -> "<!DOCTYPE", S, Name:n,
    [S, [ExternalID, S?]],
      [ '[', [markupdecl | PEReference | SpaceChar]..., ']', S?], '>' =PCB.dtdPresent=1;

// [29]
// The declarations that may appear in a DTD.
markupdecl
 -> elementdecl
 -> AttlistDecl
 -> EntityDecl
 -> NotationDecl
 -> PI
 -> Comment


// External Subset

// [30]
// External DTD subset. Not referenced by any other production in this file.

extSubset
 -> TextDecl?, extSubsetDecl

// [31]
// Contents of an external subset or of an INCLUDE section.
extSubsetDecl
 -> [markupdecl | conditionalSect | PEReference | SpaceChar]...


// Standalone Document Declaration

// [32]
// standalone = 'yes' | "yes" | 'no' | "no"
// The braces keep the four quoted alternatives inside the single rule, as in
// VersionInfo [24] and EncodingDecl [80]. Without them each '|' starts a new
// rule, so a bare quoted yes/no literal would be accepted as a whole SDDecl.
SDDecl
 -> "standalone", Eq, {"'yes'" | "\"yes\"" | "'no'" | "\"no\""}


// Language Identification

// [33]
// A language code followed by any number of '-' separated subcodes.
// Not referenced by any other production in this file.
LanguageId
 -> Langcode, ['-', Subcode]...

// [34]
Langcode
 -> ISO639Code | IanaCode | UserCode

// [35]
// Two letters; the first may not be i, I, x or X, which introduce the IANA
// and user forms below.
ISO639Code
 -> 'a-z' + 'A-Z' -('i' + 'I' + 'x' + 'X'), 'a-z' + 'A-Z'

// [36]
// "i-" or "I-" followed by one or more letters.
IanaCode
 -> 'i' + 'I', '-', 'a-z' + 'A-Z'...

// [37]
// "x-" or "X-" followed by one or more letters.
UserCode
 -> 'x' + 'X', '-', 'a-z' + 'A-Z'...

// [38]
// One or more letters.
Subcode
 -> 'a-z' + 'A-Z'...


// Element

// [39]
// An element is either an empty-element tag or a start tag, content and an
// end tag. Content is required here (see [43]).
element
 -> EmptyElementTag
 -> STag, content, ETag


// Start-Tags, End-Tags, and Empty-Element Tags

// Start-tag

// [40]
// Reports the tag through startTag() with the context of the rule, the
// element name and the collected attribute list.
STag
 -> '<', Name:n, AttributeList:l, S?, '>'  =PCB.startTag(CONTEXT, n, l);

// Zero or more attributes, each preceded by white space. The token value is
// the list of name/value pairs in document order.
(AttvalList) AttributeList
 ->                                        =AttvalList();
 -> AttributeList:list, S, Attribute:a     =list.push(a);

// [41]
// name = "value"; the pair also records the context of the rule.
(AttvalPair) Attribute
 -> Name:n, Eq, AttValue:s                =AttvalPair(n, s, CONTEXT);

// End-tag

// [42]
// Reports the tag through endTag(), which checks the element nesting.
ETag
 -> "</", Name:n, S?, '>'   =PCB.endTag(n);

// Content of Elements

// [43]
// One or more content units.
content
 -> content unit...

// A single item of element content. Text characters and character
// references go to textChar(); white space goes to spaceChar(), with a
// CR LF pair reported as a single '\n'.
content unit
 -> element
 -> CharData - SpaceChar:c =PCB.textChar(c);
 -> CharRef:c   =PCB.textChar(c);
 -> EntityRef
 -> SpaceChar:c =PCB.spaceChar(c);
 -> "\r\n"      =PCB.spaceChar('\n');
 -> CDSect
 -> PI
 -> Comment

// Tags for Empty Elements

// [44]
// Reports the tag through emptyTag() with the context of the rule, the
// element name and the collected attribute list.
EmptyElementTag
 -> '<', Name:n, AttributeList:l, S?, "/>" =PCB.emptyTag(CONTEXT, n, l);

// Element Type Declaration

// [45]
// Element type declaration. It is parsed but nothing is recorded.
elementdecl
 -> "<!ELEMENT", S, Name, S, contentspec, S?, '>'

// [46]
contentspec
 -> "EMPTY" | "ANY" | Mixed | Children


// Element-content Models

// [47]
// A choice or sequence group with an optional occurrence indicator.
Children
 -> {choice | seq}, ['?' | '*' | '+']

// [48]
// A content particle: a name or nested group with an optional occurrence
// indicator.
cp
 -> {Name | choice | seq}, ['?' | '*' | '+']

// [49]
// Parenthesized particles separated by '|'. A group holding a single
// particle is matched by this rule, since seq below requires a comma.
choice
 -> '(', S?, cp, S?, ['|', S?, cp, S?]..., ')'

// [50]
// Parenthesized particles separated by ','; at least two particles.
seq
 -> '(', S?, cp, S?, {',', S?, cp, S?}..., ')'


// Mixed-content Declaration

// [51]
// (#PCDATA | name | ...)* or (#PCDATA)
Mixed
 -> '(', S?, "#PCDATA", ['|', S?, Name | S]/..., ")*"
 -> '(', S?, "#PCDATA", S?, ')'


// Attribute-list Declaration

// [52]
// Attribute-list declaration. The definitions are recorded for the named
// element through registerAttributes(). At least one AttDef is required.
AttlistDecl
 -> "<!ATTLIST", S, Name:n, AttDefs:l, S?, '>'   =PCB.registerAttributes(n, l);

// One or more attribute definitions, collected in declaration order.
(AttributeList) AttDefs
 -> AttDef:a                                     =AttributeList().push(a);
 -> AttDefs:l, AttDef:a                          =l.push(a);

// [53]
// A single definition: name, type and default.
// NOTE(review): RULE_CONTEXT[1] presumably is the context of the Name
// element of this rule - confirm against the AnaGram documentation.
(Attribute) AttDef
 -> S, Name:n, S, AttType:t, S, DefaultDecl:d    =Attribute(RULE_CONTEXT[1], n, t, d);


// Attribute Types

// [54]
// The token value is the Attribute::Type of whichever alternative matched.
(Attribute::Type) AttType
 -> StringType
 -> TokenizedType
 -> EnumeratedType

// [55]
(Attribute::Type) StringType
 -> "CDATA"             =Attribute::CDATA;

// [56]
(Attribute::Type) TokenizedType
 -> "ID"                =Attribute::ID;
 -> "IDREF"             =Attribute::IDREF;
 -> "IDREFS"            =Attribute::IDREFS;
 -> "ENTITY"            =Attribute::ENTITY;
 -> "ENTITIES"          =Attribute::ENTITIES;
 -> "NMTOKEN"           =Attribute::NMTOKEN;
 -> "NMTOKENS"          =Attribute::NMTOKENS;

// [57]
// Only the kind of enumerated type is kept; the listed names are not.
(Attribute::Type) EnumeratedType
 -> NotationType   =Attribute::NOTATION;
 -> Enumeration    =Attribute::ENUMERATION;

// [58]
// NOTATION ( name | name | ... )
// The Recommendation allows white space directly after the opening
// parenthesis: 'NOTATION' S '(' S? Name (S? '|' S? Name)* S? ')'
NotationType
 -> "NOTATION", S, '(', S?, Name, S?, ['|', S?, Name, S?]..., ')'

// [59]
// ( token | token | ... )
// As above, white space is allowed directly after the opening parenthesis:
// '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')'
Enumeration
 -> '(', S?, Nmtoken, S?, ['|', S?, Nmtoken, S?]..., ')'


// Attribute Defaults

// [60]
// Default declaration of an attribute. The token value is a Default built
// from the alternative that matched: required, implied (the Default
// constructed with no arguments), a default value, or a fixed value (second
// constructor argument 1).
(Default) DefaultDecl
 -> "#REQUIRED"                =Default(Default::REQUIRED);
 -> "#IMPLIED"                 =Default();
 -> AttValue:v                 =Default(v);
 -> "#FIXED", S, AttValue:v    =Default(v,1);


// Conditional Section

// [61]
// Conditional sections are reachable only from extSubsetDecl [31].
conditionalSect
 -> includeSect | ignoreSect

// [62]
includeSect
 -> "<![", S?, "INCLUDE", S?, '[', extSubsetDecl, "]]>"

// [63]
ignoreSect
 -> "<![", S?, "IGNORE", S?, '[', ignoreSectContents?..., "]]>"

// [64]
// Ignored text, possibly containing nested "<![" ... "]]>" sections.
ignoreSectContents
 -> Ignore, ["<![", ignoreSectContents, "]]>", Ignore]...

// [65]
// Any run of characters, possibly empty.
Ignore
 -> Char?...


//Character Reference

// [66]
// A character reference, decimal or hexadecimal. Neither rule has a
// reduction procedure; the numeric value is built up by the two helper
// productions that follow.
(int) CharRef
 //-> "&#", '0-9'..., ';'
 //-> "&#x", '0-9' + 'a-f' + 'A-F'..., ';'
 //-> decimal CharRef:c, ';'      =PCB.currentCharRef = c, PCB.tokenType = Pcb::charRefType;
 -> decimal CharRef, ';'
 //-> hex CharRef:c, ';'          =PCB.currentCharRef = c, PCB.tokenType = Pcb::charRefType;
 -> hex CharRef, ';'

// "&#" followed by one or more decimal digits, accumulated into an int.
(int) decimal CharRef
 -> "&#", '0-9':d          =d-'0';
 -> decimal CharRef:n, '0-9':d          =10*n + d-'0';

// "&#x" followed by one or more hexadecimal digits, accumulated into an int.
// A decimal digit contributes d-'0'. A letter contributes (d&7) + 9, which
// maps 'a'..'f' and 'A'..'F' to 10..15. The first digit may be a letter as
// well, as in "&#xA;".
(int) hex CharRef
 -> "&#x", '0-9':d                      =d-'0';
 -> "&#x", 'a-f' + 'A-F':d              =(d&7) + 9;
 -> hex CharRef:n, '0-9':d              =16*n + d-'0';
 -> hex CharRef:n, 'a-f' + 'A-F':d      =16*n + (d&7) + 9;

//Entity Reference

// [67]
// Either kind of reference. Used by EntityValue [9].
(int) Reference
 -> EntityRef | CharRef

// [68]
// A general entity reference. The entity name is stored in
// currentEntityName and the event type is set to entityRefType; the
// reference is not expanded.
(int) EntityRef
 -> '&', Name:s, ';'  =PCB.currentEntityName = s, PCB.tokenType = Pcb::entityRefType;

// [69]
// A parameter entity reference. It is parsed but nothing is recorded.
PEReference
 -> '%', Name, ';'


// Entity Declaration

// [70]
// Entity declarations are parsed but nothing is recorded.
EntityDecl
 -> GEDecl | PEDecl

// [71]
// General entity declaration. The optional white space before '>' is part
// of EntityDef below.
GEDecl
 -> "<!ENTITY", S, Name, S, EntityDef, '>'

// [72]
// Parameter entity declaration.
PEDecl
 -> "<!ENTITY", S, '%', S, Name, S, PEDef, S?, '>'

// [73]
// Two rules separated by '|': a literal value with optional trailing white
// space, or an external identifier optionally followed by an NDATA
// declaration or white space.
EntityDef
 -> EntityValue, S? | ExternalID, [NDataDecl | S]

// [74]
PEDef
 -> EntityValue | ExternalID


// External Entity Declaration

// [75]
ExternalID
 -> "SYSTEM", S, SystemLiteral
 -> "PUBLIC", S, PubidLiteral, S, SystemLiteral

// [76]
NDataDecl
 -> S, "NDATA", S, Name


// Text Declaration

// [77]
// Text declaration of an external entity: like XMLDecl, but the version is
// optional and the encoding is required.
TextDecl
 -> "<?xml", S, [VersionInfo, S], EncodingDecl, S?, "?>"


// Well-Formed External Parsed Entity

// [78]
// Not referenced by any other production in this file.
extParsedEnt
 -> TextDecl?, content

// [79]
// Not referenced by any other production in this file.
extPE
 -> TextDecl?, extSubsetDecl


// Encoding Declaration

// [80]
// encoding = "name" or encoding = 'name'
EncodingDecl
 -> "encoding", Eq, {'"', EncName, '"' | '\'', EncName, '\''}

// [81]
// An encoding name: a letter followed by letters, digits, '.', '_' or '-'.
//   EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
EncName
 -> 'a-z' + 'A-Z', ['a-z'+'A-Z'+'0-9'+'_'+'.' | '-']...


// Notation Declarations

// [82]
// Notation declaration. It is parsed but nothing is recorded. The optional
// white space before '>' belongs to whichever alternative is chosen.
NotationDecl
 -> "<!NOTATION", S, Name, S, {ExternalID, S? | PublicID}, '>'

// [83]
PublicID
 -> "PUBLIC", S, PubidLiteral, S?


// Characters

// [84]
// Letters are the base characters only; Ideographic is not defined here.
Letter = BaseChar           // | Ideographic       // No ideographs for now

// [85]
// A-Z, a-z and the 8 bit letters 0xc0..0xff, leaving out 0xd7 and 0xf7.
BaseChar =                             // only 8 bit characters for now
  0x41..0x5a +
  0x61..0x7a +
  0xc0..0xd6 +
  0xd8..0xf6 +
  0xf8..0xff

// [86]
// Ideographic =


// [87]
// CombiningChar =

// [88]
// The decimal digits '0'..'9'.
Digit = 0x30..0x39

// [89]
// The only extender in the 8 bit range.
Extender = 0xb7

{
  // Context capture: stores the control block's current line and column in
  // CONTEXT as a Context object.
  #define GET_CONTEXT CONTEXT = Context(PCB.line, PCB.column)

  // Short alias for the control block type; the reduction procedures use it
  // to name the TokenType values (e.g. Pcb::commentType).
  typedef parse_pcb_struct Pcb;

  // Construct a control block with no event pending and no DTD seen yet.
  // All other members rely on their own default construction.
  parse_pcb_struct::parse_pcb_struct()
    : tokenType(none),
      dtdPresent(0)
  {
  }

  // Append one white space character to spaceStack, storing a carriage
  // return as a newline. If text is already pending in textStack, the event
  // type is set to textType first.
  void parse_pcb_struct::spaceChar(int c) {
    int ch = c;
    if (ch == '\r') {
      ch = '\n';
    }
    if (textStack.size() != 0) {
      tokenType = textType;
    }
    spaceStack.concat(ch);
  }

  // Append one text character to textStack. If white space is already
  // pending in spaceStack, the event type is set to spaceType first.
  void parse_pcb_struct::textChar(int c) {
    if (spaceStack.size() != 0) {
      tokenType = spaceType;
    }
    char ch = (char) c;
    textStack.concat(ch);
  }

  // Normalize white space in place: leading and trailing runs of tab, CR,
  // LF and space are removed and every interior run is replaced by a single
  // space. The text is rewritten through the character pointer obtained
  // from s; a null pointer is left alone.
  void parse_pcb_struct::normalize(AgString &s) {
    const char *const blanks = "\t\r\n ";
    char *in = s;
    char *out = in;
    if (in == NULL) return;

    // Drop leading white space. The test for the terminator comes first
    // because strchr() would also match it.
    for (; *in != 0 && strchr(blanks, *in) != NULL; in++) {}

    while (*in != 0) {
      // Copy one word
      for (; *in != 0 && strchr(blanks, *in) == NULL; in++) *out++ = *in;
      // Skip the white space that follows it
      for (; *in != 0 && strchr(blanks, *in) != NULL; in++) {}
      // Separate from the next word, if there is one
      if (*in != 0) *out++ = ' ';
    }
    *out = 0;
  }

  // Check the attributes of a tag against the definitions registered for
  // its element, and normalize the values of non-CDATA attributes.
  //
  //   context  position reported with any warning
  //   name     element name, used to look up the definitions in elementMap
  //   map      attribute name/value map of the tag; updated in place
  //
  // Does nothing unless a document type declaration has been seen.
  // A missing required attribute is given a placeholder value and a warning
  // is pushed on warningList. A fixed attribute whose value differs from the
  // declared value also produces a warning; an attribute that merely has a
  // default value is never warned about.
  void parse_pcb_struct::checkAttributes(const Context &context, const AgString &name, Symtab &map) {
    if (!dtdPresent) return;
    Element &element = elementMap[name];
    AgBaseStack<Attribute> &attributeList = element.attributeList;
    int i;
    for (i = 0; i < attributeList.size(); i++) {
      Attribute &attribute = attributeList[i];
      switch (attribute.def.presence) {
        case Default::REQUIRED: {
          const char *value = map.findValue(attribute.name);
          if (value) continue;
          map[attribute.name] = "VALUE NOT SPECIFIED";
          char buf[200];
          // The name comes from the document: bound the formatted text
          snprintf(buf, sizeof buf, "Required attribute '%s' is missing", (const char *)attribute.name);
          warningList.push(Warning(context, buf));
          break;
        }
        case Default::DEFAULT: {
          const char *value = map[attribute.name];
          // NOTE(review): the declared default is used for the comparison
          // only; it is not written back to the map. Confirm whether callers
          // expect defaulted attributes to appear in the map.
          if (value == 0) value = attribute.def.value;
          // Only #FIXED attributes are constrained to the declared value
          if (!attribute.def.fixed || attribute.def.value == value) continue;
          char buf[200];
          snprintf(buf, sizeof buf, "Value of fixed attribute '%s' does not match default value ",
              (const char *) attribute.name);
          warningList.push(Warning(context, buf));
          break;
        }
      }
    }
    // Second pass: every attribute type other than CDATA has its value
    // white-space normalized
    for (i = 0; i < attributeList.size(); i++) {
      Attribute &attribute = attributeList[i];
      if (attribute.type != Attribute::CDATA) normalize(map[attribute.name]);
    }
  }

  // Warn about an attribute that is specified more than once in a tag.
  //
  //   context  position reported with the warning
  //   value    value already recorded for the attribute, or NULL if none
  //   p        the name/value pair currently being processed
  //
  // Returns 0 without doing anything when value is NULL. Otherwise pushes a
  // warning on warningList and returns 1, telling the caller to ignore p.
  int parse_pcb_struct::warnAttval(const Context &context, const char *value, const AttvalPair &p) {
    if (value == NULL) return 0;
    char buf[200];
    // Name and value both come from the document and can be arbitrarily
    // long: bound the formatted text to the buffer
    snprintf(buf, sizeof buf, "Value of '%s' ignored. Previously specified as '%s'",
             (const char *) p.name, value);
    warningList.push(Warning(context, buf));
    return 1;
  }

  void parse_pcb_struct::startTag(const Context &context, const AgString &name, AttvalList &list) {
    Symtab map;
    for (int i = 0; i < list.size(); i++) {
      AgString &value = map[list[i].name];
      if (warnAttval(context, value, list[i])) continue;
      value = list[i].value;
    }
    currentTagName = name;
    checkAttributes(context, name, map);
    currentMap = map;
    tagStack.push(name);
    mapStack.push(map);
    tokenType = startType;
  }

  void parse_pcb_struct::emptyTag(const Context &context, const AgString &name, AttvalList &list) {
    Symtab map;
    for (int i = 0; i < list.size(); i++) {
      AgString &value = map[list[i].name];
      if (warnAttval(context, value, list[i])) continue;
      map[list[i].name] = list[i].value;
    }
    checkAttributes(context, name, map);
    currentTagName = name;
    currentMap = map;
    tokenType = emptyType;
  }

  // Handle an end tag: restore the attribute map of the element being
  // closed and compare the tag name with the innermost open element. A
  // match reports an endType event; a mismatch records a nesting error,
  // stops the parse with a semantic error and reports errorType.
  void parse_pcb_struct::endTag(const AgString &name) {
    currentMap = mapStack.pop();
    currentTagName = name;
    if (name != tagStack.pop()) {
      tokenType = errorType;
      exit_flag = AG_SEMANTIC_ERROR_CODE;
      error_message = "Element nesting error";
    } else {
      tokenType = endType;
    }
  }
  // Record the attribute definitions of one <!ATTLIST ...> declaration.
  //
  //   name           element the definitions belong to
  //   attributeList  definitions in declaration order
  //
  // The first definition of an attribute is kept. A later definition of the
  // same attribute is ignored and a warning naming the position of the first
  // one is pushed on warningList.
  void parse_pcb_struct::registerAttributes(const AgString &name, const AgBaseStack<Attribute> &attributeList) {
    Element &element = elementMap[name];
    if (element.name.size() == 0) element.name = name;
    int i;
    for (i = 0; i < attributeList.size(); i++) {
      AgString attributeName = attributeList[i].name;
      // An index other than -1 means the attribute has been defined before
      int &index = element.attributeIndex[attributeName];
      if (index != -1) {
        Attribute &attribute = element.attributeList[index];
        Context previous = attribute.context;
        char buf[200];
        // Both names come from the document and can be arbitrarily long:
        // bound the formatted text to the buffer
        snprintf(buf, sizeof buf, "Redefinition of '%s::%s' ignored. Defined at line %d, column %d",
                 (const char *) name,
                 (const char *) attributeName, previous.line, previous.column);
        warningList.push(Warning(attributeList[i].context, buf));
        continue;
      }
      index = element.attributeList.size();
      element.attributeList.push(attributeList[i]);
    }
  }
}