Mercurial > ~dholland > hg > ag > index.cgi
view tests/agcl/parsifal/xmlp8.syn @ 18:562c313f14f4
some minor updates for 2022
author | David A. Holland |
---|---|
date | Tue, 31 May 2022 02:03:50 -0400 |
parents | 13d2b8934445 |
children |
line wrap: on
line source
{
/*
 * XML Syntax
 * Transcribed from
 *   Extensible Markup Language (XML) 1.0
 *   W3C Recommendation 10-February-1998
 *   http://www.w3.org/TR/1998/REC-xml-19980210
 *
 * Transcription Copyright © Parsifal Software, 1999.
 *
 * Revision 1 fixed definition of [4] NameChar
 *
 * Revision 2, begun 1/24/00
 *
 * Fixed problem with mixed content declaration [51]
 * Fixed missing S in [52]
 * Removed unnecessary "sticky" statement
 */

#include "xmldef8.h"
#include <stdio.h>
}

// Configuration section: parser options plus the extra state and member
// functions added to the generated parser control block (parse_pcb_struct).
[
  reserve keywords {"--", "?>", "]]>"}
  test file mask = "*.xml"
  parser file name = "#.cpp"
  parser name = parse
  reentrant parser
  //line numbers
  event driven
  context type = Context
  extend pcb {
    AgStack<AgString> tagStack;
    AgStack<Symtab> mapStack;
    AgString textStack;
    AgString spaceStack;
    enum TokenType {
      none,
      spaceType,
      textType,
      commentType,
      startType,
      endType,
      emptyType,
      entityRefType,
      //charRefType,
      errorType
    } tokenType;
    AgString currentTagName;
    AgString currentEntityName;
    //int currentCharRef;
    AgString commentString;
    Symtab currentMap;
    AgStack<Warning> warningList;
    AgBaseMapString<Element> elementMap;
    int dtdPresent;

    parse_pcb_struct();
    // NOTE(review): declared without a return type, exactly as in the
    // original -- confirm the intended signature.
    parse(char *input, AgString *);
    void spaceChar(int);
    void textChar(int);
    void startTag(const Context &, const AgString &, AttvalList &);
    void emptyTag(const Context &, const AgString &, AttvalList &);
    void endTag(const AgString &);
    void registerAttributes(const AgString &name, const AgBaseStack<Attribute> &AttributeList);
    void checkAttributes(const Context &, const AgString &name, Symtab &map);
    int warnAttval(const Context &context, const char *, const AttvalPair &p);
    static void normalize(AgString &s);
  }
  wrapper {AttvalPair, AgString, AttvalList, Attribute, AttributeList, Default}
]

// Document
// [1]
document $
 -> prolog, element, Misc?..., eof

// For completeness
eof = -1

// Character Range
// [2]
//Char = 0x9 + 0xA + 0xD + 0x20..0xd7ff + 0xe000..0xfffd + 0x0000-0x10ffff
// 0xD is part of production [2]; without it a carriage return inside a
// comment or processing instruction is a syntax error.
Char = 0x9 + 0xA + 0xD + 0x20..0xff           // 8 bit characters only pro tem

// White Space
// [3]
SpaceChar = 0x20 + 0x9 + 0xd + 0xa

S
 -> SpaceChar...

// Names and Tokens
// [4]
NameChar = Letter + Digit + '.' + '-' + '_' + ':' + Extender
//  + CombiningChar

// [5]
(AgString) Name
 -> Letter + '_' + ':':c                       =AgString().concat(c);
 -> Name:s, NameChar:c                         =s.concat(c);

// [6]
Names
 -> Name
 -> Names, S, Name

// [7]
Nmtoken
 -> NameChar...

// [8]
Nmtokens
 -> Nmtoken, [S, Nmtoken]...

// Literals
// [9]
EntityValue
 -> '"', [~(eof + '%' + '&' + '"') | PEReference | Reference]..., '"'
 -> '\'', [~(eof + '%' + '&' + '\'') | PEReference | Reference]..., '\''

// [10]
// Production [10] excludes '<' and '&' (and the closing quote) from attribute
// values; '%' is ordinary data here, so values such as "100%" are accepted.
// Each white space character in the value is stored as a single blank.
(AgString) AttValue
 -> '"', dq AttValString:s, '"'                =s;
 -> '\'', sq AttValString:s, '\''              =s;

(AgString) dq AttValString
 ->                                            =AgString();
 -> dq AttValString:s, ~(eof + '<' + '&' + '"' + SpaceChar):c   =s.concat(c);
 -> dq AttValString:s, SpaceChar               =s.concat(' ');
//-> dq AttValString:s, CharRef                =s;
 -> dq AttValString:s, CharRef:c               =s.concat(c);
 -> dq AttValString:s, EntityRef               =s;

(AgString) sq AttValString
 ->                                            =AgString();
 -> sq AttValString:s, ~(eof + '<' + '&' + '\'' + SpaceChar):c  =s.concat(c);
 -> sq AttValString:s, SpaceChar               =s.concat(' ');
 -> sq AttValString:s, CharRef:c               =s.concat(c);
 -> sq AttValString:s, EntityRef               =s;

// [11]
SystemLiteral
 -> '"', ~(eof + '"')?..., '"'
 -> '\'', ~(eof + '\'')?..., '\''

// [12]
PubidLiteral
 -> '"', PubidChar?..., '"'
 -> '\'', PubidChar-'\''?..., '\''

// [13]
// '=' is listed in production [13] and is included here.
PubidChar = 0x20 + 0xd + 0xa + 'a-z' + 'A-Z' + '0-9' + '-' + '\'' + '(' + ')' + '+' + ',' + '.' + '/' + ':' + '=' + '?' + ';' + '!' + '*' + '#' + '@' + '$' + '_' + '%'

// Character Data
// [14]
CharData = ~(eof + '<' + '&')
// Note that the iteration is in the usage
// Note that use of keyword "]]>" will take care of CDATA problem

// Comments
// [15]
Comment
 -> "<!--", comment text:t, "--", '>'          =PCB.tokenType = Pcb::commentType, PCB.commentString = t;

(AgString) comment text
 ->                                            =AgString();
 -> comment text:s, Char:c                     =s.concat(c);

// Processing Instructions
// [16]
PI
 -> "<?", PITarget, [S, [Char-SpaceChar, Char?...]], "?>"

// [17]
PITarget
 -> Name                      // Name lookup mechanism should reject "xml"

// CDATA Sections
// [18]
CDSect
 -> CDStart, CData, CDEnd

// [19]
CDStart
 -> "<![CDATA["

// [20]
CData
 -> [Char - SpaceChar:c =PCB.textChar(c); | "\r\n" =PCB.spaceChar('\n'); | SpaceChar:c =PCB.spaceChar(c);]...
// Keyword recognition logic overrides character recognition

// [21]
CDEnd
 -> "]]>"

// Prolog
// [22]
prolog
 -> XMLDecl?, Misc?..., [doctypedecl, Misc?...]

// [23]
XMLDecl
// -> "<?xml", VersionInfo, EncodingDecl?, SDDecl?, S?, "?>"
 -> "<?xml", S, VersionInfo, VersionDecl, "?>"

VersionDecl
 -> S?
 -> S, EncodingDecl, S?
 -> S, EncodingDecl, S, SDDecl, S?
 -> S, SDDecl, S?

// [24]
VersionInfo
 -> "version", Eq, {'\'', VersionNum, '\'' | '"', VersionNum, '"'}

// [25]
Eq
 -> S?, '=', S?

// [26]
VersionNum
 -> 'a-z' + 'A-Z' + '0-9' + '_' + '.' + ':' + '-'...

// [27]
Misc
// -> Comment | PI | S
 -> Comment | PI | SpaceChar                   // Avoid double recursion

// Document Type Definition
// [28]
doctypedecl
 -> "<!DOCTYPE", S, Name:n, [S, [ExternalID, S?]], [ '[', [markupdecl | PEReference | SpaceChar]..., ']', S?], '>'   =PCB.dtdPresent=1;

// [29]
markupdecl
 -> elementdecl
 -> AttlistDecl
 -> EntityDecl
 -> NotationDecl
 -> PI
 -> Comment

// External Subset
// [30]
extSubset
 -> TextDecl?, extSubsetDecl

// [31]
extSubsetDecl
 -> [markupdecl | conditionalSect | PEReference | SpaceChar]...
// Standalone Document Declaration
// [32]
// The four quoted values are grouped in braces so that every alternative is
// preceded by "standalone" and Eq. With a bare '|' at rule level each quoted
// keyword would be a complete SDDecl on its own.
SDDecl
 -> "standalone", Eq, {"'yes'" | "\"yes\"" | "'no'" | "\"no\""}

// Language Identification
// [33]
LanguageId
 -> Langcode, ['-', Subcode]...

// [34]
Langcode
 -> ISO639Code | IanaCode | UserCode

// [35]
ISO639Code
 -> 'a-z' + 'A-Z' -('i' + 'I' + 'x' + 'X'), 'a-z' + 'A-Z'

// [36]
IanaCode
 -> 'i' + 'I', '-', 'a-z' + 'A-Z'...

// [37]
UserCode
 -> 'x' + 'X', '-', 'a-z' + 'A-Z'...

// [38]
Subcode
 -> 'a-z' + 'A-Z'...

// Element
// [39]
element
 -> EmptyElementTag
 -> STag, content, ETag

// Start-Tags, End-Tags, and Empty-Element Tags
// Start-tag
// [40]
STag
 -> '<', Name:n, AttributeList:l, S?, '>'      =PCB.startTag(CONTEXT, n, l);

(AttvalList) AttributeList
 ->                                            =AttvalList();
 -> AttributeList:list, S, Attribute:a         =list.push(a);

// [41]
(AttvalPair) Attribute
 -> Name:n, Eq, AttValue:s                     =AttvalPair(n, s, CONTEXT);

// End-tag
// [42]
ETag
 -> "</", Name:n, S?, '>'                      =PCB.endTag(n);

// Content of Elements
// [43]
// Production [43] allows empty content, so the iteration is optional;
// otherwise <a></a> would be rejected.
content
 -> content unit?...

content unit
 -> element
 -> CharData - SpaceChar:c                     =PCB.textChar(c);
 -> CharRef:c                                  =PCB.textChar(c);
 -> EntityRef
 -> SpaceChar:c                                =PCB.spaceChar(c);
 -> "\r\n"                                     =PCB.spaceChar('\n');
 -> CDSect
 -> PI
 -> Comment

// Tags for Empty Elements
// [44]
EmptyElementTag
 -> '<', Name:n, AttributeList:l, S?, "/>"     =PCB.emptyTag(CONTEXT, n, l);

// Element Type Declaration
// [45]
elementdecl
 -> "<!ELEMENT", S, Name, S, contentspec, S?, '>'

// [46]
contentspec
 -> "EMPTY" | "ANY" | Mixed | Children

// Element-content Models
// [47]
Children
 -> {choice | seq}, ['?' | '*' | '+']

// [48]
cp
 -> {Name | choice | seq}, ['?' | '*' | '+']

// [49]
choice
 -> '(', S?, cp, S?, ['|', S?, cp, S?]..., ')'

// [50]
seq
 -> '(', S?, cp, S?, {',', S?, cp, S?}..., ')'

// Mixed-content Declaration
// [51]
Mixed
 -> '(', S?, "#PCDATA", ['|', S?, Name | S]/..., ")*"
 -> '(', S?, "#PCDATA", S?, ')'

// Attribute-list Declaration
// [52]
AttlistDecl
 -> "<!ATTLIST", S, Name:n, AttDefs:l, S?, '>' =PCB.registerAttributes(n, l);

(AttributeList) AttDefs
 -> AttDef:a                                   =AttributeList().push(a);
 -> AttDefs:l, AttDef:a                        =l.push(a);

// [53]
(Attribute) AttDef
 -> S, Name:n, S, AttType:t, S, DefaultDecl:d  =Attribute(RULE_CONTEXT[1], n, t, d);

// Attribute Types
// [54]
(Attribute::Type) AttType
 -> StringType
 -> TokenizedType
 -> EnumeratedType

// [55]
(Attribute::Type) StringType
 -> "CDATA"                                    =Attribute::CDATA;

// [56]
(Attribute::Type) TokenizedType
 -> "ID"                                       =Attribute::ID;
 -> "IDREF"                                    =Attribute::IDREF;
 -> "IDREFS"                                   =Attribute::IDREFS;
 -> "ENTITY"                                   =Attribute::ENTITY;
 -> "ENTITIES"                                 =Attribute::ENTITIES;
 -> "NMTOKEN"                                  =Attribute::NMTOKEN;
 -> "NMTOKENS"                                 =Attribute::NMTOKENS;

// [57]
(Attribute::Type) EnumeratedType
 -> NotationType                               =Attribute::NOTATION;
 -> Enumeration                                =Attribute::ENUMERATION;

// [58]
NotationType
 -> "NOTATION", S, '(', Name, S?, ['|', S?, Name, S?]..., ')'

// [59]
Enumeration
 -> '(', Nmtoken, S?, ['|', S?, Nmtoken, S?]..., ')'

// Attribute Defaults
// [60]
(Default) DefaultDecl
 -> "#REQUIRED"                                =Default(Default::REQUIRED);
 -> "#IMPLIED"                                 =Default();
 -> AttValue:v                                 =Default(v);
 -> "#FIXED", S, AttValue:v                    =Default(v,1);

// Conditional Section
// [61]
conditionalSect
 -> includeSect | ignoreSect

// [62]
includeSect
 -> "<![", S?, "INCLUDE", S?, '[', extSubsetDecl, "]]>"

// [63]
ignoreSect
 -> "<![", S?, "IGNORE", S?, '[', ignoreSectContents?..., "]]>"

// [64]
ignoreSectContents
 -> Ignore, ["<![", ignoreSectContents, "]]>", Ignore]...

// [65]
Ignore
 -> Char?...
//Character Reference
// [66]
// The value of a CharRef is the numeric character code accumulated by the
// decimal/hex helper productions below.
(int) CharRef
//-> "&#", '0-9'..., ';'
//-> "&#x", '0-9' + 'a-f' + 'A-F'..., ';'
//-> decimal CharRef:c, ';'                    =PCB.currentCharRef = c, PCB.tokenType = Pcb::charRefType;
 -> decimal CharRef, ';'
//-> hex CharRef:c, ';'                        =PCB.currentCharRef = c, PCB.tokenType = Pcb::charRefType;
 -> hex CharRef, ';'

(int) decimal CharRef
 -> "&#", '0-9':d                              =d-'0';
 -> decimal CharRef:n, '0-9':d                 =10*n + d-'0';

// A hex reference may begin with any hex digit (e.g. "&#xA;"), and a decimal
// digit contributes its numeric value (d-'0'), not its character code.
// For letters, (d&7) + 9 maps 'a'..'f' and 'A'..'F' to 10..15.
(int) hex CharRef
 -> "&#x", '0-9':d                             =d-'0';
 -> "&#x", 'a-f' + 'A-F':d                     =(d&7) + 9;
 -> hex CharRef:n, '0-9':d                     =16*n + d-'0';
 -> hex CharRef:n, 'a-f' + 'A-F':d             =16*n + (d&7) + 9;

//Entity Reference
// [67]
(int) Reference
 -> EntityRef | CharRef

// [68]
(int) EntityRef
 -> '&', Name:s, ';'                           =PCB.currentEntityName = s, PCB.tokenType = Pcb::entityRefType;

// [69]
PEReference
 -> '%', Name, ';'

// Entity Declaration
// [70]
EntityDecl
 -> GEDecl | PEDecl

// [71]
GEDecl
 -> "<!ENTITY", S, Name, S, EntityDef, '>'

// [72]
PEDecl
 -> "<!ENTITY", S, '%', S, Name, S, PEDef, S?, '>'

// [73]
EntityDef
 -> EntityValue, S? | ExternalID, [NDataDecl | S]

// [74]
PEDef
 -> EntityValue | ExternalID

// External Entity Declaration
// [75]
ExternalID
 -> "SYSTEM", S, SystemLiteral
 -> "PUBLIC", S, PubidLiteral, S, SystemLiteral

// [76]
NDataDecl
 -> S, "NDATA", S, Name

// Text Declaration
// [77]
TextDecl
 -> "<?xml", S, [VersionInfo, S], EncodingDecl, S?, "?>"

// Well-Formed External Parsed Entity
// [78]
extParsedEnt
 -> TextDecl?, content

// [79]
extPE
 -> TextDecl?, extSubsetDecl

// Encoding Declaration
// [80]
EncodingDecl
 -> "encoding", Eq, {'"', EncName, '"' | '\'', EncName, '\''}

// [81]
// '.' is one of the characters production [81] allows after the first letter.
EncName
 -> 'a-z' + 'A-Z', ['a-z'+'A-Z'+'0-9'+'.'+'_' | '-']...

// Notation Declarations
// [82]
NotationDecl
 -> "<!NOTATION", S, Name, S, {ExternalID, S? | PublicID}, '>'

// [83]
PublicID
 -> "PUBLIC", S, PubidLiteral, S?
// Characters
// [84]
Letter = BaseChar // | Ideographic             // No ideographs for now

// [85]
// only 8 bit characters for now
BaseChar = 0x41..0x5a + 0x61..0x7a + 0xc0..0xd6 + 0xd8..0xf6 + 0xf8..0xff

// [86]
// Ideographic =

// [87]
// CombiningChar =

// [88]
Digit = 0x30..0x39

// [89]
Extender = 0xb7

{
#define GET_CONTEXT CONTEXT = Context(PCB.line, PCB.column)

typedef parse_pcb_struct Pcb;

// Construct a control block with no pending token and no DTD seen yet.
parse_pcb_struct::parse_pcb_struct()
  : tokenType(Pcb::none), dtdPresent(0)
{
  // Nothing to do
}

// Append one white space character to spaceStack; '\r' is stored as '\n'.
// If textStack already holds characters, tokenType is set to textType first.
void parse_pcb_struct::spaceChar(int c) {
  if (c == '\r') c = '\n';
  if (textStack.size()) tokenType = textType;
  spaceStack.concat(c);
}

// Append one text character to textStack.
// If spaceStack already holds characters, tokenType is set to spaceType first.
void parse_pcb_struct::textChar(int c) {
  if (spaceStack.size()) tokenType = spaceType;
  textStack.concat((char)c);
}

// Normalize white space in place: leading and trailing runs of tab, CR, LF
// and blank are removed and every interior run is replaced by a single blank.
// The string is rewritten through its own character buffer; a string whose
// buffer pointer is NULL is left untouched.
void parse_pcb_struct::normalize(AgString &s) {
  char *readPointer = s;
  char *writePointer = readPointer;
  if (readPointer == NULL) return;
  // Skip leading white space.
  while (*readPointer && strchr("\t\r\n ", *readPointer)) readPointer++;
  while (*readPointer) {
    // Copy one word.
    while (*readPointer && strchr("\t\r\n ", *readPointer) == NULL) {
      *writePointer++ = *readPointer++;
    }
    // Skip the white space after it; emit one blank only if more text follows.
    while (*readPointer && strchr("\t\r\n ", *readPointer)) readPointer++;
    if (*readPointer) *writePointer++ = ' ';
  }
  *writePointer = 0;
}

// Check the attributes collected for element `name` against the attribute
// declarations registered for it, pushing a Warning (located at `context`)
// for each problem found. Does nothing when no DTD has been seen.
//
//  - A REQUIRED attribute with no value gets the placeholder value
//    "VALUE NOT SPECIFIED" and a warning.
//  - A defaulted attribute declared #FIXED whose value (or, when absent, the
//    declared default) differs from the declared value gets a warning.
//    Defaulted attributes that are not #FIXED are never warned about.
//  - Finally every declared attribute whose type is not CDATA has its value
//    in `map` white-space normalized.
void parse_pcb_struct::checkAttributes(const Context &context, const AgString &name, Symtab &map) {
  if (!dtdPresent) return;
  Element &element = elementMap[name];
  AgBaseStack<Attribute> &attributeList = element.attributeList;
  int i;
  for (i = 0; i < attributeList.size(); i++) {
    Attribute &attribute = attributeList[i];
    switch (attribute.def.presence) {
      case Default::REQUIRED: {
        const char *value = map.findValue(attribute.name);
        if (value) continue;
        map[attribute.name] = "VALUE NOT SPECIFIED";
        char buf[200];
        // snprintf: attribute names are unbounded, the buffer is not.
        snprintf(buf, sizeof buf, "Required attribute '%s' is missing", (const char *)attribute.name);
        warningList.push(Warning(context, buf));
        break;
      }
      case Default::DEFAULT: {
        const char *value = map[attribute.name];
        if (value == 0) value = attribute.def.value;
        // Only #FIXED attributes are constrained to the declared value.
        if (!attribute.def.fixed || attribute.def.value == value) continue;
        char buf[200];
        snprintf(buf, sizeof buf, "Value of fixed attribute '%s' does not match default value ", (const char *) attribute.name);
        warningList.push(Warning(context, buf));
        break;
      }
    }
  }
  for (i = 0; i < attributeList.size(); i++) {
    Attribute &attribute = attributeList[i];
    if (attribute.type != Attribute::CDATA) normalize(map[attribute.name]);
  }
}

// Report a duplicate attribute specification.
// `value` is the value already recorded for attribute p.name, or NULL if
// there is none. Returns 0 when `value` is NULL; otherwise pushes a warning
// saying the new value is ignored and returns 1.
int parse_pcb_struct::warnAttval(const Context &context, const char *value, const AttvalPair &p) {
  if (value == NULL) return 0;
  char buf[200];
  // snprintf: both the name and the previous value are unbounded.
  snprintf(buf, sizeof buf, "Value of '%s' ignored. Previously specified as '%s'", (const char *) p.name, value);
  warningList.push(Warning(context, buf));
  return 1;
}

// Handle a start tag: build the attribute map from `list` (the first value
// given for a name wins, later ones produce a warning), check it against the
// declarations, push the tag name and map on their stacks, and set tokenType
// to startType.
void parse_pcb_struct::startTag(const Context &context, const AgString &name, AttvalList &list) {
  Symtab map;
  for (int i = 0; i < list.size(); i++) {
    AgString &value = map[list[i].name];
    if (warnAttval(context, value, list[i])) continue;
    value = list[i].value;
  }
  currentTagName = name;
  checkAttributes(context, name, map);
  currentMap = map;
  tagStack.push(name);
  mapStack.push(map);
  tokenType = startType;
}

// Handle an empty-element tag: same attribute processing as startTag, but
// nothing is pushed on the tag/map stacks and tokenType becomes emptyType.
void parse_pcb_struct::emptyTag(const Context &context, const AgString &name, AttvalList &list) {
  Symtab map;
  for (int i = 0; i < list.size(); i++) {
    AgString &value = map[list[i].name];
    if (warnAttval(context, value, list[i])) continue;
    value = list[i].value;
  }
  checkAttributes(context, name, map);
  currentTagName = name;
  currentMap = map;
  tokenType = emptyType;
}

// Handle an end tag: pop the attribute map and tag name pushed by the
// matching startTag. If the popped name differs from `name`, record
// "Element nesting error", set exit_flag to AG_SEMANTIC_ERROR_CODE and
// tokenType to errorType; otherwise set tokenType to endType.
void parse_pcb_struct::endTag(const AgString &name) {
  currentTagName = name;
  currentMap = mapStack.pop();
  if (name != tagStack.pop()) {
    error_message = "Element nesting error";
    exit_flag = AG_SEMANTIC_ERROR_CODE;
    tokenType = errorType;
    return;
  }
  tokenType = endType;
}

// Record the attribute declarations of one <!ATTLIST ...> for element `name`.
// An attribute whose index entry is not -1 is treated as already declared:
// the new declaration is ignored and a warning naming the location of the
// earlier one is pushed. Otherwise the declaration is appended to the
// element's attribute list and its position stored in the index.
void parse_pcb_struct::registerAttributes(const AgString &name, const AgBaseStack<Attribute> &attributeList) {
  Element &element = elementMap[name];
  if (element.name.size() == 0) element.name = name;
  int i;
  for (i = 0; i < attributeList.size(); i++) {
    AgString attributeName = attributeList[i].name;
    int &index = element.attributeIndex[attributeName];
    if (index != -1) {
      Attribute &attribute = element.attributeList[index];
      Context previous = attribute.context;
      char buf[200];
      // snprintf: element and attribute names are unbounded.
      snprintf(buf, sizeof buf, "Redefinition of '%s::%s' ignored. Defined at line %d, column %d",
               (const char *) name, (const char *) attributeName, previous.line, previous.column);
      warningList.push(Warning(attributeList[i].context, buf));
      continue;
    }
    index = element.attributeList.size();
    element.attributeList.push(attributeList[i]);
  }
}
}