comparison tests/agcl/parsifal/xmlp8.syn @ 0:13d2b8934445

Import AnaGram (near-)release tree into Mercurial.
author David A. Holland
date Sat, 22 Dec 2007 17:52:45 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:13d2b8934445
1 {/*
2 * XML Syntax
3 * Transcribed from
4 * Extensible Markup Language (XML) 1.0
5 * W3C Recommendation 10-February-1998
6 * http://www.w3.org/TR/1998/REC-xml-19980210
7 *
8 * Transcription Copyright © Parsifal Software, 1999.
9 *
10 * Revision 1 fixed definition of [4] NameChar
11 *
12 * Revision 2, begun 1/24/00
13 *
14 * Fixed problem with mixed content declaration [51]
15 * Fixed missing S in [52]
16 * Removed unnecessary "sticky" statement
17 */
18
19 #include "xmldef8.h"
20 #include <stdio.h>
21
22 }
23
24
25 [
26 reserve keywords {"--", "?>", "]]>"}
27 test file mask = "*.xml"
28 parser file name = "#.cpp"
29 parser name = parse
30 reentrant parser
31 //line numbers
32 event driven
33 context type = Context
34 extend pcb {
35 AgStack<AgString> tagStack;
36 AgStack<Symtab> mapStack;
37 AgString textStack;
38 AgString spaceStack;
39 enum TokenType {
40 none,
41 spaceType,
42 textType,
43 commentType,
44 startType,
45 endType,
46 emptyType,
47 entityRefType,
48 //charRefType,
49 errorType
50 } tokenType;
51 AgString currentTagName;
52 AgString currentEntityName;
53 //int currentCharRef;
54 AgString commentString;
55 Symtab currentMap;
56 AgStack<Warning> warningList;
57 AgBaseMapString<Element> elementMap;
58 int dtdPresent;
59
60 parse_pcb_struct();
61 parse(char *input, AgString *);
62 void spaceChar(int);
63 void textChar(int);
64 void startTag(const Context &, const AgString &, AttvalList &);
65 void emptyTag(const Context &, const AgString &, AttvalList &);
66 void endTag(const AgString &);
67 void registerAttributes(const AgString &name, const AgBaseStack<Attribute> &AttributeList);
68 void checkAttributes(const Context &, const AgString &name, Symtab &map);
69 int warnAttval(const Context &context, const char *, const AttvalPair &p);
70 static void normalize(AgString &s);
71 }
72 wrapper {AttvalPair,
73 AgString,
74 AttvalList,
75 Attribute,
76 AttributeList,
77 Default}
78 ]
79
80 // Document
81
82 // [1]
83 document $
84 -> prolog, element, Misc?..., eof
85
86 // For completeness
87 eof = -1
88
89
90 // Character Range
91
92 // [2]
//Char = 0x9 + 0xA + 0xD + 0x20..0xd7ff + 0xe000..0xfffd + 0x10000..0x10ffff
// XML production [2] also admits carriage return (0xD). It is included here so
// that Char matches the specification within the 8 bit range; rules that need
// to treat white space separately subtract SpaceChar from Char.
Char = 0x9 + 0xA + 0xD + 0x20..0xff // 8 bit characters only pro tem
95
96
97 // White Space
98 // [3]
99 SpaceChar = 0x20 + 0x9 + 0xd + 0xa
100
101 S
102 -> SpaceChar...
103
104
105 // Names and Tokens
106
107 // [4]
108 NameChar = Letter + Digit + '.' + '-' + '_' + ':' + Extender // + CombiningChar
109
110 // [5]
111 (AgString) Name
112 -> Letter + '_' + ':':c =AgString().concat(c);
113 -> Name:s, NameChar:c =s.concat(c);
114
115 // [6]
116 Names
117 -> Name
118 -> Names, S, Name
119
120 // [7]
121 Nmtoken
122 -> NameChar...
123
124 // [8]
125 Nmtokens
126 -> Nmtoken, [S, Nmtoken]...
127
128
129 // Literals
130
131 // [9]
132 EntityValue
133 -> '"', [~(eof + '%' + '&' + '"') | PEReference | Reference]..., '"'
134 -> '\'', [~(eof + '%' + '&' + '\'') | PEReference | Reference]..., '\''
135
136 // [10]
137 (AgString) AttValue
138 -> '"', dq AttValString:s, '"' =s;
139 -> '\'', sq AttValString:s, '\'' =s;
140
// Body of a double-quoted attribute value, accumulated into an AgString.
// XML [10] forbids '<' and '&' inside the value (not '%', which belongs to
// the EntityValue rule [9]), so a literal '%' is accepted here.
// Each white space character is replaced by a single blank.
(AgString) dq AttValString
  -> =AgString();
  -> dq AttValString:s, ~(eof + '<' + '&' + '"' + SpaceChar):c =s.concat(c);
  -> dq AttValString:s, SpaceChar =s.concat(' ');
  //-> dq AttValString:s, CharRef =s;
  -> dq AttValString:s, CharRef:c =s.concat(c);
  // An entity reference is recognized but contributes nothing to the value.
  -> dq AttValString:s, EntityRef =s;
148
// Body of a single-quoted attribute value, accumulated into an AgString.
// XML [10] forbids '<' and '&' inside the value (not '%', which belongs to
// the EntityValue rule [9]), so a literal '%' is accepted here.
// Each white space character is replaced by a single blank.
(AgString) sq AttValString
  -> =AgString();
  -> sq AttValString:s, ~(eof + '<' + '&' + '\'' + SpaceChar):c =s.concat(c);
  -> sq AttValString:s, SpaceChar =s.concat(' ');
  -> sq AttValString:s, CharRef:c =s.concat(c);
  // An entity reference is recognized but contributes nothing to the value.
  -> sq AttValString:s, EntityRef =s;
155
156 // [11]
157 SystemLiteral
158 -> '"', ~(eof + '"')?..., '"'
159 -> '\'', ~(eof + '\'')?..., '\''
160
161 // [12]
162 PubidLiteral
163 -> '"', PubidChar?..., '"'
164 -> '\'', PubidChar-'\''?..., '\''
165
166 // [13]
// Characters permitted in a public identifier literal (XML [13]).
// '=' is part of the set defined by the specification.
PubidChar = 0x20 + 0xd + 0xa + 'a-z' + 'A-Z' + '0-9' +
            '-' + '\'' + '(' + ')' + '+' + ',' + '.' +
            '/' + ':' + '=' + '?' + ';' + '!' + '*' + '#' +
            '@' + '$' + '_' + '%'
171
172
173 // Character Data
174
175 // [14]
176 CharData = ~(eof + '<' + '&') // Note that the iteration is in the usage
177 // Note that use of keyword "]]>" will take care of CDATA problem
178
179
180 // Comments
181
182 // [15]
183 Comment
184 -> "<!--", comment text:t, "--", '>' =PCB.tokenType = Pcb::commentType, PCB.commentString = t;
185
186 (AgString) comment text
187 -> =AgString();
188 ->comment text:s, Char:c =s.concat(c);
189
190
191 // Processing Instructions
192
193 // [16]
194 PI
195 -> "<?", PITarget, [S, [Char-SpaceChar, Char?...]], "?>"
196
197 // [17]
198 PITarget
199 -> Name // Name lookup mechanism should reject "xml"
200
201
202 // CDATA Sections
203
204 // [18]
205 CDSect
206 -> CDStart, CData, CDEnd
207
208 // [19]
209 CDStart
210 -> "<![CDATA["
211
212 // [20]
213 CData
214 -> [Char - SpaceChar:c =PCB.textChar(c); |
215 "\r\n" =PCB.spaceChar('\n'); |
216 SpaceChar:c =PCB.spaceChar(c);]...
217 // Keyword recognition logic overrides character recognition
218
219 // [21]
220 CDEnd
221 -> "]]>"
222
223
224 // Prolog
225
226 // [22]
227 prolog
228 -> XMLDecl?, Misc?..., [doctypedecl, Misc?...]
229
230 // [23]
231 XMLDecl
232 // -> "<?xml", VersionInfo, EncodingDecl?, SDDecl?, S?, "?>"
233 -> "<?xml", S, VersionInfo, VersionDecl, "?>"
234
235 VersionDecl
236 -> S?
237 -> S, EncodingDecl, S?
238 -> S, EncodingDecl, S, SDDecl, S?
239 -> S, SDDecl, S?
240
241
242 // [24]
243 VersionInfo
244 -> "version", Eq, {'\'', VersionNum, '\'' | '"', VersionNum, '"'}
245
246 // [25]
247 Eq
248 -> S?, '=', S?
249
250 // [26]
251 VersionNum
252 -> 'a-z' + 'A-Z' + '0-9' + '_' + '.' + ':' + '-'...
253
254 // [27]
255 Misc
256 // -> Comment | PI | S
257 -> Comment | PI | SpaceChar // Avoid double recursion
258
259
260 // Document Type Definition
261
262 // [28]
263 doctypedecl
264 -> "<!DOCTYPE", S, Name:n,
265 [S, [ExternalID, S?]],
266 [ '[', [markupdecl | PEReference | SpaceChar]..., ']', S?], '>' =PCB.dtdPresent=1;
267
268 // [29]
269 markupdecl
270 -> elementdecl
271 -> AttlistDecl
272 -> EntityDecl
273 -> NotationDecl
274 -> PI
275 -> Comment
276
277
278 // External Subset
279
280 // [30]
281
282 extSubset
283 -> TextDecl?, extSubsetDecl
284
285 // [31]
286 extSubsetDecl
287 -> [markupdecl | conditionalSect | PEReference | SpaceChar]...
288
289
290 // Standalone Document Declaration
291
292 // [32]
// Standalone declaration: the keyword, '=', then a quoted yes/no literal.
// The alternatives are grouped in braces so that "standalone", Eq applies
// to every one of them rather than to the first alternative only.
SDDecl
  -> "standalone", Eq, {"'yes'" | "\"yes\"" | "'no'" | "\"no\""}
295
296
297 // Language Identification
298
299 // [33]
300 LanguageId
301 -> Langcode, ['-', Subcode]...
302
303 // [34]
304 Langcode
305 -> ISO639Code | IanaCode | UserCode
306
307 // [35]
308 ISO639Code
309 -> 'a-z' + 'A-Z' -('i' + 'I' + 'x' + 'X'), 'a-z' + 'A-Z'
310
311 // [36]
312 IanaCode
313 -> 'i' + 'I', '-', 'a-z' + 'A-Z'...
314
315 // [37]
316 UserCode
317 -> 'x' + 'X', '-', 'a-z' + 'A-Z'...
318
319 // [38]
320 Subcode
321 -> 'a-z' + 'A-Z'...
322
323
324 // Element
325
326 // [39]
327 element
328 -> EmptyElementTag
329 -> STag, content, ETag
330
331
332 // Start-Tags, End-Tags, and Empty-Element Tags
333
334 // Start-tag
335
336 // [40]
337 STag
338 -> '<', Name:n, AttributeList:l, S?, '>' =PCB.startTag(CONTEXT, n, l);
339
340 (AttvalList) AttributeList
341 -> =AttvalList();
342 -> AttributeList:list, S, Attribute:a =list.push(a);
343
344 // [41]
345 (AttvalPair) Attribute
346 -> Name:n, Eq, AttValue:s =AttvalPair(n, s, CONTEXT);
347
348 // End-tag
349
350 // [42]
351 ETag
352 -> "</", Name:n, S?, '>' =PCB.endTag(n);
353
354 // Content of Elements
355
356 // [43]
// Element content: zero or more content units. XML [43] allows the content
// to be empty, so a start-tag immediately followed by its end-tag
// (e.g. <a></a>) must be accepted.
content
  -> content unit?...
359
360 content unit
361 -> element
362 -> CharData - SpaceChar:c =PCB.textChar(c);
363 -> CharRef:c =PCB.textChar(c);
364 -> EntityRef
365 -> SpaceChar:c =PCB.spaceChar(c);
366 -> "\r\n" =PCB.spaceChar('\n');
367 -> CDSect
368 -> PI
369 -> Comment
370
371 // Tags for Empty Elements
372
373 // [44]
374 EmptyElementTag
375 -> '<', Name:n, AttributeList:l, S?, "/>" =PCB.emptyTag(CONTEXT, n, l);
376
377 // Element Type Declaration
378
379 // [45]
380 elementdecl
381 -> "<!ELEMENT", S, Name, S, contentspec, S?, '>'
382
383 // [46]
384 contentspec
385 -> "EMPTY" | "ANY" | Mixed | Children
386
387
388 // Element-content Models
389
390 // [47]
391 Children
392 -> {choice | seq}, ['?' | '*' | '+']
393
394 // [48]
395 cp
396 -> {Name | choice | seq}, ['?' | '*' | '+']
397
398 // [49]
399 choice
400 -> '(', S?, cp, S?, ['|', S?, cp, S?]..., ')'
401
402 // [50]
403 seq
404 -> '(', S?, cp, S?, {',', S?, cp, S?}..., ')'
405
406
407 // Mixed-content Declaration
408
409 // [51]
410 Mixed
411 -> '(', S?, "#PCDATA", ['|', S?, Name | S]/..., ")*"
412 -> '(', S?, "#PCDATA", S?, ')'
413
414
415 // Attribute-list Declaration
416
417 // [52]
418 AttlistDecl
419 -> "<!ATTLIST", S, Name:n, AttDefs:l, S?, '>' =PCB.registerAttributes(n, l);
420
421 (AttributeList) AttDefs
422 -> AttDef:a =AttributeList().push(a);
423 -> AttDefs:l, AttDef:a =l.push(a);
424
425 // [53]
426 (Attribute) AttDef
427 -> S, Name:n, S, AttType:t, S, DefaultDecl:d =Attribute(RULE_CONTEXT[1], n, t, d);
428
429
430 // Attribute Types
431
432 // [54]
433 (Attribute::Type) AttType
434 -> StringType
435 -> TokenizedType
436 -> EnumeratedType
437
438 // [55]
439 (Attribute::Type) StringType
440 -> "CDATA" =Attribute::CDATA;
441
442 // [56]
443 (Attribute::Type) TokenizedType
444 -> "ID" =Attribute::ID;
445 -> "IDREF" =Attribute::IDREF;
446 -> "IDREFS" =Attribute::IDREFS;
447 -> "ENTITY" =Attribute::ENTITY;
448 -> "ENTITIES" =Attribute::ENTITIES;
449 -> "NMTOKEN" =Attribute::NMTOKEN;
450 -> "NMTOKENS" =Attribute::NMTOKENS;
451
452 // [57]
453 (Attribute::Type) EnumeratedType
454 -> NotationType =Attribute::NOTATION;
455 -> Enumeration =Attribute::ENUMERATION;
456
457 // [58]
// NOTATION attribute type: a parenthesized, '|'-separated list of names.
// XML [58] permits optional white space after the opening parenthesis.
NotationType
  -> "NOTATION", S, '(', S?, Name, S?, ['|', S?, Name, S?]..., ')'
460
461 // [59]
// Enumerated attribute type: a parenthesized, '|'-separated list of name
// tokens. XML [59] permits optional white space after the opening parenthesis.
Enumeration
  -> '(', S?, Nmtoken, S?, ['|', S?, Nmtoken, S?]..., ')'
464
465
466 // Attribute Defaults
467
468 // [60]
469 (Default) DefaultDecl
470 -> "#REQUIRED" =Default(Default::REQUIRED);
471 -> "#IMPLIED" =Default();
472 -> AttValue:v =Default(v);
473 -> "#FIXED", S, AttValue:v =Default(v,1);
474
475
476 // Conditional Section
477
478 // [61]
479 conditionalSect
480 -> includeSect | ignoreSect
481
482 // [62]
483 includeSect
484 -> "<![", S?, "INCLUDE", S?, '[', extSubsetDecl, "]]>"
485
486 // [63]
487 ignoreSect
488 -> "<![", S?, "IGNORE", S?, '[', ignoreSectContents?..., "]]>"
489
490 // [64]
491 ignoreSectContents
492 -> Ignore, ["<![", ignoreSectContents, "]]>", Ignore]...
493
494 // [65]
495 Ignore
496 -> Char?...
497
498
499 //Character Reference
500
501 // [66]
502 (int) CharRef
503 //-> "&#", '0-9'..., ';'
504 //-> "&#x", '0-9' + 'a-f' + 'A-F'..., ';'
505 //-> decimal CharRef:c, ';' =PCB.currentCharRef = c, PCB.tokenType = Pcb::charRefType;
506 -> decimal CharRef, ';'
507 //-> hex CharRef:c, ';' =PCB.currentCharRef = c, PCB.tokenType = Pcb::charRefType;
508 -> hex CharRef, ';'
509
510 (int) decimal CharRef
511 -> "&#", '0-9':d =d-'0';
512 -> decimal CharRef:n, '0-9':d =10*n + d-'0';
513
// Accumulates the numeric value of a hexadecimal character reference
// ("&#x" followed by one or more hex digits; the closing ';' is consumed by
// CharRef). Decimal digits contribute d-'0'; for the letters a-f/A-F the low
// three bits of the character code are 1..6, so (d&7)+9 yields 10..15.
(int) hex CharRef
  -> "&#x", '0-9':d                                   =d-'0';
  -> "&#x", 'a-f' + 'A-F':d                           =(d&7) + 9;
  -> hex CharRef:n, '0-9':d                           =16*n + d-'0';
  -> hex CharRef:n, 'a-f' + 'A-F':d                   =16*n + (d&7) + 9;
518
519 //Entity Reference
520
521 // [67]
522 (int) Reference
523 -> EntityRef | CharRef
524
525 // [68]
526 (int) EntityRef
527 -> '&', Name:s, ';' =PCB.currentEntityName = s, PCB.tokenType = Pcb::entityRefType;
528
529 // [69]
530 PEReference
531 -> '%', Name, ';'
532
533
534 // Entity Declaration
535
536 // [70]
537 EntityDecl
538 -> GEDecl | PEDecl
539
540 // [71]
541 GEDecl
542 -> "<!ENTITY", S, Name, S, EntityDef, '>'
543
544 // [72]
545 PEDecl
546 -> "<!ENTITY", S, '%', S, Name, S, PEDef, S?, '>'
547
548 // [73]
549 EntityDef
550 -> EntityValue, S? | ExternalID, [NDataDecl | S]
551
552 // [74]
553 PEDef
554 -> EntityValue | ExternalID
555
556
557 // External Entity Declaration
558
559 // [75]
560 ExternalID
561 -> "SYSTEM", S, SystemLiteral
562 -> "PUBLIC", S, PubidLiteral, S, SystemLiteral
563
564 // [76]
565 NDataDecl
566 -> S, "NDATA", S, Name
567
568
569 // Text Declaration
570
571 // [77]
572 TextDecl
573 -> "<?xml", S, [VersionInfo, S], EncodingDecl, S?, "?>"
574
575
576 // Well-Formed External Parsed Entity
577
578 // [78]
579 extParsedEnt
580 -> TextDecl?, content
581
582 // [79]
583 extPE
584 -> TextDecl?, extSubsetDecl
585
586
587 // Encoding Declaration
588
589 // [80]
590 EncodingDecl
591 -> "encoding", Eq, {'"', EncName, '"' | '\'', EncName, '\''}
592
593 // [81]
// Encoding name: a Latin letter followed by letters, digits, '.', '_' or '-'
// (XML [81]).
EncName
  -> 'a-z' + 'A-Z', ['a-z'+'A-Z'+'0-9'+'_'+'.' | '-']...
596
597
598 // Notation Declarations
599
600 // [82]
601 NotationDecl
602 -> "<!NOTATION", S, Name, S, {ExternalID, S? | PublicID}, '>'
603
604 // [83]
605 PublicID
606 -> "PUBLIC", S, PubidLiteral, S?
607
608
609 // Characters
610
611 // [84]
612 Letter = BaseChar // | Ideographic // No ideographs for now
613
614 // [85]
615 BaseChar = // only 8 bit characters for now
616 0x41..0x5a +
617 0x61..0x7a +
618 0xc0..0xd6 +
619 0xd8..0xf6 +
620 0xf8..0xff
621
622 // [86]
623 // Ideographic =
624
625
626 // [87]
627 // CombiningChar =
628
629 // [88]
630 Digit = 0x30..0x39
631
632 // [89]
633 Extender = 0xb7
634
635 {
// Assigns CONTEXT a Context built from the parser control block's current
// line and column fields.
// NOTE(review): presumably the generated parser expands GET_CONTEXT each time
// it records the context of a token -- confirm against the AnaGram manual.
636 #define GET_CONTEXT CONTEXT = Context(PCB.line, PCB.column)
637
// Short alias for the parser control block type; used to qualify its nested
// names (e.g. Pcb::none, Pcb::commentType).
638 typedef parse_pcb_struct Pcb;
639
640 parse_pcb_struct::parse_pcb_struct()
641 : tokenType(Pcb::none), dtdPresent(0)
642 {
643 // Nothing to do
644 }
645
646 void parse_pcb_struct::spaceChar(int c) {
647 if (c == '\r') c = '\n';
648 if (textStack.size()) tokenType = textType;
649 spaceStack.concat(c);
650 }
651
652 void parse_pcb_struct::textChar(int c) {
653 if (spaceStack.size()) tokenType=spaceType;
654 textStack.concat((char)c);
655 }
656
657 void parse_pcb_struct::normalize(AgString &s) {
658 char *readPointer = s;
659 char *writePointer = readPointer;
660 if (readPointer == NULL) return;
661 while (*readPointer && strchr("\t\r\n ",*readPointer)) readPointer++;
662 while (*readPointer) {
663 while (*readPointer && strchr("\t\r\n ",*readPointer) == NULL) *writePointer++ = *readPointer++;
664 while (*readPointer && strchr("\t\r\n ",*readPointer)) readPointer++;
665 if (*readPointer) *writePointer++ = ' ';
666 }
667 *writePointer = 0;
668 }
669
670 void parse_pcb_struct::checkAttributes(const Context &context, const AgString &name, Symtab &map) {
671 if (!dtdPresent) return;
672 Element &element = elementMap[name];
673 AgBaseStack<Attribute> &attributeList = element.attributeList;
674 int i;
675 for (i = 0; i < attributeList.size(); i++) {
676 Attribute &attribute = attributeList[i];
677 switch (attribute.def.presence) {
678 case Default::REQUIRED: {
679 const char *value = map.findValue(attribute.name);
680 if (value) continue;
681 map[attribute.name] = "VALUE NOT SPECIFIED";
682 char buf[200];
683 sprintf(buf, "Required attribute '%s' is missing", (const char *)attribute.name);
684 warningList.push(Warning(context, buf));
685 break;
686 }
687 case Default::DEFAULT: {
688 const char *value = map[attribute.name];
689 if (value == 0) value = attribute.def.value;
690 if (attribute.def.fixed && attribute.def.value == value) continue;
691 char buf[200];
692 sprintf(buf, "Value of fixed attribute '%s' does not match default value ",
693 (const char *) attribute.name);
694 warningList.push(Warning(context, buf));
695 break;
696 }
697 }
698 }
699 for (i = 0; i < attributeList.size(); i++) {
700 Attribute &attribute = attributeList[i];
701 if (attribute.type != Attribute::CDATA) normalize(map[attribute.name]);
702 }
703 }
704
705 int parse_pcb_struct::warnAttval(const Context &context, const char *value, const AttvalPair &p) {
706 if (value == NULL) return 0;
707 char buf[200];
708 sprintf(buf, "Value of '%s' ignored. Previously specified as '%s'",
709 (const char *) p.name, value);
710 warningList.push(Warning(context, buf));
711 return 1;
712 }
713
714 void parse_pcb_struct::startTag(const Context &context, const AgString &name, AttvalList &list) {
715 Symtab map;
716 for (int i = 0; i < list.size(); i++) {
717 AgString &value = map[list[i].name];
718 if (warnAttval(context, value, list[i])) continue;
719 value = list[i].value;
720 }
721 currentTagName = name;
722 checkAttributes(context, name, map);
723 currentMap = map;
724 tagStack.push(name);
725 mapStack.push(map);
726 tokenType = startType;
727 }
728
729 void parse_pcb_struct::emptyTag(const Context &context, const AgString &name, AttvalList &list) {
730 Symtab map;
731 for (int i = 0; i < list.size(); i++) {
732 AgString &value = map[list[i].name];
733 if (warnAttval(context, value, list[i])) continue;
734 map[list[i].name] = list[i].value;
735 }
736 checkAttributes(context, name, map);
737 currentTagName = name;
738 currentMap = map;
739 tokenType = emptyType;
740 }
741
742 void parse_pcb_struct::endTag(const AgString &name) {
743 currentTagName = name;
744 currentMap = mapStack.pop();
745 if (name != tagStack.pop()) {
746 error_message = "Element nesting error";
747 exit_flag = AG_SEMANTIC_ERROR_CODE;
748 tokenType = errorType;
749 return;
750 }
751 tokenType = endType;
752 }
753 void parse_pcb_struct::registerAttributes(const AgString &name, const AgBaseStack<Attribute> &attributeList) {
754 Element &element = elementMap[name];
755 if (element.name.size() == 0) element.name = name;
756 int i;
757 for (i = 0; i < attributeList.size(); i++) {
758 AgString attributeName = attributeList[i].name;
759 int &index = element.attributeIndex[attributeName];
760 if (index != -1) {
761 Attribute &attribute = element.attributeList[index];
762 Context previous = attribute.context;
763 char buf[200];
764 sprintf(buf, "Redefinition of '%s::%s' ignored. Defined at line %d, column %d",
765 (const char *) name,
766 (const char *) attributeName, previous.line, previous.column);
767 warningList.push(Warning(attributeList[i].context, buf));
768 continue;
769 }
770 index = element.attributeList.size();
771 element.attributeList.push(attributeList[i]);
772 }
773 }
774 }