comparison tests/agcl/parsifal/detag1.syn @ 0:13d2b8934445

Import AnaGram (near-)release tree into Mercurial.
author David A. Holland
date Sat, 22 Dec 2007 17:52:45 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:13d2b8934445
1 {
2 /*
3 detag.syn
4
5 Program to strip HTML tags from HTML files.
6 Copyright (c) 1996 - 1999 Parsifal Software, All
7 Rights Reserved.
8 See the file COPYING for license and usage terms.
9
10 For information about AnaGram, visit http://www.parsifalsoft.com.
11 */
12
#include <stdio.h>
#include <stdlib.h>
14
15 }
16
17
18 // -- CONFIGURATION SECTION ----------------------------
19 [
20 default token type = int // NOTE(review): presumably the C type of token values (e.g. the :c passed to putc) - confirm
21 pointer input // parser reads from memory through PCB.pointer, which main() sets to the loaded file text
22 nest comments // NOTE(review): presumably permits nested /* */ comments in this syntax file - confirm against the AnaGram manual
23 test file mask = "*.htm*" // NOTE(review): looks like the file-name filter AnaGram offers for test input - confirm
24 ]
25 //------------------------------------------------------
26
27 eof = 0 // In this case, eof is the null char ending a string
28 tag innard char = ~(eof + '<' + '>') // any character except NUL, '<' and '>'
29 comment char = ~(eof + '>') // any character except NUL and '>'
30 ordinary text char = ~(eof + '<') // any character except NUL and '<' (so a stray '>' counts as text)
31 header type = '1-6' // for H1, H2,... H6
32
33 (void) input string $ // specify grammar token
34 -> html, eof // the whole input: HTML content followed by the terminating NUL
35
36 html
37 -> [text | tag...]/... // virtual production w. alternating seq.
38
39 tag
40 -> '<', tag innards, '>' // no reduction procedure here: a tag only produces output through the tag innards rules
41
42
43 /* Here we pick up some of the HTML start tags so
44 we can insert a blank line in the output file for
45 readability. Note that because keywords take
46 precedence in AnaGram, the keyword is chosen
47 by the parser in preference to other input
48 beginning with the same characters. Turned out
49 that we didn't really want the blank line in
50 some cases, so the reduction procedure was
51 simply removed for those start tags - so,
52 nothing happens when they get picked up. */
53 tag innards // tag names are matched only in all-upper or all-lower case, exactly as spelled below
54 -> {"HR" | "hr"}, other stuff = putc('\n', output);
55 -> {"P" | "p"}, other stuff = putc('\n', output);
56 -> {"BR" | "br"}, other stuff // recognized, but no reduction procedure: nothing is written
57 -> {"H" | "h"}, header type, other stuff = putc('\n', output);
58 -> {"HEAD" | "head" | "HTML" | "html"}, other stuff // avoid getting Hn
59 -> {"UL" | "ul"}, other stuff
60 -> {"OL" | "ol"}, other stuff
61 -> {"DL" | "dl"}, other stuff
62 -> {"LI" | "li"}, other stuff
63 -> {"TABLE" | "table"}, other stuff = putc('\n', output);
64 -> {"TR" | "tr"}, other stuff
65 -> {"TD" | "td"}, other stuff
66 -> {"PRE" | "pre"}, other stuff
67 -> "!--", comment stuff // HTML comment: contents are discarded
68 -> other stuff // any other tag (including end tags): contents are discarded
69
70 other stuff
71 -> tag innard char?... // zero or more characters, stopping before NUL, '<' or '>'
72
73 comment stuff // treatment of comments is a bit simple-minded here
74 -> comment char?... // zero or more non-'>' characters, so the first '>' inside a comment ends it
75
76 text // text is what we want to copy to output file
77 ->text char // a run of one or more text chars, built left-recursively
78 ->text, text char
79
80
81 text char
82 -> ordinary text char:c = putc(c, output); // copy the character to the output file unchanged
83 -> entity text char // the entity rules below write the decoded character themselves
84
85 entity text char // we only bother with 3 entities here
86 -> "&lt;" = putc('<', output);
87 -> "&gt;" = putc('>', output);
88 -> "&amp;" = putc('&', output);
89
90
91 { // ----- Embedded C ---------------------------
92
93 FILE *output;
94
95 int main(int argc, char *argv[]) {
96
97 FILE *input;
98 size_t fileLength;
99 size_t stringLength;
100 int errorFlag = 0;
101 char *inString;
102
103
104 /* Check for enough arguments */
105 if (argc != 3) {
106 printf("Program to strip HTML tags from a file\n"
107 "Usage: %s <input filename> <output filename>\n", argv[0]);
108 return 1;
109 }
110
111 /* Open input file for reading only */
112 input = fopen(argv[1],"r");
113 if (input == NULL) {
114 printf("Cannot open %s\n", argv[1]);
115 return 2;
116 }
117
118 /* find out how big the file is */
119 if (fseek(input, SEEK_SET, SEEK_END)) {
120 printf("Strange problems with %s\n", argv[1]);
121 return 3;
122 }
123 fileLength = ftell(input);
124 if (fileLength < 0 ) { // -1L is error return
125 printf("Error getting file length (%d) of %s\n", fileLength, argv[1]);
126 return 4;
127 }
128
129 /* fseek to beginning of file */
130 if (fseek(input, 0, SEEK_SET)) {
131 printf("Strange problems with %s\n", argv[1]);
132 return 5;
133 }
134
135 /* Allocate storage for input string */
136 inString = (char*)malloc(fileLength + 1);
137 if (inString == NULL) {
138 printf("Insufficient memory\n");
139 return 6;
140 }
141
142 /* Read file */
143 stringLength = fread(inString, 1, fileLength, input);
144 if (stringLength == 0) {
145 printf("Unable to read %s\n", argv[1]);
146 return 7;
147 }
148 inString[stringLength] = 0;
149
150
151 /* Open output file for writing only */
152 output = fopen(argv[2],"w");
153 if (output == NULL) {
154 printf("Cannot open %s\n", argv[2]);
155 free(inString);
156 fclose(input);
157 return 8;
158 }
159
160
161 /* Invoke parser */
162 PCB.pointer = (unsigned char *)inString; // using pointer input
163 detag();
164 if (PCB.exit_flag != 1) {
165 printf( "Unsuccessful termination of parse, PCB.exit_flag = %d\n",
166 PCB.exit_flag);
167 }
168
169
170 /* Done */
171 free(inString);
172 fclose(input);
173 fclose(output);
174 printf( " End detag ");
175 return 0;
176 }
177
178 } // --------- End of Embedded C --------------------
179
180