comparison tests/agcl/parsifal/detag2.syn @ 0:13d2b8934445

Import AnaGram (near-)release tree into Mercurial.
author David A. Holland
date Sat, 22 Dec 2007 17:52:45 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:13d2b8934445
1 {
2 /*
3 detag.syn
4
5 Program to strip HTML tags from HTML files.
6 Copyright (c) 1996 - 1999 Parsifal Software, All
7 Rights Reserved.
8 See the file COPYING for license and usage terms.
9
10 For information about AnaGram, visit http://www.parsifalsoft.com.
11 */
12
13 #include <stdio.h>
#include <stdlib.h>
14
15 }
16
17
18 // -- CONFIGURATION SECTION ----------------------------
// Parser-generator settings for this grammar.
// "pointer input" matches main(), which hands the parser an in-memory,
// null-terminated string through PCB.pointer.
// NOTE(review): the effect of the remaining switches is defined by the
// AnaGram documentation, not by anything visible in this file -- confirm
// there before changing them.
19 [
20 default token type = int
21 pointer input
22 nest comments
23 test file mask = "*.htm*"
24 ~default reductions
25 ]
26 //------------------------------------------------------
27
// Character sets used by the productions below.
28 eof = 0 // In this case, eof is the null char ending a string
// Inside a tag: any character except eof and the two angle brackets.
29 tag innard char = ~(eof + '<' + '>')
// Inside a comment: '<' is allowed as well; only eof and '>' are excluded.
30 comment char = ~(eof + '>')
// Outside tags: any character except eof and '<' (a bare '>' is plain text).
31 ordinary text char = ~(eof + '<')
32 header type = '1-6' // for H1, H2,... H6
33
// Top-level rule: the whole input is an html body followed by the
// terminating eof character. The rule is declared (void).
34 (void) input string $ // specify grammar token
35 -> html, eof
36
// The body is optional (the input may be empty); otherwise it alternates
// between runs of text and runs of tags.
37 html
38 -> [text | tag...]/... // virtual production w. alternating seq.
39
// A tag spans from '<' to the next '>'; its contents are classified by
// "tag innards".
40 tag
41 -> '<', tag innards, '>'
42
43
44 /* Here we pick up some of the HTML start tags so
45 we can insert a blank line in the output file for
46 readability. Note that because keywords take
47 precedence in AnaGram, the keyword is chosen
48 by the parser in preference to other input
49 beginning with the same characters. Turned out
50 that we didn't really want the blank line in
51 some cases, so the reduction procedure was
52 simply removed for those start tags - so,
53 nothing happens when they get picked up. */
// Classifies the contents of a tag by its leading keyword. Rules that carry
// a reduction procedure write a newline to the output file; all other rules
// consume the tag and write nothing.
54 tag innards
55 -> {"HR" | "hr"}, other stuff = putc('\n', output);
56 -> {"P" | "p"}, other stuff = putc('\n', output);
57 -> {"BR" | "br"}, other stuff
// Header tags need a digit 1-6 right after the H.
58 -> {"H" | "h"}, header type, other stuff = putc('\n', output);
59 -> {"HEAD" | "head" | "HTML" | "html"}, other stuff // avoid getting Hn
60 -> {"UL" | "ul"}, other stuff
61 -> {"OL" | "ol"}, other stuff
62 -> {"DL" | "dl"}, other stuff
63 -> {"LI" | "li"}, other stuff
64 -> {"TABLE" | "table"}, other stuff = putc('\n', output);
65 -> {"TR" | "tr"}, other stuff
66 -> {"TD" | "td"}, other stuff
67 -> {"PRE" | "pre"}, other stuff
// HTML comment: "!--" followed by comment characters up to the closing '>'.
68 -> "!--", comment stuff
// Fallback for every tag not listed above (including end tags).
69 -> other stuff
70
// Remainder of a tag: zero or more tag characters. There is no reduction
// procedure, so nothing is written to the output.
71 other stuff
72 -> tag innard char?...
73
// Body of a comment: zero or more characters up to, but not including,
// the first '>'.
74 comment stuff // treatment of comments is a bit simple-minded here
75 -> comment char?...
76
// One or more text characters, collected left-recursively.
77 text // text is what we want to copy to output file
78 ->text char
79 ->text, text char
80
81
// A text character is either an ordinary character, which the reduction
// procedure copies to the output file unchanged, or one of the recognized
// entities below.
82 text char
83 -> ordinary text char:c = putc(c, output);
84 -> entity text char
85
// Each recognized entity is written as the single character it stands for.
// NOTE(review): '&' is itself an ordinary text char, so other entities are
// presumably copied through verbatim -- confirm against the generated parser.
86 entity text char // we only bother with 3 entities here
87 -> "&lt;" = putc('<', output);
88 -> "&gt;" = putc('>', output);
89 -> "&amp;" = putc('&', output);
90
91
92 { // ----- Embedded C ---------------------------
93
/* Destination stream: opened and closed by main(), written to by the
   reduction procedures in the grammar above. */
94 FILE *output;
95
96 int main(int argc, char *argv[]) {
97
98 FILE *input;
99 size_t fileLength;
100 size_t stringLength;
101 int errorFlag = 0;
102 char *inString;
103
104
105 /* Check for enough arguments */
106 if (argc != 3) {
107 printf("Program to strip HTML tags from a file\n"
108 "Usage: %s <input filename> <output filename>\n", argv[0]);
109 return 1;
110 }
111
112 /* Open input file for reading only */
113 input = fopen(argv[1],"r");
114 if (input == NULL) {
115 printf("Cannot open %s\n", argv[1]);
116 return 2;
117 }
118
119 /* find out how big the file is */
120 if (fseek(input, SEEK_SET, SEEK_END)) {
121 printf("Strange problems with %s\n", argv[1]);
122 return 3;
123 }
124 fileLength = ftell(input);
125 if (fileLength < 0 ) { // -1L is error return
126 printf("Error getting file length (%d) of %s\n", fileLength, argv[1]);
127 return 4;
128 }
129
130 /* fseek to beginning of file */
131 if (fseek(input, 0, SEEK_SET)) {
132 printf("Strange problems with %s\n", argv[1]);
133 return 5;
134 }
135
136 /* Allocate storage for input string */
137 inString = (char*)malloc(fileLength + 1);
138 if (inString == NULL) {
139 printf("Insufficient memory\n");
140 return 6;
141 }
142
143 /* Read file */
144 stringLength = fread(inString, 1, fileLength, input);
145 if (stringLength == 0) {
146 printf("Unable to read %s\n", argv[1]);
147 return 7;
148 }
149 inString[stringLength] = 0;
150
151
152 /* Open output file for writing only */
153 output = fopen(argv[2],"w");
154 if (output == NULL) {
155 printf("Cannot open %s\n", argv[2]);
156 free(inString);
157 fclose(input);
158 return 8;
159 }
160
161
162 /* Invoke parser */
163 PCB.pointer = (unsigned char *)inString; // using pointer input
164 detag();
165 if (PCB.exit_flag != 1) {
166 printf( "Unsuccessful termination of parse, PCB.exit_flag = %d\n",
167 PCB.exit_flag);
168 }
169
170
171 /* Done */
172 free(inString);
173 fclose(input);
174 fclose(output);
175 printf( " End detag ");
176 return 0;
177 }
178
179 } // --------- End of Embedded C --------------------
180
181