comparison tests/agcl/parsifal/detag.syn @ 0:13d2b8934445

Import AnaGram (near-)release tree into Mercurial.
author David A. Holland
date Sat, 22 Dec 2007 17:52:45 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:13d2b8934445
1 {
2 /*
3 detag.syn
4
5 Program to strip HTML tags from HTML files.
6 Copyright (c) 1996 - 1999 Parsifal Software, All
7 Rights Reserved.
8 See the file COPYING for license and usage terms.
9
10 For information about AnaGram, visit http://www.parsifalsoft.com.
11 */
12
#include <stdio.h>
#include <stdlib.h>
14
15 }
16
17
18 // -- CONFIGURATION SECTION ----------------------------
19 [
20 default token type = int
21 pointer input
22 nest comments
23 ]
24 //------------------------------------------------------
25
26 eof = 0 // In this case, eof is the null char ending a string
27 tag innard char = ~(eof + '<' + '>')
28 comment char = ~(eof + '>')
29 ordinary text char = ~(eof + '<')
30 header type = '1-6' // for H1, H2,... H6
31
32 (void) input string $ // specify grammar token
33 -> html, eof
34
35 html
36 -> [text | tag...]/... // virtual production w. alternating seq.
37
38 tag
39 -> '<', tag innards, '>'
40
41
42 /* Here we pick up some of the HTML start tags so
43 we can insert a blank line in the output file for
44 readability. Note that because keywords take
45 precedence in AnaGram, the keyword is chosen
46 by the parser in preference to other input
47 beginning with the same characters. Turned out
48 that we didn't really want the blank line in
49 some cases, so the reduction procedure was
50 simply removed for those start tags - so,
51 nothing happens when they get picked up. */
52 tag innards
53 -> {"HR" | "hr"}, other stuff = putc('\n', output);
54 -> {"P" | "p"}, other stuff = putc('\n', output);
55 -> {"BR" | "br"}, other stuff
56 -> {"H" | "h"}, header type, other stuff = putc('\n', output);
57 -> {"HEAD" | "head" | "HTML" | "html"}, other stuff // avoid getting Hn
58 -> {"UL" | "ul"}, other stuff
59 -> {"OL" | "ol"}, other stuff
60 -> {"DL" | "dl"}, other stuff
61 -> {"LI" | "li"}, other stuff
62 -> {"TABLE" | "table"}, other stuff = putc('\n', output);
63 -> {"TR" | "tr"}, other stuff
64 -> {"TD" | "td"}, other stuff
65 -> {"PRE" | "pre"}, other stuff
66 -> "!--", comment stuff
67 -> other stuff
68
69 other stuff
70 -> tag innard char?...
71
72 comment stuff // treatment of comments is a bit simple-minded here
73 -> comment char?...
74
75 text // text is what we want to copy to output file
76 ->text char
77 ->text, text char
78
79
80 text char
81 -> ordinary text char:c = putc(c, output);
82 -> entity text char
83
84 entity text char // we only bother with 3 entities here
85 -> "&lt;" = putc('<', output);
86 -> "&gt;" = putc('>', output);
87 -> "&amp;" = putc('&', output);
88
89
90 { // ----- Embedded C ---------------------------
91
92 FILE *output;
93
94 int main(int argc, char *argv[]) {
95
96 FILE *input;
97 size_t fileLength;
98 size_t stringLength;
99 int errorFlag = 0;
100 char *inString;
101
102
103 /* Check for enough arguments */
104 if (argc != 3) {
105 printf("Program to strip HTML tags from a file\n"
106 "Usage: %s <input filename> <output filename>\n", argv[0]);
107 return 1;
108 }
109
110 /* Open input file for reading only */
111 input = fopen(argv[1],"r");
112 if (input == NULL) {
113 printf("Cannot open %s\n", argv[1]);
114 return 2;
115 }
116
117 /* find out how big the file is */
118 if (fseek(input, SEEK_SET, SEEK_END)) {
119 printf("Strange problems with %s\n", argv[1]);
120 return 3;
121 }
122 fileLength = ftell(input);
123 if (fileLength < 0 ) { // -1L is error return
124 printf("Error getting file length (%d) of %s\n", fileLength, argv[1]);
125 return 4;
126 }
127
128 /* fseek to beginning of file */
129 if (fseek(input, 0, SEEK_SET)) {
130 printf("Strange problems with %s\n", argv[1]);
131 return 5;
132 }
133
134 /* Allocate storage for input string */
135 inString = (char*)malloc(fileLength + 1);
136 if (inString == NULL) {
137 printf("Insufficient memory\n");
138 return 6;
139 }
140
141 /* Read file */
142 stringLength = fread(inString, 1, fileLength, input);
143 if (stringLength == 0) {
144 printf("Unable to read %s\n", argv[1]);
145 return 7;
146 }
147 inString[stringLength] = 0;
148
149
150 /* Open output file for writing only */
151 output = fopen(argv[2],"w");
152 if (output == NULL) {
153 printf("Cannot open %s\n", argv[2]);
154 free(inString);
155 fclose(input);
156 return 8;
157 }
158
159
160 /* Invoke parser */
161 PCB.pointer = (unsigned char *)inString; // using pointer input
162 detag();
163 if (PCB.exit_flag != 1) {
164 printf( "Unsuccessful termination of parse, PCB.exit_flag = %d\n",
165 PCB.exit_flag);
166 }
167
168
169 /* Done */
170 free(inString);
171 fclose(input);
172 fclose(output);
173 printf( " End detag ");
174 return 0;
175 }
176
177 } // --------- End of Embedded C --------------------
178
179