Mercurial > ~dholland > hg > ag > index.cgi
comparison tests/agcl/parsifal/detag2.syn @ 0:13d2b8934445
Import AnaGram (near-)release tree into Mercurial.
author | David A. Holland |
---|---|
date | Sat, 22 Dec 2007 17:52:45 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:13d2b8934445 |
---|---|
1 { | |
2 /* | |
3 detag.syn | |
4 | |
5 Program to strip HTML tags from HTML files. | |
6 Copyright (c) 1996 - 1999 Parsifal Software, All | |
7 Rights Reserved. | |
8 See the file COPYING for license and usage terms. | |
9 | |
10 For information about AnaGram, visit http://www.parsifalsoft.com. | |
11 */ | |
12 | |
#include <stdio.h>
#include <stdlib.h>
14 | |
15 } | |
16 | |
17 | |
18 // -- CONFIGURATION SECTION ---------------------------- | |
19 [ | |
20 default token type = int | |
21 pointer input | |
22 nest comments | |
23 test file mask = "*.htm*" | |
24 ~default reductions | |
25 ] | |
26 //------------------------------------------------------ | |
27 | |
28 eof = 0 // In this case, eof is the null char ending a string | |
29 tag innard char = ~(eof + '<' + '>') | |
30 comment char = ~(eof + '>') | |
31 ordinary text char = ~(eof + '<') | |
32 header type = '1-6' // for H1, H2,... H6 | |
33 | |
34 (void) input string $ // specify grammar token | |
35 -> html, eof | |
36 | |
37 html | |
38 -> [text | tag...]/... // virtual production w. alternating seq. | |
39 | |
40 tag | |
41 -> '<', tag innards, '>' | |
42 | |
43 | |
44 /* Here we pick up some of the HTML start tags so | |
45 we can insert a blank line in the output file for | |
46 readability. Note that because keywords take | |
47 precedence in AnaGram, the keyword is chosen | |
48 by the parser in preference to other input | |
49 beginning with the same characters. Turned out | |
50 that we didn't really want the blank line in | |
51 some cases, so the reduction procedure was | |
52 simply removed for those start tags - so, | |
53 nothing happens when they get picked up. */ | |
54 tag innards | |
55 -> {"HR" | "hr"}, other stuff = putc('\n', output); | |
56 -> {"P" | "p"}, other stuff = putc('\n', output); | |
57 -> {"BR" | "br"}, other stuff | |
58 -> {"H" | "h"}, header type, other stuff = putc('\n', output); | |
59 -> {"HEAD" | "head" | "HTML" | "html"}, other stuff // avoid getting Hn | |
60 -> {"UL" | "ul"}, other stuff | |
61 -> {"OL" | "ol"}, other stuff | |
62 -> {"DL" | "dl"}, other stuff | |
63 -> {"LI" | "li"}, other stuff | |
64 -> {"TABLE" | "table"}, other stuff = putc('\n', output); | |
65 -> {"TR" | "tr"}, other stuff | |
66 -> {"TD" | "td"}, other stuff | |
67 -> {"PRE" | "pre"}, other stuff | |
68 -> "!--", comment stuff | |
69 -> other stuff | |
70 | |
71 other stuff | |
72 -> tag innard char?... | |
73 | |
74 comment stuff // treatment of comments is a bit simple-minded here | |
75 -> comment char?... | |
76 | |
77 text // text is what we want to copy to output file | |
78 ->text char | |
79 ->text, text char | |
80 | |
81 | |
82 text char | |
83 -> ordinary text char:c = putc(c, output); | |
84 -> entity text char | |
85 | |
86 entity text char // we only bother with 3 entities here | |
87 -> "&lt;" = putc('<', output); | |
88 -> "&gt;" = putc('>', output); | |
89 -> "&amp;" = putc('&', output); | |
90 | |
91 | |
92 { // ----- Embedded C --------------------------- | |
93 | |
94 FILE *output; | |
95 | |
96 int main(int argc, char *argv[]) { | |
97 | |
98 FILE *input; | |
99 size_t fileLength; | |
100 size_t stringLength; | |
101 int errorFlag = 0; | |
102 char *inString; | |
103 | |
104 | |
105 /* Check for enough arguments */ | |
106 if (argc != 3) { | |
107 printf("Program to strip HTML tags from a file\n" | |
108 "Usage: %s <input filename> <output filename>\n", argv[0]); | |
109 return 1; | |
110 } | |
111 | |
112 /* Open input file for reading only */ | |
113 input = fopen(argv[1],"r"); | |
114 if (input == NULL) { | |
115 printf("Cannot open %s\n", argv[1]); | |
116 return 2; | |
117 } | |
118 | |
119 /* find out how big the file is */ | |
120 if (fseek(input, SEEK_SET, SEEK_END)) { | |
121 printf("Strange problems with %s\n", argv[1]); | |
122 return 3; | |
123 } | |
124 fileLength = ftell(input); | |
125 if (fileLength < 0 ) { // -1L is error return | |
126 printf("Error getting file length (%d) of %s\n", fileLength, argv[1]); | |
127 return 4; | |
128 } | |
129 | |
130 /* fseek to beginning of file */ | |
131 if (fseek(input, 0, SEEK_SET)) { | |
132 printf("Strange problems with %s\n", argv[1]); | |
133 return 5; | |
134 } | |
135 | |
136 /* Allocate storage for input string */ | |
137 inString = (char*)malloc(fileLength + 1); | |
138 if (inString == NULL) { | |
139 printf("Insufficient memory\n"); | |
140 return 6; | |
141 } | |
142 | |
143 /* Read file */ | |
144 stringLength = fread(inString, 1, fileLength, input); | |
145 if (stringLength == 0) { | |
146 printf("Unable to read %s\n", argv[1]); | |
147 return 7; | |
148 } | |
149 inString[stringLength] = 0; | |
150 | |
151 | |
152 /* Open output file for writing only */ | |
153 output = fopen(argv[2],"w"); | |
154 if (output == NULL) { | |
155 printf("Cannot open %s\n", argv[2]); | |
156 free(inString); | |
157 fclose(input); | |
158 return 8; | |
159 } | |
160 | |
161 | |
162 /* Invoke parser */ | |
163 PCB.pointer = (unsigned char *)inString; // using pointer input | |
164 detag(); | |
165 if (PCB.exit_flag != 1) { | |
166 printf( "Unsuccessful termination of parse, PCB.exit_flag = %d\n", | |
167 PCB.exit_flag); | |
168 } | |
169 | |
170 | |
171 /* Done */ | |
172 free(inString); | |
173 fclose(input); | |
174 fclose(output); | |
175 printf( " End detag "); | |
176 return 0; | |
177 } | |
178 | |
179 } // --------- End of Embedded C -------------------- | |
180 | |
181 |