Mercurial > ~dholland > hg > ag > index.cgi
comparison tests/agcl/parsifal/detag1.syn @ 0:13d2b8934445
Import AnaGram (near-)release tree into Mercurial.
author | David A. Holland |
---|---|
date | Sat, 22 Dec 2007 17:52:45 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:13d2b8934445 |
---|---|
1 { | |
2 /* | |
3 detag.syn | |
4 | |
5 Program to strip HTML tags from HTML files. | |
6 Copyright (c) 1996 - 1999 Parsifal Software, All | |
7 Rights Reserved. | |
8 See the file COPYING for license and usage terms. | |
9 | |
10 For information about AnaGram, visit http://www.parsifalsoft.com. | |
11 */ | |
12 | |
#include <stdio.h>
#include <stdlib.h>
14 | |
15 } | |
16 | |
17 | |
18 // -- CONFIGURATION SECTION ---------------------------- | |
19 [ | |
20 default token type = int | |
21 pointer input | |
22 nest comments | |
23 test file mask = "*.htm*" | |
24 ] | |
25 //------------------------------------------------------ | |
26 | |
27 eof = 0 // In this case, eof is the null char ending a string | |
28 tag innard char = ~(eof + '<' + '>') | |
29 comment char = ~(eof + '>') | |
30 ordinary text char = ~(eof + '<') | |
31 header type = '1-6' // for H1, H2,... H6 | |
32 | |
33 (void) input string $ // specify grammar token | |
34 -> html, eof | |
35 | |
36 html | |
37 -> [text | tag...]/... // virtual production w. alternating seq. | |
38 | |
39 tag | |
40 -> '<', tag innards, '>' | |
41 | |
42 | |
43 /* Here we pick up some of the HTML start tags so | |
44 we can insert a blank line in the output file for | |
45 readability. Note that because keywords take | |
46 precedence in AnaGram, the keyword is chosen | |
47 by the parser in preference to other input | |
48 beginning with the same characters. Turned out | |
49 that we didn't really want the blank line in | |
50 some cases, so the reduction procedure was | |
51 simply removed for those start tags - so, | |
52 nothing happens when they get picked up. */ | |
53 tag innards | |
54 -> {"HR" | "hr"}, other stuff = putc('\n', output); | |
55 -> {"P" | "p"}, other stuff = putc('\n', output); | |
56 -> {"BR" | "br"}, other stuff | |
57 -> {"H" | "h"}, header type, other stuff = putc('\n', output); | |
58 -> {"HEAD" | "head" | "HTML" | "html"}, other stuff // avoid getting Hn | |
59 -> {"UL" | "ul"}, other stuff | |
60 -> {"OL" | "ol"}, other stuff | |
61 -> {"DL" | "dl"}, other stuff | |
62 -> {"LI" | "li"}, other stuff | |
63 -> {"TABLE" | "table"}, other stuff = putc('\n', output); | |
64 -> {"TR" | "tr"}, other stuff | |
65 -> {"TD" | "td"}, other stuff | |
66 -> {"PRE" | "pre"}, other stuff | |
67 -> "!--", comment stuff | |
68 -> other stuff | |
69 | |
70 other stuff | |
71 -> tag innard char?... | |
72 | |
73 comment stuff // treatment of comments is a bit simple-minded here | |
74 -> comment char?... | |
75 | |
76 text // text is what we want to copy to output file | |
77 ->text char | |
78 ->text, text char | |
79 | |
80 | |
81 text char | |
82 -> ordinary text char:c = putc(c, output); | |
83 -> entity text char | |
84 | |
/* Character entities that are translated back to the literal character.
   The keyword strings are the HTML entity spellings; because keywords take
   precedence in AnaGram, they win over an ordinary text char '&'. */
entity text char                // we only bother with 3 entities here
 -> "&lt;"                      = putc('<', output);
 -> "&gt;"                      = putc('>', output);
 -> "&amp;"                     = putc('&', output);
89 | |
90 | |
91 { // ----- Embedded C --------------------------- | |
92 | |
93 FILE *output; | |
94 | |
95 int main(int argc, char *argv[]) { | |
96 | |
97 FILE *input; | |
98 size_t fileLength; | |
99 size_t stringLength; | |
100 int errorFlag = 0; | |
101 char *inString; | |
102 | |
103 | |
104 /* Check for enough arguments */ | |
105 if (argc != 3) { | |
106 printf("Program to strip HTML tags from a file\n" | |
107 "Usage: %s <input filename> <output filename>\n", argv[0]); | |
108 return 1; | |
109 } | |
110 | |
111 /* Open input file for reading only */ | |
112 input = fopen(argv[1],"r"); | |
113 if (input == NULL) { | |
114 printf("Cannot open %s\n", argv[1]); | |
115 return 2; | |
116 } | |
117 | |
118 /* find out how big the file is */ | |
119 if (fseek(input, SEEK_SET, SEEK_END)) { | |
120 printf("Strange problems with %s\n", argv[1]); | |
121 return 3; | |
122 } | |
123 fileLength = ftell(input); | |
124 if (fileLength < 0 ) { // -1L is error return | |
125 printf("Error getting file length (%d) of %s\n", fileLength, argv[1]); | |
126 return 4; | |
127 } | |
128 | |
129 /* fseek to beginning of file */ | |
130 if (fseek(input, 0, SEEK_SET)) { | |
131 printf("Strange problems with %s\n", argv[1]); | |
132 return 5; | |
133 } | |
134 | |
135 /* Allocate storage for input string */ | |
136 inString = (char*)malloc(fileLength + 1); | |
137 if (inString == NULL) { | |
138 printf("Insufficient memory\n"); | |
139 return 6; | |
140 } | |
141 | |
142 /* Read file */ | |
143 stringLength = fread(inString, 1, fileLength, input); | |
144 if (stringLength == 0) { | |
145 printf("Unable to read %s\n", argv[1]); | |
146 return 7; | |
147 } | |
148 inString[stringLength] = 0; | |
149 | |
150 | |
151 /* Open output file for writing only */ | |
152 output = fopen(argv[2],"w"); | |
153 if (output == NULL) { | |
154 printf("Cannot open %s\n", argv[2]); | |
155 free(inString); | |
156 fclose(input); | |
157 return 8; | |
158 } | |
159 | |
160 | |
161 /* Invoke parser */ | |
162 PCB.pointer = (unsigned char *)inString; // using pointer input | |
163 detag(); | |
164 if (PCB.exit_flag != 1) { | |
165 printf( "Unsuccessful termination of parse, PCB.exit_flag = %d\n", | |
166 PCB.exit_flag); | |
167 } | |
168 | |
169 | |
170 /* Done */ | |
171 free(inString); | |
172 fclose(input); | |
173 fclose(output); | |
174 printf( " End detag "); | |
175 return 0; | |
176 } | |
177 | |
178 } // --------- End of Embedded C -------------------- | |
179 | |
180 |