diff tests/agcl/parsifal/detag.syn @ 0:13d2b8934445

Import AnaGram (near-)release tree into Mercurial.
author David A. Holland
date Sat, 22 Dec 2007 17:52:45 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/agcl/parsifal/detag.syn	Sat Dec 22 17:52:45 2007 -0500
@@ -0,0 +1,179 @@
+{
+/*
+ detag.syn
+
+ Program to strip HTML tags from HTML files.
+ Copyright (c) 1996 - 1999 Parsifal Software, All
+ Rights Reserved.
+ See the file COPYING for license and usage terms.
+
+ For information about AnaGram, visit http://www.parsifalsoft.com.
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+
+}
+
+
+// -- CONFIGURATION SECTION ----------------------------
+// Parser configuration:
+//   default token type = int  -- tokens without an explicit type carry an int
+//   pointer input             -- input comes from memory; main() points
+//                                PCB.pointer at the string before calling detag()
+//   nest comments             -- presumably allows nested /* */ comments;
+//                                TODO confirm against the AnaGram manual
+[
+  default token type = int
+  pointer input
+  nest comments
+]
+//------------------------------------------------------
+
+// Character sets used by the productions below.  '~' is set complement
+// and '+' is set union, so each "... char" set reads as "any character
+// except the ones listed".
+eof = 0         // In this case, eof is the null char ending a string
+tag innard char = ~(eof + '<' + '>')
+comment char = ~(eof + '>')
+ordinary text char = ~(eof + '<')
+header type = '1-6'                 // for H1, H2,... H6
+
+// Start symbol: the whole input is html followed by the terminating
+// null character.  The token itself carries no value, hence (void).
+(void) input string $             // specify grammar token
+ -> html, eof
+
+// html is a mixture of text and tags; the surrounding brackets make the
+// whole sequence optional (NOTE(review): confirm the exact meaning of
+// the "/..." operator against the AnaGram manual).
+html
+ -> [text | tag...]/...     // virtual production w. alternating seq.
+
+// A tag runs from '<' to the next '>'; neither "tag innard char" nor
+// "comment char" can match a '>'.
+tag
+ -> '<', tag innards, '>'
+
+    /* Here we pick up some of the HTML start tags so 
+       we can insert a blank line in the output file for 
+       readability.  Note that because keywords take 
+       precedence in AnaGram, the keyword is chosen 
+       by the parser in preference to other input 
+       beginning with the same characters. Turned out 
+       that we didn't really want the blank line in 
+       some cases, so the reduction procedure was 
+       simply removed for those start tags - so, 
+       nothing happens when they get picked up.  */
+// Only the all-uppercase and all-lowercase spellings are listed as
+// keywords.  Productions ending in "= putc('\n', output);" write a
+// newline to the output file; the others are recognized but emit
+// nothing.  Everything else found inside a tag (closing tags included,
+// since they start with '/') is matched by the last alternative and
+// produces no output.
+tag innards 
+ -> {"HR" | "hr"}, other stuff                = putc('\n', output);
+ -> {"P" | "p"}, other stuff                  = putc('\n', output);
+ -> {"BR" | "br"}, other stuff
+ -> {"H" | "h"}, header type, other stuff      = putc('\n', output);
+ -> {"HEAD" | "head" | "HTML" | "html"}, other stuff  // avoid getting Hn
+ -> {"UL" | "ul"}, other stuff
+ -> {"OL" | "ol"}, other stuff
+ -> {"DL" | "dl"}, other stuff
+ -> {"LI" | "li"}, other stuff
+ -> {"TABLE" | "table"}, other stuff          = putc('\n', output);
+ -> {"TR" | "tr"}, other stuff
+ -> {"TD" | "td"}, other stuff
+ -> {"PRE" | "pre"}, other stuff
+ -> "!--", comment stuff
+ -> other stuff
+
+// Rest of a tag: zero or more characters other than '<', '>' and the
+// terminating null.  No reduction procedure, so nothing is written.
+other stuff
+ -> tag innard char?...
+
+// Body of a "<!--" comment: zero or more characters up to, but not
+// including, the first '>'.  A '>' inside a comment therefore ends the
+// tag early and the remainder of the comment is parsed as text.
+comment stuff       // treatment of comments is a bit simple-minded here
+ -> comment char?...
+
+// One or more text chars, written as a left-recursive list.
+text                  // text is what we want to copy to output file
+ ->text char
+ ->text, text char
+
+
+// A single unit of text.  An ordinary character is bound to c and
+// copied to the output file unchanged; the entity keywords are handled
+// by "entity text char".
+text char
+ -> ordinary text char:c              = putc(c, output);
+ -> entity text char
+
+// Each listed entity writes the single character it stands for.  Only
+// these exact lowercase spellings, including the ';', are keywords; any
+// other '&' is an ordinary text char and is copied through as is.
+entity text char         // we only bother with 3 entities here
+ -> "&lt;"                            = putc('<', output);
+ -> "&gt;"                            = putc('>', output);
+ -> "&amp;"                           = putc('&', output);
+
+
+{ // ----- Embedded C ---------------------------
+
+/* Destination stream.  Opened by main() and written with putc() by the
+   reduction procedures in the grammar. */
+FILE *output;
+
+/*
+ * Strip the tags from an HTML file.
+ *
+ * Usage: <program> <input filename> <output filename>
+ *
+ * The input file is read into a null-terminated buffer which is handed
+ * to the parser through PCB.pointer; the reduction procedures write the
+ * remaining text to the global output stream.
+ *
+ * Returns 0 once the parser has run (an unsuccessful parse is reported
+ * on stdout but does not change the exit status), 1 for a usage error
+ * and 2 through 8 for the file, memory and read failures reported below.
+ */
+int main(int argc, char *argv[]) {
+
+  FILE *input = NULL;
+  char *inString = NULL;
+  long fileLength;      /* long, so the -1L error return of ftell shows */
+  size_t stringLength;
+  int status = 0;
+
+
+  /* Check for enough arguments */
+  if (argc != 3) {
+    printf("Program to strip HTML tags from a file\n"
+     "Usage: %s <input filename> <output filename>\n", argv[0]);
+    return 1;
+  }
+
+  /* Open input file for reading only */
+  input = fopen(argv[1],"r");
+  if (input == NULL) {
+    printf("Cannot open %s\n", argv[1]);
+    return 2;
+  }
+
+  /* find out how big the file is */
+  if (fseek(input, 0L, SEEK_END)) {
+    printf("Strange problems with %s\n", argv[1]);
+    status = 3;
+    goto cleanup;
+  }
+  fileLength = ftell(input);
+  if (fileLength < 0 ) {    // -1L is error return
+    printf("Error getting file length (%ld) of %s\n", fileLength, argv[1]);
+    status = 4;
+    goto cleanup;
+  }
+
+  /* fseek to beginning of file */
+  if (fseek(input, 0L, SEEK_SET)) {
+    printf("Strange problems with %s\n", argv[1]);
+    status = 5;
+    goto cleanup;
+  }
+
+  /* Allocate storage for input string, plus one byte for the
+     terminating null that the grammar treats as eof.  The cast is kept
+     in case the generated parser is compiled as C++. */
+  inString = (char*)malloc((size_t)fileLength + 1);
+  if (inString == NULL) {
+    printf("Insufficient memory\n");
+    status = 6;
+    goto cleanup;
+  }
+
+  /* Read file.  In text mode fewer bytes than fileLength may come back,
+     so the count actually read decides where the terminator goes. */
+  stringLength = fread(inString, 1, (size_t)fileLength, input);
+  if (stringLength == 0) {
+    printf("Unable to read %s\n", argv[1]);
+    status = 7;
+    goto cleanup;
+  }
+  inString[stringLength] = 0;
+
+
+  /* Open output file for writing only */
+  output = fopen(argv[2],"w");
+  if (output == NULL) {
+    printf("Cannot open %s\n", argv[2]);
+    status = 8;
+    goto cleanup;
+  }
+
+
+  /* Invoke parser */
+  PCB.pointer = (unsigned char *)inString;  // using pointer input
+  detag();
+  if (PCB.exit_flag != 1) {
+    printf( "Unsuccessful termination of parse, PCB.exit_flag = %d\n",
+      PCB.exit_flag);
+  }
+  printf( " End detag ");
+
+
+  /* Done: single exit path, releases whatever was acquired above */
+cleanup:
+  free(inString);
+  if (input != NULL) {
+    fclose(input);
+  }
+  if (output != NULL) {
+    fclose(output);
+  }
+  return status;
+}
+
+}  // --------- End of Embedded C --------------------
+
+