view tests/agcl/parsifal/detag1.syn @ 15:f5acaf0c8a29

Don't cast through "volatile int". Causes a gcc warning nowadays. XXX: should put something else back here to frighten the optimizer
author David A. Holland
date Tue, 31 May 2022 01:00:55 -0400
parents 13d2b8934445
children
line wrap: on
line source

{
/*
 detag.syn

 Program to strip HTML tags from HTML files.
 Copyright (c) 1996 - 1999 Parsifal Software, All
 Rights Reserved.
 See the file COPYING for license and usage terms.

 For information about AnaGram, visit http://www.parsifalsoft.com.
*/

#include <stdio.h>
#include <stdlib.h>

}


// -- CONFIGURATION SECTION ----------------------------
// AnaGram configuration parameters for this grammar:
//   default token type - presumably the C type given to tokens that do
//                        not declare a type of their own; confirm
//                        against the AnaGram manual
//   pointer input      - the parser reads its input through PCB.pointer,
//                        which main() sets to the in-memory file contents
//   nest comments      - presumably allows comments to nest in this
//                        file; confirm against the AnaGram manual
//   test file mask     - presumably the file pattern used by AnaGram's
//                        interactive test tools; confirm against the manual
[
  default token type = int
  pointer input
  nest comments
	test file mask = "*.htm*"
]
//------------------------------------------------------

// Character set definitions.  In AnaGram set notation '~' is complement
// and '+' is union, so each "... char" set below reads as "any character
// except the ones listed".
eof = 0         // In this case, eof is the null char ending a string
tag innard char = ~(eof + '<' + '>')    // what may appear between '<' and '>'
comment char = ~(eof + '>')             // like tag innard char, but '<' is allowed
ordinary text char = ~(eof + '<')       // text outside tags; note '>' and '&' are included
header type = '1-6'                 // for H1, H2,... H6

// Start symbol: the whole input is the html content followed by the
// terminating null character (eof).  The "(void)" prefix presumably
// declares that this token carries no value -- confirm against the
// AnaGram manual.
(void) input string $             // specify grammar token
 -> html, eof

// The document body: optional (the [...] brackets), and made up of runs
// of text and runs of tags.  The "/..." form is the alternating sequence
// mentioned in the trailing comment, i.e. the two kinds of run take turns.
html
 -> [text | tag...]/...     // virtual production w. alternating seq.

// A tag: '<', its contents, and the closing '>'.  Nothing is copied to
// the output for a tag except the newline some "tag innards" rules emit.
tag
 -> '<', tag innards, '>'


    /* Here we pick up some of the HTML start tags so
       we can insert a blank line in the output file for
       readability.  Note that because keywords take
       precedence in AnaGram, the keyword is chosen
       by the parser in preference to other input
       beginning with the same characters. Turned out
       that we didn't really want the blank line in
       some cases, so the reduction procedure was
       simply removed for those start tags - so,
       nothing happens when they get picked up.  */
    /* Summary of the rules below: a rule ending in
       "= putc('\n', output);" writes one newline to the
       output file when that tag is recognized; a rule
       with no reduction procedure recognizes the tag and
       writes nothing.  Only the all-upper-case and
       all-lower-case spellings of each tag name are
       listed as keywords.  */
tag innards
 -> {"HR" | "hr"}, other stuff                = putc('\n', output);
 -> {"P" | "p"}, other stuff                  = putc('\n', output);
 -> {"BR" | "br"}, other stuff
 -> {"H" | "h"}, header type, other stuff      = putc('\n', output);
 -> {"HEAD" | "head" | "HTML" | "html"}, other stuff  // avoid getting Hn
 -> {"UL" | "ul"}, other stuff
 -> {"OL" | "ol"}, other stuff
 -> {"DL" | "dl"}, other stuff
 -> {"LI" | "li"}, other stuff
 -> {"TABLE" | "table"}, other stuff          = putc('\n', output);
 -> {"TR" | "tr"}, other stuff
 -> {"TD" | "td"}, other stuff
 -> {"PRE" | "pre"}, other stuff
 -> "!--", comment stuff                      // start of an HTML comment
 -> other stuff                               // any tag not listed above: nothing is written

// The rest of a tag's contents (attributes etc.): an optional run of
// tag innard chars.  No reduction procedure, so nothing is written.
other stuff
 -> tag innard char?...

// Body of an HTML comment: an optional run of comment chars.  Because
// comment char excludes only eof and '>', the comment is taken to end at
// the first '>' rather than at "-->".
comment stuff       // treatment of comments is a bit simple-minded here
 -> comment char?...

// A run of one or more text chars, written as a left-recursive list.
// The copying itself is done by the reduction procedures on "text char"
// and "entity text char".
text                  // text is what we want to copy to output file
 ->text char
 ->text, text char


// One unit of text: either a single ordinary character, whose value is
// bound to c and written to the output file unchanged, or one of the
// recognized character entities.
text char
 -> ordinary text char:c              = putc(c, output);
 -> entity text char

// Each recognized entity is written to the output file as the single
// character it stands for.  Entities not listed here are presumably
// copied through character by character, since '&' is also an ordinary
// text char -- verify against actual parser behavior.
entity text char         // we only bother with 3 entities here
 -> "&lt;"                            = putc('<', output);
 -> "&gt;"                            = putc('>', output);
 -> "&amp;"                           = putc('&', output);


{ // ----- Embedded C ---------------------------

/* Output stream: opened by main() on argv[2] and written to by the
   putc() calls in the grammar's reduction procedures. */
FILE *output;

/*
 * Strip the HTML tags from the file named by argv[1] and write the
 * remaining text to the file named by argv[2].
 *
 * The whole input file is read into a null-terminated buffer which the
 * parser consumes through PCB.pointer; the grammar's reduction
 * procedures write the surviving text to the global `output` stream.
 *
 * Exit status:
 *   0  the parser was run (a parse failure is reported on stdout but
 *      does not change the status)
 *   1  wrong number of arguments
 *   2  cannot open the input file
 *   3  cannot seek to the end of the input file
 *   4  cannot determine the input file length
 *   5  cannot seek back to the start of the input file
 *   6  out of memory
 *   7  nothing could be read (this includes an empty input file)
 *   8  cannot open the output file
 *
 * Every path that has opened the input file or allocated the buffer
 * leaves through the common cleanup at `done`, so neither is leaked.
 */
int main(int argc, char *argv[]) {

  FILE *input;
  long fileLength;          /* ftell() returns long; -1L means failure */
  size_t stringLength;
  char *inString = NULL;
  int status = 0;


  /* Check for enough arguments */
  if (argc != 3) {
    printf("Program to strip HTML tags from a file\n"
     "Usage: %s <input filename> <output filename>\n", argv[0]);
    return 1;
  }

  /* Open input file for reading only */
  input = fopen(argv[1],"r");
  if (input == NULL) {
    printf("Cannot open %s\n", argv[1]);
    return 2;
  }

  /* find out how big the file is: seek to offset 0 from the end */
  if (fseek(input, 0L, SEEK_END)) {
    printf("Strange problems with %s\n", argv[1]);
    status = 3;
    goto done;
  }
  fileLength = ftell(input);
  if (fileLength < 0) {    // -1L is error return
    printf("Error getting file length (%ld) of %s\n", fileLength, argv[1]);
    status = 4;
    goto done;
  }

  /* fseek to beginning of file */
  if (fseek(input, 0L, SEEK_SET)) {
    printf("Strange problems with %s\n", argv[1]);
    status = 5;
    goto done;
  }

  /* Allocate storage for input string, plus one byte for the null */
  inString = (char*)malloc((size_t)fileLength + 1);
  if (inString == NULL) {
    printf("Insufficient memory\n");
    status = 6;
    goto done;
  }

  /* Read file.  In text mode fread may deliver fewer bytes than the
     length reported by ftell, so the count actually read is what
     positions the terminator. */
  stringLength = fread(inString, 1, (size_t)fileLength, input);
  if (stringLength == 0) {
    printf("Unable to read %s\n", argv[1]);
    status = 7;
    goto done;
  }
  inString[stringLength] = 0;   // the grammar's eof is this null char


  /* Open output file for writing only */
  output = fopen(argv[2],"w");
  if (output == NULL) {
    printf("Cannot open %s\n", argv[2]);
    status = 8;
    goto done;
  }


  /* Invoke parser */
  PCB.pointer = (unsigned char *)inString;  // using pointer input
  detag();
  if (PCB.exit_flag != 1) {
    printf( "Unsuccessful termination of parse, PCB.exit_flag = %d\n",
      PCB.exit_flag);
  }

  fclose(output);
  printf( " End detag ");

  /* Done: release whatever was acquired (free(NULL) is a no-op) */
done:
  free(inString);
  fclose(input);
  return status;
}

}  // --------- End of Embedded C --------------------