view tests/agcl/parsifal/detag2.syn @ 24:a4899cdfc2d6 default tip

Obfuscate the regexps to strip off the IBM compiler's copyright banners. I don't want bots scanning github to think they're real copyright notices because that could cause real problems.
author David A. Holland
date Mon, 13 Jun 2022 00:40:23 -0400
parents 13d2b8934445
children
line wrap: on
line source

{
/*
 detag.syn

 Program to strip HTML tags from HTML files.
 Copyright (c) 1996 - 1999 Parsifal Software, All
 Rights Reserved.
 See the file COPYING for license and usage terms.

 For information about AnaGram, visit http://www.parsifalsoft.com.
*/

#include <stdio.h>
#include <stdlib.h>

}


// -- CONFIGURATION SECTION ----------------------------
// Parser-generator switches for this grammar.
// "pointer input" pairs with main(), which hands the parser its text by
// setting PCB.pointer to the NUL-terminated buffer it has read.
// NOTE(review): the other switches are not exercised by anything visible
// in this file; check the AnaGram manual before changing any of them.
[
  default token type = int
  pointer input
  nest comments
	test file mask = "*.htm*"
	~default reductions
]
//------------------------------------------------------

// Character sets used by the grammar.  Each "... char" set is written as
// the complement (~) of a union (+), i.e. "any character except these".
eof = 0         // In this case, eof is the null char ending a string
tag innard char = ~(eof + '<' + '>')     // anything that can sit between '<' and '>'
comment char = ~(eof + '>')              // like tag innard char, but '<' is allowed too
ordinary text char = ~(eof + '<')        // note: a bare '>' counts as ordinary text
header type = '1-6'                 // for H1, H2,... H6

// Start symbol: the whole document followed by the terminating NUL that
// main() stores at the end of the input buffer.
(void) input string $             // specify grammar token
 -> html, eof

// Document body: runs of text alternating with runs of one or more tags.
// NOTE(review): the enclosing [...] presumably makes the body optional, so
// that an input holding only the NUL still parses -- confirm.
html
 -> [text | tag...]/...     // virtual production w. alternating seq.

// One tag: '<', its contents, and the closing '>'.  Nothing in this rule
// writes to the output; only some "tag innards" alternatives emit a newline.
tag
 -> '<', tag innards, '>'


    /* Here we pick up some of the HTML start tags so
       we can insert a newline in the output file for
       readability.  Note that because keywords take
       precedence in AnaGram, the keyword is chosen
       by the parser in preference to other input
       beginning with the same characters. Turned out
       that we didn't really want the newline in
       some cases, so the reduction procedure was
       simply removed for those start tags - so,
       nothing happens when they get picked up.

       Only the all-uppercase and all-lowercase spellings
       are listed.  Anything else (mixed case such as "Hr",
       end tags, unlisted tags) matches the final
       "other stuff" rule and is discarded silently.  */
tag innards
 -> {"HR" | "hr"}, other stuff                = putc('\n', output);
 -> {"P" | "p"}, other stuff                  = putc('\n', output);
 -> {"BR" | "br"}, other stuff
 -> {"H" | "h"}, header type, other stuff      = putc('\n', output);
 -> {"HEAD" | "head" | "HTML" | "html"}, other stuff  // avoid getting Hn
 -> {"UL" | "ul"}, other stuff
 -> {"OL" | "ol"}, other stuff
 -> {"DL" | "dl"}, other stuff
 -> {"LI" | "li"}, other stuff
 -> {"TABLE" | "table"}, other stuff          = putc('\n', output);
 -> {"TR" | "tr"}, other stuff
 -> {"TD" | "td"}, other stuff
 -> {"PRE" | "pre"}, other stuff
 -> "!--", comment stuff
 -> other stuff

// Remainder of a tag (attributes etc.): zero or more characters other than
// '<', '>' and NUL.  There is no reduction procedure, so it is dropped.
other stuff
 -> tag innard char?...

// Contents of a "<!--" construct.  The set excludes only NUL and '>', so the
// comment is taken to end at the first '>' rather than at "-->"; whatever
// follows that '>' is parsed as ordinary document content again.
comment stuff       // treatment of comments is a bit simple-minded here
 -> comment char?...

// A run of one or more text characters (left-recursive list).  The copying
// itself happens per character, in the "text char" reduction procedures.
text                  // text is what we want to copy to output file
 ->text char
 ->text, text char


// A single unit of text: either one ordinary character, which is copied to
// the output unchanged, or one of the recognized entities, which is
// translated by the "entity text char" rules.
text char
 -> ordinary text char:c              = putc(c, output);
 -> entity text char

// Entity keywords, each replaced by the single character it stands for.
// Only the exact lowercase spellings below are listed; any other '&'
// sequence is not matched here.
entity text char         // we only bother with 3 entities here
 -> "&lt;"                            = putc('<', output);
 -> "&gt;"                            = putc('>', output);
 -> "&amp;"                           = putc('&', output);


{ // ----- Embedded C ---------------------------

/* Destination stream.  It is opened by main() and written to with putc()
   by the reduction procedures in the grammar above. */
FILE *output;

/*
 * Entry point: detag <input filename> <output filename>
 *
 * Reads the whole input file into a NUL-terminated buffer, opens the
 * output file, and runs the generated parser over the buffer.  The
 * reduction procedures in the grammar write the stripped text to the
 * global stream "output".
 *
 * Exit status:
 *   0  parser was run (a failed parse is reported on stdout but still
 *      yields 0, as before)
 *   1  wrong number of arguments
 *   2  input file cannot be opened
 *   3  seek to end of input failed
 *   4  ftell failed
 *   5  seek back to start of input failed
 *   6  out of memory
 *   7  nothing could be read from the input (this includes an empty file)
 *   8  output file cannot be opened
 *
 * Every path taken after the input file has been opened leaves through
 * the cleanup label, so the buffer and both streams are always released.
 */
int main(int argc, char *argv[]) {

  /* All locals are declared up front so that no goto jumps over an
     initialization; the generated parser file may be built as C++. */
  FILE *input = NULL;
  long fileLength = 0;        /* signed on purpose: ftell reports failure as -1L */
  size_t stringLength = 0;
  char *inString = NULL;
  int status = 0;


  /* Check for enough arguments */
  if (argc != 3) {
    printf("Program to strip HTML tags from a file\n"
     "Usage: %s <input filename> <output filename>\n", argv[0]);
    return 1;
  }

  /* Open input file for reading only */
  input = fopen(argv[1],"r");
  if (input == NULL) {
    printf("Cannot open %s\n", argv[1]);
    return 2;
  }

  /* find out how big the file is: offset 0 relative to the end */
  if (fseek(input, 0L, SEEK_END)) {
    printf("Strange problems with %s\n", argv[1]);
    status = 3;
    goto cleanup;
  }
  fileLength = ftell(input);
  if (fileLength < 0) {    // -1L is error return
    printf("Error getting file length (%ld) of %s\n", fileLength, argv[1]);
    status = 4;
    goto cleanup;
  }

  /* fseek to beginning of file */
  if (fseek(input, 0L, SEEK_SET)) {
    printf("Strange problems with %s\n", argv[1]);
    status = 5;
    goto cleanup;
  }

  /* Allocate storage for input string, plus one byte for the NUL that
     the grammar treats as end of input */
  inString = (char *)malloc((size_t)fileLength + 1);
  if (inString == NULL) {
    printf("Insufficient memory\n");
    status = 6;
    goto cleanup;
  }

  /* Read file.  In text mode fewer bytes than fileLength may come back,
     so the terminator goes after what was actually read. */
  stringLength = fread(inString, 1, (size_t)fileLength, input);
  if (stringLength == 0) {
    printf("Unable to read %s\n", argv[1]);
    status = 7;
    goto cleanup;
  }
  inString[stringLength] = 0;


  /* Open output file for writing only */
  output = fopen(argv[2],"w");
  if (output == NULL) {
    printf("Cannot open %s\n", argv[2]);
    status = 8;
    goto cleanup;
  }


  /* Invoke parser */
  PCB.pointer = (unsigned char *)inString;  // using pointer input
  detag();
  if (PCB.exit_flag != 1) {
    printf( "Unsuccessful termination of parse, PCB.exit_flag = %d\n",
      PCB.exit_flag);
  }


  /* Done */
  printf( " End detag ");

cleanup:
  /* input is always open here; free(NULL) is harmless; output is still
     NULL if the failure happened before it was opened */
  free(inString);
  fclose(input);
  if (output != NULL) {
    fclose(output);
  }
  return status;
}

}  // --------- End of Embedded C --------------------