Mercurial > ~dholland > hg > ag > index.cgi
view tests/agcl/parsifal/detag2.syn @ 24:a4899cdfc2d6 default tip
Obfuscate the regexps to strip off the IBM compiler's copyright banners.
I don't want bots scanning github to think they're real copyright
notices because that could cause real problems.
author | David A. Holland |
---|---|
date | Mon, 13 Jun 2022 00:40:23 -0400 |
parents | 13d2b8934445 |
children |
line wrap: on
line source
{ /* detag.syn Program to strip HTML tags from HTML files. Copyright (c) 1996 - 1999 Parsifal Software, All Rights Reserved. See the file COPYING for license and usage terms. For information about AnaGram, visit http://www.parsifalsoft.com. */ #include <stdio.h> } // -- CONFIGURATION SECTION ---------------------------- [ default token type = int pointer input nest comments test file mask = "*.htm*" ~default reductions ] //------------------------------------------------------ eof = 0 // In this case, eof is the null char ending a string tag innard char = ~(eof + '<' + '>') comment char = ~(eof + '>') ordinary text char = ~(eof + '<') header type = '1-6' // for H1, H2,... H6 (void) input string $ // specify grammar token -> html, eof html -> [text | tag...]/... // virtual production w. alternating seq. tag -> '<', tag innards, '>' /* Here we pick up some of the HTML start tags so we can insert a blank line in the output file for readability. Note that because keywords take precedence in AnaGram, the keyword is chosen by the parser in preference to other input beginning with the same characters. Turned out that we didn't really want the blank line in some cases, so the reduction procedure was simply removed for those start tags - so, nothing happens when they get picked up. */ tag innards -> {"HR" | "hr"}, other stuff = putc('\n', output); -> {"P" | "p"}, other stuff = putc('\n', output); -> {"BR" | "br"}, other stuff -> {"H" | "h"}, header type, other stuff = putc('\n', output); -> {"HEAD" | "head" | "HTML" | "html"}, other stuff // avoid getting Hn -> {"UL" | "ul"}, other stuff -> {"OL" | "ol"}, other stuff -> {"DL" | "dl"}, other stuff -> {"LI" | "li"}, other stuff -> {"TABLE" | "table"}, other stuff = putc('\n', output); -> {"TR" | "tr"}, other stuff -> {"TD" | "td"}, other stuff -> {"PRE" | "pre"}, other stuff -> "!--", comment stuff -> other stuff other stuff -> tag innard char?... comment stuff // treatment of comments is a bit simple-minded here -> comment char?... text // text is what we want to copy to output file ->text char ->text, text char text char -> ordinary text char:c = putc(c, output); -> entity text char entity text char // we only bother with 3 entities here -> "<" = putc('<', output); -> ">" = putc('>', output); -> "&" = putc('&', output); { // ----- Embedded C --------------------------- FILE *output; int main(int argc, char *argv[]) { FILE *input; size_t fileLength; size_t stringLength; int errorFlag = 0; char *inString; /* Check for enough arguments */ if (argc != 3) { printf("Program to strip HTML tags from a file\n" "Usage: %s <input filename> <output filename>\n", argv[0]); return 1; } /* Open input file for reading only */ input = fopen(argv[1],"r"); if (input == NULL) { printf("Cannot open %s\n", argv[1]); return 2; } /* find out how big the file is */ if (fseek(input, SEEK_SET, SEEK_END)) { printf("Strange problems with %s\n", argv[1]); return 3; } fileLength = ftell(input); if (fileLength < 0 ) { // -1L is error return printf("Error getting file length (%d) of %s\n", fileLength, argv[1]); return 4; } /* fseek to beginning of file */ if (fseek(input, 0, SEEK_SET)) { printf("Strange problems with %s\n", argv[1]); return 5; } /* Allocate storage for input string */ inString = (char*)malloc(fileLength + 1); if (inString == NULL) { printf("Insufficient memory\n"); return 6; } /* Read file */ stringLength = fread(inString, 1, fileLength, input); if (stringLength == 0) { printf("Unable to read %s\n", argv[1]); return 7; } inString[stringLength] = 0; /* Open output file for writing only */ output = fopen(argv[2],"w"); if (output == NULL) { printf("Cannot open %s\n", argv[2]); free(inString); fclose(input); return 8; } /* Invoke parser */ PCB.pointer = (unsigned char *)inString; // using pointer input detag(); if (PCB.exit_flag != 1) { printf( "Unsuccessful termination of parse, PCB.exit_flag = %d\n", PCB.exit_flag); } /* Done */ free(inString); fclose(input); fclose(output); printf( " End detag "); return 0; } } // --------- End of Embedded C --------------------