program dehtml(input, output);
(* dehtml: remove html structures *)

const
(* begin module version *)
version = 1.00; (* of dehtml.p 1998 January 3
origin 1998 jan 3 from decom *)
(* end module version *)

(* begin module describe.dehtml *)
(*
name
   dehtml: remove html structures

synopsis
   dehtml(input: in; output: out)

files
   input:  a program having HTML tags
   output: the same program with HTML tags removed

description
   HTML tags are indicated by < and >.  All text between these and
   including them is removed.

see also
   decom.p

author
   Thomas Dana Schneider

bugs

technical notes
   The scanner mode is carried from one line to the next, so a tag may
   span several input lines.  One output line is written for every input
   line, even when no character of that line is copied.
*)
(* end module describe.dehtml *)

   debug = false; (* set to true to trace mode changes in the output *)

type
   (* Position of the scanner relative to an HTML tag:
        outside: in ordinary text, characters are copied
        intag:   a '<' has been seen and its '>' has not
        pasttag: the character just read was the closing '>' *)
   scanmode = (outside, intag, pasttag);

var
   ch: char;        (* the character most recently read *)
   mode: scanmode;  (* current scanner mode, kept across lines *)

begin
   mode := outside; (* the input is assumed to start in ordinary text *)
   ch := ' ';       (* give ch a defined value before the first read *)

   while not eof(input) do begin
      while not eoln(input) do begin
         read(input, ch);

         (* A '<' opens a tag no matter what the mode was.  Otherwise,
            inside a tag only '>' changes the mode, and in the two
            remaining modes the scanner is (back) in ordinary text. *)
         if ch = '<' then mode := intag
         else if mode = intag then begin
            if ch = '>' then mode := pasttag
         end
         else mode := outside;

         (* pasttag suppresses the '>' itself: it is neither copied
            nor treated as ordinary text *)
         if mode = outside then write(output, ch);

         if debug then begin
            if mode = intag then write(output, '1')
            else if mode = pasttag then write(output, '2')
         end
      end;

      (* finish the line: skip the input end-of-line, emit one on output *)
      readln(input);
      writeln(output)
   end
end.