program dehtml(input, output);
(* dehtml: remove html structures.
   Copies input to output one line at a time, dropping every character
   from a '<' through the next '>' inclusive. *)
const
(* begin module version *)
version = 1.00; (* of dehtml.p 1998 January 3
origin 1998 jan 3 from decom *)
(* end module version *)
(* begin module describe.dehtml *)
(*
name
dehtml: remove html structures
synopsis
dehtml(input: in; output: out)
files
input: a program having HTML tags
output: the same program with HTML tags removed
description
HTML tags are indicated by < and >. All text between these and including
them is removed.
see also
decom.p
author
Thomas Dana Schneider
bugs
technical notes
*)
(* end module describe.dehtml *)
const
  debug = false; (* set to true to trace the scanner: a '1' or '2' is
                    written after each character read while in that state *)
var
  c: char;        (* the character most recently read from input *)
  state: integer; (* state of the scanner, updated after every character:
                     0: outside a tag; the character is copied to output
                     1: inside a tag ('<' seen, no '>' yet); nothing is copied
                     2: the closing '>' was just read; it is not copied, and
                        the next character selects state 1 (if it is '<')
                        or state 0 (anything else)
                  *)
begin
  (* Begin in plain text (state 0) and give c a defined value before the
     first read. *)
  c := ' ';
  state := 0;

  while not eof(input) do begin
    (* Process one input line.  The state is kept from line to line, so a
       tag may extend over several lines. *)
    while not eoln(input) do begin
      read(input, c);

      (* Advance the scanner on the character just read. *)
      if state = 1 then begin
        (* inside a tag: only '>' can end it *)
        if c = '>' then state := 2
      end
      else if c = '<' then
        state := 1 (* from plain text, or right after a '>': a tag opens *)
      else
        state := 0; (* plain text; this also ends the one-character
                       delay that follows a closing '>' *)

      (* Only plain text is copied.  The closing '>' is seen while the
         state is 2, so it is dropped together with the rest of the tag. *)
      if state = 0 then write(output, c);

      (* Optional trace: emit the state digit for each dropped character. *)
      if debug and (state <> 0) then write(output, state:1)
    end;

    (* Each input line produces one output line, even when nothing on it
       was copied. *)
    readln(input);
    writeln(output)
  end
end.