program dbclean(dbin, dbout); (* dbclean: remove non-entry material Tom Schneider NCI/FCRDC Bldg 469. Room 144 P.O. Box B Frederick, MD 21702-1201 (301) 846-5581 (-5532 for messages) toms@ncifcrf.gov http://www-lmmb.ncifcrf.gov/~toms/ National Cancer Institute Laboratory of Mathematical Biology *) label 1; (* end of program *) const (* begin module version *) version = 1.05; (* of dbclean.p 1996 Oct 8 origin 1996 August 24 *) (* end module version *) (* begin module describe.dbclean *) (* name dbclean: remove non-entry material synopsis dbclean(dbin: in, dbout: out) files dbin: a file containing genbank flat-file entries and any other junk. dbout: just the genbank entires without the junk. output: messages to the user description The retrieve@ncbi.nlm.nih.gov will return genbank entries with other junk. This program removes the junk. examples documentation see also author Thomas Dana Schneider bugs technical notes *) (* end module describe.dbclean *) (* begin module interact.const *) maxstring = 150; (* the maximum string *) (* end module interact.const version = 4.16; (@ of prgmod.p 1996 August 12 *) (* begin module my.filler.const *) fillermax = 20; (* the size of the filler array for a string *) (* end module my.filler.const *) (* from filler.const version = 4.13; (@ of prgmod.p 1994 sep 5 *) type (* begin module interact.type *) string = record (* a string of characters *) letters: array[1..maxstring] of char; (* the letters in the string *) length: integer; (* the number of characters in the string *) current: integer; (* the letter we are working on *) end; (* end module interact.type version = 4.16; (@ of prgmod.p 1996 August 12 *) (* begin module filler.type *) (* the following is an array used to fill a string. it is convenient to have it much shorter than the maxstring, so that it is easy to fill the string using procedure fillstring. the user must declare the value of constant fillermax. *) filler = packed array[1..fillermax] of char; (* end module filler.type version = 4.16; (@ of prgmod.p 1996 August 12 *) (* begin module trigger.type *) trigger = record (* an object to be searched for *) seek: string; (* the characters looked for *) state: integer; (* how close to triggering we are *) skip: boolean; (* trigger not found- skip the line *) found: boolean (* the trigger was found *) end; (* end module trigger.type version = 4.16; (@ of prgmod.p 1996 August 12 *) var dbin, dbout: text; (* files used by this program *) (* begin module halt *) procedure halt; (* stop the program. the procedure performs a goto to the end of the program. you must have a label: label 1; declared, and also the end of the program must have this label: 1: end. examples are in the module libraries. this is the only goto in the delila system. *) begin writeln(output,' program halt.'); goto 1 end; (* end module halt version = 4.16; (@ of prgmod.p 1996 August 12 *) (* begin module interact.clearstring *) procedure clearstring(var ribbon: string); (* empty the string *) var index: integer; (* to the ribbon *) begin (* clearstring *) with ribbon do begin for index := 1 to maxstring do letters[index] := ' '; length := 0; current := 0; end end; (* clearstring *) (* end module interact.clearstring version = 4.16; (@ of prgmod.p 1996 August 12 *) (* begin module interact.writestring *) procedure writestring(var tofile: text; var s: string); (* write the string s to file tofile, no writeln *) var i: integer; (* index to s *) begin (* writestring *) with s do for i := 1 to length do write(tofile, letters[i]) end; (* writestring *) (* end module interact.writestring version = 4.16; (@ of prgmod.p 1996 August 12 *) (* begin module filler.fillstring *) procedure fillstring(var s: string; a: filler); (* this procedure makes it reasonably easy to fill the string s with characters. one calls the procedure as: *) (* 1 2 3 4 5 *) (* 12345678901234567890123456789012345678901234567890 *) (* fillstring(s, 'this-is-the-string '); the two comments make it easy to line the characters up. also, for this example, it was assumed that the length of filler as defined by the constant fillermax was 50. *) var length: integer; (* of the string without trailing blanks *) index: integer; (* of s *) begin (* fillstring *) clearstring(s); length := fillermax; while (length > 1) and (a[length] = ' ') do length := pred(length); if (length = 1) and (a[length] = ' ') then begin writeln(output, 'fillstring: the string is empty'); halt end; for index := 1 to length do s.letters[index] := a[index]; s.length := length; s.current := 1 end; (* fillstring *) (* end module filler.fillstring version = 4.16; (@ of prgmod.p 1996 August 12 *) (* begin module filler.filltrigger *) procedure filltrigger(var t: trigger; a: filler); (* fill the trigger t *) begin (* filltrigger *) fillstring(t.seek,a) end; (* fillstring *) (* end module filler.filltrigger version = 4.16; (@ of prgmod.p 1996 August 12 *) (* begin module trigger.proc *) (* this module allows one to scan a series of characters, as from an array or a file, and to "trigger" or detect a simple string in the series. the advantage of the trigger is that several triggers can "observe" a stream of characters at once, each looking for a different thing. some other modules required: interact.const, interact.type *) procedure resettrigger(var t: trigger); (* reset the trigger to ground state *) begin (* resettrigger *) with t do begin state := 0; skip := false; found := false end end; (* resettrigger *) procedure testfortrigger(ch: char; var t: trigger); (* look at the character ch. if it is part of the trigger (at the current trigger state), then the trigger state goes higher. if it is not part of the trigger then the trigger state is reset, skip is true and one should skip onward to find the trigger. if the trigger is found, found is true. *) begin (* testfortrigger *) with t do begin state := succ(state); { writestring(output,seek); writeln(output,'testfortrigger seek.letters[',state:1,']:"', seek.letters[state],'" ch:"',ch,'"'); writeln(output,'ord(ch) = ',ord(ch):1); writeln(output,'ord(seek.letters[state]) = ',ord(seek.letters[state]):1); } (* if debugging then begin writestring(list,seek); writeln(list,'testfortrigger seek.letters[',state:1,']:', seek.letters[state],' ch:',ch); end;*) if seek.letters[state] = ch then begin skip := false; if state = seek.length then found := true else found := false end else begin (* reset trigger *) state := 0; skip := true; found := false end end end; (* testfortrigger *) (* end module trigger.proc version = 4.16; (@ of prgmod.p 1996 August 12 *) (* begin module copyaline *) procedure copyaline(var fin, fout: text); (* copy a line from file fin to file fout *) begin (* copyaline *) while not eoln(fin) do begin fout^ := fin^; put(fout); get(fin) end; readln(fin); writeln(fout); end; (* copyaline *) (* end module copyaline version = 4.16; (@ of prgmod.p 1996 August 12 *) (* begin module copyline *) procedure copyline(var fin, fout: text); (* copy a line from file fin to file fout but DO NOT CARRIAGE RETURN on the fout. Carriage return on the fin. *) begin (* copyline *) while not eoln(fin) do begin fout^ := fin^; put(fout); get(fin) end; readln(fin); end; (* copyline *) (* end module copyline version = 4.16; (@ of prgmod.p 1996 August 12 *) (* begin module skipblanks *) procedure skipblanks(var thefile: text); (* skip over blanks until a non-blank, or end of line, is found *) begin while (thefile^ = ' ') and not eoln(thefile) do get(thefile); end; procedure skipnonblanks(var thefile: text); (* skip over nonblanks until a blank, or end of line, is found *) begin while (thefile^ <> ' ') and not eoln(thefile) do get(thefile); end; procedure skipcolumn(var thefile: text); (* skip over a data column *) begin skipblanks(thefile); skipnonblanks(thefile) end; (* end module skipblanks version = 4.16; (@ of prgmod.p 1996 August 12 *) (* begin module grabtoken *) procedure grabtoken(var thefile: text; var thestring: string); (* skip any blanks and then grab the next token from the file *) var c: char; (* a character in thefile *) done: boolean; (* done finding the name *) begin skipblanks(thefile); done := false; with thestring do begin length := 0; while not done do begin if eoln(thefile) then done := true else begin read(thefile,c); if c = ' ' then done := true else begin length := succ(length); letters[length] := c; end end end end end; (* end module grabtoken version = 4.16; (@ of prgmod.p 1996 August 12 *) (* begin module dbclean.themain *) procedure themain(var dbin, dbout: text); (* the main procedure of the program *) var c: char; (* a character in db *) done: boolean; (* done copying locus? *) linenumber: integer; (* count of the lines in db *) locus: trigger; (* trigger to find the LOCUS pattern *) locusend: trigger; (* trigger to find the end of the LOCUS pattern *) locuscount: integer; (* count of locus *) begin writeln(output,'dbclean ',version:4:2); reset(dbin); rewrite(dbout); (* 1 2 3 4 5 *) (* 12345678901234567890123456789012345678901234567890 *) filltrigger(locus, 'LOCUS '); filltrigger(locusend, '// '); resettrigger(locus); resettrigger(locusend); locuscount := 0; linenumber := 1; (* we are already at the first line *) while not eof(dbin) do begin read(dbin, c); testfortrigger(c,locus); if locus.found then begin locuscount := succ(locuscount); write(dbout,'LOCUS'); (* copy the locus *) done := false; while not done do begin while not eoln(dbin) do begin read(dbin, c); write(dbout, c); testfortrigger(c,locusend); if locusend.found then begin (* no need to copy the locusend, but add blank: *) writeln(dbout); done := true; end end; resettrigger(locus); resettrigger(locusend); readln(dbin); writeln(dbout); if eof(dbin) then begin if not done then begin writeln(output,'BAD DATABASE:', ' end of dbin before end of an entry'); halt end; end; end; end; if not eof(dbin) then if eoln(dbin) then readln(dbin) end; if locuscount <> 1 then writeln(output,locuscount:1,' loci') else writeln(output,locuscount:1,' locus'); end; (* end module dbclean.themain *) begin themain(dbin, dbout); 1: end.