program dbfilter (input, output, dbfilterp); (* dbfilter: filter GenBank databases to remove unwanted entries Mike Stephens, 1989 *) label 1; (* end of program *) const (* begin module version *) version = 1.08; (* of dbfilter.p 1992 November 1 origin 1989 August 29 *) (* end module version *) (* begin module describe.dbfilter *) (* name dbfilter: filter GenBank databases to remove unwanted entries synopsis dbfilter(input: in: output: out, dbfilterp: in) files input: a database of GenBank entries output: database after the filtration. When errors occur, the program halts and produces an error message at the end of the output file. dbfilterp: parameters to control the program FIRST LINE: the name of the organism to use, consisting of two parts (eg, Homo sapiens). description GenBank entries in input that contain the requested organsim are copied to output. The GenBank ORGANISM contains the two part genus/species name, such as: ORGANISM Homo sapiens Entries of an unwanted ORGANISM type are not copied from input to output. Those of the desired type are transferred directly. examples If dbfilterp contains: Homo sapiens then only those entries with the ORGANISM type Homo sapiens will be copied into output. All others will be filtered out. documentation none see also dbinst.p dbbk.p author R. Michael Stephens bugs Error messages are buried at the bottom of the output file. technical notes Constant maxlines determines the greatest number of lines that can be handled between LOCUS and ORGANISM. *) (* end module describe.dbfilter *) (* ************************************************************************ *) (* ************************************************************************ *) (* begin module dbfilter.const *) maxlines = 20; (* maximum number of lines that can be handled between LOCUS and ORGANISM by the buffer *) (* end module dbfilter.const *) (* begin module interact.const *) maxstring = 150; (* the maximum string *) (* end module interact.const version = 4.11; (@ of prgmod.p 1991 Apr 22 *) (* begin module filler.const *) fillermax = 50; (* the size of the filler array for a string *) (* end module filler.const version = 4.11; (@ of prgmod.p 1991 Apr 22 *) type (* begin module interact.type *) string = record (* a string of characters *) letters: array[1..maxstring] of char; (* the letters in the string *) length: integer; (* the number of characters in the string *) current: integer; (* the letter we are working on *) end; (* end module interact.type version = 4.11; (@ of prgmod.p 1991 Apr 22 *) (* begin module filler.type *) (* the following is an array used to fill a string. it is convenient to have it much shorter than the maxstring, so that it is easy to fill the string using procedure fillstring. the user must declare the value of constant fillermax. *) filler = packed array[1..fillermax] of char; (* end module filler.type version = 4.11; (@ of prgmod.p 1991 Apr 22 *) (* begin module dbfilter.type *) linebuffer=array[1..maxlines] of string; (* to store the lines between LOCUS and ORGANISM until we decide whether to copy them or not *) (* end module dbfilter.type *) var (* begin module dbfilter.var *) dbfilterp: text; (* parameter file *) (* end module dbfilter.var *) (* ************************************************************************ *) (* ************************************************************************ *) (* begin module halt *) procedure halt; (* stop the program. the procedure performs a goto to the end of the program. you must have a label: label 1; declared, and also the end of the program must have this label: 1: end. examples are in the module libraries. this is the only goto in the delila system. *) begin writeln(output,' program halt.'); goto 1 end; (* end module halt version = 4.11; (@ of prgmod.p 1991 Apr 22 *) (* ************************************************************************ *) (* ************************************************************************ *) (* begin module interact.clearstring *) procedure clearstring(var ribbon: string); (* empty the string *) var index: integer; (* to the ribbon *) begin (* clearstring *) with ribbon do begin for index := 1 to maxstring do letters[index] := ' '; length := 0; current := 0; end end; (* clearstring *) (* end module interact.clearstring version = 4.11; (@ of prgmod.p 1991 Apr 22 *) (* begin module interact.figurestring *) procedure figurestring( var line: string; (* a string of characters to figure out *) var first: integer; (* first found non-blank character in the line *) var last: integer; (* last character before a blank after first *) var whzat: char; (* what the token is *) var c: char; (* the first character of the token *) var i: integer; (* integer value of token if it is integer; or 0 *) var r: real); (* the real value if it is real; or 0.0 *) (* figurestring figures out the tokens in a string. it recognizes words, integers, reals and poorly formed numbers. you can easily use it to parse lines. our goal is to figure out what thing is on a string. start looking at the current place on the line. first and last are the first 'token' in line after start. the current place is updated to the letter after last. the thing found is described by the value of whzat: 'c': character (when the token does not begin with a digit, '+', or '-') 'i': integer 'r': real ' ': blank line 'g': garbage, cannot figure it out and the value of the thing found is the appropriate variable *) var numbers: set of '0'..'9'; sign: integer; (* sign of a number *) numberstart: integer; (* the point a number starts, beyond its sign, if any *) point: integer; (* location of decimal point *) power: integer; (* of 10 representing a place value in the number *) l: integer; (* an index for dissecting numbers *) function figureinteger(first,last:integer):integer; (* figure the integer in the token *) var i: integer; (* index *) sum, increment: integer; begin (* figureinteger *) power:=1; (* start at ones place *) sum:=0; (* start sum at zero *) for i:=last downto first do begin case line.letters[i] of '0': increment:=0; '1': increment:=1; '2': increment:=2; '3': increment:=3; '4': increment:=4; '5': increment:=5; '6': increment:=6; '7': increment:=7; '8': increment:=8; '9': increment:=9 end; sum:=sum+power*increment; power:=power*10 end; figureinteger:=sum end; (* figureinteger *) begin (* figurestring *) numbers:=['0','1','2','3','4','5','6','7','8','9']; (* c:=' '; i:=0; r:=0.0; do not affect these variables unless necessary *) point:=0; whzat := '.'; (* assume that we have someting to work on *) (* now to see if that is true: *) with line do if (length = 0) or (current < 1) or (current > length) then whzat := ' ' else begin (* figure out where the first token is in the line *) first:=line.current; while (line.letters[first]=' ') and (first < line.length) do first:=succ(first); if (first = line.length) and (line.letters[first] = ' ') then whzat := ' '; end; if whzat <> ' ' then begin last:=first; while (line.letters[last]<>' ') and (last < line.length) do last:=succ(last); if line.letters[last] = ' ' then last := pred(last); (* the token is between inclusive first and last *) c:=line.letters[first]; if (c in numbers) or (c in ['+','-']) then begin if c in ['+','-'] then begin case c of '+': sign:=+1; '-': sign:=-1; end; numberstart:=succ(first) end else begin sign:=+1; numberstart:=first end; whzat:='i'; for l:=numberstart to last do begin if not(line.letters[l] in numbers) then if line.letters[l]='.' (* we found a period *) then if whzat='i' (* if so far it is numbers *) then begin whzat:='r'; (* it is actually real *) point:=l end else whzat:='g' (* it is a second '.', ie garbage *) else whzat:='g' (* it is garbage *) end; (* if it is only numbers, it is integer *) (* build number *) (* if it ends in a period, it is integer *) if (whzat = 'r') and (point = last) then whzat:='i'; if whzat = 'i' then begin if point = last (* had an ending decimal point *) then i:=sign * figureinteger(numberstart,pred(last)) else i:=sign * figureinteger(numberstart,last); r:=i end else if whzat = 'r' then begin i:=figureinteger(numberstart,point-1); r:=sign * (i+figureinteger(point+1,last)/power); i:=sign * i end end else begin whzat:='c'; end; (* move the start to just beyond the last character of the token *) line.current:=succ(last) end end; (* figurestring *) (* end module interact.figurestring version = 4.11; (@ of prgmod.p 1991 Apr 22 *) (* begin module interact.writestring *) procedure writestring(var tofile: text; var s: string); (* write the string s to file tofile, no writeln *) var i: integer; (* index to s *) begin (* writestring *) with s do for i := 1 to length do write(tofile, letters[i]) end; (* writestring *) (* end module interact.writestring version = 4.11; (@ of prgmod.p 1991 Apr 22 *) (* begin module interact.token *) procedure token(var buffer, atoken: string; var gotten: boolean); (* get a token from the buffer *) var (* variables for calling figurestring: *) first: integer; last: integer; what: char; cha: char; int: integer; rea: real; index: integer; (* to the buffer *) begin figurestring(buffer,first,last,what,cha,int,rea); if what = ' ' then gotten := false else begin clearstring(atoken); for index := first to last do atoken.letters[index-first+1] := buffer.letters[index]; atoken.length := last - first + 1; atoken.current := 1; gotten:=true end end; (* end module interact.token version = 4.11; (@ of prgmod.p 1991 Apr 22 *) (* begin module interact.getstring *) procedure getstring(var afile: text; var buffer: string; var gotten: boolean); (* get a string from a file not using string calls. this lets one obtain lines from a file without interactive prompts *) var index: integer; (* of buffer *) begin (* getstring *) clearstring(buffer); if eof(afile) then gotten := false else begin index := 0; while (not eoln(afile)) and (index < maxstring) do begin index := succ(index); read(afile, buffer.letters[index]) end; if not eoln(afile) then begin writeln(output, ' getstring: a line exceeds maximum string size (', maxstring:1,')'); halt end; buffer.length := index; buffer.current := 1; readln(afile); gotten := true end end; (* getstring *) (* end module interact.getstring version = 4.11; (@ of prgmod.p 1991 Apr 22 *) (* begin module equalstring *) function equalstring(a, b: string): boolean; (* test for equality between two strings at current positions *) var index: integer; (* index to both strings *) equal: boolean; (* are letters in a and b the same? *) begin (* equalstring *) if a.length = b.length then begin index := 1; repeat equal := (a.letters[index] = b.letters[index]); index := succ(index) until (not equal) or (index > a.length); equalstring := equal end else equalstring := false end; (* equalstring *) (* end module equalstring version = 4.11; (@ of prgmod.p 1991 Apr 22 *) (* begin module copystring *) procedure copystring(a: string; var b: string); (* copy string a to b *) var l: integer; (* index to the string *) begin b.length := a.length; for l := 1 to a.length do b.letters[l] := a.letters[l] end; (* end module copystring version = 4.11; (@ of prgmod.p 1991 Apr 22 *) (* begin module filler.fillstring *) procedure fillstring(var s: string; a: filler); (* this procedure makes it reasonably easy to fill the string s with characters. one calls the procedure as: *) (* 1 2 3 4 5 *) (* 12345678901234567890123456789012345678901234567890 *) (* fillstring(s, 'this-is-the-string '); the two comments make it easy to line the characters up. also, for this example, it was assumed that the length of filler as defined by the constant fillermax was 50. *) var length: integer; (* of the string without trailing blanks *) index: integer; (* of s *) begin (* fillstring *) clearstring(s); length := fillermax; while (length > 1) and (a[length] = ' ') do length := pred(length); if (length = 1) and (a[length] = ' ') then begin writeln(output, 'fillstring: the string is empty'); halt end; for index := 1 to length do s.letters[index] := a[index]; s.length := length; s.current := 1 end; (* fillstring *) (* end module filler.fillstring version = 4.11; (@ of prgmod.p 1991 Apr 22 *) (* ************************************************************************ *) (* ************************************************************************ *) (* begin module dbfilter.writeparameters *) procedure writeparameters(var tofile:text; genus, species: string ); (* write the paramters to file tofile *) begin write(tofile,'* searching for organism '); writestring(tofile,genus); write(tofile,' '); writestring(tofile,species); writeln(tofile); end; (* end module dbfilter.writeparameters *) (* begin module dbfilter.readparameters *) procedure readparameters(var dbfilterp: text; var genus, species : string ); (* read the parameters from dbfilterp *) var gotten: boolean; (* was the string gotten from the file? *) buffer: string; (* a line from dbfilterp *) begin (* get the feature *) reset(dbfilterp); getstring(dbfilterp,buffer,gotten); if not gotten then begin writeln(output,'empty dbfilterp'); halt end; (* extract the genus from the line *) token(buffer, genus, gotten); if not gotten then begin writeln(output,'missing first parameter'); halt end; (* extract the species from the line *) token(buffer, species, gotten); if not gotten then begin writeln(output,'missing second parameter'); halt end; end; (* end module dbfilter.readparameters *) (* ************************************************************************ *) (* ************************************************************************ *) (* begin module dbfilter.themain *) procedure themain(var fin, fout, dbfilterp: text); (* the main procedure of the program *) var atoken:string; (* first token of a given string *) buffer:string; (* line of the fin file *) endentry:string; (* trigger string for fin entry ends *) foundgenus, (* the string to be compared with genus *) foundspecies:string; (* the string to be compared with species *) genus:string; (* first token in the organism name *) gotten:boolean; (* was a line really there? *) index:integer; (* index for storage array *) locus:string; (* trigger string for LOCUS name *) newtoken:string; (* the token to compare for the end of the entry *) organism:string; (* trigger string for ORGANISM *) species:string; (* second token in the organism name *) storage:linebuffer; (* temporary buffer for text *) tokenfound:boolean; (* was a token gotten? *) x:integer; (* loop control variable for index *) begin writeln(output,'dbfilter ',version:4:2); reset(fin); rewrite(fout); (* fill the strings to be used as identifiers *) (* 1 2 3 4 5 *) (* 12345678901234567890123456789012345678901234567890 *) fillstring(locus, 'LOCUS '); fillstring(organism,'ORGANISM '); fillstring(endentry,'// '); (* read the parameters from the dbfilterp file *) readparameters(dbfilterp, genus, species); writeparameters(fout,genus,species); while not eof(fin) do begin getstring(fin,buffer,gotten); if gotten then begin token(buffer,atoken,tokenfound); if equalstring(atoken,locus) then begin index := 1; copystring(buffer,storage[index]); writestring(fout,buffer); writeln(fout); while not equalstring(atoken,organism) do begin getstring(fin,buffer,gotten); index := index + 1; if index > maxlines then begin writeln(fout,'buffer capacity exceeded', ' increase constant maxlines'); halt end; copystring(buffer,storage[index]); token(buffer,atoken,tokenfound); if tokenfound and equalstring(atoken,organism) then begin token(buffer,foundgenus,tokenfound); if tokenfound then begin token(buffer,foundspecies,tokenfound); if tokenfound then begin writestring(fout,genus); write(fout,' '); writestring(fout,species); writeln(fout); if equalstring(genus,foundgenus) and equalstring(species,foundspecies) then begin for x:= 1 to index do begin writestring(fout,storage[x]); writeln(fout) end; repeat getstring(fin,buffer,gotten); if not gotten then begin writeln(fout,'incomplete entry'); halt end; writestring(fout,buffer); writeln(fout); token(buffer,newtoken,tokenfound); if not tokenfound then clearstring(newtoken) until equalstring(newtoken,endentry) end; end; end; end; end; end; end; end; end; (* end module dbfilter.themain *) begin themain(input, output, dbfilterp); 1: end.