program mkdb(sequ, mkdbp, entries, output); (* mkdb: read sequence; make GenBank entry with features for capitalized regions Dr. Thomas D. Schneider National Institutes of Health National Cancer Institute Center for Cancer Research Nanobiology Program Molecular Information Theory Group Frederick, Maryland 21702-1201 schneidt@mail.nih.gov permanent email: toms@alum.mit.edu http://www.ccrnp.ncifcrf.gov/~toms/ *) label 1; (* end of program *) const (* begin module version *) version = 1.26; (* of mkdb.p 2009 Apr 08 2009 Apr 08, 1.26: supply version 2008 Jun 25, 1.25: put topology into the entries 2007 Apr 02, 1.24: increase maximum sequence length 2007 Mar 01, 1.23: options for multipart names 2007 Mar 01, 1.22: make locus name not be multi part 2005 Sep 16, 1.21: upgrade documentation and brush up code 2002 Apr 9, 1.20: sequence length incremented, brush up documentation 2000 Nov 7, 1.19: primer bound corrected 2000 Jun 15, 1.18: upgrade to take completely raw sequence 1999 Sep 23, 1.17: Format spacing corrected: more space after id words 1999 Jun 20, 1.16: allow reading fasta format 1997 Nov 13: : allow spaces, blank lines comments and ignore numbers 1997 Oct 21: : origin *) updateversion = 1.09; (* defines lowest acceptable current parameter file *) (* end module version *) (* begin module describe.mkdb *) (* name mkdb: read sequence; make GenBank entry with features for capitalized regions synopsis mkdb(sequ: in, mkdbp: in, entries: out, output: out) files sequ: raw DNA sequences in lower case except for objects of interest marked in upper case. The program also accepts 'n'. Sequences are separated by periods. Each sequence may be preceeded by a name line and a species line. These lines can begin with '>' or '*', but this is not necessary. Spaces, blank lines and numbers are ignored. Other lines that begin with '>' or '*' are comments If there is no species name, the single name will be used for the species also. This kludge allows the program to read the fasta format. If there is no name at all (just sequence) then a name will be assigned: nameste. name: a string of characters to name the sequence. organism: the species this sequence represents entries: GenBank entries for the sequences, with features for the capitalized regions marked as exons and features for the lower case regions (not including primers) as introns. mkdbp: parameters to control the program. The file must contain the following parameters, one per line: parameterversion: The version number of the program. This allows the user to be warned if an old parameter file is used. exoncutoff (integer): Capitalized regions longer than this number of bases will be called exons, the others will be called primers. multipart (character): What to do if a name has spaces in it. 'i' ignore the rest of the name 'u' replace spaces with underscores output: messages to the user description Sequences are often marked by people with capital letters to indicate interesting regions (exons, primers, mutations, etc). This program uses raw sequences to create simple flat-file GenBank style entries with features marked by capital letters. Long features are called 'exons' while short ones are called 'primers'. The division between these two is given by the exoncutoff parameter. Example If the sequ contains: * T7stuff * Bacteriophage T7 aacataaaggacacaatgcaatgaacattaccgacatcatgaacgctatc gacgcaatcaaagcactgccaatctgtgaacttgacaagcgtcaaggtat gcttatcgacttactggtcgagatggtcaacagcgagacgtgtgatggcg agctaacCGAACTAAATCAGGCACttgagcatcaagattggtggactacc ttgaagtgtctcacggctgacgcagggttcaagATGCTCGGTAATGGTCA CTTCTCGGCTGCTTATAGTCACCCGCTGCTACCTAACAGAGTGATTAAGG TGGGCTTTAAGAAAGAGGATTCAGGCGCAGCCTATACCGCATTCTgccgc atgtatcagggtcgtcctggtatccctaacgtctacgatgtacagcgcca cgctggatgctatacggtggtacttgacgcacttaaggattgcgagcgtt tcaacaatgatgccCATTATAAATACGCTGAgattgcaagcgacatcatt gattgcaattcggatgagcatgatgagttaactggatgggatggtgagtt tgttgaaacttgtaaactaatccgcaagttctttgagggcatcgcctcat . The entries file will contain: LOCUS T7stuff 600 bp DNA * mkdb 1.21 DEFINITION Bacteriophage T7 ACCESSION T7stuff VERSION T7stuff.1 SOURCE Bacteriophage T7 ORGANISM Bacteriophage T7 FEATURES primer 158..174 exon 234..345 primer 465..481 BASE COUNT 166 a 133 c 151 g 150 t 0 n ORIGIN 1 aacataaagg acacaatgca atgaacatta ccgacatcat gaacgctatc gacgcaatca 61 aagcactgcc aatctgtgaa cttgacaagc gtcaaggtat gcttatcgac ttactggtcg 121 agatggtcaa cagcgagacg tgtgatggcg agctaaccga actaaatcag gcacttgagc 181 atcaagattg gtggactacc ttgaagtgtc tcacggctga cgcagggttc aagatgctcg 241 gtaatggtca cttctcggct gcttatagtc acccgctgct acctaacaga gtgattaagg 301 tgggctttaa gaaagaggat tcaggcgcag cctataccgc attctgccgc atgtatcagg 361 gtcgtcctgg tatccctaac gtctacgatg tacagcgcca cgctggatgc tatacggtgg 421 tacttgacgc acttaaggat tgcgagcgtt tcaacaatga tgcccattat aaatacgctg 481 agattgcaag cgacatcatt gattgcaatt cggatgagca tgatgagtta actggatggg 541 atggtgagtt tgttgaaact tgtaaactaa tccgcaagtt ctttgagggc atcgcctcat // documentation see also {example parameter file:} mkdbp {example sequ file:} mkdb.sequ {move to the name 'sequ' to use it} {Program for listing the sequences:} lister.p {Program for generating search for capitalized sequence:} capsmark.p author Thomas Dana Schneider bugs technical notes Capitalization that abuts either end of the sequence will be indicated in the entry as beyond the end. This way the ends of the sequence will not be marked as donors or acceptors. The maximum name and sequence lengths are constants maxobjectlength and maxsequencelength respectively. *) (* end module describe.mkdb *) (* begin module describe.const *) maxnamelength = 100; (* maximum length name *) maxsequencelength = 6000000 ; (* maximum sequence length *) (* 253256583 human chromosome 2 length *) (* Set the length to the maximum your computer can handle *) debugging = false; (* set true to get debugging output *) (* end module describe.const *) var sequ, (* file used by this program *) mkdbp, (* file used by this program *) entries: text; (* file used by this program *) (* begin module halt *) procedure halt; (* stop the program. the procedure performs a goto to the end of the program. you must have a label: label 1; declared, and also the end of the program must have this label: 1: end. examples are in the module libraries. this is the only goto in the delila system. *) begin writeln(output,' program halt.'); goto 1 end; (* end module halt version = 'delmod 6.16 84 mar 12 tds/gds'; *) (* begin module capitalize *) function capitalize(c: char): char; (* convert the character c to upper case *) var n: integer; (* c is the n'th letter of the alphabet *) begin n := ord(c); if (n >= ord('a')) and (n <= ord('z')) then c := chr( n - ord('a') + ord('A')); capitalize := c end; (* end module capitalize prgmod *) (* begin module decapitalize *) function decapitalize(c: char): char; (* convert the character c to lower case *) var n: integer; (* c is the n'th letter of the alphabet *) begin n := ord(c); if (n >= ord('A')) and (n <= ord('Z')) then c := chr( n - ord('A') + ord('a')) else c := chr(n); decapitalize := c end; (* end module decapitalize prgmod *) (* begin module mkdb.themain *) procedure themain(var sequ, mkdbp, entries: text); (* the main procedure of the program *) type object = record id: array[1..maxnamelength] of char; (* a string object *) length: integer; (* length of the object *) end; sequence = record id: array[1..maxsequencelength] of char; (* a sequence *) length: integer; (* length of the sequence *) end; var c: char; (* a character in the sequence *) die: boolean; (* stop the program if bad sequence characters found *) entrylength: integer; (* length of the sequence *) exoncutoff: integer; (* shortest feature to report as an exon *) multipart: char; (* what to do for multipart names *) name: object; (* name of the sequence *) na, nc, ng, nt, nn: integer; (* numbers of bases and unknown base *) organism: object; (* organism for the sequence *) parameterversion: real; (* parameter version number *) seq: sequence; (* the sequence *) start: integer; (* position in the sequence, start of feature *) stop: integer; (* position in the sequence, end of feature *) topology: char; (* topology of the sequence *) waslower: boolean; (* true if previous character was lower case *) procedure writename(var afile: text; name: object); (* write the name to the file. 2007 Mar 01: Force any spaces to be underscores. *) var n: integer; (* index to the name *) skip: boolean; (* skip the rest of the line *) begin { for n := 1 to name.length do write(afile, name.id[n]); } skip := false; {writeln(output,'multipart = ',multipart);} for n := 1 to name.length do begin if not skip then begin if name.id[n] = ' ' then begin if multipart = 'i' then skip := true else write(afile, '_') end else write(afile, name.id[n]); end; end; { display to output for debugging: for n := 1 to name.length do begin if name.id[n] = ' ' then write(output, '_') else write(output, name.id[n]); end; writeln(output); } end; procedure readname(var afile: text; var name: object); (* write the name to the file *) begin (* read the name in *) if (afile^ = '>') or (afile^ = '*') then begin get(afile); while (afile^ = ' ') and not eoln(afile) do get(afile); if eoln(afile) then begin writeln(output,'missing name'); halt end; name.length := 0; while not eoln(afile) do begin name.length := name.length + 1; if name.length > maxnamelength then begin writeln(output,'name too long:'); write(output,'"'); writename(output, name); writeln(output,'"'); halt; end; read(afile, name.id[name.length]); end; readln(afile); write(output,'READING: '); writename(output,name); writeln(output); end else begin write(output,'Assigned name: '); with name do begin id[1] := 'n'; id[2] := 'a'; id[3] := 'm'; id[4] := 'e'; id[5] := 's'; id[6] := 't'; id[7] := 'e'; length := 7; end; write(output,'"'); writename(output, name); writeln(output,'"'); { writeln(output,'names must begin with > or *'); writeln(output,'character seen is: "',afile^,'"'); halt } end; end; procedure indent(var afile: text; amount: integer); (* put some indentation to the file *) var n: integer; (* index to the name *) begin for n := 1 to amount do write(afile, ' '); end; procedure featurecoordinate(var afile: text; start, stop: integer; entrylength: integer); (* write the feature coordinates start and stop to a file. If they are at or beyond the entry size, put < or > *) begin if start < 1 then write(afile,'<'); write(afile,start:1); write(afile,'..'); if stop > entrylength then write(afile,'>'); write(afile,stop:1); end; procedure countem(b: char); (* count the base b *) begin case decapitalize(b) of 'a': na := succ(na); 'c': nc := succ(nc); 'g': ng := succ(ng); 't': nt := succ(nt); 'n': nn := succ(nn); end; end; { procedure skipcomments; (* skip comment lines *) begin if not eof(sequ) then if (sequ^ = '>') or (sequ^ = '*') then readln(sequ); end; } procedure chooseandwrite; (* choose what kind of feature to write out *) begin if stop-start+1 > exoncutoff then begin indent(entries, 5); write(entries,'exon '); featurecoordinate(entries,start,stop,entrylength); writeln(entries); end else begin (* rejected features *) indent(entries, 5); write(entries,'primer '); featurecoordinate(entries,start,stop,entrylength); writeln(entries); end end; procedure processsequence; (* process one sequence ending in a period or end of file *) var position: integer; (* position in the sequence, starting with 1 *) (* loop variable - must be local *) begin (* read the next sequence *) position := 0; die := false; c := ' '; na := 0; nc := 0; ng := 0; nt := 0; nn := 0; { skipcomments; (* skip comments just after header *) } while not eof(sequ) and (c <> '.') do begin if eoln(sequ) (* end of line *) then begin readln(sequ); (* allow sequence not to end with a period *) if not eof(sequ) then if (sequ^ = '*') or (sequ^ = '>') then c := '.'; if debugging then writeln(output); { skipcomments; } end else begin (* not end of line *) read(sequ, c); if debugging then write(output,c); if decapitalize(c) in ['a','c','g','t', 'n'] then begin position := position + 1; if position > maxsequencelength then begin writeln(output, 'Sequence exceeds the maximum sequence length: ', maxsequencelength:1); writeln(output, 'Increase constant maxsequencelength'); halt; end; seq.id[position] := c; countem(c); end else if c = '.' then begin readln(sequ); end else if not(c in ['0','1','2','3','4','5','6','7','8','9',' ']) then begin writeln(output,'unidentified character: ', c); die := true; end end end; entrylength := position; if die then halt; (* start header of GenBank entry *) writeln(entries); write(entries,'LOCUS '); writename(entries, name); write(entries, ' ',entrylength:1, ' bp DNA'); (* 2008 Jun 25 *) if topology = 'l' then write(entries, ' linear') else write(entries, ' circular'); writeln(entries,' * mkdb ',version:4:2); write(entries,'DEFINITION '); writename(entries, organism); writeln(entries); write(entries,'ACCESSION '); writename(entries, name); writeln(entries); write(entries,'VERSION '); writename(entries, name); writeln(entries,'.1'); write(entries,'SOURCE '); writename(entries, organism); writeln(entries); write(entries,' ORGANISM '); writename(entries, organism); writeln(entries); writeln(entries,'FEATURES'); waslower := true; start := 1; stop := 1; for position := 1 to entrylength do begin c := seq.id[position]; if c in ['a','c','g','t', 'n'] then begin if (position <> 1) and (not waslower) then begin (* upper case ending *) stop := position - 1; chooseandwrite; end; waslower := true; end else if c in ['A','C','G','T','N'] then begin if waslower then begin (* begin upper case *) start := position; end; waslower := false; end else begin writeln(output,'program error:'); writeln(output,'illegal character "',c,'" found in sequence array'); halt end end; if not waslower (* ie was upper *) then begin (* After the end of sequence is found the sequence can end on capital letters, catch that case. *) stop := entrylength; chooseandwrite; end; writeln(entries,'BASE COUNT', ' ',na:7, ' a', ' ',nc:7, ' c', ' ',ng:7, ' g', ' ',nt:7, ' t', ' ',nn:7, ' n' ); if na+nc+ng+nt+nn <> entrylength then begin writeln(output,'program error: BASE COUNT <> sum of bases'); halt; end; write(entries,'ORIGIN'); (* note: ln is done as first step below *) (* write out sequence *) for position := 1 to entrylength do begin if (position mod 10) = 1 then begin write(entries,' '); end; if ((position-1) mod 60) = 0 then begin writeln(entries); write(entries, position:10,' '); end; write(entries, decapitalize(seq.id[position])); end; writeln(entries); writeln(entries,'//'); end; begin writeln(output,'mkdb ',version:4:2); reset(mkdbp); readln(mkdbp, parameterversion); if round(100*parameterversion) < round(100*updateversion) then begin writeln(output, 'You have an old parameter file!'); halt end; (* read other parameters *) readln(mkdbp, exoncutoff); if not eof(mkdbp) then readln(mkdbp, multipart) else multipart := 'i'; topology := 'l'; (* assume linear topology for now *) rewrite(entries); reset(sequ); while not eof(sequ) do begin { if (sequ^ = '>') or (sequ^='*') then readname(sequ, name); } readname(sequ, name); if (sequ^ = '>') or (sequ^='*') then readname(sequ, organism) else organism := name; processsequence; end; end; (* end module mkdb.themain *) begin themain(sequ, mkdbp, entries); 1: end.