program clual(clustalout, clualp, protseq, output); (* clual: clustal to alpro conversion Dr. Thomas D. Schneider National Cancer Institute Laboratory of Experimental and Computational Biology Frederick, Maryland 21702-1201 toms@ncifcrf.gov permanent email: toms@alum.mit.edu http://www.lecb.ncifcrf.gov/~toms/ National Cancer Institute Laboratory of Experimental and Computational Biology *) label 1; (* end of program *) const (* begin module version *) version = 1.08; (* of clual.p 2002 Feb 19 2002 Feb 19, 1.00: origin *) updateversion = 1.00; (* defines lowest acceptable current parameter file *) (* end module version *) (* begin module describe.clual *) (* name clual: clustal to alpro conversion synopsis clual(clustalout: in, clualp: in, protseq: out, output: out) files clustalout: output of the CLUSTAL program protseq: input to the alpro program clualp: parameters to control the program. The file must contain the following parameters, one per line: parameterversion: The version number of the program. This allows the user to be warned if an old parameter file is used. The second line of clualp must match the first line of the clustalout file. This is used to check that the clustalout file is correct. verbose (character): If the first character of the third line is a 'v' then the program will name the segment numbers as it reads it in, and then give the name of each sequence as it is written out. output: messages to the user description Convert from CLUSTAL format to allow one to present COG output as a sequence logo. The CLUSTAL format is broken up into segments. Alpro requires continuous sequences. This program rearranges the CLUSTAL data to the form alpro needs. examples The first line of a clustal file looks like this: CLUSTAL W (1.74) multiple sequence alignment This is used to check that the input is good. documentation see also {example parameter file:} clualp {program that uses the output of this program:} alpro.p {program that finally generates the sequence logo:} makelogo.p {COG:} http://www.ncbi.nlm.nih.gov/COG/ {COG:} ftp://ncbi.nlm.nih.gov/pub/COG/ {an example alignment:} http://www.ncbi.nlm.nih.gov/COG/aln/COG0526.aln {the entire list of alignments, ready to grab:} ftp://ncbi.nlm.nih.gov/pub/COG/aln/ {wget can be used to grab the alignments:} http://www.lecb.ncifcrf.gov/~toms/wget.html {Why one should not use consensus sequences:} http://www.lecb.ncifcrf.gov/~toms/glossary.html#consensus_sequence author Thomas Dana Schneider bugs technical notes The clustal format has waste spaces. If "_" represents a space, then we have at the boundary of two segments: YPR082c_________------------------------------------------------------------ ____________________________________________________________________________ _ BS_resA_________------------------ This program ignores the spaces, but one wonders why they are there ... AH!!! These contain STUPID consensus sequences!!! The program will ignore this idiotic data line. *) (* end module describe.clual *) (* begin module string.const *) maxstring = 2000; (* the maximum string *) (* end module string.const version = 4.54; (@ of prgmod.p 2001 Aug 29 *) (* begin module filler.const *) fillermax = 50; (* the size of the filler array for a string *) (* end module filler.const version = 4.54; (@ of prgmod.p 2001 Aug 29 *) type (* begin module string.type *) stringptr = ^string; (* pointer to a string *) string = record (* a string of characters *) letters: array[1..maxstring] of char; (* the letters in the string *) length: integer; (* the number of characters in the string *) current: integer; (* the letter we are working on *) next: stringptr; (* the next string in a series *) end; (* end module string.type version = 4.54; (@ of prgmod.p 2001 Aug 29 *) (* begin module filler.type *) (* the following is an array used to fill a string. it is convenient to have it much shorter than the maxstring, so that it is easy to fill the string using procedure fillstring. the user must declare the value of constant fillermax. *) filler = packed array[1..fillermax] of char; (* end module filler.type version = 4.54; (@ of prgmod.p 2001 Aug 29 *) var clustalout, (* file used by this program *) clualp, (* file used by this program *) protseq: text; (* file used by this program *) (* begin module halt *) procedure halt; (* stop the program. the procedure performs a goto to the end of the program. you must have a label: label 1; declared, and also the end of the program must have this label: 1: end. examples are in the module libraries. this is the only goto in the delila system. *) begin writeln(output,' program halt.'); goto 1 end; (* end module halt version = 4.54; (@ of prgmod.p 2001 Aug 29 *) (* begin module clearstring *) procedure clearstring(var ribbon: string); (* empty the string *) var index: integer; (* to the ribbon *) begin (* clearstring *) with ribbon do begin for index := 1 to maxstring do letters[index] := ' '; length := 0; current := 0; end end; (* clearstring *) procedure initializestring(var ribbon: string); (* start the string with a nil pointer. This routine should be called before doing linked list work. This allows the standard string routines to clear the string without killing the pointer. *) begin (* initializestring *) clearstring(ribbon); ribbon.next := nil; end; (* initializestring *) (* end module clearstring version = 4.54; (@ of prgmod.p 2001 Aug 29 *) (* begin module interact.getstring *) procedure getstring(var afile: text; var buffer: string; var gotten: boolean); (* get a line (as a string) from a file not using string calls. this lets one obtain lines from a file without interactive prompts *) var index: integer; (* of buffer *) begin (* getstring *) clearstring(buffer); if eof(afile) then gotten := false else begin index := 0; while (not eoln(afile)) and (index < maxstring) do begin index := succ(index); read(afile, buffer.letters[index]) end; if not eoln(afile) then begin writeln(output, ' getstring: a line exceeds maximum string size (', maxstring:1,')'); halt end; buffer.length := index; buffer.current := 1; readln(afile); gotten := true end end; (* getstring *) (* end module interact.getstring version = 4.54; (@ of prgmod.p 2001 Aug 29 *) (* begin module interact.figurestring *) procedure figurestring( var line: string; (* a string of characters to figure out *) var first: integer; (* first found non-blank character in the line *) var last: integer; (* last character before a blank after first *) var whzat: char; (* what the token is *) var c: char; (* the first character of the token *) var i: integer; (* integer value of token if it is integer; or 0 *) var r: real); (* the real value if it is real; or 0.0 *) (* figurestring figures out the tokens in a string. it recognizes words, integers, reals and poorly formed numbers. you can easily use it to parse lines. our goal is to figure out what thing is on a string. start looking at the current place on the line. first and last are the first 'token' in line after start. the current place is updated to the letter after last. the thing found is described by the value of whzat: 'c': character (when the token does not begin with a digit, '+', or '-') 'i': integer 'r': real ' ': blank line 'g': garbage, cannot figure it out and the value of the thing found is the appropriate variable *) var numbers: set of '0'..'9'; sign: integer; (* sign of a number *) numberstart: integer; (* the point a number starts, beyond its sign, if any *) point: integer; (* location of decimal point *) power: integer; (* of 10 representing a place value in the number *) l: integer; (* an index for dissecting numbers *) function figureinteger(first,last:integer):integer; (* figure the integer in the token *) var i: integer; (* index *) sum, increment: integer; begin (* figureinteger *) power:=1; (* start at ones place *) sum:=0; (* start sum at zero *) for i:=last downto first do begin case line.letters[i] of '0': increment:=0; '1': increment:=1; '2': increment:=2; '3': increment:=3; '4': increment:=4; '5': increment:=5; '6': increment:=6; '7': increment:=7; '8': increment:=8; '9': increment:=9 end; sum:=sum+power*increment; power:=power*10 end; figureinteger:=sum end; (* figureinteger *) begin (* figurestring *) numbers:=['0','1','2','3','4','5','6','7','8','9']; (* c:=' '; i:=0; r:=0.0; do not affect these variables unless necessary *) point:=0; whzat := '.'; (* assume that we have someting to work on *) (* now to see if that is true: *) with line do if (length = 0) or (current < 1) or (current > length) then whzat := ' ' else begin (* figure out where the first token is in the line *) first:=line.current; while (line.letters[first]=' ') and (first < line.length) do first:=succ(first); if (first = line.length) and (line.letters[first] = ' ') then whzat := ' '; end; if whzat <> ' ' then begin last:=first; while (line.letters[last]<>' ') and (last < line.length) do last:=succ(last); if line.letters[last] = ' ' then last := pred(last); (* the token is between inclusive first and last *) c:=line.letters[first]; if (c in numbers) or (c in ['+','-']) then begin if c in ['+','-'] then begin case c of '+': sign:=+1; '-': sign:=-1; end; numberstart:=succ(first) end else begin sign:=+1; numberstart:=first end; whzat:='i'; for l:=numberstart to last do begin if not(line.letters[l] in numbers) then if line.letters[l]='.' (* we found a period *) then if whzat='i' (* if so far it is numbers *) then begin whzat:='r'; (* it is actually real *) point:=l end else whzat:='g' (* it is a second '.', ie garbage *) else whzat:='g' (* it is garbage *) end; (* if it is only numbers, it is integer *) (* build number *) (* if it ends in a period, it is integer *) if (whzat = 'r') and (point = last) then whzat:='i'; if whzat = 'i' then begin if point = last (* had an ending decimal point *) then i:=sign * figureinteger(numberstart,pred(last)) else i:=sign * figureinteger(numberstart,last); r:=i end else if whzat = 'r' then begin i:=figureinteger(numberstart,point-1); r:=sign * (i+figureinteger(point+1,last)/power); i:=sign * i end end else begin whzat:='c'; end; (* move the start to just beyond the last character of the token *) line.current:=succ(last) end end; (* figurestring *) (* end module interact.figurestring version = 4.54; (@ of prgmod.p 2001 Aug 29 *) (* begin module interact.token *) procedure token(var buffer, atoken: string; var gotten: boolean); (* get a token from the buffer *) var (* variables for calling figurestring: *) first: integer; last: integer; what: char; cha: char; int: integer; rea: real; index: integer; (* to the buffer *) begin figurestring(buffer,first,last,what,cha,int,rea); if what = ' ' then gotten := false else begin clearstring(atoken); for index := first to last do atoken.letters[index-first+1] := buffer.letters[index]; atoken.length := last - first + 1; atoken.current := 1; gotten:=true end end; (* end module interact.token version = 4.54; (@ of prgmod.p 2001 Aug 29 *) (* begin module writestring *) procedure writestring(var tofile: text; var s: string); (* write the string s to file tofile, no writeln *) var i: integer; (* index to s *) begin (* writestring *) with s do for i := 1 to length do write(tofile, letters[i]) end; (* writestring *) (* end module writestring version = 4.54; (@ of prgmod.p 2001 Aug 29 *) (* begin module filler.fillstring *) procedure fillstring(var s: string; a: filler); (* this procedure makes it reasonably easy to fill the string s with characters. one calls the procedure as: *) (* 1 2 3 4 5 *) (* 12345678901234567890123456789012345678901234567890 *) (* fillstring(s, 'this-is-the-string '); the two comments make it easy to line the characters up. also, for this example, it was assumed that the length of filler as defined by the constant fillermax was 50. *) var length: integer; (* of the string without trailing blanks *) index: integer; (* of s *) begin (* fillstring *) clearstring(s); length := fillermax; while (length > 1) and (a[length] = ' ') do length := pred(length); if (length = 1) and (a[length] = ' ') then begin writeln(output, 'fillstring: the string is empty'); halt end; for index := 1 to length do s.letters[index] := a[index]; s.length := length; s.current := 1 end; (* fillstring *) (* end module filler.fillstring version = 4.54; (@ of prgmod.p 2001 Aug 29 *) (* begin module equalstring *) function equalstring(a, b: string): boolean; (* Test for equality between two strings at current positions. NOTE: A compiler bug results if one directly tests this way: if thedefinition^.nametag = aname The reason is completely not clear! I showed that the parts of the strings were identical, but the whole was not by this test. For this reason it is *critical to test strings with equalstring. *) var index: integer; (* index to both strings *) equal: boolean; (* are letters in a and b the same? *) begin (* equalstring *) if a.length = b.length then begin index := 1; repeat equal := (a.letters[index] = b.letters[index]); index := succ(index) until (not equal) or (index > a.length); equalstring := equal end else equalstring := false end; (* equalstring *) (* end module equalstring version = 4.54; (@ of prgmod.p 2001 Aug 29 *) (* begin module copystring *) procedure copystring(a: string; var b: string); (* copy string a to b *) var l: integer; (* index to the string *) begin b.length := a.length; for l := 1 to a.length do b.letters[l] := a.letters[l] end; (* end module copystring version = 4.54; (@ of prgmod.p 2001 Aug 29 *) (* begin module skipblanks *) procedure skipblanks(var thefile: text); (* skip over blanks until a non-blank, or end of line, is found *) begin while (thefile^ = ' ') and not eoln(thefile) do get(thefile); end; procedure skipnonblanks(var thefile: text); (* skip over nonblanks until a blank, or end of line, is found *) begin while (thefile^ <> ' ') and not eoln(thefile) do get(thefile); end; procedure skipcolumn(var thefile: text); (* skip over a data column *) begin skipblanks(thefile); skipnonblanks(thefile) end; (* end module skipblanks version = 4.54; (@ of prgmod.p 2001 Aug 29 *) (* begin module clual.themain *) procedure themain(var clustalout, clualp, protseq: text); (* the main procedure of the program *) type proteinptr = ^protein; (* pointer to a protein string *) protein = record (* a protein string *) name: string; (* the name of the protein *) data: stringptr; (* the sequence of the protein *) lastdata: stringptr; (* the end of the data string *) next: ^protein; (* the next protein *) end; var aline: string; (* a line of data from clustalout *) clustalid: string; (* the first line of a clustal file *) d: stringptr; (* pointer to a data string in proteins *) gotten: boolean; (* a line was gotten from clustalout? *) newsegment: boolean; (* we are now between or at the star of a segment *) parameterversion: real; (* parameter version number *) p: proteinptr; (* pointer to proteins *) proteins: proteinptr; (* the collection of aligned protein sequences, arranged so that they can be output continuously *) protcount: integer; (* number of proteins *) segment: integer; (* the current segment of the alignment *) thedata: string; (* the data on the data line *) thename: string; (* the name of the data line *) verbose: char; (* if 'v' then report segments and names to output *) begin writeln(output,'clual ',version:4:2); reset(clualp); readln(clualp, parameterversion); if round(100*parameterversion) < round(100*updateversion) then begin writeln(output, 'You have an old parameter file!'); halt end; rewrite(protseq); writeln(protseq,'> clual ',version:4:2); reset(clustalout); (* check that the first line of the file is a clustal file *) getstring(clualp, clustalid, gotten); if not gotten then begin writeln(output, 'The second line of clualp must match the first line'); writeln(output, 'of the clustalout file.'); writeln(output, 'This is used to check that', ' the clustalout file is correct.'); halt end; readln(clualp, verbose); getstring(clustalout, aline, gotten); if not gotten then begin writeln(output, 'clustalout is empty?'); halt end; if not equalstring(aline, clustalid) then begin writeln(output, 'clustalout is not a correct file'); writeln(output, 'The first line MUST be:'); writestring(output, clustalid); writeln(output); writeln(output, 'but this was found instead:'); writestring(output, aline); writeln(output); halt end; proteins := nil; (* we have not started yet *) p := nil; (* we have not started yet *) protcount := 0; segment := 0; newsegment := false; while not eof(clustalout) do begin (* skip consensus lines *) if not eoln(clustalout) then begin (* read to the end of the line *) if clustalout^ = ' ' then while not eoln(clustalout) do get(clustalout) end; if eoln(clustalout) then begin (* skip blank material between segments *) readln(clustalout); newsegment := true; p := nil; (* we are not at any protein at the moment *) end else begin (* absorb a segment *) if newsegment then begin segment := segment + 1; newsegment := false; if verbose = 'v' then writeln(output,'segment ', segment:1); if p <> nil then begin writeln(output,'segment ',segment:1,' is too short!'); halt end; end; if proteins = nil then begin new(proteins); (* start the list *) with proteins^ do begin clearstring(name); new(data); lastdata := nil; next := nil; end; p := proteins; if segment = 1 then protcount := 1; end else begin if p = nil then begin (* start working in this segment *) p := proteins; end else begin if p^.next = nil then begin (* start the next protein *) new(p^.next); with p^.next^ do begin clearstring(name); new(data); lastdata := nil; next := nil; end; end; p := p^.next; if segment = 1 then protcount := protcount + 1; end; end; (* read one data line *) getstring(clustalout, aline, gotten); if not gotten then begin writeln(output, 'clual: could not read line correctly?'); writeln(output, 'in segment ',segment:1,' the line is:'); writestring(output, aline); writeln(output); halt end; clearstring(thename); token(aline, thename, gotten); if not gotten then begin writeln(output, 'clual: could not read name correctly?'); writeln(output, 'in segment ',segment:1); writeln(output, 'the line read is:'); writestring(output, aline); writeln(output); writeln(output, 'the PREVIOUS name read is:'); writestring(output, thename); writeln(output); writeln(output, 'the PREVIOUS data read is:'); writestring(output, thedata); writeln(output); halt end; token(aline, thedata, gotten); if not gotten then begin writeln(output, 'clual: could not read data correctly?'); writeln(output, 'in segment ',segment:1); writeln(output, 'the line read is:'); writestring(output, aline); writeln(output); writeln(output, 'the PREVIOUS name read is:'); writestring(output, thename); writeln(output); writeln(output, 'the PREVIOUS data read is:'); writestring(output, thedata); writeln(output); halt end; if p^.name.length = 0 then begin (* set up the name the first time *) copystring(thename, p^.name); new(p^.data); clearstring(p^.data^); p^.lastdata := p^.data; copystring(thedata, p^.data^); p^.lastdata^.next := nil; end else begin (* fill in the next data segment *) if not equalstring(p^.name, thename) then begin write(output,'name "'); writestring(output, p^.name); writeln(output,'"'); writeln(output,'DOES NOT MATCH PREVIOUS NAME'); write(output,'name "'); writestring(output, thename); writeln(output,'"'); halt end; new(p^.lastdata^.next); p^.lastdata := p^.lastdata^.next; clearstring(p^.lastdata^); copystring(thedata, p^.lastdata^); p^.lastdata^.next := nil; end; end end; if verbose <> 'v' then begin write(output, segment:1,' segment'); if segment > 1 then write(output,'s'); writeln(output); write(output, protcount:1,' protein'); if protcount > 1 then write(output,'s'); writeln(output); end; (* write the results out *) p := proteins; while p <> nil do begin if verbose = 'v' then begin writestring(output, p^.name); writeln(output); end; write(protseq, '> '); writestring(protseq, p^.name); writeln(protseq); d := p^.data; while d <> nil do begin writestring(protseq, d^); writeln(protseq); if d^.next = nil then begin if d <> p^.lastdata then begin writeln(output,'ERROR: lastdata is not end of list'); halt end; end; d := d^.next; end; p := p^.next; end; end; (* end module clual.themain *) begin themain(clustalout, clualp, protseq); 1: end. { duplicate for reference: proteinptr = ^protein; (* pointer to a protein string *) protein = record (* a protein string *) name: string; (* the name of the protein *) data: stringptr; (* the sequence of the protein *) lastdata: stringptr; (* the end of the data string *) next: ^protein; (* the next protein *) end; }