program alword(words, alwordp, symvec, output); (* alword: frequency and information of aligned words Tom Schneider National Cancer Institute Laboratory of Mathematical Biology Frederick, Maryland 21702-1201 toms@ncifcrf.gov *) label 1; (* end of program *) const (* begin module version *) version = 2.07; (* of alword.p 1992 June 4 previous version: 1990 August 8 origin 1990 June 14 from alpro 1.22 1990 May 9 *) (* end module version *) (* begin module describe.alword *) (* name alword: frequency and information of aligned words synopsis alword(words: in, symvec: out, output: out) files words: Aligned words. Since the input is usually to be a UNIX dictionary, there need not be any header lines. However, if they exist, they must begin with an asterisk '*'. The remaining lines are used for the words. alwordp: parameters to control the program. If the file is empty defaults are used. If the first line begins with the letter `e' then the words are aligned by their last character. If there is a first line, the second line must have the maximum word length to be included in the calculation. Words longer than this will be skipped (and reported to output). If the first character of the second line is 'a' then all of the words in the file will be read. Otherwise, only the first word on each line will be read. symvec: table of frequencies and information content. The information measure is corrected for small sample size (Schneider et al, 1986). output: messages to the user description Take an aligned set of protein sequences and produce input to the consensus program for producing a logo. examples * This is an example sequence. AGGEGCTT. * This is the second example sequence. * It is the last one. YLREBS. documentation Jotun Hein, Methods of Enzymology 183 (1990) Schneider et al. JMB 188:415 (1986) see also alpro.p, makelogo.p author Thomas Dana Schneider bugs technical notes *) (* end module describe.alword *) (* begin module alword.const *) maxpos = 1000; (* maximum number of positions the program can handle *) (* end module alword.const *) (* begin module alword.type *) type sequence = record letters: array[1..maxpos] of char; (* one of the sequences *) length: integer; (* the length of the sequence *) end; (* end module alword.type *) var words, (* input aligned protein sequences *) alwordp, (* parameters *) symvec: text; (* output frequency table *) (* begin module halt *) procedure halt; (* stop the program. the procedure performs a goto to the end of the program. you must have a label: label 1; declared, and also the end of the program must have this label: 1: end. examples are in the module libraries. this is the only goto in the delila system. *) begin writeln(output,' program halt.'); goto 1 end; (* end module halt version = 4.09; (@ of prgmod.p 1990 May 18 *) (* begin module copyaline *) procedure copyaline(var fin, fout: text); (* copy a line from file fin to file fout *) begin (* copyaline *) while not eoln(fin) do begin fout^ := fin^; put(fout); get(fin) end; readln(fin); writeln(fout); end; (* copyaline *) (* end module copyaline version = 4.09; (@ of prgmod.p 1990 May 18 *) (* begin module skipblanks *) procedure skipblanks(var thefile: text); (* skip over blanks until a non-blank, or end of line, is found *) begin while (thefile^ = ' ') and not eoln(thefile) do get(thefile); end; procedure skipnonblanks(var thefile: text); (* skip over nonblanks until a blank, or end of line, is found *) begin while (thefile^ <> ' ') and not eoln(thefile) do get(thefile); end; (* end module skipblanks version = 4.09; (@ of prgmod.p 1990 May 18 *) (* begin module capitalize *) function capitalize(c: char): char; (* convert the character c to upper case *) var n: integer; (* c is the n'th letter of the alphabet *) begin n := ord(c); if (n >= ord('a')) and (n <= ord('z')) then c := chr( n - ord('a') + ord('A')); capitalize := c end; (* end module capitalize version = 4.09; (@ of prgmod.p 1990 May 18 *) procedure die; (* destroy symvec (as a safety feature) and halt the program *) begin rewrite(symvec); halt end; (* begin module alword.readsequence *) procedure readsequence(var f: text; oneperline: boolean; var s: sequence); (* read in the sequence s from file. The sequence may have a header consisting of any number of lines that begin with an asterisk '*'. The sequence itself contains any character except space and period '.'. If oneperline is true, only one word is read per line *) var c: char; (* a character from words *) done: boolean; (* done reading the sequence *) i: integer; (* position on the sequence *) begin i := 0; with s do begin done := false; skipblanks(f); while not done do begin (* read one word *) read(f,c); (* write(output,c); *) i := i + 1; if i > maxpos then begin writeln(output,'A sequence is longer than', ' constant maxpos (',maxpos:1,').'); die; end else s.letters[i] := c; if eoln(f) then begin done := true; readln(f) end (* else if (ord(capitalize(c)) < ord('A')) or (ord(capitalize(c)) > ord('Z')) then begin done := true; if oneperline then readln(f) end *) end; if i = 0 then begin (* sequence will be zero long if there was none read *) writeln(output,'zero length sequence found!'); die end else length := i; end; (* writeln(output) *) end; (* end module alword.readsequence *) (* begin module alword.themain *) procedure themain(var words, alwordp, symvec: text); (* the main procedure of the program *) var alignment: char; (* if 'e' by the end of the word *) avarhnb: real; (* variance of the information r *) b: integer; (* index to the letters *) c: char; (* a character from words *) e: real; (* for calculating small sample bias and variance *) endalignment: boolean; (* if true, align words by their end *) f: real; (* frequency of a letter *) i: integer; (* position on the sequence *) j: integer; (* position on the sequence, from the back end of the word *) imax: integer; (* maximum position on the sequence *) ln2: real; (* natural log of 2 for calculating bits *) hmax: real; (* log2 of s *) maxword: integer; (* longest word that will be reversed *) n: integer; (* counting the input sequences *) ntrue: integer; (* true count in a position, deleting blanks *) nbi: array[0..26, 1..maxpos] of integer; (* number of each each amino acid (b) at each position (i). zero is for blanks. *) oneperline: boolean; (* one word per line *) r: real; (* information at a position *) s: integer; (* number of symbols *) skipped: boolean; (* if true, we have already alerted the user about this word being skipped *) seq: sequence; (* one of the sequences *) begin writeln(output,'alword ',version:4:2); reset(words); rewrite(symvec); writeln(symvec,'* alword ',version:4:2); (* get the parameter *) reset(alwordp); if eof(alwordp) then begin alignment := ' '; oneperline := true; end else begin readln(alwordp,alignment); if not eof(alwordp) then readln(alwordp,maxword) else begin maxword := 20; writeln(output,'maximum word size: ',maxword:1); end end; endalignment := (alignment = 'e'); if not eof(alwordp) then begin if alwordp^='a' then oneperline := false else oneperline := true; readln(alwordp) end; (* clear the table *) for i := 1 to maxpos do for b := 0 to 26 do nbi[b,i] := 0; n := 0; imax := 0; while not eof(words) do begin (* read one sequence *) while words^ = '*' do begin writeln(symvec,'*'); copyaline(words,symvec); end; n := n + 1; readsequence(words,oneperline,seq); (* Don't do this except for debugging! writeln(output,'sequence ',n:3,' is ',seq.length:3,' letters long'); *) skipped := false; for i := 1 to seq.length do begin c := capitalize(seq.letters[i]); if (ord(c) < ord('A')) or (ord(c) > ord('Z')) then b := 0 else b := ord(c) - ord('A') + 1; (* make b be 1 to 26 for A to Z *) { write(output,c); } if endalignment then begin if seq.length > maxword then begin if not skipped then begin write(output,'skipping '); for j := 1 to seq.length do write(output,seq.letters[j]); writeln(output); skipped := true; end end else begin j := maxword - seq.length + i; if j > 0 then nbi[b,j] := succ(nbi[b,j]) end end else nbi[b,i] := succ(nbi[b,i]); end; { writeln(output,c); } if seq.length > imax then imax := seq.length end; (* If we are aligning by the end of the word, correct imax to be maxword: *) if endalignment then imax := maxword; s := 26; (* number of symbols *) ln2 := ln(2.0); hmax := ln(s); writeln(symvec,s:1,' number of symbols'); (* write out the array *) for i := 1 to imax do begin (* find the true number of letters at this position *) ntrue := 0; for b := 1 to 26 do ntrue := ntrue + nbi[b,i]; { ntrue := n - nbi[0,i]; (* true number of samples *) write(output,'i=',i:1,' | '); write(output,'ntrue=',ntrue:1,' | '); write(output,'n=',n:1,' | '); writeln(output,'nbi[0,',i:1,']=',nbi[0,i]:1); } (* calculate the information *) if ntrue <= 0 then begin write(output,' WARNING: something is really STRANGE!'); writeln(output,' Position ',i:1,' has ',ntrue:1,' sequences!'); r := 0.0; end else begin r := hmax; for b := 1 to 26 do begin f := nbi[b,i]/ntrue; if f > 0 then r := r + f * ln(f); (* ie, Hmax - (f log f) *) end; r := r / ln2; (* convert to bits *) (* a correction for small sample sizes *) e := (s-1) / (2 * ln2 * ntrue); r := r - e; avarhnb := e * e; if r < 0.0 then r := 0; (* eliminate negative results *) end; (* display the data *) writeln(symvec,i:5,' ',ntrue:5,' ',r:8:5,' ',avarhnb:8:5, ' (position, samples, information, variance)'); for b := 1 to 26 do begin writeln(symvec,chr(b+ord('A')-1), ' ',nbi[b,i]:4); end; end; end; (* end module alword.themain *) begin themain(words, alwordp, symvec); 1: end.