program rseq(encseq, cmp, rsdata, output); (* rseq: rsequence calculated from encoded sequences Dr. Thomas D. Schneider National Institutes of Health National Cancer Institute Center for Cancer Research Nanobiology Program Molecular Information Theory Group Frederick, Maryland 21702-1201 toms@ncifcrf.gov permanent email: toms@alum.mit.edu (use only if first address fails) http://www.ccrnp.ncifcrf.gov/~toms/ module libraries required: delman, matmods, prgmods, auxmods *) label 1; (* end of the program *) const (* begin module version *) version = 5.41; (* of rseq.p 2005 Sep 21 2005 Sep 21: 5.41 improved emptyfile test so 'encode' is written to rsdata' 2005 Sep 19: 5.40 emptyfile test for eof; await compiler problem solution 2000 Jun 28: 5.39 upgrade documentation 1997 June 19: 5.37 point to rsgra.p dalvec.p in see also. 1997 april 21: 5.36 upgrade to compress range to actually used region of encseq. This means the user can specify a range in encodep larger than the book/inst range. thesis version 4.46 completed 1984 mar 1 *) (* end module version *) (* begin module describe.rseq *) (* name rseq: rsequence calculated from encoded sequences synopsis rseq(encseq: in, cmp: in, rsdata: out, output: out) files encseq: the output of the encode program cmp: a composition from the comp program. if cmp is empty, then equal frequencies are assumed. rsdata: a display of the information content of each position of the sequences, with the sampling error variance. This output is ready to be used as input to rsgra or as data for genhis for plotting. output: messages to the user. description Encoded sequences from encseq are converted to a table of frequencies for each base (b) at each aligned position (l). rsequence(l) and the variance var(hnb) are calculated and shown along with their running sums. rsequence and the variance due to sampling error are shown for the whole site, but the running sums let one find rsequence and the variance for any subrange desired. n, the number of example sequences may vary with position, so both n and e(hnb) are shown. documentation Schneider, T.D., G.D. Stormo, L. Gold and A. Ehrenfeucht (1986) The information content of binding sites on nucleotide sequences. J. Mol. Biol. 188: 415-431. see also {INPUT} {the program that makes the input encseq file:} encode.p {the program that makes the input cmp file:} comp.p {OUTPUT:} {the program that uses the rsdata file for making logos:} dalvec.p {GRAPHICS:} {a program to make a graph of the curve (old):} rsgra.p {another program to make a graph of the curve:} xyplo.p {RELATED:} {a program to compute the small sample error:} calhnb.p {a related program to compute frequencies:} encfrq.p author Thomas D. Schneider bugs Does not handle di-nucleotides or longer oligos technical notes Constants maxsize (procedure calehnb) and kickover (procedure makehnblist) determine the largest n for which e(hnb) is used. Above this, ae(hnb) is used. Do not set these below 50 without careful analysis. Other constants are in module rseq.const. *) (* end module describe.rseq *) (* begin module rseq.const *) negativeinfinity = -1000000; (* the definition of negative infinity for the w(b,l) matrix, 100x scaleup *) tolerance = -20.0; (* the definition of the tolerance of the w(b,l) matrix, in bits*) posfield = 4; (* field size for aligned sequence positions *) infofield = 8; (* field size for bits *) infodecim = 5; (* decimal places for bits *) nfield = 4; (* size of field for printing n, the number of sites *) wfield = 10; (* size of field for wmatrix *) (* end module rseq.const *) (* begin module vector.const *) maxvecpart = 64; (* the number of elements in a 'part' of a vector *) vpagewidth = 64; (* the width (in characters) of the output vector file from procedure writevector *) (* end module vector.const version = 'matmod 1.95 85 apr 18 tds/gds'; *) type (* begin module encode.type.param *) (* the following types allow the user to specify parameters which will be used to encode the sequences in the book. *) (* spaces are allowed between the encoded bases, and the number of bases to be skipped between each encoded pair of bases are kept in this linked list of integers *) spaceptr = ^spacelist; spacelist = record skips : integer; (* bases skipped to next coded base *) next : spaceptr; (* points to next spacing number *) end; (* the encoding parameters for each region are stored in these records, and the records for each region are connected into a linked list of all the encoding parameters for the entire sequences *) endpoints = (start,stop); (* end points of a coding region *) paramptr = ^parameter; parameter = record (* these are the instructions for doing the coding *) range : array[endpoints] of integer; (* the bases to be coded by these instructions, relative to the alignedbase *) window : integer; (* the number of bases included in a coding vector *) wshift : integer; (* movement of the window to the next site *) coding : integer; (* the 'level' at which the coding is being done, i.e., 1: mononucleotides; 2: dinucleotides; ... *) spaces : spaceptr; (* the spacing between the encoded bases *) cshift : integer; (* shift to the next coding unit in the window *) (* values calculated at read-in time *) wvlength: integer; (* length of a window vector. this is coding raised to the 4th power *) pvlength: integer; (* parameter vector length. the vector made up of a series of window vectors. *) (* note: pvlength/wvlength is the number of windows in the parameter range *) next : paramptr; (* set of instructions for coding another region *) end; (* end module encode.type.param version = 'matmod 1.95 85 apr 18 tds/gds'; *) (* begin module vector.type *) (* vectors are useful for many applications, including the encoding of sequences. this vector type is designed to be flexible enough to be used whenever one needs a vector. it is designed as a linked list so it can contain as many elements as are ever needed. the actual 'vector' is a record containing the length and a pointer to the first 'vectorpart'. that vectorpart is a record containing an array of the first 'maxvecpart' elements (maxvecpart is a constant, from module vector.const, which must be included) and a pointer to the next 'vectorpart'. the elements are of type real so that either integers or reals may be used. *) partptr = ^vectorpart; vectorpart = record numbers : array[1..maxvecpart] of real; next : partptr; end; vector = record length : integer; part : partptr; end; (* end module vector.type version = 'matmod 1.95 85 apr 18 tds/gds'; *) (* begin module base.type *) (* declare the type base to avoid need for modules from delmods *) base = (a, c, g, t); (* end module base.type version = 'auxmod 1.37 85 apr 4 gds/tds'; *) (* begin module comp.type *) (* the composition is stored in a tree of these nodes *) compnodeptr = ^compnode; (* points to a node of the tree *) compnode = record (* a node of the composition tree *) count : integer; (* the number of oligos for this node *) son : array[base] of compnodeptr; (* the pointers to 'descendants' of this node of the tree *) end; (* spiders are used to make the composition tree *) spiderptr = ^spider; (* points to a 'spider' *) spider = record (* a spider climbs the composition tree, its path determined by the sequences, and increments 'count' at all the nodes it passes, thereby determining the composition *) depth : integer; (* the level of the node now at *) place : compnodeptr; (* a pointer to the current node *) next : spiderptr; (* the next spider in the collection *) end; (* the total number of composition entries at a given level is stored in the linked list of type comptotal *) compptr = ^comptotal; comptotal = record count: integer; (* the number at a given level *) next: compptr; (* pointer to the next level totals *) end; (* 'path' is used in printing the tree *) pathptr = ^path; (* pointer into a path on the tree *) path = record (* the path of bases to get to a particular node *) bas : base; next : pathptr; end; (* end module comp.type version = 'auxmod 1.37 85 apr 4 gds/tds'; *) (* begin module hnblist.type *) (* a list of e(hnb) values for various n. by calculating these only once and saving them in this list, time can be saved. *) ehnblstptr = ^ehnblist; ehnblist = record n: integer; (* number of examples *) ehnb: real; (* e(hnb) for n *) varhnb: real; (* variance for e(hnb) for n *) aflag: char; (* 'a' if approximations used; 'e' if not *) next: ehnblstptr; (* next item on the list *) end; (* end module hnblist.type *) (* ************************************************************************ *) var (* begin module rseq.var *) (* encoded sequences, composition and output files *) encseq, cmp, rsdata: text; (* end module rseq.var *) (* begin module halt *) procedure halt; (* stop the program. the procedure performs a goto to the end of the program. you must have a label: label 1; declared, and also the end of the program must have this label: 1: end. examples are in the module libraries. this is the only goto in the delila system. *) begin writeln(output,' program halt.'); goto 1 end; (* end module halt version = 6.56; (@ of alist.p 2005 Sep 21 *) (* begin module copyaline *) procedure copyaline(var fin, fout: text); (* copy a line from file fin to file fout *) begin (* copyaline *) while not eoln(fin) do begin fout^ := fin^; put(fout); get(fin) end; readln(fin); writeln(fout); end; (* copyaline *) (* end module copyaline version = 6.56; (@ of alist.p 2005 Sep 21 *) (* begin module emptyfile *) function emptyfile(var afile: text): boolean; (* This function is a replacement for eof() to get around a bug in the GPC compiler. The GPC bug is that end-of-file, as provided by eof(), is false when a file is empty. The way to get around it is to count characters in the file to determine if it is empty. Fortunately this is fast for large files because the count stops once a single or a few characters are seen. A single character will not do unfortunately because GPC will give an '*' for an empty file. So the test is for having a certain number (boundary) or fewer characters. This is not great, but it only means that the file must be at least 'boundary' characters long. Furthermore, since the file is being READ, the emptyfile test CANNOT be used in a loop to test for eof! Use emptyfile only to check that the file is empty. Use eof inside loops IF the file is not empty. Dr. Thomas D. Schneider National Institutes of Health National Cancer Institute Center for Cancer Research Nanobiology Program Molecular Information Theory Group Frederick, Maryland 21702-1201 toms@ncifcrf.gov permanent email: toms@alum.mit.edu (use only if first address fails) http://www.ccrnp.ncifcrf.gov/~toms/ *) const debugging = false; boundary = 2; (* one needs at least this many characters for a file to be considered 'non empty'. *) var lines: integer; (* count of lines in the file *) chars: integer; (* count of characters in the file *) ch: char; (* a character in the file *) begin reset(afile); lines := 0; chars := 0; while (not eof(afile)) and (chars < boundary) do begin if eoln(afile) then begin lines := succ(lines); readln(afile); end else begin read(afile,ch); if debugging then begin writeln(output,'ord(ch) = ',ord(ch)); writeln(output,' ch = ', ch ); end; chars := succ(chars); end end; if chars < boundary then emptyfile := true else emptyfile := false; if debugging then begin writeln(output,'lines = ',lines:1); writeln(output,'chars = ',chars:1); if chars < boundary then writeln(output,' empty (file)') else writeln(output,'NOT empty (file)'); end; reset(afile); (* put the file back at the start!! *) end; (* end module emptyfile version = 6.56; (@ of alist.p 2005 Sep 21 *) (* ************************************************************************ *) (* begin module encode.readencpar *) (* read encode parameters *) function vectorsize(param: paramptr): integer; (* determine the size of the vector generated by a string of parameter records, begin with 'param' *) var size: integer; (* of the vector, so far *) begin size := 0; while (param <> nil) do begin size := size + param^.pvlength; param := param^.next; end; vectorsize := size; end; function paramsize(param: paramptr): integer; (* determine the size of the vector generated by one parameter record *) var rangesize, (* the number of bases in the range *) numwindows: integer; (* the number of windows which start in the range, taking into account the shifting *) begin with param^ do begin rangesize := range[stop] - range[start] + 1; numwindows := trunc((rangesize -1)/wshift + 1); paramsize := numwindows * wvlength end end; (* paramsize *) procedure readencpar(var thefile: text; var param: paramptr; var regions, vectorlength: integer); (* read the parameter list from 'thefile' and put the information into the linked parameter list, beginning with 'param'; the number of parameter records is returned in 'regions'. the length of each vector is vectorlength *) var aparam, (* from the parameter list *) newparam: paramptr; (* the new parameter record from 'thefile' *) firstspaces, (* in the spaces list *) newspaces: spaceptr; (* for the spaces list *) i,j: integer; (* indices *) ch: char; (* to check for proper positioning *) begin reset(thefile); if emptyfile(thefile) then begin writeln(output,' encoded sequence file is empty'); halt end; new(aparam); if param = nil then new(param); aparam := param; (* skip the header *) for i := 1 to 2 do readln(thefile); (* skip a blank line in the header *) readln(thefile); (* read the number of regions *) readln(thefile,regions); for i := 1 to regions do begin with aparam^ do begin (* read the range *) read(thefile,range[start]); repeat read(thefile,ch) until (ch = 'o'); readln(thefile,range[stop]); (* read the window size and shift *) readln(thefile,window); readln(thefile,wshift); (* read the coding level and arrangement *) read(thefile,coding); new(spaces); if (coding > 1) then begin repeat read(thefile,ch) until (ch = ':'); new(firstspaces); spaces := firstspaces; read(thefile,spaces^.skips); for j := 2 to coding -1 do begin new(newspaces); spaces^.next := newspaces; spaces := newspaces; read(thefile,spaces^.skips); end; spaces^.next := nil; spaces := firstspaces end else spaces := nil; readln(thefile); readln(thefile,cshift); (* set up wvlength *) wvlength := round(exp(coding*ln(4))); end; (* set up pvlength *) aparam ^.pvlength := paramsize(aparam); if (i < regions) then begin new(newparam); aparam^.next := newparam; aparam := newparam; end; end; aparam^.next := nil; (* read the vector size *) readln(thefile,vectorlength); if vectorlength <> vectorsize(param) then begin writeln(output,' vector length in the encoded file'); writeln(output,' does not correspond to the parameters'); halt end; end; (* end module encode.readencpar version = 'matmod 1.95 85 apr 18 tds/gds'; *) (* ************************************************************************ *) (* begin module vector.primitives *) (* these functions and procedures do manipulations on vectors. *) function vget(v:vector; pos:integer): real; (* this returns from vector 'v' the value of the element at position 'pos' *) var i: integer; begin if (pos > v.length) or (pos < 1) then begin writeln(output,' error in call to function vget:', ' position ',pos:1,' is beyond the end of the vector'); halt; end; (* move to the correct 'vectorpart' *) for i := 1 to ((pos -1) div maxvecpart) do v.part := v.part^.next; (* get the proper array element from this part *) vget := v.part^.numbers[(pos -1) mod maxvecpart + 1]; end; procedure vput(var v:vector; pos:integer; number: real); (* this puts into vector 'v' the value of 'number' at position 'pos' *) var i: integer; firstpart: partptr; (* of the vector *) begin if (pos > v.length) or (pos < 1) then begin writeln(output,' error in call to function vput:', ' position ',pos:1,' is beyond the end of the vector'); halt; end; (* move to the correct 'vectorpart' *) firstpart := v.part; for i := 1 to ((pos -1) div maxvecpart) do v.part := v.part^.next; (* put the 'number' into the proper array element for this part *) v.part^.numbers[(pos -1) mod maxvecpart + 1] := number; v.part := firstpart; end; procedure makevector(var v: vector; l: integer); (* create the linked list of vector-parts which are needed for a vector of length 'l' *) var numparts, (* number of parts needed for the vector *) i: integer; (* index to the parts of the vector *) firstpart, (* of the vector *) newpart: partptr; (* of the vector *) begin if l < 1 then begin writeln(output,' makevector: positive length required'); halt end; v.length := l; new(v.part); firstpart := v.part; numparts := (v.length - 1) div maxvecpart + 1; for i := 1 to (numparts - 1) do begin new(newpart); v.part^.next := newpart; v.part := newpart; end; v.part^.next := nil; v.part := firstpart; end; (* end module vector.primitives version = 'matmod 1.95 85 apr 18 tds/gds'; *) (* begin module vector.readvector *) procedure readvector(var thefile:text; var v: vector); (* read the elements of v into v from 'thefile'. v must already be set up (all the parts created and linked together) before calling. 'thefile' must contain, from the cursor point until the end of the vector, only numbers, either integers or reals; otherwise it will bomb. the vector ends with an end of line so that end of file can be tested for. *) var i,j: integer; (* indecies *) numparts, (* number of parts in the vector *) lastpart: integer; (* the number of elements in the last vector part *) firstpart: partptr; (* pointer to the first vector part *) begin firstpart := v.part; numparts := (v.length - 1) div maxvecpart + 1; lastpart := (v.length - 1) mod maxvecpart + 1; for i := 1 to (numparts - 1) do begin for j := 1 to maxvecpart do read(thefile,v.part^.numbers[j]); v.part := v.part^.next; end; for j := 1 to lastpart do read(thefile,v.part^.numbers[j]); readln(thefile); v.part := firstpart; end; (* end module vector.readvector version = 'matmod 1.95 85 apr 18 tds/gds'; *) (* begin module vector.vset *) procedure vset(thevalue: real; var v: vector); (* set the value thevalue into v *) var i: integer; (* an index *) begin for i := 1 to v.length do vput(v,i,thevalue) end; (* end module vector.vset version = 'matmod 1.95 85 apr 18 tds/gds'; *) (* begin module vector.vadd *) procedure vadd(a: vector; var b: vector); (* add the vector a into the vector b *) var i: integer; (* an index *) begin if a.length <> b.length then begin writeln(output,' function vadd: vector lengths must be equal'); halt end; for i := 1 to a.length do vput(b,i,vget(b,i)+vget(a,i)) end; (* end module vector.vadd version = 'matmod 1.95 85 apr 18 tds/gds'; *) (* ************************************************************************ *) (* begin module comp.readcomp *) (* begin module skipoligo *) procedure skipoligo(var thefile: text); (* this procedure is used to skip over an oligonucleotide string. it reads until it comes to a base, and then continues to read until it comes to a blank. no checking is done for non-base characters in between. *) var ch : char; begin repeat read(thefile,ch) until (ch in ['a','c','g','t']); repeat read(thefile,ch) until (ch = ' '); end; (* end module skipoligo version = 'auxmod 1.37 85 apr 4 gds/tds'; *) procedure readcomp(var comp: text; var compmax: integer; readmax: integer; var root: compnodeptr; var monocomptotal: compptr); (* this procedure requires modules: comp.type, skipoligo, halt; this procedure reads from file 'comp' a composition and puts it into the tree pointed to by the 'root' pointer. 'compmax' is the depth of the composition tree which is stored. it is the minimum of 'readmax', the requested depth, and 'detcomp', the depth for which the input file composition was determined. 'monocomptotal' points to the beginning (for the monos) of a linked list which gives the totals for each level of the composition. *) type (* for efficient reading of the information into the tree, each node that has a successor is stored in a queue as well as put into the tree. the variables of the type list are the queue elements. *) listptr = ^list; list = record item: compnodeptr; (* points to the composition tree node *) next: listptr; (* points to the next list item in the queue *) end; var freeitem, (* the list of unused list pointers *) listitem, (* an item in the queue *) first, (* the first item in the queue *) last: listptr; (* the last item in the queue *) comptot, (* the comp total for a given level *) newcomptot: compptr; (* for adding to the string of comptot"s *) detcomp, (* the level to which the input composition was determined *) level, (* of the composition being read, i.e., monos, dis, ... *) number: integer; (* read from the 'comp' file *) ch: char; (* for reading from 'comp' *) ba: base; (* an index *) (* getitem and clearitem provide efficient use of linked list storage by keeping a list of unused pointers that can be allocated instead of always creating 'new' ones *) procedure getitem(var l: listptr); (* obtain a listitem from the free list or by making a new one *) begin if freeitem<>nil then begin l:=freeitem; freeitem:=freeitem^.next end else new(l); l^.next:=nil end; procedure clearitem(var l: listptr); (* return a listitem to the free list *) var lptr: listptr; begin if l<>nil then begin lptr:=l; l:=l^.next; lptr^.next:=freeitem; freeitem:=lptr end end; begin (* readcomp *) reset(comp); if emptyfile(comp) then begin writeln(output,' error: no composition file provided'); halt; end; readln(comp); (* skip the program identification *) readln(comp); (* skip the book identification *) readln(comp,detcomp); (* obtain the determined composition *) readln(comp); (* skip the blank line *) (* determine the level of composition to be stored *) if (readmax < 1) then begin writeln(output); writeln(output,' warning: 0 or negative oligo length requested'); writeln(output,' composition used is depth ',detcomp:1); writeln(output); compmax := detcomp end else if (readmax > detcomp) then begin writeln(output); writeln(output,' warning: requested composition oligo', ' length (',readmax:1,')'); writeln(output,' is larger than the determined composition', ' oligo length (',detcomp:1,').'); writeln(output,' composition used is to depth ',detcomp:1); writeln(output); compmax := detcomp end else compmax := readmax; new(root); new(monocomptotal); new(comptot); new(first); new(last); first^.item := root; first^.next := nil; last := first; new(freeitem); freeitem^.next := nil; (* read in the total number of bases from the composition *) readln(comp); (* skip the * *) readln(comp); (* skip the information line *) readln(comp,number); root^.count := number; repeat read(comp,ch); (* skip the space *) read(comp,ch); (* the ch determines what to do next *) if (ch = '*') then begin (* determine the level about to be read *) readln(comp); readln(comp,level); if (level = 1) then monocomptotal := comptot else begin new(newcomptot); comptot^.next := newcomptot; comptot := newcomptot; end; comptot^.count := 0; comptot^.next := nil end else begin (* read the composition values *) for ba := a to t do begin skipoligo(comp); read(comp,number); if (number <> 0) then begin new(first^.item^.son[ba]); first^.item^.son[ba]^.count := number; comptot^.count := comptot^.count + number; getitem(listitem); last^.next := listitem; last := listitem; last^.next := nil; last^.item := first^.item^.son[ba] end else first^.item^.son[ba] := nil; end; clearitem(first); readln(comp); end; until level = compmax; (* for the last level of compositions to be read we don"t need to store the nodes in the queue because their successors will not be read in. *) repeat for ba := a to t do begin skipoligo(comp); read(comp,number); if (number <> 0) then begin new(first^.item^.son[ba]); first^.item^.son[ba]^.count := number; comptot^.count := comptot^.count + number; end else first^.item^.son[ba] := nil; end; clearitem(first); readln(comp); until (first = nil); end; (* readcomp *) function getcount(root: compnodeptr; start: pathptr): integer; (* this function follows the tree from the node 'root' (which may be the root of a subtree) along the path initiated with 'start', and returns the count of the resulting node. if the path through the tree ever hits a null node in the process the count is returned to be zero. *) var place: compnodeptr; (* a place in the composition tree *) point: pathptr; (* a point in the path through the tree *) begin point:= start; place := root; while ((place <> nil) and (point <> nil)) do begin place := place^.son[point^.bas]; point := point^.next; end; if (place = nil) then getcount := 0 else getcount := place^.count; end; (* end module comp.readcomp version = 'auxmod 1.37 85 apr 4 gds/tds'; *) (* begin module comp.getmonocomposition *) procedure getmonocomposition(var cmp: text; var gna,gnc,gng,gnt: integer; var data: text; var equalmono: boolean); (* get the mononucleotide composition in gna to gnt from the file cmp, messages about what is returned go to data. if the file cmp is empty, equal mononucleotides are assumed and equalmono is true *) var (* variables for composition procedures readcomp and getcount *) compmax: integer; root: compnodeptr; monocomptotal: compptr; start: pathptr; b1: base; (* base for comp *) count: integer; (* one of the base counts *) begin (* getmonocomposition *) reset(cmp); if emptyfile(cmp) then begin writeln(data,'* the composition file is empty.'); writeln(data,'* (equal mononucleotides assumed)'); gna := 1; gnc := 1; gng := 1; gnt := 1; equalmono := true end else begin write(data,'*'); copyaline(cmp,data); write(data,'*'); copyaline(cmp,data); readcomp(cmp,compmax,1,root,monocomptotal); (* obtain the mono composition *) new(start); start^.next := nil; for b1 := a to t do begin start^.bas := b1; count := getcount(root,start); case b1 of a: gna := count; c: gnc := count; g: gng := count; t: gnt := count end end; dispose(start); equalmono := false end end; (* getmonocomposition *) (* end module comp.getmonocomposition *) (* ************************************************************************ *) (* begin module calehnb *) procedure calehnb(n: integer; gna,gnc,gng,gnt: integer; var hg, ehnb, varhnb: real (* ; debugging: boolean *)); (* calculate e(hnb) in bits/bp (ehnb) for a number (n) of example sequence sites. gna to gnt are the composition to use for the genome probabilities of a to t. the genomic entropy hg and the variance var(hnb) (=varhnb) are also calculated. if the variable debugging is passed to the procedure then the individual combinations of hnb are displayed. note: this procedure should not be broken into smaller procedures so that it remains efficient. version = 3.02; of procedure calehnb 1983 nov 23 *) const maxsize = 200; (* the largest n allowed *) accuracy = 10000; (* less than (1/accuracy) bits error is demanded for the sum of pnb (see variable 'total') at the end of the procedure *) var log2: real; (* natural log of 2, used to find log base 2 *) logn: real; (* log of n *) nlog2: real; (* n * log2 *) gn: integer; (* sum of gna..gnt *) logpa,logpc,logpg,logpt: real; (* logs of genome probabilities *) (* log of n factorial is the sum of i=1 to n of log(i). the array below represents these logs up to n *) logfact: array[0..maxsize] of real; (* precalculated values of -p*log2(p), where p=nb/n for nb = 0 .. n. m stands for minus *) mplog2p: array[0..maxsize] of real; i: integer; (* index for logfact and mplog2p *) logi: real; (* natural log of i *) na, nc, ng, nt: integer; (* numbers of bases in a site *) done: boolean; (* true when the loop is completed *) pnb: real; (* multinomial probability of a combination of na, nc, ng, nt *) hnb: real; (* entropy for a combination of na..nt *) pnbhnb: real; (* pnb*hnb, an intermediate result *) sshnb: real; (* sum of squares of hnb *) (* variables for testing program correctness: *) total: real; (* sum of pnb over all combinations of na..nt if this is not 1.00, the program is in error *) counter: integer; (* counts the number of times through the loop *) begin (* calehnb *) (* prevent access to outside the arrays: *) if n > maxsize then begin writeln(output,' procedure calehnb: n > maxsize (', n:1,'>',maxsize:1,')'); halt end; counter := 0; total := 0.0; log2 := ln(2); logn := ln(n); nlog2 := n * log2; (* get logs of genome probabilities *) gn := gna + gnc + gng + gnt; logpa := ln(gna/gn); logpc := ln(gnc/gn); logpg := ln(gng/gn); logpt := ln(gnt/gn); (* find genomic entropy *) hg := -(gna*logpa + gnc*logpc + gng*logpg + gnt*logpt)/(gn*log2); ehnb := 0.0; (* start error entropy at zero *) sshnb := 0.0; (* make table of log of n factorial up to n and entropies for nb/n *) logfact[0] := 0.0; (* factorial(0) = 0 *) mplog2p[0] := 0.0; for i := 1 to n do begin logi := ln(i); logfact[i] := logfact[i - 1] + logi; mplog2p[i] := - i * (logi - logn) / nlog2 end; (* begin by looking at the combination with all a: na = n *) na := n; nc := 0; ng := 0; nt := 0; (* the following loop simulates a number of nested loops of the form: for b1=a to t do for b2=b1 to t do for b3=b2 to t do ... for bn=b(n-1) to t do ... the resulting set of variables increase in alphabetic order since no inner loop variable can have a value less than any outer loop. the number of times through the inner-most loop is given by: o = (n + 1)*(n + 2)*(n + 3)/6 in the case where there are four symbols (a,c,g,t) and n is the number of nested loops. a recursive set of loops would be possible, but it would use up too much memory in practical cases (up to n=150 or higher). a second algorithm sequests the loop variables into an array and increments them there. however, the goal is to get all possible combinations for na, nc, ng, nt, where the sum of these is n. the nested loops provide all the combinations in alphabetic order, assuring that there can not be any duplicates. to find nb (one of na..nt) one would look at which of the variables b1 to bn were of value b. this is a wasteful operation. the loop below simulates the array of control variables by changing each nb directly. *) done := false; repeat (* pnb is calculated by taking the log of the expression fact(n) na nc ng nt pnb = ------------------- pa * pc * pg * pt . fact(na).. fact(nt) log(pnb) generates a series of sums, allowing the calculation to proceed by addition and multiplication rather than multiplication and exponentiation. the factorials become tractable in this way *) pnb := exp(logfact[n] (* n factorial *) - logfact[na] - logfact[nc] - logfact[ng] - logfact[nt] + na * logpa + nc * logpc + ng * logpg + nt * logpt); hnb := mplog2p[na] + mplog2p[nc] + mplog2p[ng] + mplog2p[nt]; pnbhnb := pnb * hnb; ehnb := ehnb + pnbhnb; sshnb := sshnb + pnbhnb * hnb; (* sum of squares of hnb *) (* the following section keeps track of the calculation and writes out the current set of nb. *) counter := succ(counter); (* if debugging then begin write(output,' ',counter:2,' '); for i := 1 to na do write(output,'a'); for i := 1 to nc do write(output,'c'); for i := 1 to ng do write(output,'g'); for i := 1 to nt do write(output,'t'); write(output,' ',na:3,nc:3,ng:3,nt:3); writeln(output,' pnb = ',pnb:10:5); end; *) total := total + pnb; (* the remaining portion of this repeat loop generates the values of na, nc, ng and nt. notice that there are 7 possibilities at each loop increment. other than the stop, in each case the sum of na+nc+ng+nt remains constant (=n). *) if nt > 0 then begin (* ending on a t - do outer loops *) if ng > 0 then begin (* turn g into t *) ng := ng - 1; nt := nt + 1 end else if nc > 0 then begin (* turn one c into g, and all t to g (note ng = 0 initially) *) nc := nc - 1; ng := nt + 1; nt := 0 end else if na > 0 then begin (* turn one a into c and all g and t to c. (note ng=nc=0 initially) *) na := na - 1; nc := nt + 1; nt := 0 end else done := true (* since nt = n *) end else begin (* no t - increment innermost loop *) if ng > 0 then begin (* turn g into t *) ng := ng - 1; nt := nt + 1 end else if nc > 0 then begin (* turn c into g *) nc := nc - 1; ng := ng + 1 end else begin (* na > 0; turn a into c *) na := na - 1; nc := nc + 1 end end until done; (* final adjustment: we only have the sum of squares so far *) varhnb := sshnb - ehnb*ehnb; (* if this message appears, there is either a bug in the code or the computer cannot be as accurate as requested *) if accuracy <> round(accuracy * total) then begin writeln(output,' procedure calehnb: the sum of probabilities is'); writeln(output,' not accurate to one part in ',accuracy:1); writeln(output,' the sum of the probabilities is ',total:10:8); end; (* if this message appear, then there is an error in the repeat-until loop: it did not repeat as many times as is expected from the algorithm *) if counter <> round((n+1)*(n+2)*(n+3)/6) then begin writeln(output,' procedure calehnb: program error, the number of'); writeln(output,' calculations is in error'); halt end; (* writeln(output, ' total: ',total:10:5); writeln(output,' count = ',counter:1); writeln(output,' (n+1)*(n+2)*(n+3)/6 = ', round((n+1)*(n+2)*(n+3)/6):1); *) end; (* calehnb *) (* end module calehnb version = 2.19; (@ of calhnb 1986 mar 31 *) (* begin module calaehnb *) procedure calaehnb(n: integer; gna,gnc,gng,gnt: integer; var hg, aehnb, avarhnb: real); (* calculate ae(hnb) in bits/bp (=aehnb) for a number (n) of example sequence sites. gna to gnt are the composition to use for the genome probabilities of a to t. the genomic entropy (=hg) and the variance avar(hnb) (=avarhnb) are also calculated. this procedure gives approximations for e(hnb) and var(hnb). it is based on a formula derived by jeff haemer. see also: g. p. basharin theory probability appl. 4(3): 333-336 (1959) 'on a statistical estimate for the entropy of a sequence of independent random variables' version = 3.01; of procedure calaehnb 1983 nov 23 *) var log2: real; (* natural log of 2 *) gn: integer; (* sum of gna..gnt *) pa, pc, pg, pt: real; (* genomic probabilities *) e: real; (* the approximate sampling error *) begin (* calaehnb *) log2 := ln(2); gn := gna + gnc + gng + gnt; pa := gna/gn; pc := gnc/gn; pg := gng/gn; pt := gnt/gn; hg := -( (pa*ln(pa)) +(pc*ln(pc)) +(pg*ln(pg)) +(pt*ln(pt)))/log2; e := 3/(2*log2*n); aehnb := hg - e; avarhnb := e * e end; (* calaehnb *) (* end module calaehnb version = 2.19; (@ of calhnb 1986 mar 31 *) (* begin module hnblist.makehnblist *) procedure makehnblist(n: vector; gna, gnc, gng, gnt: integer; var l: ehnblstptr; var hg: real); (* make a sorted list l of e(hnb) values based on the vector of samples n and the genomic composition gna to gnt. also, return hg for the genomic information *) const kickover = 50; (* e(hnb) for n values above this are estimated from ae(hnb) = hg - 3/(2 ln(2) n) while those below are calculated exactly *) var nindex: integer; (* index to n *) num: integer; (* hold the value of n at nindex to avoid multiple calls to vget *) lindex: ehnblstptr; (* index to l *) spare: ehnblstptr; (* a spare pointer for construction *) done: boolean; (* have we finished searching l? *) procedure fill(var l: ehnblstptr); (* fill l with the (global) values of n from at nindex *) begin (* fill *) if num = 0 then begin {zzz} { writeln(output,' do not use encodings that reach', ' beyond the sequences'); halt } l^.aflag := ' ' (* no computation to be done here *) end else begin l^.n := num; if num <= kickover then begin calehnb(num, gna,gnc,gng,gnt, hg,l^.ehnb,l^.varhnb); l^.aflag := 'e' (* not using approximation *) end else begin calaehnb(num, gna,gnc,gng,gnt, hg,l^.ehnb,l^.varhnb); l^.aflag := 'a' (* using approximation *) end end end; (* fill *) begin (* makehnblist *) new(l); (* start the list up *) nindex := 1; (* for the first one *) num := round(vget(n,nindex)); fill(l); l^.next := nil; for nindex := 2 to n.length do begin num := round(vget(n,nindex)); (*write(output,' nindex = ',nindex:1); (@ debug *) (*write(output,' num = ',num:1);(@debug*) if num < l^.n then begin (* insert before, to become the first on the list *) (*writeln(output,' first on list'); (@debug*) new(spare); spare^.next := l; l := spare; fill(l) end else begin (* insert somewhere in l *) lindex := l; done := false; while not done do if lindex^.next = nil then done := true else if lindex^.next^.n <= num then lindex := lindex^.next else done := true; if lindex^.n <> num then begin (*write(output,' accepted'); (@debug*) (* new number to insert, otherwise ignore *) if lindex^.next = nil then begin (* add to end *) (*writeln(output,' add to end'); (@debug*) new(lindex^.next); lindex := lindex^.next; lindex^.next := nil end else begin (* insert to middle of list *) (*writeln(output,' insert to middle'); (@debug*) new(spare); spare^.next := lindex^.next; lindex^.next := spare; lindex := spare end; fill(lindex) end (* else: reject this n since it is already on the list *) (*else writeln(output,' rejected') (@debug*) end end end; (* makehnblist *) (* end module hnblist.makehnblist *) (* begin module hnblist.gethnb *) procedure gethnb(l: ehnblstptr; n: integer; var ehnb, varhnb: real; var aflag: char); (* get the e(hnb) [ehnb] value for n example sites by looking in the precalculated list l. also return the variance of hnb, and whether these are approximations. *) var lindex: ehnblstptr; (* index to l *) begin (* gethnb *) lindex := l; while (lindex^.n <> n) and (lindex^.next <> nil) do lindex := lindex^.next; if lindex^.n <> n then begin (* oh oh, something is wrong *) writeln(output,' gethnb: program error.'); writeln(output,' unable to find n = ',n:1); halt end; ehnb := lindex^.ehnb; varhnb := lindex^.varhnb; aflag := lindex^.aflag end; (* gethnb *) (* end module hnblist.gethnb *) (* ************************************************************************ *) (* begin module rseq.header *) procedure header(var data, encseq: text); (* display the header to data, using information in the encoded sequences, encseq *) begin (* header *) rewrite(data); writeln(data,'* rseq ',version:4:2, ' rsequence calculated from encoded sequences from:'); reset(encseq); if emptyfile(encseq) then begin writeln(output,' encseq is empty'); halt end; write(data,'*'); copyaline(encseq,data); write(data,'*'); copyaline(encseq,data); writeln(data,'*'); end; (* header *) (* end module rseq.header *) (* begin module rseq.checkparams *) procedure checkparams(theparameters: paramptr); (* make sure that the parameters are fitting to this version of rseq *) begin (* checkparams *) with theparameters^ do begin if window <> 1 then begin writeln(output,' rseq requires an encoding window length of 1'); halt end; if wshift <> 1 then begin writeln(output,' rseq requires encoding window shifts of 1'); halt end; if coding <> 1 then begin writeln(output,' rseq can only handle mononucleotide encodings'); halt end; if cshift <> 1 then begin writeln(output,' rseq can only handle coding shifts of 1'); halt end; if next <> nil then begin writeln(output,' rseq can only handle one', ' encoding parameter set'); halt end; end; end; (* checkparams *) (* end module rseq.checkparams *) (* begin module vector.sumvectors *) procedure sumvectors(var encseq: text; var nbl: vector; var firstparam: paramptr); (* scan the encoded sequences in encseq and sum them up in nbl *) var regions: integer; (* for readencpar *) vsize: integer; (* size of the vectors *) v: vector; (* a vector read in *) begin (* sumvectors *) firstparam := nil; readencpar(encseq,firstparam,regions,vsize); makevector(nbl,vsize); makevector(v,vsize); vset(0.0,nbl); while not eof(encseq) do begin readvector(encseq,v); vadd(v,nbl); end end; (* sumvectors *) (* end module vector.sumvectors *) (* begin module vector.makenl *) procedure makenl(nbl: vector; var nl: vector; var firstparam: paramptr); (* sum nbl (the number of bases (b) of each type at a position l) over the bases to create nl, the number of bases at position l *) var aparam: paramptr; (* one of the parameters *) l: integer; (* a position in the sequence *) v: integer; (* position in the vectors *) w: integer; (* position in a window of the encoding *) total: real; (* sum of encoding at l *) begin makevector(nl,nbl.length); vset(0.0,nl); aparam := firstparam; v := 0; (* handle one parameter at a time: *) while aparam <> nil do begin l := aparam^.range[start]; repeat (* add up bases at position l *) total := 0; for w := 1 to aparam^.wvlength do total := total + vget(nbl,w+v); (* put the total into the same positions of nl *) for w := 1 to aparam^.wvlength do vput(nl,w+v,total); (* move to the next slots in the vector *) v := v + aparam^.wvlength; l := l + aparam^.wshift; until l > aparam^.range[stop]; (* move to the next parameter *) aparam := aparam^.next; end; end; (* end module vector.makenl *) (* begin module vector.compressnl *) procedure compressnl(nl: vector; firstparam: paramptr; var thefrom, theto: integer); (* Determine the true from-to bounds of the nl vector *) var aparam: paramptr; (* one of the parameters *) l: integer; (* a position in the sequence *) total: real; (* sum of encoding at l *) v: integer; (* position in the vectors *) begin aparam := firstparam; thefrom := +maxint; theto := -maxint; v := 0; (* handle one parameter at a time: *) while aparam <> nil do begin l := aparam^.range[start]; repeat { for w := 1 to aparam^.wvlength do total := total + vget(nbl,w+v); (* put the total into the same positions of nl *) for w := 1 to aparam^.wvlength do vput(nl,w+v,total); } total := vget(nl,1+v); if total > 0 then theto := l; if thefrom >= +maxint then if total > 0 then thefrom := l; (* move to the next slots in the vector *) v := v + aparam^.wvlength; l := l + aparam^.wshift; until l > aparam^.range[stop]; (* move to the next parameter *) aparam := aparam^.next; end; end; (* end module vector.compressnl *) (* begin module rseq.prepareehnb *) procedure prepareehnb(var cmp, data: text; nl: vector; var ehnb: ehnblstptr); (* prepare the list of e(hnb) values keyed on the number of sequences, n(l). the procedure reads the composition from cmp, and announces what it did to data. *) const cmpnmin = 1000; (* minimum number of bases allowed for the composition *) var (* mononucleotide genomic composition *) gmono: array[0..3] of integer; (* a to t *) genomicn: integer; (* the number of bases of the composition *) equalmono: boolean; (* true if equal mononucleotides were assumed by getmonocomposition *) hg: real; (* the genomic entropy *) begin (* prepareehnb *) writeln(data,'* genomic probabilities from:'); getmonocomposition(cmp,gmono[0],gmono[1],gmono[2],gmono[3], data, equalmono); genomicn := gmono[0]+gmono[1]+gmono[2]+gmono[3]; if not equalmono then if genomicn < cmpnmin then begin writeln(output,' ************ WARNING ***********'); writeln(output,' there should be at least'); writeln(output,' ',cmpnmin:4,' bases of composition information'); writeln(output,' to avoid needing to adjust for sampling'); writeln(output,' error in the genomic hg.'); writeln(output,' the error with ',genomicn:5,' bases is ', 3/(2*genomicn*ln(2)):infofield:infodecim,' bits'); end; writeln(data,'* the genomic composition used is: ', ' a = ',gmono[0]:1, ', c = ',gmono[1]:1, ', g = ',gmono[2]:1, ', t = ',gmono[3]:1); writeln(data,'*'); makehnblist(nl, gmono[0],gmono[1],gmono[2],gmono[3], ehnb, hg); writeln(data,'* genomic information, hg: ',hg:infofield:infodecim, ' bits/base'); writeln(data,'*') end; (* prepareehnb *) (* end module rseq.prepareehnb *) (* begin module rseq.colinfo *) procedure colinfo(var data: text); (* send the final header column information to data *) begin (* colinfo *) writeln(data,'* l position in aligned set'); writeln(data,'* a..t number of each base at the position l'); writeln(data,'* rs(l) rsequence(l)'); writeln(data,'* rs rsequence, running sum of rs(l)'); writeln(data,'* var(hnb) variance of hnb'); writeln(data,'* sum-var running sum of var(hnb)'); writeln(data,'* n(l) number of sequence examples at l'); writeln(data,'* e(hnb) expectation of hnb'); writeln(data,'* f flag for calculation of e(hnb) and var(hnb):'); writeln(data,'* e = exact, a = approximation'); end; (* colinfo *) (* end module rseq.colinfo *) (* begin module rseq.colid *) procedure colid(var data: text); (* identify each column of the output *) begin (* colid *) writeln(data,'*', ' l':(posfield), (* no +1 since the * is there *) ' a':(nfield+1), ' c':(nfield+1), ' g':(nfield+1), ' t':(nfield+1), ' rs(l)':(infofield+1), ' rs':(infofield+1), ' var(hnb)':(infofield+1), ' sum-var':(infofield+1), ' n(l)':(posfield+1), ' e(hnb)':(infofield+1), ' f'); end; (* colid *) (* end module rseq.colid *) (* begin module rseq.finalcomments *) procedure finalcomments(var data: text; rs, sumvarhnb: real); (* write rsequence and its standard deviation data to data *) begin (* finalcomments *) writeln(data,'* rsequence = ',rs:infofield:infodecim, ' +/- ',sqrt(sumvarhnb):infofield:infodecim,' bits/site'); end; (* finalcomments *) (* end module rseq.finalcomments *) (* begin module rseq.makers *) procedure makers(var data: text; (* display rs(l) and other results *) nbl, (* number of each kind of base at each position *) nl: vector; (* number of sequences at each position *) ehnblist: ehnblstptr; (* e(hnb) for various n *) theparameters: paramptr; (* the parameters that describe the vectors *) thefrom, theto: integer (* actual range of the data *)); (* make rsequence. the core of the program. the vectors n(bl) [nbl] and n(l) [nl] are scanned to generate frequencies of bases. these are used to generate the w matrix w(b,l) [wbl], the average rs(l) [rsl] and then rsequence. sampling error is also recorded. *) var aflag: char; (* 'a' if the approximations were used for ehnb and varhnb, 'e' if they were calculated exactly *) b: integer; (* the position in either vector corresponding to l. the b variables correspond to the base index *) bstart: integer; (* the first position in the vector for some l, used to get the number of sites *) bstop: integer; (* the last position in the vector *) ehnb: real; (* e(hnb) *) freq: real; (* the frequency of a base at a position *) l: integer; (* the aligned position in the range *) ln2: real; (* the natural log of 2. dividing the natural log of a number by ln2 gives the log to the base 2 of the number *) rangestart: integer; (* the start of the range *) rangestop: integer; (* the stop of the range *) rsl: real; (* rsequence at a position l *) rs: real; (* the running sum of rsl *) sumvarhnb: real; (* running sum of varhnb *) symbols: integer; (* the number of bases *) varhnb: real; (* var(hnb) at a position l *) wbl: real; (* the weight w(b,l) *) begin (* makers *) ln2 := ln(2); rangestart := theparameters^.range[start]; rangestop := theparameters^.range[stop]; symbols := theparameters^.wvlength; rs := 0; sumvarhnb := 0; (* the following information is provided to allow another program to know how many lines will follow *) { writeln(data,'* ',rangestart:posfield,' ',rangestop:posfield, ' is the range'); zzz} writeln(data,'* ',thefrom:posfield,' ',theto:posfield, ' is the range'); writeln(data,'*'); colinfo(data); writeln(data,'*'); colid(data); {zzz} { for l := rangestart to rangestop do if (l >= thefrom) and (l <= theto) begin } for l := thefrom to theto do begin (* begin the display *) write(data,' ',l:posfield); bstart := symbols*(l - rangestart) + 1; bstop := symbols*(l - rangestart) + symbols; gethnb(ehnblist,round(vget(nl,bstart)),ehnb,varhnb,aflag); rsl := 0; for b := bstart to bstop do begin write(data,' ',round(vget(nbl,b)):nfield); freq := vget(nbl,b)/vget(nl,b); if freq > 0.0 then wbl := ehnb + ln(freq) /ln2 else wbl := negativeinfinity; rsl := rsl + freq * wbl end; rs := rs + rsl; sumvarhnb := sumvarhnb + varhnb; (* display the results *) writeln(data,' ',rsl:infofield:infodecim, ' ',rs:infofield:infodecim, ' ',varhnb:(infofield+3), (* scientific notation *) ' ',sumvarhnb:(infofield+3), (* scientific notation *) { ' ',varhnb:infofield:infodecim, ' ',sumvarhnb:infofield:infodecim, } ' ',round(vget(nl,bstart)):nfield, (* the extra space looks nicer *) ' ',ehnb:infofield:infodecim, ' ',aflag); end; finalcomments(output,rs,sumvarhnb); finalcomments(data,rs,sumvarhnb) end; (* makers *) (* end module rseq.makers *) (* begin module rseq.themain *) procedure themain(var encseq, cmp, rsdata: text); (* the main procedure *) var nbl, (* sum of each kind of base at each position *) nl: (* number of sequences at each position *) vector; ehnb: ehnblstptr; (* list of e(hnb) values *) theparameters: paramptr; (* how the vectors are structured *) thefrom, theto: integer; (* the region of the encoding that is not zero *) begin (* themain *) writeln(output, ' rseq ',version:4:2); header(rsdata,encseq); sumvectors(encseq,nbl,theparameters); checkparams(theparameters); {zzz} { writeln(output,'theparameters^.range[start] =', theparameters^.range[start]:5); writeln(output,'theparameters^.range[stop ] =', theparameters^.range[stop ]:5); } makenl(nbl,nl,theparameters); compressnl(nl,theparameters, thefrom, theto); { writeln(output,'thefrom = ',thefrom:1); writeln(output,'theto = ',theto:1); } prepareehnb(cmp,rsdata,nl,ehnb); makers(rsdata,nbl,nl,ehnb,theparameters,thefrom,theto); end; (* themain *) (* end module rseq.themain *) begin (* rseq *) themain(encseq,cmp,rsdata); 1: end. (* rseq *)