program calhnb(fin, fout, output); (* calhnb: calculate e(hnb), var(hnb), ae(hnb), avar(hnb) Dr. Thomas D. Schneider National Cancer Institute Laboratory of Experimental and Computational Biology Frederick, Maryland 21702-1201 toms@ncifcrf.gov permanent email: toms@alum.mit.edu http://www.lecb.ncifcrf.gov/~toms/ module libraries required: delman, prgmods *) label 1; (* end of program *) const (* begin module version *) version = 2.27; (* of calhnb.p 2003 Jul 23 2.28, 2003 Jul 23: remove 'entropy' 2.27, 2001 Jun 7: upgrade documentation 2.26, 2001 Jun 7: upgrade documentation 2.25, 2001 Jun 7: add sd(n) column 2.24, 2000 Jan 18: allow approx. for large n. 2.23, upgrade documentation 2.22, 1994 Sep 5: last changes original before 1984 jan 18 *) (* end module version *) { 2001 Jun 7: the original one liner is not too descriptive! calhnb: calculate e(hnb), var(hnb), ae(hnb), avar(hnb), e(n), sd(n) } (* begin module describe.calhnb *) (* name calhnb: small-sample correction for information and uncertainty synopsis calhnb(fin: in, fout: out, output: out) files fin: the genomic composition (integers) on one line followed by a set of integers, one per line representing values of n fout: a table showing n, e(hnb), ae(hnb) and their difference. the variances var(hnb) and avar(hnb) are tabulated along with the difference between their square roots. This is the difference between the standard deviations. e(n) is found from the genomic uncertainty minus e(hnb). Finally, sd(n) = sqrt(var(hnb)) is given. output: messages to the user. describe Given a genomic composition and a series of integers (n) that represent the number of sample sites, calhnb calculates the sampling error as e(hnb) and the variance var(hnb). It also finds the approximations ae(hnb) and avar(hnb). These values are presented in a table along with the differences between the exact and approximate calculations. This table will allow a user to decide when to use the approximations. Beware that the exact calculation becomes very expensive for large n. For this reason, I use the approximate computation for n > 20 in rseq and alpro. examples When used as fin, the calhnb.fin file should generate the calhnb.fout file in the fout. The data should be identical those given in Figure A.2 on page 428 of the Appendix of Schneider et al 1986. documentation "Information content of binding sites on nucleotide sequences" T. D. Schneider, G. D. Stormo, L. Gold, and A. Ehrenfeucht JMB 188:415-431 (1986) [see link below] see also {Example input file, fin:} calhnb.fin {Corresponding output file, fout:} calhnb.fout {Discussion about correctiing for small sample size:} http://www.lecb.ncifcrf.gov/~toms/small.sample.correction.html {Schneider et al. (1986):} http://www.lecb.ncifcrf.gov/~toms/paper/schneider1986 {related programs:} rseq.p, alpro.p author Thomas D. Schneider bugs It would be nice to have a generalized algorithm for any number of symbols. *) (* end module describe.calhnb *) var fin, (* input file of n values one per line *) fout: (* table of e(hnb), ae(hnb) and variances *) text; (* begin module halt *) procedure halt; (* stop the program. the procedure performs a goto to the end of the program. you must have a label: label 1; declared, and also the end of the program must have this label: 1: end. examples are in the module libraries. this is the only goto in the delila system. *) begin writeln(output,' program halt.'); goto 1 end; (* end module halt version = 'prgmod 3.96 85 mar 18 tds'; *) (* begin module calehnb *) procedure calehnb(n: integer; gna,gnc,gng,gnt: integer; var hg, ehnb, varhnb: real (* ; debugging: boolean *)); (* calculate e(hnb) in bits/bp (ehnb) for a number (n) of example sequence sites. gna to gnt are the composition to use for the genome probabilities of a to t. the genomic uncertainty hg and the variance var(hnb) (=varhnb) are also calculated. if the variable debugging is passed to the procedure then the individual combinations of hnb are displayed. note: this procedure should not be broken into smaller procedures so that it remains efficient. version = 3.02; of procedure calehnb 1983 nov 23 *) const maxsize = 200; (* the largest n allowed *) accuracy = 10000; (* less than (1/accuracy) bits error is demanded for the sum of pnb (see variable 'total') at the end of the procedure *) var log2: real; (* natural log of 2, used to find log base 2 *) logn: real; (* log of n *) nlog2: real; (* n * log2 *) gn: integer; (* sum of gna..gnt *) logpa,logpc,logpg,logpt: real; (* logs of genome probabilities *) (* log of n factorial is the sum of i=1 to n of log(i). the array below represents these logs up to n *) logfact: array[0..maxsize] of real; (* precalculated values of -p*log2(p), where p=nb/n for nb = 0 .. n. m stands for minus *) mplog2p: array[0..maxsize] of real; i: integer; (* index for logfact and mplog2p *) logi: real; (* natural log of i *) na, nc, ng, nt: integer; (* numbers of bases in a site *) done: boolean; (* true when the loop is completed *) pnb: real; (* multinomial probability of a combination of na, nc, ng, nt *) hnb: real; (* uncertainty for a combination of na..nt *) pnbhnb: real; (* pnb*hnb, an intermediate result *) sshnb: real; (* sum of squares of hnb *) (* variables for testing program correctness: *) total: real; (* sum of pnb over all combinations of na..nt if this is not 1.00, the program is in error *) counter: integer; (* counts the number of times through the loop *) begin (* calehnb *) (* prevent access to outside the arrays: *) if n > maxsize then begin writeln(output,' procedure calehnb: n > maxsize (', n:1,'>',maxsize:1,')'); halt end; counter := 0; total := 0.0; log2 := ln(2); logn := ln(n); nlog2 := n * log2; (* get logs of genome probabilities *) gn := gna + gnc + gng + gnt; logpa := ln(gna/gn); logpc := ln(gnc/gn); logpg := ln(gng/gn); logpt := ln(gnt/gn); (* find genomic uncertainty *) hg := -(gna*logpa + gnc*logpc + gng*logpg + gnt*logpt)/(gn*log2); ehnb := 0.0; (* start error uncertainty at zero *) sshnb := 0.0; (* make table of log of n factorial up to n and entropies for nb/n *) logfact[0] := 0.0; (* factorial(0) = 0 *) mplog2p[0] := 0.0; for i := 1 to n do begin logi := ln(i); logfact[i] := logfact[i - 1] + logi; mplog2p[i] := - i * (logi - logn) / nlog2 end; (* begin by looking at the combination with all a: na = n *) na := n; nc := 0; ng := 0; nt := 0; (* the following loop simulates a number of nested loops of the form: for b1=a to t do for b2=b1 to t do for b3=b2 to t do ... for bn=b(n-1) to t do ... the resulting set of variables increase in alphabetic order since no inner loop variable can have a value less than any outer loop. the number of times through the inner-most loop is given by: o = (n + 1)*(n + 2)*(n + 3)/6 in the case where there are four symbols (a,c,g,t) and n is the number of nested loops. a recursive set of loops would be possible, but it would use up too much memory in practical cases (up to n=150 or higher). a second algorithm sequests the loop variables into an array and increments them there. however, the goal is to get all possible combinations for na, nc, ng, nt, where the sum of these is n. the nested loops provide all the combinations in alphabetic order, assuring that there can not be any duplicates. to find nb (one of na..nt) one would look at which of the variables b1 to bn were of value b. this is a wasteful operation. the loop below simulates the array of control variables by changing each nb directly. *) done := false; repeat (* pnb is calculated by taking the log of the expression fact(n) na nc ng nt pnb = ------------------- pa * pc * pg * pt . fact(na).. fact(nt) log(pnb) generates a series of sums, allowing the calculation to proceed by addition and multiplication rather than multiplication and exponentiation. the factorials become tractable in this way *) pnb := exp(logfact[n] (* n factorial *) - logfact[na] - logfact[nc] - logfact[ng] - logfact[nt] + na * logpa + nc * logpc + ng * logpg + nt * logpt); hnb := mplog2p[na] + mplog2p[nc] + mplog2p[ng] + mplog2p[nt]; pnbhnb := pnb * hnb; ehnb := ehnb + pnbhnb; sshnb := sshnb + pnbhnb * hnb; (* sum of squares of hnb *) (* the following section keeps track of the calculation and writes out the current set of nb. *) counter := succ(counter); (* if debugging then begin write(output,' ',counter:2,' '); for i := 1 to na do write(output,'a'); for i := 1 to nc do write(output,'c'); for i := 1 to ng do write(output,'g'); for i := 1 to nt do write(output,'t'); write(output,' ',na:3,nc:3,ng:3,nt:3); writeln(output,' pnb = ',pnb:10:5); end; *) total := total + pnb; (* the remaining portion of this repeat loop generates the values of na, nc, ng and nt. notice that there are 7 possibilities at each loop increment. other than the stop, in each case the sum of na+nc+ng+nt remains constant (=n). *) if nt > 0 then begin (* ending on a t - do outer loops *) if ng > 0 then begin (* turn g into t *) ng := ng - 1; nt := nt + 1 end else if nc > 0 then begin (* turn one c into g, and all t to g (note ng = 0 initially) *) nc := nc - 1; ng := nt + 1; nt := 0 end else if na > 0 then begin (* turn one a into c and all g and t to c. (note ng=nc=0 initially) *) na := na - 1; nc := nt + 1; nt := 0 end else done := true (* since nt = n *) end else begin (* no t - increment innermost loop *) if ng > 0 then begin (* turn g into t *) ng := ng - 1; nt := nt + 1 end else if nc > 0 then begin (* turn c into g *) nc := nc - 1; ng := ng + 1 end else begin (* na > 0; turn a into c *) na := na - 1; nc := nc + 1 end end until done; (* final adjustment: we only have the sum of squares so far *) varhnb := sshnb - ehnb*ehnb; (* if this message appears, there is either a bug in the code or the computer cannot be as accurate as requested *) if accuracy <> round(accuracy * total) then begin writeln(output,' procedure calehnb: the sum of probabilities is'); writeln(output,' not accurate to one part in ',accuracy:1); writeln(output,' the sum of the probabilities is ',total:10:8); end; (* if this message appear, then there is an error in the repeat-until loop: it did not repeat as many times as is expected from the algorithm *) if counter <> round((n+1)*(n+2)*(n+3)/6) then begin writeln(output,' procedure calehnb: program error, the number of'); writeln(output,' calculations is in error'); halt end; (* writeln(output, ' total: ',total:10:5); writeln(output,' count = ',counter:1); writeln(output,' (n+1)*(n+2)*(n+3)/6 = ', round((n+1)*(n+2)*(n+3)/6):1); *) end; (* calehnb *) (* end module calehnb *) (* begin module calaehnb *) procedure calaehnb(n: integer; gna,gnc,gng,gnt: integer; var hg, aehnb, avarhnb: real); (* calculate ae(hnb) in bits/bp (=aehnb) for a number (n) of example sequence sites. gna to gnt are the composition to use for the genome probabilities of a to t. the genomic uncertainty (=hg) and the variance avar(hnb) (=avarhnb) are also calculated. this procedure gives approximations for e(hnb) and var(hnb). it is based on a formula derived by jeff haemer. see also: G. P. Basharin Theory Probability Appl. 4(3): 333-336 (1959) 'On a statistical estimate for the entropy of a sequence of independent random variables' version = 3.01; of procedure calaehnb 1983 nov 23 *) var log2: real; (* natural log of 2 *) gn: integer; (* sum of gna..gnt *) pa, pc, pg, pt: real; (* genomic probabilities *) e: real; (* the approximate sampling error *) begin (* calaehnb *) log2 := ln(2); gn := gna + gnc + gng + gnt; pa := gna/gn; pc := gnc/gn; pg := gng/gn; pt := gnt/gn; hg := -( (pa*ln(pa)) +(pc*ln(pc)) +(pg*ln(pg)) +(pt*ln(pt)))/log2; e := 3/(2*log2*n); aehnb := hg - e; avarhnb := e * e end; (* calaehnb *) (* end module calaehnb *) procedure themain(var fin, fout: text); (* the main procedure of the program *) const inf = 9; (* field for information display *) ind = 5; (* decimal place for information display *) { extra = 5; (* more size for final output *) } mintell = 25; (* the minimum n for which to tell the user were we are... *) maxsize = 200; (* maximum size for computing exactly. This *must* match the same constant in procedure calehnb. *) var gna,gnc,gng,gnt: integer; (* genomic composition *) n: integer; (* number of example sequences *) h, ha, (* genomic hg for calehnb or calaehnb procedures *) ehnb, (* e(hnb) *) aehnb, (* ae(hnb) *) varhnb, (* variance of hnb *) avarhnb: (* approximate variance of hnb *) real; flag: boolean; (* for flagging cases that only have approx computation *) flags: boolean; (* true if flag was ever true *) power: integer; (* 10 raised to the number of digits displayed (ind). by multiplying by this number and rounding, one can check the accuracy of the two genomic uncertainty calculations h and ha. if this is not done, one can have round-off problems. *) begin (* themain *) writeln(output,' calhnb ',version:4:2); rewrite(fout); writeln(fout,'* calhnb ',version:4:2,' calculate statistics of hnb'); writeln(fout,'*'); (* obtain the genomic composition *) reset(fin); if eof(fin) then begin writeln(output,' file fin is empty'); halt end; readln(fin,gna,gnc,gng,gnt); writeln(fout,'* genomic composition: ', ' a = ',gna:1,', ', ' c = ',gnc:1,', ', ' g = ',gng:1,', ', ' t = ',gnt:1); (* find out the genomic uncertainty: *) calaehnb(1,gna,gnc,gng,gnt,ha,aehnb,avarhnb); writeln(fout,'* genomic uncertainty, hg = ',ha:inf:ind,' bits'); writeln(output,'* genomic uncertainty, hg = ',ha:inf:ind,' bits'); writeln(fout,'*'); writeln(fout,'* n is the number of sequence examples'); writeln(fout,'* e(hnb) is the expectation of the uncertainty hnb', ' calculated from n examples'); writeln(fout,'* ae(hnb) an approximation of e(hnb) that is', ' calculated'); writeln(fout,'* more rapidly than e(hnb) for large n'); writeln(fout,'* e diff e(hnb)-ae(hnb)'); writeln(fout,'* var(hnb) is the variance of hnb'); writeln(fout,'* avar(hnb) is the approximate variance of hnb'); writeln(fout,'* std diff is the difference between the standard', ' deviations'); writeln(fout,'* (square roots of) var(hnb) and avar(hnb)'); writeln(fout,'* e(n) hg - e(hnb), the sampling error.'); writeln(fout,'* sd(n) square root of var(hnb).'); writeln(fout,'*'); writeln(fout,'* units are bits/base, except for the variances which'); writeln(fout,'* are the square of these.'); writeln(fout,'*'); writeln(fout,'*','n':3, ' ','e(hnb)':inf, ' ','ae(hnb)':inf, ' ','e diff':inf, ' ','var(hnb)':inf, ' ','avar(hnb)':inf, ' ','std diff':inf, ' ','e(n)':inf, ' ','sd(n)':inf); writeln(fout,'*'); (* ten to the number of digits. see definition of power *) power := round(exp(ind*ln(10.0))); flags := false; while not eof(fin) do begin readln(fin, n); if n > mintell then writeln(output,' calculating n = ',n:4); if n <= maxsize then begin calehnb(n, gna,gnc,gng,gnt,h,ehnb,varhnb); flag := false; end else begin flag := true; flags := true; end; calaehnb(n, gna,gnc,gng,gnt,ha,aehnb,avarhnb); write(fout,' ',n:3, ' ',ehnb:inf:ind, ' ',aehnb:inf:ind, ' ',(ehnb-aehnb):inf:ind); write(fout,' ',varhnb:inf:ind, ' ',avarhnb:inf:ind, ' ',(sqrt(varhnb) - sqrt(avarhnb)):inf:ind, ' ',(h - ehnb):inf:ind); write(fout,' ',sqrt(varhnb):inf:ind); if flag then begin write(fout, ' *'); end else if round(power * h) <> round(power * ha) then begin writeln(output,' program error in procedure themain ', 'h <> ha (',h:inf:ind,'<>',ha:inf:ind,')'); halt end; writeln(fout) end; if flags then begin writeln(fout,'*'); writeln(fout, '* for these, only approximate values were computed'); writeln(fout, 'last values in scientific notation:'); { write(fout,' ',n:3, ' ',ehnb:inf+extra:ind+extra, ' ',aehnb:inf+extra:ind+extra, ' ',(ehnb-aehnb):inf+extra:ind+extra); write(fout,' ',varhnb:inf+extra:ind+extra, ' ',avarhnb:inf+extra:ind+extra, ' ',(sqrt(varhnb) - sqrt(avarhnb)):inf+extra:ind+extra, ' ',(h - ehnb):inf+extra:ind+extra); } write(fout,' ',n:3, ' ',ehnb:inf, ' ',aehnb:inf, ' ',(ehnb-aehnb):inf); write(fout,' ',varhnb:inf, ' ',avarhnb:inf, ' ',(sqrt(varhnb) - sqrt(avarhnb)):inf, ' ',(h - ehnb):inf); writeln(fout,'*'); end; end; (* themain *) begin (* calhnb *) themain(fin, fout); 1: end. (* calhnb *)