/* Output from p2c 1.21alpha-07.Dec.93, the Pascal-to-C translator */ /* From input file "calhnb.p" */ #include /* calhnb: calculate e(hnb), var(hnb), ae(hnb), avar(hnb) Dr. Thomas D. Schneider National Cancer Institute Laboratory of Experimental and Computational Biology Frederick, Maryland 21702-1201 toms@ncifcrf.gov permanent email: toms@alum.mit.edu http://www.lecb.ncifcrf.gov/~toms/ module libraries required: delman, prgmods */ /* end of program */ /* begin module version */ #define version 2.27 /* of calhnb.p 2003 Jul 23 2.28, 2003 Jul 23: remove 'entropy' 2.27, 2001 Jun 7: upgrade documentation 2.26, 2001 Jun 7: upgrade documentation 2.25, 2001 Jun 7: add sd(n) column 2.24, 2000 Jan 18: allow approx. for large n. 2.23, upgrade documentation 2.22, 1994 Sep 5: last changes original before 1984 jan 18 */ /* end module version */ /* 2001 Jun 7: the original one liner is not too descriptive! calhnb: calculate e(hnb), var(hnb), ae(hnb), avar(hnb), e(n), sd(n) */ /* begin module describe.calhnb */ /* name calhnb: small-sample correction for information and uncertainty synopsis calhnb(fin: in, fout: out, output: out) files fin: the genomic composition (integers) on one line followed by a set of integers, one per line representing values of n fout: a table showing n, e(hnb), ae(hnb) and their difference. the variances var(hnb) and avar(hnb) are tabulated along with the difference between their square roots. This is the difference between the standard deviations. e(n) is found from the genomic uncertainty minus e(hnb). Finally, sd(n) = sqrt(var(hnb)) is given. output: messages to the user. describe Given a genomic composition and a series of integers (n) that represent the number of sample sites, calhnb calculates the sampling error as e(hnb) and the variance var(hnb). It also finds the approximations ae(hnb) and avar(hnb). These values are presented in a table along with the differences between the exact and approximate calculations. This table will allow a user to decide when to use the approximations. Beware that the exact calculation becomes very expensive for large n. For this reason, I use the approximate computation for n > 20 in rseq and alpro. examples When used as fin, the calhnb.fin file should generate the calhnb.fout file in the fout. The data should be identical those given in Figure A.2 on page 428 of the Appendix of Schneider et al 1986. documentation "Information content of binding sites on nucleotide sequences" T. D. Schneider, G. D. Stormo, L. Gold, and A. Ehrenfeucht JMB 188:415-431 (1986) [see link below] see also {Example input file, fin:} calhnb.fin {Corresponding output file, fout:} calhnb.fout {Discussion about correctiing for small sample size:} http://www.lecb.ncifcrf.gov/~toms/small.sample.correction.html {Schneider et al. (1986):} http://www.lecb.ncifcrf.gov/~toms/paper/schneider1986 {related programs:} rseq.p, alpro.p author Thomas D. Schneider bugs It would be nice to have a generalized algorithm for any number of symbols. */ /* end module describe.calhnb */ Static _TEXT fin; /* input file of n values one per line */ Static _TEXT fout; /* table of e(hnb), ae(hnb) and variances */ Static jmp_buf _JL1; /* begin module halt */ Static Void halt() { /* stop the program. the procedure performs a goto to the end of the program. you must have a label: label 1; declared, and also the end of the program must have this label: 1: end. examples are in the module libraries. this is the only goto in the delila system. */ printf(" program halt.\n"); longjmp(_JL1, 1); } #define maxsize 200 /* the largest n allowed */ #define accuracy 10000 /* end module halt version = 'prgmod 3.96 85 mar 18 tds'; */ /* begin module calehnb */ Static Void calehnb(n, gna, gnc, gng, gnt, hg, ehnb, varhnb) long n, gna, gnc, gng, gnt; double *hg, *ehnb, *varhnb; { /* ; debugging: boolean */ /* calculate e(hnb) in bits/bp (ehnb) for a number (n) of example sequence sites. gna to gnt are the composition to use for the genome probabilities of a to t. the genomic uncertainty hg and the variance var(hnb) (=varhnb) are also calculated. if the variable debugging is passed to the procedure then the individual combinations of hnb are displayed. note: this procedure should not be broken into smaller procedures so that it remains efficient. version = 3.02; of procedure calehnb 1983 nov 23 */ /* less than (1/accuracy) bits error is demanded for the sum of pnb (see variable 'total') at the end of the procedure */ double log2 = log(2.0); /* natural log of 2, used to find log base 2 */ double logn; /* log of n */ double nlog2; /* n * log2 */ long gn; /* sum of gna..gnt */ double logpa, logpc, logpg, logpt; /* logs of genome probabilities */ /* log of n factorial is the sum of i=1 to n of log(i). the array below represents these logs up to n */ double logfact[maxsize + 1]; /* precalculated values of -p*log2(p), where p=nb/n for nb = 0 .. n. m stands for minus */ double mplog2p[maxsize + 1]; long i; /* index for logfact and mplog2p */ double logi; /* natural log of i */ long na; long nc = 0, ng = 0, nt = 0; /* numbers of bases in a site */ boolean done = false; /* true when the loop is completed */ double pnb; /* multinomial probability of a combination of na, nc, ng, nt */ double hnb; /* uncertainty for a combination of na..nt */ double pnbhnb; /* pnb*hnb, an intermediate result */ double sshnb = 0.0; /* sum of squares of hnb */ /* variables for testing program correctness: */ double total = 0.0; /* sum of pnb over all combinations of na..nt if this is not 1.00, the program is in error */ long counter = 0; /* counts the number of times through the loop */ /* prevent access to outside the arrays: */ if (n > maxsize) { printf(" procedure calehnb: n > maxsize (%ld>%ld)\n", n, (long)maxsize); halt(); } logn = log((double)n); nlog2 = n * log2; /* get logs of genome probabilities */ gn = gna + gnc + gng + gnt; logpa = log((double)gna / gn); logpc = log((double)gnc / gn); logpg = log((double)gng / gn); logpt = log((double)gnt / gn); /* find genomic uncertainty */ *hg = -((gna * logpa + gnc * logpc + gng * logpg + gnt * logpt) / (gn * log2)); *ehnb = 0.0; /* start error uncertainty at zero */ /* make table of log of n factorial up to n and entropies for nb/n */ logfact[0] = 0.0; /* factorial(0) = 0 */ mplog2p[0] = 0.0; for (i = 1; i <= n; i++) { logi = log((double)i); logfact[i] = logfact[i-1] + logi; mplog2p[i] = i * (logn - logi) / nlog2; } /* begin by looking at the combination with all a: na = n */ na = n; /* the following loop simulates a number of nested loops of the form: for b1=a to t do for b2=b1 to t do for b3=b2 to t do ... for bn=b(n-1) to t do ... the resulting set of variables increase in alphabetic order since no inner loop variable can have a value less than any outer loop. the number of times through the inner-most loop is given by: o = (n + 1)*(n + 2)*(n + 3)/6 in the case where there are four symbols (a,c,g,t) and n is the number of nested loops. a recursive set of loops would be possible, but it would use up too much memory in practical cases (up to n=150 or higher). a second algorithm sequests the loop variables into an array and increments them there. however, the goal is to get all possible combinations for na, nc, ng, nt, where the sum of these is n. the nested loops provide all the combinations in alphabetic order, assuring that there can not be any duplicates. to find nb (one of na..nt) one would look at which of the variables b1 to bn were of value b. this is a wasteful operation. the loop below simulates the array of control variables by changing each nb directly. */ do { /* pnb is calculated by taking the log of the expression fact(n) na nc ng nt pnb = ------------------- pa * pc * pg * pt . fact(na).. fact(nt) log(pnb) generates a series of sums, allowing the calculation to proceed by addition and multiplication rather than multiplication and exponentiation. the factorials become tractable in this way */ pnb = exp(logfact[n] - logfact[na] - logfact[nc] - logfact[ng] - logfact[nt] + na * logpa + nc * logpc + ng * logpg + nt * logpt); /* n factorial */ hnb = mplog2p[na] + mplog2p[nc] + mplog2p[ng] + mplog2p[nt]; pnbhnb = pnb * hnb; *ehnb += pnbhnb; sshnb += pnbhnb * hnb; /* sum of squares of hnb */ /* the following section keeps track of the calculation and writes out the current set of nb. */ counter++; /* if debugging then begin write(output,' ',counter:2,' '); for i := 1 to na do write(output,'a'); for i := 1 to nc do write(output,'c'); for i := 1 to ng do write(output,'g'); for i := 1 to nt do write(output,'t'); write(output,' ',na:3,nc:3,ng:3,nt:3); writeln(output,' pnb = ',pnb:10:5); end; */ total += pnb; /* the remaining portion of this repeat loop generates the values of na, nc, ng and nt. notice that there are 7 possibilities at each loop increment. other than the stop, in each case the sum of na+nc+ng+nt remains constant (=n). */ if (nt > 0) { /* ending on a t - do outer loops */ if (ng > 0) { /* turn g into t */ ng--; nt++; } else if (nc > 0) { /* turn one c into g, and all t to g (note ng = 0 initially) */ nc--; ng = nt + 1; nt = 0; } else if (na > 0) { /* turn one a into c and all g and t to c. (note ng=nc=0 initially) */ na--; nc = nt + 1; nt = 0; } else done = true; /* since nt = n */ } else { if (ng > 0) { /* turn g into t */ ng--; nt++; } else if (nc > 0) { /* turn c into g */ nc--; ng++; } else { na--; nc++; /* na > 0; turn a into c */ } } } while (!done); /* no t - increment innermost loop */ /* final adjustment: we only have the sum of squares so far */ *varhnb = sshnb - *ehnb * *ehnb; /* if this message appears, there is either a bug in the code or the computer cannot be as accurate as requested */ if (accuracy != (long)floor(accuracy * total + 0.5)) { printf(" procedure calehnb: the sum of probabilities is\n"); printf(" not accurate to one part in %ld\n", (long)accuracy); printf(" the sum of the probabilities is %10.8f\n", total); } /* if this message appear, then there is an error in the repeat-until loop: it did not repeat as many times as is expected from the algorithm */ if (counter == (long)floor((n + 1.0) * (n + 2) * (n + 3) / 6 + 0.5)) return; /* writeln(output, ' total: ',total:10:5); writeln(output,' count = ',counter:1); writeln(output,' (n+1)*(n+2)*(n+3)/6 = ', round((n+1)*(n+2)*(n+3)/6):1); */ printf(" procedure calehnb: program error, the number of\n"); printf(" calculations is in error\n"); halt(); } /* calehnb */ #undef maxsize #undef accuracy /* end module calehnb */ /* begin module calaehnb */ Static Void calaehnb(n, gna, gnc, gng, gnt, hg, aehnb, avarhnb) long n, gna, gnc, gng, gnt; double *hg, *aehnb, *avarhnb; { /* calculate ae(hnb) in bits/bp (=aehnb) for a number (n) of example sequence sites. gna to gnt are the composition to use for the genome probabilities of a to t. the genomic uncertainty (=hg) and the variance avar(hnb) (=avarhnb) are also calculated. this procedure gives approximations for e(hnb) and var(hnb). it is based on a formula derived by jeff haemer. see also: G. P. Basharin Theory Probability Appl. 4(3): 333-336 (1959) 'On a statistical estimate for the entropy of a sequence of independent random variables' version = 3.01; of procedure calaehnb 1983 nov 23 */ double log2 = log(2.0); /* natural log of 2 */ long gn; /* sum of gna..gnt */ double pa, pc, pg, pt; /* genomic probabilities */ double e; /* the approximate sampling error */ gn = gna + gnc + gng + gnt; pa = (double)gna / gn; pc = (double)gnc / gn; pg = (double)gng / gn; pt = (double)gnt / gn; *hg = -((pa * log(pa) + pc * log(pc) + pg * log(pg) + pt * log(pt)) / log2); e = 3 / (2 * log2 * n); *aehnb = *hg - e; *avarhnb = e * e; } /* calaehnb */ #define inf 9 /* field for information display */ #define ind 5 /* decimal place for information display */ /* extra = 5; (* more size for final output *) */ #define mintell 25 /* the minimum n for which to tell the user were we are... */ #define maxsize 200 /* end module calaehnb */ Static Void themain(fin, fout) _TEXT *fin, *fout; { /* the main procedure of the program */ /* maximum size for computing exactly. This *must* match the same constant in procedure calehnb. */ long gna, gnc, gng, gnt; /* genomic composition */ long n; /* number of example sequences */ double h, ha; /* genomic hg for calehnb or calaehnb procedures */ double ehnb; /* e(hnb) */ double aehnb; /* ae(hnb) */ double varhnb; /* variance of hnb */ double avarhnb; /* approximate variance of hnb */ boolean flag; /* for flagging cases that only have approx computation */ boolean flags = false; /* true if flag was ever true */ long power = (long)floor(exp(ind * log(10.0)) + 0.5); /* 10 raised to the number of digits displayed (ind). by multiplying by this number and rounding, one can check the accuracy of the two genomic uncertainty calculations h and ha. if this is not done, one can have round-off problems. */ printf(" calhnb %4.2f\n", version); if (*fout->name != '\0') { if (fout->f != NULL) fout->f = freopen(fout->name, "w", fout->f); else fout->f = fopen(fout->name, "w"); } else { if (fout->f != NULL) rewind(fout->f); else fout->f = tmpfile(); } if (fout->f == NULL) _EscIO2(FileNotFound, fout->name); SETUPBUF(fout->f, Char); fprintf(fout->f, "* calhnb %4.2f calculate statistics of hnb\n", version); fprintf(fout->f, "*\n"); /* obtain the genomic composition */ if (*fin->name != '\0') { if (fin->f != NULL) fin->f = freopen(fin->name, "r", fin->f); else fin->f = fopen(fin->name, "r"); } else rewind(fin->f); if (fin->f == NULL) _EscIO2(FileNotFound, fin->name); RESETBUF(fin->f, Char); if (BUFEOF(fin->f)) { printf(" file fin is empty\n"); halt(); } fscanf(fin->f, "%ld%ld%ld%ld%*[^\n]", &gna, &gnc, &gng, &gnt); getc(fin->f); fprintf(fout->f, "* genomic composition: a = %ld, c = %ld, g = %ld, t = %ld\n", gna, gnc, gng, gnt); /* find out the genomic uncertainty: */ calaehnb(1L, gna, gnc, gng, gnt, &ha, &aehnb, &avarhnb); fprintf(fout->f, "* genomic uncertainty, hg = %*.*f bits\n", inf, ind, ha); printf("* genomic uncertainty, hg = %*.*f bits\n", inf, ind, ha); fprintf(fout->f, "*\n"); fprintf(fout->f, "* n is the number of sequence examples\n"); fprintf(fout->f, "* e(hnb) is the expectation of the uncertainty hnb calculated from n examples\n"); fprintf(fout->f, "* ae(hnb) an approximation of e(hnb) that is calculated\n"); fprintf(fout->f, "* more rapidly than e(hnb) for large n\n"); fprintf(fout->f, "* e diff e(hnb)-ae(hnb)\n"); fprintf(fout->f, "* var(hnb) is the variance of hnb\n"); fprintf(fout->f, "* avar(hnb) is the approximate variance of hnb\n"); fprintf(fout->f, "* std diff is the difference between the standard deviations\n"); fprintf(fout->f, "* (square roots of) var(hnb) and avar(hnb)\n"); fprintf(fout->f, "* e(n) hg - e(hnb), the sampling error.\n"); fprintf(fout->f, "* sd(n) square root of var(hnb).\n"); fprintf(fout->f, "*\n"); fprintf(fout->f, "* units are bits/base, except for the variances which\n"); fprintf(fout->f, "* are the square of these.\n"); fprintf(fout->f, "*\n"); fprintf(fout->f, "*%3c %*s %*s %*s %*s %*s %*s %*s %*s\n", 'n', inf, "e(hnb)", inf, "ae(hnb)", inf, "e diff", inf, "var(hnb)", inf, "avar(hnb)", inf, "std diff", inf, "e(n)", inf, "sd(n)"); fprintf(fout->f, "*\n"); /* ten to the number of digits. see definition of power */ while (!BUFEOF(fin->f)) { fscanf(fin->f, "%ld%*[^\n]", &n); getc(fin->f); if (n > mintell) printf(" calculating n = %4ld\n", n); if (n <= maxsize) { calehnb(n, gna, gnc, gng, gnt, &h, &ehnb, &varhnb); flag = false; } else { flag = true; flags = true; } calaehnb(n, gna, gnc, gng, gnt, &ha, &aehnb, &avarhnb); fprintf(fout->f, " %3ld %*.*f %*.*f %*.*f", n, inf, ind, ehnb, inf, ind, aehnb, inf, ind, ehnb - aehnb); fprintf(fout->f, " %*.*f %*.*f %*.*f %*.*f", inf, ind, varhnb, inf, ind, avarhnb, inf, ind, sqrt(varhnb) - sqrt(avarhnb), inf, ind, h - ehnb); fprintf(fout->f, " %*.*f", inf, ind, sqrt(varhnb)); if (flag) fprintf(fout->f, " *"); else if ((long)floor(power * h + 0.5) != (long)floor(power * ha + 0.5)) { printf(" program error in procedure themain h <> ha (%*.*f<>%*.*f)\n", inf, ind, h, inf, ind, ha); halt(); } putc('\n', fout->f); } if (!flags) return; fprintf(fout->f, "*\n"); fprintf(fout->f, "* for these, only approximate values were computed\n"); fprintf(fout->f, "last values in scientific notation:\n"); /* write(fout,' ',n:3, ' ',ehnb:inf+extra:ind+extra, ' ',aehnb:inf+extra:ind+extra, ' ',(ehnb-aehnb):inf+extra:ind+extra); write(fout,' ',varhnb:inf+extra:ind+extra, ' ',avarhnb:inf+extra:ind+extra, ' ',(sqrt(varhnb) - sqrt(avarhnb)):inf+extra:ind+extra, ' ',(h - ehnb):inf+extra:ind+extra); */ fprintf(fout->f, " %3ld % .*E % .*E % .*E", n, P_max(inf - 7, 1), ehnb, P_max(inf - 7, 1), aehnb, P_max(inf - 7, 1), ehnb - aehnb); fprintf(fout->f, " % .*E % .*E % .*E % .*E", P_max(inf - 7, 1), varhnb, P_max(inf - 7, 1), avarhnb, P_max(inf - 7, 1), sqrt(varhnb) - sqrt(avarhnb), P_max(inf - 7, 1), h - ehnb); fprintf(fout->f, "*\n"); } /* themain */ #undef inf #undef ind #undef mintell #undef maxsize main(argc, argv) int argc; Char *argv[]; { PASCAL_MAIN(argc, argv); if (setjmp(_JL1)) goto _L1; fout.f = NULL; strcpy(fout.name, "fout"); fin.f = NULL; strcpy(fin.name, "fin"); themain(&fin, &fout); _L1: if (fin.f != NULL) fclose(fin.f); if (fout.f != NULL) fclose(fout.f); exit(EXIT_SUCCESS); } /* calhnb */ /* End. */