/* Output from p2c 1.21alpha-07.Dec.93, the Pascal-to-C translator */ /* From input file "dirty.p" */ #include /* dirty: calculate probabilities for dirty DNA synthesis tom schneider */ /* end of program */ /* begin module version */ #define version 2.39 /* of dirty, 1994 sep 5 origin: 1987 may 28 */ /* end module version */ /* begin module describe.dirty */ /* name dirty: calculate probabilities for dirty DNA synthesis synopsis dirty(dirtyp: in, distribition: out, xyin: out, output: out) files dirtyp: parameter file. one line giving the number of random bases that will be used (r). one line giving the average number of changes desired (n) distribution: the distribution of numbers of changes at the peak for n xyin: Graphics output of the program. The input to xyplo for plotting. The graph gives three curves against the independent variable p, which is the probability of getting the correct base and randoms is the number of random bases: o = probability of only one base changed, as randoms (1-p)p^(randoms-1) m = probability of one or more bases changed: 1 - p^randoms n = probability of n bases changed I have not found this output to be too useful; I concentrate on the distribution file. output: messages to the user description If one is designing a randomized ("dirty") DNA synthesis, how heavily should it be randomized? To use this program, pick the size of the region you want to randomize, r. Then make a guess at the average number of changes you want over the region, n. Put r and n into dirtyp and run the program. Look at the distribution file. the line for n=0 is the frequency that you will get back the original sequence. You must chose whether this is tolerable. For example, when I synthesized the T7 promoters, I knew that I could find at least 1 promoter in 100 clones by toothpicking, and I was willing to toothpick thousands. This way I was sure to get some positives, even if they were the original sequence. (As it turned out, the frequency of functional promoters was much higher than 1%.) If you have a strong selection, you could make this a small number, by increasing the number of changes per clone. With more changes per clone you will get more data from the randomization, so make it as high as you can tolerate. The program calculates the ratio of bases to random bases. In the experiment described in the NAR paper, the technician put 4 drops of the appropriate base with 1 drop of the equiprobable mix. This made the dirty bottle. example This is the analysis used in the NAR paper. With the dirtyp file containing: 27 the number of random bases that will be used. 4 the number of changes desired (n) the distribution file is: * dirty 2.38 * distribution of number of changes calculated from binomial * 27 random positions * 4 average number of bases changed * p = probability of correct base = 0.85185185 * fraction of [base] : 0.80246914 * fraction of [random n] : 0.19753086 * * ratio of [base] to [random N]: 4.06250000 * * TO DO THE SYNTHESIS, * add one part of an equimolar mixture of the 4 bases * to 4.06250000 parts of the "wild type" base * * In the following table, * n = number of changes * f = frequency of n changes * s = running sum of frequencies f (should approach 1.0) * In the first row, where n=0, f is the frequency of wild type sequences * n = 0 f = 0.01317741 s = 0.01317741 n = 1 f = 0.06187652 s = 0.07505392 n = 2 f = 0.13989473 s = 0.21494866 n = 3 f = 0.20274599 s = 0.41769465 n = 4 f = 0.21156103 s = 0.62925568 n = 5 f = 0.16924883 s = 0.79850451 n = 6 f = 0.10792679 s = 0.90643130 n = 7 f = 0.05630963 s = 0.96274093 n = 8 f = 0.02448245 s = 0.98722338 n = 9 f = 0.00898872 s = 0.99621210 n =10 f = 0.00281386 s = 0.99902596 n =11 f = 0.00075629 s = 0.99978226 n =12 f = 0.00017537 s = 0.99995763 n =13 f = 0.00003519 s = 0.99999282 n =14 f = 0.00000612 s = 0.99999894 n =15 f = 0.00000092 s = 0.99999986 n =16 f = 0.00000012 s = 0.99999999 n =17 f = 0.00000001 s = 1.00000000 n =18 f = 0.00000000 s = 1.00000000 n =19 f = 0.00000000 s = 1.00000000 n =20 f = 0.00000000 s = 1.00000000 n =21 f = 0.00000000 s = 1.00000000 n =22 f = 0.00000000 s = 1.00000000 n =23 f = 0.00000000 s = 1.00000000 n =24 f = 0.00000000 s = 1.00000000 n =25 f = 0.00000000 s = 1.00000000 n =26 f = 0.00000000 s = 1.00000000 n =27 f = 0.00000000 s = 1.00000000 see also xyplo.p documentation @article{Schneider1989, author = "T. D. Schneider and G. D. Stormo", title = "Excess Information at Bacteriophage {T7} Genomic Promoters Detected by a Random Cloning Technique", year = "1989", journal = "Nucleic Acids Research", volume = "17", pages = "659-674"} author Tom Schneider National Cancer Institute Laboratory of Mathematical Biology Frederick, Maryland toms@ncifcrf.gov bugs n must be an integer */ /* end module describe.dirty */ Static double b; /* fraction of one base in total dirty mix */ Static _TEXT dirtyp; /* parameter file from which to read randoms */ Static _TEXT distribution; /* number of changes */ Static double g; /* probability of having n changes */ Static double o; /* probability of having exactly one change */ Static double m; /* probability of having one or more change */ Static long n; /* average number of given changes */ Static double p; /* the probability of having a correct base in one position */ Static double r; /* fraction of one base to mix with N base */ Static long randoms; /* number of random positions to calculate for */ Static double sum; /* running sum of g */ Static _TEXT xyin; /* the input to the xyplo program for plotting, output of this program */ Static double x; /* ratio of the random base (N) in total dirty mix */ Static jmp_buf _JL1; /* begin module halt */ Static Void halt() { /* stop the program. the procedure performs a goto to the end of the program. you must have a label: label 1; declared, and also the end of the program must have this label: 1: end. examples are in the module libraries. this is the only goto in the delila system. */ printf(" program halt.\n"); longjmp(_JL1, 1); } /* end module halt version = 'delmod 6.16 84 mar 12 tds/gds'; */ Static double raise(tothe) long tothe; { /* raise p to the tothe */ return exp(tothe * log(p)); } Static double calcgiven(p) double p; { /* calculate the frequency of n given changes, warning: n is global! */ long i; /* index */ double m = 1.0; /* multiplicative parameter */ long FORLIM; FORLIM = randoms; for (i = randoms - n + 1; i <= FORLIM; i++) m *= i; FORLIM = n; for (i = 2; i <= FORLIM; i++) m /= i; return (m * raise(randoms - n) * exp(n * log(1 - p))); } Static Void header(f) _TEXT *f; { /* give information about what the run is about */ if (*f->name != '\0') { if (f->f != NULL) f->f = freopen(f->name, "w", f->f); else f->f = fopen(f->name, "w"); } else { if (f->f != NULL) rewind(f->f); else f->f = tmpfile(); } if (f->f == NULL) _EscIO2(FileNotFound, f->name); SETUPBUF(f->f, Char); fprintf(f->f, "* dirty %4.2f\n", version); fprintf(f->f, "* %ld random positions\n", randoms); fprintf(f->f, "* independent variable, p = probability of correct base\n"); fprintf(f->f, "* dependendent variables: o, m\n"); fprintf(f->f, "* o = probability of only one base changed, as %ld(1-p)p^%ld\n", randoms, randoms - 1); fprintf(f->f, "* m = probability of one or more bases changed, 1 - p^%ld\n", randoms); fprintf(f->f, "* n = probability of n (=%ld) bases changed calculated from binomial\n", n); } main(argc, argv) int argc; Char *argv[]; { long FORLIM; PASCAL_MAIN(argc, argv); if (setjmp(_JL1)) goto _L1; xyin.f = NULL; strcpy(xyin.name, "xyin"); distribution.f = NULL; strcpy(distribution.name, "distribution"); dirtyp.f = NULL; strcpy(dirtyp.name, "dirtyp"); printf("dirty %4.2f\n", version); if (*dirtyp.name != '\0') dirtyp.f = fopen(dirtyp.name, "r"); else rewind(dirtyp.f); if (dirtyp.f == NULL) _EscIO2(FileNotFound, dirtyp.name); RESETBUF(dirtyp.f, Char); if (BUFEOF(dirtyp.f)) { printf("empty parameter file\n"); halt(); } fscanf(dirtyp.f, "%ld%*[^\n]", &randoms); getc(dirtyp.f); if (randoms < 1) { printf("there must be at least 1 random position\n"); halt(); } printf("%ld random positions\n", randoms); fscanf(dirtyp.f, "%ld%*[^\n]", &n); getc(dirtyp.f); if (n <= 0) { printf("n must be positive\n"); halt(); } if (n > randoms) { printf("n must less than or equal to the number of random positions\n"); halt(); } header(&xyin); p = 0.01; while (p < 1.0) { o = randoms * raise(randoms - 1) * (1 - p); m = 1 - raise(randoms); g = calcgiven(p); /* based on n */ fprintf(xyin.f, "o %10.8f %10.8f\n", p, o); fprintf(xyin.f, "n %10.8f %10.8f\n", p, g); fprintf(xyin.f, "m %10.8f %10.8f o/m ratio: %10.8f\n", p, m, o / m); p += 0.01; } p = 1 - (double)n / randoms; r = (1 - p) * 4 / 3; b = 1 - r; x = b / r; if (*distribution.name != '\0') { if (distribution.f != NULL) distribution.f = freopen(distribution.name, "w", distribution.f); else distribution.f = fopen(distribution.name, "w"); } else { if (distribution.f != NULL) rewind(distribution.f); else distribution.f = tmpfile(); } if (distribution.f == NULL) _EscIO2(FileNotFound, distribution.name); SETUPBUF(distribution.f, Char); fprintf(distribution.f, "* dirty %4.2f\n", version); fprintf(distribution.f, "* distribution of number of changes calculated from binomial\n"); fprintf(distribution.f, "* %ld random positions\n", randoms); fprintf(distribution.f, "* %2ld average number of bases changed\n", n); if (r < 1.0) { fprintf(distribution.f, "* p = probability of correct base = %10.8f\n", p); fprintf(distribution.f, "* fraction of [base] : %10.8f\n", b); fprintf(distribution.f, "* fraction of [random n] : %10.8f\n", r); fprintf(distribution.f, "*\n"); fprintf(distribution.f, "* ratio of [base] to [random N]: %10.8f\n", x); fprintf(distribution.f, "*\n"); fprintf(distribution.f, "* TO DO THE SYNTHESIS, \n"); fprintf(distribution.f, "* add one part of an equimolar mixture of the 4 bases\n"); fprintf(distribution.f, "* to %10.8f parts of the \"wild type\" base\n", x); fprintf(distribution.f, "*\n"); fprintf(distribution.f, "* In the following table,\n"); fprintf(distribution.f, "* n = number of changes\n"); fprintf(distribution.f, "* f = frequency of n changes\n"); fprintf(distribution.f, "* s = running sum of frequencies f (should approach 1.0)\n"); fprintf(distribution.f, "* In the first row, where n=0, f is the frequency of wild type sequences\n"); fprintf(distribution.f, "*\n"); sum = 0.0; FORLIM = randoms; for (n = 0; n <= FORLIM; n++) { g = calcgiven(p); /* based on n */ sum += g; fprintf(distribution.f, "n =%2ld f = %10.8f s = %10.8f\n", n, g, sum); } } else fprintf(distribution.f, "* USE PURE EQUIPROBABLY RANDOM BASES FOR THE SYNTHESIS.\n"); _L1: if (dirtyp.f != NULL) fclose(dirtyp.f); if (distribution.f != NULL) fclose(distribution.f); if (xyin.f != NULL) fclose(xyin.f); exit(EXIT_SUCCESS); } /* End. */