/* Output from p2c 1.21alpha-07.Dec.93, the Pascal-to-C translator */ /* From input file "sites.p" */ #include /* sites: analyse sites from randomized sequence data base Dr. Thomas D. Schneider National Cancer Institute Laboratory of Experimental and Computational Biology Frederick, Maryland 21702-1201 toms@ncifcrf.gov permanent email: toms@alum.mit.edu http://www.lecb.ncifcrf.gov/~toms/ */ /* end of program */ /* begin module version */ #define version 8.09 /* of sites.p 2002 Mar 6 2002 Mar 6, 8.09: upgrade documentation 1996 Jul 27, 8.08: final version for publication origin 1987 oct 27 */ /* end module version */ /* begin module describe.sites */ /* name sites: analyse sites from randomized sequence data base synopsis sites(database: in, standard: in, caps: out, latex: out, list: out, sorted: out, stats: out, tables: out, rsdata: out, sequ: out, makebkp: out, output: out) files database: database consisting of DNA sequence data. The first line is the name of the database. The remaining lines consist of experimental packages. The start of a package is a line like: @ -27 11 -21 5 0.85 The '@' must be left justified as the first character on the line. The numbers are defined to be: @ FROM.range TO.range FROM.random TO.random fraction.canonical FROM.range: the coordinate of the first base reported in the database TO.range: the coordinate of the last base reported in the database FROM.random: the coordinate of the first randomized base TO.random: the coordinate of the last randomized base fraction.canonical: the fraction of the canonical base during chemical synthesis. The next line defines the canonical sequence which was 'randomized'. It is in the format of the remaining sequences. The first sequence in the package is always the standard, so do not forget to include it! The sequences follow the standard. The format of the standard and the randomized sequences consists of: DNA sequence, plasmid name, primer, experiment, date (year, month, day) separated by one space each instead of commas. The sequence may contain any of the characters: "acgtxd.". "x" means that the base is not known. "d" means that that base was deleted. The program will reject these sequences (to make pure data), but this allows them to be stored in the database. "." means 'the same as the standard sequence in this position'. This allows one to enter sequences as a set of changes from the standard. The next experimental package begins with another '@'. The data from each experimental package are gathered as frequencies and normalized by using the given canonical base frequency. The normalized frequencies from all the packages are averaged to produce the final results. This allows one to combine several experiments together, however all experiments are given the same weight. This is reasonable if the experiments have similar canonical frequencies and numbers of sequences, but is probably not correct if one experiment carries more "importance" than another. A method to accounting for these different weightings is not known. standard: Use the rsdata output of the rseq program from the natural sequences as your standard. It is used for statistical comparison of the experiment to wild-type sequences. caps: listing of the database sorted and with capital letters showing changes from the standard and database errors. latex: just like list, but in a form that can be run through the typesetting program LaTeX. list: listing of the database in an easy-to-read format showing only the changes from the standard. Also gives the tables of numbers of bases. sorted: the list sorted by sequence stats: frequency statistics of the database differences. summary of information results. tables: frequency tables for various stages of the normalization. rsdata: This simulates the output of the rseq program by giving the numbers of bases (b) at each position (i). When the frequency tables are normalized in this program, the effective number of sequences is lost. To make sure that the numbers reported in rsdata are accurate, they are multiplied by constant scaleup. The table can be run through dalvec and makelogo to make a sequence logo. The variance, varhnb, is set to be negative to indicate that no method is known for how to calculate it. An earlier version of the program gave the minimum error based on the number of sequences in the database, but people tended to miss this fact when looking at the final sequence logo, so were unduely impressed by the data. sequ: raw sequences (after processing) ready for makebk makebkp: input for makebk to create the book output: messages to the user description The function of the sites program is to gather, collate and analyze data from a randomization experiment. See the reference given below. It was designed to help enter sequence data. One may enter several copies of a particular sequence, and they will be joined together by merging their data. Sequences of the same clone are identified by their common plasmid names. Inconsistent data are flagged. First the program sorts the data and checks that multiple entries are consistant with one another. If they are not, the program halts and you should look into the caps file to figure out what is wrong. The program converts the database into a more readable form in list, and provides statistical analysis. If the standard is: gaattcaaattaatacgactcactatagggagaaagctt pTS37 kc7 ex100 87 nov 2 and one of the data base lines is: gaattcaaattaattcgactcactttagggaaaaagctt pTS331 1204 ex394 87 nov 2 the program presents the data in file list as: ..............t.........t......a....... pTS331 1204 ex394 87 nov 2 which is more readable. This allows entry as a sequence, but display in a form that is easy to understand. If two primers are used, and data are found for both, then the name becomes 'both'. The stats file contains tables of the wild type frequencies and the experimental frequencies. examples See database.t7 and standard.t7. documentation @article{Schneider1989, author = "T. D. Schneider and G. D. Stormo", title = "Excess Information at Bacteriophage {T7} Genomic Promoters Detected by a Random Cloning Technique", year = "1989", journal = "Nucl. Acids Res.", volume = "17", pages = "659-674"} see also {Examples:} database.t7 {and} standard.t7 {Related programs:} siva.p, dalvec.p, makelogo.p, makebk.p author Tom Schneider bugs For sorting all plasmid initials are ignored, sorting is by the plasmid number only. A correction for small sample size is not known for the normalized experimental data. Certainly the method given in program Calhnb is not right. Therefore, the program does not report the expected variation. */ /* end module describe.sites */ /* begin module sites.const */ #define maxstring 800 /* largest string allowed */ #define maxseq 100 /* largest sequence allowed */ #define maxentries 1000 /* largest number of data entries */ #define primerlength 4 /* length of primers on output, to make them line up */ #define fillermax 21 /* for filling a trigger */ /* end module sites.const */ /* begin module rseq.const */ #define negativeinfinity (-1000000L) /* the definition of negative infinity = 100x scaleup */ #define scaleup 10000 /* the amount that the frequencies are to be multiplied by. This determines the number of digits accuracy in the resulting table */ #define posfield 4 /* field size for aligned sequence positions */ #define infofield 8 /* field size for bits */ #define infodecim 5 /* decimal places for bits */ #define nfield 4 /* size of field for printing n, the number of sites */ /* end module rseq.const */ /* begin module rs.type */ typedef struct rstype { /* types of data in the data table from rseq */ long rstart, rstop; /* range of the data */ long l; /* position */ long nal, ncl, ngl, ntl; /* numbers of each base */ double rsl; /* rsequence(l) */ double rs; /* running sum of rs */ double varhnb; /* variance estimate for small sample correction */ double sumvar; /* sum of varhnb */ long nl; /* = a+c+g+t */ double ehnb; /* hg-e(n) */ Char flag; /* e = exact, a = approximate sampling statistics */ } rstype; /* end module rs.type version = 4.75; (@ of rsgra.p 1990 Oct 2 */ /* begin module interact.type */ typedef struct string { /* a string of characters */ Char letters[maxstring]; /* the letters in the string */ long length; /* the number of characters in the string */ long current; /* the letter we are working on */ } string; /* end module interact.type version = 4.11; (@ of prgmod.p 1991 Apr 22 */ /* begin module trigger.type */ typedef struct trigger { /* an object to be searched for */ string seek; /* the characters looked for */ long state; /* how close to triggering we are */ boolean skip; /* trigger not found- skip the line */ /* the trigger was found */ boolean found; } trigger; /* end module trigger.type version = 4.11; (@ of prgmod.p 1991 Apr 22 */ /* begin module filler.type */ /* the following is an array used to fill a string. it is convenient to have it much shorter than the maxstring, so that it is easy to fill the string using procedure fillstring. the user must declare the value of constant fillermax. */ typedef Char filler[fillermax]; /* end module filler.type version = 4.11; (@ of prgmod.p 1991 Apr 22 */ /* begin module sites.type */ typedef enum { a, c, g, t, x } base; /* the nucleotide bases and unknown base */ /* the standard entry format is: gaattcaaattaattcgactcactttagggaaaaagctt pTS331 1204-- ex394 87 nov 2 seq------------------------------------ name-- primer ex--- date------ */ typedef struct entry_ { string seq; /* the sequence read in */ string name; /* the plasmid */ long number; /* the number of the plasmid */ string primer; /* the primer */ string ex; /* the experiment number */ string date; /* the date of the developed gel */ struct entry_ *next; } entry_; typedef entry_ *entries[maxentries]; /* a set of "entry"s */ typedef struct coordinates { /* defines the overall region of a set of sequences, fromrange to torange and the subset of that region which is randomized, fromrandom to torandom. Since the fraction of canonical base is also associated with these numbers, they are included here too. */ long fromrange; /* first position in user coordinates */ long torange; /* last position in user coordinates */ long lengthrange; /* the region of data in use */ long fromrandom; /* start of random region in user coordinates */ long torandom; /* end of random region in user coordinates */ long lengthrandom; /* the region of data which was randomized */ double fracan; /* fraction of canonical base in random region. Note that fracan is always associated with a set of coordinates. */ double notfracan; /* fraction of non-canonical bases, (1-fracan */ } coordinates; typedef struct ilarray { long data[maxseq]; /* integer(l) */ coordinates coor; } ilarray; typedef struct iblarray { long data[(long)t - (long)a + 1][maxseq]; /* integer(b,l) */ coordinates coor; } iblarray; typedef struct rlarray { double data[maxseq]; /* real(l) */ coordinates coor; } rlarray; typedef struct rblarray { double data[(long)t - (long)a + 1][maxseq]; /* real(b,l) */ coordinates coor; } rblarray; /* end module sites.type */ /* begin module sites.var */ Static _TEXT database; /* sequence database */ Static _TEXT tables; /* frequency tables */ Static _TEXT caps; /* capitalized and sorted listing of the database */ Static _TEXT latex; /* latex */ Static _TEXT list; /* easy-to-read listing of the database, by name */ Static _TEXT sorted; /* sorted listing of the database, by sequence */ Static _TEXT standard; /* standard sequence */ Static _TEXT stats; /* statistics of the database */ Static _TEXT rsdata; /* f(b,l) table */ Static _TEXT sequ; /* raw processed sequences */ Static _TEXT makebkp; /* input file for makebk */ Static jmp_buf _JL1; /* end module sites.var */ /* begin module halt */ Static Void halt() { /* stop the program. the procedure performs a goto to the end of the program. you must have a label: label 1; declared, and also the end of the program must have this label: 1: end. examples are in the module libraries. this is the only goto in the delila system. */ printf(" program halt.\n"); longjmp(_JL1, 1); } /* end module halt version = 4.11; (@ of prgmod.p 1991 Apr 22 */ /* begin module copyaline */ Static Void copyaline(fin, fout) _TEXT *fin, *fout; { /* copy a line from file fin to file fout */ while (!P_eoln(fin->f)) { putc(P_peek(fin->f), fout->f); getc(fin->f); } fscanf(fin->f, "%*[^\n]"); getc(fin->f); putc('\n', fout->f); } /* copyaline */ /* end module copyaline version = 4.11; (@ of prgmod.p 1991 Apr 22 */ /* begin module capitalize */ Static Char capitalize(c_) Char c_; { /* convert the character c to upper case */ long n = c_; /* c is the n'th letter of the alphabet */ if (n >= 'a' && n <= 'z') c_ = _toupper(n); return c_; } /* end module capitalize version = 4.11; (@ of prgmod.p 1991 Apr 22 */ /* begin module interact.clearstring */ Static Void clearstring(ribbon) string *ribbon; { /* empty the string */ long index; /* to the ribbon */ for (index = 0; index < maxstring; index++) ribbon->letters[index] = ' '; ribbon->length = 0; ribbon->current = 0; } /* clearstring */ /* end module interact.clearstring version = 4.11; (@ of prgmod.p 1991 Apr 22 */ /* begin module interact.writestring */ Static Void writestring(tofile, s) _TEXT *tofile; string *s; { /* write the string s to file tofile, no writeln */ long i; /* index to s */ long FORLIM; FORLIM = s->length; for (i = 0; i < FORLIM; i++) putc(s->letters[i], tofile->f); } /* writestring */ /* end module interact.writestring version = 4.11; (@ of prgmod.p 1991 Apr 22 */ /* begin module trigger.proc */ /* this module allows one to scan a series of characters, as from an array or a file, and to "trigger" or detect a simple string in the series. the advantage of the trigger is that several triggers can "observe" a stream of characters at once, each looking for a different thing. some other modules required: interact.const, interact.type */ Static Void resettrigger(t_) trigger *t_; { /* reset the trigger to ground state */ t_->state = 0; t_->skip = false; t_->found = false; } /* resettrigger */ Static Void testfortrigger(ch, t_) Char ch; trigger *t_; { /* look at the character ch. if it is part of the trigger (at the current trigger state), then the trigger state goes higher. if it is not part of the trigger then the trigger state is reset, skip is true and one should skip onward to find the trigger. if the trigger is found, found is true. */ t_->state++; /* if debugging then begin writestring(list,seek); writeln(list,'testfortrigger seek.letters[',state:1,']:', seek.letters[state],' ch:',ch); end;*/ if (t_->seek.letters[t_->state - 1] == ch) { t_->skip = false; if (t_->state == t_->seek.length) t_->found = true; else t_->found = false; return; } t_->state = 0; t_->skip = true; t_->found = false; /* reset trigger */ } /* testfortrigger */ /* end module trigger.proc version = 4.11; (@ of prgmod.p 1991 Apr 22 */ /* begin module filler.fillstring */ Static Void fillstring(s, a_) string *s; Char *a_; { /* this procedure makes it reasonably easy to fill the string s with characters. one calls the procedure as: */ /* 1 2 3 4 5 */ /* 12345678901234567890123456789012345678901234567890 */ /* fillstring(s, 'this-is-the-string '); the two comments make it easy to line the characters up. also, for this example, it was assumed that the length of filler as defined by the constant fillermax was 50. */ long length = fillermax; /* of the string without trailing blanks */ long index; /* of s */ clearstring(s); while (length > 1 && a_[length-1] == ' ') length--; if (length == 1 && a_[length-1] == ' ') { printf("fillstring: the string is empty\n"); halt(); } for (index = 0; index < length; index++) s->letters[index] = a_[index]; s->length = length; s->current = 1; } /* fillstring */ /* end module filler.fillstring version = 4.11; (@ of prgmod.p 1991 Apr 22 */ /* begin module filler.filltrigger */ Static Void filltrigger(t_, a_) trigger *t_; Char *a_; { /* fill the trigger t */ fillstring(&t_->seek, a_); } /* fillstring */ /* end module filler.filltrigger version = 4.11; (@ of prgmod.p 1991 Apr 22 */ /* ********************************************************************** */ /* ********************************************************************** */ /* begin module rs.readrsrange */ Static Void readrsrange(rsdata, r) _TEXT *rsdata; rstype *r; { /* read the range data from rsdata to r. data is assumed to be the rsdata file from program rseq. */ long index; /* for counting lines of rsdata */ Char skip; /* a character to skip the '*' on the line */ for (index = 1; index <= 11; index++) { fscanf(rsdata->f, "%*[^\n]"); getc(rsdata->f); } fscanf(rsdata->f, "%c%ld%ld%*[^\n]", &skip, &r->rstart, &r->rstop); getc(rsdata->f); /* writeln(output, 'range: ',r.rstart:1,' ',r.rstop:1);*/ if (skip == '\n') skip = ' '; } /* end module rs.readrsrange version = 4.75; (@ of rsgra.p 1990 Oct 2 */ /* begin module rs.getrsbegin */ Static Void getrsbegin(infile) _TEXT *infile; { /* skip to the beginning of the data in a data file from the rseq program */ Char ch; /* a character read from infile */ trigger begindata; /* a trigger to locate the beginning of the data */ /* 1 2 */ /* 123456789012345678901 */ filltrigger(&begindata, "l a c g t"); resettrigger(&begindata); if (*infile->name != '\0') { if (infile->f != NULL) infile->f = freopen(infile->name, "r", infile->f); else infile->f = fopen(infile->name, "r"); } else rewind(infile->f); if (infile->f == NULL) _EscIO2(FileNotFound, infile->name); RESETBUF(infile->f, Char); while (!begindata.found) { if (P_eoln(infile->f)) { fscanf(infile->f, "%*[^\n]"); getc(infile->f); } if (BUFEOF(infile->f)) { printf("beginning of data not found\n"); halt(); } ch = getc(infile->f); if (ch == '\n') ch = ' '; testfortrigger(ch, &begindata); } fscanf(infile->f, "%*[^\n]"); getc(infile->f); } /* end module rs.getrsbegin version = 4.75; (@ of rsgra.p 1990 Oct 2 */ /* begin module rs.readrsdata */ Static Void readrsdata(rsdata, rdata) _TEXT *rsdata; rstype *rdata; { /* read data from the data file of program rseq into the datatype */ fscanf(rsdata->f, "%ld%ld%ld%ld%ld%lg%lg%lg%lg%ld%lg", &rdata->l, &rdata->nal, &rdata->ncl, &rdata->ngl, &rdata->ntl, &rdata->rsl, &rdata->rs, &rdata->varhnb, &rdata->sumvar, &rdata->nl, &rdata->ehnb); /* skip spaces to find the flag: */ while (P_peek(rsdata->f) == ' ') getc(rsdata->f); fscanf(rsdata->f, "%c%*[^\n]", &rdata->flag); getc(rsdata->f); /* writeln(output,'readrsdata: l a c g t flag = ', l:1,' ',nal:1,' ',ncl:1,' ',ngl:1,' ',ntl:1,' ',flag); */ if (rdata->flag == '\n') rdata->flag = ' '; } /* Local variables for numberdigit: */ struct LOC_numberdigit { long number, place; /* the exponent of logplace */ long absolute; /* the absolute value of number */ Char acharacter; /* the character to be returned */ } ; Local Void digit(LINK) struct LOC_numberdigit *LINK; { /* extract a digit at the place position */ long tenplace; /* ten times place */ long z; /* an intermediate value */ long d; /* the digit extracted */ tenplace = LINK->place * 10; z = LINK->absolute - LINK->absolute / tenplace * tenplace; if (LINK->place == 1) d = z; else d = z / LINK->place; switch (d) { case 0: LINK->acharacter = '0'; break; case 1: LINK->acharacter = '1'; break; case 2: LINK->acharacter = '2'; break; case 3: LINK->acharacter = '3'; break; case 4: LINK->acharacter = '4'; break; case 5: LINK->acharacter = '5'; break; case 6: LINK->acharacter = '6'; break; case 7: LINK->acharacter = '7'; break; case 8: LINK->acharacter = '8'; break; case 9: LINK->acharacter = '9'; break; } } /* digit */ Local Void sign(LINK) struct LOC_numberdigit *LINK; { /* put a negative sign out or a positive sign */ if (LINK->number < 0) LINK->acharacter = '-'; else LINK->acharacter = '+'; } /* sign */ /* end module rs.readrsdata version = 4.75; (@ of rsgra.p 1990 Oct 2 */ /* ********************************************************************** */ /* ********************************************************************** */ /* begin module package.numbar */ /* ************************************************************************ */ /* begin module numberdigit */ Static Char numberdigit(number_, logplace) long number_, logplace; { /* return the digit at the place value ('logplace') position of number. example: numberdigit(13625, 3) = 3 numberdigit(13625, 4) = 1 */ struct LOC_numberdigit V; long count; /* used to make place */ V.number = number_; V.place = 1; for (count = 1; count <= logplace; count++) V.place *= 10; if (V.number == 0) { if (V.place == 1) V.acharacter = '0'; else V.acharacter = ' '; return V.acharacter; } V.absolute = labs(V.number); if (V.absolute < V.place / 10) { V.acharacter = ' '; return V.acharacter; } if (V.absolute >= V.place) digit(&V); else sign(&V); return V.acharacter; } /* numberdigit */ #define ln10 2.30259 /* natural log of 10 - for conversion to log base 10 */ #define epsilon 0.00001 /* a small number to correct log base 10 errors */ /* end module numberdigit version = 4.11; (@ of prgmod.p 1991 Apr 22 */ /* begin module numbersize */ Static long numbersize(n) long n; { /* calculate amount of space to be reserved for the integer n */ if (n == 0) return 1; else { return ((long)(log((double)labs(n)) / ln10 + epsilon) + 2); /* the epsilon assures that we do not lose a place due to roundoff. eg, sometimes log base 10 of 10 would be 0.9999 instead of 1, and we would not do it right... note: this will fail for very large numbers on the order of 1/epsilon. */ /* the 2 is for the sign and last digit */ } } /* numbersize */ #undef ln10 #undef epsilon /* end module numbersize version = 4.11; (@ of prgmod.p 1991 Apr 22 */ /* begin module numberbar */ Static Void numberbar(afile, spaces, firstnumber, lastnumber, linesused) _TEXT *afile; long spaces, firstnumber, lastnumber, *linesused; { /* write a bar of numbers to a file, with several spaces before. the number of lines used is returned */ long logplace; /* the log of the digit being looked at */ long spacecount; /* count of spaces */ long number; /* the current number being written */ if (labs(firstnumber) > labs(lastnumber)) *linesused = numbersize(firstnumber); else *linesused = numbersize(lastnumber); for (logplace = *linesused - 1; logplace >= 0; logplace--) { for (spacecount = 1; spacecount <= spaces; spacecount++) putc(' ', afile->f); for (number = firstnumber; number <= lastnumber; number++) fputc(numberdigit(number, logplace), afile->f); putc('\n', afile->f); } } /* end module numberbar version = 4.11; (@ of prgmod.p 1991 Apr 22 */ /* ************************************************************************ */ /* end module package.numbar version = 4.11; (@ of prgmod.p 1991 Apr 22 */ /* begin module chartorange */ Static base chartorange(ch) Char ch; { base Result; if (ch != 'x' && ch != 't' && ch != 'g' && ch != 'c' && ch != 'a') { printf("chartorange: %c is not a base\n", ch); halt(); return Result; } switch (ch) { case 'a': Result = a; break; case 'c': Result = c; break; case 'g': Result = g; break; case 't': Result = t; break; case 'x': Result = x; break; } return Result; } /* end module chartorange */ /* begin module basetochar */ Static Char basetochar(b) base b; { Char Result; switch (b) { case a: Result = 'a'; break; case c: Result = 'c'; break; case g: Result = 'g'; break; case t: Result = 't'; break; case x: Result = 'x'; break; } return Result; } /* end module basetochar */ /* begin module makenumber */ Static Void makenumber(name, number, found) string name; long *number; boolean *found; { /* make a integer number from the name. If a number was not detected, found is false. */ long l; /* position in the string */ *found = false; *number = 0; for (l = 0; l < name.length; l++) { if (name.letters[l] == '9' || name.letters[l] == '8' || name.letters[l] == '7' || name.letters[l] == '6' || name.letters[l] == '5' || name.letters[l] == '4' || name.letters[l] == '3' || name.letters[l] == '2' || name.letters[l] == '1' || name.letters[l] == '0') { *found = true; *number *= 10; /* make room for the next digit */ switch (name.letters[l]) { case '0': /* blank case */ break; case '1': (*number)++; break; case '2': *number += 2; break; case '3': *number += 3; break; case '4': *number += 4; break; case '5': *number += 5; break; case '6': *number += 6; break; case '7': *number += 7; break; case '8': *number += 8; break; case '9': *number += 9; break; } } } } /* end module makenumber version = 4.11; (@ of prgmod.p 1991 Apr 22 */ /* begin module getstring */ Static Void getstring(f, s, spaces) _TEXT *f; string *s; boolean spaces; { /* read in the string. if spaces is true, then there will be spaces in the string and the string ends at the end of the line. otherwise the string ends either at a space (and does not include it) or at the end of the line */ Char c_ = 'x'; /* a character read in */ s->length = 0; while ((c_ != ' ' || spaces) & (!P_eoln(f->f))) { c_ = getc(f->f); if (c_ == '\n') c_ = ' '; if (!(c_ != ' ' || spaces)) continue; s->length++; if (s->length > maxstring) { printf("a string being read is bigger than %ld\n", (long)maxstring); printf("increase constant maxstring\n"); halt(); } s->letters[s->length - 1] = c_; } } /* end module getstring */ /* begin module putstring */ Static Void putstring(f, s) _TEXT *f; string *s; { /* write the string to f */ long l; /* index to the string */ long FORLIM; FORLIM = s->length; for (l = 0; l < FORLIM; l++) putc(s->letters[l], f->f); } /* end module putstring */ /* begin module copystring */ Static Void copystring(a_, b) string a_, *b; { /* copy string a to b */ long l; /* index to the string */ b->length = a_.length; for (l = 0; l < a_.length; l++) b->letters[l] = a_.letters[l]; } /* end module copystring version = 4.11; (@ of prgmod.p 1991 Apr 22 */ /* begin module equalstring */ Static boolean equalstring(a_, b) string a_, b; { /* test for equality between two strings at current positions */ long index; /* index to both strings */ boolean equal; /* are letters in a and b the same? */ if (a_.length == b.length) { index = 1; do { equal = (a_.letters[index-1] == b.letters[index-1]); index++; } while (equal && index <= a_.length); return equal; } else return false; } /* equalstring */ /* end module equalstring version = 4.11; (@ of prgmod.p 1991 Apr 22 */ /* begin module lessstring */ Static boolean lessstring(a_, b) string a_, b; { /* is string a before string b in the alphabet? */ boolean Result; boolean done = false; /* done checking the strings */ long l = 0; /* index to the string */ /* note: lengths assumed equal */ do { l++; if (l > a_.length) { /* the whole strings matched */ done = true; Result = false; } else if (b.letters[l-1] != a_.letters[l-1]) { done = true; Result = (b.letters[l-1] > a_.letters[l-1]); } } while (!done); return Result; } /* end module lessstring version = 4.11; (@ of prgmod.p 1991 Apr 22 */ /* begin module equalseq */ Static boolean equalseq(a_, b) string a_, b; { /* are the sequence strings a and b equal? an 'x' means that they match */ long p; /* position in the strings */ boolean failure; /* true means the strings did not match */ if (a_.length != b.length) return false; else { failure = false; p = 1; do { if (a_.letters[p-1] != 'x' && b.letters[p-1] != 'x') { if (a_.letters[p-1] != b.letters[p-1]) failure = true; } p++; } while (!(p > a_.length || failure)); return (!failure); } } /* end module equalseq */ /* begin module diffseq */ Static Void diffseq(f, a_, b) _TEXT *f; string a_, b; { /* show the differences between sequences a and b to file f */ long p = 1; /* position in the strings */ if (a_.length != b.length) { fprintf(f->f, "sequences are different in length\n"); return; } for (p = 0; p < a_.length; p++) { if (a_.letters[p] != 'x' && b.letters[p] != 'x') { if (a_.letters[p] != b.letters[p]) putc('^', f->f); else putc(' ', f->f); } else { putc(' ', f->f); /* else write(f,' ') else write(f,a.letters[p]) qqq */ } } putc('\n', f->f); } /* end module diffseq */ /* begin module sites.identify */ Static Void identify(f, e) _TEXT *f; entry_ *e; { /* identify the entry data, write to file f */ fprintf(f->f, "seq: \""); putstring(f, &e->seq); fprintf(f->f, "\"\n"); fprintf(f->f, "name: \""); putstring(f, &e->name); fprintf(f->f, "\"\n"); fprintf(f->f, "primer: \""); putstring(f, &e->primer); fprintf(f->f, "\"\n"); fprintf(f->f, "experiment: \""); putstring(f, &e->ex); fprintf(f->f, "\"\n"); fprintf(f->f, "date: \""); putstring(f, &e->date); fprintf(f->f, "\"\n"); } /* end module sites.identify */ /* begin module sites.getentry */ Static Void getentry(f, e, std, gotten) _TEXT *f; entry_ **e, *std; boolean *gotten; { /* get the entry e from f, tell whether it is acceptable. If the base is a period '.', then use the standard sequence in std instead. When the standard is being read, std is empty. If the user illegally puts a period in the standard, then the routinge will halt because the user tried to use a period in the standard. */ long extra; /* extra spaces to align primer names */ boolean foundnumber; /* found a number in the string 'name' */ long l = 1; /* index to a string */ entry_ *WITH; _TEXT TEMP; long FORLIM; WITH = *e; getstring(f, &WITH->seq, false); getstring(f, &WITH->name, false); makenumber(WITH->name, &WITH->number, &foundnumber); if (!foundnumber) { /* if this is the standard, it's ok, otherwise not */ if (std == NULL) WITH->number = 0; else { printf("getentry: sequence:\n"); TEMP.f = stdout; *TEMP.name = '\0'; putstring(&TEMP, &WITH->seq); printf("\nhas a name:\n"); TEMP.f = stdout; *TEMP.name = '\0'; putstring(&TEMP, &WITH->name); printf("\nwhich cannot be converted to a number.\n"); printf("The sites program requires a plasmid number for\n"); printf("every database entry except the canonical standard.\n"); halt(); } } getstring(f, &WITH->primer, false); /* make the primers align */ if (WITH->primer.length < primerlength) { for (extra = WITH->primer.length; extra < primerlength; extra++) WITH->primer.letters[extra] = ' '; WITH->primer.length = primerlength; } getstring(f, &WITH->ex, false); getstring(f, &WITH->date, true); WITH->next = NULL; /* check the entry */ *gotten = true; FORLIM = WITH->seq.length; for (l = 0; l < FORLIM; l++) { if (WITH->seq.letters[l] != '.' && WITH->seq.letters[l] != 'd' && WITH->seq.letters[l] != 'x' && WITH->seq.letters[l] != 't' && WITH->seq.letters[l] != 'g' && WITH->seq.letters[l] != 'c' && WITH->seq.letters[l] != 'a') { TEMP.f = stdout; *TEMP.name = '\0'; identify(&TEMP, *e); printf("non \"acgtxd.\" character found in sequence\n"); halt(); } if (WITH->seq.letters[l] == '.') { if (std == NULL) { printf("A period (.) cannot be used in a standard sequence.\n"); halt(); } /* use the standard: */ WITH->seq.letters[l] = std->seq.letters[l]; } if (*gotten) { if (WITH->seq.letters[l] == 'd') { /* it is the responsibility of the calling program to tell the user about dropped entries identify(output,e); */ *gotten = false; } } } /* allow person to have plasmid name that does not begin with p: if name.letters[1] <> 'p' then begin identify(output,e); writeln(output,'WARNING: The plasmid name does not begin with "p"'); end; */ fscanf(f->f, "%*[^\n]"); getc(f->f); } /* end module sites.getentry */ /* begin module sites.putid */ Static Void putid(f, s) _TEXT *f; entry_ *s; { /* put the identifier (without the sequence) of s to f */ putstring(f, &s->name); putc(' ', f->f); putstring(f, &s->primer); putc(' ', f->f); putstring(f, &s->ex); putc(' ', f->f); putstring(f, &s->date); } /* end module sites.putid */ /* begin module sites.putentry */ Static Void putentry(f, s) _TEXT *f; entry_ *s; { /* put the entry in s to f */ putstring(f, &s->seq); putc(' ', f->f); putid(f, s); putc('\n', f->f); } /* end module sites.putentry */ /* begin module sites.clearcoor */ Static Void clearcoor(c_) coordinates *c_; { /* clear the coordinates c */ c_->fromrange = 0; c_->torange = 0; c_->fromrandom = 0; c_->torandom = 0; c_->lengthrange = 0; c_->lengthrandom = 0; c_->fracan = 0.0; c_->notfracan = 0.0; } /* end module sites.clearcoor */ /* begin module sites.clearibl */ Static Void clearibl(fbl) iblarray *fbl; { /* clear the fbl array */ base b; /* a base in the d sequence */ long l; /* position in the d sequence */ coordinates *WITH; WITH = &fbl->coor; for (l = 0; l < maxseq; l++) { for (b = a; (long)b <= (long)t; b = (base)((long)b + 1)) fbl->data[(long)b - (long)a][l] = 0; } clearcoor(&fbl->coor); } /* end module sites.clearibl */ /* begin module sites.clearrbl */ Static Void clearrbl(fbl) rblarray *fbl; { /* clear the fbl array */ base b; /* a base in the d sequence */ long l; /* position in the d sequence */ coordinates *WITH; WITH = &fbl->coor; for (l = 0; l < maxseq; l++) { for (b = a; (long)b <= (long)t; b = (base)((long)b + 1)) fbl->data[(long)b - (long)a][l] = 0.0; } clearcoor(&fbl->coor); } /* end module sites.clearrbl */ /* begin module sites.copycoords */ Static Void copycoords(a_, b) coordinates a_, *b; { /* copy the coordinates a into b */ b->fromrange = a_.fromrange; b->torange = a_.torange; b->lengthrange = a_.lengthrange; b->fromrandom = a_.fromrandom; b->torandom = a_.torandom; b->lengthrandom = a_.lengthrandom; b->fracan = a_.fracan; b->notfracan = a_.notfracan; } /* end module sites.copycoords */ /* begin module sites.copyrbl */ Static Void copyrbl(Aarray, Barray) rblarray Aarray, *Barray; { /* copy a into b */ base b; /* index to array */ long l; /* index to array */ copycoords(Aarray.coor, &Barray->coor); for (l = 0; l < Aarray.coor.lengthrange; l++) { for (b = a; (long)b <= (long)t; b = (base)((long)b + 1)) Barray->data[(long)b - (long)a][l] = Aarray.data[(long)b - (long)a][l]; } } /* end module sites.copyrbl */ /* begin module sites.makenotfracan */ Static Void makenotfracan(fracan, notfracan) double fracan, *notfracan; { /* make the proportion of non canonical bases (not fraction canonical = notfracan). This is so simple it is better than passing it all over the place */ *notfracan = (1.0 - fracan) / 3.0; /* fraction of non canonical bases */ } /* end module sites.makenotfracan */ /* begin module sites.setcoordinates */ Static Void setcoordinates(a_) coordinates *a_; { /* determine the lengths of the coordinate set and the fraction canonical and its inverse. */ a_->lengthrange = a_->torange - a_->fromrange + 1; a_->lengthrandom = a_->torandom - a_->fromrandom + 1; makenotfracan(a_->fracan, &a_->notfracan); } /* end module sites.setcoordinates */ /* begin module sites.showiblarray */ Static Void showiblarray(afile, ar) _TEXT *afile; iblarray ar; { /* show the iblarray ar to afile */ base b; /* index to array */ long coordinate; /* the coordinate relative to zero in the standard */ long l; /* index to array */ for (coordinate = ar.coor.fromrange; coordinate <= ar.coor.torange; coordinate++) { l = coordinate - ar.coor.fromrange + 1; fprintf(afile->f, "%4ld ", coordinate); if (coordinate >= ar.coor.fromrandom && coordinate <= ar.coor.torandom) putc('R', afile->f); else putc(' ', afile->f); for (b = a; (long)b <= (long)t; b = (base)((long)b + 1)) fprintf(afile->f, " %5ld", ar.data[(long)b - (long)a][l-1]); putc('\n', afile->f); } } /* end module sites.showiblarray */ /* begin module sites.showrblarray */ Static Void showrblarray(afile, ar) _TEXT *afile; rblarray ar; { /* show the rblarray ar to afile */ base b; /* index to array */ long coordinate; /* the coordinate relative to zero in the standard */ long l; /* index to array */ for (coordinate = ar.coor.fromrange; coordinate <= ar.coor.torange; coordinate++) { l = coordinate - ar.coor.fromrange + 1; fprintf(afile->f, "%4ld ", coordinate); if (coordinate >= ar.coor.fromrandom && coordinate <= ar.coor.torandom) putc('R', afile->f); else putc(' ', afile->f); for (b = a; (long)b <= (long)t; b = (base)((long)b + 1)) fprintf(afile->f, " %5.2f", ar.data[(long)b - (long)a][l-1]); putc('\n', afile->f); } } /* end module sites.showrblarray */ /* begin module sites.getrsdata */ Static Void getrsdata(rsdata, fbl) _TEXT *rsdata; iblarray *fbl; { /* read from the standard rsdata format the natural numbers of bases at each position in the sites, f(b,l) */ long l; /* position in the fbl array */ rstype rdata; /* data from rseq */ long position; /* a location in the aligned sequence */ coordinates *WITH; long FORLIM; if (*rsdata->name != '\0') { if (rsdata->f != NULL) rsdata->f = freopen(rsdata->name, "r", rsdata->f); else rsdata->f = fopen(rsdata->name, "r"); } else rewind(rsdata->f); if (rsdata->f == NULL) _EscIO2(FileNotFound, rsdata->name); RESETBUF(rsdata->f, Char); clearibl(fbl); if (BUFEOF(rsdata->f)) { printf("empty standard rsdata file\n"); halt(); return; } /* find the range of the graph in bases and prepare for reading the rsdata for graphing */ readrsrange(rsdata, &rdata); getrsbegin(rsdata); /* set the size */ WITH = &fbl->coor; WITH->fromrange = rdata.rstart; WITH->torange = rdata.rstop; WITH->fromrandom = WITH->fromrange; WITH->torandom = WITH->torange; WITH->fracan = 0.25; setcoordinates(&fbl->coor); FORLIM = rdata.rstop + 1; /* create the fbl */ for (position = rdata.rstart + 1; position <= FORLIM; position++) { /* skip lines with an '*' */ if (P_peek(rsdata->f) != '*') { readrsdata(rsdata, &rdata); l = position - rdata.rstart; /* move to the next rsdata line and give the data */ fbl->data[0][l-1] = rdata.nal; fbl->data[(long)c - (long)a][l-1] = rdata.ncl; fbl->data[(long)g - (long)a][l-1] = rdata.ngl; fbl->data[(long)t - (long)a][l-1] = rdata.ntl; /* write(output,l:nfield, (* position *) ' ',rdata.nl:infofield, (* number sequences *) ' ',rdata.rsl:infofield:infodecim, (* information *) ' ',rdata.varhnb:infofield:infodecim); (* variance of rsl *) write(output,' a ',rdata.nal:nfield); write(output,' c ',rdata.ncl:nfield); write(output,' g ',rdata.ngl:nfield); write(output,' t ',rdata.ntl:nfield); writeln(output); */ } } } /* end module sites.getrsdata */ /* begin module sites.announcement */ Static Void announcement(list, header) _TEXT *list; string header; { /* announce simple things to the list file */ fprintf(list->f, "sites %4.2f\n", version); fprintf(list->f, "the database is:\n"); putstring(list, &header); putc('\n', list->f); } /* end module sites.announcement */ /* begin module sites.mkheader */ Static Void mkheader(list, header, experiment, coords, std) _TEXT *list; string header; long experiment; coordinates coords; entry_ *std; { /* make the header to file list */ long l; /* dummy variable */ if (experiment == 1) announcement(list, header); fprintf(list->f, "\nExperiment %ld\n", experiment); fprintf(list->f, " range: from %4ld to %4ld\n", coords.fromrange, coords.torange); fprintf(list->f, " random: from %4ld to %4ld\n", coords.fromrandom, coords.torandom); fprintf(list->f, " fraction canonical: %4.2f\n\n", coords.fracan); numberbar(list, 0L, coords.fromrange, coords.torange, &l); putentry(list, std); } /* end module sites.mkheader */ /* begin module sites.dataheader */ Static Void dataheader(list, header, std) _TEXT *list; string header; entry_ *std; { /* put out the header for data files */ fprintf(list->f, "* sites %4.2f\n", version); fprintf(list->f, "* the database is: "); putstring(list, &header); fprintf(list->f, "\n* "); putid(list, std); putc('\n', list->f); } /* end module sites.dataheader */ /* begin module sites.crunch */ Static Void crunch(data, numentries, standard, symbol) entry_ **data; long numentries; entry_ *standard; Char symbol; { /* crush the data so that anything the same as the standard is now the symbol WARNING: this will affect statistics if any are done afterward! There are numentries of sequences to do this to. */ long l; /* index to datum and standard */ long n; /* index to the entries */ long FORLIM1; for (n = 0; n < numentries; n++) { FORLIM1 = data[n]->seq.length; for (l = 0; l < FORLIM1; l++) { if (data[n]->seq.letters[l] == standard->seq.letters[l]) data[n]->seq.letters[l] = symbol; } } } /* end module sites.crunch */ /* begin module sites.uncrunch */ Static Void uncrunch(data, numentries, standard, symbol) entry_ **data; long numentries; entry_ *standard; Char symbol; { /* undo the crunch operation */ long l; /* index to datum and standard */ long n; /* index to the entries */ long FORLIM1; for (n = 0; n < numentries; n++) { FORLIM1 = data[n]->seq.length; for (l = 0; l < FORLIM1; l++) { if (data[n]->seq.letters[l] == symbol) data[n]->seq.letters[l] = standard->seq.letters[l]; } } } typedef long position; /* Local variables for sortdata: */ struct LOC_sortdata { entry_ **data; boolean onname; } ; Local boolean lessthan(a_, b, LINK) position a_, b; struct LOC_sortdata *LINK; { /* is a less than b? */ if (LINK->onname) return (LINK->data[a_-1]->number < LINK->data[b-1]->number); else return (lessstring(LINK->data[a_-1]->seq, LINK->data[b-1]->seq)); } Local Void swap_(a_, b, LINK) position a_, b; struct LOC_sortdata *LINK; { /* switch positions a and b */ entry_ *hold; hold = LINK->data[a_-1]; LINK->data[a_-1] = LINK->data[b-1]; LINK->data[b-1] = hold; } /* begin module quicksort */ Local Void quicksort(left, right, LINK) position left, right; struct LOC_sortdata *LINK; { /* quick sort a list between positions left and right, into ascending order. a position is simply a scalar of the form 0..max. the array to be sorted is dimensioned 1..max. (the difference in the ranges is important to the correct operation of the sort...) two external routines are used: function lessthan(a, b: position): boolean is a generalized test for value-at-a < value-at-b. procedure swap(a, b: position) switches the items at positions a and b. since these routines are external, the procedure is general. this procedure taken from the book 'algorithms + data structures = programs' by niklaus wirth, prentice-hall, inc., englewood cliffs, n.j.(1976), pp. 76-82 */ position lower = left; position upper; /* the positions looked at currently */ position center; /* the rough center of the region being sorted */ center = (left + right) / 2; upper = right; do { while (lessthan(lower, center, LINK)) lower++; while (lessthan(center, upper, LINK)) upper--; if (lower <= upper) { /* ho ho keep track of the center through the map: */ if (lower == center) center = upper; else if (upper == center) center = lower; swap_(lower, upper, LINK); lower++; upper--; } } while (lower <= upper); if (left < upper) quicksort(left, upper, LINK); if (lower < right) quicksort(lower, right, LINK); } /* end module sites.uncrunch */ /* begin module sites.sortdata */ Static Void sortdata(data_, numentries, onname_) entry_ **data_; long numentries; boolean onname_; { /* sort the entries by name (onname is true) or by sequence (onname is false). */ struct LOC_sortdata V; /* end module quicksort version = 'prgmod 3.96 85 mar 18 tds'; */ V.data = data_; V.onname = onname_; /* sort the entries */ quicksort(1L, numentries, &V); } /* end module sites.sortdata */ /* begin module sites.readheader */ Static Void readheader(database, header) _TEXT *database; string *header; { /* read the header from the database */ if (*database->name != '\0') { if (database->f != NULL) database->f = freopen(database->name, "r", database->f); else database->f = fopen(database->name, "r"); } else rewind(database->f); if (database->f == NULL) _EscIO2(FileNotFound, database->name); RESETBUF(database->f, Char); getstring(database, header, true); fscanf(database->f, "%*[^\n]"); getc(database->f); } /* end module sites.readheader */ /* begin module sites.readexpt */ Static Void readexpt(database, coords, std) _TEXT *database; coordinates *coords; entry_ **std; { /* read from database the coordinates of the next experiment (expt), its coordinates, including thefraction of canonical base, calculate the fractions of the other bases, and the standard sequence. */ boolean gotten; /* got the standard and it has no deleted bases */ Char skip; /* charactyer for skipping the '@' of the database */ if (P_peek(database->f) != '@') { printf("PROGRAM ERROR: @ missing\n"); halt(); } fscanf(database->f, "%c%ld%ld%ld%ld%lg%*[^\n]", &skip, &coords->fromrange, &coords->torange, &coords->fromrandom, &coords->torandom, &coords->fracan); getc(database->f); if (skip == '\n') skip = ' '; setcoordinates(coords); /* get the standard canonical sequence */ if (*std == NULL) *std = (entry_ *)Malloc(sizeof(entry_)); getentry(database, std, NULL, &gotten); if (!gotten) { printf("The standard must not contain deletion bases (d).\n"); halt(); } if ((*std)->seq.length != coords->torange - coords->fromrange + 1) { printf( "the length of the standard is not consistent with the given coordinates\n"); halt(); } if (coords->fromrange > coords->torange) { printf("fromrange (%ld) is greater than torange (%ld)!\n", coords->fromrange, coords->torange); halt(); } if (coords->fromrandom < coords->fromrange) { printf("fromrandom (%ld) is less than fromrange (%ld)!\n", coords->fromrandom, coords->fromrange); printf("Randomized positions must be within the region sequenced\n"); halt(); } if (coords->torandom > coords->torange) { printf("torandom (%ld) is greater than torange (%ld)!\n", coords->torandom, coords->torange); printf("Randomized positions must be within the region sequenced\n"); halt(); } if (coords->fromrandom > coords->torandom) { printf("fromrandom (%ld) is greater than torandom (%ld)!\n", coords->fromrandom, coords->torandom); halt(); } if ((unsigned)coords->fracan > 1.0) { printf("fraction canonical (%4.2f) must be between 0 and 1!\n", coords->fracan); halt(); } } /* end module sites.readexpt */ /* begin module sites.findextent */ Static Void findextent(database, global) _TEXT *database; rblarray *global; { /* find the extents of the range of the data and the random region */ coordinates c_; /* coordinates of the experiment being read */ long experiments = 0; /* the number of separate experiments performed */ entry_ *std = NULL; /* standard canonical sequence */ coordinates *WITH; /* allow multiple readings of std */ clearrbl(global); WITH = &global->coor; WITH->fromrange = LONG_MAX; WITH->torange = -LONG_MAX; WITH->fromrandom = LONG_MAX; WITH->torandom = -LONG_MAX; printf("\nChecking Experiments\n"); while (!BUFEOF(database->f)) { if (P_peek(database->f) != '@') { fscanf(database->f, "%*[^\n]"); getc(database->f); continue; } experiments++; printf("Experiment %ld\n", experiments); readexpt(database, &c_, &std); /* find out the overall range of the data */ if (c_.fromrange < WITH->fromrange) WITH->fromrange = c_.fromrange; if (c_.torange > WITH->torange) WITH->torange = c_.torange; if (c_.fromrandom < WITH->fromrandom) WITH->fromrandom = c_.fromrandom; if (c_.torandom > WITH->torandom) WITH->torandom = c_.torandom; } if (experiments == 0) { printf("No experiments found in database!\n"); printf("(perhaps you forgot the @ and standard lines?)\n"); halt(); } if (WITH->fromrange == LONG_MAX || WITH->torange == -LONG_MAX) { printf("The experimental information is missing!\n"); halt(); } printf(" overall global range: %ld to %ld\n", WITH->fromrange, WITH->torange); printf("overall randomized regions: %ld to %ld\n", WITH->fromrandom, WITH->torandom); /* now we can define the global length: */ global->coor.fracan = c_.fracan; setcoordinates(&global->coor); Free(std); /* clear out what we don't need now */ } /* end module sites.findextent */ /* begin module sites.readdata */ Static Void readdata(database, data, std, numentries) _TEXT *database; entry_ **data; entry_ *std; long *numentries; { /* Read the entry data for one experiment from the database, return number of them in numentries. */ long d = 0; /* actual number of sequence read in */ entry_ *datum; /* one of the entries */ boolean done = false; /* done with reading one experimental set */ boolean gotten; /* got the standard and it has no deleted bases */ _TEXT TEMP; *numentries = 0; while (!done) { if (BUFEOF(database->f)) done = true; if (!done) { if (P_peek(database->f) == '@') done = true; } if (done) break; /* create the space for the next one */ (*numentries)++; d++; if (*numentries > maxentries) { printf("The number of entries exceeds %ld\n", (long)maxentries); printf("increase constant maxentries.\n"); halt(); } /* datum := data[numentries]; if datum = nil then new(data[numentries]); */ if (data[*numentries - 1] == NULL) data[*numentries - 1] = (entry_ *)Malloc(sizeof(entry_)); datum = data[*numentries - 1]; getentry(database, &datum, std, &gotten); /*here is the point to check the entry lengths */ /* check that the length of the sequence matches the standard */ if (datum->seq.length != std->seq.length) { printf( "\nreaddata: length of sequence datum %ld does not equal the standard.\n", d); printf("standard: %5ld\n", std->seq.length); printf(" datum: %5ld\n", datum->seq.length); TEMP.f = stdout; *TEMP.name = '\0'; identify(&TEMP, datum); printf("WARNING: THIS SEQUENCE WILL BE SKIPPED\n\n"); (*numentries)--; /* backpeddle! */ } /* writeln(output); writeln(output,'***********************************'); writeln(output,numentries:1); identify(output,data[numentries]); writeln(output,'***********************************'); writeln(output); */ if (!gotten) { printf("WARNING: this entry was dropped because of deletions (\"d\"):\n"); TEMP.f = stdout; *TEMP.name = '\0'; identify(&TEMP, data[*numentries - 1]); putchar('\n'); /* leave the space for the next one! */ (*numentries)--; } } } /* end module sites.readdata */ /* begin module sites.copyentry */ Static Void copyentry(a_, b) entry_ *a_, **b; { /* copy the entry a to b, except for the sequence itself */ (*b)->seq.length = a_->seq.length; copystring(a_->name, &(*b)->name); (*b)->number = a_->number; copystring(a_->primer, &(*b)->primer); copystring(a_->ex, &(*b)->ex); copystring(a_->date, &(*b)->date); } /* end module sites.copyentry */ /* begin module sites.capsequence */ Static Void capsequence(datum, naked, standard) entry_ *datum, **naked, *standard; { /* using the standard, capitalize the sequence in datum whenever they are unequal, put the result in naked */ long l; /* index to datum and standard */ long FORLIM; copyentry(datum, naked); FORLIM = datum->seq.length; for (l = 0; l < FORLIM; l++) { if (datum->seq.letters[l] != standard->seq.letters[l]) { switch (datum->seq.letters[l]) { case 'a': (*naked)->seq.letters[l] = 'A'; break; case 'c': (*naked)->seq.letters[l] = 'C'; break; case 'g': (*naked)->seq.letters[l] = 'G'; break; case 't': (*naked)->seq.letters[l] = 'T'; break; case 'x': /* nothing to note */ (*naked)->seq.letters[l] = 'x'; break; } } else (*naked)->seq.letters[l] = datum->seq.letters[l]; } } /* end module sites.capsequence */ /* begin module sites.strip */ Static Void strip(coords, datum, naked, standard, symbol, blank) coordinates coords; entry_ *datum, **naked, *standard; Char symbol, blank; { /* strip the sequence in datum using the standard: whenever they are equal, put the symbol. outside of the random region use the blank character. put the result in naked */ long coordinate; /* true coordinate system */ long l; /* index to datum and standard */ copyentry(datum, naked); for (coordinate = coords.fromrange; coordinate <= coords.torange; coordinate++) { l = coordinate - coords.fromrange + 1; if (datum->seq.letters[l-1] == standard->seq.letters[l-1]) { if (coordinate >= coords.fromrandom && coordinate <= coords.torandom) (*naked)->seq.letters[l-1] = symbol; else (*naked)->seq.letters[l-1] = blank; } else (*naked)->seq.letters[l-1] = datum->seq.letters[l-1]; } } /* end module sites.strip */ /* begin module sites.checkdata */ Static Void checkdata(list, data, d, failures) _TEXT *list; entry_ **data; long d, *failures; { /* standard: entryptr; */ /* check that the length of the sequence matches the standard. also, check that the sequence at d is identical with the sequence at the previous number if they have the same plasmid number. the routine assumes that the data are sorted. report results in list. increment the number of failures reported. */ /* var datum: entryptr; (* one of the entries *) */ /* formerly the size check was done here, but move it out to readdata. (* check that the length of the sequence matches the standard *) datum := data[d]; if datum^.seq.length <> standard^.seq.length then begin writeln(output,'Checkdata: length of sequence datum ',d:1, ' does not equal the standard.'); writeln(output,'standard: ', standard^.seq.length:5); writeln(output,' datum: ', datum^.seq.length:5); identify(output,datum); there was a halt here end; */ if (d <= 1) return; if (data[d-1]->number != data[d-2]->number) return; if (equalseq(data[d-1]->seq, data[d-2]->seq)) return; diffseq(list, data[d-1]->seq, data[d-2]->seq); (*failures)++; fprintf(list->f, "DATA ERROR: sequence of \""); putid(list, data[d-2]); fprintf(list->f, "\"\n"); fprintf(list->f, " does not equal \""); putid(list, data[d-1]); fprintf(list->f, "\"\n"); } /* end module sites.checkdata */ /* begin module sites.figure */ Static Void figure(f, data, numentries, experiment, coords, std, kind, blank, testforequal) _TEXT *f; entry_ **data; long numentries, experiment; coordinates coords; entry_ *std; Char kind, blank; boolean testforequal; { /*header: string; not anymore*/ /* this procedure is similar to the display procedure, but it is designed just to put out a figure for a paper */ /* display the set of entries in data to file f. note: the entries are not destroyed if kind = 'c' then capitalize the differences to the standard; if kind = 's' then strip to '.' non differences to the standard. Regions outside the randomized area are set to 'blank'. If testforequal is true, then whenever a sequence is equal to the proceeding one and the strip option is chosen, equal symbols will be used in place of the periods. */ entry_ *datum; /* one of the entries */ long duplicates = 0; /* number of sequences duplicated */ long dummy; /* dummy variable */ entry_ *naked; /* datum changed relative to the std */ long number; /* the number of one datum */ fprintf(f->f, "Experiment %ld\n\n", experiment); numberbar(f, 0L, coords.fromrange, coords.torange, &dummy); putentry(f, std); naked = (entry_ *)Malloc(sizeof(entry_)); for (number = 1; number <= numentries; number++) { datum = data[number-1]; if (kind == 's') { if (testforequal) { if (number > 1) { if (equalstring(datum->seq, data[number-2]->seq)) { strip(coords, datum, &naked, std, '=', blank); duplicates++; } else strip(coords, datum, &naked, std, '.', blank); } else strip(coords, datum, &naked, std, '.', blank); } else strip(coords, datum, &naked, std, '.', blank); } else if (kind == 'c') capsequence(datum, &naked, std); /* replacement for putentry(f,naked); */ putstring(f, &naked->seq); putc(' ', f->f); /* replacement for putid(f,naked); */ putstring(f, &naked->name); fprintf(f->f, " \n"); /* note: no primer name in this version of the program for the final latex figures: putstring(f,primer); write(f,' '); putstring(f, ex); write(f,' '); putstring(f, date); */ } Free(naked); } #define dashline 80 /* number of characters in a dashed line */ /* Local variables for display: */ struct LOC_display { _TEXT *f; } ; Local Void dashit(LINK) struct LOC_display *LINK; { /* make a dashed line */ long dashcount; /* for counting dashes */ for (dashcount = 1; dashcount <= dashline; dashcount++) putc('-', LINK->f->f); putc('\n', LINK->f->f); } /* end module sites.figure */ /* begin module sites.display */ Static Void display(f_, data, numentries, header, experiment, coords, std, kind, blank, testforequal, duplicates) _TEXT *f_; entry_ **data; long numentries; string header; long experiment; coordinates coords; entry_ *std; Char kind, blank; boolean testforequal; long *duplicates; { /* display the set of entries in data to file f. note: the entries are not destroyed if kind = 'c' then capitalize the differences to the standard; if kind = 's' then strip to '.' non differences to the standard. Regions outside the randomized area are set to 'blank'. If testforequal is true, then whenever a sequence is equal to the proceeding one and the strip option is chosen, equal symbols will be used in place of the periods. duplicates is the number of sequences duplicated */ struct LOC_display V; entry_ *datum; /* one of the entries */ long failures = 0; /* number of failed checks */ entry_ *naked; /* datum changed relative to the std */ long number; /* the number of one datum */ V.f = f_; mkheader(V.f, header, experiment, coords, std); *duplicates = 0; naked = (entry_ *)Malloc(sizeof(entry_)); for (number = 1; number <= numentries; number++) { /* writeln(f,'debug: display, number=',number:1); */ datum = data[number-1]; if (kind == 's') { if (testforequal) { if (number > 1) { if (equalstring(datum->seq, data[number-2]->seq)) { strip(coords, datum, &naked, std, '=', blank); (*duplicates)++; } else strip(coords, datum, &naked, std, '.', blank); } else strip(coords, datum, &naked, std, '.', blank); } else strip(coords, datum, &naked, std, '.', blank); } else if (kind == 'c') { capsequence(datum, &naked, std); /* now put a line if the plasmid number has changed */ if (number == 1) dashit(&V); else if (!equalstring(datum->name, data[number-2]->name)) dashit(&V); } putentry(V.f, naked); /*std,*/ checkdata(V.f, data, number, &failures); } if (failures > 0) { printf("\n%ld data errors:\n", failures); printf("inconsistant readings, no analysis will be done\n"); halt(); } Free(naked); } #undef dashline /* Local variables for compress: */ struct LOC_compress { entry_ **data; entry_ *std; _TEXT *caps; Char f; /* a character in the fillspot sequence */ long fillspot; /* the location to put the next entry */ long l; /* index into the sequences */ Char r; /* a character in the readspot sequence */ long readspot; /* the number of one datum that we are inspecting */ } ; Local Void death(LINK) struct LOC_compress *LINK; { /* put here to make code easier to read below */ printf("program error in compress:\n"); printf("position in sequence:%2ld\n", LINK->l); printf("filling at %2ld, letter is: %c\n", LINK->fillspot, LINK->f); printf("reading at %2ld, letter is: %c\n", LINK->readspot, LINK->r); printf("this sequence difference was missed\n"); halt(); } Local Void warning(b, LINK) Char b; struct LOC_compress *LINK; { /* see if the base b is standard, and if not, warn the user */ if (b == LINK->std->seq.letters[LINK->l-1]) return; printf("check sequence%12ld - see caps file\n", LINK->data[LINK->fillspot-1]->number); fprintf(LINK->caps->f, "\nWARNING: sequence: "); putid(LINK->caps, LINK->data[LINK->fillspot-1]); fprintf(LINK->caps->f, "\nand sequence : "); putid(LINK->caps, LINK->data[LINK->readspot-1]); fprintf(LINK->caps->f, "\ndiffer by an x, and the other base is mutant (%c). Check that it IS mutant.\n", b); } /* end module sites.display */ /* begin module sites.compress */ Static Void compress(data_, numentries, std_, caps_) entry_ **data_; long *numentries; entry_ *std_; _TEXT *caps_; { /* compress together multiple entries of the same plasmid number, change numentries to account for the compression. The standard sequence std allows the detection of cases where a mutant is chosen against an x. That is, if one sequence has 'x' and another has 'a' then 'a' should be chosen. If 'a' is the standard sequence this is not a problem. If however, 'a' is NOT the standard sequence, then the user should be warned, because this may be an error that goes down as a mutation. The warnings go into caps, where they are most useful */ struct LOC_compress V; long FORLIM; string *WITH; V.data = data_; V.std = std_; V.caps = caps_; V.fillspot = 1; V.readspot = 2; while (V.readspot <= *numentries) { if (V.data[V.fillspot-1]->number == V.data[V.readspot-1]->number) { FORLIM = V.data[V.fillspot-1]->seq.length; /* merge the readspot entry into the fillspot entry */ for (V.l = 1; V.l <= FORLIM; V.l++) { V.f = V.data[V.fillspot-1]->seq.letters[V.l-1]; V.r = V.data[V.readspot-1]->seq.letters[V.l-1]; if (V.f != V.r) { /* only act if they differ */ if (V.f == 'x') { V.data[V.fillspot-1]->seq.letters[V.l-1] = V.r; warning(V.r, &V); } else if (V.r != 'x') death(&V); else warning(V.f, &V); } } /* change the primer to indicate the merge */ if (!equalstring(V.data[V.fillspot-1]->primer, V.data[V.readspot-1]->primer)) { WITH = &V.data[V.fillspot-1]->primer; WITH->letters[0] = 'b'; WITH->letters[1] = 'o'; WITH->letters[2] = 't'; WITH->letters[3] = 'h'; WITH->length = 4; } } else { /* and fill it */ V.fillspot++; /* could not merge, so move to next spot */ V.data[V.fillspot-1] = V.data[V.readspot-1]; } /* move the readspot entry to fillspot */ V.readspot++; /* continue to read through the entries */ } *numentries = V.fillspot; /* also account for the other way to detect the end of the data: */ if (*numentries < maxentries) V.data[*numentries] = NULL; printf("compressed data set: %ld sequences\n", *numentries); } /* end module sites.compress */ /* begin module sites.detfi */ Static Void detfi(fo, std, fi) rblarray fo; entry_ *std; rblarray *fi; { /* determine fi(b,l), the input frequency array, from the standard sequence in std. */ base b; /* index to array */ long coordinate; /* the coordinate relative to zero in the standard */ boolean inrandom; /* true if working within the randomized region */ long l; /* index to array */ clearrbl(fi); copycoords(fo.coor, &fi->coor); for (coordinate = fo.coor.fromrandom; coordinate <= fo.coor.torandom; coordinate++) { l = coordinate - fo.coor.fromrange + 1; inrandom = (coordinate >= fo.coor.fromrandom && coordinate <= fo.coor.torandom); for (b = a; (long)b <= (long)t; b = (base)((long)b + 1)) { if (basetochar(b) == std->seq.letters[l-1]) { if (inrandom) fi->data[(long)b - (long)a][l-1] = fo.coor.fracan; else fi->data[(long)b - (long)a][l-1] = 1.0; } else if (inrandom) fi->data[(long)b - (long)a][l-1] = fo.coor.notfracan; else fi->data[(long)b - (long)a][l-1] = 0.0; } } } /* end module sites.detfi */ /* begin module sites.detrho */ Static Void detrho(fi, fo, rho) rblarray fi, fo, *rho; { /* determine rho(b,l) */ base b; /* index to array */ long coordinate; /* the coordinate relative to zero in the standard */ long l; /* index to array */ clearrbl(rho); copycoords(fi.coor, &rho->coor); for (coordinate = fi.coor.fromrandom + 1; coordinate <= fi.coor.torandom + 1; coordinate++) { l = coordinate - fi.coor.fromrange; for (b = a; (long)b <= (long)t; b = (base)((long)b + 1)) { if (fi.data[(long)b - (long)a][l-1] == 0.0) printf("detrho: 7 zowie! PROGRAM ERROR!\n"); if (fi.data[(long)b - (long)a][l-1] > 0.0) rho->data[(long)b - (long)a] [l-1] = fo.data[(long)b - (long)a][l-1] / fi.data[(long)b - (long)a] [l-1]; else if (fo.data[(long)b - (long)a][l-1] == fi.data[(long)b - (long)a] [l-1]) rho->data[(long)b - (long)a][l-1] = 1.0; else rho->data[(long)b - (long)a][l-1] = 0.0; } } } /* end module sites.detrho */ /* begin module sites.detfpi */ Static Void detfpi(fi, fpi) rblarray fi, *fpi; { /* determine fpi(b,l), as all 1/4 values. This represents the prime (normalized) input frequency values */ base b; /* index to array */ long coordinate; /* the coordinate relative to zero in the standard */ long l; /* index to array */ clearrbl(fpi); copycoords(fi.coor, &fpi->coor); for (coordinate = fi.coor.fromrandom + 1; coordinate <= fi.coor.torandom + 1; coordinate++) { l = coordinate - fi.coor.fromrange; for (b = a; (long)b <= (long)t; b = (base)((long)b + 1)) fpi->data[(long)b - (long)a][l-1] = 0.25; } } /* end module sites.detfpi */ /* begin module sites.detfpo */ Static Void detfpo(fpi, rho, fpo) rblarray fpi, rho, *fpo; { /* determine fpo(b,l), experimental frequencies, normalized */ base b; /* index to array */ long coordinate; /* the coordinate relative to zero in the standard */ long l; /* index to array */ double sum; /* normalizing sum */ clearrbl(fpo); copycoords(fpi.coor, &fpo->coor); for (coordinate = fpi.coor.fromrandom + 1; coordinate <= fpi.coor.torandom + 1; coordinate++) { l = coordinate - fpi.coor.fromrange; sum = 0.0; for (b = a; (long)b <= (long)t; b = (base)((long)b + 1)) sum += fpi.data[(long)b - (long)a][l-1] * rho.data[(long)b - (long)a] [l-1]; if (sum == 0.0) printf("detfpo: 6 zowie! PROGRAM ERROR!\n"); /* make sure it sums to 1 by dividing by sum */ for (b = a; (long)b <= (long)t; b = (base)((long)b + 1)) fpo->data[(long)b - (long)a] [l-1] = fpi.data[(long)b - (long)a][l-1] * rho.data[(long)b - (long)a] [l-1] / sum; } } /* end module sites.detfpo */ /* begin module sites.detfpon */ Static Void detfpon(wildtype, fpi, fpon) iblarray wildtype; rblarray fpi, *fpon; { /* determine fpon(b,l), natural information frequencies output "normalized". "Normalized" because it simply assumes equiprobable before state. See molecular machines paper I. */ base b; /* index to array */ long coordinate; /* the coordinate relative to zero in the standard */ boolean inrandom; /* true if working within the randomized region */ long l; /* index to array */ double sum; /* normalizing sum */ clearrbl(fpon); copycoords(fpi.coor, &fpon->coor); for (coordinate = fpi.coor.fromrange; coordinate <= fpi.coor.torange; coordinate++) { l = coordinate - fpi.coor.fromrange + 1; inrandom = (coordinate >= fpi.coor.fromrandom && coordinate <= fpi.coor.torandom); if (inrandom) { sum = 0.0; for (b = a; (long)b <= (long)t; b = (base)((long)b + 1)) sum += wildtype.data[(long)b - (long)a][l-1]; if (sum > 0.0) { for (b = a; (long)b <= (long)t; b = (base)((long)b + 1)) fpon->data[(long)b - (long)a] [l-1] = wildtype.data[(long)b - (long)a][l-1] / sum; } else { printf("\nWARNING: wild type frequency table is odd\n"); printf("at coordinate %ld it does not have any data.\n\n", coordinate); fpon->data[(long)b - (long)a][l-1] = 1.0; } } else { /* ;if sum=0.0 then writeln(output,'5 zowie! PROGRAM ERROR!'); */ for (b = a; (long)b <= (long)t; b = (base)((long)b + 1)) fpon->data[(long)b - (long)a][l-1] = fpi.data[(long)b - (long)a][l-1]; } } } /* end module sites.detfpon */ /* begin module sites.calcinfo */ Static Void calcinfo(fi, fo, i) rblarray fi, fo; rlarray *i; { /*sample: samplestat; not useful, since method not known*/ /* calculate information table from the input and output frequencies. correct the curve using the sample statistics */ base b; /* index to array */ long coordinate; /* the coordinate relative to zero in the standard */ long l; /* index to array */ double ln2 = log(2.0); /* natural log of 2 */ for (coordinate = fi.coor.fromrandom + 1; coordinate <= fi.coor.torandom + 1; coordinate++) { l = coordinate - fi.coor.fromrange; i->data[l-1] = 0.0; for (b = a; (long)b <= (long)t; b = (base)((long)b + 1)) { if (fo.data[(long)b - (long)a][l-1] > 0) i->data[l-1] += fo.data[(long)b - (long)a] [l-1] * log(fo.data[(long)b - (long)a] [l-1] / fi.data[(long)b - (long)a][l-1]); } /* else no information is added */ i->data[l-1] /= ln2; /* convert to bits */ /* since we don't know how to calculate sample error, this correction is deleted for now: i.data[l] := i.data[l] - sample.en */ } } /* end module sites.calcinfo */ /* begin module sites.detfproportion */ Static Void detfproportion(nbl, std, fproportion) iblarray nbl; entry_ *std; rblarray *fproportion; { /* determine fi(b,l) by using the base proportions of output fo array, for statistical calculations */ base b; /* index to array */ long coordinate; /* the coordinate relative to zero in the standard */ double frac[(long)t - (long)a + 1]; /* proportions of bases as fractions */ boolean inrandom; /* true if working within the randomized region */ long l; /* index to array */ long prop[(long)t - (long)a + 1]; /* proportions of bases */ long total = 0; /* total bases in output */ clearrbl(fproportion); copycoords(nbl.coor, &fproportion->coor); for (b = a; (long)b <= (long)t; b = (base)((long)b + 1)) /* clear array */ prop[(long)b - (long)a] = 0; for (coordinate = nbl.coor.fromrandom + 1; coordinate <= nbl.coor.torandom + 1; coordinate++) { l = coordinate - nbl.coor.fromrange; for (b = a; (long)b <= (long)t; b = (base)((long)b + 1)) prop[(long)b - (long)a] += nbl.data[(long)b - (long)a][l-1]; } for (b = a; (long)b <= (long)t; b = (base)((long)b + 1)) total += prop[(long)b - (long)a]; for (b = a; (long)b <= (long)t; b = (base)((long)b + 1)) frac[(long)b - (long)a] = (double)prop[(long)b - (long)a] / total; if (total == 0.0) printf("9 zowie! PROGRAM ERROR!\n"); fprintf(stats.f, " base proportions in random region: "); fprintf(stats.f, " a = %ld", prop[0]); fprintf(stats.f, " c = %ld", prop[(long)c - (long)a]); fprintf(stats.f, " g = %ld", prop[(long)g - (long)a]); fprintf(stats.f, " t = %ld\n", prop[(long)t - (long)a]); /* write(output,' base frac in random region: '); write(output,' a = ',frac[a]:10:5); write(output,' c = ',frac[c]:10:5); write(output,' g = ',frac[g]:10:5); write(output,' t = ',frac[t]:10:5); writeln(output); */ for (coordinate = nbl.coor.fromrange; coordinate <= nbl.coor.torange; coordinate++) { l = coordinate - nbl.coor.fromrange + 1; inrandom = (coordinate >= nbl.coor.fromrandom && coordinate <= nbl.coor.torandom); for (b = a; (long)b <= (long)t; b = (base)((long)b + 1)) { if (basetochar(b) == std->seq.letters[l-1]) { if (inrandom) fproportion->data[(long)b - (long)a][l-1] = frac[(long)b - (long)a]; else fproportion->data[(long)b - (long)a][l-1] = 1.0; } else if (inrandom) fproportion->data[(long)b - (long)a][l-1] = frac[(long)b - (long)a]; else fproportion->data[(long)b - (long)a][l-1] = 0.0; } } } /* end module sites.detfproportion */ /* begin module sites.detfbottle */ Static Void detfbottle(nbl, missing, std, fbotcon, fbottle) iblarray nbl; boolean *missing; entry_ *std; rblarray *fbotcon, *fbottle; { /* determine fi(b,l) using the base proportions nbl. segregated by the bottles, for statistical calculations. fbottle: input frequencies using fo array proportions separated by base bottles fbotcon: input frequencies using fo array proportions separated by base bottles, assuming that the canonical is all that matters If a base is missing from a bottle, then missing is true and statistical calculations are impossible! */ base b; /* index to array */ boolean badbottle; /* true when one bottle is bad because it has a missing base but should be a dirty bottle */ base bottle; /* index to array */ long coordinate; /* the coordinate relative to zero in the standard */ double frac[(long)t - (long)a + 1]; /* proportions of bases as fractions */ boolean inrandom; /* true if working within the randomized region */ long l; /* index to array */ long prop[(long)t - (long)a + 1]; /* proportions of bases */ long total; /* total bases in output */ clearrbl(fbotcon); copycoords(nbl.coor, &fbotcon->coor); clearrbl(fbottle); copycoords(nbl.coor, &fbottle->coor); *missing = false; fprintf(stats.f, " base proportions in random region\n"); for (bottle = a; (long)bottle <= (long)t; bottle = (base)((long)bottle + 1)) { for (b = a; (long)b <= (long)t; b = (base)((long)b + 1)) /* clear array */ prop[(long)b - (long)a] = 0; for (coordinate = nbl.coor.fromrandom + 1; coordinate <= nbl.coor.torandom + 1; coordinate++) { l = coordinate - nbl.coor.fromrange; if (basetochar(bottle) == std->seq.letters[l-1]) { for (b = a; (long)b <= (long)t; b = (base)((long)b + 1)) prop[(long)b - (long)a] += nbl.data[(long)b - (long)a][l-1]; } } total = 0; for (b = a; (long)b <= (long)t; b = (base)((long)b + 1)) total += prop[(long)b - (long)a]; if (total == 0.0) printf("8 zowie! PROGRAM ERROR!\n"); for (b = a; (long)b <= (long)t; b = (base)((long)b + 1)) frac[(long)b - (long)a] = (double)prop[(long)b - (long)a] / total; badbottle = false; for (b = a; (long)b <= (long)t; b = (base)((long)b + 1)) badbottle = (badbottle || prop[(long)b - (long)a] == 0); *missing = (*missing || badbottle); fprintf(stats.f, " for bottle %c: ", basetochar(bottle)); fprintf(stats.f, " a = %5ld", prop[0]); fprintf(stats.f, " c = %5ld", prop[(long)c - (long)a]); fprintf(stats.f, " g = %5ld", prop[(long)g - (long)a]); fprintf(stats.f, " t = %5ld", prop[(long)t - (long)a]); if (badbottle) fprintf(stats.f, " <= MISSING BASE(s)"); fprintf(stats.f, "\n for bottle %c: ", basetochar(bottle)); fprintf(stats.f, " a = %5.2f", frac[0]); fprintf(stats.f, " c = %5.2f", frac[(long)c - (long)a]); fprintf(stats.f, " g = %5.2f", frac[(long)g - (long)a]); fprintf(stats.f, " t = %5.2f\n", frac[(long)t - (long)a]); for (coordinate = nbl.coor.fromrange; coordinate <= nbl.coor.torange; coordinate++) { l = coordinate - nbl.coor.fromrange + 1; inrandom = (coordinate >= nbl.coor.fromrandom && coordinate <= nbl.coor.torandom); if (basetochar(bottle) == std->seq.letters[l-1]) { for (b = a; (long)b <= (long)t; b = (base)((long)b + 1)) { if (inrandom) { fbottle->data[(long)b - (long)a][l-1] = frac[(long)b - (long)a]; /* fbotcon is defined as the fraction of the canonical base, with equially likely bases for the other bases. I don't think this is a reasonable model for most nucleic acid recognizers, but Charles Lawrence suggested testing for it, so here it is. */ if (b == bottle) fbotcon->data[(long)b - (long)a][l-1] = frac[(long)b - (long)a]; else fbotcon->data[(long)b - (long)a] [l-1] = (1.0 - frac[(long)bottle - (long)a]) / 3.0; } /* outside the random region, just make the bottles be the fixed bases of the standard */ else if (b == bottle) { fbottle->data[(long)b - (long)a][l-1] = 1.0; fbotcon->data[(long)b - (long)a][l-1] = 1.0; } else { fbottle->data[(long)b - (long)a][l-1] = 0.0; fbotcon->data[(long)b - (long)a][l-1] = 0.0; } } } } } if (!*missing) return; printf("***********************************************\n"); printf("* WARNING WARNING WARNING WARNING WARNING *\n"); printf("* One of the dirty bottles is missing a base! *\n"); printf("* See the stats file for details. *\n"); printf("***********************************************\n"); } /* end module sites.detfbottle */ /* begin module sites.signif */ Static Void signif(stats, nl, fi, fo, paramod) _TEXT *stats; ilarray nl; rblarray fi, fo; long paramod; { /* figure out the significance of the difference between the two frequency arrays fi and fo. put results in stats. paramod is the number of parameters modeled. signif assumes identical sizes for the three arrays. */ base b; /* index to array */ long coordinate; /* the coordinate relative to zero in the standard */ long degfree; /* number of degrees of freedom */ double exp; /* expected number of counts */ double gsq = 0.0; /* G-squared */ boolean infinite = false; /* true for infinite significance */ long l; /* index to array */ double obs; /* observed number of counts */ double xsq = 0.0; /* chi squared */ double TEMP; for (coordinate = nl.coor.fromrandom; coordinate <= nl.coor.torandom; coordinate++) { l = coordinate - nl.coor.fromrange + 1; if (!infinite) { for (b = a; (long)b <= (long)t; b = (base)((long)b + 1)) { if (fi.data[(long)b - (long)a][l-1] == 0.0) { fprintf(stats->f, "fi(%c,%ld) = 0 so the significance is infinite\n", basetochar(b), coordinate); infinite = true; } if (!infinite) { if (fo.data[(long)b - (long)a][l-1] > 0) gsq += nl.data[l-1] * fo.data[(long)b - (long)a] [l-1] * log(fo.data[(long)b - (long)a] [l-1] / fi.data[(long)b - (long)a][l-1]); /* else don't add */ obs = nl.data[l-1] * fo.data[(long)b - (long)a][l-1]; exp = nl.data[l-1] * fi.data[(long)b - (long)a][l-1]; if (exp > 0) { TEMP = obs - exp; xsq += TEMP * TEMP / exp; } } } } /* writeln(output,'n[',l:2,']=',nl.data[l]:3);*/ } if (infinite) return; gsq = 2 * gsq; degfree = (nl.coor.torandom - nl.coor.fromrandom + 1) * 3 - paramod; fprintf(stats->f, " %ld degrees of freedom\n", degfree); fprintf(stats->f, " G-squared = %6.2f\n", gsq); fprintf(stats->f, " X-squared = %6.2f\n\n", xsq); } /* end module sites.signif */ /* begin module sites.mle */ Static Void mle(list, std, nbl) _TEXT *list; entry_ *std; iblarray nbl; { /* calculate the maximum-likelihood value for the canonical base */ base b; /* index to array */ long coordinate; /* the coordinate relative to zero in the standard */ long l; /* index to array */ long sum = 0; /* sum of the canonical bases */ long total = 0; /* total number of sequenced bases */ for (coordinate = nbl.coor.fromrandom + 1; coordinate <= nbl.coor.torandom + 1; coordinate++) { l = coordinate - nbl.coor.fromrange; for (b = a; (long)b <= (long)t; b = (base)((long)b + 1)) { if (basetochar(b) == std->seq.letters[l-1]) sum += nbl.data[(long)b - (long)a][l-1]; total += nbl.data[(long)b - (long)a][l-1]; } } fprintf(list->f, "number of canonical bases: %5ld\n", sum); fprintf(list->f, "number of bases sequenced: %5ld\n", total); if (total == 0.0) printf("2 zowie! PROGRAM ERROR!\n"); fprintf(list->f, "maximum likelihood estimate of canonical: (%ld/%ld)=%5.3f\n", sum, total, (double)sum / total); } /* end module sites.mle */ /*this is probably never going to be used: procedure showit(var list: text; std: entryptr; header: string; e,n: rlarray); (* show the information curves, experimental (e) and natural (n) into the list file *) var coordinate: integer; (* the coordinate relative to zero in the standard *) l: integer; (* index to array *) begin dataheader(list,header,std); writeln(list,'* information curves, ready for graphics'); writeln(list,'* n: natural, e: experimental (normalized) curve'); writeln(list,'* type, coordinate, info'); with e.coor do begin for coordinate := fromrange to torange do begin l := coordinate - fromrange + 1; writeln(list,'n ', coordinate:3,' ',n.data[l]:10:8); end; for coordinate := fromrange to torange do begin l := coordinate - fromrange + 1; writeln(list,'e ', coordinate:3,' ',e.data[l]:10:8); end end end; */ /* begin module sites.tabulate */ Static Void tabulate(data, numentries, co, nl, nbl, fbl) entry_ **data; long numentries; coordinates co; ilarray *nl; iblarray *nbl; rblarray *fbl; { /* tabulate the three arrays from the data, numentries is the number to tabulate */ base b; /* a base in the d sequence */ long l; /* position in the d sequence */ long d; /* data on a sequence */ long FORLIM1; clearrbl(fbl); copycoords(co, &nl->coor); copycoords(co, &nbl->coor); copycoords(co, &fbl->coor); /* clear the nl and nbl arrays */ for (l = 0; l < maxseq; l++) { for (b = a; (long)b <= (long)t; b = (base)((long)b + 1)) nbl->data[(long)b - (long)a][l] = 0; nl->data[l] = 0; } /* tabulate the nbl table */ for (d = 0; d < numentries; d++) { FORLIM1 = data[d]->seq.length; for (l = 0; l < FORLIM1; l++) { b = chartorange(data[d]->seq.letters[l]); if (((1L << ((long)b)) & ((1L << ((long)a)) | (1L << ((long)c)) | (1L << ((long)g)) | (1L << ((long)t)))) != 0) nbl->data[(long)b - (long)a][l]++; } } FORLIM1 = data[0]->seq.length; /* create nl from nbl. note: all have the same length */ for (l = 0; l < FORLIM1; l++) { for (b = a; (long)b <= (long)t; b = (base)((long)b + 1)) nl->data[l] += nbl->data[(long)b - (long)a][l]; } FORLIM1 = data[0]->seq.length; for (l = 0; l < FORLIM1; l++) { if (nl->data[l] == 0) printf("13 zowie! PROGRAM ERROR!\n"); } if (data[0]->seq.length != nl->coor.lengthrange) printf("13.5 zowie!\n"); FORLIM1 = data[0]->seq.length; /* create fbl from nbl and nl */ for (l = 0; l < FORLIM1; l++) { for (b = a; (long)b <= (long)t; b = (base)((long)b + 1)) fbl->data[(long)b - (long)a] [l] = (double)nbl->data[(long)b - (long)a][l] / nl->data[l]; } } /* end module sites.tabulate */ /* begin module sites.makefreq */ Static Void makefreq(rbl) rblarray *rbl; { /* make the rbl array a frequency table by normalizing */ base b; /* a base */ long l; /* a position */ double sum; /* sum of rbl at l */ long FORLIM; FORLIM = rbl->coor.lengthrange; for (l = 0; l < FORLIM; l++) { sum = 0.0; for (b = a; (long)b <= (long)t; b = (base)((long)b + 1)) sum += rbl->data[(long)b - (long)a][l]; if (sum != 0.0) { for (b = a; (long)b <= (long)t; b = (base)((long)b + 1)) rbl->data[(long)b - (long)a][l] /= sum; } } } /* end module sites.makefreq */ /* begin module sites.normalize */ Static Void normalize(std, fi, fo, rho, fpi, fpo) entry_ *std; rblarray *fi, fo, *rho, *fpi, *fpo; { /* experimental input frequencies (0,fracan, notfracan) */ /* experimental output frequencies (raw sequence data) */ /* the ratio of fo / fi */ /* the normalized input table (0.25's everywhere) */ /* the fi table normalized. */ /* normalize the fo array (the fbl array) and put the result in fpo (the fblnormalized array). Intermediates that are generated: rho: the ratio of fo / fi fpi: the normalized input table (0.25's everywhere) fpo: the normalized output table. */ detfi(fo, std, fi); /* determine fi(b,l), using fo as an example */ detrho(*fi, fo, rho); /* determine rho(b,l), as fo/fi */ detfpi(*fi, fpi); /* determine f'i(b,l), as all 0.25. */ detfpo(*fpi, *rho, fpo); /* determine f'o(b,l), see the paper for method */ } /* end module sites.normalize */ /* begin module sites.addrbl */ Static Void addrbl(ar, br) rblarray ar, *br; { /* Add the frequency data of array ar into array br, over the randomized region only. The reason for limiting it to this region is that otherwise one would add fixed base data into random base data, which would be a misrepresentation. Make sure that the randomized range of ar fits correctly into that of br. If it does not fit, then the size of br was defined incorrectly. That is, br must be larger than ar. */ base b; /* a base */ long la; /* coordinate in ar array */ long lb; /* coordinate in b array */ long la1toN; /* conversion of la to 1 to N coordinates */ long lb1toN; /* conversion of lb to 1 to N coordinates */ if (ar.coor.fromrandom < br->coor.fromrandom) { printf("addrbl: attempt to add array ar out of bounds of br\n"); printf("ar.coor.fromrandom = %ld br.coor.fromrandom = %ld\n", ar.coor.fromrandom, br->coor.fromrandom); halt(); } if (ar.coor.torandom > br->coor.torandom) { printf("addrbl: attempt to add array ar out of bounds of br\n"); printf("ar.coor.torandom = %ld br.coor.torandom = %ld\n", ar.coor.torandom, br->coor.torandom); halt(); } for (la = ar.coor.fromrandom; la <= ar.coor.torandom; la++) { la1toN = la - ar.coor.fromrange + 1; lb = la; lb1toN = lb - br->coor.fromrange + 1; /* program check */ if (lb < br->coor.fromrandom || lb > br->coor.torandom || lb < br->coor.fromrange || lb > br->coor.torange) { printf("addrbl: program error in array b\n"); halt(); } for (b = a; (long)b <= (long)t; b = (base)((long)b + 1)) br->data[(long)b - (long)a][lb1toN-1] += ar.data[(long)b - (long)a] [la1toN-1]; } } /* end module sites.addrbl */ /* begin module sites.showstatistics */ Static Void showstatistics(stats, coords, experiment, header, std, wildtype, nbl, duplicates) _TEXT *stats; coordinates coords; long experiment; string header; entry_ *std; iblarray wildtype, nbl; long duplicates; { /* database identifier */ /* the canonical standard */ /* wild type frequencies */ /* show the table of results */ base b; /* one of the bases at l */ long coordinate; /* the position relative to the zero base */ long l; /* position in the site */ mkheader(stats, header, experiment, coords, std); fprintf(stats->f, "\n | | experimental | wildtype |\n"); /* writeln(stats,' l | b | experimental | wildtype |'); uuu */ fprintf(stats->f, " l | b | A C G T | A C G T |\n"); fprintf(stats->f, "------------------------------------------------------\n"); for (l = 0; l < coords.lengthrange; l++) { coordinate = l + coords.fromrange; fprintf(stats->f, "%4ld", coordinate); fprintf(stats->f, " |"); /* canonical base */ if (coordinate < coords.fromrandom || coordinate > coords.torandom) fprintf(stats->f, " %c", capitalize(std->seq.letters[l])); else fprintf(stats->f, " %c", std->seq.letters[l]); fprintf(stats->f, " |"); for (b = a; (long)b <= (long)t; b = (base)((long)b + 1)) /* experimental frequencies */ fprintf(stats->f, " %4ld", nbl.data[(long)b - (long)a][l]); fprintf(stats->f, " |"); for (b = a; (long)b <= (long)t; b = (base)((long)b + 1)) /* wild type frequencies */ fprintf(stats->f, " %4ld", wildtype.data[(long)b - (long)a][l]); fprintf(stats->f, " |\n"); } fprintf(stats->f, "\nnumber of duplicate sequences: %ld sequences\n\n", duplicates); } #define nfactor scaleup /* end module sites.showstatistics */ /* begin module sites.makersdata */ Static Void makersdata(rsdata, database, standard, fpo, ipe, nlexpt) _TEXT *rsdata, *database, *standard; rblarray fpo; rlarray ipe; ilarray nlexpt; { /* make the fbl(b,l) table for use by the MakeLogo program for producing a logo. The result is an rsdata file. Assume that there are nfactor sequences. Use database and standard for identification. A bug: nlexpt is supposed to be the number of bases at position l in the experiment. However, if there are several experiments, they should not be weighted equally. Also, it is not correct to calculate the sampling error from these numbers since partial (fracan > 0.25) randomization implies that nl is effectively smaller than the number of sequences. This portion of the code is kept, but awaits better understanding of how to do the calculation. */ /* number of sequences assumed for the table. This simply preserves the resolution to several decimal places */ double avarhnb; /* variance of the information r */ base b; /* index to array */ long coordinate; /* the coordinate relative to zero in the standard */ double e; /* for calculating small sample bias and variance */ long l; /* index to array */ double ln2 = log(2.0); /* natural log of 2 */ long nbl; /* number of bases b at position l */ long nl; /* number of bases at position l */ double rsl; /* information at position l */ double rs = 0.0; /* total information */ long s = 4; /* nubmer of symbols */ double sumavarhnb = 0.0; /* sum of variance of the information r */ /* simulate the header of a data file output from rseq */ fprintf(rsdata->f, "* sites %4.2f\n", version); if (*database->name != '\0') { if (database->f != NULL) database->f = freopen(database->name, "r", database->f); else database->f = fopen(database->name, "r"); } else rewind(database->f); if (database->f == NULL) _EscIO2(FileNotFound, database->name); RESETBUF(database->f, Char); fprintf(rsdata->f, "* "); copyaline(database, rsdata); if (*standard->name != '\0') { if (standard->f != NULL) standard->f = freopen(standard->name, "r", standard->f); else standard->f = fopen(standard->name, "r"); } else rewind(standard->f); if (standard->f == NULL) _EscIO2(FileNotFound, standard->name); RESETBUF(standard->f, Char); fprintf(rsdata->f, "* "); copyaline(standard, rsdata); for (l = 1; l <= 8; l++) fprintf(rsdata->f, "*\n"); fprintf(rsdata->f, "* %*ld %*ld\n", nfield, fpo.coor.fromrandom, nfield, fpo.coor.torandom); for (l = 1; l <= 12; l++) fprintf(rsdata->f, "*\n"); fprintf(rsdata->f, "* l a c g t rs(l) rs var(hnb) sum-var n(l) e(hnb) f\n"); /* number of symbols */ for (coordinate = fpo.coor.fromrandom; coordinate <= fpo.coor.torandom; coordinate++) { l = coordinate - fpo.coor.fromrange + 1; /* l */ fprintf(rsdata->f, " %*ld", nfield, coordinate); /* a c g t */ nl = 0; for (b = a; (long)b <= (long)t; b = (base)((long)b + 1)) { nbl = (long)floor(nfactor * fpo.data[(long)b - (long)a][l-1] + 0.5); nl += nbl; fprintf(rsdata->f, " %*ld", (int)(nfield + 1), nbl); } /* If nlexpt does not represent the global numbers of bases at each position in the experiment, (ie, only represents the numbers of bases of one experiment), then the data outside the nlexpt will be missing and the program will detect the error: if nlexpt.data[l]=0 then begin writeln(output,'10 zowie! PROGRAM ERROR!'); writeln(output,'l = ',l:1); writeln(output,'coordinate = ',coordinate:1); for s := 1 to nlexpt.coor.lengthrange do writeln(output,'nlexpt.data[',s:1,']=',nlexpt.data[s]:1); end; (* a correction for small sample sizes *) */ /* fool the compiler so that it thinks we are using nlexpt: */ e = (s - 1) / (2 * ln2 * nlexpt.data[l-1]); /* override: we really don't know a method!! */ e = 0.0; /* rs(l) */ rsl = ipe.data[l-1] - e; fprintf(rsdata->f, " %*.*f", infofield, infodecim, rsl); /* rs */ rs += rsl; fprintf(rsdata->f, " %*.*f", infofield, infodecim, rs); /* var(hnb) sum-var n(l) e(hnb) f */ /* calculate small sample variance. The calculation is based on the procedure calaehnb in the rseq.p program. */ avarhnb = e * e; avarhnb = -1.0; /* This is the mark in the rsdata file that indicates that the sample variation should not be displayed in the sequence logo (since we don't know a method). */ sumavarhnb = avarhnb + sumavarhnb; /* var(hnb) */ /* sum-var */ /* n(l) */ /* e(hnb) */ fprintf(rsdata->f, " %*.*f %*.*f %*ld %*.*f a\n", infofield, infodecim, avarhnb, infofield, infodecim, sumavarhnb, nfield, nl, infofield, infodecim, 2.0 - e); /* f */ } /* ,' +/- ', sqrt(avarhnb):infofield:infodecim,*/ fprintf(rsdata->f, "* Rsequence = %*.*f for range from %ld to %ld\n", infofield, infodecim, rs, fpo.coor.fromrandom, fpo.coor.torandom); printf("\nRsequence = %*.*f for range from %ld to %ld\n", infofield, infodecim, rs, fpo.coor.fromrandom, fpo.coor.torandom); /* ' +/- ',sqrt(avarhnb):infofield:infodecim,*/ } #undef nfactor /* true if one of the dirty bottles had one of the bases missing! */ Local Void bar(stats, experiment) _TEXT *stats; long experiment; { fprintf(stats->f, "\nExperiment %ld", experiment); fprintf(stats->f, "---------------------------------\n"); } /* end module sites.makersdata */ /* begin module sites.dostats */ Static Void dostats(stats, experiment, std, wildtype, nl, nbl, fi, fo, fpi) _TEXT *stats; long experiment; entry_ *std; iblarray wildtype; ilarray nl; iblarray nbl; rblarray fi, fo, fpi; { /* which experiment */ /* the canonical standard */ /* wild type frequencies */ /* Calculate statistics of the base frequencies observed in the experiment, based on various kinds of models. */ rblarray fbottle; /* input frequencies using fo array proportions separated by base bottles */ rblarray fbotcon; /* input frequencies using fo array proportions separated by base bottles, assuming that the canonical is all that matters */ rblarray fproportion; /* input frequencies using fo array proportions */ rblarray fpon; /* natural frequencies */ /* probably never to be used: ie: rlarray; (* information of experimental data *) ipe: rlarray; (* information of experimental data normalized *) ipn: rlarray; (* information of natural data *) */ boolean missing; bar(stats, experiment); fprintf(stats->f, "significance for completely saturated model\n"); signif(stats, nl, fi, fo, 0L); showrblarray(stats, fi); bar(stats, experiment); fprintf(stats->f, "significance for completely proportional model\n"); detfproportion(nbl, std, &fproportion); /* determine proportional info */ signif(stats, nl, fproportion, fo, 3L); showrblarray(stats, fproportion); bar(stats, experiment); fprintf(stats->f, "significance for completely proportional model, bottles\n"); /* determine proportional info */ detfbottle(nbl, &missing, std, &fbotcon, &fbottle); if (missing) fprintf(stats->f, "Cannot calculate significance because of missing bases in dirty bottle\n"); else signif(stats, nl, fbottle, fo, 12L); showrblarray(stats, fbottle); bar(stats, experiment); fprintf(stats->f, "significance for completely proportional model, bottles\n"); fprintf(stats->f, "canonical base, equally likely for other bases\n"); if (missing) fprintf(stats->f, "Cannot calculate significance because of missing bases in dirty bottle\n"); else signif(stats, nl, fbotcon, fo, 4L); showrblarray(stats, fbotcon); bar(stats, experiment); fprintf(stats->f, "calculate the maximum-likelihood value for the canonical base\n"); mle(stats, std, nbl); bar(stats, experiment); detfpon(wildtype, fpi, &fpon); /* determine f'on(b,l) */ fprintf(stats->f, "significance for wild-type sequences model\n"); signif(stats, nl, fpon, fo, nl.coor.lengthrandom * 4); showrblarray(stats, fpon); fprintf(stats->f, "\n---------------------------------------------\n"); /* probably never to be used: showit(list, std, header, ipe, ipn); (* display info results *) */ } /* end module sites.dostats */ /* begin module sites.fancytable */ Static Void fancytable(list, experiment, nl, nbl, std, LaTeX) _TEXT *list; long experiment; ilarray nl; iblarray nbl; entry_ *std; boolean LaTeX; { /* experiment number */ /* number of sequences at position l */ /* number of bases at position l */ /* show the n(b,l) table with the number of sequences at each position, n(l). Display the canonical sequence from std. If LaTeX is true, put the data out in LaTeX table format */ base b; /* one of the bases at l */ long coordinate; /* the position relative to the zero base */ long l; /* position in the site */ if (LaTeX) { fprintf(list->f, "\n\\newpage\n"); fprintf(list->f, "\\begin{table}[p] %% h here; t top; b bottom; p page of floats\n"); fprintf(list->f, "\\begin{center}\n"); /*drop nl: writeln(list,'\begin{tabular}{|rcrrrr|r|}'); */ fprintf(list->f, "\\begin{tabular}{|rcrrrr|}\n"); fprintf(list->f, "\\hline\n"); fprintf(list->f, "Coor- & Canonical & \\multicolumn{4}{c|}{Numbers of} \\\\\n"); /*,*/ /* not in use ' & Number of \\');*/ fprintf(list->f, "dinate & Base & A & C & G & T \\\\\n"); /* & sequences \\');*/ fprintf(list->f, "\\hline\n"); } else { fprintf(list->f, "\n\f\n"); fprintf(list->f, " l | b | A | C | G | T | number\n"); fprintf(list->f, "------------------------------------------------\n"); /* -27 | g | 0 | 0 | 53 | 0 | 53 */ } for (coordinate = nbl.coor.fromrange; coordinate <= nbl.coor.torange; coordinate++) { l = coordinate - nbl.coor.fromrange + 1; fprintf(list->f, " %*ld", nfield, coordinate); if (LaTeX) fprintf(list->f, " &"); else fprintf(list->f, " |"); fprintf(list->f, " %c", std->seq.letters[l-1]); for (b = a; (long)b <= (long)t; b = (base)((long)b + 1)) { if (LaTeX) fprintf(list->f, " &"); else fprintf(list->f, " |"); fprintf(list->f, " %*ld", nfield, nbl.data[(long)b - (long)a][l-1]); } if (!LaTeX) fprintf(list->f, " |"); /* modified to prevent nl on latex tables: */ if (!LaTeX) fprintf(list->f, " %*ld", nfield, nl.data[l-1]); if (LaTeX) fprintf(list->f, " \\\\"); putc('\n', list->f); } if (LaTeX) { fprintf(list->f, "\\hline\n"); fprintf(list->f, "\\end{tabular}\n"); fprintf(list->f, "\\end{center}\n"); fprintf(list->f, "\\caption{Number of bases observed at each position\n"); fprintf(list->f, " in the randomized sequences of experiment %ld}\n", experiment); fprintf(list->f, " \\label{table.nbl%ld}\n", experiment); fprintf(list->f, "\\end{table}\n\n"); } putc('\n', list->f); } /* end module sites.fancytable */ /* begin module sites.pageheader */ Static Void pageheader(list, header, experiment, std) _TEXT *list; string header; long experiment; entry_ *std; { /* put a few things out */ putc('\n', list->f); announcement(list, header); putentry(list, std); fprintf(list->f, "\nExperiment %ld\n", experiment); } /* Local variables for dotables: */ struct LOC_dotables { _TEXT *list; string header; long experiment; entry_ *std; } ; Local Void newpage(LINK) struct LOC_dotables *LINK; { fprintf(LINK->list->f, "\n\f"); pageheader(LINK->list, LINK->header, LINK->experiment, LINK->std); } /* end module sites.pageheader */ /* begin module sites.dotables */ Static Void dotables(list_, header_, experiment_, std_, nbl, fi, fo, rho, fpi, fpo) _TEXT *list_; string header_; long experiment_; entry_ *std_; iblarray nbl; rblarray fi, fo, rho, fpi, fpo; { /* number of bases at position l */ /* write the arrays out to the list file */ struct LOC_dotables V; V.list = list_; V.header = header_; V.experiment = experiment_; V.std = std_; if (V.experiment > 1) newpage(&V); else pageheader(V.list, V.header, V.experiment, V.std); fprintf(V.list->f, "n(b,l): input numbers\n"); showiblarray(V.list, nbl); newpage(&V); fprintf(V.list->f, "fi(b,l), input frequency table (fractions canonical and not canonical):\n"); showrblarray(V.list, fi); newpage(&V); fprintf(V.list->f, "fo(b,l), raw experimental frequencies:\n"); showrblarray(V.list, fo); newpage(&V); fprintf(V.list->f, "rho(b,l), ratio of fo to fi tables:\n"); showrblarray(V.list, rho); newpage(&V); fprintf(V.list->f, "f'i(b,l), normalized input frequencies (0.25)\n"); showrblarray(V.list, fpi); newpage(&V); fprintf(V.list->f, "f'o(b,l), normalized output frequency table:\n"); showrblarray(V.list, fpo); } /* end module sites.dotables */ /* begin module putmaxstring */ Static Void putmaxstring(f, s, counter, maxcounter) _TEXT *f; string *s; long *counter, maxcounter; { /* write the string to f. Like procedure putstring, but the number of characters put out is counted in counter and if it exceeds maxcounter, no more characters are output. */ long l; /* index to the string */ long FORLIM; FORLIM = s->length; for (l = 0; l < FORLIM; l++) { (*counter)++; if (*counter < maxcounter) putc(s->letters[l], f->f); } } #define namemax 20 /* longest length name allowed to be output */ /* end module putmaxstring */ /* begin module sites.makesequ */ Static Void makesequ(sequ, makebkp, coords, data, numentries, std) _TEXT *sequ, *makebkp; coordinates coords; entry_ **data; long numentries; entry_ *std; { /* standard canonical sequence */ /* make the raw sequence file in sequ, consisting of just the sequence followed by a period. Make instructions to feed makebk in makebkp */ long counter = 0; /* count of characters put out */ long thenumber; /* the number of one datum */ long l; /* index to the string */ boolean x_; /* an x was found in the sequence */ entry_ *WITH; long FORLIM1; string *WITH1; if (*sequ->name != '\0') { if (sequ->f != NULL) sequ->f = freopen(sequ->name, "w", sequ->f); else sequ->f = fopen(sequ->name, "w"); } else { if (sequ->f != NULL) rewind(sequ->f); else sequ->f = tmpfile(); } if (sequ->f == NULL) _EscIO2(FileNotFound, sequ->name); SETUPBUF(sequ->f, Char); if (*makebkp->name != '\0') { if (makebkp->f != NULL) makebkp->f = freopen(makebkp->name, "w", makebkp->f); else makebkp->f = fopen(makebkp->name, "w"); } else { if (makebkp->f != NULL) rewind(makebkp->f); else makebkp->f = tmpfile(); } if (makebkp->f == NULL) _EscIO2(FileNotFound, makebkp->name); SETUPBUF(makebkp->f, Char); fprintf(makebkp->f, "sites %4.2f\n", version); fprintf(makebkp->f, "b\n"); /* book created */ putstring(makebkp, &std->name); /* title */ fprintf(makebkp->f, "\ny\n"); /* pieces numbered */ putmaxstring(makebkp, &std->name, &counter, (long)namemax); fprintf(makebkp->f, " organism key name\n"); putstring(makebkp, &std->name); fprintf(makebkp->f, " organism full name\n"); fprintf(makebkp->f, "from sites %4.2f\n\n", version); /* no notes */ fprintf(makebkp->f, "map units\n"); counter = 0; putmaxstring(makebkp, &std->name, &counter, (long)namemax); fprintf(makebkp->f, " chromosome key name\n"); putstring(makebkp, &std->name); fprintf(makebkp->f, " chromosome full name\n\n"); /* no notes */ fprintf(makebkp->f, "0\n"); /* chromosome map beginning */ fprintf(makebkp->f, "100\n"); /* chromosome map ending */ for (thenumber = 1; thenumber <= numentries; thenumber++) { /*zzz*/ WITH = data[thenumber-1]; /* putstring(sequ,seq); would just put the string out. But we want to convert x's to the standard: */ x_ = false; FORLIM1 = WITH->seq.length; for (l = 0; l < FORLIM1; l++) { WITH1 = &WITH->seq; if (WITH1->letters[l] == 'x') { putc(std->seq.letters[l], sequ->f); x_ = true; printf("\nNOTE: x bases replaced by standard in sequ.\n"); printf("See notes in book created using makebkseq.letters[l], sequ->f); } fprintf(sequ->f, ".\n"); /* key name of piece */ counter = 0; putmaxstring(makebkp, &WITH->name, &counter, (long)namemax); /* write(makebkp,'_'); counter := succ(counter); putmaxstring(makebkp,primer,counter,namemax); write(makebkp,'_'); counter := succ(counter); putmaxstring(makebkp, ex,counter,namemax); */ if (x_) { if (counter < namemax - 10) fprintf(makebkp->f, "_X_REMOVED"); else if (counter < namemax - 3) fprintf(makebkp->f, "_X!"); } putc('\n', makebkp->f); /* full name of piece */ putstring(makebkp, &WITH->name); putc('_', makebkp->f); putstring(makebkp, &WITH->primer); putc('_', makebkp->f); putstring(makebkp, &WITH->ex); putc('_', makebkp->f); putstring(makebkp, &WITH->date); putc('\n', makebkp->f); if (x_) { fprintf(makebkp->f, "The x bases were replaced by the standard sequence:\n"); putstring(makebkp, &WITH->seq); fprintf(makebkp->f, "\nThe standard sequence is:\n"); putstring(makebkp, &std->seq); putc('\n', makebkp->f); } fprintf(makebkp->f, "piece number %ld written by sites\n\n", thenumber); /* no more notes */ fprintf(makebkp->f, "0\n"); /* map beginning */ fprintf(makebkp->f, "l\n"); /* coordinate configuration */ fprintf(makebkp->f, "+\n"); /* coordinate direction */ fprintf(makebkp->f, "%ld\n", coords.fromrange); /* coordinate beginning */ fprintf(makebkp->f, "%ld\n", coords.torange); /* coordinate beginning */ fprintf(makebkp->f, "l\n"); /* piece configuration */ fprintf(makebkp->f, "+\n"); /* piece direction */ fprintf(makebkp->f, "%ld\n", coords.fromrange); /* piece beginning */ fprintf(makebkp->f, "%ld\n", coords.torange); /* piece beginning */ fprintf(makebkp->f, "n\n"); /* don't want to correct */ fprintf(makebkp->f, "0\n"); /* transcripts */ fprintf(makebkp->f, "0\n"); /* genes */ fprintf(makebkp->f, "0\n"); /* markers */ fprintf(makebkp->f, "n\n"); /* not new organism */ fprintf(makebkp->f, "n\n"); /* not new chromosome */ } /* writeln(sequ,'debug: display, thenumber=',thenumber:1); */ } #undef namemax /* end module sites.makesequ */ /* begin module sites.themain */ Static Void themain(database, standard, caps, latex, list, sorted, stats, tables, rsdata, sequ, makebkp) _TEXT *database, *standard, *caps, *latex, *list, *sorted, *stats, *tables, *rsdata, *sequ, *makebkp; { /* themain procedure of sites */ coordinates coords; /* coordinates of the current data set */ entries data; /* all of the sequences and associated data */ long duplicates; /* number of duplicate sequences in the set */ long experiment = 0; /* the number of separate experiments performed */ rblarray fi; /* experimental input frequencies (0,fracan, notfracan) */ rblarray fo; /* experimental output frequencies (raw sequence data) */ rblarray rho; /* the ratio of fo / fi */ rblarray fpi; /* the normalized input table (0.25's everywhere) */ rblarray fpo; /* the fi table normalized. */ rblarray global; /* fpo summed from all experiments, it is ONLY over the global randomized positions */ string header; /* header line */ rlarray ipe; /* information of experimental data normalized */ iblarray nbl; /* number of bases at position l */ ilarray nl; /* number of sequences at position l */ long number; /* index for clearing the data store */ long numentries; /* number of entries in the data set "data" */ entry_ *std = NULL; /* standard canonical sequence */ iblarray wildtype; /* number of bases at position l in the standard */ long totalentries = 0; _TEXT TEMP; /* the total number of sequence entries in the database */ printf("sites %4.2f\n", version); if (*caps->name != '\0') { if (caps->f != NULL) caps->f = freopen(caps->name, "w", caps->f); else caps->f = fopen(caps->name, "w"); } else { if (caps->f != NULL) rewind(caps->f); else caps->f = tmpfile(); } if (caps->f == NULL) _EscIO2(FileNotFound, caps->name); SETUPBUF(caps->f, Char); if (*latex->name != '\0') { if (latex->f != NULL) latex->f = freopen(latex->name, "w", latex->f); else latex->f = fopen(latex->name, "w"); } else { if (latex->f != NULL) rewind(latex->f); else latex->f = tmpfile(); } if (latex->f == NULL) _EscIO2(FileNotFound, latex->name); SETUPBUF(latex->f, Char); if (*list->name != '\0') { if (list->f != NULL) list->f = freopen(list->name, "w", list->f); else list->f = fopen(list->name, "w"); } else { if (list->f != NULL) rewind(list->f); else list->f = tmpfile(); } if (list->f == NULL) _EscIO2(FileNotFound, list->name); SETUPBUF(list->f, Char); if (*sorted->name != '\0') { if (sorted->f != NULL) sorted->f = freopen(sorted->name, "w", sorted->f); else sorted->f = fopen(sorted->name, "w"); } else { if (sorted->f != NULL) rewind(sorted->f); else sorted->f = tmpfile(); } if (sorted->f == NULL) _EscIO2(FileNotFound, sorted->name); SETUPBUF(sorted->f, Char); if (*stats->name != '\0') { if (stats->f != NULL) stats->f = freopen(stats->name, "w", stats->f); else stats->f = fopen(stats->name, "w"); } else { if (stats->f != NULL) rewind(stats->f); else stats->f = tmpfile(); } if (stats->f == NULL) _EscIO2(FileNotFound, stats->name); SETUPBUF(stats->f, Char); if (*tables->name != '\0') { if (tables->f != NULL) tables->f = freopen(tables->name, "w", tables->f); else tables->f = fopen(tables->name, "w"); } else { if (tables->f != NULL) rewind(tables->f); else tables->f = tmpfile(); } if (tables->f == NULL) _EscIO2(FileNotFound, tables->name); SETUPBUF(tables->f, Char); if (*rsdata->name != '\0') { if (rsdata->f != NULL) rsdata->f = freopen(rsdata->name, "w", rsdata->f); else rsdata->f = fopen(rsdata->name, "w"); } else { if (rsdata->f != NULL) rewind(rsdata->f); else rsdata->f = tmpfile(); } if (rsdata->f == NULL) _EscIO2(FileNotFound, rsdata->name); SETUPBUF(rsdata->f, Char); /* start the latex file */ fprintf(latex->f, "%% sites %4.2f\n", version); fprintf(latex->f, "%% LaTeX typesetting format: \n"); fprintf(latex->f, "\\documentstyle[12pt]{article}\n"); fprintf(latex->f, "\\textheight 9in %% this works\n"); fprintf(latex->f, "\\topmargin 0.0in %% -0.5 would shift the whole thing up\n"); fprintf(latex->f, "\\headheight 0in\n"); fprintf(latex->f, "\\headsep 0in\n"); fprintf(latex->f, "\\textwidth 6in\n"); fprintf(latex->f, "\\oddsidemargin 0.25in\n"); fprintf(latex->f, "\\begin{document}\n"); /* read the wildtype frequencies */ getrsdata(standard, &wildtype); /* get the header of the database */ readheader(database, &header); printf("Analysis of: \n"); TEMP.f = stdout; *TEMP.name = '\0'; putstring(&TEMP, &header); putchar('\n'); /* do a pass through the database to find the extent of the random regions */ findextent(database, &global); /* clear out the data array */ for (number = 0; number < maxentries; number++) data[number] = NULL; data[0] = (entry_ *)Malloc(sizeof(entry_)); /* now pass through the database analyzing the data */ readheader(database, &header); printf("\nPass through the experiments analyzing the data\n"); /* allow multiple readings of std */ while (!BUFEOF(database->f)) { if (P_peek(database->f) == '@') { experiment++; printf("Experiment %ld\n", experiment); /* As we pickup the experimental parameters, we also pick up the standard sequence separately from the rest of the data. This prevents it from being part of the analysis. */ readexpt(database, &coords, &std); continue; } /* do the work */ readdata(database, data, std, &numentries); sortdata(data, numentries, true); /* sorted on name */ display(caps, data, numentries, header, experiment, coords, std, 'c', '-', false, &duplicates); compress(data, &numentries, std, caps); totalentries += numentries; tabulate(data, numentries, coords, &nl, &nbl, &fo); normalize(std, &fi, fo, &rho, &fpi, &fpo); addrbl(fpo, &global); /* tell the world */ printf(" Number of entries in this experiment: %5ld\n", numentries); putc('\n', list->f); if (experiment > 1) { fprintf(list->f, "\f"); fprintf(latex->f, "\\newpage\n"); } display(list, data, numentries, header, experiment, coords, std, 's', ' ', false, &duplicates); fancytable(list, experiment, nl, nbl, std, false); fprintf(latex->f, "\\twocolumn\n"); fprintf(latex->f, "\\scriptsize\n"); fprintf(latex->f, "\\begin{verbatim}\n"); /*header,*/ figure(latex, data, numentries, experiment, coords, std, 's', ' ', true); fprintf(latex->f, "\\end{verbatim}\n"); fprintf(latex->f, "\\normalsize\n"); fancytable(latex, experiment, nl, nbl, std, true); dotables(tables, header, experiment, std, nbl, fi, fo, rho, fpi, fpo); /* by putting a 'z' in the standard positions, unchanged positions are considered to be further down the alphabet than changed ones. */ /*if not onname then */ crunch(data, numentries, std, 'z'); sortdata(data, numentries, false); /* sorted on sequence */ /* leave the data set as we found it, in case other operations are desired (no side effect) */ /*if not onname then */ uncrunch(data, numentries, std, 'z'); display(sorted, data, numentries, header, experiment, coords, std, 's', ' ', true, &duplicates); printf("number of duplicate sequences: %ld\n", duplicates); showstatistics(stats, coords, experiment, header, std, wildtype, nbl, duplicates); dostats(stats, experiment, std, wildtype, nl, nbl, fi, fo, fpi); } printf(" Number of entries in ALL experiments: %5ld\n", totalentries); /* convert global into a frequency table again. Note: this could have been done by dividing by 'experiment', but if the sum was not exactly 1 because of roundoff, error would propagate. By finding out what the sum is for every position, as is done in makefreq, the error should be minimized. */ makefreq(&global); fprintf(tables->f, "\n\f\n"); pageheader(tables, header, experiment, std); fprintf(tables->f, "Average of normalized frequency tables from all experiments:\n"); showrblarray(tables, global); calcinfo(fpi, global, &ipe); /* calculate normalized information table for experiment */ makersdata(rsdata, database, standard, global, ipe, nl); /* logo of experimental results */ /* note: bug: In this call, instead of nl we should use the global nl, or the appropriate (unknown) function for the effective numbers of bases at each position in the experiments. It is not known how to find that however. */ /* finish the typesetting */ fprintf(latex->f, "\\end{document}\n"); /* make raw sequences */ makesequ(sequ, makebkp, coords, data, numentries, std); } /* end module sites.themain */ main(argc, argv) int argc; Char *argv[]; { PASCAL_MAIN(argc, argv); if (setjmp(_JL1)) goto _L1; makebkp.f = NULL; strcpy(makebkp.name, "makebkp"); sequ.f = NULL; strcpy(sequ.name, "sequ"); rsdata.f = NULL; strcpy(rsdata.name, "rsdata"); stats.f = NULL; strcpy(stats.name, "stats"); standard.f = NULL; strcpy(standard.name, "standard"); sorted.f = NULL; strcpy(sorted.name, "sorted"); list.f = NULL; strcpy(list.name, "list"); latex.f = NULL; strcpy(latex.name, "latex"); caps.f = NULL; strcpy(caps.name, "caps"); tables.f = NULL; strcpy(tables.name, "tables"); database.f = NULL; strcpy(database.name, "database"); themain(&database, &standard, &caps, &latex, &list, &sorted, &stats, &tables, &rsdata, &sequ, &makebkp); _L1: if (database.f != NULL) fclose(database.f); if (tables.f != NULL) fclose(tables.f); if (caps.f != NULL) fclose(caps.f); if (latex.f != NULL) fclose(latex.f); if (list.f != NULL) fclose(list.f); if (sorted.f != NULL) fclose(sorted.f); if (standard.f != NULL) fclose(standard.f); if (stats.f != NULL) fclose(stats.f); if (rsdata.f != NULL) fclose(rsdata.f); if (sequ.f != NULL) fclose(sequ.f); if (makebkp.f != NULL) fclose(makebkp.f); exit(EXIT_SUCCESS); } /* End. */