/* Output from p2c 1.21alpha-07.Dec.93, the Pascal-to-C translator */ /* From input file "rseq.p" */ #include /* rseq: rsequence calculated from encoded sequences Dr. Thomas D. Schneider National Institutes of Health National Cancer Institute Center for Cancer Research Nanobiology Program Molecular Information Theory Group Frederick, Maryland 21702-1201 toms@ncifcrf.gov permanent email: toms@alum.mit.edu (use only if first address fails) http://www.ccrnp.ncifcrf.gov/~toms/ module libraries required: delman, matmods, prgmods, auxmods */ /* end of the program */ /* begin module version */ #define version 5.41 /* of rseq.p 2005 Sep 21 2005 Sep 21: 5.41 improved emptyfile test so 'encode' is written to rsdata' 2005 Sep 19: 5.40 emptyfile test for eof; await compiler problem solution 2000 Jun 28: 5.39 upgrade documentation 1997 June 19: 5.37 point to rsgra.p dalvec.p in see also. 1997 april 21: 5.36 upgrade to compress range to actually used region of encseq. This means the user can specify a range in encodep larger than the book/inst range. thesis version 4.46 completed 1984 mar 1 */ /* end module version */ /* begin module describe.rseq */ /* name rseq: rsequence calculated from encoded sequences synopsis rseq(encseq: in, cmp: in, rsdata: out, output: out) files encseq: the output of the encode program cmp: a composition from the comp program. if cmp is empty, then equal frequencies are assumed. rsdata: a display of the information content of each position of the sequences, with the sampling error variance. This output is ready to be used as input to rsgra or as data for genhis for plotting. output: messages to the user. description Encoded sequences from encseq are converted to a table of frequencies for each base (b) at each aligned position (l). rsequence(l) and the variance var(hnb) are calculated and shown along with their running sums. rsequence and the variance due to sampling error are shown for the whole site, but the running sums let one find rsequence and the variance for any subrange desired. n, the number of example sequences may vary with position, so both n and e(hnb) are shown. documentation Schneider, T.D., G.D. Stormo, L. Gold and A. Ehrenfeucht (1986) The information content of binding sites on nucleotide sequences. J. Mol. Biol. 188: 415-431. see also {INPUT} {the program that makes the input encseq file:} encode.p {the program that makes the input cmp file:} comp.p {OUTPUT:} {the program that uses the rsdata file for making logos:} dalvec.p {GRAPHICS:} {a program to make a graph of the curve (old):} rsgra.p {another program to make a graph of the curve:} xyplo.p {RELATED:} {a program to compute the small sample error:} calhnb.p {a related program to compute frequencies:} encfrq.p author Thomas D. Schneider bugs Does not handle di-nucleotides or longer oligos technical notes Constants maxsize (procedure calehnb) and kickover (procedure makehnblist) determine the largest n for which e(hnb) is used. Above this, ae(hnb) is used. Do not set these below 50 without careful analysis. Other constants are in module rseq.const. */ /* end module describe.rseq */ /* begin module rseq.const */ #define negativeinfinity (-1000000L) /* the definition of negative infinity for the w(b,l) matrix, 100x scaleup */ #define tolerance (-20.0) /* the definition of the tolerance of the w(b,l) matrix, in bits*/ #define posfield 4 /* field size for aligned sequence positions */ #define infofield 8 /* field size for bits */ #define infodecim 5 /* decimal places for bits */ #define nfield 4 /* size of field for printing n, the number of sites */ #define wfield 10 /* size of field for wmatrix */ /* end module rseq.const */ /* begin module vector.const */ #define maxvecpart 64 /* the number of elements in a 'part' of a vector */ #define vpagewidth 64 /* the width (in characters) of the output vector file from procedure writevector */ /* end module vector.const version = 'matmod 1.95 85 apr 18 tds/gds'; */ /* begin module encode.type.param */ /* the following types allow the user to specify parameters which will be used to encode the sequences in the book. */ /* spaces are allowed between the encoded bases, and the number of bases to be skipped between each encoded pair of bases are kept in this linked list of integers */ typedef struct spacelist { long skips; /* bases skipped to next coded base */ struct spacelist *next; /* points to next spacing number */ } spacelist; /* the encoding parameters for each region are stored in these records, and the records for each region are connected into a linked list of all the encoding parameters for the entire sequences */ typedef enum { start, stop } endpoints; /* end points of a coding region */ typedef struct parameter { /* these are the instructions for doing the coding */ long range[2]; /* the bases to be coded by these instructions, relative to the alignedbase */ long window; /* the number of bases included in a coding vector */ long wshift; /* movement of the window to the next site */ long coding; /* the 'level' at which the coding is being done, i.e., 1: mononucleotides; 2: dinucleotides; ... */ spacelist *spaces; /* the spacing between the encoded bases */ long cshift; /* shift to the next coding unit in the window */ /* values calculated at read-in time */ long wvlength; /* length of a window vector. this is coding raised to the 4th power */ long pvlength; /* parameter vector length. the vector made up of a series of window vectors. */ /* note: pvlength/wvlength is the number of windows in the parameter range */ struct parameter *next; /* set of instructions for coding another region */ } parameter; /* end module encode.type.param version = 'matmod 1.95 85 apr 18 tds/gds'; */ /* begin module vector.type */ /* vectors are useful for many applications, including the encoding of sequences. this vector type is designed to be flexible enough to be used whenever one needs a vector. it is designed as a linked list so it can contain as many elements as are ever needed. the actual 'vector' is a record containing the length and a pointer to the first 'vectorpart'. that vectorpart is a record containing an array of the first 'maxvecpart' elements (maxvecpart is a constant, from module vector.const, which must be included) and a pointer to the next 'vectorpart'. the elements are of type real so that either integers or reals may be used. */ typedef struct vectorpart { double numbers[maxvecpart]; struct vectorpart *next; } vectorpart; typedef struct vector { long length; vectorpart *part; } vector; /* end module vector.type version = 'matmod 1.95 85 apr 18 tds/gds'; */ /* begin module base.type */ /* declare the type base to avoid need for modules from delmods */ typedef enum { a, c, g, t } base; /* end module base.type version = 'auxmod 1.37 85 apr 4 gds/tds'; */ /* begin module comp.type */ /* the composition is stored in a tree of these nodes */ /* points to a node of the tree */ typedef struct compnode { /* a node of the composition tree */ long count; /* the number of oligos for this node */ struct compnode *son[4]; /* the pointers to 'descendants' of this node of the tree */ } compnode; /* spiders are used to make the composition tree */ /* points to a 'spider' */ typedef struct spider { /* a spider climbs the composition tree, its path determined by the sequences, and increments 'count' at all the nodes it passes, thereby determining the composition */ long depth; /* the level of the node now at */ compnode *place; /* a pointer to the current node */ struct spider *next; /* the next spider in the collection */ } spider; /* the total number of composition entries at a given level is stored in the linked list of type comptotal */ typedef struct comptotal { long count; /* the number at a given level */ struct comptotal *next; /* pointer to the next level totals */ } comptotal; /* 'path' is used in printing the tree */ /* pointer into a path on the tree */ typedef struct path { /* the path of bases to get to a particular node */ base bas; struct path *next; } path; /* end module comp.type version = 'auxmod 1.37 85 apr 4 gds/tds'; */ /* begin module hnblist.type */ /* a list of e(hnb) values for various n. by calculating these only once and saving them in this list, time can be saved. */ typedef struct ehnblist { long n; /* number of examples */ double ehnb; /* e(hnb) for n */ double varhnb; /* variance for e(hnb) for n */ Char aflag; /* 'a' if approximations used; 'e' if not */ struct ehnblist *next; /* next item on the list */ } ehnblist; /* end module hnblist.type */ /* ************************************************************************ */ /* begin module rseq.var */ /* encoded sequences, composition and output files */ Static _TEXT encseq, cmp, rsdata; Static jmp_buf _JL1; /* end module rseq.var */ /* begin module halt */ Static Void halt() { /* stop the program. the procedure performs a goto to the end of the program. you must have a label: label 1; declared, and also the end of the program must have this label: 1: end. examples are in the module libraries. this is the only goto in the delila system. */ printf(" program halt.\n"); longjmp(_JL1, 1); } /* end module halt version = 6.56; (@ of alist.p 2005 Sep 21 */ /* begin module copyaline */ Static Void copyaline(fin, fout) _TEXT *fin, *fout; { /* copy a line from file fin to file fout */ while (!P_eoln(fin->f)) { putc(P_peek(fin->f), fout->f); getc(fin->f); } fscanf(fin->f, "%*[^\n]"); getc(fin->f); putc('\n', fout->f); } /* copyaline */ #define debugging false #define boundary 2 /* end module copyaline version = 6.56; (@ of alist.p 2005 Sep 21 */ /* begin module emptyfile */ Static boolean emptyfile(afile) _TEXT *afile; { /* This function is a replacement for eof() to get around a bug in the GPC compiler. The GPC bug is that end-of-file, as provided by eof(), is false when a file is empty. The way to get around it is to count characters in the file to determine if it is empty. Fortunately this is fast for large files because the count stops once a single or a few characters are seen. A single character will not do unfortunately because GPC will give an '*' for an empty file. So the test is for having a certain number (boundary) or fewer characters. This is not great, but it only means that the file must be at least 'boundary' characters long. Furthermore, since the file is being READ, the emptyfile test CANNOT be used in a loop to test for eof! Use emptyfile only to check that the file is empty. Use eof inside loops IF the file is not empty. Dr. Thomas D. Schneider National Institutes of Health National Cancer Institute Center for Cancer Research Nanobiology Program Molecular Information Theory Group Frederick, Maryland 21702-1201 toms@ncifcrf.gov permanent email: toms@alum.mit.edu (use only if first address fails) http://www.ccrnp.ncifcrf.gov/~toms/ */ boolean Result; /* one needs at least this many characters for a file to be considered 'non empty'. */ long lines = 0; /* count of lines in the file */ long chars = 0; /* count of characters in the file */ Char ch; /* a character in the file */ if (*afile->name != '\0') { if (afile->f != NULL) afile->f = freopen(afile->name, "r", afile->f); else afile->f = fopen(afile->name, "r"); } else rewind(afile->f); if (afile->f == NULL) _EscIO2(FileNotFound, afile->name); RESETBUF(afile->f, Char); while (!BUFEOF(afile->f) && chars < boundary) { if (P_eoln(afile->f)) { lines++; fscanf(afile->f, "%*[^\n]"); getc(afile->f); continue; } ch = getc(afile->f); if (ch == '\n') ch = ' '; if (debugging) { printf("ord(ch) = %12d\n", ch); printf(" ch = %c\n", ch); } chars++; } if (chars < boundary) Result = true; else Result = false; if (debugging) { printf("lines = %ld\n", lines); printf("chars = %ld\n", chars); if (chars < boundary) printf(" empty (file)\n"); else printf("NOT empty (file)\n"); } if (*afile->name != '\0') { if (afile->f != NULL) afile->f = freopen(afile->name, "r", afile->f); else afile->f = fopen(afile->name, "r"); } else rewind(afile->f); if (afile->f == NULL) _EscIO2(FileNotFound, afile->name); RESETBUF(afile->f, Char); /* put the file back at the start!! */ return Result; } #undef debugging #undef boundary /* end module emptyfile version = 6.56; (@ of alist.p 2005 Sep 21 */ /* ************************************************************************ */ /* begin module encode.readencpar */ /* read encode parameters */ Static long vectorsize(param) parameter *param; { /* determine the size of the vector generated by a string of parameter records, begin with 'param' */ long size = 0; /* of the vector, so far */ while (param != NULL) { size += param->pvlength; param = param->next; } return size; } Static long paramsize(param) parameter *param; { /* determine the size of the vector generated by one parameter record */ long rangesize; /* the number of bases in the range */ long numwindows; /* the number of windows which start in the range, taking into account the shifting */ rangesize = param->range[(long)stop] - param->range[(long)start] + 1; numwindows = (long)((rangesize - 1.0) / param->wshift + 1); return (numwindows * param->wvlength); } /* paramsize */ Static Void readencpar(thefile, param, regions, vectorlength) _TEXT *thefile; parameter **param; long *regions, *vectorlength; { /* read the parameter list from 'thefile' and put the information into the linked parameter list, beginning with 'param'; the number of parameter records is returned in 'regions'. the length of each vector is vectorlength */ parameter *aparam; /* from the parameter list */ parameter *newparam; /* the new parameter record from 'thefile' */ spacelist *firstspaces; /* in the spaces list */ spacelist *newspaces; /* for the spaces list */ long i, j; /* indices */ Char ch; /* to check for proper positioning */ long FORLIM, FORLIM1; if (*thefile->name != '\0') { if (thefile->f != NULL) thefile->f = freopen(thefile->name, "r", thefile->f); else thefile->f = fopen(thefile->name, "r"); } else rewind(thefile->f); if (thefile->f == NULL) _EscIO2(FileNotFound, thefile->name); RESETBUF(thefile->f, Char); if (emptyfile(thefile)) { printf(" encoded sequence file is empty\n"); halt(); } aparam = (parameter *)Malloc(sizeof(parameter)); if (*param == NULL) *param = (parameter *)Malloc(sizeof(parameter)); aparam = *param; /* skip the header */ for (i = 1; i <= 2; i++) { fscanf(thefile->f, "%*[^\n]"); getc(thefile->f); } /* skip a blank line in the header */ fscanf(thefile->f, "%*[^\n]"); getc(thefile->f); /* read the number of regions */ fscanf(thefile->f, "%ld%*[^\n]", regions); getc(thefile->f); FORLIM = *regions; for (i = 1; i <= FORLIM; i++) { /* read the range */ fscanf(thefile->f, "%ld", aparam->range); do { ch = getc(thefile->f); if (ch == '\n') ch = ' '; } while (ch != 'o'); fscanf(thefile->f, "%ld%*[^\n]", &aparam->range[(long)stop]); getc(thefile->f); /* read the window size and shift */ fscanf(thefile->f, "%ld%*[^\n]", &aparam->window); getc(thefile->f); fscanf(thefile->f, "%ld%*[^\n]", &aparam->wshift); getc(thefile->f); /* read the coding level and arrangement */ fscanf(thefile->f, "%ld", &aparam->coding); aparam->spaces = (spacelist *)Malloc(sizeof(spacelist)); if (aparam->coding > 1) { do { ch = getc(thefile->f); if (ch == '\n') ch = ' '; } while (ch != ':'); firstspaces = (spacelist *)Malloc(sizeof(spacelist)); aparam->spaces = firstspaces; fscanf(thefile->f, "%ld", &aparam->spaces->skips); FORLIM1 = aparam->coding; for (j = 2; j < FORLIM1; j++) { newspaces = (spacelist *)Malloc(sizeof(spacelist)); aparam->spaces->next = newspaces; aparam->spaces = newspaces; fscanf(thefile->f, "%ld", &aparam->spaces->skips); } aparam->spaces->next = NULL; aparam->spaces = firstspaces; } else aparam->spaces = NULL; fscanf(thefile->f, "%*[^\n]"); getc(thefile->f); fscanf(thefile->f, "%ld%*[^\n]", &aparam->cshift); getc(thefile->f); /* set up wvlength */ aparam->wvlength = (long)floor(exp(aparam->coding * log(4.0)) + 0.5); /* set up pvlength */ aparam->pvlength = paramsize(aparam); if (i < *regions) { newparam = (parameter *)Malloc(sizeof(parameter)); aparam->next = newparam; aparam = newparam; } } aparam->next = NULL; /* read the vector size */ fscanf(thefile->f, "%ld%*[^\n]", vectorlength); getc(thefile->f); if (*vectorlength == vectorsize(*param)) return; printf(" vector length in the encoded file\n"); printf(" does not correspond to the parameters\n"); halt(); } /* end module encode.readencpar version = 'matmod 1.95 85 apr 18 tds/gds'; */ /* ************************************************************************ */ /* begin module vector.primitives */ /* these functions and procedures do manipulations on vectors. */ Static double vget(v, pos) vector v; long pos; { /* this returns from vector 'v' the value of the element at position 'pos' */ long i; if (pos > v.length || pos < 1) { printf( " error in call to function vget: position %ld is beyond the end of the vector\n", pos); halt(); } /* move to the correct 'vectorpart' */ for (i = 1; i <= (pos - 1) / maxvecpart; i++) v.part = v.part->next; /* get the proper array element from this part */ return (v.part->numbers[(pos - 1) & (maxvecpart - 1)]); } Static Void vput(v, pos, number) vector *v; long pos; double number; { /* this puts into vector 'v' the value of 'number' at position 'pos' */ long i; vectorpart *firstpart; /* of the vector */ if (pos > v->length || pos < 1) { printf( " error in call to function vput: position %ld is beyond the end of the vector\n", pos); halt(); } /* move to the correct 'vectorpart' */ firstpart = v->part; for (i = 1; i <= (pos - 1) / maxvecpart; i++) v->part = v->part->next; /* put the 'number' into the proper array element for this part */ v->part->numbers[(pos - 1) & (maxvecpart - 1)] = number; v->part = firstpart; } Static Void makevector(v, l) vector *v; long l; { /* create the linked list of vector-parts which are needed for a vector of length 'l' */ long numparts; /* number of parts needed for the vector */ long i; /* index to the parts of the vector */ vectorpart *firstpart; /* of the vector */ vectorpart *newpart; /* of the vector */ if (l < 1) { printf(" makevector: positive length required\n"); halt(); } v->length = l; v->part = (vectorpart *)Malloc(sizeof(vectorpart)); firstpart = v->part; numparts = (v->length - 1) / maxvecpart + 1; for (i = 1; i < numparts; i++) { newpart = (vectorpart *)Malloc(sizeof(vectorpart)); v->part->next = newpart; v->part = newpart; } v->part->next = NULL; v->part = firstpart; } /* end module vector.primitives version = 'matmod 1.95 85 apr 18 tds/gds'; */ /* begin module vector.readvector */ Static Void readvector(thefile, v) _TEXT *thefile; vector *v; { /* read the elements of v into v from 'thefile'. v must already be set up (all the parts created and linked together) before calling. 'thefile' must contain, from the cursor point until the end of the vector, only numbers, either integers or reals; otherwise it will bomb. the vector ends with an end of line so that end of file can be tested for. */ long i, j; /* indecies */ long numparts; /* number of parts in the vector */ long lastpart; /* the number of elements in the last vector part */ vectorpart *firstpart; /* pointer to the first vector part */ firstpart = v->part; numparts = (v->length - 1) / maxvecpart + 1; lastpart = ((v->length - 1) & (maxvecpart - 1)) + 1; for (i = 1; i < numparts; i++) { for (j = 0; j < maxvecpart; j++) fscanf(thefile->f, "%lg", &v->part->numbers[j]); v->part = v->part->next; } for (j = 0; j < lastpart; j++) fscanf(thefile->f, "%lg", &v->part->numbers[j]); fscanf(thefile->f, "%*[^\n]"); getc(thefile->f); v->part = firstpart; } /* end module vector.readvector version = 'matmod 1.95 85 apr 18 tds/gds'; */ /* begin module vector.vset */ Static Void vset(thevalue, v) double thevalue; vector *v; { /* set the value thevalue into v */ long i; /* an index */ long FORLIM; FORLIM = v->length; for (i = 1; i <= FORLIM; i++) vput(v, i, thevalue); } /* end module vector.vset version = 'matmod 1.95 85 apr 18 tds/gds'; */ /* begin module vector.vadd */ Static Void vadd(a_, b) vector a_, *b; { /* add the vector a into the vector b */ long i; /* an index */ if (a_.length != b->length) { printf(" function vadd: vector lengths must be equal\n"); halt(); } for (i = 1; i <= a_.length; i++) vput(b, i, vget(*b, i) + vget(a_, i)); } /* end module vector.vadd version = 'matmod 1.95 85 apr 18 tds/gds'; */ /* ************************************************************************ */ /* begin module comp.readcomp */ /* begin module skipoligo */ Static Void skipoligo(thefile) _TEXT *thefile; { /* this procedure is used to skip over an oligonucleotide string. it reads until it comes to a base, and then continues to read until it comes to a blank. no checking is done for non-base characters in between. */ Char ch; do { ch = getc(thefile->f); if (ch == '\n') ch = ' '; } while (ch != 't' && ch != 'g' && ch != 'c' && ch != 'a'); do { ch = getc(thefile->f); if (ch == '\n') ch = ' '; } while (ch != ' '); } /* for efficient reading of the information into the tree, each node that has a successor is stored in a queue as well as put into the tree. the variables of the type list are the queue elements. */ typedef struct list { compnode *item; /* points to the composition tree node */ struct list *next; /* points to the next list item in the queue */ } list; /* Local variables for readcomp: */ struct LOC_readcomp { list *freeitem; /* the list of unused list pointers */ } ; /* getitem and clearitem provide efficient use of linked list storage by keeping a list of unused pointers that can be allocated instead of always creating 'new' ones */ Local Void getitem(l, LINK) list **l; struct LOC_readcomp *LINK; { /* obtain a listitem from the free list or by making a new one */ if (LINK->freeitem != NULL) { *l = LINK->freeitem; LINK->freeitem = LINK->freeitem->next; } else *l = (list *)Malloc(sizeof(list)); (*l)->next = NULL; } Local Void clearitem(l, LINK) list **l; struct LOC_readcomp *LINK; { /* return a listitem to the free list */ list *lptr; if (*l == NULL) return; lptr = *l; *l = (*l)->next; lptr->next = LINK->freeitem; LINK->freeitem = lptr; } /* end module skipoligo version = 'auxmod 1.37 85 apr 4 gds/tds'; */ Static Void readcomp(comp, compmax, readmax, root, monocomptotal) _TEXT *comp; long *compmax, readmax; compnode **root; comptotal **monocomptotal; { /* this procedure requires modules: comp.type, skipoligo, halt; this procedure reads from file 'comp' a composition and puts it into the tree pointed to by the 'root' pointer. 'compmax' is the depth of the composition tree which is stored. it is the minimum of 'readmax', the requested depth, and 'detcomp', the depth for which the input file composition was determined. 'monocomptotal' points to the beginning (for the monos) of a linked list which gives the totals for each level of the composition. */ struct LOC_readcomp V; list *listitem; /* an item in the queue */ list *first; /* the first item in the queue */ list *last; /* the last item in the queue */ comptotal *comptot; /* the comp total for a given level */ comptotal *newcomptot; /* for adding to the string of comptot"s */ long detcomp; /* the level to which the input composition was determined */ long level; /* of the composition being read, i.e., monos, dis, ... */ long number; /* read from the 'comp' file */ Char ch; /* for reading from 'comp' */ base ba; /* an index */ if (*comp->name != '\0') { if (comp->f != NULL) comp->f = freopen(comp->name, "r", comp->f); else comp->f = fopen(comp->name, "r"); } else rewind(comp->f); if (comp->f == NULL) _EscIO2(FileNotFound, comp->name); RESETBUF(comp->f, Char); if (emptyfile(comp)) { printf(" error: no composition file provided\n"); halt(); } fscanf(comp->f, "%*[^\n]"); getc(comp->f); /* skip the program identification */ fscanf(comp->f, "%*[^\n]"); getc(comp->f); /* skip the book identification */ fscanf(comp->f, "%ld%*[^\n]", &detcomp); getc(comp->f); /* obtain the determined composition */ fscanf(comp->f, "%*[^\n]"); getc(comp->f); /* skip the blank line */ /* determine the level of composition to be stored */ if (readmax < 1) { printf("\n warning: 0 or negative oligo length requested\n"); printf(" composition used is depth %ld\n\n", detcomp); *compmax = detcomp; } else if (readmax > detcomp) { printf("\n warning: requested composition oligo length (%ld)\n", readmax); printf(" is larger than the determined composition oligo length (%ld).\n", detcomp); printf(" composition used is to depth %ld\n\n", detcomp); *compmax = detcomp; } else *compmax = readmax; *root = (compnode *)Malloc(sizeof(compnode)); *monocomptotal = (comptotal *)Malloc(sizeof(comptotal)); comptot = (comptotal *)Malloc(sizeof(comptotal)); first = (list *)Malloc(sizeof(list)); Malloc(sizeof(list)); /* p2c: rseq.p: Note: Eliminated unused assignment statement [338] */ first->item = *root; first->next = NULL; last = first; V.freeitem = (list *)Malloc(sizeof(list)); V.freeitem->next = NULL; /* read in the total number of bases from the composition */ fscanf(comp->f, "%*[^\n]"); getc(comp->f); /* skip the * */ fscanf(comp->f, "%*[^\n]"); getc(comp->f); /* skip the information line */ fscanf(comp->f, "%ld%*[^\n]", &number); getc(comp->f); (*root)->count = number; do { ch = getc(comp->f); /* skip the space */ if (ch == '\n') ch = ' '; ch = getc(comp->f); /* the ch determines what to do next */ if (ch == '\n') ch = ' '; if (ch == '*') { /* determine the level about to be read */ fscanf(comp->f, "%*[^\n]"); getc(comp->f); fscanf(comp->f, "%ld%*[^\n]", &level); getc(comp->f); if (level == 1) *monocomptotal = comptot; else { newcomptot = (comptotal *)Malloc(sizeof(comptotal)); comptot->next = newcomptot; comptot = newcomptot; } comptot->count = 0; comptot->next = NULL; } else { for (ba = a; (long)ba <= (long)t; ba = (base)((long)ba + 1)) { skipoligo(comp); fscanf(comp->f, "%ld", &number); if (number != 0) { first->item->son[(long)ba] = (compnode *)Malloc(sizeof(compnode)); first->item->son[(long)ba]->count = number; comptot->count += number; getitem(&listitem, &V); last->next = listitem; last = listitem; last->next = NULL; last->item = first->item->son[(long)ba]; } else first->item->son[(long)ba] = NULL; } clearitem(&first, &V); fscanf(comp->f, "%*[^\n]"); getc(comp->f); } } while (level != *compmax); /* read the composition values */ /* for the last level of compositions to be read we don"t need to store the nodes in the queue because their successors will not be read in. */ do { for (ba = a; (long)ba <= (long)t; ba = (base)((long)ba + 1)) { skipoligo(comp); fscanf(comp->f, "%ld", &number); if (number != 0) { first->item->son[(long)ba] = (compnode *)Malloc(sizeof(compnode)); first->item->son[(long)ba]->count = number; comptot->count += number; } else first->item->son[(long)ba] = NULL; } clearitem(&first, &V); fscanf(comp->f, "%*[^\n]"); getc(comp->f); } while (first != NULL); } /* readcomp */ Static long getcount(root, start_) compnode *root; path *start_; { /* this function follows the tree from the node 'root' (which may be the root of a subtree) along the path initiated with 'start', and returns the count of the resulting node. if the path through the tree ever hits a null node in the process the count is returned to be zero. */ compnode *place = root; /* a place in the composition tree */ path *point = start_; /* a point in the path through the tree */ while (place != NULL && point != NULL) { place = place->son[(long)point->bas]; point = point->next; } if (place == NULL) return 0; else return (place->count); } /* end module comp.readcomp version = 'auxmod 1.37 85 apr 4 gds/tds'; */ /* begin module comp.getmonocomposition */ Static Void getmonocomposition(cmp, gna, gnc, gng, gnt, data, equalmono) _TEXT *cmp; long *gna, *gnc, *gng, *gnt; _TEXT *data; boolean *equalmono; { /* get the mononucleotide composition in gna to gnt from the file cmp, messages about what is returned go to data. if the file cmp is empty, equal mononucleotides are assumed and equalmono is true */ /* variables for composition procedures readcomp and getcount */ long compmax; compnode *root; comptotal *monocomptotal; path *start_; base b1; /* base for comp */ long count; /* one of the base counts */ if (*cmp->name != '\0') { if (cmp->f != NULL) cmp->f = freopen(cmp->name, "r", cmp->f); else cmp->f = fopen(cmp->name, "r"); } else rewind(cmp->f); if (cmp->f == NULL) _EscIO2(FileNotFound, cmp->name); RESETBUF(cmp->f, Char); if (emptyfile(cmp)) { fprintf(data->f, "* the composition file is empty.\n"); fprintf(data->f, "* (equal mononucleotides assumed)\n"); *gna = 1; *gnc = 1; *gng = 1; *gnt = 1; *equalmono = true; return; } putc('*', data->f); copyaline(cmp, data); putc('*', data->f); copyaline(cmp, data); readcomp(cmp, &compmax, 1L, &root, &monocomptotal); /* obtain the mono composition */ start_ = (path *)Malloc(sizeof(path)); start_->next = NULL; for (b1 = a; (long)b1 <= (long)t; b1 = (base)((long)b1 + 1)) { start_->bas = b1; count = getcount(root, start_); switch (b1) { case a: *gna = count; break; case c: *gnc = count; break; case g: *gng = count; break; case t: *gnt = count; break; } } Free(start_); *equalmono = false; } /* getmonocomposition */ #define maxsize 200 /* the largest n allowed */ #define accuracy 10000 /* end module comp.getmonocomposition */ /* ************************************************************************ */ /* begin module calehnb */ Static Void calehnb(n, gna, gnc, gng, gnt, hg, ehnb, varhnb) long n, gna, gnc, gng, gnt; double *hg, *ehnb, *varhnb; { /* ; debugging: boolean */ /* calculate e(hnb) in bits/bp (ehnb) for a number (n) of example sequence sites. gna to gnt are the composition to use for the genome probabilities of a to t. the genomic entropy hg and the variance var(hnb) (=varhnb) are also calculated. if the variable debugging is passed to the procedure then the individual combinations of hnb are displayed. note: this procedure should not be broken into smaller procedures so that it remains efficient. version = 3.02; of procedure calehnb 1983 nov 23 */ /* less than (1/accuracy) bits error is demanded for the sum of pnb (see variable 'total') at the end of the procedure */ double log2 = log(2.0); /* natural log of 2, used to find log base 2 */ double logn; /* log of n */ double nlog2; /* n * log2 */ long gn; /* sum of gna..gnt */ double logpa, logpc, logpg, logpt; /* logs of genome probabilities */ /* log of n factorial is the sum of i=1 to n of log(i). the array below represents these logs up to n */ double logfact[maxsize + 1]; /* precalculated values of -p*log2(p), where p=nb/n for nb = 0 .. n. m stands for minus */ double mplog2p[maxsize + 1]; long i; /* index for logfact and mplog2p */ double logi; /* natural log of i */ long na; long nc = 0, ng = 0, nt = 0; /* numbers of bases in a site */ boolean done = false; /* true when the loop is completed */ double pnb; /* multinomial probability of a combination of na, nc, ng, nt */ double hnb; /* entropy for a combination of na..nt */ double pnbhnb; /* pnb*hnb, an intermediate result */ double sshnb = 0.0; /* sum of squares of hnb */ /* variables for testing program correctness: */ double total = 0.0; /* sum of pnb over all combinations of na..nt if this is not 1.00, the program is in error */ long counter = 0; /* counts the number of times through the loop */ /* prevent access to outside the arrays: */ if (n > maxsize) { printf(" procedure calehnb: n > maxsize (%ld>%ld)\n", n, (long)maxsize); halt(); } logn = log((double)n); nlog2 = n * log2; /* get logs of genome probabilities */ gn = gna + gnc + gng + gnt; logpa = log((double)gna / gn); logpc = log((double)gnc / gn); logpg = log((double)gng / gn); logpt = log((double)gnt / gn); /* find genomic entropy */ *hg = -((gna * logpa + gnc * logpc + gng * logpg + gnt * logpt) / (gn * log2)); *ehnb = 0.0; /* start error entropy at zero */ /* make table of log of n factorial up to n and entropies for nb/n */ logfact[0] = 0.0; /* factorial(0) = 0 */ mplog2p[0] = 0.0; for (i = 1; i <= n; i++) { logi = log((double)i); logfact[i] = logfact[i-1] + logi; mplog2p[i] = i * (logn - logi) / nlog2; } /* begin by looking at the combination with all a: na = n */ na = n; /* the following loop simulates a number of nested loops of the form: for b1=a to t do for b2=b1 to t do for b3=b2 to t do ... for bn=b(n-1) to t do ... the resulting set of variables increase in alphabetic order since no inner loop variable can have a value less than any outer loop. the number of times through the inner-most loop is given by: o = (n + 1)*(n + 2)*(n + 3)/6 in the case where there are four symbols (a,c,g,t) and n is the number of nested loops. a recursive set of loops would be possible, but it would use up too much memory in practical cases (up to n=150 or higher). a second algorithm sequests the loop variables into an array and increments them there. however, the goal is to get all possible combinations for na, nc, ng, nt, where the sum of these is n. the nested loops provide all the combinations in alphabetic order, assuring that there can not be any duplicates. to find nb (one of na..nt) one would look at which of the variables b1 to bn were of value b. this is a wasteful operation. the loop below simulates the array of control variables by changing each nb directly. */ do { /* pnb is calculated by taking the log of the expression fact(n) na nc ng nt pnb = ------------------- pa * pc * pg * pt . fact(na).. fact(nt) log(pnb) generates a series of sums, allowing the calculation to proceed by addition and multiplication rather than multiplication and exponentiation. the factorials become tractable in this way */ pnb = exp(logfact[n] - logfact[na] - logfact[nc] - logfact[ng] - logfact[nt] + na * logpa + nc * logpc + ng * logpg + nt * logpt); /* n factorial */ hnb = mplog2p[na] + mplog2p[nc] + mplog2p[ng] + mplog2p[nt]; pnbhnb = pnb * hnb; *ehnb += pnbhnb; sshnb += pnbhnb * hnb; /* sum of squares of hnb */ /* the following section keeps track of the calculation and writes out the current set of nb. */ counter++; /* if debugging then begin write(output,' ',counter:2,' '); for i := 1 to na do write(output,'a'); for i := 1 to nc do write(output,'c'); for i := 1 to ng do write(output,'g'); for i := 1 to nt do write(output,'t'); write(output,' ',na:3,nc:3,ng:3,nt:3); writeln(output,' pnb = ',pnb:10:5); end; */ total += pnb; /* the remaining portion of this repeat loop generates the values of na, nc, ng and nt. notice that there are 7 possibilities at each loop increment. other than the stop, in each case the sum of na+nc+ng+nt remains constant (=n). */ if (nt > 0) { /* ending on a t - do outer loops */ if (ng > 0) { /* turn g into t */ ng--; nt++; } else if (nc > 0) { /* turn one c into g, and all t to g (note ng = 0 initially) */ nc--; ng = nt + 1; nt = 0; } else if (na > 0) { /* turn one a into c and all g and t to c. (note ng=nc=0 initially) */ na--; nc = nt + 1; nt = 0; } else done = true; /* since nt = n */ } else { if (ng > 0) { /* turn g into t */ ng--; nt++; } else if (nc > 0) { /* turn c into g */ nc--; ng++; } else { na--; nc++; /* na > 0; turn a into c */ } } } while (!done); /* no t - increment innermost loop */ /* final adjustment: we only have the sum of squares so far */ *varhnb = sshnb - *ehnb * *ehnb; /* if this message appears, there is either a bug in the code or the computer cannot be as accurate as requested */ if (accuracy != (long)floor(accuracy * total + 0.5)) { printf(" procedure calehnb: the sum of probabilities is\n"); printf(" not accurate to one part in %ld\n", (long)accuracy); printf(" the sum of the probabilities is %10.8f\n", total); } /* if this message appear, then there is an error in the repeat-until loop: it did not repeat as many times as is expected from the algorithm */ if (counter == (long)floor((n + 1.0) * (n + 2) * (n + 3) / 6 + 0.5)) return; /* writeln(output, ' total: ',total:10:5); writeln(output,' count = ',counter:1); writeln(output,' (n+1)*(n+2)*(n+3)/6 = ', round((n+1)*(n+2)*(n+3)/6):1); */ printf(" procedure calehnb: program error, the number of\n"); printf(" calculations is in error\n"); halt(); } /* calehnb */ #undef maxsize #undef accuracy /* end module calehnb version = 2.19; (@ of calhnb 1986 mar 31 */ /* begin module calaehnb */ Static Void calaehnb(n, gna, gnc, gng, gnt, hg, aehnb, avarhnb) long n, gna, gnc, gng, gnt; double *hg, *aehnb, *avarhnb; { /* calculate ae(hnb) in bits/bp (=aehnb) for a number (n) of example sequence sites. gna to gnt are the composition to use for the genome probabilities of a to t. the genomic entropy (=hg) and the variance avar(hnb) (=avarhnb) are also calculated. this procedure gives approximations for e(hnb) and var(hnb). it is based on a formula derived by jeff haemer. see also: g. p. basharin theory probability appl. 4(3): 333-336 (1959) 'on a statistical estimate for the entropy of a sequence of independent random variables' version = 3.01; of procedure calaehnb 1983 nov 23 */ double log2 = log(2.0); /* natural log of 2 */ long gn; /* sum of gna..gnt */ double pa, pc, pg, pt; /* genomic probabilities */ double e; /* the approximate sampling error */ gn = gna + gnc + gng + gnt; pa = (double)gna / gn; pc = (double)gnc / gn; pg = (double)gng / gn; pt = (double)gnt / gn; *hg = -((pa * log(pa) + pc * log(pc) + pg * log(pg) + pt * log(pt)) / log2); e = 3 / (2 * log2 * n); *aehnb = *hg - e; *avarhnb = e * e; } /* calaehnb */ #define kickover 50 /* Local variables for makehnblist: */ struct LOC_makehnblist { long gna, gnc, gng, gnt; double *hg; long num; } ; Local Void fill(l, LINK) ehnblist **l; struct LOC_makehnblist *LINK; { /* fill l with the (global) values of n from at nindex */ if (LINK->num == 0) { /*zzz*/ /* writeln(output,' do not use encodings that reach', ' beyond the sequences'); halt */ (*l)->aflag = ' '; /* no computation to be done here */ return; } (*l)->n = LINK->num; if (LINK->num <= kickover) { calehnb(LINK->num, LINK->gna, LINK->gnc, LINK->gng, LINK->gnt, LINK->hg, &(*l)->ehnb, &(*l)->varhnb); (*l)->aflag = 'e'; /* not using approximation */ } else { /* using approximation */ calaehnb(LINK->num, LINK->gna, LINK->gnc, LINK->gng, LINK->gnt, LINK->hg, &(*l)->ehnb, &(*l)->varhnb); (*l)->aflag = 'a'; } } /* fill */ /* end module calaehnb version = 2.19; (@ of calhnb 1986 mar 31 */ /* begin module hnblist.makehnblist */ Static Void makehnblist(n, gna_, gnc_, gng_, gnt_, l, hg_) vector n; long gna_, gnc_, gng_, gnt_; ehnblist **l; double *hg_; { /* make a sorted list l of e(hnb) values based on the vector of samples n and the genomic composition gna to gnt. also, return hg for the genomic information */ struct LOC_makehnblist V; /* e(hnb) for n values above this are estimated from ae(hnb) = hg - 3/(2 ln(2) n) while those below are calculated exactly */ long nindex = 1; /* index to n */ /* hold the value of n at nindex to avoid multiple calls to vget */ ehnblist *lindex; /* index to l */ ehnblist *spare; /* a spare pointer for construction */ boolean done; /* have we finished searching l? */ /* insert somewhere in l */ V.gna = gna_; V.gnc = gnc_; V.gng = gng_; V.gnt = gnt_; V.hg = hg_; *l = (ehnblist *)Malloc(sizeof(ehnblist)); /* start the list up */ /* for the first one */ V.num = (long)floor(vget(n, nindex) + 0.5); fill(l, &V); (*l)->next = NULL; for (nindex = 2; nindex <= n.length; nindex++) { V.num = (long)floor(vget(n, nindex) + 0.5); /*write(output,' nindex = ',nindex:1); (@ debug */ /*write(output,' num = ',num:1);(@debug*/ if (V.num < (*l)->n) { spare = (ehnblist *)Malloc(sizeof(ehnblist)); spare->next = *l; *l = spare; fill(l, &V); } /* insert before, to become the first on the list */ /*writeln(output,' first on list'); (@debug*/ else { lindex = *l; done = false; while (!done) { if (lindex->next == NULL) { done = true; break; } if (lindex->next->n <= V.num) lindex = lindex->next; else done = true; } if (lindex->n != V.num) { /*write(output,' accepted'); (@debug*/ /* new number to insert, otherwise ignore */ if (lindex->next == NULL) { /* add to end */ lindex->next = (ehnblist *)Malloc(sizeof(ehnblist)); lindex = lindex->next; lindex->next = NULL; } /*writeln(output,' add to end'); (@debug*/ else { spare = (ehnblist *)Malloc(sizeof(ehnblist)); spare->next = lindex->next; lindex->next = spare; lindex = spare; } /* insert to middle of list */ /*writeln(output,' insert to middle'); (@debug*/ fill(&lindex, &V); } /* else: reject this n since it is already on the list */ /*else writeln(output,' rejected') (@debug*/ } } } /* makehnblist */ #undef kickover /* end module hnblist.makehnblist */ /* begin module hnblist.gethnb */ Static Void gethnb(l, n, ehnb, varhnb, aflag) ehnblist *l; long n; double *ehnb, *varhnb; Char *aflag; { /* get the e(hnb) [ehnb] value for n example sites by looking in the precalculated list l. also return the variance of hnb, and whether these are approximations. */ ehnblist *lindex = l; /* index to l */ while (lindex->n != n && lindex->next != NULL) lindex = lindex->next; if (lindex->n != n) { printf(" gethnb: program error.\n"); printf(" unable to find n = %ld\n", n); halt(); } /* oh oh, something is wrong */ *ehnb = lindex->ehnb; *varhnb = lindex->varhnb; *aflag = lindex->aflag; } /* gethnb */ /* end module hnblist.gethnb */ /* ************************************************************************ */ /* begin module rseq.header */ Static Void header(data, encseq) _TEXT *data, *encseq; { /* display the header to data, using information in the encoded sequences, encseq */ if (*data->name != '\0') { if (data->f != NULL) data->f = freopen(data->name, "w", data->f); else data->f = fopen(data->name, "w"); } else { if (data->f != NULL) rewind(data->f); else data->f = tmpfile(); } if (data->f == NULL) _EscIO2(FileNotFound, data->name); SETUPBUF(data->f, Char); fprintf(data->f, "* rseq %4.2f rsequence calculated from encoded sequences from:\n", version); if (*encseq->name != '\0') { if (encseq->f != NULL) encseq->f = freopen(encseq->name, "r", encseq->f); else encseq->f = fopen(encseq->name, "r"); } else rewind(encseq->f); if (encseq->f == NULL) _EscIO2(FileNotFound, encseq->name); RESETBUF(encseq->f, Char); if (emptyfile(encseq)) { printf(" encseq is empty\n"); halt(); } putc('*', data->f); copyaline(encseq, data); putc('*', data->f); copyaline(encseq, data); fprintf(data->f, "*\n"); } /* header */ /* end module rseq.header */ /* begin module rseq.checkparams */ Static Void checkparams(theparameters) parameter *theparameters; { /* make sure that the parameters are fitting to this version of rseq */ if (theparameters->window != 1) { printf(" rseq requires an encoding window length of 1\n"); halt(); } if (theparameters->wshift != 1) { printf(" rseq requires encoding window shifts of 1\n"); halt(); } if (theparameters->coding != 1) { printf(" rseq can only handle mononucleotide encodings\n"); halt(); } if (theparameters->cshift != 1) { printf(" rseq can only handle coding shifts of 1\n"); halt(); } if (theparameters->next != NULL) { printf(" rseq can only handle one encoding parameter set\n"); halt(); } } /* checkparams */ /* end module rseq.checkparams */ /* begin module vector.sumvectors */ Static Void sumvectors(encseq, nbl, firstparam) _TEXT *encseq; vector *nbl; parameter **firstparam; { /* scan the encoded sequences in encseq and sum them up in nbl */ long regions; /* for readencpar */ long vsize; /* size of the vectors */ vector v; /* a vector read in */ *firstparam = NULL; readencpar(encseq, firstparam, ®ions, &vsize); makevector(nbl, vsize); makevector(&v, vsize); vset(0.0, nbl); while (!BUFEOF(encseq->f)) { readvector(encseq, &v); vadd(v, nbl); } } /* sumvectors */ /* end module vector.sumvectors */ /* begin module vector.makenl */ Static Void makenl(nbl, nl, firstparam) vector nbl, *nl; parameter **firstparam; { /* sum nbl (the number of bases (b) of each type at a position l) over the bases to create nl, the number of bases at position l */ parameter *aparam; /* one of the parameters */ long l; /* a position in the sequence */ long v = 0; /* position in the vectors */ long w; /* position in a window of the encoding */ double total; /* sum of encoding at l */ long FORLIM; makevector(nl, nbl.length); vset(0.0, nl); aparam = *firstparam; /* handle one parameter at a time: */ while (aparam != NULL) { l = aparam->range[(long)start]; do { /* add up bases at position l */ total = 0.0; FORLIM = aparam->wvlength; for (w = 1; w <= FORLIM; w++) total += vget(nbl, w + v); FORLIM = aparam->wvlength; /* put the total into the same positions of nl */ for (w = 1; w <= FORLIM; w++) vput(nl, w + v, total); /* move to the next slots in the vector */ v += aparam->wvlength; l += aparam->wshift; } while (l <= aparam->range[(long)stop]); /* move to the next parameter */ aparam = aparam->next; } } /* end module vector.makenl */ /* begin module vector.compressnl */ Static Void compressnl(nl, firstparam, thefrom, theto) vector nl; parameter *firstparam; long *thefrom, *theto; { /* Determine the true from-to bounds of the nl vector */ parameter *aparam = firstparam; /* one of the parameters */ long l; /* a position in the sequence */ double total; /* sum of encoding at l */ long v = 0; /* position in the vectors */ *thefrom = LONG_MAX; *theto = -LONG_MAX; /* handle one parameter at a time: */ while (aparam != NULL) { l = aparam->range[(long)start]; do { /* for w := 1 to aparam^.wvlength do total := total + vget(nbl,w+v); (* put the total into the same positions of nl *) for w := 1 to aparam^.wvlength do vput(nl,w+v,total); */ total = vget(nl, v + 1); if (total > 0) *theto = l; if (*thefrom >= LONG_MAX) { if (total > 0) *thefrom = l; } /* move to the next slots in the vector */ v += aparam->wvlength; l += aparam->wshift; } while (l <= aparam->range[(long)stop]); /* move to the next parameter */ aparam = aparam->next; } } #define cmpnmin 1000 /* end module vector.compressnl */ /* begin module rseq.prepareehnb */ Static Void prepareehnb(cmp, data, nl, ehnb) _TEXT *cmp, *data; vector nl; ehnblist **ehnb; { /* prepare the list of e(hnb) values keyed on the number of sequences, n(l). the procedure reads the composition from cmp, and announces what it did to data. */ /* minimum number of bases allowed for the composition */ /* mononucleotide genomic composition */ long gmono[4]; /* a to t */ long genomicn; /* the number of bases of the composition */ boolean equalmono; /* true if equal mononucleotides were assumed by getmonocomposition */ double hg; /* the genomic entropy */ fprintf(data->f, "* genomic probabilities from:\n"); getmonocomposition(cmp, gmono, &gmono[1], &gmono[2], &gmono[3], data, &equalmono); genomicn = gmono[0] + gmono[1] + gmono[2] + gmono[3]; if (!equalmono) { if (genomicn < cmpnmin) { printf(" ************ WARNING ***********\n"); printf(" there should be at least\n"); printf(" %4ld bases of composition information\n", (long)cmpnmin); printf(" to avoid needing to adjust for sampling\n"); printf(" error in the genomic hg.\n"); printf(" the error with %5ld bases is %*.*f bits\n", genomicn, infofield, infodecim, 3 / (genomicn * 2 * log(2.0))); } } fprintf(data->f, "* the genomic composition used is: a = %ld, c = %ld, g = %ld, t = %ld\n", gmono[0], gmono[1], gmono[2], gmono[3]); fprintf(data->f, "*\n"); makehnblist(nl, gmono[0], gmono[1], gmono[2], gmono[3], ehnb, &hg); fprintf(data->f, "* genomic information, hg: %*.*f bits/base\n", infofield, infodecim, hg); fprintf(data->f, "*\n"); } /* prepareehnb */ #undef cmpnmin /* end module rseq.prepareehnb */ /* begin module rseq.colinfo */ Static Void colinfo(data) _TEXT *data; { /* send the final header column information to data */ fprintf(data->f, "* l position in aligned set\n"); fprintf(data->f, "* a..t number of each base at the position l\n"); fprintf(data->f, "* rs(l) rsequence(l)\n"); fprintf(data->f, "* rs rsequence, running sum of rs(l)\n"); fprintf(data->f, "* var(hnb) variance of hnb\n"); fprintf(data->f, "* sum-var running sum of var(hnb)\n"); fprintf(data->f, "* n(l) number of sequence examples at l\n"); fprintf(data->f, "* e(hnb) expectation of hnb\n"); fprintf(data->f, "* f flag for calculation of e(hnb) and var(hnb):\n"); fprintf(data->f, "* e = exact, a = approximation\n"); } /* colinfo */ /* end module rseq.colinfo */ /* begin module rseq.colid */ Static Void colid(data) _TEXT *data; { /* no +1 since the * is there */ /* identify each column of the output */ fprintf(data->f, "*%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s f\n", posfield, " l", (int)(nfield + 1), " a", (int)(nfield + 1), " c", (int)(nfield + 1), " g", (int)(nfield + 1), " t", (int)(infofield + 1), " rs(l)", (int)(infofield + 1), " rs", (int)(infofield + 1), " var(hnb)", (int)(infofield + 1), " sum-var", (int)(posfield + 1), " n(l)", (int)(infofield + 1), " e(hnb)"); } /* colid */ /* end module rseq.colid */ /* begin module rseq.finalcomments */ Static Void finalcomments(data, rs, sumvarhnb) _TEXT *data; double rs, sumvarhnb; { /* write rsequence and its standard deviation data to data */ fprintf(data->f, "* rsequence = %*.*f +/- %*.*f bits/site\n", infofield, infodecim, rs, infofield, infodecim, sqrt(sumvarhnb)); } /* finalcomments */ /* end module rseq.finalcomments */ /* begin module rseq.makers */ Static Void makers(data, nbl, nl, ehnblist_, theparameters, thefrom, theto) _TEXT *data; vector nbl, nl; ehnblist *ehnblist_; parameter *theparameters; long thefrom, theto; { /* display rs(l) and other results */ /* number of each kind of base at each position */ /* number of sequences at each position */ /* e(hnb) for various n */ /* the parameters that describe the vectors */ /* actual range of the data */ /* make rsequence. the core of the program. the vectors n(bl) [nbl] and n(l) [nl] are scanned to generate frequencies of bases. these are used to generate the w matrix w(b,l) [wbl], the average rs(l) [rsl] and then rsequence. sampling error is also recorded. */ Char aflag; /* 'a' if the approximations were used for ehnb and varhnb, 'e' if they were calculated exactly */ long b; /* the position in either vector corresponding to l. the b variables correspond to the base index */ long bstart; /* the first position in the vector for some l, used to get the number of sites */ long bstop; /* the last position in the vector */ double ehnb; /* e(hnb) */ double freq; /* the frequency of a base at a position */ long l; /* the aligned position in the range */ double ln2 = log(2.0); /* the natural log of 2. dividing the natural log of a number by ln2 gives the log to the base 2 of the number */ long rangestart; /* the start of the range */ long rangestop; /* the stop of the range */ double rsl; /* rsequence at a position l */ double rs = 0.0; /* the running sum of rsl */ double sumvarhnb = 0.0; /* running sum of varhnb */ long symbols; /* the number of bases */ double varhnb; /* var(hnb) at a position l */ double wbl; /* the weight w(b,l) */ _TEXT TEMP; rangestart = theparameters->range[(long)start]; rangestop = theparameters->range[(long)stop]; symbols = theparameters->wvlength; /* the following information is provided to allow another program to know how many lines will follow */ /* writeln(data,'* ',rangestart:posfield,' ',rangestop:posfield, ' is the range'); zzz*/ fprintf(data->f, "* %*ld %*ld is the range\n", posfield, thefrom, posfield, theto); fprintf(data->f, "*\n"); colinfo(data); fprintf(data->f, "*\n"); colid(data); /*zzz*/ /* for l := rangestart to rangestop do if (l >= thefrom) and (l <= theto) begin */ for (l = thefrom; l <= theto; l++) { fprintf(data->f, " %*ld", posfield, l); bstart = symbols * (l - rangestart) + 1; bstop = symbols * (l - rangestart) + symbols; gethnb(ehnblist_, (long)floor(vget(nl, bstart) + 0.5), &ehnb, &varhnb, &aflag); rsl = 0.0; for (b = bstart; b <= bstop; b++) { fprintf(data->f, " %*ld", nfield, (long)floor(vget(nbl, b) + 0.5)); freq = vget(nbl, b) / vget(nl, b); if (freq > 0.0) wbl = ehnb + log(freq) / ln2; else wbl = negativeinfinity; rsl += freq * wbl; } rs += rsl; sumvarhnb += varhnb; /* scientific notation */ /* display the results */ /* scientific notation */ /* ' ',varhnb:infofield:infodecim, ' ',sumvarhnb:infofield:infodecim, */ /* the extra space looks nicer */ fprintf(data->f, " %*.*f %*.*f % .*E % .*E %*ld %*.*f %c\n", infofield, infodecim, rsl, infofield, infodecim, rs, P_max((int)(infofield + 3) - 7, 1), varhnb, P_max((int)(infofield + 3) - 7, 1), sumvarhnb, nfield, (long)floor(vget(nl, bstart) + 0.5), infofield, infodecim, ehnb, aflag); } /* begin the display */ TEMP.f = stdout; *TEMP.name = '\0'; finalcomments(&TEMP, rs, sumvarhnb); finalcomments(data, rs, sumvarhnb); } /* makers */ /* end module rseq.makers */ /* begin module rseq.themain */ Static Void themain(encseq, cmp, rsdata) _TEXT *encseq, *cmp, *rsdata; { /* the main procedure */ vector nbl; /* sum of each kind of base at each position */ vector nl; /* number of sequences at each position */ ehnblist *ehnb; /* list of e(hnb) values */ parameter *theparameters; /* how the vectors are structured */ long thefrom, theto; /* the region of the encoding that is not zero */ printf(" rseq %4.2f\n", version); header(rsdata, encseq); sumvectors(encseq, &nbl, &theparameters); checkparams(theparameters); /*zzz*/ /* writeln(output,'theparameters^.range[start] =', theparameters^.range[start]:5); writeln(output,'theparameters^.range[stop ] =', theparameters^.range[stop ]:5); */ makenl(nbl, &nl, &theparameters); compressnl(nl, theparameters, &thefrom, &theto); /* writeln(output,'thefrom = ',thefrom:1); writeln(output,'theto = ',theto:1); */ prepareehnb(cmp, rsdata, nl, &ehnb); makers(rsdata, nbl, nl, ehnb, theparameters, thefrom, theto); } /* themain */ /* end module rseq.themain */ main(argc, argv) int argc; Char *argv[]; { PASCAL_MAIN(argc, argv); if (setjmp(_JL1)) goto _L1; rsdata.f = NULL; strcpy(rsdata.name, "rsdata"); cmp.f = NULL; strcpy(cmp.name, "cmp"); encseq.f = NULL; strcpy(encseq.name, "encseq"); themain(&encseq, &cmp, &rsdata); _L1: if (encseq.f != NULL) fclose(encseq.f); if (cmp.f != NULL) fclose(cmp.f); if (rsdata.f != NULL) fclose(rsdata.f); exit(EXIT_SUCCESS); } /* rseq */ /* End. */