/* Output from p2c 1.21alpha-07.Dec.93, the Pascal-to-C translator */ /* From input file "consensus.p" */ #include /* consensus: convert symvec to consensus to point out flaws in consensus Dr. Thomas D. Schneider National Cancer Institute Laboratory of Experimental and Computational Biology Molecular Information Theory Group Frederick, Maryland 21702-1201 toms@ncifcrf.gov permanent email: toms@alum.mit.edu (use only if first address fails) http://www.lecb.ncifcrf.gov/~toms/ */ /* end of program */ /* begin module version */ #define version 1.17 /* of consensus.p 2005 May 26 2005 May 26, 1.17: cite paper: Consensus Sequence Zen. 2002 Feb 6, 1.16: report consensus so it can be grabbed. 2002 Feb 6, 1.15: report information content of consensus 2002 Feb 4, 1.14: upgrade documentation 2002 Feb 4, 1.12: compressed consensus 2002 Feb 3, 1.11: lower case letters for variations > cutoff 2002 Feb 3, 1.10: functional, improve documentation 2002 Feb 3, 1.05: produces written consensus sequence based on cutoff 2002 Feb 3, 1.04: release to web 2002 Feb 3, 1.02: upgrading 2001 Aug 31, 1.01: functional 2001 Aug 31, 1.00: origin */ #define updateversion 1.05 /* defines lowest acceptable current parameter file */ /* end module version */ /* begin module describe.consensus */ /* name consensus: convert symvec to consensus to point out flaws in consensus synopsis consensus(symvec: in, consensusp: in, list: out, conmarks: out, output: out) files symvec: a file that contains base frequency information for a protein or DNA binding site. list: Processing of the symvec to show how the consensus model destroys data. conmarks: consensus sequences in marks format for putting onto a sequence logo. This requires the ShiftLettering function provided in marks.lettering. Concatenate marks.lettering to conmarks to get the marks file of makelogo. That is, in unix: cat marks.lettering conmarks > marks consensusp: parameters to control the program. The file must contain the following parameters, one per line: parameterversion: The version number of the program. This allows the user to be warned if an old parameter file is used. The second line contains two integers: fromrange: integer; the 5' or N terminus of the range to display torange: integer; the 3' or C terminus of the range to display The third line contains one real number for the cutoff below which a base is labeled 'n'. output: messages to the user description How bad is the TATAAT consensus? What are the relative fractions? The program is designed to allow one to look at a consensus sequence. Its purpose is to ATTACK THE CONCEPT OF CONSENSUS sequences. Do not use this program to publish a consensus sequence without indicating that the consensus sequence method is highly flawed! examples documentation T. D. Schneider, Consensus Sequence Zen, Applied Bioinformatics, 1 (3), 111-119, 2002. See link below. see also {example parameter file:} consensusp {Source of ShiftLettering function:} marks.lettering {Programs that make symvec files:} alpro.p dalvec.p {If you are tempted to use this program to publish something, DON'T!!! Read this:} http://www.lecb.ncifcrf.gov/~toms/glossary.html#consensus_sequence {and the Consensus Sequence Zen paper:} http://www.lecb.ncifcrf.gov/~toms/paper/zen/ {Use these programs instead:} {make a sequence logo:} makelogo.p {make sequence walkers:} makewalker.p {If you don't know what those programs do, "go look it up":} {sequence logos:} http://www.lecb.ncifcrf.gov/~toms/glossary.html#sequence_logo {sequence walkers:} http://www.lecb.ncifcrf.gov/~toms/glossary.html#sequence_walker {This is a sequence logo:} http://www.lecb.ncifcrf.gov/~toms/icons/donor.pure.gif {This is a sequence walker:} http://www.lecb.ncifcrf.gov/~toms/icons/donorwalkersmall.gif {Examples of What You Miss (!) If You Use A Consensus Sequence:} http://www.lecb.ncifcrf.gov/~toms/papers/splice/ http://www.lecb.ncifcrf.gov/~toms/papers/walker/walker.html#fig2 http://www.lecb.ncifcrf.gov/~toms/papers/baseflip/ http://www.lecb.ncifcrf.gov/~toms/papers/repan3/ author Thomas Dana Schneider bugs This program could be used by some foolish idiot to make a consensus and publish it! Well, ok, you can if your purpose is to attack the concensus sequence concept. Otherwise, please read the 'see also' section above. technical notes */ /* end module describe.consensus */ /* begin module string.const */ #define maxstring 2000 /* the maximum string */ /* end module string.const version = 4.54; (@ of prgmod.p 2001 Aug 29 */ /* begin module string.type */ /* pointer to a string */ typedef struct string { /* a string of characters */ Char letters[maxstring]; /* the letters in the string */ long length; /* the number of characters in the string */ long current; /* the letter we are working on */ Char *next; /* the next string in a series */ } string; /* end module string.type version = 4.54; (@ of prgmod.p 2001 Aug 29 */ Static _TEXT symvec; /* file used by this program */ Static _TEXT consensusp; /* file used by this program */ Static _TEXT list; /* file used by this program */ Static _TEXT conmarks; /* file used by this program */ Static jmp_buf _JL1; /* begin module halt */ Static Void halt() { /* stop the program. the procedure performs a goto to the end of the program. you must have a label: label 1; declared, and also the end of the program must have this label: 1: end. examples are in the module libraries. this is the only goto in the delila system. */ printf(" program halt.\n"); longjmp(_JL1, 1); } /* end module halt version = 'delmod 6.16 84 mar 12 tds/gds'; */ /* begin module copyaline */ Static Void copyaline(fin, fout) _TEXT *fin, *fout; { /* copy a line from file fin to file fout */ while (!P_eoln(fin->f)) { putc(P_peek(fin->f), fout->f); getc(fin->f); } fscanf(fin->f, "%*[^\n]"); getc(fin->f); putc('\n', fout->f); } /* copyaline */ /* end module copyaline version = 4.54; (@ of prgmod.p 2001 Aug 29 */ /* begin module clearstring */ Static Void clearstring(ribbon) string *ribbon; { /* empty the string */ long index; /* to the ribbon */ for (index = 0; index < maxstring; index++) ribbon->letters[index] = ' '; ribbon->length = 0; ribbon->current = 0; } /* clearstring */ Static Void initializestring(ribbon) string *ribbon; { /* start the string with a nil pointer. This routine should be called before doing linked list work. This allows the standard string routines to clear the string without killing the pointer. */ clearstring(ribbon); ribbon->next = NULL; } /* initializestring */ /* end module clearstring version = 4.54; (@ of prgmod.p 2001 Aug 29 */ /* begin module writestring */ Static Void writestring(tofile, s) _TEXT *tofile; string *s; { /* write the string s to file tofile, no writeln */ long i; /* index to s */ long FORLIM; FORLIM = s->length; for (i = 0; i < FORLIM; i++) putc(s->letters[i], tofile->f); } /* writestring */ /* end module writestring version = 4.54; (@ of prgmod.p 2001 Aug 29 */ /* begin module capitalize */ Static Char capitalize(c) Char c; { /* convert the character c to upper case */ long n = c; /* c is the n'th letter of the alphabet */ if (n >= 'a' && n <= 'z') c = _toupper(n); return c; } /* end module capitalize */ /* begin module decapitalize */ Static Char decapitalize(c) Char c; { /* convert the character c to lower case */ long n = c; /* c is the n'th letter of the alphabet */ if (n >= 'A' && n <= 'Z') c = _tolower(n); else c = (Char)n; return c; } /* end module decapitalize */ /* begin module twocode */ Static Char twocode(a, b) Char a, b; { /* return the two letter code for bases a and b. a and b can be upper or lower case, but upper case letters are returned. Uses: module decapitalize. */ Char Result; a = decapitalize(a); b = decapitalize(b); switch (a) { case 'a': switch (b) { case 'a': Result = 'A'; break; case 'c': Result = 'M'; break; case 'g': Result = 'R'; break; case 't': Result = 'W'; break; } break; case 'c': switch (b) { case 'a': Result = 'M'; break; case 'c': Result = 'C'; break; case 'g': Result = 'S'; break; case 't': Result = 'Y'; break; } break; case 'g': switch (b) { case 'a': Result = 'R'; break; case 'c': Result = 'S'; break; case 'g': Result = 'G'; break; case 't': Result = 'K'; break; } break; case 't': switch (b) { case 'a': Result = 'W'; break; case 'c': Result = 'Y'; break; case 'g': Result = 'K'; break; case 't': Result = 'T'; break; } break; } return Result; } /* end module twocode */ /* begin module threecode */ Static Char threecode(a, c, g) Char a, c, g; { /* return the three letter code for bases a, b and c. a and b can be upper or lower case, but upper case letters are returned. Uses: module decapitalize. THE LETTERS MUST BE ALPHABETIZED. */ a = decapitalize(a); c = decapitalize(c); g = decapitalize(g); /* who is not represented? There are only four cases if we know that! */ if (a != 'a') /* it must be cgt */ return 'B'; else { if (c != 'c') /* it must be agt */ return 'D'; else { if (g != 'g') /* it must be act */ return 'H'; else { return 'V'; /* it must be acg */ /* a must be 'a' */ /* a ='a' and c='c' */ } } } } #define wid 5 /* width of numbers */ #define dec 2 /* decimal places for numbers */ #define maxs 4 /* maximum s allowed by current program */ /* end module threecode */ /* begin module consensus.themain */ Static Void themain(symvec, consensusp, list, conmarks) _TEXT *symvec, *consensusp, *list, *conmarks; { /* the main procedure of the program */ double bits = 0.0; /* the number of bits in the consensus */ Char c; /* one of the symbols */ long conposition; /* position in constring */ long concount = 0; /* count of Ns in the consensus */ /* conlevel: integer; (* number of bases that pass the cutoff at a position *) constring: string; (* resulting consensus string - ICK!!! *) */ long conlevel[maxstring]; /* number of bases that pass the cutoff at a position */ string constring[maxs]; /* multi-level consensus strings */ string compressed; /* compressed consensus string */ double cutoff; /* the lowest frequency for using a base in the consensus */ boolean first = true; /* used to obtain the number of symbols from the start of symvec */ long fromrange; /* the 5' or N terminus of the range to display */ long n; /* number of sequences at a position l */ long ncl; /* number of c position l */ long l; /* position in the alignment */ long d; /* display index */ double f; /* frequency of symbol c */ long letter; /* index count of the symbols */ Char mfsymbol; /* maximum frequency symbol */ double mffraction; /* fraction of maximum frequency symbol */ double parameterversion; /* parameter version number */ double probability = 1.0; /* probability of consensus */ double Rsequence; /* information content */ double Rvar; /* variance of Rsequence */ long torange; /* the 3' or C terminus of the range to display */ long s; /* number of symbols */ long sindex; /* index for s, the number of symbols */ printf("consensus %4.2f\n", version); if (*consensusp->name != '\0') { if (consensusp->f != NULL) consensusp->f = freopen(consensusp->name, "r", consensusp->f); else consensusp->f = fopen(consensusp->name, "r"); } else rewind(consensusp->f); if (consensusp->f == NULL) _EscIO2(FileNotFound, consensusp->name); RESETBUF(consensusp->f, Char); fscanf(consensusp->f, "%lg%*[^\n]", ¶meterversion); getc(consensusp->f); if ((long)floor(100 * parameterversion + 0.5) < (long)floor(100 * updateversion + 0.5)) { if (parameterversion <= 1.05) { printf("You have an old parameter file!\n"); printf("Upgrading ...\n"); fscanf(consensusp->f, "%ld%ld%*[^\n]", &fromrange, &torange); getc(consensusp->f); if (*consensusp->name != '\0') { if (consensusp->f != NULL) consensusp->f = freopen(consensusp->name, "w", consensusp->f); else consensusp->f = fopen(consensusp->name, "w"); } else { if (consensusp->f != NULL) rewind(consensusp->f); else consensusp->f = tmpfile(); } if (consensusp->f == NULL) _EscIO2(FileNotFound, consensusp->name); SETUPBUF(consensusp->f, Char); fprintf(consensusp->f, "%4.2f version of consensus that this parameter file is designed for.\n", version); fprintf(consensusp->f, "%3ld %3ld range to display\n", fromrange, torange); cutoff = 0.5; fprintf(consensusp->f, "%4.2f the lowest frequency for using a base in the consensus\n", cutoff); if (*consensusp->name != '\0') { if (consensusp->f != NULL) consensusp->f = freopen(consensusp->name, "r", consensusp->f); else consensusp->f = fopen(consensusp->name, "r"); } else rewind(consensusp->f); if (consensusp->f == NULL) _EscIO2(FileNotFound, consensusp->name); RESETBUF(consensusp->f, Char); fscanf(consensusp->f, "%lg%*[^\n]", ¶meterversion); getc(consensusp->f); } } fscanf(consensusp->f, "%ld%ld%*[^\n]", &fromrange, &torange); getc(consensusp->f); fscanf(consensusp->f, "%lg%*[^\n]", &cutoff); getc(consensusp->f); if (*symvec->name != '\0') { if (symvec->f != NULL) symvec->f = freopen(symvec->name, "r", symvec->f); else symvec->f = fopen(symvec->name, "r"); } else rewind(symvec->f); if (symvec->f == NULL) _EscIO2(FileNotFound, symvec->name); RESETBUF(symvec->f, Char); if (*list->name != '\0') { if (list->f != NULL) list->f = freopen(list->name, "w", list->f); else list->f = fopen(list->name, "w"); } else { if (list->f != NULL) rewind(list->f); else list->f = tmpfile(); } if (list->f == NULL) _EscIO2(FileNotFound, list->name); SETUPBUF(list->f, Char); fprintf(list->f, "* consensus %4.2f\n", version); fprintf(list->f, "* range: %ld to %ld\n", fromrange, torange); fprintf(list->f, "* cutoff: %4.2f\n", cutoff); /* header */ while (!BUFEOF(symvec->f)) { if (P_peek(symvec->f) == '*') { copyaline(symvec, list); continue; } if (first) { first = false; fscanf(symvec->f, "%ld%*[^\n]", &s); getc(symvec->f); fprintf(list->f, "%*ld number of symbols\n", wid, s); if (s > maxs) { printf(" number of symbols %ld exceeds the allowed maximum, maxs = %ld\n", s, (long)maxs); halt(); } for (sindex = 1; sindex <= s; sindex++) clearstring(&constring[s-1]); continue; } fscanf(symvec->f, "%ld%ld%lg%lg%*[^\n]", &l, &n, &Rsequence, &Rvar); getc(symvec->f); mfsymbol = ' '; mffraction = -1.0; if (l < fromrange || l > torange) { for (letter = 1; letter <= s; letter++) { fscanf(symvec->f, "%*[^\n]"); getc(symvec->f); } continue; } /* skip the entire position l quickly: */ /* start the consensus string */ conposition = l - fromrange + 1; for (sindex = 0; sindex < s; sindex++) { constring[sindex].letters[conposition-1] = 'N'; constring[sindex].length++; } conlevel[conposition-1] = 0; for (letter = 1; letter <= s; letter++) { fscanf(symvec->f, "%c%ld%*[^\n]", &c, &ncl); getc(symvec->f); if (c == '\n') c = ' '; /* process the position */ f = (double)ncl / n; if (f >= mffraction) { mffraction = f; mfsymbol = c; /* capture the letter */ if (f >= cutoff) { conlevel[conposition-1]++; constring[conlevel[conposition-1] - 1].letters[conposition-1] = capitalize(c); } } /* if l = -19 then begin writeln(output, 'l = ',l:1); writeln(output, 'ncl = ',ncl:1); writeln(output, 'c = ',c:1); writeln(output, 'conlevel[conposition] = ',conlevel[conposition]:1); end; if l = -18 then begin writeln(output, 'l = ',l:1); writeln(output, 'conlevel[conposition] = ',conlevel[conposition]:1); halt end; */ fprintf(list->f, "%*ld %*ld %*ld %*.*f", wid, l, wid, n, wid, ncl, wid, dec, f); /* display the fraction of the base in a small graph */ fprintf(list->f, " |"); for (d = 0; d <= 10; d++) { if (f < d / 10.0) putc('-', list->f); else putc(c, list->f); } fprintf(list->f, " |\n"); /* at the last letter, write results out */ if (letter == s) { fprintf(list->f, "* maximum fraction: %c at %*.*f\n", mfsymbol, wid, dec, mffraction); fprintf(list->f, "*\n"); probability *= mffraction; } } } fprintf(list->f, "* probability of consensus: %*.*f\n", wid, dec, probability); fprintf(list->f, "* Final consensus:\n"); fprintf(list->f, "* \n"); for (sindex = 1; sindex <= s; sindex++) { fprintf(list->f, "* "); for (l = fromrange + 1; l <= torange + 1; l++) { conposition = l - fromrange; if (constring[sindex-1].letters[conposition-1] != 'N' || sindex == 1) putc(constring[sindex-1].letters[conposition-1], list->f); else putc(' ', list->f); } putc('\n', list->f); } fprintf(list->f, "* \n"); /* compress the consensus to single letter code */ clearstring(&compressed); compressed.length = constring[0].length; for (l = fromrange + 1; l <= torange + 1; l++) { conposition = l - fromrange; /* writeln(output,'conposition= ',conposition:1); writeln(output,'conlevel[conposition]= ',conlevel[conposition]:1); */ switch (conlevel[conposition-1]) { case 0: /* no consensus */ compressed.letters[conposition-1] = 'N'; break; case 1: compressed.letters[conposition-1] = constring[0].letters[conposition-1]; /* single consensus base */ bits += 2.0; break; case 2: /* use two letter code */ compressed.letters[conposition-1] = twocode( constring[0].letters[conposition-1], constring[1].letters[conposition-1]); /*:= '2';*/ bits += 1.0; break; case 3: compressed.letters[conposition-1] = threecode( constring[0].letters[conposition-1], constring[1].letters[conposition-1], constring[2].letters[conposition-1]); /*:= '3';*/ bits += 2.0 - log(3.0) / log(2.0); /* log base 2 of 3 */ break; case 4: compressed.letters[conposition-1] = 'n'; break; } } fprintf(list->f, "* "); writestring(list, &compressed); fprintf(list->f, "\n* \n"); fprintf(list->f, "* DO NOT PUBLISH THIS!!!\n"); fprintf(list->f, "* Well, ok, you can if your purpose\n"); fprintf(list->f, "* is to attack the concensus sequence concept.\n"); fprintf(list->f, "* Otherwise, read this:\n"); fprintf(list->f, "* http://www.lecb.ncifcrf.gov/~toms/glossary.html#consensus_sequence\n"); if (*conmarks->name != '\0') { if (conmarks->f != NULL) conmarks->f = freopen(conmarks->name, "w", conmarks->f); else conmarks->f = fopen(conmarks->name, "w"); } else { if (conmarks->f != NULL) rewind(conmarks->f); else conmarks->f = tmpfile(); } if (conmarks->f == NULL) _EscIO2(FileNotFound, conmarks->name); SETUPBUF(conmarks->f, Char); for (l = fromrange; l <= torange; l++) { conposition = l - fromrange + 1; fprintf(list->f, "%5ld ", l); for (sindex = 0; sindex < s; sindex++) { if (true) putc(constring[sindex].letters[conposition-1], list->f); } if (constring[0].letters[conposition-1] != 'N') concount++; fprintf(list->f, " %c\n", compressed.letters[conposition-1]); /* The definition of shiftlettering and an example: * U TRIGGER IGNORED base bits sfactor (letters) R G B ShiftLettering * U 1.1 0 0 0 1 (A) 0 0 0 ShiftLettering */ /* constring[1].letters[conposition], (* first string *) */ fprintf(conmarks->f, "U 5 0 %5ld -0.5 1.3 (%c) 0 0 0 ShiftLettering\n", l, compressed.letters[conposition-1]); } fprintf(list->f, "%ld out of %ld non-N positions in the consensus\n", concount, torange - fromrange + 1); fprintf(list->f, "%*.*f ", wid, dec, bits); writestring(list, &compressed); putc('\n', list->f); } #undef wid #undef dec #undef maxs /* end module consensus.themain */ main(argc, argv) int argc; Char *argv[]; { PASCAL_MAIN(argc, argv); if (setjmp(_JL1)) goto _L1; conmarks.f = NULL; strcpy(conmarks.name, "conmarks"); list.f = NULL; strcpy(list.name, "list"); consensusp.f = NULL; strcpy(consensusp.name, "consensusp"); symvec.f = NULL; strcpy(symvec.name, "symvec"); themain(&symvec, &consensusp, &list, &conmarks); _L1: if (symvec.f != NULL) fclose(symvec.f); if (consensusp.f != NULL) fclose(consensusp.f); if (list.f != NULL) fclose(list.f); if (conmarks.f != NULL) fclose(conmarks.f); exit(EXIT_SUCCESS); } /* End. */