/* Output from p2c 1.21alpha-07.Dec.93, the Pascal-to-C translator */ /* From input file "alword.p" */ #include /* alword: frequency and information of aligned words Tom Schneider National Cancer Institute Laboratory of Mathematical Biology Frederick, Maryland 21702-1201 toms@ncifcrf.gov */ /* end of program */ /* begin module version */ #define version 2.07 /* of alword.p 1992 June 4 previous version: 1990 August 8 origin 1990 June 14 from alpro 1.22 1990 May 9 */ /* end module version */ /* begin module describe.alword */ /* name alword: frequency and information of aligned words synopsis alword(words: in, symvec: out, output: out) files words: Aligned words. Since the input is usually to be a UNIX dictionary, there need not be any header lines. However, if they exist, they must begin with an asterisk '*'. The remaining lines are used for the words. alwordp: parameters to control the program. If the file is empty defaults are used. If the first line begins with the letter `e' then the words are aligned by their last character. If there is a first line, the second line must have the maximum word length to be included in the calculation. Words longer than this will be skipped (and reported to output). If the first character of the second line is 'a' then all of the words in the file will be read. Otherwise, only the first word on each line will be read. symvec: table of frequencies and information content. The information measure is corrected for small sample size (Schneider et al, 1986). output: messages to the user description Take an aligned set of protein sequences and produce input to the consensus program for producing a logo. examples * This is an example sequence. AGGEGCTT. * This is the second example sequence. * It is the last one. YLREBS. documentation Jotun Hein, Methods of Enzymology 183 (1990) Schneider et al. JMB 188:415 (1986) see also alpro.p, makelogo.p author Thomas Dana Schneider bugs technical notes */ /* end module describe.alword */ /* begin module alword.const */ #define maxpos 1000 /* maximum number of positions the program can handle */ /* end module alword.const */ /* begin module alword.type */ typedef struct sequence { Char letters[maxpos]; /* one of the sequences */ long length; /* the length of the sequence */ } sequence; /* end module alword.type */ Static _TEXT words; /* input aligned protein sequences */ Static _TEXT alwordp; /* parameters */ Static _TEXT symvec; /* output frequency table */ Static jmp_buf _JL1; /* begin module halt */ Static Void halt() { /* stop the program. the procedure performs a goto to the end of the program. you must have a label: label 1; declared, and also the end of the program must have this label: 1: end. examples are in the module libraries. this is the only goto in the delila system. */ printf(" program halt.\n"); longjmp(_JL1, 1); } /* end module halt version = 4.09; (@ of prgmod.p 1990 May 18 */ /* begin module copyaline */ Static Void copyaline(fin, fout) _TEXT *fin, *fout; { /* copy a line from file fin to file fout */ while (!P_eoln(fin->f)) { putc(P_peek(fin->f), fout->f); getc(fin->f); } fscanf(fin->f, "%*[^\n]"); getc(fin->f); putc('\n', fout->f); } /* copyaline */ /* end module copyaline version = 4.09; (@ of prgmod.p 1990 May 18 */ /* begin module skipblanks */ Static Void skipblanks(thefile) _TEXT *thefile; { /* skip over blanks until a non-blank, or end of line, is found */ while ((P_peek(thefile->f) == ' ') & (!P_eoln(thefile->f))) getc(thefile->f); } Static Void skipnonblanks(thefile) _TEXT *thefile; { /* skip over nonblanks until a blank, or end of line, is found */ while ((P_peek(thefile->f) != ' ') & (!P_eoln(thefile->f))) getc(thefile->f); } /* end module skipblanks version = 4.09; (@ of prgmod.p 1990 May 18 */ /* begin module capitalize */ Static Char capitalize(c) Char c; { /* convert the character c to upper case */ long n = c; /* c is the n'th letter of the alphabet */ if (n >= 'a' && n <= 'z') c = _toupper(n); return c; } /* end module capitalize version = 4.09; (@ of prgmod.p 1990 May 18 */ Static Void die() { /* destroy symvec (as a safety feature) and halt the program */ if (*symvec.name != '\0') { if (symvec.f != NULL) symvec.f = freopen(symvec.name, "w", symvec.f); else symvec.f = fopen(symvec.name, "w"); } else { if (symvec.f != NULL) rewind(symvec.f); else symvec.f = tmpfile(); } if (symvec.f == NULL) _EscIO2(FileNotFound, symvec.name); SETUPBUF(symvec.f, Char); halt(); } /* begin module alword.readsequence */ Static Void readsequence(f, oneperline, s) _TEXT *f; boolean oneperline; sequence *s; { /* read in the sequence s from file. The sequence may have a header consisting of any number of lines that begin with an asterisk '*'. The sequence itself contains any character except space and period '.'. If oneperline is true, only one word is read per line */ Char c; /* a character from words */ boolean done = false; /* done reading the sequence */ long i = 0; /* position on the sequence */ /* writeln(output) */ skipblanks(f); while (!done) { c = getc(f->f); if (c == '\n') c = ' '; /* write(output,c); */ i++; if (i > maxpos) { printf("A sequence is longer than constant maxpos (%ld).\n", (long)maxpos); die(); } else s->letters[i-1] = c; if (P_eoln(f->f)) { done = true; fscanf(f->f, "%*[^\n]"); getc(f->f); } /* else if (ord(capitalize(c)) < ord('A')) or (ord(capitalize(c)) > ord('Z')) then begin done := true; if oneperline then readln(f) end */ } /* read one word */ if (i == 0) { printf("zero length sequence found!\n"); die(); } /* sequence will be zero long if there was none read */ else s->length = i; } /* end module alword.readsequence */ /* begin module alword.themain */ Static Void themain(words, alwordp, symvec) _TEXT *words, *alwordp, *symvec; { /* the main procedure of the program */ Char alignment; /* if 'e' by the end of the word */ double avarhnb; /* variance of the information r */ long b; /* index to the letters */ Char c; /* a character from words */ double e; /* for calculating small sample bias and variance */ boolean endalignment; /* if true, align words by their end */ double f; /* frequency of a letter */ long i; /* position on the sequence */ long j; /* position on the sequence, from the back end of the word */ long imax = 0; /* maximum position on the sequence */ double ln2 = log(2.0); /* natural log of 2 for calculating bits */ double hmax; /* log2 of s */ long maxword; /* longest word that will be reversed */ long n = 0; /* counting the input sequences */ long ntrue; /* true count in a position, deleting blanks */ long nbi[27][maxpos]; /* number of each each amino acid (b) at each position (i). zero is for blanks. */ boolean oneperline; /* one word per line */ double r; /* information at a position */ long s = 26; /* number of symbols */ boolean skipped; /* if true, we have already alerted the user about this word being skipped */ sequence seq; /* one of the sequences */ printf("alword %4.2f\n", version); if (*words->name != '\0') { if (words->f != NULL) words->f = freopen(words->name, "r", words->f); else words->f = fopen(words->name, "r"); } else rewind(words->f); if (words->f == NULL) _EscIO2(FileNotFound, words->name); RESETBUF(words->f, Char); if (*symvec->name != '\0') { if (symvec->f != NULL) symvec->f = freopen(symvec->name, "w", symvec->f); else symvec->f = fopen(symvec->name, "w"); } else { if (symvec->f != NULL) rewind(symvec->f); else symvec->f = tmpfile(); } if (symvec->f == NULL) _EscIO2(FileNotFound, symvec->name); SETUPBUF(symvec->f, Char); fprintf(symvec->f, "* alword %4.2f\n", version); /* get the parameter */ if (*alwordp->name != '\0') { if (alwordp->f != NULL) alwordp->f = freopen(alwordp->name, "r", alwordp->f); else alwordp->f = fopen(alwordp->name, "r"); } else rewind(alwordp->f); if (alwordp->f == NULL) _EscIO2(FileNotFound, alwordp->name); RESETBUF(alwordp->f, Char); if (BUFEOF(alwordp->f)) { alignment = ' '; oneperline = true; } else { fscanf(alwordp->f, "%c%*[^\n]", &alignment); getc(alwordp->f); if (alignment == '\n') alignment = ' '; if (!BUFEOF(alwordp->f)) { fscanf(alwordp->f, "%ld%*[^\n]", &maxword); getc(alwordp->f); } else { maxword = 20; printf("maximum word size: %ld\n", maxword); } } endalignment = (alignment == 'e'); if (!BUFEOF(alwordp->f)) { if (P_peek(alwordp->f) == 'a') oneperline = false; else oneperline = true; fscanf(alwordp->f, "%*[^\n]"); getc(alwordp->f); } /* clear the table */ for (i = 0; i < maxpos; i++) { for (b = 0; b <= 26; b++) nbi[b][i] = 0; } while (!BUFEOF(words->f)) { /* read one sequence */ while (P_peek(words->f) == '*') { fprintf(symvec->f, "*\n"); copyaline(words, symvec); } n++; readsequence(words, oneperline, &seq); /* Don't do this except for debugging! writeln(output,'sequence ',n:3,' is ',seq.length:3,' letters long'); */ skipped = false; for (i = 0; i < seq.length; i++) { c = capitalize(seq.letters[i]); if (c < 'A' || c > 'Z') b = 0; else b = c - 'A' + 1; /* make b be 1 to 26 for A to Z */ /* write(output,c); */ if (endalignment) { if (seq.length > maxword) { if (!skipped) { printf("skipping "); for (j = 0; j < seq.length; j++) putchar(seq.letters[j]); putchar('\n'); skipped = true; } } else { j = maxword - seq.length + i + 1; if (j > 0) nbi[b][j-1]++; } } else nbi[b][i]++; } /* writeln(output,c); */ if (seq.length > imax) imax = seq.length; } /* If we are aligning by the end of the word, correct imax to be maxword: */ if (endalignment) { imax = maxword; /* number of symbols */ } hmax = log((double)s); fprintf(symvec->f, "%ld number of symbols\n", s); /* write out the array */ for (i = 1; i <= imax; i++) { /* find the true number of letters at this position */ ntrue = 0; for (b = 1; b <= 26; b++) ntrue += nbi[b][i-1]; /* ntrue := n - nbi[0,i]; (* true number of samples *) write(output,'i=',i:1,' | '); write(output,'ntrue=',ntrue:1,' | '); write(output,'n=',n:1,' | '); writeln(output,'nbi[0,',i:1,']=',nbi[0,i]:1); */ /* calculate the information */ if (ntrue <= 0) { printf(" WARNING: something is really STRANGE!"); printf(" Position %ld has %ld sequences!\n", i, ntrue); r = 0.0; } else { r = hmax; for (b = 1; b <= 26; b++) { f = (double)nbi[b][i-1] / ntrue; if (f > 0) /* ie, Hmax - (f log f) */ r += f * log(f); } r /= ln2; /* convert to bits */ /* a correction for small sample sizes */ e = (s - 1) / (2 * ln2 * ntrue); r -= e; avarhnb = e * e; if (r < 0.0) /* eliminate negative results */ r = 0.0; } /* display the data */ fprintf(symvec->f, "%5ld %5ld %8.5f %8.5f (position, samples, information, variance)\n", i, ntrue, r, avarhnb); for (b = 1; b <= 26; b++) fprintf(symvec->f, "%c %4ld\n", (Char)(b - 1 + 'A'), nbi[b][i-1]); } } /* end module alword.themain */ main(argc, argv) int argc; Char *argv[]; { PASCAL_MAIN(argc, argv); if (setjmp(_JL1)) goto _L1; symvec.f = NULL; strcpy(symvec.name, "symvec"); alwordp.f = NULL; strcpy(alwordp.name, "alwordp"); words.f = NULL; strcpy(words.name, "words"); themain(&words, &alwordp, &symvec); _L1: if (words.f != NULL) fclose(words.f); if (alwordp.f != NULL) fclose(alwordp.f); if (symvec.f != NULL) fclose(symvec.f); exit(EXIT_SUCCESS); } /* End. */