/* Output from p2c 1.21alpha-07.Dec.93, the Pascal-to-C translator */ /* From input file "dalvec.p" */ #include /* dalvec: converts Rseq rsdata file to symvec format by Thomas Schneider module libraries required: delman, prgmods */ /* end of program */ /* begin module version */ #define version 2.20 /* of dalvec.p 1995 June 24 origin 1990 May 7 from rsgra */ /* end module version */ /* begin module describe.dalvec */ /* name dalvec: converts Rseq rsdata file to symvec format synopsis dalvec(rsdata: in, dalvecp: in, symvec: out, output: out) files rsdata: data file from rseq program dalvecp: parameters to control dalvec If empty, then the normal sequence logo will be produced. If the first character of the first line is a 'c', then a chi-logo is produced. The height of this logo is the information. The heights of the individual letters are, however, not the frequencies, but rather their partial chi-square values. The expected value is 1/4 of the number of characters. This is compared to the observed value by: partial chi-square =(observed - expected)^2/expected These partial values are normalized and placed in symvec in place of the relative frequencies. Thus the significance of each letter is used. When the observed is less than expected, the reported value is made negative. Makelogo prints these characters upside down. symvec: reformating of the rsdata file for input to the makelogo program. A series of header lines begining with asterisk ("*") are produced. The next line contains one integer which is the number of symbols in the vector (4 for DNA or RNA, 20 for proteins). After this, the format of the file is a series of entries. Each entry has two parts. The first part is on one line and contains position total information position: the position number total: the sum of the values in the vector information: the information content of the vector. The remaining parameters on the line are from the rsdata file: rs: sum of rsl varhnb: variance of rsl sumvar: sum of varhnb ehnb: 2-e(n) The second part consists of a list of a series of symbol lines. The number of these matches the numer of symbols (4 in the case of DNA), representing the the numbers of bases or amino acids at the position in an aligned set of sequences. Each line begins with the character of the symbol, followed by the number of that symbols. output: messages to the user description Convert the rsdata file from rseq into a format that the makelogo program can use. The format is a 'symbol vector'. ChiLogos: If you leave the parameter file empty, then the standard sequence logo will be created. However, if the first letter of the file is a 'c', then a new kind of logo will emerge from makelogo: the chi-logo. The height is as it was before. The height of the individual letters is different, instead of being proportional to the frequency of the letter, it is proportional to the significance of the letter, based on the chi-square test. That is, the expected number of letters is the number of letters at that position, n(l) divided by 4 (for simplicity!). The observed number comes from the rsdata file. The partial-chi square is (observed-expected)^2/expected. Note that the sum of the partials is the normal chi-square. So bases that contribute strongly get big. Also, bases that are under represented are printed UPSIDE DOWN, so you can (usually) tell you have a chilogo at a glance. The chilogo allows one to see the importance of the infrequent letters. The technical mechanism for making a letter upside down is to have its number negative in the symvec file. examples see also rseq.p, makelogo.p author Thomas D. Schneider bugs The program originally only created a vector that contained the characters of the alphabet, so the output was called an 'alvec'. To reflect the use of symbols, the name of the output file was changed to symvec, but I like 'dalvec', and 'dsymvec' is so awkward that I decided to keep the name dalvec. */ /* end module describe.dalvec */ /* begin module dalvec.const */ #define infofield 8 /* size of field for printing information in bits */ #define infodecim 5 /* number of decimal places for printing information */ #define nfield 4 /* size of field for printing n, the number of sites */ #define mapmax 26 /* the number of bases or amino acids to be sorted */ #define pwid 8 /* width in character places to print PostScript numbers */ #define pdec 5 /* decimal places to print PostScript numbers */ #define pnum 100000L /* 10^pdec for testing when something will round to zero */ #define cxmove 30 /* units to move a character horizontally */ /* end module dalvec.const */ /* begin module interact.const */ #define maxstring 150 /* the maximum string */ /* end module interact.const version = 4.75; (@ of rsgra.p 1990 Oct 2 */ /* begin module dalvec.filler.const */ #define fillermax 21 /* the size of the filler array for a string */ /* end module dalvec.filler.const */ /* begin module rs.type */ typedef struct rstype { /* types of data in the data table from rseq */ long rstart, rstop; /* range of the data */ long l; /* position */ long nal, ncl, ngl, ntl; /* numbers of each base */ double rsl; /* rsequence(l) */ double rs; /* running sum of rs */ double varhnb; /* variance estimate for small sample correction */ double sumvar; /* sum of varhnb */ long nl; /* = a+c+g+t */ double ehnb; /* hg-e(n) */ Char flag; /* e = exact, a = approximate sampling statistics */ } rstype; /* end module rs.type version = 4.75; (@ of rsgra.p 1990 Oct 2 */ /* begin module interact.type */ typedef struct string { /* a string of characters */ Char letters[maxstring]; /* the letters in the string */ long length; /* the number of characters in the string */ long current; /* the letter we are working on */ } string; /* end module interact.type version = 4.75; (@ of rsgra.p 1990 Oct 2 */ /* begin module trigger.type */ typedef struct trigger { /* an object to be searched for */ string seek; /* the characters looked for */ long state; /* how close to triggering we are */ boolean skip; /* trigger not found- skip the line */ /* the trigger was found */ boolean found; } trigger; /* end module trigger.type version = 4.75; (@ of rsgra.p 1990 Oct 2 */ /* begin module filler.type */ /* the following is an array used to fill a string. it is convenient to have it much shorter than the maxstring, so that it is easy to fill the string using procedure fillstring. the user must declare the value of constant fillermax. */ typedef Char filler[fillermax]; /* end module filler.type version = 4.75; (@ of rsgra.p 1990 Oct 2 */ /* begin module dalvec.var */ Static _TEXT rsdata; /* output of rseq program */ Static _TEXT dalvecp; /* input of this program */ Static _TEXT symvec; /* output of this program */ Static jmp_buf _JL1; /* end module dalvec.var */ /* begin module halt */ Static Void halt() { /* stop the program. the procedure performs a goto to the end of the program. you must have a label: label 1; declared, and also the end of the program must have this label: 1: end. examples are in the module libraries. this is the only goto in the delila system. */ printf(" program halt.\n"); longjmp(_JL1, 1); } /* end module halt version = 4.75; (@ of rsgra.p 1990 Oct 2 */ /* begin module interact.clearstring */ Static Void clearstring(ribbon) string *ribbon; { /* empty the string */ long index; /* to the ribbon */ for (index = 0; index < maxstring; index++) ribbon->letters[index] = ' '; ribbon->length = 0; ribbon->current = 0; } /* clearstring */ /* end module interact.clearstring version = 4.75; (@ of rsgra.p 1990 Oct 2 */ /* begin module interact.writestring */ Static Void writestring(tofile, s) _TEXT *tofile; string *s; { /* write the string s to file tofile, no writeln */ long i; /* index to s */ long FORLIM; FORLIM = s->length; for (i = 0; i < FORLIM; i++) putc(s->letters[i], tofile->f); } /* writestring */ /* end module interact.writestring version = 4.75; (@ of rsgra.p 1990 Oct 2 */ /* begin module trigger.proc */ /* this module allows one to scan a series of characters, as from an array or a file, and to "trigger" or detect a simple string in the series. the advantage of the trigger is that several triggers can "observe" a stream of characters at once, each looking for a different thing. some other modules required: interact.const, interact.type */ Static Void resettrigger(t) trigger *t; { /* reset the trigger to ground state */ t->state = 0; t->skip = false; t->found = false; } /* resettrigger */ Static Void testfortrigger(ch, t) Char ch; trigger *t; { /* look at the character ch. if it is part of the trigger (at the current trigger state), then the trigger state goes higher. if it is not part of the trigger then the trigger state is reset, skip is true and one should skip onward to find the trigger. if the trigger is found, found is true. */ t->state++; /* if debugging then begin writestring(list,seek); writeln(list,'testfortrigger seek.letters[',state:1,']:', seek.letters[state],' ch:',ch); end;*/ if (t->seek.letters[t->state - 1] == ch) { t->skip = false; if (t->state == t->seek.length) t->found = true; else t->found = false; return; } t->state = 0; t->skip = true; t->found = false; /* reset trigger */ } /* testfortrigger */ /* end module trigger.proc version = 4.75; (@ of rsgra.p 1990 Oct 2 */ /* begin module filler.fillstring */ Static Void fillstring(s, a) string *s; Char *a; { /* this procedure makes it reasonably easy to fill the string s with characters. one calls the procedure as: */ /* 1 2 3 4 5 */ /* 12345678901234567890123456789012345678901234567890 */ /* fillstring(s, 'this-is-the-string '); the two comments make it easy to line the characters up. also, for this example, it was assumed that the length of filler as defined by the constant fillermax was 50. */ long length = fillermax; /* of the string without trailing blanks */ long index; /* of s */ clearstring(s); while (length > 1 && a[length-1] == ' ') length--; if (length == 1 && a[length-1] == ' ') { printf("fillstring: the string is empty\n"); halt(); } for (index = 0; index < length; index++) s->letters[index] = a[index]; s->length = length; s->current = 1; } /* fillstring */ /* end module filler.fillstring version = 4.75; (@ of rsgra.p 1990 Oct 2 */ /* begin module filler.filltrigger */ Static Void filltrigger(t, a) trigger *t; Char *a; { /* fill the trigger t */ fillstring(&t->seek, a); } /* fillstring */ /* end module filler.filltrigger version = 4.75; (@ of rsgra.p 1990 Oct 2 */ /* begin module copyaline */ Static Void copyaline(fin, fout) _TEXT *fin, *fout; { /* copy a line from file fin to file fout */ while (!P_eoln(fin->f)) { putc(P_peek(fin->f), fout->f); getc(fin->f); } fscanf(fin->f, "%*[^\n]"); getc(fin->f); putc('\n', fout->f); } /* copyaline */ /* end module copyaline version = 4.75; (@ of rsgra.p 1990 Oct 2 */ /* ********************************************************************** */ /* begin module dalvec.header */ Static Void header(infile, outfile) _TEXT *infile, *outfile; { /* do the header of the plot. Simply copy the output. Note: this depends on the fact that the input file already has the '*' marks */ long index; /* to lines in the infile */ if (*infile->name != '\0') { if (infile->f != NULL) infile->f = freopen(infile->name, "r", infile->f); else infile->f = fopen(infile->name, "r"); } else rewind(infile->f); if (infile->f == NULL) _EscIO2(FileNotFound, infile->name); RESETBUF(infile->f, Char); if (*outfile->name != '\0') { if (outfile->f != NULL) outfile->f = freopen(outfile->name, "w", outfile->f); else outfile->f = fopen(outfile->name, "w"); } else { if (outfile->f != NULL) rewind(outfile->f); else outfile->f = tmpfile(); } if (outfile->f == NULL) _EscIO2(FileNotFound, outfile->name); SETUPBUF(outfile->f, Char); fprintf(outfile->f, "* dalvec %4.2f\n", version); /* copy header lines to outfile */ for (index = 1; index <= 3; index++) copyaline(infile, outfile); fprintf(outfile->f, "*\n"); fprintf(outfile->f, "* position, number of sequences, information Rs, variance of Rs\n"); } /* end module dalvec.header */ /* ********************************************************************** */ /* ********************************************************************** */ /* begin module rs.readrsrange */ Static Void readrsrange(rsdata, r) _TEXT *rsdata; rstype *r; { /* read the range data from rsdata to r. data is assumed to be the rsdata file from program rseq. */ long index; /* for counting lines of rsdata */ Char skip; /* a character to skip the '*' on the line */ for (index = 1; index <= 11; index++) { fscanf(rsdata->f, "%*[^\n]"); getc(rsdata->f); } fscanf(rsdata->f, "%c%ld%ld%*[^\n]", &skip, &r->rstart, &r->rstop); getc(rsdata->f); /* writeln(output, 'range: ',r.rstart:1,' ',r.rstop:1);*/ if (skip == '\n') skip = ' '; } /* end module rs.readrsrange version = 4.75; (@ of rsgra.p 1990 Oct 2 */ /* begin module rs.getrsbegin */ Static Void getrsbegin(infile) _TEXT *infile; { /* skip to the beginning of the data in a data file from the rseq program */ Char ch; /* a character read from infile */ trigger begindata; /* a trigger to locate the beginning of the data */ /* 1 2 */ /* 123456789012345678901 */ filltrigger(&begindata, "l a c g t"); resettrigger(&begindata); if (*infile->name != '\0') { if (infile->f != NULL) infile->f = freopen(infile->name, "r", infile->f); else infile->f = fopen(infile->name, "r"); } else rewind(infile->f); if (infile->f == NULL) _EscIO2(FileNotFound, infile->name); RESETBUF(infile->f, Char); while (!begindata.found) { if (P_eoln(infile->f)) { fscanf(infile->f, "%*[^\n]"); getc(infile->f); } if (BUFEOF(infile->f)) { printf("beginning of data not found\n"); halt(); } ch = getc(infile->f); if (ch == '\n') ch = ' '; testfortrigger(ch, &begindata); } fscanf(infile->f, "%*[^\n]"); getc(infile->f); } /* end module rs.getrsbegin version = 4.75; (@ of rsgra.p 1990 Oct 2 */ /* begin module rs.readrsdata */ Static Void readrsdata(rsdata, rdata) _TEXT *rsdata; rstype *rdata; { /* read data from the data file of program rseq into the datatype */ fscanf(rsdata->f, "%ld%ld%ld%ld%ld%lg%lg%lg%lg%ld%lg", &rdata->l, &rdata->nal, &rdata->ncl, &rdata->ngl, &rdata->ntl, &rdata->rsl, &rdata->rs, &rdata->varhnb, &rdata->sumvar, &rdata->nl, &rdata->ehnb); /* skip spaces to find the flag: */ while (P_peek(rsdata->f) == ' ') getc(rsdata->f); fscanf(rsdata->f, "%c%*[^\n]", &rdata->flag); getc(rsdata->f); /* writeln(output,'readrsdata: l a c g t flag = ', l:1,' ',nal:1,' ',ncl:1,' ',ngl:1,' ',ntl:1,' ',flag); */ if (rdata->flag == '\n') rdata->flag = ' '; } /* end module rs.readrsdata version = 4.75; (@ of rsgra.p 1990 Oct 2 */ /* ********************************************************************** */ /* ********************************************************************** */ /* begin module dalvec.sign */ Static double sign(a) double a; { /* return the sign of the number a */ if (a >= 0) return 1.0; else return -1.0; } #define bignumber pnum /* number of digits that the chilogo is accurate to */ /* end module dalvec.sign */ /* begin module dalvec.themain */ Static Void themain(rsdata, dalvecp, symvec) _TEXT *rsdata, *dalvecp, *symvec; { /* the main procedure of the program */ boolean chilogo = false; /* if true, do the chi logo */ double chia, chic, chig, chit; /* partial chi square values */ double chitotal; /* sum of the partial chi squares */ double nl4; /* number of bases at l (nl) divided by 4 */ rstype rdata; /* data from rseq */ Char parameter; /* parameter read from dalvecp */ long position; /* a location in the aligned sequence */ long FORLIM; double TEMP; printf("dalvec %4.2f\n", version); if (*rsdata->name != '\0') { if (rsdata->f != NULL) rsdata->f = freopen(rsdata->name, "r", rsdata->f); else rsdata->f = fopen(rsdata->name, "r"); } else rewind(rsdata->f); if (rsdata->f == NULL) _EscIO2(FileNotFound, rsdata->name); RESETBUF(rsdata->f, Char); if (*symvec->name != '\0') { if (symvec->f != NULL) symvec->f = freopen(symvec->name, "w", symvec->f); else symvec->f = fopen(symvec->name, "w"); } else { if (symvec->f != NULL) rewind(symvec->f); else symvec->f = tmpfile(); } if (symvec->f == NULL) _EscIO2(FileNotFound, symvec->name); SETUPBUF(symvec->f, Char); if (*dalvecp->name != '\0') { if (dalvecp->f != NULL) dalvecp->f = freopen(dalvecp->name, "r", dalvecp->f); else dalvecp->f = fopen(dalvecp->name, "r"); } else rewind(dalvecp->f); if (dalvecp->f == NULL) _EscIO2(FileNotFound, dalvecp->name); RESETBUF(dalvecp->f, Char); if (!BUFEOF(dalvecp->f)) { if (!P_eoln(dalvecp->f)) { fscanf(dalvecp->f, "%c%*[^\n]", ¶meter); getc(dalvecp->f); if (parameter == '\n') parameter = ' '; } if (parameter == 'c') chilogo = true; } if (chilogo) printf("Chilogo will be produced\n"); else printf("Normal sequence logo will be produced\n"); if (BUFEOF(rsdata->f)) { printf("empty rsdata file\n"); halt(); return; } /* copy header stuff */ header(rsdata, symvec); fprintf(symvec->f, "4 number of symbols in DNA or RNA\n"); /* find the range of the graph in bases */ if (*rsdata->name != '\0') { if (rsdata->f != NULL) rsdata->f = freopen(rsdata->name, "r", rsdata->f); else rsdata->f = fopen(rsdata->name, "r"); } else rewind(rsdata->f); if (rsdata->f == NULL) _EscIO2(FileNotFound, rsdata->name); RESETBUF(rsdata->f, Char); readrsrange(rsdata, &rdata); getrsbegin(rsdata); /* prepare for reading the rsdata for graphing */ getrsbegin(rsdata); FORLIM = rdata.rstop; /* create the symvec */ for (position = rdata.rstart; position <= FORLIM; position++) { /* skip lines with an '*' */ if (P_peek(rsdata->f) != '*') { readrsdata(rsdata, &rdata); /* move to the next rsdata line and give the data */ if (chilogo) { /* position */ /* partial chi is (observed - expected)^2/expected. expected = nl/4. There is no need to divide by expected since it is renormalized afterwards anyway. */ nl4 = rdata.nl / 4.0; TEMP = rdata.nal - nl4; chia = TEMP * TEMP; TEMP = rdata.ncl - nl4; chic = TEMP * TEMP; TEMP = rdata.ngl - nl4; chig = TEMP * TEMP; TEMP = rdata.ntl - nl4; chit = TEMP * TEMP; chitotal = chia + chic + chig + chit; if (chitotal > 0) { rdata.nal = (long)floor( bignumber * (chia / chitotal) * sign(rdata.nal - nl4) + 0.5); rdata.ncl = (long)floor( bignumber * (chic / chitotal) * sign(rdata.ncl - nl4) + 0.5); rdata.ngl = (long)floor( bignumber * (chig / chitotal) * sign(rdata.ngl - nl4) + 0.5); rdata.ntl = (long)floor( bignumber * (chit / chitotal) * sign(rdata.ntl - nl4) + 0.5); rdata.nl = bignumber; } else { rdata.nal = 1; rdata.ncl = 1; rdata.ngl = 1; rdata.ntl = 1; rdata.nl = 4; } } /* number sequences */ /* information */ /* ( ' ',varhnb:infofield:infodecim); (* variance of rsl *) */ /* variance of rsl in scientific notation: */ fprintf(symvec->f, "%*ld %*ld %*.*f % .*E\n", nfield, rdata.l, infofield, rdata.nl, infofield, infodecim, rdata.rsl, P_max((int)(infofield + 3) - 7, 1), rdata.varhnb); fprintf(symvec->f, "a %*ld\n", nfield, rdata.nal); fprintf(symvec->f, "c %*ld\n", nfield, rdata.ncl); fprintf(symvec->f, "g %*ld\n", nfield, rdata.ngl); fprintf(symvec->f, "t %*ld\n", nfield, rdata.ntl); } } } #undef bignumber /* end module dalvec.themain */ main(argc, argv) int argc; Char *argv[]; { PASCAL_MAIN(argc, argv); if (setjmp(_JL1)) goto _L1; symvec.f = NULL; strcpy(symvec.name, "symvec"); dalvecp.f = NULL; strcpy(dalvecp.name, "dalvecp"); rsdata.f = NULL; strcpy(rsdata.name, "rsdata"); themain(&rsdata, &dalvecp, &symvec); _L1: if (rsdata.f != NULL) fclose(rsdata.f); if (dalvecp.f != NULL) fclose(dalvecp.f); if (symvec.f != NULL) fclose(symvec.f); exit(EXIT_SUCCESS); } /* End. */