/* Output from p2c 1.21alpha-07.Dec.93, the Pascal-to-C translator */ /* From input file "clual.p" */ #include /* clual: clustal to alpro conversion Dr. Thomas D. Schneider National Cancer Institute Laboratory of Experimental and Computational Biology Frederick, Maryland 21702-1201 toms@ncifcrf.gov permanent email: toms@alum.mit.edu http://www.lecb.ncifcrf.gov/~toms/ National Cancer Institute Laboratory of Experimental and Computational Biology */ /* end of program */ /* begin module version */ #define version 1.08 /* of clual.p 2002 Feb 19 2002 Feb 19, 1.00: origin */ #define updateversion 1.00 /* defines lowest acceptable current parameter file */ /* end module version */ /* begin module describe.clual */ /* name clual: clustal to alpro conversion synopsis clual(clustalout: in, clualp: in, protseq: out, output: out) files clustalout: output of the CLUSTAL program protseq: input to the alpro program clualp: parameters to control the program. The file must contain the following parameters, one per line: parameterversion: The version number of the program. This allows the user to be warned if an old parameter file is used. The second line of clualp must match the first line of the clustalout file. This is used to check that the clustalout file is correct. verbose (character): If the first character of the third line is a 'v' then the program will name the segment numbers as it reads it in, and then give the name of each sequence as it is written out. output: messages to the user description Convert from CLUSTAL format to allow one to present COG output as a sequence logo. The CLUSTAL format is broken up into segments. Alpro requires continuous sequences. This program rearranges the CLUSTAL data to the form alpro needs. examples The first line of a clustal file looks like this: CLUSTAL W (1.74) multiple sequence alignment This is used to check that the input is good. documentation see also {example parameter file:} clualp {program that uses the output of this program:} alpro.p {program that finally generates the sequence logo:} makelogo.p {COG:} http://www.ncbi.nlm.nih.gov/COG/ {COG:} ftp://ncbi.nlm.nih.gov/pub/COG/ {an example alignment:} http://www.ncbi.nlm.nih.gov/COG/aln/COG0526.aln {the entire list of alignments, ready to grab:} ftp://ncbi.nlm.nih.gov/pub/COG/aln/ {wget can be used to grab the alignments:} http://www.lecb.ncifcrf.gov/~toms/wget.html {Why one should not use consensus sequences:} http://www.lecb.ncifcrf.gov/~toms/glossary.html#consensus_sequence author Thomas Dana Schneider bugs technical notes The clustal format has waste spaces. If "_" represents a space, then we have at the boundary of two segments: YPR082c_________------------------------------------------------------------ ____________________________________________________________________________ _ BS_resA_________------------------ This program ignores the spaces, but one wonders why they are there ... AH!!! These contain STUPID consensus sequences!!! The program will ignore this idiotic data line. */ /* end module describe.clual */ /* begin module string.const */ #define maxstring 2000 /* the maximum string */ /* end module string.const version = 4.54; (@ of prgmod.p 2001 Aug 29 */ /* begin module filler.const */ #define fillermax 50 /* the size of the filler array for a string */ /* end module filler.const version = 4.54; (@ of prgmod.p 2001 Aug 29 */ /* begin module string.type */ /* pointer to a string */ typedef struct string { /* a string of characters */ Char letters[maxstring]; /* the letters in the string */ long length; /* the number of characters in the string */ long current; /* the letter we are working on */ Char *next; /* the next string in a series */ } string; /* end module string.type version = 4.54; (@ of prgmod.p 2001 Aug 29 */ /* begin module filler.type */ /* the following is an array used to fill a string. it is convenient to have it much shorter than the maxstring, so that it is easy to fill the string using procedure fillstring. the user must declare the value of constant fillermax. */ typedef Char filler[fillermax]; /* end module filler.type version = 4.54; (@ of prgmod.p 2001 Aug 29 */ Static _TEXT clustalout; /* file used by this program */ Static _TEXT clualp; /* file used by this program */ Static _TEXT protseq; /* file used by this program */ Static jmp_buf _JL1; /* begin module halt */ Static Void halt() { /* stop the program. the procedure performs a goto to the end of the program. you must have a label: label 1; declared, and also the end of the program must have this label: 1: end. examples are in the module libraries. this is the only goto in the delila system. */ printf(" program halt.\n"); longjmp(_JL1, 1); } /* end module halt version = 4.54; (@ of prgmod.p 2001 Aug 29 */ /* begin module clearstring */ Static Void clearstring(ribbon) string *ribbon; { /* empty the string */ long index; /* to the ribbon */ for (index = 0; index < maxstring; index++) ribbon->letters[index] = ' '; ribbon->length = 0; ribbon->current = 0; } /* clearstring */ Static Void initializestring(ribbon) string *ribbon; { /* start the string with a nil pointer. This routine should be called before doing linked list work. This allows the standard string routines to clear the string without killing the pointer. */ clearstring(ribbon); ribbon->next = NULL; } /* initializestring */ /* end module clearstring version = 4.54; (@ of prgmod.p 2001 Aug 29 */ /* begin module interact.getstring */ Static Void getstring(afile, buffer, gotten) _TEXT *afile; string *buffer; boolean *gotten; { /* get a line (as a string) from a file not using string calls. this lets one obtain lines from a file without interactive prompts */ long index = 0; /* of buffer */ clearstring(buffer); if (BUFEOF(afile->f)) { *gotten = false; return; } while (!P_eoln(afile->f) && index < maxstring) { index++; buffer->letters[index-1] = getc(afile->f); if (buffer->letters[index-1] == '\n') buffer->letters[index-1] = ' '; } if (!P_eoln(afile->f)) { printf(" getstring: a line exceeds maximum string size (%ld)\n", (long)maxstring); halt(); } buffer->length = index; buffer->current = 1; fscanf(afile->f, "%*[^\n]"); getc(afile->f); *gotten = true; } /* getstring */ /* Local variables for figurestring: */ struct LOC_figurestring { string *line; long power; /* of 10 representing a place value in the number */ } ; Local long figureinteger(first, last, LINK) long first, last; struct LOC_figurestring *LINK; { /* figure the integer in the token */ long i; /* index */ long sum = 0; long increment; LINK->power = 1; /* start at ones place */ /* start sum at zero */ for (i = last - 1; i >= first - 1; i--) { switch (LINK->line->letters[i]) { case '0': increment = 0; break; case '1': increment = 1; break; case '2': increment = 2; break; case '3': increment = 3; break; case '4': increment = 4; break; case '5': increment = 5; break; case '6': increment = 6; break; case '7': increment = 7; break; case '8': increment = 8; break; case '9': increment = 9; break; } sum += LINK->power * increment; LINK->power *= 10; } return sum; } /* figureinteger */ /* end module interact.getstring version = 4.54; (@ of prgmod.p 2001 Aug 29 */ /* begin module interact.figurestring */ Static Void figurestring(line_, first, last, whzat, c, i, r) string *line_; long *first, *last; Char *whzat, *c; long *i; double *r; { /* a string of characters to figure out */ /* first found non-blank character in the line */ /* last character before a blank after first */ /* what the token is */ /* the first character of the token */ /* integer value of token if it is integer; or 0 */ /* the real value if it is real; or 0.0 */ /* figurestring figures out the tokens in a string. it recognizes words, integers, reals and poorly formed numbers. you can easily use it to parse lines. our goal is to figure out what thing is on a string. start looking at the current place on the line. first and last are the first 'token' in line after start. the current place is updated to the letter after last. the thing found is described by the value of whzat: 'c': character (when the token does not begin with a digit, '+', or '-') 'i': integer 'r': real ' ': blank line 'g': garbage, cannot figure it out and the value of the thing found is the appropriate variable */ struct LOC_figurestring V; long numbers[3]; long sign; /* sign of a number */ long numberstart; /* the point a number starts, beyond its sign, if any */ long point = 0; /* location of decimal point */ long l; /* an index for dissecting numbers */ string *WITH; long FORLIM; V.line = line_; P_addset(P_expset(numbers, 0L), '0'); P_addset(numbers, '1'); P_addset(numbers, '2'); P_addset(numbers, '3'); P_addset(numbers, '4'); P_addset(numbers, '5'); P_addset(numbers, '6'); P_addset(numbers, '7'); P_addset(numbers, '8'); P_addset(numbers, '9'); /* c:=' '; i:=0; r:=0.0; do not affect these variables unless necessary */ *whzat = '.'; /* assume that we have someting to work on */ /* now to see if that is true: */ WITH = V.line; if (WITH->length == 0 || WITH->current < 1 || WITH->current > WITH->length) *whzat = ' '; else { /* figure out where the first token is in the line */ *first = V.line->current; while (V.line->letters[*first - 1] == ' ' && *first < V.line->length) (*first)++; if (*first == V.line->length && V.line->letters[*first - 1] == ' ') *whzat = ' '; } if (*whzat == ' ') return; *last = *first; while (V.line->letters[*last - 1] != ' ' && *last < V.line->length) (*last)++; if (V.line->letters[*last - 1] == ' ') (*last)--; /* the token is between inclusive first and last */ *c = V.line->letters[*first - 1]; if (P_inset(*c, numbers) || *c == '-' || *c == '+') { if (*c == '-' || *c == '+') { switch (*c) { case '+': sign = 1; break; case '-': sign = -1; break; } numberstart = *first + 1; } else { sign = 1; numberstart = *first; } *whzat = 'i'; FORLIM = *last; for (l = numberstart; l <= FORLIM; l++) { if (!P_inset(V.line->letters[l-1], numbers)) { if (V.line->letters[l-1] == '.') { /* we found a period */ if (*whzat == 'i') { /* if so far it is numbers */ *whzat = 'r'; /* it is actually real */ point = l; } else *whzat = 'g'; /* it is a second '.', ie garbage */ } else *whzat = 'g'; /* it is garbage */ } } /* if it is only numbers, it is integer */ /* build number */ /* if it ends in a period, it is integer */ if (*whzat == 'r' && point == *last) *whzat = 'i'; if (*whzat == 'i') { if (point == *last) /* had an ending decimal point */ *i = sign * figureinteger(numberstart, *last - 1, &V); else *i = sign * figureinteger(numberstart, *last, &V); *r = *i; } else if (*whzat == 'r') { *i = figureinteger(numberstart, point - 1, &V); *r = sign * (*i + (double)figureinteger(point + 1, *last, &V) / V.power); *i *= sign; } } else *whzat = 'c'; /* move the start to just beyond the last character of the token */ V.line->current = *last + 1; } /* figurestring */ /* end module interact.figurestring version = 4.54; (@ of prgmod.p 2001 Aug 29 */ /* begin module interact.token */ Static Void token(buffer, atoken, gotten) string *buffer, *atoken; boolean *gotten; { /* get a token from the buffer */ /* variables for calling figurestring: */ long first, last; Char what, cha; long int_; double rea; long index; /* to the buffer */ figurestring(buffer, &first, &last, &what, &cha, &int_, &rea); if (what == ' ') { *gotten = false; return; } clearstring(atoken); for (index = first; index <= last; index++) atoken->letters[index - first] = buffer->letters[index-1]; atoken->length = last - first + 1; atoken->current = 1; *gotten = true; } /* end module interact.token version = 4.54; (@ of prgmod.p 2001 Aug 29 */ /* begin module writestring */ Static Void writestring(tofile, s) _TEXT *tofile; string *s; { /* write the string s to file tofile, no writeln */ long i; /* index to s */ long FORLIM; FORLIM = s->length; for (i = 0; i < FORLIM; i++) putc(s->letters[i], tofile->f); } /* writestring */ /* end module writestring version = 4.54; (@ of prgmod.p 2001 Aug 29 */ /* begin module filler.fillstring */ Static Void fillstring(s, a) string *s; Char *a; { /* this procedure makes it reasonably easy to fill the string s with characters. one calls the procedure as: */ /* 1 2 3 4 5 */ /* 12345678901234567890123456789012345678901234567890 */ /* fillstring(s, 'this-is-the-string '); the two comments make it easy to line the characters up. also, for this example, it was assumed that the length of filler as defined by the constant fillermax was 50. */ long length = fillermax; /* of the string without trailing blanks */ long index; /* of s */ clearstring(s); while (length > 1 && a[length-1] == ' ') length--; if (length == 1 && a[length-1] == ' ') { printf("fillstring: the string is empty\n"); halt(); } for (index = 0; index < length; index++) s->letters[index] = a[index]; s->length = length; s->current = 1; } /* fillstring */ /* end module filler.fillstring version = 4.54; (@ of prgmod.p 2001 Aug 29 */ /* begin module equalstring */ Static boolean equalstring(a, b) string a, b; { /* Test for equality between two strings at current positions. NOTE: A compiler bug results if one directly tests this way: if thedefinition^.nametag = aname The reason is completely not clear! I showed that the parts of the strings were identical, but the whole was not by this test. For this reason it is *critical to test strings with equalstring. */ long index; /* index to both strings */ boolean equal; /* are letters in a and b the same? */ if (a.length == b.length) { index = 1; do { equal = (a.letters[index-1] == b.letters[index-1]); index++; } while (equal && index <= a.length); return equal; } else return false; } /* equalstring */ /* end module equalstring version = 4.54; (@ of prgmod.p 2001 Aug 29 */ /* begin module copystring */ Static Void copystring(a, b) string a, *b; { /* copy string a to b */ long l; /* index to the string */ b->length = a.length; for (l = 0; l < a.length; l++) b->letters[l] = a.letters[l]; } /* end module copystring version = 4.54; (@ of prgmod.p 2001 Aug 29 */ /* begin module skipblanks */ Static Void skipblanks(thefile) _TEXT *thefile; { /* skip over blanks until a non-blank, or end of line, is found */ while ((P_peek(thefile->f) == ' ') & (!P_eoln(thefile->f))) getc(thefile->f); } Static Void skipnonblanks(thefile) _TEXT *thefile; { /* skip over nonblanks until a blank, or end of line, is found */ while ((P_peek(thefile->f) != ' ') & (!P_eoln(thefile->f))) getc(thefile->f); } Static Void skipcolumn(thefile) _TEXT *thefile; { /* skip over a data column */ skipblanks(thefile); skipnonblanks(thefile); } /* pointer to a protein string */ typedef struct protein { /* a protein string */ string name; /* the name of the protein */ Char *data; /* the sequence of the protein */ Char *lastdata; /* the end of the data string */ long *next; /* the next protein */ } protein; /* end module skipblanks version = 4.54; (@ of prgmod.p 2001 Aug 29 */ /* begin module clual.themain */ Static Void themain(clustalout, clualp, protseq) _TEXT *clustalout, *clualp, *protseq; { /* the main procedure of the program */ string aline; /* a line of data from clustalout */ string clustalid; /* the first line of a clustal file */ Char *d; /* pointer to a data string in proteins */ boolean gotten; /* a line was gotten from clustalout? */ boolean newsegment = false; /* we are now between or at the star of a segment */ double parameterversion; /* parameter version number */ protein *p = NULL; /* pointer to proteins */ protein *proteins = NULL; /* the collection of aligned protein sequences, arranged so that they can be output continuously */ long protcount = 0; /* number of proteins */ long segment = 0; /* the current segment of the alignment */ string thedata; /* the data on the data line */ string thename; /* the name of the data line */ Char verbose; /* if 'v' then report segments and names to output */ Char STR1[256], STR2[256], STR3[256]; _TEXT TEMP; long *WITH; printf("clual %4.2f\n", version); if (*clualp->name != '\0') { if (clualp->f != NULL) clualp->f = freopen(clualp->name, "r", clualp->f); else clualp->f = fopen(clualp->name, "r"); } else rewind(clualp->f); if (clualp->f == NULL) _EscIO2(FileNotFound, clualp->name); RESETBUF(clualp->f, Char); fscanf(clualp->f, "%lg%*[^\n]", ¶meterversion); getc(clualp->f); if ((long)floor(100 * parameterversion + 0.5) < (long)floor(100.0 + 0.5)) { printf("You have an old parameter file!\n"); halt(); } if (*protseq->name != '\0') { if (protseq->f != NULL) protseq->f = freopen(protseq->name, "w", protseq->f); else protseq->f = fopen(protseq->name, "w"); } else { if (protseq->f != NULL) rewind(protseq->f); else protseq->f = tmpfile(); } if (protseq->f == NULL) _EscIO2(FileNotFound, protseq->name); SETUPBUF(protseq->f, Char); fprintf(protseq->f, "> clual %4.2f\n", version); if (*clustalout->name != '\0') { if (clustalout->f != NULL) clustalout->f = freopen(clustalout->name, "r", clustalout->f); else clustalout->f = fopen(clustalout->name, "r"); } else rewind(clustalout->f); if (clustalout->f == NULL) _EscIO2(FileNotFound, clustalout->name); RESETBUF(clustalout->f, Char); /* check that the first line of the file is a clustal file */ getstring(clualp, &clustalid, &gotten); if (!gotten) { printf("The second line of clualp must match the first line\n"); printf("of the clustalout file.\n"); printf("This is used to check that the clustalout file is correct.\n"); halt(); } fscanf(clualp->f, "%c%*[^\n]", &verbose); getc(clualp->f); if (verbose == '\n') verbose = ' '; getstring(clustalout, &aline, &gotten); if (!gotten) { printf("clustalout is empty?\n"); halt(); } if (!equalstring(aline, clustalid)) { printf("clustalout is not a correct file\n"); printf("The first line MUST be:\n"); TEMP.f = stdout; *TEMP.name = '\0'; writestring(&TEMP, &clustalid); printf("\nbut this was found instead:\n"); TEMP.f = stdout; *TEMP.name = '\0'; writestring(&TEMP, &aline); putchar('\n'); halt(); /* we have not started yet */ /* we have not started yet */ } while (!BUFEOF(clustalout->f)) { /* skip consensus lines */ if (!P_eoln(clustalout->f)) { /* read to the end of the line */ if (P_peek(clustalout->f) == ' ') { while (!P_eoln(clustalout->f)) getc(clustalout->f); } } if (P_eoln(clustalout->f)) { fscanf(clustalout->f, "%*[^\n]"); getc(clustalout->f); newsegment = true; p = NULL; /* we are not at any protein at the moment */ continue; } /* skip blank material between segments */ /* absorb a segment */ if (newsegment) { segment++; newsegment = false; if (verbose == 'v') printf("segment %ld\n", segment); if (p != NULL) { printf("segment %ld is too short!\n", segment); halt(); } } if (proteins == NULL) { proteins = (protein *)Malloc(sizeof(protein)); /* start the list */ clearstring(&proteins->name); proteins->data = (Char *)Malloc(256); proteins->lastdata = NULL; proteins->next = NULL; p = proteins; if (segment == 1) protcount = 1; } else { if (p == NULL) { /* start working in this segment */ p = proteins; } else { if (p->next == NULL) { p->next = (long *)Malloc(sizeof(long)); WITH = p->next; /* p2c: clual.p, line 609: Warning: Expected END, found a ':=' [227] */ /* p2c: clual.p, line 605: * Warning: Argument of WITH is not a RECORD [264] */ /* p2c: clual.p, line 606: Warning: Symbol 'NAME' is not defined [221] */ /* p2c: clual.p, line 606: * Warning: Type mismatch in VAR parameter ribbon [295] */ clearstring(&name); /* p2c: clual.p, line 607: Warning: Symbol 'DATA' is not defined [221] */ data = Malloc(sizeof(long)); lastdata = NULL; /* p2c: clual.p, line 608: * Warning: Symbol 'LASTDATA' is not defined [221] */ /* p2c: clual.p, line 609: * Warning: Symbol 'NEXT' is not of the appropriate class [222] */ } /* start the next protein */ p = (protein *)p->next; if (segment == 1) protcount++; } } /* read one data line */ getstring(clustalout, &aline, &gotten); if (!gotten) { printf("clual: could not read line correctly?\n"); printf("in segment %ld the line is:\n", segment); TEMP.f = stdout; *TEMP.name = '\0'; writestring(&TEMP, &aline); putchar('\n'); halt(); } clearstring(&thename); token(&aline, &thename, &gotten); if (!gotten) { printf("clual: could not read name correctly?\n"); printf("in segment %ld\n", segment); printf("the line read is:\n"); TEMP.f = stdout; *TEMP.name = '\0'; writestring(&TEMP, &aline); printf("\nthe PREVIOUS name read is:\n"); TEMP.f = stdout; *TEMP.name = '\0'; writestring(&TEMP, &thename); printf("\nthe PREVIOUS data read is:\n"); TEMP.f = stdout; *TEMP.name = '\0'; writestring(&TEMP, &thedata); putchar('\n'); halt(); } token(&aline, &thedata, &gotten); if (!gotten) { printf("clual: could not read data correctly?\n"); printf("in segment %ld\n", segment); printf("the line read is:\n"); TEMP.f = stdout; *TEMP.name = '\0'; writestring(&TEMP, &aline); printf("\nthe PREVIOUS name read is:\n"); TEMP.f = stdout; *TEMP.name = '\0'; writestring(&TEMP, &thename); printf("\nthe PREVIOUS data read is:\n"); TEMP.f = stdout; *TEMP.name = '\0'; writestring(&TEMP, &thedata); putchar('\n'); halt(); } if (p->name.length == 0) { /* set up the name the first time */ copystring(thename, &p->name); p->data = (Char *)Malloc(256); /* p2c: clual.p, line 668: * Warning: Type mismatch in VAR parameter ribbon [295] */ clearstring(p->data); p->lastdata = p->data; /* p2c: clual.p, line 670: * Warning: Type mismatch in VAR parameter b [295] */ copystring(thedata, p->data); *p->lastdata = '\0'; /* p2c: clual.p, line 671: Warning: Mixing non-strings with strings [170] */ continue; } /* fill in the next data segment */ if (!equalstring(p->name, thename)) { printf("name \""); TEMP.f = stdout; *TEMP.name = '\0'; writestring(&TEMP, &p->name); printf("\"\n"); printf("DOES NOT MATCH PREVIOUS NAME\n"); printf("name \""); TEMP.f = stdout; *TEMP.name = '\0'; writestring(&TEMP, &thename); printf("\"\n"); halt(); } /* p2c: clual.p, line 686: Warning: Mixing non-strings with strings [170] */ strcpy(p->lastdata, Malloc(256)); p->lastdata = p->lastdata; /* p2c: clual.p, line 688: * Warning: Type mismatch in VAR parameter ribbon [295] */ clearstring(p->lastdata); /* p2c: clual.p, line 689: * Warning: Type mismatch in VAR parameter b [295] */ copystring(thedata, p->lastdata); *p->lastdata = '\0'; /* p2c: clual.p, line 690: Warning: Mixing non-strings with strings [170] */ } if (verbose != 'v') { printf("%ld segment", segment); if (segment > 1) putchar('s'); printf("\n%ld protein", protcount); if (protcount > 1) putchar('s'); putchar('\n'); } /* write the results out */ p = proteins; while (p != NULL) { if (verbose == 'v') { TEMP.f = stdout; *TEMP.name = '\0'; writestring(&TEMP, &p->name); putchar('\n'); } fprintf(protseq->f, "> "); writestring(protseq, &p->name); putc('\n', protseq->f); d = p->data; while (d != NULL) { /* p2c: clual.p, line 717: * Warning: Type mismatch in VAR parameter s [295] */ writestring(protseq, d); putc('\n', protseq->f); if (d == NULL) { /* p2c: clual.p, line 720: Warning: Mixing non-strings with strings [170] */ if (d != p->lastdata) { printf("ERROR: lastdata is not end of list\n"); halt(); /* p2c: clual.p: Warning: Mixing non-strings with strings [170] */ /* p2c: clual.p: Warning: Mixing non-strings with strings [170] */ /* p2c: clual.p: Warning: Mixing non-strings with strings [170] */ } } d = d; } p = (protein *)p->next; } } /* end module clual.themain */ main(argc, argv) int argc; Char *argv[]; { PASCAL_MAIN(argc, argv); if (setjmp(_JL1)) goto _L1; protseq.f = NULL; strcpy(protseq.name, "protseq"); clualp.f = NULL; strcpy(clualp.name, "clualp"); clustalout.f = NULL; strcpy(clustalout.name, "clustalout"); themain(&clustalout, &clualp, &protseq); _L1: if (clustalout.f != NULL) fclose(clustalout.f); if (clualp.f != NULL) fclose(clualp.f); if (protseq.f != NULL) fclose(protseq.f); exit(EXIT_SUCCESS); } /* duplicate for reference: proteinptr = ^protein; (* pointer to a protein string *) protein = record (* a protein string *) name: string; (* the name of the protein *) data: stringptr; (* the sequence of the protein *) lastdata: stringptr; (* the end of the data string *) next: ^protein; (* the next protein *) end; */ /* End. */