/* Output from p2c 1.21alpha-07.Dec.93, the Pascal-to-C translator */ /* From input file "dbfilter.p" */ #include /* dbfilter: filter GenBank databases to remove unwanted entries Mike Stephens, 1989 */ /* end of program */ /* begin module version */ #define version 1.08 /* of dbfilter.p 1992 November 1 origin 1989 August 29 */ /* end module version */ /* begin module describe.dbfilter */ /* name dbfilter: filter GenBank databases to remove unwanted entries synopsis dbfilter(input: in: output: out, dbfilterp: in) files input: a database of GenBank entries output: database after the filtration. When errors occur, the program halts and produces an error message at the end of the output file. dbfilterp: parameters to control the program FIRST LINE: the name of the organism to use, consisting of two parts (eg, Homo sapiens). description GenBank entries in input that contain the requested organsim are copied to output. The GenBank ORGANISM contains the two part genus/species name, such as: ORGANISM Homo sapiens Entries of an unwanted ORGANISM type are not copied from input to output. Those of the desired type are transferred directly. examples If dbfilterp contains: Homo sapiens then only those entries with the ORGANISM type Homo sapiens will be copied into output. All others will be filtered out. documentation none see also dbinst.p dbbk.p author R. Michael Stephens bugs Error messages are buried at the bottom of the output file. technical notes Constant maxlines determines the greatest number of lines that can be handled between LOCUS and ORGANISM. */ /* end module describe.dbfilter */ /* ************************************************************************ */ /* ************************************************************************ */ /* begin module dbfilter.const */ #define maxlines 20 /* maximum number of lines that can be handled between LOCUS and ORGANISM by the buffer */ /* end module dbfilter.const */ /* begin module interact.const */ #define maxstring 150 /* the maximum string */ /* end module interact.const version = 4.11; (@ of prgmod.p 1991 Apr 22 */ /* begin module filler.const */ #define fillermax 50 /* the size of the filler array for a string */ /* end module filler.const version = 4.11; (@ of prgmod.p 1991 Apr 22 */ /* begin module interact.type */ typedef struct string { /* a string of characters */ Char letters[maxstring]; /* the letters in the string */ long length; /* the number of characters in the string */ long current; /* the letter we are working on */ } string; /* end module interact.type version = 4.11; (@ of prgmod.p 1991 Apr 22 */ /* begin module filler.type */ /* the following is an array used to fill a string. it is convenient to have it much shorter than the maxstring, so that it is easy to fill the string using procedure fillstring. the user must declare the value of constant fillermax. */ typedef Char filler[fillermax]; /* end module filler.type version = 4.11; (@ of prgmod.p 1991 Apr 22 */ /* begin module dbfilter.type */ typedef string linebuffer[maxlines]; /* to store the lines between LOCUS and ORGANISM until we decide whether to copy them or not */ /* end module dbfilter.type */ /* begin module dbfilter.var */ Static _TEXT dbfilterp; /* parameter file */ Static jmp_buf _JL1; /* end module dbfilter.var */ /* ************************************************************************ */ /* ************************************************************************ */ /* begin module halt */ Static Void halt() { /* stop the program. the procedure performs a goto to the end of the program. you must have a label: label 1; declared, and also the end of the program must have this label: 1: end. examples are in the module libraries. this is the only goto in the delila system. */ printf(" program halt.\n"); longjmp(_JL1, 1); } /* end module halt version = 4.11; (@ of prgmod.p 1991 Apr 22 */ /* ************************************************************************ */ /* ************************************************************************ */ /* begin module interact.clearstring */ Static Void clearstring(ribbon) string *ribbon; { /* empty the string */ long index; /* to the ribbon */ for (index = 0; index < maxstring; index++) ribbon->letters[index] = ' '; ribbon->length = 0; ribbon->current = 0; } /* clearstring */ /* Local variables for figurestring: */ struct LOC_figurestring { string *line; long power; /* of 10 representing a place value in the number */ } ; Local long figureinteger(first, last, LINK) long first, last; struct LOC_figurestring *LINK; { /* figure the integer in the token */ long i; /* index */ long sum = 0; long increment; LINK->power = 1; /* start at ones place */ /* start sum at zero */ for (i = last - 1; i >= first - 1; i--) { switch (LINK->line->letters[i]) { case '0': increment = 0; break; case '1': increment = 1; break; case '2': increment = 2; break; case '3': increment = 3; break; case '4': increment = 4; break; case '5': increment = 5; break; case '6': increment = 6; break; case '7': increment = 7; break; case '8': increment = 8; break; case '9': increment = 9; break; } sum += LINK->power * increment; LINK->power *= 10; } return sum; } /* figureinteger */ /* end module interact.clearstring version = 4.11; (@ of prgmod.p 1991 Apr 22 */ /* begin module interact.figurestring */ Static Void figurestring(line_, first, last, whzat, c, i, r) string *line_; long *first, *last; Char *whzat, *c; long *i; double *r; { /* a string of characters to figure out */ /* first found non-blank character in the line */ /* last character before a blank after first */ /* what the token is */ /* the first character of the token */ /* integer value of token if it is integer; or 0 */ /* the real value if it is real; or 0.0 */ /* figurestring figures out the tokens in a string. it recognizes words, integers, reals and poorly formed numbers. you can easily use it to parse lines. our goal is to figure out what thing is on a string. start looking at the current place on the line. first and last are the first 'token' in line after start. the current place is updated to the letter after last. the thing found is described by the value of whzat: 'c': character (when the token does not begin with a digit, '+', or '-') 'i': integer 'r': real ' ': blank line 'g': garbage, cannot figure it out and the value of the thing found is the appropriate variable */ struct LOC_figurestring V; long numbers[3]; long sign; /* sign of a number */ long numberstart; /* the point a number starts, beyond its sign, if any */ long point = 0; /* location of decimal point */ long l; /* an index for dissecting numbers */ string *WITH; long FORLIM; V.line = line_; P_addset(P_expset(numbers, 0L), '0'); P_addset(numbers, '1'); P_addset(numbers, '2'); P_addset(numbers, '3'); P_addset(numbers, '4'); P_addset(numbers, '5'); P_addset(numbers, '6'); P_addset(numbers, '7'); P_addset(numbers, '8'); P_addset(numbers, '9'); /* c:=' '; i:=0; r:=0.0; do not affect these variables unless necessary */ *whzat = '.'; /* assume that we have someting to work on */ /* now to see if that is true: */ WITH = V.line; if (WITH->length == 0 || WITH->current < 1 || WITH->current > WITH->length) *whzat = ' '; else { /* figure out where the first token is in the line */ *first = V.line->current; while (V.line->letters[*first - 1] == ' ' && *first < V.line->length) (*first)++; if (*first == V.line->length && V.line->letters[*first - 1] == ' ') *whzat = ' '; } if (*whzat == ' ') return; *last = *first; while (V.line->letters[*last - 1] != ' ' && *last < V.line->length) (*last)++; if (V.line->letters[*last - 1] == ' ') (*last)--; /* the token is between inclusive first and last */ *c = V.line->letters[*first - 1]; if (P_inset(*c, numbers) || *c == '-' || *c == '+') { if (*c == '-' || *c == '+') { switch (*c) { case '+': sign = 1; break; case '-': sign = -1; break; } numberstart = *first + 1; } else { sign = 1; numberstart = *first; } *whzat = 'i'; FORLIM = *last; for (l = numberstart; l <= FORLIM; l++) { if (!P_inset(V.line->letters[l-1], numbers)) { if (V.line->letters[l-1] == '.') { /* we found a period */ if (*whzat == 'i') { /* if so far it is numbers */ *whzat = 'r'; /* it is actually real */ point = l; } else *whzat = 'g'; /* it is a second '.', ie garbage */ } else *whzat = 'g'; /* it is garbage */ } } /* if it is only numbers, it is integer */ /* build number */ /* if it ends in a period, it is integer */ if (*whzat == 'r' && point == *last) *whzat = 'i'; if (*whzat == 'i') { if (point == *last) /* had an ending decimal point */ *i = sign * figureinteger(numberstart, *last - 1, &V); else *i = sign * figureinteger(numberstart, *last, &V); *r = *i; } else if (*whzat == 'r') { *i = figureinteger(numberstart, point - 1, &V); *r = sign * (*i + (double)figureinteger(point + 1, *last, &V) / V.power); *i *= sign; } } else *whzat = 'c'; /* move the start to just beyond the last character of the token */ V.line->current = *last + 1; } /* figurestring */ /* end module interact.figurestring version = 4.11; (@ of prgmod.p 1991 Apr 22 */ /* begin module interact.writestring */ Static Void writestring(tofile, s) _TEXT *tofile; string *s; { /* write the string s to file tofile, no writeln */ long i; /* index to s */ long FORLIM; FORLIM = s->length; for (i = 0; i < FORLIM; i++) putc(s->letters[i], tofile->f); } /* writestring */ /* end module interact.writestring version = 4.11; (@ of prgmod.p 1991 Apr 22 */ /* begin module interact.token */ Static Void token(buffer, atoken, gotten) string *buffer, *atoken; boolean *gotten; { /* get a token from the buffer */ /* variables for calling figurestring: */ long first, last; Char what, cha; long int_; double rea; long index; /* to the buffer */ figurestring(buffer, &first, &last, &what, &cha, &int_, &rea); if (what == ' ') { *gotten = false; return; } clearstring(atoken); for (index = first; index <= last; index++) atoken->letters[index - first] = buffer->letters[index-1]; atoken->length = last - first + 1; atoken->current = 1; *gotten = true; } /* end module interact.token version = 4.11; (@ of prgmod.p 1991 Apr 22 */ /* begin module interact.getstring */ Static Void getstring(afile, buffer, gotten) _TEXT *afile; string *buffer; boolean *gotten; { /* get a string from a file not using string calls. this lets one obtain lines from a file without interactive prompts */ long index = 0; /* of buffer */ clearstring(buffer); if (BUFEOF(afile->f)) { *gotten = false; return; } while (!P_eoln(afile->f) && index < maxstring) { index++; buffer->letters[index-1] = getc(afile->f); if (buffer->letters[index-1] == '\n') buffer->letters[index-1] = ' '; } if (!P_eoln(afile->f)) { printf(" getstring: a line exceeds maximum string size (%ld)\n", (long)maxstring); halt(); } buffer->length = index; buffer->current = 1; fscanf(afile->f, "%*[^\n]"); getc(afile->f); *gotten = true; } /* getstring */ /* end module interact.getstring version = 4.11; (@ of prgmod.p 1991 Apr 22 */ /* begin module equalstring */ Static boolean equalstring(a, b) string a, b; { /* test for equality between two strings at current positions */ long index; /* index to both strings */ boolean equal; /* are letters in a and b the same? */ if (a.length == b.length) { index = 1; do { equal = (a.letters[index-1] == b.letters[index-1]); index++; } while (equal && index <= a.length); return equal; } else return false; } /* equalstring */ /* end module equalstring version = 4.11; (@ of prgmod.p 1991 Apr 22 */ /* begin module copystring */ Static Void copystring(a, b) string a, *b; { /* copy string a to b */ long l; /* index to the string */ b->length = a.length; for (l = 0; l < a.length; l++) b->letters[l] = a.letters[l]; } /* end module copystring version = 4.11; (@ of prgmod.p 1991 Apr 22 */ /* begin module filler.fillstring */ Static Void fillstring(s, a) string *s; Char *a; { /* this procedure makes it reasonably easy to fill the string s with characters. one calls the procedure as: */ /* 1 2 3 4 5 */ /* 12345678901234567890123456789012345678901234567890 */ /* fillstring(s, 'this-is-the-string '); the two comments make it easy to line the characters up. also, for this example, it was assumed that the length of filler as defined by the constant fillermax was 50. */ long length = fillermax; /* of the string without trailing blanks */ long index; /* of s */ clearstring(s); while (length > 1 && a[length-1] == ' ') length--; if (length == 1 && a[length-1] == ' ') { printf("fillstring: the string is empty\n"); halt(); } for (index = 0; index < length; index++) s->letters[index] = a[index]; s->length = length; s->current = 1; } /* fillstring */ /* end module filler.fillstring version = 4.11; (@ of prgmod.p 1991 Apr 22 */ /* ************************************************************************ */ /* ************************************************************************ */ /* begin module dbfilter.writeparameters */ Static Void writeparameters(tofile, genus, species) _TEXT *tofile; string genus, species; { /* write the paramters to file tofile */ fprintf(tofile->f, "* searching for organism "); writestring(tofile, &genus); putc(' ', tofile->f); writestring(tofile, &species); putc('\n', tofile->f); } /* end module dbfilter.writeparameters */ /* begin module dbfilter.readparameters */ Static Void readparameters(dbfilterp, genus, species) _TEXT *dbfilterp; string *genus, *species; { /* read the parameters from dbfilterp */ boolean gotten; /* was the string gotten from the file? */ string buffer; /* a line from dbfilterp */ /* get the feature */ if (*dbfilterp->name != '\0') { if (dbfilterp->f != NULL) dbfilterp->f = freopen(dbfilterp->name, "r", dbfilterp->f); else dbfilterp->f = fopen(dbfilterp->name, "r"); } else rewind(dbfilterp->f); if (dbfilterp->f == NULL) _EscIO2(FileNotFound, dbfilterp->name); RESETBUF(dbfilterp->f, Char); getstring(dbfilterp, &buffer, &gotten); if (!gotten) { printf("empty dbfilterp\n"); halt(); } /* extract the genus from the line */ token(&buffer, genus, &gotten); if (!gotten) { printf("missing first parameter\n"); halt(); } /* extract the species from the line */ token(&buffer, species, &gotten); if (!gotten) { printf("missing second parameter\n"); halt(); } } /* end module dbfilter.readparameters */ /* ************************************************************************ */ /* ************************************************************************ */ /* begin module dbfilter.themain */ Static Void themain(fin, fout, dbfilterp) _TEXT *fin, *fout, *dbfilterp; { /* the main procedure of the program */ string atoken; /* first token of a given string */ string buffer; /* line of the fin file */ string endentry; /* trigger string for fin entry ends */ string foundgenus; /* the string to be compared with genus */ string foundspecies; /* the string to be compared with species */ string genus; /* first token in the organism name */ boolean gotten; /* was a line really there? */ long index; /* index for storage array */ string locus; /* trigger string for LOCUS name */ string newtoken; /* the token to compare for the end of the entry */ string organism; /* trigger string for ORGANISM */ string species; /* second token in the organism name */ linebuffer storage; /* temporary buffer for text */ boolean tokenfound; /* was a token gotten? */ long x; /* loop control variable for index */ printf("dbfilter %4.2f\n", version); if (*fin->name != '\0') { if (fin->f != NULL) fin->f = freopen(fin->name, "r", fin->f); else fin->f = fopen(fin->name, "r"); } else rewind(fin->f); if (fin->f == NULL) _EscIO2(FileNotFound, fin->name); RESETBUF(fin->f, Char); if (*fout->name != '\0') { if (fout->f != NULL) fout->f = freopen(fout->name, "w", fout->f); else fout->f = fopen(fout->name, "w"); } else { if (fout->f != NULL) rewind(fout->f); else fout->f = tmpfile(); } if (fout->f == NULL) _EscIO2(FileNotFound, fout->name); SETUPBUF(fout->f, Char); /* fill the strings to be used as identifiers */ /* 1 2 3 4 5 */ /* 12345678901234567890123456789012345678901234567890 */ fillstring(&locus, "LOCUS "); fillstring(&organism, "ORGANISM "); fillstring(&endentry, "// "); /* read the parameters from the dbfilterp file */ readparameters(dbfilterp, &genus, &species); writeparameters(fout, genus, species); while (!BUFEOF(fin->f)) { getstring(fin, &buffer, &gotten); if (!gotten) continue; token(&buffer, &atoken, &tokenfound); if (!equalstring(atoken, locus)) continue; index = 1; copystring(buffer, &storage[index-1]); writestring(fout, &buffer); putc('\n', fout->f); while (!equalstring(atoken, organism)) { getstring(fin, &buffer, &gotten); index++; if (index > maxlines) { fprintf(fout->f, "buffer capacity exceeded increase constant maxlines\n"); halt(); } copystring(buffer, &storage[index-1]); token(&buffer, &atoken, &tokenfound); if (!(tokenfound & equalstring(atoken, organism))) continue; token(&buffer, &foundgenus, &tokenfound); if (!tokenfound) continue; token(&buffer, &foundspecies, &tokenfound); if (!tokenfound) continue; writestring(fout, &genus); putc(' ', fout->f); writestring(fout, &species); putc('\n', fout->f); if (!(equalstring(genus, foundgenus) & equalstring(species, foundspecies))) continue; for (x = 0; x < index; x++) { writestring(fout, &storage[x]); putc('\n', fout->f); } do { getstring(fin, &buffer, &gotten); if (!gotten) { fprintf(fout->f, "incomplete entry\n"); halt(); } writestring(fout, &buffer); putc('\n', fout->f); token(&buffer, &newtoken, &tokenfound); if (!tokenfound) clearstring(&newtoken); } while (!equalstring(newtoken, endentry)); } } } /* end module dbfilter.themain */ main(argc, argv) int argc; Char *argv[]; { _TEXT TEMP, TEMP1; PASCAL_MAIN(argc, argv); if (setjmp(_JL1)) goto _L1; dbfilterp.f = NULL; strcpy(dbfilterp.name, "dbfilterp"); TEMP.f = stdin; *TEMP.name = '\0'; TEMP1.f = stdout; *TEMP1.name = '\0'; themain(&TEMP, &TEMP1, &dbfilterp); _L1: if (dbfilterp.f != NULL) fclose(dbfilterp.f); exit(EXIT_SUCCESS); } /* End. */