/* Output from p2c 1.21alpha-07.Dec.93, the Pascal-to-C translator */ /* From input file "exon.p" */ #include /* exon: determine lengths of exons in GenBank entries Dr. Thomas D. Schneider National Institutes of Health National Cancer Institute Center for Cancer Research Nanobiology Program Molecular Information Theory Group Frederick, Maryland 21702-1201 toms@ncifcrf.gov permanent email: toms@alum.mit.edu (use only if first address fails) http://www.ccrnp.ncifcrf.gov/~toms/ */ /* end of program */ /* begin module version */ #define version 2.32 /* of exon.p 2007 Dec 10 2007 Dec 10 2.32: fix @ 2007 Dec 10 2.31: stop converting last exon to gene when mRNA (at least) 2007 Dec 10 2.30: allow m, R, r for mRNA. 2007 Dec 10 2.29: tell user exonp is being upgraded 2007 Dec 10 2.28: testing 2007 Dec 10 2.27: version upgrading 2007 Dec 10 2.26: allow mRNA analysis. 2007 Dec 05 2.25: put exon features into heap. see tech. notes 2007 Dec 05 2.24: increase exonmax was too big (100000) see tech. notes 2007 Nov 24 2.23: Segmentation Fault - exonmax was too big (100000) do fast skip of sequence 2007 Nov 23 2.22: increase exonmax 2006 Oct 25 2.21: allow for strain name in organism: ORGANISM Escherichia coli K12 gives E.coli-K12 2000 Nov 2: 2.20: clean up output 2000 Nov 2: 2.19: bug for U41218 2000 May 4: 2.14: cleaned up code so that naming really works 2000 May 3: 2.09: /note is used to name exons 2000 May 3: 2.06: sort exons into ascending order. 2000 May 3: 2.05: names are now put into einst, ainst, dinst 1999 December 1: 2.04: upgrade documentation 1999 June 1: 2.02: three options for exon names 1999 May 4: 1.98: einst file added 1999 April 26: 1.96: exon complements handled 1998 June 3: /gene name is now used for CDS. major use: 1994 November 12 origin 1994 April 10 */ #define updateversion 1.26 /* defines lowest acceptable current parameter file */ /* end module version */ /* begin module describe.exon */ /* name exon: determine lengths of exons in GenBank entries synopsis exon(exonp: inout, db: in, dinst: out, ainst: out, einst: out, lengths: out, exonfeatures: out output: out) files exonp: parameters to control the program, one per line: 0: parameterversion: The version number of the program. This allows the user to be warned if an old parameter file is used. (Introduced 2007 Dec 10.) 1: If 'n' then the end exons are not included. These do not have reliable lengths. Even if end exons are included, the program will never add the Delila instructions for the very ends of the CDS, because these are not reliable. Often they are CAP or polyA sites. Specifically, the first coordinate of the CDS is likely to be a CAP and so should not be added to the acceptors in ainst, while the last coordinate of the CDS is likely to be a polyA site and so should not be added to the donors in dinst. 2: if 'd' then gobs of debugging output are printed to the output file. 3: Two constants, theDfromrange and theDtorange, that determine the from and to range to be written for Donor Delila instructions. 4: Two constants, theAfromrange and theAtorange, that determine the from and to range to be written for Acceptor Delila instructions. 5: If the first character is 'e' then exon features are also used. If the second character is 'i' then intron features are also used. 6: If the first character is 'a' (for alternative) then exon features that have one end point the same are included. If it is not 'a' then only exons that are completely different are included. 7: 4 characters that determine the harshness of which entries to keep. The categories are: single letter name string in GenBank: p putative "putative" n notexperimental "not_experimental" g geneprediction "gene prediction" u unpublished "Unpublished" s pseudo "pseudo" The letters 'pngus' are on the parameter line. If a letter is capitilzed, then any entry with that string in it ANYWHERE will be killed. This is harsh but effective at removing GenBank crap. 8: If the first character is 'n' (for "notes") then if there is no /gene or /number for a feature, the program will use the /note feature. WARNING: Despite 15 years of complaining to GenBank, names in notes are NOT PARSABLE and may cause ill health. 9: If the first character is 'r', 'R' or 'm' (for "mRNA") then the mRNA feature is used instead of CDS. Otherwise CDS is used. (Introduced 2007 Dec 10.) If exonp is old (before having a parameter version) exon will attempt to upgrade it. db: a set of GenBank entries lengths: A list of the exon lengths found in db. dinst: Delila instructions for donor sites. ainst: Delila instructions for acceptor sites. einst: Delila instructions for exons. The acceptor from (theAfromrange) and donor to (theDtorange) are used to extend beyond the exon edges. exonfeatures: Locations of exons in the format for the Lister program. output: messages to the user description The program searches for 'CDS'. If the next word is 'join' it parses out the parts of the CDS, determining the lengths of the exons. If 'complement' is found, the complementary exons are identified. To ensure a clean data set, the program eliminates: * single exons in a locus (unreliable data for lengths) * exons which have one end not defined (< or > mark) * exons at the beginning or end of the CDS (unreliable data) * exons that are references to other entries. * duplicate exons within a single locus * exons that have any coordinates the same as other exons in the same entry. This (arbitrarily) eliminates alternative splice cases. To remove further junk from the database, entries that contain any of these phrases are skipped: 'not_experimental' 'gene prediction' 'Unpublished' GenBank contains many mRNA sequences masquerading as DNA. They can be identified by zero length introns. They are ruthlessly eliminated. If a CDS has a no /gene name in the feature table, it will be named like this: U00096.CDS.190-255 no /gene, no /number If a CDS has a /gene name in the feature table, it would be nice to name it like this: U00096.thrA (this name can fail) Unfortunately that alone will fail because all exons end up being named the same! So if there is a /number the name will include it: M95740.IDUA.exon-3 /gene, /number If there is a /gene but no /number the range will be given: M95740.IDUA.427-512 /gene, no /number So there are three options for names. * The exons are placed into ascending order. * The Delila name command is used to name the pieces. examples documentation @article{Stephens.Schneider.Splice, author = "R. M. Stephens and T. D. Schneider", title = "Features of spliceosome evolution and function inferred from an analysis of the information at human splice sites", journal = "J. Mol. Biol.", volume = "228", pages = "1124-1136", year = "1992"} see also dbinst.p author Thomas Dana Schneider bugs technical notes The program deals with alternative splicing by removing any exon that has any coordinate the same as another exon. The program only can accept a single type of organism to be put into the instruction files. It's not clear that one would ever want to mix organisms for this analysis!! The zero coordinate for splice junctions follows the convention of Stephens.Schneider.Splice: it is the base on the intron side of the splice junction. 2007 Dec 05. The exon program would compile and run with exonmax set to 15000. Unfortunately this is not enough for H.sapiens chromosome 1 (NC_000001 247249719 bp). The program compiles (gpc) but gives a 'Segmentation Fault'. The reason (thanks to David Bryant) is that the stack size in Unix is restricted. The Unix command 'limit' gives 'stacksize 8192 kbytes'. There are at least 3 solutions. 1. The Unix stack size can be increased by the command: limit stacksize 65538 Doing so solved the problem. Although this works, it requires setting the operating system so it is not too portable. 2. The exonmax determines the number of exonrecords: fealist: array[1..exonmax] of exonrecord; The exonrecords use the standard 'string' for the gene namewhich has an array of characters whose size is determined by constant maxstring for which the default is 150. Setting maxstring to 20 solves the problem. Although this would work, perhaps it is best to allow long names. 2. Put the array into the program heap, which is unlimited, instead of the stack, which has the current limit. This requires a program change. I implemented it by making the fealist be a pointer to an array: 'fealist^.a' replaced 'fealist' through the code. This worked. Thanks to David Bryant for pointing out the situation and explaining the possible solution of putting the data on the heap. */ /* end module describe.exon */ /* Documentation for Feature table format definition: http://www.ncbi.nlm.nih.gov/collab/FT/index.html http://www.ncbi.nlm.nih.gov/collab/FT/index.html The DDBJ/EMBL/GenBank Feature Table Definition Version 2.0, Dec 15 1997. | 4.3 Data item positions | | The position of the data items within the feature descriptor line is as follows: | | column position data item | --------------- --------- | | 1-5 blank | 6-20 feature key | 21 blank | 22-80 location | | | Data on the qualifier and continuation lines begins in column position 22 | (the first 21 columns contain blanks). The EMBL format for all lines differs | from that described above in that it includes a line type abbreviation in | columns 1 and 2. The following situation appears in U00096: CDS complement(20233..20508) /gene="insA_1" /note="f91; 100 pct identical to ISA1_ECOLI SW: P03827" /codon_start=1 /label=b0022 /product="insertion element IS1 protein InsA" /db_xref="PID:g1786204" /transl_table=11 /translation="MASVSISCPSCSATDGVVRNGKSTAGHQRYLCSHCRKTWQLQFT YTASQPGTHQKIIDMAMNGVGCRATARIMGVGLNTILRHLKNSGRSR" gene complement(20815..21078) /gene="rpsT" CDS complement(20815..21078) /gene="rpsT" /note="f87; 100 pct identical RS20_ECOLI SW: P02378 but includes initiator met; TTG start" /codon_start=1 /label=b0023 /product="30S ribosomal protein S20" /db_xref="PID:g1786206" /transl_table=11 /translation="MANIKSAKKRAIQSEKARKHNASRRSMMRTFIKKVYAAIEAGDK AAAQKAFNEMQPIVDRQAAKGLIHKNKAARHKANLTAQINKLA" To avoid the /gene from being assigned to insA_1, only the first occurance is used. This is done with the ignoreslashgene boolean. 2000 May 4 REMOVE THIS MECHANISM: it is easier to reset the /gene finding mechanisms at each feature using the columnposition = 6 trigger. */ /* begin module interact.const */ /* begin module string.const */ #define maxstring 2000 /* the maximum string */ /* end module string.const version = 4.86; (@ of prgmod.p 2004 Sep 8 */ /* end module interact.const version = 7.72; {of delmod.p 2007 Jul 23} */ /* begin module my.filler.const */ #define fillermax 20 /* the size of the filler array for a string */ /* end module my.filler.const */ /* from filler.const version = 4.13; (@ of prgmod.p 1994 sep 5 */ /* begin module interact.type */ /* begin module string.type */ /* pointer to a string */ typedef struct string { /* a string of characters */ Char letters[maxstring]; /* the letters in the string */ long length; /* the number of characters in the string */ long current; /* the letter we are working on */ Char *next; /* the next string in a series */ } string; /* end module string.type version = 4.86; (@ of prgmod.p 2004 Sep 8 */ /* end module interact.type version = 7.72; {of delmod.p 2007 Jul 23} */ /* begin module filler.type */ /* the following is an array used to fill a string. it is convenient to have it much shorter than the maxstring, so that it is easy to fill the string using procedure fillstring. the user must declare the value of constant fillermax. */ typedef Char filler[fillermax]; /* end module filler.type version = 4.39; (@ of prgmod.p 1999 November 28 */ /* begin module trigger.type */ typedef struct trigger { /* an object to be searched for */ string seek; /* the characters looked for */ long state; /* how close to triggering we are */ boolean skip; /* trigger not found- skip the line */ /* the trigger was found */ boolean found; } trigger; /* end module trigger.type version = 4.39; (@ of prgmod.p 1999 November 28 */ /* a type for the quicksort module */ typedef short position; Static _TEXT exonp, db, lengths, dinst, ainst, einst, exonfeatures; /* files used by this program */ /* zzz global for now, maybe forever! */ Static long columnposition; /* column position in entry */ Static jmp_buf _JL1; /* begin module halt */ Static Void halt() { /* stop the program. the procedure performs a goto to the end of the program. you must have a label: label 1; declared, and also the end of the program must have this label: 1: end. examples are in the module libraries. this is the only goto in the delila system. */ printf(" program halt.\n"); longjmp(_JL1, 1); } /* end module halt version = 7.72; {of delmod.p 2007 Jul 23} */ /* begin module interact.clearstring */ /* begin module clearstring */ Static Void clearstring(ribbon) string *ribbon; { /* empty the string */ long index; /* to the ribbon */ for (index = 0; index < maxstring; index++) ribbon->letters[index] = ' '; ribbon->length = 0; ribbon->current = 0; } /* clearstring */ Static Void initializestring(ribbon) string *ribbon; { /* start the string with a nil pointer. This routine should be called before doing linked list work. This allows the standard string routines to clear the string without killing the pointer. */ clearstring(ribbon); ribbon->next = NULL; } /* initializestring */ /* end module clearstring version = 4.86; (@ of prgmod.p 2004 Sep 8 */ /* end module interact.clearstring version = 7.72; {of delmod.p 2007 Jul 23} */ /* begin module equalstring */ Static boolean equalstring(a, b) string a, b; { /* Test for equality between two strings at current positions. NOTE: A compiler bug results if one directly tests this way: if thedefinition^.nametag = aname The reason is completely not clear! I showed that the parts of the strings were identical, but the whole was not by this test. For this reason it is *critical to test strings with equalstring. */ long index; /* index to both strings */ boolean equal; /* are letters in a and b the same? */ if (a.length == b.length) { index = 1; do { equal = (a.letters[index-1] == b.letters[index-1]); index++; } while (equal && index <= a.length); return equal; } else return false; } /* equalstring */ /* end module equalstring version = 4.39; (@ of prgmod.p 1999 November 28 */ /* begin module interact.writestring */ /* begin module writestring */ Static Void writestring(tofile, s) _TEXT *tofile; string *s; { /* write the string s to file tofile, no writeln */ long i; /* index to s */ long FORLIM; FORLIM = s->length; for (i = 0; i < FORLIM; i++) putc(s->letters[i], tofile->f); } /* writestring */ /* end module writestring version = 4.39; (@ of prgmod.p 1999 November 28 */ /* end module interact.writestring version = 4.39; (@ of prgmod.p 1999 November 28 */ /* begin module filler.fillstring */ Static Void fillstring(s, a) string *s; Char *a; { /* this procedure makes it reasonably easy to fill the string s with characters. one calls the procedure as: */ /* 1 2 3 4 5 */ /* 12345678901234567890123456789012345678901234567890 */ /* fillstring(s, 'this-is-the-string '); the two comments make it easy to line the characters up. also, for this example, it was assumed that the length of filler as defined by the constant fillermax was 50. */ long length = fillermax; /* of the string without trailing blanks */ long index; /* of s */ clearstring(s); while (length > 1 && a[length-1] == ' ') length--; if (length == 1 && a[length-1] == ' ') { printf("fillstring: the string is empty\n"); halt(); } for (index = 0; index < length; index++) s->letters[index] = a[index]; s->length = length; s->current = 1; } /* fillstring */ /* end module filler.fillstring version = 7.72; {of delmod.p 2007 Jul 23} */ /* begin module filler.filltrigger */ Static Void filltrigger(t, a) trigger *t; Char *a; { /* fill the trigger t */ fillstring(&t->seek, a); } /* fillstring */ /* end module filler.filltrigger version = 7.72; {of delmod.p 2007 Jul 23} */ /* begin module trigger.proc */ /* this module allows one to scan a series of characters, as from an array or a file, and to "trigger" or detect a simple string in the series. the advantage of the trigger is that several triggers can "observe" a stream of characters at once, each looking for a different thing. some other modules required: interact.const, interact.type */ Static Void resettrigger(t) trigger *t; { /* reset the trigger to ground state */ t->state = 0; t->skip = false; t->found = false; } /* resettrigger */ Static Void testfortrigger(ch, t) Char ch; trigger *t; { /* look at the character ch. if it is part of the trigger (at the current trigger state), then the trigger state goes higher. if it is not part of the trigger then the trigger state is reset, skip is true and one should skip onward to find the trigger. if the trigger is found, found is true. 1996 Sep 12: Bug found! In the case of a trigger "ab", the program used to miss it for situations like "aab". This was because at the first a it would step up. Then it would see the second a and recognize that was not part of ab. It would fail to realize that it could be the start of a new one. The code now accounts for that possibility. */ t->state++; /* writestring(list,seek); writeln(list,'testfortrigger seek.letters[',state:1,']:', seek.letters[state],' ch:',ch); */ if (t->seek.letters[t->state - 1] == ch) { t->skip = false; if (t->state == t->seek.length) t->found = true; else t->found = false; return; } /* it failed. But wait! It could be the beginning of a NEW trigger string! */ if (t->seek.letters[0] == ch) { t->state = 1; t->skip = false; t->found = false; return; } t->state = 0; t->skip = true; t->found = false; /* reset trigger */ } /* testfortrigger */ /* end module trigger.proc version = 7.72; {of delmod.p 2007 Jul 23} */ /* begin module copyaline */ Static Void copyaline(fin, fout) _TEXT *fin, *fout; { /* copy a line from file fin to file fout */ while (!P_eoln(fin->f)) { putc(P_peek(fin->f), fout->f); getc(fin->f); } fscanf(fin->f, "%*[^\n]"); getc(fin->f); putc('\n', fout->f); } /* copyaline */ /* end module copyaline version = 7.72; {of delmod.p 2007 Jul 23} */ /* begin module copyline */ Static Void copyline(fin, fout) _TEXT *fin, *fout; { /* copy a line from file fin to file fout but DO NOT CARRIAGE RETURN on the fout. Carriage return on the fin. */ while (!P_eoln(fin->f)) { putc(P_peek(fin->f), fout->f); getc(fin->f); } fscanf(fin->f, "%*[^\n]"); getc(fin->f); } /* copyline */ /* end module copyline version = 4.39; (@ of prgmod.p 1999 November 28 */ /* begin module copylines */ Static long copylines(fin, fout, n) _TEXT *fin, *fout; long n; { /* copy n lines of file fin to file fout. the actual number of lines copied is returned. */ long index = 0; /* the current line number */ while (!BUFEOF(fin->f) && index < n) { copyaline(fin, fout); index++; } return index; } /* copylines */ /* end module copylines version = 7.72; {of delmod.p 2007 Jul 23} */ /* ZAP zzz begin module skipblanks */ Static Void skipblanks(thefile) _TEXT *thefile; { /* skip over blanks until a non-blank, or end of line, is found */ while ((P_peek(thefile->f) == ' ') & (!P_eoln(thefile->f))) { /* modify skipblanks so that columns are tracked properly */ columnposition++; getc(thefile->f); } } Static Void skipnonblanks(thefile) _TEXT *thefile; { /* skip over nonblanks until a blank, or end of line, is found */ while ((P_peek(thefile->f) != ' ') & (!P_eoln(thefile->f))) getc(thefile->f); } /* procedure skipcolumn(var thefile: text); (* skip over a data column *) begin skipblanks(thefile); skipnonblanks(thefile) end; */ /* ZAP zzz end module skipblanks version = 4.16; (@ of prgmod.p 1996 August 12 */ /* begin module grabtoken */ Static Void grabtoken(thefile, thestring) _TEXT *thefile; string *thestring; { /* skip any blanks and then grab the next token from the file */ Char c; /* a character in thefile */ boolean done = false; /* done finding the name */ skipblanks(thefile); thestring->length = 0; while (!done) { if (P_eoln(thefile->f)) { done = true; break; } c = getc(thefile->f); if (c == '\n') c = ' '; if (c == ' ') done = true; else { thestring->length++; thestring->letters[thestring->length - 1] = c; } } } /* end module grabtoken version = 4.39; (@ of prgmod.p 1999 November 28 */ /* begin module grabquote */ Static Void grabquote(thefile, thestring) _TEXT *thefile; string *thestring; { /* skip any blanks and then grab up to the next quote from the file */ Char c; /* a character in thefile */ boolean done = false; /* done finding the name */ skipblanks(thefile); thestring->length = 0; while (!done) { if (P_eoln(thefile->f)) { done = true; break; } c = getc(thefile->f); if (c == '\n') c = ' '; if (c == '"') done = true; else { thestring->length++; thestring->letters[thestring->length - 1] = c; } } } #define exonmax 30000 /* largest number of exons the program can handle */ #define colwid 7 /* width of columns, not including separater space */ typedef struct exonrecord { /* a record of an exon */ long thestart; /* the start */ long thestop; /* the stop */ long FEA; /* which cds or mRNA of the locus it is in */ boolean complement; /* whether it is on the complementary strand */ boolean getthestart; /* whether to get the start */ boolean getthestop; /* whether to get the stop */ boolean printasexon; /* if true, the start or stop represent an exon. otherwise it must be an intron. */ boolean genenamefound; /* if there is a /gene for the CDS, record it */ string genenamestring; /* the gene name */ boolean genenumberfound; /* if there is a /number for the CDS, record it */ string genenumberstring; /* the gene number */ string notestring; /* a string containing the last note */ long notenumber; /* count of notes since last exon */ long featurenumber; /* the number of the feature in the entry where this item was found. This allows notes to be attached only to the right feature */ } exonrecord; /* exonrecordptr is a pointer to an exonset, which is a holder for an exonrecord that puts the record on the essentially infinite heap instead of the limited stack. */ typedef struct exonset { /* holder of the array of exonrecords */ exonrecord a[exonmax]; } exonset; /* pointer to the holder of exon records */ /*qqq*/ /* Local variables for themain: */ struct LOC_themain { _TEXT *exonp, *db, *lengths, *dinst, *ainst, *einst, *exonfeatures; string accname; /* the current accession name */ boolean allowalternative; /* allow alternate splicing */ boolean atcomplement; /* we are at a complement */ Char c; /* a character in db */ Char cprevious; /* the previous value of c. This is no longer used, but leave the mechanism for now. */ trigger FEA; /* trigger to find the CDS or mRNA pattern */ long FEAcount; /* count of cds or mRNA in current entry */ trigger closeparen; /* trigger to find the end of the join pattern */ trigger complement; /* trigger to find the end of the complement pattern */ boolean completeunit; /* true when a unit is complete */ boolean debug; /* true when debugging the program */ boolean getexons; /* true if we are to get exon features */ boolean getintrons; /* true if we are to get intron features */ long dotcount; /* count number of dots in a unit of a CDS */ /* (* Original definition before 2007 Dec 05: *) fealist: array[1..exonmax] of exonrecord; (* features in a locus *) */ exonset *fealist; /* features in a locus */ /*qqq*/ long feastored; /* current number of features recorded into fealist */ long featarget; /* earlier target for any gene names or numbers */ long featurecount; /* count of features in the entry so far */ boolean firstnumber; /* true when the first number is about to be read */ trigger genename; /* gene names for CDS of the form /gene */ trigger genenumber; /* gene number for CDS of the form /number */ _TEXT holdfile; /* internal file for rebuilding the exonp */ boolean infeatures; /* true if inside feature secion of an entry */ boolean good; /* the end point is good unless it is found to be outside the sequence */ long hold; /* for reversing start and stop when atcomplement is true */ trigger join; /* trigger to find the join pattern */ boolean killentry; /* kill entries that contain bad phrases */ boolean killputative; /* kill putative entries */ boolean killnotexperimental; /* kill not_experimental entries */ boolean killgeneprediction; /* kill gene prediction entries */ boolean killunpublished; /* kill Unpublished entries */ boolean killpseudo; /* kill pseudo containing entries */ long length; /* length of an exon */ long linenumber; /* count of the lines in db */ long locuscount; /* count of locus */ boolean noends; /* true when printing ends of the CDS */ long numbers[3]; trigger organism; /* trigger to find organism key word */ double parameterversion; /* version of the program */ /*vvv*/ boolean printon; /* if true, print the data passing by */ long relorient; /* relative orientation of site to piece */ long start; /* the start base of an exon */ long stop; /* the stop base of an exon */ long theAfromrange, theAtorange; /* from/to range: acceptor delila inst */ long theDfromrange, theDtorange; /* from/to range: donor delila inst */ Char thefeature; /* the kind of feature to process, CDS or mRNA */ long totalfeastored; /* count of exons in entire db */ long totalentrycount; /* count of entrys in entire db */ long unitcount; /* true when the end of a unit has been found */ boolean unitend; /* true when the end of a unit has been found */ string unknownstring; /* string for unknown genes */ boolean usenotes; /* use notes to name exons */ } ; Local Void grabspecies(db, f, LINK) _TEXT *db, *f; struct LOC_themain *LINK; { /* get the species name from db into f. */ Char c; /* a character read from db */ /* writeln(output,'grabspecies'); */ resettrigger(&LINK->organism); if (*db->name != '\0') { if (db->f != NULL) db->f = freopen(db->name, "r", db->f); else db->f = fopen(db->name, "r"); } else rewind(db->f); if (db->f == NULL) _EscIO2(FileNotFound, db->name); RESETBUF(db->f, Char); while (!BUFEOF(db->f) && !LINK->organism.found) { if (P_eoln(db->f)) { fscanf(db->f, "%*[^\n]"); getc(db->f); continue; } c = getc(db->f); if (c == '\n') c = ' '; testfortrigger(c, &LINK->organism); if (!LINK->organism.found) continue; skipblanks(db); c = getc(db->f); if (c == '\n') c = ' '; fprintf(f->f, "%c.", c); skipnonblanks(db); skipblanks(db); /* 2006 Oct 25: copyline fails for organisms with a strain name: ORGANISM Escherichia coli K12 because it retains the space and Delila can't handle it. copyline(db,f); */ /* replace spaces with dashes: */ while (!P_eoln(db->f)) { if (P_peek(db->f) != ' ') CPUTFBUF(f->f, P_peek(db->f)); else CPUTFBUF(f->f, '-'); CPUT(f->f); getc(db->f); } } if (!LINK->organism.found) { printf("ORGANISM line missing in db\n"); halt(); } } Local Void startinst(f, db, sitetype, LINK) _TEXT *f, *db; Char sitetype; struct LOC_themain *LINK; { /* Start Delila instructions in file f. If sitetype = 'a' then label it 'acceptor'. If sitetype = 'd' then label it 'donor'. Use the db to get the organism/chromosome names */ if (*f->name != '\0') { if (f->f != NULL) f->f = freopen(f->name, "w", f->f); else f->f = fopen(f->name, "w"); } else { if (f->f != NULL) rewind(f->f); else f->f = tmpfile(); } if (f->f == NULL) _EscIO2(FileNotFound, f->name); SETUPBUF(f->f, Char); fprintf(f->f, "title \""); if (sitetype == 'a') fprintf(f->f, "acceptor"); if (sitetype == 'd') fprintf(f->f, "donor"); if (sitetype == 'e') fprintf(f->f, "exon"); fprintf(f->f, "\";\n"); fprintf(f->f, "(* exon %4.2f *)\n", version); fprintf(f->f, "default out-of-range reduce-range;\n"); fprintf(f->f, "default numbering piece;\n"); fprintf(f->f, "organism "); grabspecies(db, f, LINK); fprintf(f->f, ";\n"); fprintf(f->f, "chromosome "); grabspecies(db, f, LINK); fprintf(f->f, ";\n\n"); } Local Void nextline(LINK) struct LOC_themain *LINK; { /* move to the next line */ if (!P_eoln(LINK->db->f)) return; fscanf(LINK->db->f, "%*[^\n]"); getc(LINK->db->f); /*writeln(output); */ LINK->linenumber++; if (LINK->debug) { if (LINK->printon) putchar('\n'); } columnposition = 0; } Local Void nextcharacter(f, cprevious, c, LINK) _TEXT *f; Char *cprevious, *c; struct LOC_themain *LINK; { /* read the next character c from file f, keep track of the previous character. */ *cprevious = *c; *c = getc(f->f); if (*c == '\n') *c = ' '; columnposition++; if (LINK->debug) { if (LINK->printon) putchar(*c); } /* write(output,c); if eoln(db) then begin writeln(output); write(output,'| '); end; */ } Local Void readaninteger(f, first, n, LINK) _TEXT *f; Char first; long *n; struct LOC_themain *LINK; { /* read an integer from f, protecting ourselves from stupidity. first is the first character of the number */ /*write(output,'readaninteger: ');*/ *n = first - '0'; if (!P_inset(first, LINK->numbers)) { printf("readaninteger: bad number!\n"); printf("first is: %c\n", first); halt(); } while (P_inset(P_peek(f->f), LINK->numbers)) { nextcharacter(f, &LINK->cprevious, &LINK->c, LINK); *n = *n * 10 + LINK->c - '0'; } /*writeln(output,' becomes ',n:1);*/ } Local Void sign(f, x, LINK) _TEXT *f; long x; struct LOC_themain *LINK; { /* write the sign of x */ if (x < 0) putc('-', f->f); else putc('+', f->f); } Local Void thename(f, e, LINK) _TEXT *f; long e; struct LOC_themain *LINK; { /* write the exon name e to file f. There are three formats, depending on whether the /gene and /number have been defined: U00096.CDS.11356-10643 - - M95740.IDUA.exon-3 /gene /number M95740.IDUA.427-512 /gene - The accession number or coordinate is important to keep so that features are identified properly in lister. Presumably gene names are unique in an entry, though this may not be true. It turns out that using the gene name alone will fail since exons will all have the same name! This is why the number is needed. */ long start, stop; /* the range of the feature */ /* if it is an intron, then the start and stops are recorded the other way to make donor and acceptor identification easy, but with this new naming feature we have to invert it again... */ /* (* original way of accessing the fealist before 2005 Dec 05 *) if fealist[e].printasexon then begin qqq */ if (LINK->fealist->a[e-1].printasexon) { start = LINK->fealist->a[e-1].thestart; stop = LINK->fealist->a[e-1].thestop; } else { start = LINK->fealist->a[e-1].thestop; stop = LINK->fealist->a[e-1].thestart; } writestring(f, &LINK->accname); if (LINK->fealist->a[e-1].genenamefound) { putc('.', f->f); writestring(f, &LINK->fealist->a[e-1].genenamestring); } if (LINK->fealist->a[e-1].genenumberfound) { fprintf(f->f, ".#"); writestring(f, &LINK->fealist->a[e-1].genenumberstring); } if (!LINK->fealist->a[e-1].genenamefound && !LINK->fealist->a[e-1].genenumberfound) { /* write(f, '.CDS:'); */ if (LINK->thefeature == 'r') fprintf(f->f, ".mRNA:"); else fprintf(f->f, ".CDS:"); fprintf(f->f, "%ld-%ld", start, stop); } if (equalstring(LINK->fealist->a[e-1].genenamestring, LINK->unknownstring)) fprintf(f->f, ".%ld-%ld", start, stop); /* give the note if there was one */ if (!LINK->usenotes) return; if (LINK->fealist->a[e-1].notenumber > 0) { putc('.', f->f); writestring(f, &LINK->fealist->a[e-1].notestring); /* write(f, 'e=',e:1); write(output, 'thename: e=',e:1,' '); writestring(output, fealist^.a[e].notestring); write(output,' notenumber = ',fealist^.a[e].notenumber:1); writeln(output); */ /*zzznote*/ } } /* thename */ Local Void writeDelilainstructions(inst, location, thefromrange, thetorange, e, LINK) _TEXT *inst; long location, thefromrange, thetorange, e; struct LOC_themain *LINK; { /* write the e'th exon entry at the given location to the inst file */ fprintf(inst->f, "piece "); writestring(inst, &LINK->accname); putc(';', inst->f); fprintf(inst->f, " name \""); thename(inst, e, LINK); fprintf(inst->f, "\";"); fprintf(inst->f, " get from %ld", location); putc(' ', inst->f); sign(inst, thefromrange * LINK->relorient, LINK); if (-LINK->relorient > 0) fprintf(inst->f, "%ld", labs(thefromrange)); else fprintf(inst->f, "%ld", labs(thefromrange)); fprintf(inst->f, " to same "); sign(inst, thetorange * LINK->relorient, LINK); if (LINK->relorient > 0) fprintf(inst->f, "%ld", labs(thetorange)); else fprintf(inst->f, "%ld", labs(thetorange)); fprintf(inst->f, " direction "); sign(inst, LINK->relorient, LINK); fprintf(inst->f, ";\n"); } /******************************************************************************/ /* Sorting routines */ Local boolean lessthan(a, b, LINK) position a, b; struct LOC_themain *LINK; { /* see quicksort */ /* This causes a segmentation error for unknown reasons: case fealist^.a[a].complement of true: case fealist^.a[b].complement of true: lessthan := fealist^.a[a].thestop < fealist^.a[b].thestart; false: lessthan := fealist^.a[a].thestop < fealist^.a[b].thestop; end; false: case fealist^.a[b].complement of true: lessthan := fealist^.a[a].thestart < fealist^.a[b].thestop; false: lessthan := fealist^.a[a].thestart < fealist^.a[b].thestart; end; end; */ return (LINK->fealist->a[a-1].thestart < LINK->fealist->a[b-1].thestart); } Local Void swap_(a, b, LINK) position a, b; struct LOC_themain *LINK; { /* see quicksort */ exonrecord hold; hold = LINK->fealist->a[a-1]; LINK->fealist->a[a-1] = LINK->fealist->a[b-1]; LINK->fealist->a[b-1] = hold; /*;write(output,'a=',a:1,', b=',b:1); print (@ for testing */ } /* begin module quicksort */ Local Void quicksort(left, right, LINK) position left, right; struct LOC_themain *LINK; { /* quick sort a list between positions left and right, into ascending order. a position is simply a scalar of the form 0..max. the array to be sorted is dimensioned 1..max. (the difference in the ranges is important to the correct operation of the sort...) two external routines are used: function lessthan(a, b: position): boolean is a generalized test for value-at-a < value-at-b. procedure swap(a, b: position) switches the items at positions a and b. since these routines are external, the procedure is general. this procedure taken from the book 'algorithms + data structures = programs' by niklaus wirth, prentice-hall, inc., englewood cliffs, n.j.(1976), pp. 76-82 */ position lower = left; position upper; /* the positions looked at currently */ position center; /* the rough center of the region being sorted */ center = (left + right) / 2; upper = right; do { while (lessthan(lower, center, LINK)) lower++; while (lessthan(center, upper, LINK)) upper--; if (lower <= upper) { /* keep track of the center through the map: */ if (lower == center) center = upper; else if (upper == center) center = lower; swap_(lower, upper, LINK); lower++; upper--; } } while (lower <= upper); if (left < upper) quicksort(left, upper, LINK); if (lower < right) quicksort(lower, right, LINK); } /* end module quicksort taken from progmod.p */ Local Void sortexons(LINK) struct LOC_themain *LINK; { /* sort the exons in the fealist^.a by their first coordinate */ quicksort(1, (int)LINK->feastored, LINK); } /******************************************************************************/ Local Void printexons(LINK) struct LOC_themain *LINK; { /* Print the exons listed in the fealist^.a, then clear the list. */ long currentexons; /* number of exons printed right now */ boolean duplicate; /* check for duplicate site in previous feature */ long current; /* current base location for instruction */ long previous; /* previous base location for instruction */ long e; /* index for fealist^.a */ long firstexon = 1; /* the first exon to print */ long lastexon; /* the last exon to print */ exonrecord *WITH; if (LINK->debug) { if (P_eoln(LINK->db->f)) putchar('\n'); } if (LINK->debug) putchar('\n'); if (LINK->debug) printf("****************************"); if (LINK->debug) printf("---> %4ld %4ld %4ld %s\n", LINK->feastored, LINK->start, LINK->stop, LINK->killentry ? " TRUE" : "FALSE"); lastexon = LINK->feastored; currentexons = lastexon - firstexon + 1; if (currentexons > 0 && !LINK->killentry) { sortexons(LINK); /* only sort if there are exons to sort ... */ LINK->totalfeastored += currentexons; LINK->totalentrycount++; for (e = firstexon; e <= lastexon; e++) { /* writeln(output); writeln(output,'PRINTEXONS e = ',e:1,'=========================================='); */ LINK->start = LINK->fealist->a[e-1].thestart; LINK->stop = LINK->fealist->a[e-1].thestop; if (LINK->fealist->a[e-1].printasexon) { if (LINK->fealist->a[e-1].complement) LINK->length = LINK->start - LINK->stop + 1; else LINK->length = LINK->stop - LINK->start + 1; if (LINK->length <= 0) { fprintf(LINK->lengths->f, " ERROR: length < 0 for this entry:\n"); printf(" \007ERROR: see lengths file\n"); } /* write lengths file here */ fprintf(LINK->lengths->f, " %*ld %*ld %*ld", colwid, LINK->length, colwid, LINK->start, colwid, LINK->stop); if (LINK->fealist->a[e-1].complement) fprintf(LINK->lengths->f, " %*c", colwid, '-'); else fprintf(LINK->lengths->f, " %*c", colwid, '+'); fprintf(LINK->lengths->f, " %*ld %*ld %*ld", colwid, e, colwid, LINK->fealist->a[e-1].FEA, colwid, LINK->locuscount); putc(' ', LINK->lengths->f); writestring(LINK->lengths, &LINK->accname); putc('\n', LINK->lengths->f); } if (LINK->fealist->a[e-1].complement) LINK->relorient = -1; else LINK->relorient = 1; /* write Delila instructions here */ /* For intron ends, the location of the splice junction zero coordinate is that given in the feature table. For exons it is one off, depending on the relative orientation, relorient. */ WITH = &LINK->fealist->a[e-1]; if (WITH->getthestart) { /* look for duplicates immediately preceeding */ /* writeln(output,'========================================<<<<<<<<<<<<<<<<<<<<'); */ if (e > firstexon) { previous = LINK->fealist->a[e-2].thestart; if (LINK->fealist->a[e-2].printasexon) { if (LINK->fealist->a[e-1].complement) previous--; /* relorient */ else previous++; } if (LINK->fealist->a[e-1].printasexon) current = WITH->thestart - LINK->relorient; else current = WITH->thestart; duplicate = (current == previous && LINK->fealist->a[e-2].getthestart); /* writeln(output,'previous=',previous:1); writeln(output,'current=',current:1); */ } else duplicate = false; /* writeln(output,'fealist^.a[e].thestart = ', fealist^.a[e].thestart:1); writeln(output,'fealist^.a[e-1].thestart = ', fealist^.a[e-1].thestart:1); */ /* writeln(output,'========================================<<<<<<<<<<<<<<<<<<<<'); */ if (!duplicate) { /*zzzggg*/ /* write(ainst,'(* e=',e:1,' "'); thename(ainst,e); writeln(ainst,'" *)'); */ if (LINK->fealist->a[e-1].printasexon) writeDelilainstructions(LINK->ainst, WITH->thestart - LINK->relorient, LINK->theAfromrange, LINK->theAtorange, e, LINK); else writeDelilainstructions(LINK->ainst, WITH->thestart, LINK->theAfromrange, LINK->theAtorange, e, LINK); } /*zzzname*/ /* writeln(output); writeln(output,'about to writeDelilainstructions: ainst (e=',e:1,')'); */ } else { if (LINK->fealist->a[e-1].printasexon) printf("%ld exon start was dropped from ainst: end of CDS or sequence\n", WITH->thestart); else printf("%ld intron stop was dropped from ainst: end of sequence\n", WITH->thestart); } if (WITH->getthestop) { /* look for duplicates immediately preceeding */ /* writeln(output,'========================================>>>>>>>>>>>>>>>>>>>>'); */ if (e > firstexon) { previous = LINK->fealist->a[e-2].thestop; if (LINK->fealist->a[e-2].printasexon) { if (LINK->fealist->a[e-1].complement) previous--; /* relorient */ else previous++; } if (LINK->fealist->a[e-1].printasexon) current = WITH->thestop - LINK->relorient; else current = WITH->thestop; duplicate = (current == previous && LINK->fealist->a[e-2].getthestop); /* writeln(output,'previous=',previous:1); writeln(output,'current=',current:1); */ } else duplicate = false; /* writeln(output,'fealist^.a[e].thestop = ', fealist^.a[e].thestop:1); writeln(output,'fealist^.a[e-1].thestop = ', fealist^.a[e-1].thestop:1); */ /* writeln(output,'========================================>>>>>>>>>>>>>>>>>>>>'); */ if (!duplicate) { /*zzzggg*/ /* write(dinst,'(* e=',e:1,' "'); thename(dinst,e); writeln(dinst,'" *)'); */ if (LINK->fealist->a[e-1].printasexon) writeDelilainstructions(LINK->dinst, WITH->thestop + LINK->relorient, LINK->theDfromrange, LINK->theDtorange, e, LINK); else writeDelilainstructions(LINK->dinst, WITH->thestop, LINK->theDfromrange, LINK->theDtorange, e, LINK); } /*zzzname*/ /* writeln(output); writeln(output,'about to writeDelilainstructions: dinst (e=',e:1,')'); */ } else { if (LINK->fealist->a[e-1].printasexon) printf("%ld exon stop was dropped from dinst: end of CDS or sequence\n", WITH->thestop); else printf("%ld intron start was dropped from dinst: end of sequence\n", WITH->thestop); } if (LINK->fealist->a[e-1].printasexon) { /* write einst: exon instructions */ WITH = &LINK->fealist->a[e-1]; /* write exonfeatures file here with this format: define "J02833exon467-615" "-" "<]" "[>" 0 148 */ fprintf(LINK->einst->f, "piece "); writestring(LINK->einst, &LINK->accname); fprintf(LINK->einst->f, "; "); if (LINK->relorient == 1) { fprintf(LINK->einst->f, "name \""); thename(LINK->einst, e, LINK); fprintf(LINK->einst->f, "\"; "); fprintf(LINK->einst->f, "get from "); fprintf(LINK->einst->f, "%ld", WITH->thestart); putc(' ', LINK->einst->f); sign(LINK->einst, LINK->theAfromrange, LINK); fprintf(LINK->einst->f, "%ld", labs(LINK->theAfromrange)); fprintf(LINK->einst->f, " to "); fprintf(LINK->einst->f, "%ld", WITH->thestop); putc(' ', LINK->einst->f); sign(LINK->einst, LINK->theDtorange, LINK); fprintf(LINK->einst->f, "%ld", labs(LINK->theDtorange)); fprintf(LINK->einst->f, " direction +;\n"); } /*zzzname*/ else { fprintf(LINK->einst->f, "name \""); thename(LINK->einst, e, LINK); fprintf(LINK->einst->f, "\"; "); fprintf(LINK->einst->f, "get from "); fprintf(LINK->einst->f, "%ld", WITH->thestop); putc(' ', LINK->einst->f); sign(LINK->einst, -LINK->theDtorange, LINK); fprintf(LINK->einst->f, "%ld", labs(LINK->theDtorange)); fprintf(LINK->einst->f, " to "); fprintf(LINK->einst->f, "%ld", WITH->thestart); putc(' ', LINK->einst->f); sign(LINK->einst, -LINK->theAfromrange, LINK); fprintf(LINK->einst->f, "%ld", labs(LINK->theAfromrange)); fprintf(LINK->einst->f, " direction +;\n"); } /*zzzname*/ fprintf(LINK->exonfeatures->f, "define \""); thename(LINK->exonfeatures, e, LINK); fprintf(LINK->exonfeatures->f, "\" \"-\" \"<]\" \"[>\" 0 %ld\n", LINK->length - 1); /* call it immediately; @ J02833 467 +1 "J02833exon467-615" "" */ /*vvv*/ /* write(exonfeatures, '@ BUBAvvv '); */ fprintf(LINK->exonfeatures->f, "@ "); writestring(LINK->exonfeatures, &LINK->accname); fprintf(LINK->exonfeatures->f, " %ld %ld \"", LINK->start, LINK->relorient); thename(LINK->exonfeatures, e, LINK); fprintf(LINK->exonfeatures->f, "\" \"\"\n"); } } } LINK->feastored = 0; /* zap the entries whether or not they were printed */ } /* printexons */ Local Void addfeature(featureisexon, getstart, getstop, LINK) boolean featureisexon, getstart, getstop; struct LOC_themain *LINK; { /* see if the feature already exists in the list for this locus. If not, add it to the list. Mark whether to get the start and stops. This allows ends of CDS to be removed. When featureisexon is false then the feature is not a cds or exon (ie it's an intron), and we should not mark it as an exon. Only instructions for the ends are given. */ long e = 1; /* index for fealist */ long previous; /* the original feature previously stored */ boolean good = true; /* was the feature unique? */ /*writeln(output,'addfeature');*/ if (LINK->debug) putchar('\n'); if (LINK->debug) printf("addfeature\n"); if (LINK->debug) printf(" good unit: %ld..%ld\n", LINK->start, LINK->stop); if (LINK->debug) { printf(" feastored: %ld\n", LINK->feastored); /*if debug then writeln(output, 'checking for duplicate exons **********');*/ } if (featureisexon) printf(" CDS/exon %ld-%ld", LINK->start, LINK->stop); else printf(" intron %ld-%ld", LINK->start, LINK->stop); /*zzznote*/ while (e <= LINK->feastored) { if (LINK->debug) printf(" exon %4ld %ld..%ld", e, LINK->fealist->a[e-1].thestart, LINK->fealist->a[e-1].thestop); /* if either the start OR the stop is the same as any previous one, toss it out. This not only eliminates duplicates but also drops alternative splicing. In 4167 exons only 8 were dropped for this reason. */ if ((LINK->allowalternative && LINK->fealist->a[e-1].thestart == LINK->start && LINK->fealist->a[e-1].thestop == LINK->stop) || (!LINK->allowalternative && (LINK->fealist->a[e-1].thestart == LINK->start || LINK->fealist->a[e-1].thestop == LINK->stop))) { printf(" was dropped because it is "); if (LINK->fealist->a[e-1].thestart == LINK->start && LINK->fealist->a[e-1].thestop == LINK->stop) printf("duplicate"); else printf("alternative"); /* writeln(output); writeln(output,'fealist^.a[e].thestart = ', fealist^.a[e].thestart); writeln(output,' start = ', start); writeln(output,' fealist^.a[e].thestop = ', fealist^.a[e].thestop); writeln(output,' stop = ', stop); */ good = false; previous = e; /* note where it was in the list */ e = LINK->feastored; /* kill the loop rapidly! */ } /*write(output, 'allowalternative: ', allowalternative);*/ /* write(output,'exon ',start:1,'-',stop:1, ' was dropped because it is '); */ else if (LINK->fealist->a[e-1].thestart == LINK->start || LINK->fealist->a[e-1].thestop == LINK->stop) printf(" alternative"); /* mark alternatives */ e++; /* normal loop step */ } if (good) { LINK->feastored++; if (LINK->feastored > exonmax) { printf("\nexonmax = %ld exceeded\n", (long)exonmax); halt(); } /* record the exon in the list */ LINK->fealist->a[LINK->feastored-1].complement = LINK->atcomplement; LINK->fealist->a[LINK->feastored-1].FEA = LINK->FEAcount; LINK->fealist->a[LINK->feastored-1].printasexon = featureisexon; LINK->fealist->a[LINK->feastored-1].genenamefound = false; LINK->fealist->a[LINK->feastored-1].genenumberfound = false; LINK->fealist->a[LINK->feastored-1].notenumber = 0; clearstring(&LINK->fealist->a[LINK->feastored-1].notestring); LINK->fealist->a[LINK->feastored-1].featurenumber = LINK->featurecount; /* write(output,' vvvCHECK'); */ if (featureisexon) { LINK->fealist->a[LINK->feastored-1].thestart = LINK->start; LINK->fealist->a[LINK->feastored-1].thestop = LINK->stop; LINK->fealist->a[LINK->feastored-1].getthestart = getstart; LINK->fealist->a[LINK->feastored-1].getthestop = getstop; } else { LINK->fealist->a[LINK->feastored-1].thestart = LINK->stop; LINK->fealist->a[LINK->feastored-1].thestop = LINK->start; LINK->fealist->a[LINK->feastored-1].getthestart = getstop; LINK->fealist->a[LINK->feastored-1].getthestop = getstart; } /* for an intron, reverse the order of the recording: */ /* target gene names and numbers to this recent one */ LINK->featarget = LINK->feastored; } else { /* target gene names and numbers to the previous feature */ LINK->featarget = previous; /*zzzggg*/ } /* Ok so the feature exists - see if we can put current genename and genenumber into it */ /* writeln(output); writeln(output, 'rescue mission'); (* first matters first. Do we have anything to contribute? *) writeln(output, 'feastored = ',feastored:1); writeln(output, 'previous = ',previous:1); writeln(output, 'fealist^.a[previous].genenamefound = ', fealist^.a[previous].genenamefound:1); writeln(output, 'fealist^.a[previous].genenumberfound = ', fealist^.a[previous].genenumberfound:1); */ putchar('\n'); } /* addfeature */ Local Void dojoin(LINK) struct LOC_themain *LINK; { /* do the join function to identify the parts of a CDS. */ boolean getstart, getstop; _TEXT TEMP; /* whether to get the start or stop of an exon. Don't ever get them if they are at the end of a CDS. */ if (LINK->debug) printf("[JOIN FOUND]\n"); putchar('\n'); /* break the line at the 'join(' */ LINK->printon = true; resettrigger(&LINK->join); /*printon := false;*/ /* go through the join list */ resettrigger(&LINK->closeparen); LINK->unitcount = 0; LINK->firstnumber = true; LINK->good = true; LINK->dotcount = 0; LINK->unitend = false; LINK->completeunit = false; do { nextline(LINK); if (P_peek(LINK->db->f) == ' ') /* skip all blanks */ nextcharacter(LINK->db, &LINK->cprevious, &LINK->c, LINK); else { nextline(LINK); nextcharacter(LINK->db, &LINK->cprevious, &LINK->c, LINK); if (LINK->c == ',' || LINK->c == ')') { /* end of the unit */ LINK->unitend = true; LINK->unitcount++; /*write(output,' unitend, unitcount = ',unitcount:1);*/ } else { if (P_inset(LINK->c, LINK->numbers)) { if (LINK->firstnumber) { readaninteger(LINK->db, LINK->c, &LINK->start, LINK); if (LINK->start == LINK->stop + 1) { LINK->killentry = true; TEMP.f = stdout; *TEMP.name = '\0'; writestring(&TEMP, &LINK->accname); /* report the name to output */ printf(" zero length intron containing entry! DELETED\n"); } LINK->completeunit = false; /* until stop has been read */ LINK->firstnumber = false; } else { readaninteger(LINK->db, LINK->c, &LINK->stop, LINK); LINK->completeunit = true; } } else if (LINK->c == '.') LINK->dotcount++; else { /*writeln(output,'unit died because of ',c);*/ LINK->good = false; /* any other junk: kill */ } } /* inside of a unit */ /* end of unit read */ /*writeln(output,'dotcount=', dotcount:1);*/ /*writeln(output,'HERE good=', good, ' unitend=',unitend);*/ if (LINK->unitend && LINK->completeunit && LINK->good) { if (LINK->dotcount != 2) LINK->good = false; } if (LINK->noends && LINK->unitend) { /* kill the end ones */ if (LINK->unitcount == 1) /*unitend and*/ LINK->good = false; if (LINK->c == ')') LINK->good = false; } /* write (output,' unitend=',unitend); write (output,' completeunit=',completeunit); writeln(output,' good=',good); */ if (LINK->unitend && LINK->completeunit && LINK->good) { /* record the unit */ if (LINK->atcomplement) { /* reverse start and stop */ LINK->hold = LINK->start; LINK->start = LINK->stop; LINK->stop = LINK->hold; } /* Do another test. Two cases: 1) If we are at the low end of the CDS, and it is the exon start, this is probably not an acceptor and should be removed. 2) If we are at the high end of the CDS, and it is the exon stop, this is probably not a donor and should be removed. */ getstart = true; getstop = true; /* test for low end of the CDS */ if (LINK->unitcount == 1) getstart = false; /* test for high end of the CDS */ if (LINK->c == ')') getstop = false; addfeature(true, getstart, getstop, LINK); } if (LINK->unitend) { /* reeset for next time */ /*writeln(output,'UNITEND');*/ LINK->unitend = false; LINK->good = true; LINK->dotcount = 0; LINK->firstnumber = true; LINK->completeunit = false; } } testfortrigger(LINK->c, &LINK->closeparen); } while (!LINK->closeparen.found); /* join list finishes */ printf(")\n"); /*vvv dojoin*/ resettrigger(&LINK->closeparen); LINK->atcomplement = false; } /* dojoin */ Local Void dofeature(featureisexon, LINK) boolean featureisexon; struct LOC_themain *LINK; { /* get the feature information. Cases of "<" or ">" are not included since they are off the end of the piece. exon 143..251 /gene="hMLH1" /number=14 When featureisexon is false then the feature is not a cds or exon (ie it's an intron), and we should not mark it as an exon. Only instructions for the ends are given. */ boolean done; /* done looking for the word 'complement' */ boolean getstart = false, getstop = false; /* whether to get the start or stop of an exon. */ /*writeln(output,'dofeature *************************');*/ LINK->start = 0; LINK->stop = 0; skipblanks(LINK->db); LINK->c = getc(LINK->db->f); if (LINK->c == '\n') LINK->c = ' '; LINK->atcomplement = false; if (LINK->c == 'c') { /* handle the complement! */ /*zzzccc*/ /* make sure it is the word "complement" */ resettrigger(&LINK->complement); /* first character already is past: */ LINK->complement.state++; done = false; while (!done) { if (P_eoln(LINK->db->f)) break; LINK->c = getc(LINK->db->f); if (LINK->c == '\n') { /* p2c: exon.p: Note: Eliminated unused assignment statement [338] */ LINK->c = ' '; } /*write(output,c);*/ testfortrigger(LINK->c, &LINK->complement); if (!LINK->complement.found) continue; LINK->atcomplement = true; done = true; LINK->c = getc(LINK->db->f); /* prepare for the next part */ if (LINK->c == '\n') LINK->c = ' '; } } if (P_inset(LINK->c, LINK->numbers)) { /* ie, it's not "<" */ readaninteger(LINK->db, LINK->c, &LINK->start, LINK); getstart = true; /*writeln(output,' start =',start:1);*/ } else if (LINK->c == '<') { /* get the number but keep getstart false */ LINK->c = getc(LINK->db->f); if (LINK->c == '\n') LINK->c = ' '; if (P_inset(LINK->c, LINK->numbers)) readaninteger(LINK->db, LINK->c, &LINK->start, LINK); else LINK->start = 0; } LINK->c = getc(LINK->db->f); if (LINK->c == '\n') { /* skip the ".." */ LINK->c = ' '; } if (LINK->c == '.') { LINK->c = getc(LINK->db->f); if (LINK->c == '\n') LINK->c = ' '; /*writeln(output,' c1="',c,'"');*/ if (LINK->c == '.') { LINK->c = getc(LINK->db->f); if (LINK->c == '\n') LINK->c = ' '; /*writeln(output,' c3="',c,'"');*/ if (LINK->c == '>') { LINK->c = getc(LINK->db->f); /* skip on */ if (LINK->c == '\n') LINK->c = ' '; if (P_inset(LINK->c, LINK->numbers)) readaninteger(LINK->db, LINK->c, &LINK->stop, LINK); else LINK->stop = LONG_MAX; } else if (P_inset(LINK->c, LINK->numbers)) { /* ie, it's not ">" */ readaninteger(LINK->db, LINK->c, &LINK->stop, LINK); getstop = true; /*writeln(output,' stop =',stop:1);*/ } } /*writeln(output,' c2="',c,'"');*/ } fscanf(LINK->db->f, "%*[^\n]"); getc(LINK->db->f); /*writeln(output,'start: ',getstart);*/ /*writeln(output,' stop: ',getstop);*/ /* kill weird cases. This comes up in D13897 where there is: exon 1531..(1770.1773) Such things should just be dropped. */ if (LINK->start == 0) getstart = false; if (LINK->stop == 0) getstart = false; if (LINK->atcomplement) { /*zzzccc*/ /* reverse start and stop */ LINK->hold = LINK->start; LINK->start = LINK->stop; LINK->stop = LINK->hold; } if (getstart || getstop) addfeature(featureisexon, getstart, getstop, LINK); /*writeln(output,'***********************************');*/ } Local Void readparameters(LINK) struct LOC_themain *LINK; { /* read the parameters */ if (*LINK->exonp->name != '\0') { if (LINK->exonp->f != NULL) LINK->exonp->f = freopen(LINK->exonp->name, "r", LINK->exonp->f); else LINK->exonp->f = fopen(LINK->exonp->name, "r"); } else rewind(LINK->exonp->f); if (LINK->exonp->f == NULL) _EscIO2(FileNotFound, LINK->exonp->name); RESETBUF(LINK->exonp->f, Char); /* insist on new parameter for version */ if (P_peek(LINK->exonp->f) == '0' || P_peek(LINK->exonp->f) == '9' || P_peek(LINK->exonp->f) == '8' || P_peek(LINK->exonp->f) == '7' || P_peek(LINK->exonp->f) == '6' || P_peek(LINK->exonp->f) == '5' || P_peek(LINK->exonp->f) == '4' || P_peek(LINK->exonp->f) == '3' || P_peek(LINK->exonp->f) == '2' || P_peek(LINK->exonp->f) == '1' || P_peek(LINK->exonp->f) == '0') { fscanf(LINK->exonp->f, "%lg%*[^\n]", &LINK->parameterversion); getc(LINK->exonp->f); } else { printf("No version parameter, upgrading exonp.\n"); if (*LINK->holdfile.name != '\0') { if (LINK->holdfile.f != NULL) LINK->holdfile.f = freopen(LINK->holdfile.name, "w", LINK->holdfile.f); else LINK->holdfile.f = fopen(LINK->holdfile.name, "w"); } else { if (LINK->holdfile.f != NULL) rewind(LINK->holdfile.f); else LINK->holdfile.f = tmpfile(); } if (LINK->holdfile.f == NULL) _EscIO2(FileNotFound, LINK->holdfile.name); SETUPBUF(LINK->holdfile.f, Char); fprintf(LINK->holdfile.f, "%4.2f version of exon that this parameter file is designed for.\n", version); /*vvv*/ if (copylines(LINK->exonp, &LINK->holdfile, 8L) != 8) { printf("Cannot copy 8 lines of exonp:\n"); printf("Build it yourself.\n"); halt(); } fprintf(LINK->holdfile.f, "CDNA c/C: CDNA, m/R/r: mRNA\n"); /* preserve notes: */ while (!BUFEOF(LINK->exonp->f)) copyaline(LINK->exonp, &LINK->holdfile); if (*LINK->holdfile.name != '\0') { if (LINK->holdfile.f != NULL) LINK->holdfile.f = freopen(LINK->holdfile.name, "r", LINK->holdfile.f); else LINK->holdfile.f = fopen(LINK->holdfile.name, "r"); } else rewind(LINK->holdfile.f); if (LINK->holdfile.f == NULL) _EscIO2(FileNotFound, LINK->holdfile.name); RESETBUF(LINK->holdfile.f, Char); if (*LINK->exonp->name != '\0') { if (LINK->exonp->f != NULL) LINK->exonp->f = freopen(LINK->exonp->name, "w", LINK->exonp->f); else LINK->exonp->f = fopen(LINK->exonp->name, "w"); } else { if (LINK->exonp->f != NULL) rewind(LINK->exonp->f); else LINK->exonp->f = tmpfile(); } if (LINK->exonp->f == NULL) _EscIO2(FileNotFound, LINK->exonp->name); SETUPBUF(LINK->exonp->f, Char); while (!BUFEOF(LINK->holdfile.f)) copyaline(&LINK->holdfile, LINK->exonp); if (*LINK->exonp->name != '\0') { if (LINK->exonp->f != NULL) LINK->exonp->f = freopen(LINK->exonp->name, "r", LINK->exonp->f); else LINK->exonp->f = fopen(LINK->exonp->name, "r"); } else rewind(LINK->exonp->f); if (LINK->exonp->f == NULL) _EscIO2(FileNotFound, LINK->exonp->name); RESETBUF(LINK->exonp->f, Char); fscanf(LINK->exonp->f, "%lg%*[^\n]", &LINK->parameterversion); getc(LINK->exonp->f); } fscanf(LINK->exonp->f, "%c%*[^\n]", &LINK->c); getc(LINK->exonp->f); if (LINK->c == '\n') { LINK->c = ' '; } if (LINK->c == 'n') LINK->noends = true; else LINK->noends = false; fscanf(LINK->exonp->f, "%c%*[^\n]", &LINK->c); getc(LINK->exonp->f); if (LINK->c == '\n') { LINK->c = ' '; } if (LINK->c == 'd') LINK->debug = true; else LINK->debug = false; fscanf(LINK->exonp->f, "%ld%ld%*[^\n]", &LINK->theDfromrange, &LINK->theDtorange); getc(LINK->exonp->f); fscanf(LINK->exonp->f, "%ld%ld%*[^\n]", &LINK->theAfromrange, &LINK->theAtorange); getc(LINK->exonp->f); LINK->c = getc(LINK->exonp->f); if (LINK->c == '\n') LINK->c = ' '; if (LINK->c == 'e') LINK->getexons = true; else LINK->getexons = false; LINK->c = getc(LINK->exonp->f); if (LINK->c == '\n') LINK->c = ' '; if (LINK->c == 'i') LINK->getintrons = true; else LINK->getintrons = false; fscanf(LINK->exonp->f, "%*[^\n]"); getc(LINK->exonp->f); fscanf(LINK->exonp->f, "%c%*[^\n]", &LINK->c); getc(LINK->exonp->f); if (LINK->c == '\n') LINK->c = ' '; if (LINK->c == 'a') LINK->allowalternative = true; else LINK->allowalternative = false; LINK->c = getc(LINK->exonp->f); if (LINK->c == '\n') { LINK->c = ' '; } if (LINK->c == 'P') LINK->killputative = true; else LINK->killputative = false; LINK->c = getc(LINK->exonp->f); if (LINK->c == '\n') { LINK->c = ' '; } if (LINK->c == 'N') LINK->killnotexperimental = true; else LINK->killnotexperimental = false; LINK->c = getc(LINK->exonp->f); if (LINK->c == '\n') { LINK->c = ' '; } if (LINK->c == 'G') LINK->killgeneprediction = true; else LINK->killgeneprediction = false; LINK->c = getc(LINK->exonp->f); if (LINK->c == '\n') { LINK->c = ' '; } if (LINK->c == 'U') LINK->killunpublished = true; else LINK->killunpublished = false; LINK->c = getc(LINK->exonp->f); if (LINK->c == '\n') { LINK->c = ' '; } if (LINK->c == 'S') LINK->killpseudo = true; else LINK->killpseudo = false; fscanf(LINK->exonp->f, "%*[^\n]"); getc(LINK->exonp->f); if ((!P_eoln(LINK->exonp->f)) & (!BUFEOF(LINK->exonp->f))) { LINK->c = getc(LINK->exonp->f); if (LINK->c == '\n') LINK->c = ' '; if (LINK->c == 'n') LINK->usenotes = true; else LINK->usenotes = false; } else LINK->usenotes = false; fscanf(LINK->exonp->f, "%*[^\n]"); getc(LINK->exonp->f); fscanf(LINK->exonp->f, "%c%*[^\n]", &LINK->thefeature); getc(LINK->exonp->f); if (LINK->thefeature == '\n') LINK->thefeature = ' '; /* allow the user to type mRNA, RNA or rna: */ if (LINK->thefeature == 'R' || LINK->thefeature == 'r' || LINK->thefeature == 'm') LINK->thefeature = 'r'; /*vvv*/ } /* Local variables for writeparameters: */ struct LOC_writeparameters { struct LOC_themain *LINK; _TEXT *f; } ; Local Void no(body, LINK) boolean body; struct LOC_writeparameters *LINK; { fprintf(LINK->f->f, "* "); if (body) fprintf(LINK->f->f, "no "); else fprintf(LINK->f->f, " "); } Local Void writeparameters(f_, LINK) _TEXT *f_; struct LOC_themain *LINK; { /* read the parameters to file f */ struct LOC_writeparameters V; V.LINK = LINK; V.f = f_; fprintf(V.f->f, "***********************************************\n"); fprintf(V.f->f, "**************** parameters *******************\n"); no(LINK->noends, &V); fprintf(V.f->f, "exons on the ends of CDS included.\n"); fprintf(V.f->f, "* the DONOR from range: %3ld\n", LINK->theDfromrange); fprintf(V.f->f, "* the DONOR to range: %3ld\n", LINK->theDtorange); fprintf(V.f->f, "* the ACCEPTOR from range: %3ld\n", LINK->theAfromrange); fprintf(V.f->f, "* the ACCEPTOR to range: %3ld\n", LINK->theAtorange); no(!LINK->getexons, &V); fprintf(V.f->f, "exon features included.\n"); no(!LINK->getintrons, &V); fprintf(V.f->f, "intron features included.\n"); no(!LINK->allowalternative, &V); fprintf(V.f->f, "alternative exons included.\n"); no(!LINK->killputative, &V); fprintf(V.f->f, "\"putative\" entries will be killed.\n"); no(!LINK->killnotexperimental, &V); fprintf(V.f->f, "\"not_experimental\" entries will be killed.\n"); no(!LINK->killgeneprediction, &V); fprintf(V.f->f, "\"gene prediction\" entries will be killed.\n"); no(!LINK->killunpublished, &V); fprintf(V.f->f, "\"Unpublished\" entries will be killed.\n"); no(!LINK->killpseudo, &V); fprintf(V.f->f, "\"pseudo\" entries will be killed.\n"); no(!LINK->usenotes, &V); fprintf(V.f->f, "/notes included in names.\n"); fprintf(V.f->f, "* %c feature: ", LINK->thefeature); if (LINK->thefeature == 'r') fprintf(V.f->f, "mRNA\n"); else fprintf(V.f->f, "CDNA\n"); fprintf(V.f->f, "***********************************************\n"); } /* writeparameters */ Local Void killtest(c, t, n, killt, killentry, LINK) Char c; trigger *t; string n; boolean killt, *killentry; struct LOC_themain *LINK; { /* Test whether the character c triggers trigger t. n is the name of the entry. killt is whether the entry should be killed if the trigger t was found. killentry is whether the entry has already been killed. */ _TEXT TEMP; testfortrigger(c, t); if (!t->found) return; if (n.length > 0) { /* report the name to output */ TEMP.f = stdout; *TEMP.name = '\0'; writestring(&TEMP, &n); } else printf("The next entry"); printf(" will be"); if (killt) { printf(" KILLED"); *killentry = true; } else printf(" KEPT"); printf(". It contains: \""); TEMP.f = stdout; *TEMP.name = '\0'; writestring(&TEMP, &t->seek); printf("\".\n"); resettrigger(t); } Local Void doFEA(LINK) struct LOC_themain *LINK; { /* do CDS or mRNA features */ _TEXT TEMP; if (columnposition >= 6 && columnposition <= 20) testfortrigger(LINK->c, &LINK->FEA); if (LINK->FEA.found) { if (LINK->infeatures) { /*writeln(output,'CDS FOUND');*/ skipblanks(LINK->db); LINK->FEAcount++; if (LINK->debug) putchar('\n'); if (LINK->debug) printf("CDS %ld ======================\n", LINK->FEAcount); while (!P_eoln(LINK->db->f)) { nextcharacter(LINK->db, &LINK->cprevious, &LINK->c, LINK); /*write(output,c);*/ putchar(LINK->c); testfortrigger(LINK->c, &LINK->complement); if (LINK->complement.found) LINK->atcomplement = true; if (LINK->atcomplement) { /* unique case of a single exon, eg M35391 */ testfortrigger(LINK->c, &LINK->closeparen); if (LINK->closeparen.found) LINK->atcomplement = false; } testfortrigger(LINK->c, &LINK->join); if (LINK->join.found) { dojoin(LINK); continue; } if (!P_inset(LINK->c, LINK->numbers)) continue; skipblanks(LINK->db); readaninteger(LINK->db, LINK->c, &LINK->start, LINK); LINK->c = getc(LINK->db->f); if (LINK->c == '\n') LINK->c = ' '; if (LINK->c != '.') { printf("error in reading CDS or mRNA\"%c\" found instead of \".\"\n", LINK->c); printf("ord(c) is %12d\n", LINK->c); halt(); } LINK->c = getc(LINK->db->f); if (LINK->c == '\n') LINK->c = ' '; if (LINK->c != '.') { printf("error in reading CDS or mRNA\"%c\" found instead of \".\"\n", LINK->c); halt(); } LINK->c = getc(LINK->db->f); if (LINK->c == '\n') LINK->c = ' '; if (!P_inset(LINK->c, LINK->numbers)) { if (LINK->c == '>') { LINK->c = getc(LINK->db->f); /* skip on */ if (LINK->c == '\n') LINK->c = ' '; if (P_inset(LINK->c, LINK->numbers)) readaninteger(LINK->db, LINK->c, &LINK->stop, LINK); else LINK->stop = LONG_MAX; } else { printf("error in reading CDS or mRNA\"%c\" is not a number\n", LINK->c); halt(); } } else readaninteger(LINK->db, LINK->c, &LINK->stop, LINK); /* writeln(output,'start ',start:1); writeln(output,'stop ',stop:1); */ if (LINK->atcomplement) { /* reverse start and stop */ LINK->hold = LINK->start; LINK->start = LINK->stop; LINK->stop = LINK->hold; } addfeature(true, true, true, LINK); } /* If it's a number then just grab the two numbers */ resettrigger(&LINK->FEA); } } /* pick up gene name and number into previous CDS */ /* 2007 Dec 10; Isn't that a bad idea? It wrecks the last exon! */ if (!(LINK->infeatures && LINK->getexons)) return; testfortrigger(LINK->c, &LINK->genename); /* Pick up names and numbers only if they are in the same feature */ if (LINK->genename.found) { if (LINK->fealist->a[LINK->featarget-1].genenamefound != true && LINK->featurecount == LINK->fealist->a[LINK->featarget-1].featurenumber) { /* writeln(output,'featarget = ',featarget:1,' '); writeln(output,'genename.found -----------'); */ /* This mechanism was converting the last exon into the gene feature! fealist^.a[featarget].genenamefound:= true; */ /*vvv some mechanism here causes loss of the last exon!*/ if (LINK->thefeature == 'r') LINK->fealist->a[LINK->featarget-1].genenamefound = false; else { LINK->fealist->a[LINK->featarget-1].genenamefound = true; /* read the name into the feature list */ grabquote(LINK->db, &LINK->fealist->a[LINK->featarget-1].genenamestring); printf(" gene: "); TEMP.f = stdout; *TEMP.name = '\0'; writestring(&TEMP, &LINK->fealist->a[LINK->featarget-1].genenamestring); putchar('\n'); } } /* writeln(output,'genename.found ----------- done'); */ resettrigger(&LINK->genename); } /*zzzggg*/ testfortrigger(LINK->c, &LINK->genenumber); if (!LINK->genenumber.found) return; if (LINK->fealist->a[LINK->featarget-1].genenumberfound != true && LINK->featurecount == LINK->fealist->a[LINK->featarget-1].featurenumber) { /* writeln(output,'featarget = ',featarget:1,' '); writeln(output,'genenumber.found -----------'); */ LINK->fealist->a[LINK->featarget-1].genenumberfound = true; /* read the number into the feature list */ grabtoken(LINK->db, &LINK->fealist->a[LINK->featarget-1].genenumberstring); printf(" "); TEMP.f = stdout; *TEMP.name = '\0'; writestring(&TEMP, &LINK->fealist->a[LINK->featarget-1].genenumberstring); putchar('\n'); /* writeln(output,'genenumber.found ----------- done'); */ } resettrigger(&LINK->genenumber); } /* end module grabquote version = 4.16; (@ of prgmod.p 1996 August 12 */ /* begin module exon.themain */ Static Void themain(exonp_, db_, lengths_, dinst_, ainst_, einst_, exonfeatures_) _TEXT *exonp_, *db_, *lengths_, *dinst_, *ainst_, *einst_, *exonfeatures_; { /* the main procedure of the program */ struct LOC_themain V; trigger accession; /* trigger to find the ACCESSION pattern */ trigger comma; /* trigger to find the end of the comma pattern */ boolean done; /* done fast skipping of sequence */ trigger features; /* trigger to find the ACCESSION pattern */ trigger entryend; /* a mark for the end of an entry */ trigger exon; /* trigger to find the exon feature */ trigger intron; /* trigger to find the intron feature */ trigger geneprediction; /* a locus contains non-experimental junk */ trigger locus; /* trigger to find the LOCUS pattern */ trigger putative; /* a locus contains non-experimental junk */ trigger notebegin; /* a mark for the beginning a note */ trigger noteend; /* a mark for the end a note */ string noteholder; /* holder for the last note */ trigger notexperimental; /* a locus contains non-experimental junk */ trigger origin; /* a mark for the ORIGIN: start of sequence */ trigger pseudo; /* a locus contains pseudo genes */ trigger unpublished; /* a locus contains junk */ _TEXT TEMP; string *WITH; V.exonp = exonp_; V.db = db_; V.lengths = lengths_; V.dinst = dinst_; V.ainst = ainst_; V.einst = einst_; V.exonfeatures = exonfeatures_; V.holdfile.f = NULL; *V.holdfile.name = '\0'; printf("exon %4.2f\n", version); V.fealist = (exonset *)Malloc(sizeof(exonset)); /* allocate the feature list memory on the heap see technical notes for the reason for doing this. */ readparameters(&V); if (*V.lengths->name != '\0') { if (V.lengths->f != NULL) V.lengths->f = freopen(V.lengths->name, "w", V.lengths->f); else V.lengths->f = fopen(V.lengths->name, "w"); } else { if (V.lengths->f != NULL) rewind(V.lengths->f); else V.lengths->f = tmpfile(); } if (V.lengths->f == NULL) _EscIO2(FileNotFound, V.lengths->name); SETUPBUF(V.lengths->f, Char); fprintf(V.lengths->f, "* exon %4.2f\n", version); writeparameters(V.lengths, &V); TEMP.f = stdout; *TEMP.name = '\0'; writeparameters(&TEMP, &V); /*vvv*/ if (V.debug) printf("DEBUGGING\n"); /* 1 2 3 4 5 */ /* 12345678901234567890123456789012345678901234567890 */ filltrigger(&locus, "LOCUS "); filltrigger(&V.organism, "ORGANISM "); filltrigger(&accession, "ACCESSION "); filltrigger(&features, "FEATURES "); if (V.thefeature == 'r') filltrigger(&V.FEA, "mRNA "); else filltrigger(&V.FEA, "CDS "); filltrigger(&V.genename, "/gene=\" "); filltrigger(&V.genenumber, "/number= "); filltrigger(&exon, "exon "); filltrigger(&intron, "intron "); filltrigger(&V.join, "join( "); filltrigger(&V.closeparen, ") "); filltrigger(&comma, ", "); filltrigger(&V.complement, "complement( "); filltrigger(&putative, "putative "); filltrigger(¬experimental, "not_experimental "); filltrigger(&geneprediction, "gene prediction "); filltrigger(&unpublished, "Unpublished "); filltrigger(&pseudo, "pseudo "); filltrigger(&entryend, "// "); filltrigger(¬ebegin, "/note=\" "); filltrigger(¬eend, "\" "); filltrigger(&origin, "ORIGIN "); fillstring(&V.unknownstring, "unknown "); resettrigger(&V.FEA); resettrigger(&exon); resettrigger(&intron); resettrigger(&locus); resettrigger(&accession); resettrigger(&features); resettrigger(&V.genename); resettrigger(&V.genenumber); resettrigger(&V.join); resettrigger(&V.closeparen); resettrigger(&V.complement); resettrigger(&comma); resettrigger(&putative); resettrigger(¬experimental); resettrigger(&geneprediction); resettrigger(&unpublished); resettrigger(&pseudo); resettrigger(&entryend); resettrigger(¬ebegin); resettrigger(¬eend); resettrigger(&origin); V.locuscount = 0; V.FEAcount = 0; V.totalentrycount = 0; V.totalfeastored = 0; V.feastored = 0; V.infeatures = false; V.linenumber = 1; /* we are already at the first line */ P_addset(P_expset(V.numbers, 0L), '0'); P_addset(V.numbers, '1'); P_addset(V.numbers, '2'); P_addset(V.numbers, '3'); P_addset(V.numbers, '4'); P_addset(V.numbers, '5'); P_addset(V.numbers, '6'); P_addset(V.numbers, '7'); P_addset(V.numbers, '8'); P_addset(V.numbers, '9'); V.killentry = false; fprintf(V.lengths->f, "* columns\n"); /* 12345678 */ fprintf(V.lengths->f, "* length start stop dir exon# FEA# locus# accession\n"); startinst(V.dinst, V.db, 'd', &V); startinst(V.ainst, V.db, 'a', &V); startinst(V.einst, V.db, 'e', &V); if (*V.exonfeatures->name != '\0') { if (V.exonfeatures->f != NULL) V.exonfeatures->f = freopen(V.exonfeatures->name, "w", V.exonfeatures->f); else V.exonfeatures->f = fopen(V.exonfeatures->name, "w"); } else { if (V.exonfeatures->f != NULL) rewind(V.exonfeatures->f); else V.exonfeatures->f = tmpfile(); } if (V.exonfeatures->f == NULL) _EscIO2(FileNotFound, V.exonfeatures->name); SETUPBUF(V.exonfeatures->f, Char); fprintf(V.exonfeatures->f, "* exon: exonfeatures %4.2f\n", version); if (*V.db->name != '\0') { if (V.db->f != NULL) V.db->f = freopen(V.db->name, "r", V.db->f); else V.db->f = fopen(V.db->name, "r"); } else rewind(V.db->f); if (V.db->f == NULL) _EscIO2(FileNotFound, V.db->name); RESETBUF(V.db->f, Char); V.c = ' '; /* forces initial cprevious to be blank */ while (!BUFEOF(V.db->f)) { columnposition = 0; while (!P_eoln(V.db->f)) { V.cprevious = V.c; nextcharacter(V.db, &V.cprevious, &V.c, &V); /* Count Features */ if (V.infeatures && columnposition == 6 && V.c != ' ') { /* We are (almost certainly, damn GenBank for not defining things precisely!) at the start of a feature */ V.featurecount++; resettrigger(&V.genename); resettrigger(&V.genenumber); /* writeln(output,'featurecount = ',featurecount:1); */ } /* LOCUS */ testfortrigger(V.c, &locus); if (locus.found) { V.killentry = false; V.locuscount++; V.FEAcount = 0; clearstring(&V.accname); V.infeatures = false; V.atcomplement = false; V.stop = -LONG_MAX; /* be sure initial intron length is large */ V.start = -LONG_MAX; /* just to be clean */ V.featurecount = 0; if (V.debug) putchar('\n'); if (V.debug) printf("LOCUS %ld ======================\n", V.locuscount); } /* /note */ testfortrigger(V.c, ¬ebegin); if (notebegin.found) { /* skip all notes. They can contain deadly triggers that mess up the program. */ if (V.debug) printf("skipping note:%c", V.c); clearstring(¬eholder); if (V.feastored > 0) { /* if there is at least one exon so far, record notes */ if (V.featurecount == V.fealist->a[V.feastored-1].featurenumber) V.fealist->a[V.feastored-1].notenumber++; } while (!noteend.found) { nextcharacter(V.db, &V.cprevious, &V.c, &V); /* ONLY record into previous fealist^.a item IF we are on the same object. Otherwise record into holding string */ if (V.feastored > 0) { /* if there is an exon, record notes */ /* only record the first note */ /* only record if it is from the same feature */ if (V.fealist->a[V.feastored-1].notestring.length < maxstring && V.c != '"' && V.featurecount == V.fealist->a[V.feastored-1].featurenumber) { WITH = &V.fealist->a[V.feastored-1].notestring; WITH->length++; WITH->letters[WITH->length - 1] = V.c; /* write(output,'+',e:1,'@'); write(output,c); */ } else { /*zzzuuu*/ /* currently this is not used, but the code for now */ /* check for length solved the U41218 bug */ if (noteholder.length < maxstring) { noteholder.length++; noteholder.letters[noteholder.length - 1] = V.c; /* writeln(output,'noteholder.length = ',noteholder.length:1); write(output,'noteholder = '); writestring(output,noteholder); writeln(output,'"'); */ } } } if (V.debug) putchar(V.c); if (V.debug) { if (P_eoln(V.db->f)) putchar('\n'); } testfortrigger(V.c, ¬eend); } if (V.debug) putchar('\n'); resettrigger(¬ebegin); resettrigger(¬eend); } /* END OF ENTRY */ testfortrigger(V.c, &entryend); if (entryend.found) printexons(&V); /* print previously found exons */ /*write(output,c);*/ /* START OF ENTRY */ testfortrigger(V.c, &accession); if (accession.found) { /* capture the accession string */ grabtoken(V.db, &V.accname); TEMP.f = stdout; *TEMP.name = '\0'; writestring(&TEMP, &V.accname); /* report the name to output */ putchar('\n'); if (V.accname.length <= 0) printf("MISSING ACCESSION STRING\n"); resettrigger(&accession); if (V.debug) printf("ACCESSION "); if (V.debug) { TEMP.f = stdout; *TEMP.name = '\0'; writestring(&TEMP, &V.accname); } if (V.debug) printf(" ======================\n"); } /* test for bad entry markers and kill them: */ if (!V.killentry) { killtest(V.c, &putative, V.accname, V.killputative, &V.killentry, &V); killtest(V.c, ¬experimental, V.accname, V.killnotexperimental, &V.killentry, &V); killtest(V.c, &geneprediction, V.accname, V.killgeneprediction, &V.killentry, &V); killtest(V.c, &unpublished, V.accname, V.killunpublished, &V.killentry, &V); killtest(V.c, &pseudo, V.accname, V.killpseudo, &V.killentry, &V); } testfortrigger(V.c, &features); if (features.found) { /*writeln(output,'FEATURE found'); */ V.infeatures = true; } /*zzzname*/ /* This is a possible solution to the CDS regions suppressing the note information. It is not very good because CDS is more "reliable" according to David Lipman. ALSO this routine looks for the /gene and /note! if not usenotes then */ doFEA(&V); if (V.infeatures) { if (V.getexons) { testfortrigger(V.c, &exon); if (exon.found) { dofeature(true, &V); resettrigger(&exon); } } } if (V.infeatures) { if (V.getintrons) { testfortrigger(V.c, &intron); if (intron.found) { dofeature(false, &V); resettrigger(&intron); } } } /* Sequence: skip it extremely quickly!! */ testfortrigger(V.c, &origin); if (!origin.found) continue; printf("Skipping sequence ...\n"); fscanf(V.db->f, "%*[^\n]"); getc(V.db->f); done = false; while (!done) { V.c = getc(V.db->f); if (V.c == '\n') V.c = ' '; if (V.c != '/') continue; /* next line handles the readln(db); */ V.c = getc(V.db->f); if (V.c == '\n') V.c = ' '; if (V.c == '/') done = true; } printf("... done skipping sequence\n"); } nextline(&V); } printexons(&V); /* finish off previous exon list */ printf("%ld entry(s) have features\n", V.totalentrycount); printf("%ld features processed\n", V.totalfeastored); printf("%ld features\n", V.featurecount); fprintf(V.lengths->f, "* %ld entry(s) have features\n", V.totalentrycount); fprintf(V.lengths->f, "* %ld features processed\n", V.totalfeastored); fprintf(V.lengths->f, "* %ld features\n", V.featurecount); if (V.holdfile.f != NULL) fclose(V.holdfile.f); } #undef exonmax #undef colwid /* end module exon.themain */ main(argc, argv) int argc; Char *argv[]; { PASCAL_MAIN(argc, argv); if (setjmp(_JL1)) goto _L1; exonfeatures.f = NULL; strcpy(exonfeatures.name, "exonfeatures"); einst.f = NULL; strcpy(einst.name, "einst"); ainst.f = NULL; strcpy(ainst.name, "ainst"); dinst.f = NULL; strcpy(dinst.name, "dinst"); lengths.f = NULL; strcpy(lengths.name, "lengths"); db.f = NULL; strcpy(db.name, "db"); exonp.f = NULL; strcpy(exonp.name, "exonp"); themain(&exonp, &db, &lengths, &dinst, &ainst, &einst, &exonfeatures); _L1: if (exonp.f != NULL) fclose(exonp.f); if (db.f != NULL) fclose(db.f); if (lengths.f != NULL) fclose(lengths.f); if (dinst.f != NULL) fclose(dinst.f); if (ainst.f != NULL) fclose(ainst.f); if (einst.f != NULL) fclose(einst.f); if (exonfeatures.f != NULL) fclose(exonfeatures.f); exit(EXIT_SUCCESS); } /* End. */