/* Output from p2c 1.21alpha-07.Dec.93, the Pascal-to-C translator */ /* From input file "mkdb.p" */ #include /* mkdb: read sequence; make GenBank entry with features for capitalized regions Dr. Thomas D. Schneider National Institutes of Health National Cancer Institute Center for Cancer Research Nanobiology Program Molecular Information Theory Group Frederick, Maryland 21702-1201 schneidt@mail.nih.gov permanent email: toms@alum.mit.edu http://www.ccrnp.ncifcrf.gov/~toms/ */ /* end of program */ /* begin module version */ #define version 1.26 /* of mkdb.p 2009 Apr 08 2009 Apr 08, 1.26: supply version 2008 Jun 25, 1.25: put topology into the entries 2007 Apr 02, 1.24: increase maximum sequence length 2007 Mar 01, 1.23: options for multipart names 2007 Mar 01, 1.22: make locus name not be multi part 2005 Sep 16, 1.21: upgrade documentation and brush up code 2002 Apr 9, 1.20: sequence length incremented, brush up documentation 2000 Nov 7, 1.19: primer bound corrected 2000 Jun 15, 1.18: upgrade to take completely raw sequence 1999 Sep 23, 1.17: Format spacing corrected: more space after id words 1999 Jun 20, 1.16: allow reading fasta format 1997 Nov 13: : allow spaces, blank lines comments and ignore numbers 1997 Oct 21: : origin */ #define updateversion 1.09 /* defines lowest acceptable current parameter file */ /* end module version */ /* begin module describe.mkdb */ /* name mkdb: read sequence; make GenBank entry with features for capitalized regions synopsis mkdb(sequ: in, mkdbp: in, entries: out, output: out) files sequ: raw DNA sequences in lower case except for objects of interest marked in upper case. The program also accepts 'n'. Sequences are separated by periods. Each sequence may be preceeded by a name line and a species line. These lines can begin with '>' or '*', but this is not necessary. Spaces, blank lines and numbers are ignored. Other lines that begin with '>' or '*' are comments If there is no species name, the single name will be used for the species also. This kludge allows the program to read the fasta format. If there is no name at all (just sequence) then a name will be assigned: nameste. name: a string of characters to name the sequence. organism: the species this sequence represents entries: GenBank entries for the sequences, with features for the capitalized regions marked as exons and features for the lower case regions (not including primers) as introns. mkdbp: parameters to control the program. The file must contain the following parameters, one per line: parameterversion: The version number of the program. This allows the user to be warned if an old parameter file is used. exoncutoff (integer): Capitalized regions longer than this number of bases will be called exons, the others will be called primers. multipart (character): What to do if a name has spaces in it. 'i' ignore the rest of the name 'u' replace spaces with underscores output: messages to the user description Sequences are often marked by people with capital letters to indicate interesting regions (exons, primers, mutations, etc). This program uses raw sequences to create simple flat-file GenBank style entries with features marked by capital letters. Long features are called 'exons' while short ones are called 'primers'. The division between these two is given by the exoncutoff parameter. Example If the sequ contains: * T7stuff * Bacteriophage T7 aacataaaggacacaatgcaatgaacattaccgacatcatgaacgctatc gacgcaatcaaagcactgccaatctgtgaacttgacaagcgtcaaggtat gcttatcgacttactggtcgagatggtcaacagcgagacgtgtgatggcg agctaacCGAACTAAATCAGGCACttgagcatcaagattggtggactacc ttgaagtgtctcacggctgacgcagggttcaagATGCTCGGTAATGGTCA CTTCTCGGCTGCTTATAGTCACCCGCTGCTACCTAACAGAGTGATTAAGG TGGGCTTTAAGAAAGAGGATTCAGGCGCAGCCTATACCGCATTCTgccgc atgtatcagggtcgtcctggtatccctaacgtctacgatgtacagcgcca cgctggatgctatacggtggtacttgacgcacttaaggattgcgagcgtt tcaacaatgatgccCATTATAAATACGCTGAgattgcaagcgacatcatt gattgcaattcggatgagcatgatgagttaactggatgggatggtgagtt tgttgaaacttgtaaactaatccgcaagttctttgagggcatcgcctcat . The entries file will contain: LOCUS T7stuff 600 bp DNA * mkdb 1.21 DEFINITION Bacteriophage T7 ACCESSION T7stuff VERSION T7stuff.1 SOURCE Bacteriophage T7 ORGANISM Bacteriophage T7 FEATURES primer 158..174 exon 234..345 primer 465..481 BASE COUNT 166 a 133 c 151 g 150 t 0 n ORIGIN 1 aacataaagg acacaatgca atgaacatta ccgacatcat gaacgctatc gacgcaatca 61 aagcactgcc aatctgtgaa cttgacaagc gtcaaggtat gcttatcgac ttactggtcg 121 agatggtcaa cagcgagacg tgtgatggcg agctaaccga actaaatcag gcacttgagc 181 atcaagattg gtggactacc ttgaagtgtc tcacggctga cgcagggttc aagatgctcg 241 gtaatggtca cttctcggct gcttatagtc acccgctgct acctaacaga gtgattaagg 301 tgggctttaa gaaagaggat tcaggcgcag cctataccgc attctgccgc atgtatcagg 361 gtcgtcctgg tatccctaac gtctacgatg tacagcgcca cgctggatgc tatacggtgg 421 tacttgacgc acttaaggat tgcgagcgtt tcaacaatga tgcccattat aaatacgctg 481 agattgcaag cgacatcatt gattgcaatt cggatgagca tgatgagtta actggatggg 541 atggtgagtt tgttgaaact tgtaaactaa tccgcaagtt ctttgagggc atcgcctcat // documentation see also {example parameter file:} mkdbp {example sequ file:} mkdb.sequ {move to the name 'sequ' to use it} {Program for listing the sequences:} lister.p {Program for generating search for capitalized sequence:} capsmark.p author Thomas Dana Schneider bugs technical notes Capitalization that abuts either end of the sequence will be indicated in the entry as beyond the end. This way the ends of the sequence will not be marked as donors or acceptors. The maximum name and sequence lengths are constants maxobjectlength and maxsequencelength respectively. */ /* end module describe.mkdb */ /* begin module describe.const */ #define maxnamelength 100 /* maximum length name */ #define maxsequencelength 6000000L /* maximum sequence length */ /* 253256583 human chromosome 2 length */ /* Set the length to the maximum your computer can handle */ #define debugging false /* set true to get debugging output */ /* end module describe.const */ Static _TEXT sequ; /* file used by this program */ Static _TEXT mkdbp; /* file used by this program */ Static _TEXT entries; /* file used by this program */ Static jmp_buf _JL1; /* begin module halt */ Static Void halt() { /* stop the program. the procedure performs a goto to the end of the program. you must have a label: label 1; declared, and also the end of the program must have this label: 1: end. examples are in the module libraries. this is the only goto in the delila system. */ printf(" program halt.\n"); longjmp(_JL1, 1); } /* end module halt version = 'delmod 6.16 84 mar 12 tds/gds'; */ /* begin module capitalize */ Static Char capitalize(c) Char c; { /* convert the character c to upper case */ long n = c; /* c is the n'th letter of the alphabet */ if (n >= 'a' && n <= 'z') c = _toupper(n); return c; } /* end module capitalize prgmod */ /* begin module decapitalize */ Static Char decapitalize(c) Char c; { /* convert the character c to lower case */ long n = c; /* c is the n'th letter of the alphabet */ if (n >= 'A' && n <= 'Z') c = _tolower(n); else c = (Char)n; return c; } typedef struct object { Char id[maxnamelength]; /* a string object */ long length; /* length of the object */ } object; typedef struct sequence { Char id[maxsequencelength]; /* a sequence */ long length; /* length of the sequence */ } sequence; /* Local variables for themain: */ struct LOC_themain { _TEXT *sequ, *entries; Char c; /* a character in the sequence */ boolean die; /* stop the program if bad sequence characters found */ long entrylength; /* length of the sequence */ long exoncutoff; /* shortest feature to report as an exon */ Char multipart; /* what to do for multipart names */ object name; /* name of the sequence */ long na, nc, ng, nt, nn; /* numbers of bases and unknown base */ object organism; /* organism for the sequence */ sequence seq; /* the sequence */ long start; /* position in the sequence, start of feature */ long stop; /* position in the sequence, end of feature */ Char topology; /* topology of the sequence */ boolean waslower; /* true if previous character was lower case */ } ; Local Void writename(afile, name, LINK) _TEXT *afile; object name; struct LOC_themain *LINK; { /* write the name to the file. 2007 Mar 01: Force any spaces to be underscores. */ long n; /* index to the name */ boolean skip = false; /* skip the rest of the line */ /* for n := 1 to name.length do write(afile, name.id[n]); */ /*writeln(output,'multipart = ',multipart);*/ for (n = 0; n < name.length; n++) { if (!skip) { if (name.id[n] == ' ') { if (LINK->multipart == 'i') skip = true; else putc('_', afile->f); } else putc(name.id[n], afile->f); } } /* display to output for debugging: for n := 1 to name.length do begin if name.id[n] = ' ' then write(output, '_') else write(output, name.id[n]); end; writeln(output); */ } Local Void readname(afile, name, LINK) _TEXT *afile; object *name; struct LOC_themain *LINK; { /* write the name to the file */ _TEXT TEMP; /* read the name in */ if ((P_peek(afile->f) == '>') | (P_peek(afile->f) == '*')) { getc(afile->f); while ((P_peek(afile->f) == ' ') & (!P_eoln(afile->f))) getc(afile->f); if (P_eoln(afile->f)) { printf("missing name\n"); halt(); } name->length = 0; while (!P_eoln(afile->f)) { name->length++; if (name->length > maxnamelength) { printf("name too long:\n"); putchar('"'); TEMP.f = stdout; *TEMP.name = '\0'; writename(&TEMP, *name, LINK); printf("\"\n"); halt(); } name->id[name->length - 1] = getc(afile->f); if (name->id[name->length - 1] == '\n') name->id[name->length - 1] = ' '; } fscanf(afile->f, "%*[^\n]"); getc(afile->f); printf("READING: "); TEMP.f = stdout; *TEMP.name = '\0'; writename(&TEMP, *name, LINK); putchar('\n'); return; } printf("Assigned name: "); name->id[0] = 'n'; name->id[1] = 'a'; name->id[2] = 'm'; name->id[3] = 'e'; name->id[4] = 's'; name->id[5] = 't'; name->id[6] = 'e'; name->length = 7; putchar('"'); TEMP.f = stdout; *TEMP.name = '\0'; writename(&TEMP, *name, LINK); printf("\"\n"); /* writeln(output,'names must begin with > or *'); writeln(output,'character seen is: "',afile^,'"'); halt */ } Local Void indent(afile, amount, LINK) _TEXT *afile; long amount; struct LOC_themain *LINK; { /* put some indentation to the file */ long n; /* index to the name */ for (n = 1; n <= amount; n++) putc(' ', afile->f); } Local Void featurecoordinate(afile, start, stop, entrylength, LINK) _TEXT *afile; long start, stop, entrylength; struct LOC_themain *LINK; { /* write the feature coordinates start and stop to a file. If they are at or beyond the entry size, put < or > */ if (start < 1) putc('<', afile->f); fprintf(afile->f, "%ld", start); fprintf(afile->f, ".."); if (stop > entrylength) putc('>', afile->f); fprintf(afile->f, "%ld", stop); } Local Void countem(b, LINK) Char b; struct LOC_themain *LINK; { /* count the base b */ switch (decapitalize(b)) { case 'a': LINK->na++; break; case 'c': LINK->nc++; break; case 'g': LINK->ng++; break; case 't': LINK->nt++; break; case 'n': LINK->nn++; break; } } /* procedure skipcomments; (* skip comment lines *) begin if not eof(sequ) then if (sequ^ = '>') or (sequ^ = '*') then readln(sequ); end; */ Local Void chooseandwrite(LINK) struct LOC_themain *LINK; { /* choose what kind of feature to write out */ if (LINK->stop - LINK->start + 1 > LINK->exoncutoff) { indent(LINK->entries, 5L, LINK); fprintf(LINK->entries->f, "exon "); featurecoordinate(LINK->entries, LINK->start, LINK->stop, LINK->entrylength, LINK); putc('\n', LINK->entries->f); return; } indent(LINK->entries, 5L, LINK); fprintf(LINK->entries->f, "primer "); featurecoordinate(LINK->entries, LINK->start, LINK->stop, LINK->entrylength, LINK); putc('\n', LINK->entries->f); /* rejected features */ } Local Void processsequence(LINK) struct LOC_themain *LINK; { /* process one sequence ending in a period or end of file */ long position = 0; /* position in the sequence, starting with 1 */ Char TEMP; long FORLIM; /* loop variable - must be local */ /* read the next sequence */ LINK->die = false; LINK->c = ' '; LINK->na = 0; LINK->nc = 0; LINK->ng = 0; LINK->nt = 0; LINK->nn = 0; /* skipcomments; (* skip comments just after header *) */ while (!BUFEOF(LINK->sequ->f) && LINK->c != '.') { if (P_eoln(LINK->sequ->f)) { /* end of line */ fscanf(LINK->sequ->f, "%*[^\n]"); getc(LINK->sequ->f); /* allow sequence not to end with a period */ if (!BUFEOF(LINK->sequ->f)) { if ((P_peek(LINK->sequ->f) == '*') | (P_peek(LINK->sequ->f) == '>')) LINK->c = '.'; } if (debugging) putchar('\n'); /* skipcomments; */ continue; } LINK->c = getc(LINK->sequ->f); if (LINK->c == '\n') LINK->c = ' '; if (debugging) putchar(LINK->c); TEMP = decapitalize(LINK->c); if (TEMP == 'n' || TEMP == 't' || TEMP == 'g' || TEMP == 'c' || TEMP == 'a') { position++; if (position > maxsequencelength) { printf("Sequence exceeds the maximum sequence length: %ld\n", maxsequencelength); printf("Increase constant maxsequencelength\n"); halt(); } LINK->seq.id[position-1] = LINK->c; countem(LINK->c, LINK); continue; } if (LINK->c == '.') { fscanf(LINK->sequ->f, "%*[^\n]"); getc(LINK->sequ->f); } else if (LINK->c != ' ' && LINK->c != '9' && LINK->c != '8' && LINK->c != '7' && LINK->c != '6' && LINK->c != '5' && LINK->c != '4' && LINK->c != '3' && LINK->c != '2' && LINK->c != '1' && LINK->c != '0') { printf("unidentified character: %c\n", LINK->c); LINK->die = true; } } /* not end of line */ LINK->entrylength = position; if (LINK->die) halt(); /* start header of GenBank entry */ fprintf(LINK->entries->f, "\nLOCUS "); writename(LINK->entries, LINK->name, LINK); fprintf(LINK->entries->f, " %ld bp DNA", LINK->entrylength); /* 2008 Jun 25 */ if (LINK->topology == 'l') fprintf(LINK->entries->f, " linear"); else fprintf(LINK->entries->f, " circular"); fprintf(LINK->entries->f, " * mkdb %4.2f\n", version); fprintf(LINK->entries->f, "DEFINITION "); writename(LINK->entries, LINK->organism, LINK); fprintf(LINK->entries->f, "\nACCESSION "); writename(LINK->entries, LINK->name, LINK); fprintf(LINK->entries->f, "\nVERSION "); writename(LINK->entries, LINK->name, LINK); fprintf(LINK->entries->f, ".1\n"); fprintf(LINK->entries->f, "SOURCE "); writename(LINK->entries, LINK->organism, LINK); fprintf(LINK->entries->f, "\n ORGANISM "); writename(LINK->entries, LINK->organism, LINK); fprintf(LINK->entries->f, "\nFEATURES\n"); LINK->waslower = true; LINK->start = 1; LINK->stop = 1; FORLIM = LINK->entrylength; for (position = 1; position <= FORLIM; position++) { LINK->c = LINK->seq.id[position-1]; if (LINK->c == 'n' || LINK->c == 't' || LINK->c == 'g' || LINK->c == 'c' || LINK->c == 'a') { if (position != 1 && !LINK->waslower) { /* upper case ending */ LINK->stop = position - 1; chooseandwrite(LINK); } LINK->waslower = true; } else if (LINK->c == 'N' || LINK->c == 'T' || LINK->c == 'G' || LINK->c == 'C' || LINK->c == 'A') { if (LINK->waslower) /* begin upper case */ LINK->start = position; LINK->waslower = false; } else { printf("program error:\n"); printf("illegal character \"%c\" found in sequence array\n", LINK->c); halt(); } } if (!LINK->waslower) { /* ie was upper */ /* After the end of sequence is found the sequence can end on capital letters, catch that case. */ LINK->stop = LINK->entrylength; chooseandwrite(LINK); } fprintf(LINK->entries->f, "BASE COUNT %7ld a %7ld c %7ld g %7ld t %7ld n\n", LINK->na, LINK->nc, LINK->ng, LINK->nt, LINK->nn); if (LINK->na + LINK->nc + LINK->ng + LINK->nt + LINK->nn != LINK->entrylength) { printf("program error: BASE COUNT <> sum of bases\n"); halt(); } fprintf(LINK->entries->f, "ORIGIN"); /* note: ln is done as first step below */ FORLIM = LINK->entrylength; /* write out sequence */ for (position = 1; position <= FORLIM; position++) { if (position % 10 == 1) putc(' ', LINK->entries->f); /* p2c: mkdb.p, line 526: * Note: Using % for possibly-negative arguments [317] */ if ((position - 1) % 60 == 0) fprintf(LINK->entries->f, "\n%10ld ", position); /* p2c: mkdb.p, line 529: * Note: Using % for possibly-negative arguments [317] */ fputc(decapitalize(LINK->seq.id[position-1]), LINK->entries->f); } fprintf(LINK->entries->f, "\n//\n"); } /* end module decapitalize prgmod */ /* begin module mkdb.themain */ Static Void themain(sequ_, mkdbp, entries_) _TEXT *sequ_, *mkdbp, *entries_; { /* the main procedure of the program */ struct LOC_themain V; double parameterversion; /* parameter version number */ V.sequ = sequ_; V.entries = entries_; printf("mkdb %4.2f\n", version); if (*mkdbp->name != '\0') { if (mkdbp->f != NULL) mkdbp->f = freopen(mkdbp->name, "r", mkdbp->f); else mkdbp->f = fopen(mkdbp->name, "r"); } else rewind(mkdbp->f); if (mkdbp->f == NULL) _EscIO2(FileNotFound, mkdbp->name); RESETBUF(mkdbp->f, Char); fscanf(mkdbp->f, "%lg%*[^\n]", ¶meterversion); getc(mkdbp->f); if ((long)floor(100 * parameterversion + 0.5) < (long)floor(100 * updateversion + 0.5)) { printf("You have an old parameter file!\n"); halt(); } /* read other parameters */ fscanf(mkdbp->f, "%ld%*[^\n]", &V.exoncutoff); getc(mkdbp->f); if (!BUFEOF(mkdbp->f)) { fscanf(mkdbp->f, "%c%*[^\n]", &V.multipart); getc(mkdbp->f); if (V.multipart == '\n') V.multipart = ' '; } else V.multipart = 'i'; V.topology = 'l'; /* assume linear topology for now */ if (*V.entries->name != '\0') { if (V.entries->f != NULL) V.entries->f = freopen(V.entries->name, "w", V.entries->f); else V.entries->f = fopen(V.entries->name, "w"); } else { if (V.entries->f != NULL) rewind(V.entries->f); else V.entries->f = tmpfile(); } if (V.entries->f == NULL) _EscIO2(FileNotFound, V.entries->name); SETUPBUF(V.entries->f, Char); if (*V.sequ->name != '\0') { if (V.sequ->f != NULL) V.sequ->f = freopen(V.sequ->name, "r", V.sequ->f); else V.sequ->f = fopen(V.sequ->name, "r"); } else rewind(V.sequ->f); if (V.sequ->f == NULL) _EscIO2(FileNotFound, V.sequ->name); RESETBUF(V.sequ->f, Char); while (!BUFEOF(V.sequ->f)) { /* if (sequ^ = '>') or (sequ^='*') then readname(sequ, name); */ readname(V.sequ, &V.name, &V); if ((P_peek(V.sequ->f) == '>') | (P_peek(V.sequ->f) == '*')) readname(V.sequ, &V.organism, &V); else V.organism = V.name; processsequence(&V); } } /* end module mkdb.themain */ main(argc, argv) int argc; Char *argv[]; { PASCAL_MAIN(argc, argv); if (setjmp(_JL1)) goto _L1; entries.f = NULL; strcpy(entries.name, "entries"); mkdbp.f = NULL; strcpy(mkdbp.name, "mkdbp"); sequ.f = NULL; strcpy(sequ.name, "sequ"); themain(&sequ, &mkdbp, &entries); _L1: if (sequ.f != NULL) fclose(sequ.f); if (mkdbp.f != NULL) fclose(mkdbp.f); if (entries.f != NULL) fclose(entries.f); exit(EXIT_SUCCESS); } /* End. */