program indana(ind, ana, subind, indanap, output); (* indana: analysis of an index Dr. Thomas D. Schneider National Cancer Institute Laboratory of Experimental and Computational Biology Frederick, Maryland 21702-1201 toms@ncifcrf.gov permanent email: toms@alum.mit.edu (use only if first address fails) http://www.lecb.ncifcrf.gov/~toms/ module libraries required: delman, delmods, prgmods *) label 1; (* the end of the program *) const (* begin module version *) version = 5.30; (* of indana.p 2003 Aug 11 2003 Aug 11, 5.28: "ind : Bad data found on integer read" 1994 Nov 13, 5.27: previous version 1981 Feb 15, 1.00: origin this version handles index version 8 or higher *) (* end module version *) (* begin module describe.indana *) (* name indana: analysis of an index synopsis indana(ind: in, ana: out, subind: out, indanap: in, output: out) files ind: an index produced by the index program. it must not be a teaching index. ana: a histogram of the similarities of the index along with the mean, standard deviation and frequency distribution of the the similarities. subind: portions of the index selected by the parameters in indanap. pairs (or adjacent sets) of lines of the index are printed. the similarities of the original index are maintained. this means that the first similarity of a pair (or set) is not a reflection of the similarity to the line above it. the ones that are 'true' are marked with an asterisk [*]. indanap: parameters to control indana, containing 3 lines: 1. the lowest similarity to put into subind 2. the highest similarity to put into subind 3. if the first character is 'b' then the groups of similar sequences will be broken apart by blank lines. description An index is usually quite large, so it is difficult to look at by hand. Indana allows one to select a portion of the index by various criteria. The portion is called a "sub-index". If the original book contained a number of highly similar oligo- nucleotides, then the histogram of similarities will show a spike of high similarities. see also index.p author Thomas Schneider bugs none known *) (* end module describe.indana *) histmax= 60000; (* the maximum window that indana can handle *) graphmax = 100; (* the widest graph allowed *) percentmax = 50; (* the maximum page size in percents of the graph *) var ind, (* the input index *) ana, (* the output analysis *) subind, (* the output subindex *) indanap: (* the parameter file *) text; (* values read from the index (see rhindex for definitions) *) window, pbefore, pwindow, pafter, number, sequences, linears, skip: integer; (* histogram for results *) hist: array[0..histmax] of integer; similarity: integer; (* a similarity *) warning: boolean; (* one or more similarity was greater than histmax and is recorded at histmax on the graph *) highest: integer; (* the highest similarity in hist *) (* statistics *) mean, sd: real; (* mean and standard deviation for the distribution of similarity values in the index *) sublower, subhigher: integer; (* the range of similarity to put into subindex *) sublist: file of integer; (* list of index lines where similarity is in the range of sublower to subhigher *) breaklines: boolean; (* if true, break between similarity groups *) (* begin module package.primitive *) (* ************************************************************************ *) (* begin module halt *) procedure halt; (* stop the program. the procedure performs a goto to the end of the program. you must have a label: label 1; declared, and also the end of the program must have this label: 1: end. examples are in the module libraries. this is the only goto in the delila system. *) begin writeln(output,' program halt.'); goto 1 end; (* end module halt version = 'delmod 6.51 85 apr 17 tds/gds' *) (* begin module copyaline *) procedure copyaline(var fin, fout: text); (* copy a line from file fin to file fout *) begin (* copyaline *) while not eoln(fin) do begin fout^ := fin^; put(fout); get(fin) end; readln(fin); writeln(fout); end; (* copyaline *) (* end module copyaline version = 'delmod 6.51 85 apr 17 tds/gds' *) (* begin module copylines *) function copylines(var fin, fout: text; n: integer): integer; (* copy n lines of file fin to file fout. the actual number of lines copied is returned. *) var index: integer; (* the current line number *) begin (* copylines *) index := 0; while (not eof(fin)) and (index < n) do begin copyaline(fin, fout); index := succ(index) end; copylines := index end; (* copylines *) (* end module copylines version = 'delmod 6.51 85 apr 17 tds/gds' *) (* ************************************************************************ *) (* end module package.primitive version = 'delmod 6.51 85 apr 17 tds/gds' *) (* begin module package.numbar *) (* ************************************************************************ *) (* begin module numberdigit *) function numberdigit(number, logplace:integer): char; (* return the digit at the place value ('logplace') position of number. example: numberdigit(13625, 3) = 3 numberdigit(13625, 4) = 1 *) var place: integer; (* the exponent of logplace *) count: integer; (* used to make place *) absolute: integer; (* the absolute value of number *) acharacter: char; (* the character to be returned *) procedure digit; (* extract a digit at the place position *) var tenplace: integer; (* ten times place *) z: integer; (* an intermediate value *) d: integer; (* the digit extracted *) begin (* digit *) tenplace:=10*place; z:=absolute-((absolute div tenplace)*tenplace); if place = 1 then d:=z else d:= z div place; case d of 0: acharacter:='0'; 1: acharacter:='1'; 2: acharacter:='2'; 3: acharacter:='3'; 4: acharacter:='4'; 5: acharacter:='5'; 6: acharacter:='6'; 7: acharacter:='7'; 8: acharacter:='8'; 9: acharacter:='9'; end end; (* digit *) procedure sign; (* put a negative sign out or a positive sign *) begin (* sign *) if number <0 then acharacter:='-' else acharacter:='+' end; (* sign *) begin (* numberdigit *) place:=1; for count:=1 to logplace do place:=10*place; if number=0 then begin if place=1 then acharacter:='0' else acharacter:=' ' end else begin absolute:=abs(number); if absolute < (place div 10) then acharacter:=' ' else if absolute >= place then digit else sign end; numberdigit:=acharacter end; (* numberdigit *) (* end module numberdigit version = 4.11; (@ of prgmod.p 1991 Apr 22 *) (* begin module numbersize *) function numbersize(n: integer):integer; (* calculate amount of space to be reserved for the integer n *) const ln10 = 2.30259; (* natural log of 10 - for conversion to log base 10 *) epsilon = 0.00001; (* a small number to correct log base 10 errors *) begin (* numbersize *) if n = 0 then numbersize:=1 else numbersize:=trunc(ln(abs(n))/ln10 + epsilon) + 2; (* the epsilon assures that we do not lose a place due to roundoff. eg, sometimes log base 10 of 10 would be 0.9999 instead of 1, and we would not do it right... note: this will fail for very large numbers on the order of 1/epsilon. *) (* the 2 is for the sign and last digit *) end; (* numbersize *) (* end module numbersize version = 4.11; (@ of prgmod.p 1991 Apr 22 *) (* begin module numberbar *) procedure numberbar(var afile: text; spaces, firstnumber, lastnumber: integer; var linesused: integer); (* write a bar of numbers to a file, with several spaces before. the number of lines used is returned *) var logplace: integer; (* the log of the digit being looked at *) spacecount: integer; (* count of spaces *) number: integer; (* the current number being written *) begin if abs(firstnumber) > abs(lastnumber) then linesused:= numbersize(firstnumber) else linesused:= numbersize(lastnumber); for logplace:=linesused-1 downto 0 do begin for spacecount:=1 to spaces do write(afile,' '); for number:=firstnumber to lastnumber do write(afile,numberdigit(number,logplace)); writeln(afile) end end; (* end module numberbar version = 4.11; (@ of prgmod.p 1991 Apr 22 *) (* ************************************************************************ *) (* end module package.numbar version = 4.11; (@ of prgmod.p 1991 Apr 22 *) procedure rhindex(var ind, (* the index being read in *) print: text; (* where to print the header *) var window, (* the length of the alphabetized window *) pbefore, (* number of bases before the printed window *) pwindow, (* the printed window size *) pafter, (* number of bases after the printed window *) number, (* number of bases in the book indexed *) sequences, (* the number of sequences in the book *) linears, (* the number of linear sequences *) skip: (* the number of characters to skip to the numbers of a line in index *) integer); (* read the header of the index and copy it to the file print *) (* note that the number of sequences is not returned. *) const headerlength = 11; (* lines in the header, including the blank one after the constants *) constart = 3; (* number of lines to skip to get to the constants of the index *) var c: char; (* a character for skipping in ind *) i: integer; (* index to header *) begin (* copy the header to print *) reset(ind); if eof(ind) then begin writeln(output,' index file is empty'); halt end; for i:=1 to headerlength do begin if not eoln(ind) then write(print,ind^); while not eoln(ind) do begin get(ind); write(print,ind^) end; readln(ind); writeln(print) end; (* grab the constants *) reset(ind); for i:=1 to constart do readln(ind); readln(ind,c,window); readln(ind,c,pbefore); readln(ind,c,pwindow); readln(ind,c,pafter); readln(ind,c,number); readln(ind,c,sequences); readln(ind,c,linears); readln(ind); (* skip the last parameter *) (* die if the index was produced in teaching mode *) if not eof(ind) then begin get(ind); (* skip the space *) get(ind); (* skip the space *) if not eof(ind) then if ind^='t' then begin writeln(output,' indana can not be used on a teaching index.'); halt end end; (* skip index to the first data *) readln(ind); (* define the skip value: *) skip := pbefore + pwindow + pafter + 4 (* blanks *); end; (* begin module skipblanks *) procedure skipblanks(var thefile: text); (* skip over blanks until a non-blank, or end of line, is found *) begin while (thefile^ = ' ') and not eoln(thefile) do get(thefile); end; procedure skipnonblanks(var thefile: text); (* skip over nonblanks until a blank, or end of line, is found *) begin while (thefile^ <> ' ') and not eoln(thefile) do get(thefile); end; (* end module skipblanks version = 4.11; (@ of prgmod.p 1991 Apr 22 *) procedure readsim(var ind: text; (* the index to read *) { skip: integer; (* how far to skip the bases *) } var similarity: integer); (* read the similarity from the index line. skip is precalculated in rhindex. *) begin skipblanks(ind); skipnonblanks(ind); (* skip the sequence before stuff *) skipblanks(ind); skipnonblanks(ind); (* skip the sequence window stuff *) skipblanks(ind); skipnonblanks(ind); (* skip the sequence after stuff *) skipblanks(ind); skipnonblanks(ind); (* skip the seq # *) skipblanks(ind); skipnonblanks(ind); (* skip the pos *) skipblanks(ind); skipnonblanks(ind); (* skip the dir *) readln(ind,similarity); end; procedure initialize; (* set up the variables *) var clearmax: integer; (* how much of hist to clear *) similarity: integer; (* a similarity *) procedure nosubindex; (* set variables in the case of no subindexing to be done *) begin subhigher:=-1; (* never write subindex *) sublower:=window+1; breaklines := true; writeln(ana,' no subindex made.') end; begin writeln(output,' indana ',version:4:2); rewrite(ana); writeln(ana,' indana: similarity analysis ',version:4:2,' using:'); rhindex(ind, ana, window, pbefore, pwindow, pafter, number, sequences, linears ,skip); if window > histmax then clearmax:=histmax else clearmax:=window; (* clear hist *) for similarity:=0 to clearmax do hist[similarity]:=0; (* read parameters *) reset(indanap); if eof(indanap) then nosubindex else begin readln(indanap,sublower); readln(indanap,subhigher); if (0 <= sublower) and (sublower <= subhigher) then begin write(ana,' subindex made from similarities in the range '); writeln(ana,'[',sublower:1,' to ',subhigher:1,']') end else nosubindex; if eof(indanap) then breaklines := true else if indanap^='b' then breaklines := true else breaklines := false end; rewrite(sublist); rewrite(subind); end; procedure makehist; (* make a histogram of the index similarities *) var lines: integer; (* keeps track of the lines in index *) (* for calculation of stastics of the distribution *) sumx,sumxsqr: real; begin warning:=false; (* set up for calculations *) sumx:=0; sumxsqr:=0; for lines:=1 to number do begin (* read and histogram a similarity *) readsim(ind,{skip,}similarity); if (sublower <= similarity) and (similarity <= subhigher) then begin sublist^:=lines; put(sublist) end; if similarity <= histmax then hist[similarity]:=succ(hist[similarity]) else begin warning:=true; hist[histmax]:=succ(hist[histmax]) end; (* calculate stastics, note that they are independent of histmax *) sumx:=sumx+similarity; sumxsqr:=sumxsqr + sqr(similarity) end; (* finish the statistics *) mean:=sumx/number; sd:=sqrt((sumxsqr/number) - sqr(mean)); end; procedure printhist; (* print the histogram to list *) var similarity: integer; (* a similarity *) begin writeln(ana); writeln(ana,' mean similarity: ',mean:4:2); writeln(ana,' standard deviation of similarity: ',sd:4:2); writeln(ana); writeln(ana,' similarity number frequency'); if warning then highest := histmax else highest := window; while (hist[highest] = 0) and (highest <> 0) do highest:=pred(highest); for similarity := 0 to highest do writeln(ana,' ',similarity:6, ' ',hist[similarity]:8, ' ',(hist[similarity]/number):10:2) end; procedure printgraph; (* print a graph of the similarity frequencies *) var hifreq: integer; (* the highest frequency to plot *) s,f: integer; (* indicies for highest and hifreq *) percent: array[0..histmax] of integer; (* hist converted to percents *) p: integer; (* index to percent *) begin (* calculate the minimum of graphmax and highest *) if graphmax < highest then highest:=graphmax; (* calculate the minimum of percentmax and hifreq *) hifreq:=0; for s:=0 to highest do begin p:=round(100*hist[s]/number); if p > hifreq then hifreq := p; percent[s]:=p end; if percentmax < hifreq then hifreq:=percentmax; (* begin the graph *) page(ana); writeln(ana,' frequency distribution of similarities'); writeln(ana); writeln(ana,' frequency'); (* write out the graph *) for f:=hifreq downto 1 do begin write(ana,' ',(f/100):4:2,' i'); for s:=0 to highest do if percent[s] >= f then write(ana,'*') else write(ana,' '); writeln(ana) end; (* put a bar under all that *) write(ana,' ',0.0:4:2,' +'); for s:=0 to highest do write(ana,'-'); writeln(ana); (* end the graph *) numberbar(ana,7,0,highest,s); (* s is a dummy *) writeln(ana,' ':(highest+10),'similarity'); if warning then begin writeln(output,'one or more similarity was greater than ',histmax:1); writeln(ana,' one or more similarity was greater than ',histmax:1, ' and is recorded at ',histmax:1,'.'); writeln(ana,' the statistical calculations are', ' on the original data.') end end; procedure printsubindex; (* print a subindex containing the subset of index whose similarities are in the range of sublower to subhigher *) var line: integer; (* of entries in index *) procedure copyline; (* copy to the end of the line, index to subindex, no writeln *) begin repeat subind^:=ind^; put(subind); get(ind) until eoln(ind); readln(ind); line:=succ(line) end; begin if subhigher >= 0 then begin reset(sublist); write(subind,'* subindex of indana ',version:4:2, ' (similarity range = [', sublower:1,' to ',subhigher:1,']) from '); (* copy index header to subindex *) reset(ind); while ind^='*' do begin if not eoln(ind) then write(subind,ind^); while not eoln(ind) do begin get(ind); write(subind,ind^) end; readln(ind); writeln(subind) end; line:=1; (* at first line *) while not eof(sublist) do begin while line < (sublist^-1) do begin readln(ind); line:=succ(line) end; (* now line is just before or on the high similarity line *) if line = (sublist^ - 1) then begin (* handle previous line *) if breaklines then writeln(subind); copyline; writeln(subind) end; (* now handle the line *) copyline; writeln(subind,' *'); get(sublist) end end end; begin (* indana *) initialize; makehist; printhist; printgraph; printsubindex; 1: end. (* indana *)