program siva(sorted, sivap, incu, curves, list, output); (* siva: site information variance Tom Schneider National Cancer Institute Laboratory of Mathematical Biology Frederick, Maryland 21702-1201 toms@ncifcrf.gov *) label 1; (* end of program *) const (* begin module version *) version = 1.96; (* of siva.p 1999 Dec 13 1999 Dec 13, 1.96: y2k upgrade 1993 January 26, 1.95: previous changes origin 1987 mar 5 *) (* end module version *) (* begin module describe.siva *) (* name siva: site information variance synopsis siva(sorted: in, sivap: in, incu: out, curves: out, list: out, output: out) files sorted: the output of the sites program that contains a sorted list of sites for each experiment performed. sivap: parameters to control the program. first line: two integers, from and to coordinates over which to do the calculations. second line: repeats, the number of times to take passes through the data removing subsets. This improves the statistics. incu: the xyin input to xyplo, output of this program. Two columns: first column is the number of sites used to find the information second column is the amount of information in bits The curves loop around along the axis, so they remain connected. curves: another xyin file, for graphing the wiggling info curves first column is the position across the site second column is the information The curves loop around along the axis, so they remain connected. list: statistical picture of the result. Two columns: first column is the number of sites used to find the information second column is the average amount of information (corresponds to the second column of incu, but is the average) third column is the variance of the information (corresponds to what your eye picks out as the thickness of the incu curves) output: messages to the user description Siva calculates the variance of the information in a set of randomized sites by eliminating each site in turn and keeping track of the increase in the information content. The information content must increase, since with fewer samples there must be less variation (this is the small sample bias effect). The program allows one to graph the information content versus the number of sites removed (incu). When this is done repeatedly, with different orders of removing the sites, a thick band of curves is created. The thickest part of this band shows the greatest possible amount of variation that could be in the total set of sequences. To be even-handed, the program removes the first sequence, then randomly removes the others. This creates the first curve. Then the program removes the second sequence and randomly removes the others for the second curve. If there are n sequences, then n removal curves will be generated. This is one complete repeat of the process. If you want, you can do this a number of times to get better statistics, using the repeat parameter in sivap. The largest variation in the information content is surely greater than the variation of the information content in all the sets of removals of sites. For several experiments, the statistics are joined into one set. With several experiments, surely the variation of the combined experiments would be less than the variations found for the individuals. So if one experiment gives a greater variation, that will increase the variation siva reports in list, so the highest value in list is an upper limit on the variation. documentation @article{Schneider1989, author = "T. D. Schneider and G. D. Stormo", title = "Excess Information at Bacteriophage {T7} Genomic Promoters Detected by a Random Cloning Technique", year = "1989", journal = "Nucl. Acids Res.", volume = "17", pages = "659-674"} see also sites.p author Thomas Dana Schneider bugs none known *) (* end module describe.siva *) (* begin module siva.const *) maxseq = 100; (* largest sequence allowed *) maxentries = 100; (* largest number of data entries *) (* constant for datetime work *) namelength = 20; (* maximum key name length *) (* end module siva.const *) (* begin module datetime.const *) datetimearraylength = 19; (* length of dataarray for dates, It is just long enough to include the 4 digit year - solving the year 2000 problem: 1980/06/09 18:49:11 123456789 123456789 1 2 *) (* end module datetime.const version = 7.40; {of delmod.p 2000 Feb 18} *) type (* begin module datetime.type *) (* array for dates *) datetimearray = packed array[1..datetimearraylength] of char; (* end module datetime.type version = 7.40; {of delmod.p 2000 Feb 18} *) (* begin module siva.type *) base = (a,c,g,t); (* bases *) iblarray = array[a..t, 1..maxseq] of integer; (* integer(B,L) *) rblarray = array[a..t, 1..maxseq] of real; (* real(B,L) *) rlarray = array[1..maxseq] of real; (* real(L) *) sarray = array[0..maxentries,1..maxseq] of base; statarray = array[1..maxentries] of real; (* type for datetime work *) alpha = packed array[1..namelength] of char; (* this is not alfa *) (* end module siva.type *) var sorted, sivap, incu, curves, list: text; (* files used by this program *) (* begin module halt *) procedure halt; (* stop the program. the procedure performs a goto to the end of the program. you must have a label: label 1; declared, and also the end of the program must have this label: 1: end. examples are in the module libraries. this is the only goto in the delila system. *) begin writeln(output,' program halt.'); goto 1 end; (* end module halt version = 7.40; {of delmod.p 2000 Feb 18} *) (* begin module skipblanks *) procedure skipblanks(var thefile: text); (* skip over blanks until a non-blank, or end of line, is found *) begin while (thefile^ = ' ') and not eoln(thefile) do get(thefile); end; procedure skipnonblanks(var thefile: text); (* skip over nonblanks until a blank, or end of line, is found *) begin while (thefile^ <> ' ') and not eoln(thefile) do get(thefile); end; procedure skipcolumn(var thefile: text); (* skip over a data column *) begin skipblanks(thefile); skipnonblanks(thefile) end; (* end module skipblanks version = 7.40; {of delmod.p 2000 Feb 18} *) (* begin module getdatetime *) procedure getdatetime(var adatetime: datetimearray); (* get the date and time into a single array from the system clock. adatetime contains the date: 1980/06/09 18:49:11 ye mo da ho mi se (year, month, day, hour, minute, second). As of 2000 February 18, the Sun Pascal compiler requires a formatting statement. This statement allows the date to be generated in this standard Delila format in a single call. Information about the formatting statement is available on the manual page for date in Unix. If a computer does not have this method, see the 'oldgetdatetime' routine in delmod.p (http://www.lecb.ncifcrf.gov/~toms/delila/delmod.html) for some conversion code. *) begin date(adatetime,'%Y/%m/%d %H:%M:%S'); end; (* end module getdatetime version = 7.40; {of delmod.p 2000 Feb 18} *) (* begin module timeseed *) procedure addtoseed(var seed, power: real; c: char); (* add the digit represented by c to the seed at the power position *) var n: integer; (* the character represented by c *) begin (* addtoseed *) power := power/10; { writeln(output,'addtoseed, c = ',c); writeln(output,'addtoseed, ord(c) = ',ord(c)); } case c of ' ': begin writeln(output,'timeseed: error in datetime'); halt; end; '0': n := 0; '1': n := 1; '2': n := 2; '3': n := 3; '4': n := 4; '5': n := 5; '6': n := 6; '7': n := 7; '8': n := 8; '9': n := 9 end; (*writeln(output,'timeseed number is [',n:1,']'); (@ debug *) seed := seed + power*n end; (* addtoseed *) procedure makeseed(adatetime: datetimearray; var seed: real); (* convert adatetime to a real number in seed, reversed order *) var power: real; (* a digit of the seed such as 0.01 *) begin seed := 0.0; power := 1.0; addtoseed(seed, power, adatetime[19]); addtoseed(seed, power, adatetime[18]); (* : *) addtoseed(seed, power, adatetime[16]); addtoseed(seed, power, adatetime[15]); (* : *) addtoseed(seed, power, adatetime[13]); addtoseed(seed, power, adatetime[12]); (* *) addtoseed(seed, power, adatetime[10]); addtoseed(seed, power, adatetime[9]); (* / *) addtoseed(seed, power, adatetime[7]); addtoseed(seed, power, adatetime[6]); (* / *) addtoseed(seed, power, adatetime[4]); addtoseed(seed, power, adatetime[3]); addtoseed(seed, power, adatetime[2]); addtoseed(seed, power, adatetime[1]); end; procedure orderseed(adatetime: datetimearray; var seed: real); (* convert adatetime to a real number in seed, normal order *) var power: real; (* a digit of the seed such as 0.01 *) begin seed := 0.0; power := 1.0; addtoseed(seed, power, adatetime[1]); addtoseed(seed, power, adatetime[2]); addtoseed(seed, power, adatetime[3]); addtoseed(seed, power, adatetime[4]); (* / *) addtoseed(seed, power, adatetime[6]); addtoseed(seed, power, adatetime[7]); (* / *) addtoseed(seed, power, adatetime[9]); addtoseed(seed, power, adatetime[10]); (* *) addtoseed(seed, power, adatetime[12]); addtoseed(seed, power, adatetime[13]); (* : *) addtoseed(seed, power, adatetime[15]); addtoseed(seed, power, adatetime[16]); (* : *) addtoseed(seed, power, adatetime[18]); addtoseed(seed, power, adatetime[19]); end; procedure timeseed(var seed: real); (* read the computer date and time. reverse the order of the digits and put a decimal point in front. this gives a fraction between zero and one that varies quite quickly, and is always unique (if the computer has sufficient accuracy). it is to be used as a seed to a random number generator. *) var adatetime: datetimearray; (* a date and time *) begin (* timeseed *) getdatetime(adatetime); { writeln(output,'timeseed: adatetime: ',adatetime); } makeseed(adatetime, seed); end; (* timeseed *) (* end module timeseed version = 7.40; {of delmod.p 2000 Feb 18} *) (* begin module random *) procedure random(var seed: real); (* random generator 2. version = 1.01 of random.2 1990 Oct 2 origin 1986 December 31 Test this routine with the program tstrnd. written by David Masternarde *) (* This random number generator is based on a shift register with a single bit of feedback, as described in Electronics for Neurobiologists, by Brown, Maxfield and Moraff, MIT press 1973, referencing Random Process Simulation and Measurement by Korn, McGraw-Hill 1966. The random seed rand, a number between 0 and 1 exclusive, is converted to an integer between 1 and 2**23-1, inclusive. This 23-bit number is shifted right one bit and the output of the last (23rd) bit and the 9th bit are added modulo 2 (exclusive orred) and fed back into the new first bit. This is done between 4 and 11 times, depending on the last 3 bits of the original number. The result is converted back to a real number between 0 and 1 from which the 23 bit integer can be recovered on the next call. The 23-bit shift register goes through all 2**23-1 values before repeating; the repetition frequency of this algorithm could be less or greater depending on the seed, because of the random number of multiple shifts per call. *) (* powers of 2 *) const pow14=16384; pow15=32768; pow22= 4194304; pow23=8388608; var iseed, (* integer shift register *) i, nrep: integer; (* index, number of times to do shift *) begin (* random *) iseed := round(seed*pow23); (* convert to 23 bit number *) if (iseed<1) or (iseed>=pow23) then iseed := 1; nrep := 4 + iseed mod 8; (* do it 4 to 11 times based on last 3 bits *) for i:= 1 to nrep do (* if last bit and 9th bit are equal, feed back a 0, otherwise a 1 *) if( odd(iseed) = ((iseed mod pow15) >= pow14)) then iseed := iseed div 2 else iseed := (iseed div 2) + pow22; seed := iseed/pow23; end; (* random *) (* end module random *) (* begin module copyaline *) procedure copyaline(var fin, fout: text); (* copy a line from file fin to file fout *) begin (* copyaline *) while not eoln(fin) do begin fout^ := fin^; put(fout); get(fin) end; readln(fin); writeln(fout); end; (* copyaline *) (* end module copyaline version = 7.40; {of delmod.p 2000 Feb 18} *) (* begin module package.numbar *) (* ************************************************************************ *) (* begin module numberdigit *) function numberdigit(number, logplace:integer): char; (* return the digit at the place value ('logplace') position of number. example: numberdigit(13625, 3) = 3 numberdigit(13625, 4) = 1 *) var place: integer; (* the exponent of logplace *) count: integer; (* used to make place *) absolute: integer; (* the absolute value of number *) acharacter: char; (* the character to be returned *) procedure digit; (* extract a digit at the place position *) var tenplace: integer; (* ten times place *) z: integer; (* an intermediate value *) d: integer; (* the digit extracted *) begin (* digit *) tenplace:=10*place; z:=absolute-((absolute div tenplace)*tenplace); if place = 1 then d:=z else d:= z div place; case d of 0: acharacter:='0'; 1: acharacter:='1'; 2: acharacter:='2'; 3: acharacter:='3'; 4: acharacter:='4'; 5: acharacter:='5'; 6: acharacter:='6'; 7: acharacter:='7'; 8: acharacter:='8'; 9: acharacter:='9'; end end; (* digit *) procedure sign; (* put a negative sign out or a positive sign *) begin (* sign *) if number <0 then acharacter:='-' else acharacter:='+' end; (* sign *) begin (* numberdigit *) place:=1; for count:=1 to logplace do place:=10*place; if number=0 then begin if place=1 then acharacter:='0' else acharacter:=' ' end else begin absolute:=abs(number); if absolute < (place div 10) then acharacter:=' ' else if absolute >= place then digit else sign end; numberdigit:=acharacter end; (* numberdigit *) (* end module numberdigit version = 4.11; (@ of prgmod.p 1991 Apr 22 *) (* begin module numbersize *) function numbersize(n: integer):integer; (* calculate amount of space to be reserved for the integer n *) const ln10 = 2.30259; (* natural log of 10 - for conversion to log base 10 *) epsilon = 0.00001; (* a small number to correct log base 10 errors *) begin (* numbersize *) if n = 0 then numbersize:=1 else numbersize:=trunc(ln(abs(n))/ln10 + epsilon) + 2; (* the epsilon assures that we do not lose a place due to roundoff. eg, sometimes log base 10 of 10 would be 0.9999 instead of 1, and we would not do it right... note: this will fail for very large numbers on the order of 1/epsilon. *) (* the 2 is for the sign and last digit *) end; (* numbersize *) (* end module numbersize version = 4.11; (@ of prgmod.p 1991 Apr 22 *) (* begin module numberbar *) procedure numberbar(var afile: text; spaces, firstnumber, lastnumber: integer; var linesused: integer); (* write a bar of numbers to a file, with several spaces before. the number of lines used is returned *) var logplace: integer; (* the log of the digit being looked at *) spacecount: integer; (* count of spaces *) number: integer; (* the current number being written *) begin if abs(firstnumber) > abs(lastnumber) then linesused:= numbersize(firstnumber) else linesused:= numbersize(lastnumber); for logplace:=linesused-1 downto 0 do begin for spacecount:=1 to spaces do write(afile,' '); for number:=firstnumber to lastnumber do write(afile,numberdigit(number,logplace)); writeln(afile) end end; (* end module numberbar version = 4.11; (@ of prgmod.p 1991 Apr 22 *) (* ************************************************************************ *) (* end module package.numbar version = 4.11; (@ of prgmod.p 1991 Apr 22 *) (* begin module chartorange *) function chartorange(ch: char): base; begin if ch in ['a','c','g','t'] then case ch of 'a': chartorange := a; 'c': chartorange := c; 'g': chartorange := g; 't': chartorange := t; end else begin writeln(output, 'chartorange: ',ch,' is not a base'); halt end end; (* end module chartorange *) (* begin module basetochar *) function basetochar(b: base): char; begin case b of a: basetochar := 'a'; c: basetochar := 'c'; g: basetochar := 'g'; t: basetochar := 't'; end end; (* end module basetochar *) (* begin module siva.getparams *) procedure getparams(var p: text; var fromcalc, tocalc: integer; var repeats: integer); (* get the parameters from p: from and to coordinates over which to calculate information, times to repeat this experiment *) begin reset(p); readln(p,fromcalc, tocalc); readln(p,repeats); end; (* end module siva.getparams *) (* begin module siva.header *) procedure header(var afile, sorted: text; fromcalc,tocalc: integer; repeats: integer); (* create the header of afile by copying from sorted, and give the region analyzed and the number of repeats to pass through eliminating sets *) var i: integer; (* index for lines *) begin rewrite(afile); reset(sorted); writeln(afile,'* siva ',version:4:2); for i := 1 to 3 do begin write(afile,'* '); copyaline(sorted, afile); end; writeln(afile,'* Coordinates over which the calculations', ' were done: ', fromcalc:1,' to ',tocalc:1); writeln(afile,'* repeats of elimination: ', repeats:1); end; (* end module siva.header *) (* begin module siva.getseqs *) procedure getseqs(var f: text; fromrange,torange: integer; fromcalc,tocalc: integer; var s: sarray; var number: integer); (* get the sites s from f, return number found in n skip the first few characters, the length of each sequence is an input *) const showem = 3; (* number of sites to show while the program is running *) var b: char; (* a character that may be a base *) done: boolean; (* done reading sequences *) i: integer; (* index for lines *) l: integer; (* index to position in sequence *) length: integer; (* size of the region of sequence read in *) skip: integer; (* number of bases to skip during read-in of sequences *) begin if fromcalc < fromrange then begin writeln(output,'requested region to calculate from (',fromcalc:1, ') is before the fromrange (',fromrange:1,')'); halt end; if tocalc > torange then begin writeln(output,'requested region to calculate to (',tocalc:1, ') is after the torange (',torange:1,')'); halt end; if fromcalc > tocalc then begin writeln(output,'requested region to calculate from (',fromcalc:1, ' is greater than the end of the calculation range', ' (',tocalc:1,')'); halt end; length := tocalc - fromcalc + 1; skip := fromcalc - fromrange; (* skip the blank and numbar *) for l := 1 to 4 do readln(f); (* put out the new one *) writeln(output,'Calculating region: ',fromcalc:1,' to ',tocalc:1); numberbar(output,0,fromcalc,tocalc,l); for l := 1 to skip do read(f,b); (* skip the left edge *) for l := 1 to length do begin (* read the base sequence *) read(f,b); write(output,b); s[0,l] := chartorange(b); end; readln(f); writeln(output); (* read the other sites *) number := 0; done := false; while not done do begin if eof(f) then done := true; if not done then if eoln(f) then done := true; if not done then begin number := number + 1; if number > maxentries then begin writeln(output,'getseqs: too many sequences, increase maxseqs'); halt end; for l := 1 to skip do read(f,b); (* skip the left edge *) for l := 1 to length do begin read(f,b); if b in ['a','c','g','t'] then s[number,l] := chartorange(b) else s[number,l] := s[0,l] end; readln(f); end; end; for i := 1 to showem do begin for l := 1 to length do if s[i,l]<>s[0,l] then write(output,basetochar(s[i,l])) else write(output,'.'); writeln(output); end; writeln(output,' . . .'); for i := number-showem to number do begin for l := 1 to length do if s[i,l]<>s[0,l] then write(output,basetochar(s[i,l])) else write(output,'.'); writeln(output); end; writeln(output,'number of sites read in: ',number:1); end; (* end module siva.getseqs *) (* begin module siva.mkfi *) procedure mkfi(fracan: real; seqs: sarray; length: integer; var fi: rblarray); (* make the fi table based on the 0 sequence in sarray. fracan is the frequency of the canonical base *) var b: base; (* one of the bases *) l: integer; (* index to the sequence position *) nonfracan: real; (* the frequency of other each bases *) begin nonfracan := (1-fracan)/3.0; for l := 1 to length do begin for b := a to t do begin if b = seqs[0,l] then fi[b,l] := fracan else fi[b,l] := nonfracan end end (* ; for l:=1 to 10 do begin for b:= a to t do write(output,' ',fi[b,l]:5:2); writeln(output) end *) end; (* end module siva.mkfi *) (* begin module siva.mknbl *) procedure mknbl(number: integer; seqs: sarray; length: integer; var nbl: iblarray); (* make the nbl table based on the sites in sarray. number is the number of sites *) var b: base; (* one of the bases *) l: integer; (* index to the sequence position *) n: integer; (* index to the sites *) begin for l := 1 to length do begin for b := a to t do nbl[b,l] := 0; (* clear *) for n := 1 to number do nbl[seqs[n,l],l] := nbl[seqs[n,l],l] + 1 end (* ; for l:=1 to 10 do begin for b:= a to t do write(output,' ',nbl[b,l]:5); writeln(output) end *) end; (* end module siva.mknbl *) (* begin module siva.mkfo *) procedure mkfo(nbl:iblarray; length: integer; var fo: rblarray); (* make the output frequency array from the input nbl array *) var b: base; (* one of the bases *) l: integer; (* index to the sequence position *) number: integer; (* sum of nbl at l *) (*boo: boolean;debug *) begin for l := 1 to length do begin number := 0; (* boo := false; for b := a to t do if nbl[b,l]<0 then boo := true; if boo then for b := a to t do begin writeln(output,'mkfo: nbl[',basetochar(b),',',l:2,']=',nbl[b,l]:5); end; if boo then begin writeln(output,0/(1-1)); halt end;(@ kill program *) for b := a to t do number := number + nbl[b,l]; for b := a to t do fo[b,l] := nbl[b,l]/number; end end; (* end module siva.mkfo *) (* begin module siva.mkinfo *) procedure mkinfo(fi,fo: rblarray; length: integer; var ip: rlarray; var info: real); (* make the normalized information for the transformation from the input fi to the output fo for length along the site. *) var b: base; (* one of the bases *) l: integer; (* index to the sequence position *) ln2: real; (* natural log 2 for conversion to bits *) nu: real; (* intermediate value *) sum: real; (* sum fo fi/fo *) begin ln2 := ln(2); info := 0.0; for l := 1 to length do begin ip[l] := 0.0; sum := 0.0; for b := a to t do sum := sum + fo[b,l]/fi[b,l]; for b := a to t do begin nu := (fo[b,l]/fi[b,l]) / sum; if nu > 0.0 then ip[l] := nu * ln(nu) + ip[l] end; ip[l] := 2 + ip[l]/ln2; (* convert to bits and add final touch *) info := info + ip[l]; if ip[l] > 2.0 then begin writeln(output,'l=',l:5,'****************************'); writeln(output,'ip[l]=',ip[l]:5,'>2!'); for b := a to t do writeln(output,'fo[',basetochar(b),',l]=',fo[b,l]:10:8); writeln(output,'sum = ',sum:10:8, 'info=',info:10:8); end; end; end; (* end module siva.mkinfo *) (* begin module siva.mkcurve *) procedure mkcurve(var curves: text; fromcalc, length: integer; ip: rlarray); (* make the curve in ip to the curves file. the curves loop to zero, where the return draw will be invisible *) var l: integer; (* index to the sequence position *) begin writeln(curves,(fromcalc):5,' ',0:5); (* bound the curve with zero *) for l := fromcalc to fromcalc+length-1 do writeln(curves,l:5,' ',ip[l-fromcalc+1]:10:8); writeln(curves,(fromcalc+length):5,' ',0:5); (* bound the curve *) end; (* end module siva.mkcurve *) (* begin module siva.del *) procedure del(nbl:iblarray; seqs: sarray; length: integer; deleted: integer; var nbld:iblarray); (* delete the 'deleted'th sequence in seqs from nbl to create nbld. the length of each sequence is given *) var b: base; (* one of the bases *) l: integer; (* index to the sequence position *) begin for l := 1 to length do begin for b := a to t do nbld[b,l] := nbl[b,l]; b := seqs[deleted,l]; nbld[b,l] := nbld[b,l] - 1; if nbld[b,l]<0 then begin for b :=a to t do writeln(output,'procedure del error:', ' nbl[,',basetochar(b),',',l:2,']=',nbld[b,l]:5); end; end end; (* end module siva.del *) (* begin module siva.recordstat *) procedure recordstat(var sumcount, sumx, sumxsq: statarray; info: real; d: integer); (* record the information (bits) of info in the three statistics arrays: count of the number of points, sum of info (sumx) and sum of square of info, at position d *) begin sumcount[d] := sumcount[d] + 1; sumx[d] := sumx[d] + info; sumxsq[d] := sumxsq[d] + info*info; end; (* end module siva.recordstat *) (* begin module siva.themain *) procedure themain(var sorted, sivap, incu, curves, list: text); (* the main procedure of the program *) var already: array[1..maxentries] of boolean; (* which sequences have already been removed from the set *) candidate: integer; (* a candidate for deletion from the set *) experiment: integer; (* the experiment number being worked on *) expected: integer; (* expected experiment number *) fi: rblarray; (* frequency input to the experiment *) fo: rblarray; (* frequency output of the experiment *) fracan: real; (* frequency of canonical base *) fromcalc, tocalc: integer; (* region over which to calculate information *) fromrange, torange: integer; (* region of the sites *) fromrandom, torandom: integer; (* region of the sites randomized *) info: real; (* total information in ip *) ip: rlarray; (* information curve across the site *) length: integer; (* length of sites *) mean: real; (* mean of the information for some number of sites deleted *) mostnumbers: integer; (* the most number of sites processed *) number: integer; (* current number of sites in this experiment *) nbl: iblarray; (* n(b,l) array *) nbld: iblarray; (* nbl less one or more sequences *) r: integer; (* index to the number of repeats of the experiment *) repeats: integer; (* the number of repeats of the experiment *) seed: real; (* the current random number *) seqs: sarray; (* the sites *) subdeleted: integer; (* one of the sequences to be deleted *) sumcount: statarray; (* count of info values for statistics *) sumx: statarray; (* sum of info values for statistics *) sumxsq: statarray; (* sum of info values squared for statistics *) procedure initialize; (* start up all the variables *) var d: integer; (* counts number of deleted sites *) begin (* initialize *) writeln(output,'siva ',version:4:2); expected := 0; getparams(sivap,fromcalc,tocalc,repeats); writeln(output,'Calculations over the range ',fromcalc:1, ' to ',tocalc:1); writeln(output,repeats:1, ' repeats'); length := tocalc-fromcalc+1; header(incu,sorted,fromcalc,tocalc,repeats); writeln(incu,'* number of sites, information with that number'); header(curves,sorted,fromcalc,tocalc,repeats); writeln(curves,'* position in site, information at that position'); header(list,sorted,fromcalc,tocalc,repeats); timeseed(seed); (* rev up the ol' random generator *) (* clear the statistics arrays *) mostnumbers := 0; for d := 1 to maxentries do begin sumcount[d] := 0.0; sumx[d] := 0.0; sumxsq[d] := 0.0; end; end; (* initialize *) procedure getset(var sorted: text; var experiment: integer; var fromrange,torange,fromrandom,torandom: integer; var fracan: real); (* get the set of experimental parameters *) var done: boolean; (* if finished getting we are done *) procedure skipword; (* skip a word in the sorted file *) begin skipblanks(sorted); skipnonblanks(sorted); end; begin (* first locate the start of the experiment *) done := false; while not done do begin if eof(sorted) then begin writeln(output,'getset: Could not find Experment start: sorted file', ' has a bad format'); halt end; if sorted^ = 'E' then begin get(sorted); if sorted^ <> 'x' then begin writeln(output,'getset: Could not find eXperment start:', ' sorted file has a bad format'); halt end; skipnonblanks(sorted); readln(sorted, experiment); expected := succ(expected); if experiment <> expected then begin writeln(output,'Expected experiment number ',expected:1, ' but found number ',experiment:1); halt end; skipword; skipword; read(sorted, fromrange); skipword; readln(sorted, torange); skipword; skipword; read(sorted, fromrandom); skipword; readln(sorted, torandom); skipword; skipword; readln(sorted, fracan); done := true; writeln(output); writeln(output); writeln(output, 'Experiment = ',experiment:4); writeln(output, 'fromrange = ',fromrange:4); writeln(output, 'torange = ',torange:4); writeln(output, 'fromrandom = ',fromrandom:4); writeln(output, 'torandom = ',torandom:4); writeln(output, 'fracan = ',fracan:4:2); writeln(output) end else readln(sorted); end; end; procedure setupexperiment; (* set up the variables for one experiment *) begin getset(sorted,experiment, fromrange,torange,fromrandom,torandom,fracan); getseqs(sorted,fromrange,torange,fromcalc,tocalc,seqs,number); if mostnumbers < number then mostnumbers := number; mkfi(fracan, seqs, length, fi); mknbl(number, seqs, length, nbl); mkfo(nbl,length,fo); mkinfo(fi,fo,length,ip,info); (* output the fundamental data, where there are no deletions at all *) mkcurve(curves,fromcalc,length,ip); (* writeln(output,number:5,' ',info:10:8); *) writeln(incu,number:3,' ',info:10:8); recordstat(sumcount,sumx,sumxsq,info,number); end; procedure removingloop; (* remove one sequence at a time in this loop *) const reporting = false; (* if true we report each step to output *) var deleted: integer; (* sequence number deleted *) d: integer; (* counts number of deleted sites *) begin (* removingloop *) writeln(output,'First sequence deleted is number:'); for deleted := 1 to number do begin write(output,' ',deleted:1); if (deleted mod 1) = 0 then writeln(output); writeln(incu,number+100:3,' ',0.0:10:8); (* baseline curve *) del(nbl,seqs,length,deleted,nbld); mkfo(nbld,length,fo); mkinfo(fi,fo,length,ip,info); (* output the fundamental data *) mkcurve(curves,fromcalc,length,ip); if reporting then writeln(output,(number-1):5,' ',info:10:8); writeln(incu, (number-1):3,' ',info:10:8); recordstat(sumcount,sumx,sumxsq,info,number-1); (* randomly remove all further sequences in this loop *) for d := 1 to number do already[d] := false; already[deleted] := true; (* we already did that one!!! *) for d := number - 2 downto 1 do begin (* start with two removed *) (* chose one of them to delete *) subdeleted := 0; while subdeleted = 0 do begin random(seed); candidate := round(seed*(number-1)) + 1; if not already[candidate] then begin subdeleted := candidate; already[candidate] := true end end; del(nbld,seqs,length,subdeleted,nbld); mkfo(nbld,length,fo); mkinfo(fi,fo,length,ip,info); recordstat(sumcount,sumx,sumxsq,info,d); if reporting then writeln(output,d:5,' ',info:10:8); writeln( incu,d:3,' ',info:10:8); end; writeln(incu,-1:3,' ',0.0:10:8); (* baseline curve *) end; end; (* removingloop *) procedure writelist(var list: text); (* write the statistics out *) var d: integer; (* counts number of deleted sites *) begin writeln(list,'* '); writeln(list,'* list: statistics of the data'); writeln(list,'* # sites, mean (bits), standard deviation (bits),', ' count'); for d := 1 to mostnumbers do begin mean := sumx[d]/sumcount[d]; (* writeln(list, ' d=>',d:10); writeln(list, 'sumcount=>',sumcount[d]:10:8); writeln(list, ' sumx=>',sumx[d]:10:8); writeln(list, ' sumxsq=>',sumxsq[d]:10:8); *) writeln(list,d:3, ' ',mean:10:8, ' ',sqrt((sumxsq[d]/sumcount[d])- mean*mean):10:8, (* that last item is the standard deviation *) ' ',round(sumcount[d]):10) end; end; (* writelist *) begin (* themain *) initialize; while not eof(sorted) do begin setupexperiment; for r := 1 to repeats do begin writeln(output,'============================ repeat ', r:1); removingloop; end; end; writelist(list); end; (* themain *) (* end module siva.themain *) begin themain(sorted, sivap, incu, curves, list); 1: end.