program rndseq(sequ, rndseqp, output); (* generate random dna sequences Dr. Thomas D. Schneider National Cancer Institute Laboratory of Experimental and Computational Biology Frederick, Maryland 21702-1201 toms@ncifcrf.gov permanent email: toms@alum.mit.edu http://www.lecb.ncifcrf.gov/~toms/ module libraries required: delman, matmods, delmods *) label 1; (* end of the program *) const (* begin module version *) version = 1.13; (* of rndseq.p 2001 Nov 19 2001 Nov 19, 1.13: documentation upgrade 1999 Dec 13, 1.11: y2k upgrade 1993 March 25, 1.10: last changes origin: 1983 oct 17 *) (* end module version *) (* begin module describe.rndseq *) (* name rndseq: generate random dna sequences synopsis rndseq(sequ: out, rndseqp: in, output: out) files sequ: the random sequence rndseqp: parameters to control the generation of the sequence, on 4 lines: number (integer): the number of sequences to generate; length (integer): the length of each sequence; a c g t (4 integers): the proportions of bases desired; seed (real): a number between 0 and 1 is the starting seed for the random number generator. a number outside this range indicates that the date and time should be used. the date and time 83/10/17 20:15:32 makes a seed of 0.235102710138. the date-time is used backwards to assure that 1) the seed is always unique, and 2) it varies rapidly with time. output: messages to the user. description rndseq creates randomly generated dna sequences, separated by periods. the number, length and composition of the sequences are all specified by the user. the user can also set the start point (seed) of the pseudo-random number generator. if the same seed is given at a later time, then the same series of bases will be produced. alternatively, the user can have the program use the current date and time to create a unique seed. examples 5 number of sequences 100 length of each sequence in base pairs 1 1 1 1 ratios of a, c, g, t 2 random generator seed: 0 to 1; outside this: inverse date/time see also {example parameter file:} rndseqp {Random DNA with composition control:} markov.p author Thomas Dana Schneider bugs none known technical notes the number of characters per line is set by constant linelength. *) (* end module describe.rndseq *) (* begin module rndseq.const *) linelength = 60; (* the length of a line of sequence *) endsymbol = '.'; (* the symbol to end each sequence *) namelength = 20; (* length of date-type alpha *) (* end module rndseq.const *) (* begin module datetime.const *) datetimearraylength = 19; (* length of dataarray for dates, It is just long enough to include the 4 digit year - solving the year 2000 problem: 1980/06/09 18:49:11 123456789 123456789 1 2 *) (* end module datetime.const version = 7.40; {of delmod.p 2000 Feb 18} *) type (* begin module datetime.type *) (* array for dates *) datetimearray = packed array[1..datetimearraylength] of char; (* end module datetime.type version = 7.40; {of delmod.p 2000 Feb 18} *) var (* begin module rndseq.var *) sequ, (* random dna sequences *) rndseqp: (* controlling parameters *) text; (* end module rndseq.var *) (* begin module halt *) procedure halt; (* stop the program. the procedure performs a goto to the end of the program. you must have a label: label 1; declared, and also the end of the program must have this label: 1: end. examples are in the module libraries. this is the only goto in the delila system. *) begin writeln(output,' program halt.'); goto 1 end; (* end module halt version = 7.40; {of delmod.p 2000 Feb 18} *) (* begin module random *) procedure random(var seed: real); (* the seed value is used to generate a random number in the range 0 to 1. the seed is then altered, to provide the seed of the next random number. in other words, the seed is the random number. to obtain the next random number in the pseudo-random series, simply call random again with the same seed. note that the series will not be truely random, since the previous value determines the next value. this is "pseudo-random". an initial seed of 4.0 is suggested. note that this is not the first random number. you can use different seeds to get different pseudo-random number series. the expected mean and standard deviation of a flat distribution is calculated using simple integrals: 1 mean = / x dx = 0.5 0 1 2 variance = / (x - mean) dx = 1/12; stdev = sqrt(variance) = .2886751 0 (the integral for the variance is solved easily by substituting y = x - mean.) the algorithm is derived from: pascal programs for scientists and engineers, alan r. miller sybex, berkeley ca (1981). which in turn was an adaptation from hp-35 applications programs *) const pi = 3.141592653589; (* a value used to help mix up the result *) var intermediate: real; (* an intermediate result *) begin (* random *) intermediate := seed + pi; intermediate := exp(5.0*ln(intermediate)); seed := intermediate - trunc(intermediate) end; (* random *) (* end module random version = 'matmod 1.95 85 apr 18 tds/gds'; *) (* begin module getdatetime *) procedure getdatetime(var adatetime: datetimearray); (* get the date and time into a single array from the system clock. adatetime contains the date: 1980/06/09 18:49:11 ye mo da ho mi se (year, month, day, hour, minute, second). As of 2000 February 18, the Sun Pascal compiler requires a formatting statement. This statement allows the date to be generated in this standard Delila format in a single call. Information about the formatting statement is available on the manual page for date in Unix. If a computer does not have this method, see the 'oldgetdatetime' routine in delmod.p (http://www.lecb.ncifcrf.gov/~toms/delila/delmod.html) for some conversion code. *) begin date(adatetime,'%Y/%m/%d %H:%M:%S'); end; (* end module getdatetime version = 7.40; {of delmod.p 2000 Feb 18} *) (* begin module timeseed *) procedure addtoseed(var seed, power: real; c: char); (* add the digit represented by c to the seed at the power position *) var n: integer; (* the character represented by c *) begin (* addtoseed *) power := power/10; { writeln(output,'addtoseed, c = ',c); writeln(output,'addtoseed, ord(c) = ',ord(c)); } case c of ' ': begin writeln(output,'timeseed: error in datetime'); halt; end; '0': n := 0; '1': n := 1; '2': n := 2; '3': n := 3; '4': n := 4; '5': n := 5; '6': n := 6; '7': n := 7; '8': n := 8; '9': n := 9 end; (*writeln(output,'timeseed number is [',n:1,']'); (@ debug *) seed := seed + power*n end; (* addtoseed *) procedure makeseed(adatetime: datetimearray; var seed: real); (* convert adatetime to a real number in seed, reversed order *) var power: real; (* a digit of the seed such as 0.01 *) begin seed := 0.0; power := 1.0; addtoseed(seed, power, adatetime[19]); addtoseed(seed, power, adatetime[18]); (* : *) addtoseed(seed, power, adatetime[16]); addtoseed(seed, power, adatetime[15]); (* : *) addtoseed(seed, power, adatetime[13]); addtoseed(seed, power, adatetime[12]); (* *) addtoseed(seed, power, adatetime[10]); addtoseed(seed, power, adatetime[9]); (* / *) addtoseed(seed, power, adatetime[7]); addtoseed(seed, power, adatetime[6]); (* / *) addtoseed(seed, power, adatetime[4]); addtoseed(seed, power, adatetime[3]); addtoseed(seed, power, adatetime[2]); addtoseed(seed, power, adatetime[1]); end; procedure orderseed(adatetime: datetimearray; var seed: real); (* convert adatetime to a real number in seed, normal order *) var power: real; (* a digit of the seed such as 0.01 *) begin seed := 0.0; power := 1.0; addtoseed(seed, power, adatetime[1]); addtoseed(seed, power, adatetime[2]); addtoseed(seed, power, adatetime[3]); addtoseed(seed, power, adatetime[4]); (* / *) addtoseed(seed, power, adatetime[6]); addtoseed(seed, power, adatetime[7]); (* / *) addtoseed(seed, power, adatetime[9]); addtoseed(seed, power, adatetime[10]); (* *) addtoseed(seed, power, adatetime[12]); addtoseed(seed, power, adatetime[13]); (* : *) addtoseed(seed, power, adatetime[15]); addtoseed(seed, power, adatetime[16]); (* : *) addtoseed(seed, power, adatetime[18]); addtoseed(seed, power, adatetime[19]); end; procedure timeseed(var seed: real); (* read the computer date and time. reverse the order of the digits and put a decimal point in front. this gives a fraction between zero and one that varies quite quickly, and is always unique (if the computer has sufficient accuracy). it is to be used as a seed to a random number generator. *) var adatetime: datetimearray; (* a date and time *) begin (* timeseed *) getdatetime(adatetime); { writeln(output,'timeseed: adatetime: ',adatetime); } makeseed(adatetime, seed); end; (* timeseed *) (* end module timeseed version = 7.40; {of delmod.p 2000 Feb 18} *) (* begin module rndseq.getparams *) procedure getparams(var rndseqp: text; var number, length, a, c, g, t: integer; var seed: real); (* get parameters of number, length, composition and starting seed from rndseqp *) begin (* getparams *) reset(rndseqp); if eof(rndseqp) then begin writeln(output,' rndseqp is empty'); halt end; readln(rndseqp, number); readln(rndseqp, length); readln(rndseqp, a,c,g,t); readln(rndseqp, seed); if (number<1) or (length<1) or (a<0) or (c<0) or (g<0) or (t<0) then begin writeln(output,' integer parameters must be positive'); halt end; if (seed <= 0.0) or (seed >= 1.0) then timeseed(seed); writeln(output,' the seed used is ', seed:14:12); end; (* getparams *) (* end module rndseq.getparams *) (* begin module rndseq.generate *) procedure generate(var sequ: text; number, length, a, c, g, t: integer; seed: real); (* generate to sequ a 'number' of random sequences of some 'length' and composition (a,c,g,t) starting with a random 'seed'. *) var n: integer; (* the total number of bases in the composition *) sa, sac, sacg: real; (* the sums of frequencies divide the range 0 to 1 into three partitions *) nindex, lindex: integer; (* indicies to number and length *) plength: integer; (* pred of length, calculated once for speed *) begin (* generate *) rewrite(sequ); n := a+c+g+t; sa := a/n; sac := sa + (c/n); sacg := sac + (g/n); plength := length - 1; (* one less than length *) (* begin to generate the sequences *) for nindex := 1 to number do begin for lindex := 0 to plength do begin if (lindex mod linelength) = 0 then if (lindex <> plength) and (lindex <> 0) then writeln(sequ); random(seed); if seed <= sa then write(sequ,'a') else if seed <= sac then write(sequ,'c') else if seed <= sacg then write(sequ,'g') else write(sequ,'t') end; writeln(sequ, endsymbol) end end; (* generate *) (* end module rndseq.generate *) (* begin module rndseq.themain *) procedure themain(var sequ, rndseqp: text); (* themain procedure of program rndseq. read parameters from rndseqp and generate random sequences to sequ. *) var number, (* the number of sequences to generate *) length, (* the length of each sequence *) a,c,g,t: (* the composition of the sequences *) integer; seed: real; (* a seed for the random generator *) begin (* themain *) writeln(output,' rndseq ', version:4:2); getparams(rndseqp, number, length, a, c, g, t, seed); generate(sequ, number, length, a, c, g, t, seed); writeln(output,' ',number*length:8,' bases generated') end; (* themain *) (* end module rndseq.themain *) begin (* rndseq *) themain(sequ,rndseqp); 1: end. (* rndseq *)