program malign(inst, book, malignp, uncert, newalign, optalign,
optinst, malignxyin, output);
(* optimal alignment of sequences in a book, to minimize uncertainty
by
David Mastronarde
and
Dr. Thomas D. Schneider
National Institutes of Health
National Cancer Institute
Center for Cancer Research Nanobiology Program
Molecular Information Theory Group
Frederick, Maryland 21702-1201
toms@ncifcrf.gov
permanent email: toms@alum.mit.edu (use only if first address fails)
http://www.ccrnp.ncifcrf.gov/~toms/
module libraries: delman, delmod *)
label 1; (* end of program *)
const
(* begin module version *)
version = 2.56; (* of malign.p 2005 Sep 28
2005 Sep 28: 2.56: insert malign.xyplop
2005 Feb 7: 2.55: cleanup
2005 Feb 7: 2.54: crash: "second operand of `mod' is <= 0"
2005 Jan 27: 2.53: increase maxnseq from 5000 to 10000
2001 Jul 10: 2.52: upgrade documentation
2001 Apr 18: 2.51: document file output
2000 Dec 13: 2.47: upgrade alignment for comments
2000 Jun 29: 2.47: upgrade documentation
2000 Feb 23: 2.44: add alignmenttype; upgrade delmod modules
1999 August 26: 2.44: add spaces to optinst so big numbers don't
get fused together. STUPID.
1999 July 22: 2.43: change xyin to malignxyin to avoid conflicts
1999 June 17: 2.42: optinst has Delila 'same' instruction.
1998 Oct 19: range set to 1000
origin 1986 apr 22 *)
(* end module version *)
(* begin module describe.malign *)
(*
name
malign: optimal alignment of a book, based on minimum uncertainty
synopsis
malign(inst: in, book: in, malignp: in, uncert: out, newalign: out,
optalign: out, optinst: out, malignxyin: out, output: out)
files
inst: delila instructions of the form 'get from 56 -5 to 56 +10;'
book: the book generated by delila using inst
malignp: parameter file with the following parameters:
winleft, winright: left and right ends of window for calculating
uncertainty, relative to aligned base
shiftmin, shiftmax: minimum and maximum shift of aligned base
iseed: integer random seed
nranseq: number of random sequences, or 0 to use sequences in book
nshuffle: number of times to redo alignment after random shuffle
ifpaired: 1 to treat each pair of sequences as complementary strands,
0 not to
standout: output run #, pass # and H to standard output every pass
if 1, every run if 0, or not at all if -1
npassout: output H and alignment every npassout passes to file newalign,
or only at end of runs if zero, or not at all if -1
nshiftout: output L and H(L) every nshiftout sequence shifts (to file
uncert), or only at end of passes if zero, or not at all if -1
tolerance: tolerance in change of H
ntolpass: maximum number of passes with change below tolerance
new parameter allowed but not required (default is i):
alignmenttype: char; 'f' means alignment by First internal coordinate
base, 'b' means alignment by Book, 'i' means alignment by
Instructions. See the alist program for more information.
Normally one will align by delila instructions.
If this parameter is 'f', then the first base of the book is
considered the zero base and if it is 'b' then the zero base is
given by the coordinates of each piece in the book.
uncert: uncertainty as function of position, for the last run, at the
end of each pass or after selected number of sequence shifts
Controlled by variable nshiftout.
newalign: values of H and the relative alignments; starting, final, and
intermediate if selected.
Controlled by variable npassout.
optalign: user-readable listing of unique optimal relative alignments
and number of times each was achieved
optinst: list of unique optimal alignments in absolute coordinates,
to be used to make inst file for selected alignment
This file is like optalign, but the coordinates are for the
original sequence.
malignxyin: a list of the number of occurrences of alignments and their
H values in bits. This may be plotted with xyplo, as described
in the paper. Each line contains these numbers:
rank: from 1 to the number of alignment classes
occurrences: how many times the class was found
H: the uncertainty of the alignment, in bits
R: the information content of the alignment, in bits
with small sample correction.
description
Given a book of aligned sequences, this program searches for the alignment
of the sequences that has the lowest uncertainty, i.e. the highest value of
Rsequence. The user specifies the "window" of bases within which
uncertainty is calculated, and the maximum number of bases that each
sequence is allowed to shift from the original alignment. The program
considers each sequence in turn, shifting it to an alignment with minimum
uncertainty while holding the other sequences fixed. A "pass" is complete
when all sequences have been considered. A "run" is complete when no
alignments have changed in the preceding pass, and the alignment is then
considered "optimal". The first run starts with the original alignment;
every run after that starts with a "shuffled" alignment obtained by shifting
each sequence independently by a random amount between the allowed limits.
The program maintains a list of all of the unique optimal alignments
achieved from these starting alignments, and it outputs them in order of
increasing uncertainty.
In version 2.33 and earlier, the program did not keep track of the
organism and chromosome names in bestinst. This file is now superceeded
by the malin program which copies the inst file to cinst and modifies
it according to one of the alignments in optinst.
********************************************************************************
Summary of file output:
Malign produces:
uncert, newalign, optalign, optinst, malignxyin, output
-- output --
Line 7 of "malignp" controls output
Parameter:
1 - every run, pass, and uncertainty will be outputed to the
screen
0 - only the lowest uncertainty run will be outputed
-1 - nothing will be outputed
*NOTE* no file will be produced regardless of the parameter
-- newalign --
Line 8 of "malignp" controls newalign
Parameter:
1 - produces a full newalign file
0 - produces a smaller newalign file (about half the size)
-1 - produces no newalign file
*NOTE* this is the largest file produced and is unnecessary
-- uncert --
Line 9 of "malignp" controls uncert
Parameter:
1 - produces a full uncert file
0 - produces a smaller uncert file (about half the size)
-1 - produces an empty uncert file
*NOTE* file will be produced regardless of the parameter,
however this file is large and unnecessary
-- optalign --
*NOTE* this file will always be produced and is needed to run malin
-- optinst --
*NOTE* this file will always be produced and is needed to
run malin
-- malignxyin --
*NOTE* this file will always be produced and can be used to
plot data using xyplo
If you set our parameters so newalign and uncert are not created,
this can save some space.
(Thanks to Brent M. Jewett for compiling this information on 2001 Feb 7.)
********************************************************************************
documentation
A paper describing the algorithm in detail is available from
ftp://ftp.ncifcrf.gov/pub/delila/malign.ps
@article{Schneider.Mastronarde.malign,
author = "T. D. Schneider
and D. Mastronarde",
title = "{Fast} multiple alignment of ungapped {DNA}
sequences using information theory and a relaxation method",
journal = "Discrete Applied Mathematics",
note = "ftp://ftp.ncifcrf.gov/pub/delila/malign.ps",
volume = "71",
pages = "259-268",
year = "1996"}
The use of malign to align sequences with a subtle pattern is described in:
@article{Toledano1994,
author = "M. B. Toledano
and I. Kullik
and F. Trinh
and P. T. Baird
and T. D. Schneider
and G. Storz",
title = "Redox-Dependent Shift of {OxyR-DNA} Contacts Along
An Extended {DNA} Binding Site:
A Mechanism for Differential Promoter Selection",
journal = "Cell",
volume = "78",
pages = "897-909",
year = "1994"}
For how the information content and small sample correction are computed:
@article{Schneider1986,
author = "T. D. Schneider
and G. D. Stormo
and L. Gold
and A. Ehrenfeucht",
title = "Information content of binding sites on nucleotide sequences",
journal = "J. Mol. Biol.",
volume = "188",
pages = "415-431",
year = "1986"}
see also
{Paper Schneider.Mastronarde.malign:}
http://www.lecb.ncifcrf.gov/~toms/paper/malign/
{Program to graph the malignxyin file:} xyplo.p
{You can use the} malign.xyplop { file for the xyplop
and the malignxyin for the xyin. Set the xyplom to be empty.
I ALWAYS make this graph to see what is going on.}
{Program to make delila instructions from nth alignment of malign:}
malin.p
{Example parameter file, malignp:} malignp
{A COMPLETE SET FOR DEMONSTRATING THE PROGRAM}
malign.inst {instructions for grabbing the first 6 EcoRI sites on
the E. coli genome, but messed up by setting the last
digit to zero}
malign.book {The Delila book corresponding to malign.inst}
malign.malignp {Parameter file for malign to realign the inst and book.}
{If one uses malin to pick the first alignment, one finds that they are
correctly realigned:
- +
1--------- +++++++++1
098765432101234567890
.....................
EcoRI U00096 3842 + 1 cgacctgccggaattcagcct
U00096 12889 + 2 tctggttgaagaattcaagaa
U00096 32545 + 3 tcagggtatcgaattcgacta
U00096 50237 + 4 ggtattcagcgaattccacga
U00096 56282 + 5 agaggtagcggaattcgttct
U00096 96860 + 6 gctacgtcaggaattcctgct
}
{Program for listing aligned sequences, as above:} alist.p
author
David Mastronarde and Tom Schneider
bugs
The realignment algorithm, which shifts all sequences by the same amount
to attempt to keep the window near its original position, is somewhat
ad hoc in nature and the effects of different settings for it parameters
have not been explored. If the window spans two real sites with competing
alignments, many optimal but meaningless alignments with similar
uncertainties may be obtained. The random sequences can't be examined.
For the computation of Rsequence, composition is assumed to be
equiprobable, there is no provision for reading in a cmp file yet.
technical notes
The malignxyin file Rsequence has the small sample correction. The choice
between the estimate and the more exact computation is determined
by constant "kickover".
The constant maxlen is one longer than the longest sequence.
The constant maxnseq is the maximum number of sequences.
*)
(* end module describe.malign *)
maxlen=2002; (* 1 + maximum length of sequences *)
maxnseq=10000; (* maximum number of sequences *)
(* constants for the ad hoc realignment procedure; see "realign" *)
maxchange=50; (* 2 times maximum change in alignments allowed
during realignment *)
minmaxforchange=3; (* minimum number of sequences that must have the
same shift from their original alignments before all sequences will
get realigned so as to restore that subset of sequences to original
alignment *)
realignlimit=10; (* maximum pass number on which to do realignment *)
kickover = 50; (* e(hnb) for n values above this are estimated from
ae(hnb) = hg - 3/(2 ln(2) n) while those below
are calculated exactly *)
(* begin module info.const *)
infofield = 8; (* field size for bits *)
infodecim = 5; (* decimal places for bits *)
posfield = 4; (* field size for aligned sequence positions *)
nfield = 4; (* size of field for printing n, the number of sites *)
(* end module info.const *)
(* begin module book.const *)
(* constants needed for book manipulations *)
dnamax = 1024; (* length of dna arrays *)
namelength = 100; (* maximum key name length *)
linelength = 80; (* maximum line readable in book *)
(* end module book.const version = 7.56; {of delmod.p 2000 Nov 21} *)
(* trigger definitions follow to make program compilable.
The prgmod.p program has the triggers *)
(* module filler.const *)
fillermax = 50; (* the size of the filler array for a string *)
(* module filler.const from prgmod.p 4.20 *)
(* begin module interact.const *)
(* begin module string.const *)
maxstring = 150; (* the maximum string *)
(* end module string.const version = 4.39; (@ of prgmod.p 1999 November 28 *)
(* end module interact.const version = 7.56; {of delmod.p 2000 Nov 21} *)
type
(* begin module book.type *)
(* types needed for book manipulations *)
chset = set of 'a'..'z';
(* types defined in book definition *)
alpha = packed array[1..namelength] of char; (* this is not alfa *)
(* name is a left justified string with blanks following the
characters *)
name = record
letters: alpha;
length: 0..namelength (* zero means an unspecified structure *)
end;
lineptr = ^line;
line = record (* a line of characters *)
letters: packed array [1..linelength] of char;
length: 0..linelength;
next: lineptr
end;
direction = (plus, minus, dircomplement, dirhomologous);
configuration = (linear, circular);
state = (on, off);
header = record (* header of key *)
keynam: name; (* key name of structure *)
fulnam: lineptr; (* full name of structure *)
note: lineptr (* note key *)
end;
(* begin module base.type *)
(* define the four nucleotide bases *)
base = (a,c,g,t);
(* end module base.type version = 7.56; {of delmod.p 2000 Nov 21} *)
(* sequence types *)
dnaptr = ^dnastring;
dnarange = 0..dnamax;
seq = packed array[1..dnamax] of base;
dnastring = record
part: seq;
length: dnarange;
next: dnaptr
end;
orgkey = record (* organism key *)
hea: header;
mapunit: lineptr (* genetic map units *)
end;
chrkey = record (* chromosome key *)
hea: header;
mapbeg: real; (* number of genetic map beginning *)
mapend: real (* number of genetic map ending *)
end;
pieceptr = ^piece;
piekey = record (* piece key *)
hea: header;
mapbeg: real; (* genetic map beginning *)
coocon: configuration; (* configruation (circular/linear) *)
coodir: direction; (* direction (+/-) relative to genetic map *)
coobeg: integer; (* beginning nucleotide *)
cooend: integer; (* ending nucleotide *)
piecon: configuration; (* configruation (circular/linear) *)
piedir: direction; (* direction (+/-) relative to coordinates *)
piebeg: integer; (* beginning nucleotide *)
pieend: integer; (* ending nucleotide *)
end;
piece = record
key: piekey;
dna: dnaptr
end;
reference = record
pienam : name; (* name of piece referred to *)
mapbeg : real; (* genetic map beginning *)
refdir : direction; (* direction relative to coordinates *)
refbeg : integer; (* beginning nucleotide *)
refend : integer; (* ending nucleotide *)
end;
genkey = record (* gene key *)
hea : header;
ref : reference;
end;
trakey = record (* transcript key *)
hea : header;
ref : reference;
end;
markerptr = ^marker;
markey = record (* marker key *)
hea : header;
ref : reference;
sta : state;
phenotype : lineptr;
next : markerptr;
end;
marker = record
key : markey;
dna : dnaptr;
end;
(* end module book.type version = 7.56; {of delmod.p 2000 Nov 21} *)
(* begin module interact.type *)
(* begin module string.type *)
stringptr = ^string; (* pointer to a string *)
string = record (* a string of characters *)
letters: array[1..maxstring] of char; (* the letters in the string *)
length: integer; (* the number of characters in the string *)
current: integer; (* the letter we are working on *)
next: stringptr; (* the next string in a series *)
end;
(* end module string.type version = 4.39; (@ of prgmod.p 1999 November 28 *)
(* end module interact.type version = 7.56; {of delmod.p 2000 Nov 21} *)
(* module filler.type *)
(* the following is an array used to fill a string.
it is convenient to have it much shorter than the maxstring, so that
it is easy to fill the string using procedure fillstring.
the user must declare the value of constant fillermax. *)
filler = packed array[1..fillermax] of char;
(* module filler.type from prgmod.p 4.20 *)
(* module trigger.type *)
trigger = record (* an object to be searched for *)
seek: string; (* the characters looked for *)
state: integer; (* how close to triggering we are *)
skip: boolean; (* trigger not found- skip the line *)
found: boolean (* the trigger was found *)
end;
(* module trigger.type from prgmod.p 4.20 *)
seqrecptr = ^seqrec; (* pointer to a seqrec record *)
seqrec = record (* sequences and alignment info *)
alseq: array[1..maxlen] of base; (* the sequence *)
length: integer; (* number of bases *)
piecename: name; (* name of the piece *)
alignread: integer; (* alignment read in from file *)
alignstart: integer; (* alignment at start of run *)
aligncurrent: integer; (* current alignment *)
alignmin: integer; (* minimum alignment *)
alignmax: integer; (* maximum alignment *)
next: seqrecptr (* pointer to next record *)
end;
matrix = array[base,1..maxlen] of integer; (* array for n(B,L) table *)
poslist = array[1..maxlen] of real; (* array for H(L) values *)
fltable = array[0..maxnseq] of real; (* array for f*log(f) and diffs *)
optlistptr= ^optlist; (* pointer to optlist record *)
optlist= record (* storage for unique optimal alignments *)
albest: array[1..maxnseq] of integer; (* the alignments *)
hbest: real; (* the h value *)
num: integer; (* number of times achieved *)
next: optlistptr (* pointer to next record *)
end;
var
inst, (* the delila instructions required by the align procedures *)
book, (* the book to be aligned *)
malignp, (* parameter file *)
uncert, (* uncertainty as function of position *)
newalign, (* values of H and relative alignments *)
optalign, (* set of unique optimum relative alignments *)
optinst, (* set of unique optimum absolute alignments *)
malignxyin: (* ranking, occurrences and H *)
text;
winleft, winright, (* left and right ends of window for calculating
uncertainty, relative to aligned base *)
shiftmin, shiftmax, (* minimum and maximum shift of aligned base *)
iseed, (* integer random seed *)
nranseq, (* number of random sequences, or 0 to use sequences in book *)
nshuffle, (* number of times to redo alignment after random shuffle *)
ifpaired, (* flag to treat pairs of sequences as complementary strand *)
standout, (* standard output very pass, run, or not at all *)
npassout, (* period at which to output H and alignment *)
nshiftout, (* period at which to output L and H(L) *)
ntolpass: integer; (* maximum number of passes below tolerance *)
tolerance: real; (* tolerance in change of H *)
nbl: matrix; (* the n(B,L) matrix *)
hl: poslist; (* the H(L) array *)
flogf, fldiff: fltable; (* tables of f*log(f) and differences of this *)
fldoubl: fltable; (* table of flogf(i+2)-flogf(i) *)
sepfirst, sep, seplast: seqrecptr; (* pointers to sequence records *)
oapfirst, oap, oaplast: optlistptr; (* pointers to alignment records *)
rand, (* random number returned by random *)
horiginal, (* value of h at starting alignment *)
hcurrent, hpass: real; (* values of H currently and at end of pass *)
numseq, (* number of sequences *)
window, (* length of window *)
shuffle, (* index for shuffling loop *)
numopt, (* number of optimal alignments on list *)
globshift, (* global shift when realigning all sequences *)
iseedparam, (* random seed as read in *)
iseed2, (* secondary seed used to resolve ties in minimum H *)
iseedsave, (* value of seed before shuffling for current run *)
notchanged, (* number of sequences over which no change has been made *)
intolerance: integer; (* number of passes where H changed < tolerance *)
paired: boolean; (* boolean value for whether sequences paired *)
alignmenttype: char; (* 'f' means alignment by First internal coordinate
base, 'b' means alignment by Book, 'i' means alignment by Instructions *)
theline: integer; (* current line in the book *)
(* begin module book.var *)
(* ************************************************************************ *)
(* global variables needed for book manipulations *)
(* free storage: *)
freeline: lineptr; (* unused lines *)
freedna: dnaptr; (* unused dnas *)
readnumber: boolean; (* whether to read a number from the notes, or
to read in the notes *)
number: integer; (* the number of the item just read *)
numbered: boolean; (* true when the item just read is numbered *)
skipunnum: boolean; (* a control variable to allow skipping of
un-numbered items in the book *)
(* ************************************************************************ *)
(* end module book.var version = 7.56; {of delmod.p 2000 Nov 21} *)
(* begin module halt *)
procedure halt;
(* stop the program. the procedure performs a goto to the end of the
program. you must have a label:
label 1;
declared, and also the end of the program must have this label:
1: end.
examples are in the module libraries.
this is the only goto in the delila system. *)
begin
writeln(output,' program halt.');
goto 1
end;
(* end module halt version = 7.56; {of delmod.p 2000 Nov 21} *)
(* begin module copyaline *)
procedure copyaline(var fin, fout: text);
(* copy a line from file fin to file fout *)
begin (* copyaline *)
while not eoln(fin) do begin
fout^ := fin^;
put(fout);
get(fin)
end;
readln(fin);
writeln(fout);
end; (* copyaline *)
(* end module copyaline version = 7.56; {of delmod.p 2000 Nov 21} *)
(* begin module skipblanks *)
procedure skipblanks(var thefile: text);
(* skip over blanks until a non-blank, or end of line, is found *)
begin
while (thefile^ = ' ') and not eoln(thefile) do get(thefile);
end;
procedure skipnonblanks(var thefile: text);
(* skip over nonblanks until a blank, or end of line, is found *)
begin
while (thefile^ <> ' ') and not eoln(thefile) do get(thefile);
end;
procedure skipcolumn(var thefile: text);
(* skip over a data column *)
begin
skipblanks(thefile); skipnonblanks(thefile)
end;
(* end module skipblanks version = 7.56; {of delmod.p 2000 Nov 21} *)
(* begin module interact.clearstring *)
(* begin module clearstring *)
procedure clearstring(var ribbon: string);
(* empty the string *)
var index: integer; (* to the ribbon *)
begin (* clearstring *)
with ribbon do begin
for index := 1 to maxstring do letters[index] := ' ';
length := 0;
current := 0;
end
end; (* clearstring *)
procedure initializestring(var ribbon: string);
(* start the string with a nil pointer. This routine
should be called before doing linked list work. This allows
the standard string routines to clear the string without
killing the pointer. *)
begin (* initializestring *)
clearstring(ribbon);
ribbon.next := nil;
end; (* initializestring *)
(* end module clearstring version = 4.39; (@ of prgmod.p 1999 November 28 *)
(* end module interact.clearstring version = 7.56; {of delmod.p 2000 Nov 21} *)
(* begin module filler.fillstring *)
procedure fillstring(var s: string; a: filler);
(* this procedure makes it reasonably easy to fill the string s with
characters. one calls the procedure as: *)
(* 1 2 3 4 5 *)
(* 12345678901234567890123456789012345678901234567890 *)
(* fillstring(s, 'this-is-the-string ');
the two comments make it easy to line the characters up. also, for this
example, it was assumed that the length of filler as defined by the
constant fillermax was 50. *)
var
length: integer; (* of the string without trailing blanks *)
index: integer; (* of s *)
begin (* fillstring *)
clearstring(s);
length := fillermax;
while (length > 1) and (a[length] = ' ') do length := pred(length);
if (length = 1) and (a[length] = ' ') then begin
writeln(output, 'fillstring: the string is empty');
halt
end;
for index := 1 to length do s.letters[index] := a[index];
s.length := length;
s.current := 1
end; (* fillstring *)
(* end module filler.fillstring version = 7.56; {of delmod.p 2000 Nov 21} *)
(* begin module filler.filltrigger *)
procedure filltrigger(var t: trigger;
a: filler);
(* fill the trigger t *)
begin (* filltrigger *)
fillstring(t.seek,a)
end; (* fillstring *)
(* end module filler.filltrigger version = 7.56; {of delmod.p 2000 Nov 21} *)
(* begin module trigger.proc *)
(* this module allows one to scan a series of characters, as from
an array or a file, and to "trigger" or detect a simple string
in the series. the advantage of the trigger is that several triggers
can "observe" a stream of characters at once, each looking for a
different thing.
some other modules required: interact.const, interact.type *)
procedure resettrigger(var t: trigger);
(* reset the trigger to ground state *)
begin (* resettrigger *)
with t do begin
state := 0;
skip := false;
found := false
end
end; (* resettrigger *)
procedure testfortrigger(ch: char; var t: trigger);
(* look at the character ch.
if it is part of the trigger (at the current trigger state),
then the trigger state goes higher.
if it is not part of the trigger then the trigger state is reset,
skip is true and one should skip onward to find the trigger.
if the trigger is found, found is true.
1996 Sep 12: Bug found! In the case of a trigger "ab", the program
used to miss it for situations like "aab". This was because at the
first a it would step up. Then it would see the second a and recognize
that was not part of ab. It would fail to realize that it could be
the start of a new one. The code now accounts for that possibility. *)
begin (* testfortrigger *)
with t do begin
state := succ(state);
{
writestring(list,seek);
writeln(list,'testfortrigger seek.letters[',state:1,']:',
seek.letters[state],' ch:',ch);
}
if seek.letters[state] = ch
then begin
skip := false;
if state = seek.length then found := true
else found := false
end
else begin
(* it failed. But wait! It could be the beginning
of a NEW trigger string! *)
if seek.letters[1] = ch then begin
state := 1;
skip := false;
found := false
end
else begin (* reset trigger *)
state := 0;
skip := true;
found := false
end
end
end
end; (* testfortrigger *)
(* end module trigger.proc version = 7.56; {of delmod.p 2000 Nov 21} *)
(* begin module package.align *)
(* ************************************************************************ *)
(* begin module package.getpiece *)
(* ************************************************************************ *)
(* begin module package.brpiece *)
(* ************************************************************************ *)
(* begin module book.basis *)
(* procedures needed for book manipulations *)
(* get procedures should be used for all linked lists of records *)
procedure getline(var l: lineptr);
(* obtain a line from the free line list or by making a new one *)
begin
if freeline<>nil
then begin
l:=freeline;
freeline:=freeline^.next
end
else new(l);
l^.length:=0;
l^.next:=nil
end;
procedure getdna(var l: dnaptr);
begin
if freedna<>nil
then begin
l:=freedna;
freedna:=freedna^.next
end
else new(l);
l^.length:=0;
l^.next:=nil
end;
(* clear procedures should be called each time the records are no longer needed
failure to do this may result in a stack overflow. *)
procedure clearline(var l: lineptr);
(* return a line to the free line list *)
var lptr: lineptr;
begin
if l<>nil then begin
lptr:=l;
l:=l^.next;
lptr^.next:=freeline;
freeline:=lptr
end
end;
procedure writeline(var afile: text; l: lineptr;
carriagereturn: boolean);
(* write a line to a file, with carriage return if
carriagereturn is true. *)
var
index: integer; (* index to characters in l *)
begin
with l^ do begin
for index := 1 to length
do write(afile, letters[index]);
end;
if carriagereturn then writeln(afile);
end;
procedure showfreedna;
(* show the freedna list *)
var
counter: integer; (* count of freedna list *)
l: dnaptr; (* pointer into freedna list *)
begin
l := freedna;
counter := 0;
while l <> nil do begin
counter := succ(counter);
write(output,counter:1);
write(output, ', length = ',l^.length:1);
{
This is illegal according to gpc because one cannot
write a pointer to a text file. It can be unearthed
for debugging.
write(output, ', pointer id: ',l:1);
}
writeln(output);
l := l^.next
end;
end;
procedure cleardna(var l: dnaptr);
var lptr: dnaptr;
begin
if l<>nil then begin
lptr:=l;
l:=l^.next;
lptr^.next:=freedna;
freedna:=lptr
end
end;
procedure clearheader(var h: header);
(* clear the header h (remove lines to free storage) *)
begin
with h do begin
clearline(fulnam);
while note<>nil do clearline(note)
end
end;
procedure clearpiece(var p: pieceptr);
(* clear the dna of the piece *)
begin
while p^.dna<>nil do cleardna(p^.dna);
clearheader(p^.key.hea)
end;
function chartobase(ch:char):base;
(* convert a character into a base *)
begin
case ch of
'a': chartobase:=a;
'c': chartobase:=c;
'g': chartobase:=g;
't': chartobase:=t
end
end;
function basetochar(ba:base):char;
(* convert a base into a character *)
begin
case ba of
a: basetochar:='a';
c: basetochar:='c';
g: basetochar:='g';
t: basetochar:='t';
end
end;
function complement(ba:base):base;
(* take the complement of ba *)
begin
case ba of
a: complement:=t;
c: complement:=g;
g: complement:=c;
t: complement:=a;
end
end;
function chomplement(b: char): char;
(* create the character complement of base b. I must be getting hungry! *)
begin
chomplement := basetochar(complement(chartobase(b)));
end;
function pietoint(p: integer; pie: pieceptr): integer;
(* p is a coordinate on the piece.
we want to transform p into a number
from 1 to n: an internal coordinate system for
easy manipulation of piece coordinates *)
var i: integer; (* an intermediate value *)
begin
with pie^.key do begin
case piedir of
plus: if p>=piebeg
then i:=p-piebeg+1
else i:=(p-coobeg)+(cooend-piebeg)+2;
minus: if p<=piebeg
then i:=piebeg-p+1
else i:=(cooend-p)+(piebeg-coobeg)+2
end;
pietoint:=i
end
end;
function inttopie(i: integer; pie: pieceptr):integer;
(* i is in the range 1 to some maximum. it is an internal coordinate
system for the program. we want to do a
coordinate transformation to obtain
a value in the range of the piece called pie:
i=1 corresponds to piebeg and
i=its maximum corresponds to pieend *)
var p: integer; (* an intermediate value *)
begin
with pie^.key do begin
case piedir of
plus: begin
p:=piebeg+(i-1);
if p>cooend
then if coocon=circular
then p:=p-(cooend-coobeg+1)
end;
minus: begin
p:=piebeg-(i-1);
if p '*' then begin
writeln(output,' procedure skipstar: bad book');
writeln(output,' "*" expected as first character on the line, but "',
thefile^,'" was found');
halt
end;
get(thefile); (* skip the star *)
if thefile^ <> ' ' then begin
writeln(output,' procedure skipstar: bad book');
writeln(output,' "* " expected on a line but "*',
thefile^,'" was found');
halt
end;
get(thefile) (* skip the blank *)
end
end; (* skipstar *)
(* end module book.skipstar version = 7.56; {of delmod.p 2000 Nov 21} *)
(* begin module book.brreanum *)
procedure brreanum(var thefile: text; var theline: integer; var reanum: real);
(* read a real number from the file *)
begin
skipstar(thefile);
readln(thefile,reanum);
theline := succ(theline)
end;
(* end module book.brreanum version = 7.56; {of delmod.p 2000 Nov 21} *)
(* begin module book.brnumber *)
procedure brnumber(var thefile: text; var theline: integer; var num: integer);
(* read a number from the file *)
begin
skipstar(thefile);
readln(thefile,num);
theline := succ(theline)
end;
(* end module book.brnumber version = 7.56; {of delmod.p 2000 Nov 21} *)
(* begin module book.brname *)
procedure brname(var thefile: text; var theline: integer; var nam: name);
(* read a name from the file *)
var i: integer; (* an index to the name *)
c: char; (* a character read *)
begin (* brname *)
skipstar(thefile);
with nam do begin
length:=0;
repeat
length:=succ(length);
read(thefile,c);
letters[length] := c
until (eoln(thefile)) or
(length>=namelength) or
(letters[length]=' ');
if letters[length]=' ' then length:=length-1;
if length 'n' then begin
skipstar(thefile);
if not eoln(thefile) then begin
if thefile^ = '#' then begin
numbered := true;
get(thefile); (* move past the number symbol *)
read(thefile,number);
end
end;
repeat
readln(thefile);
theline := succ(theline)
until thefile^ = 'n';
readln(thefile);
theline := succ(theline)
end
else begin
readln(thefile);
theline := succ(theline)
end
end
end; (* brnotenumber *)
(* end module book.brnotenumber version = 7.56; {of delmod.p 2000 Nov 21} *)
(* begin module book.brnote *)
procedure brnote(var thefile: text; var theline: integer; var note: lineptr);
(* read note key *)
var
newnote: lineptr; (* the new note *)
previousnote: lineptr; (* the last line of the notes *)
begin (* brnote *)
note:=nil;
if thefile^ = 'n' then begin (* enter note *)
readln(thefile);
theline := succ(theline);
if thefile^ <> 'n' then begin (* abort null note (n/n) *)
getline(note);
newnote:=note;
while thefile^ <> 'n' do begin (* wait until end of note *)
brline(thefile,theline,newnote);
previousnote:=newnote;
(* get next note *)
getline(newnote^.next);
newnote:=newnote^.next;
end;
(* last note was not used, so: *)
clearline(newnote);
previousnote^.next:=nil;
readln(thefile);
theline := succ(theline);
end
else begin
readln(thefile);
theline := succ(theline);
end;
end
end; (* brnote *)
(* end module book.brnote version = 7.56; {of delmod.p 2000 Nov 21} *)
(* begin module book.brheader *)
procedure brheader(var thefile: text; var theline: integer; var hea: header);
(* read the header of a key. *)
begin
with hea do begin
readln(thefile); (* move past the object name - new definition 1999 Mar 13 *)
theline := succ(theline);
{bbb}
(* read key name *)
brname(thefile,theline,keynam);
(* read full name *)
getline(fulnam);
brline(thefile,theline,fulnam);
(* read note key *)
if readnumber then brnotenumber(thefile,theline,note)
else brnote(thefile,theline,note)
end
end;
(* end module book.brheader version = 7.56; {of delmod.p 2000 Nov 21} *)
(* begin module book.copyheader *)
procedure copyheader(fromhea: header; var tohea: header);
(* copy the header fromhea into tohea. Note that the
linked objects are NOT copied, but merely pointed to. *)
begin
tohea.keynam.letters := fromhea.keynam.letters;
tohea.keynam.length := fromhea.keynam.length;
tohea.note := fromhea.note;
tohea.fulnam := fromhea.fulnam;
end;
(* end module book.copyheader version = 7.56; {of delmod.p 2000 Nov 21} *)
(* begin module book.brpiekey *)
procedure brpiekey(var thefile: text; var theline: integer; var pie: piekey);
(* read piece key, track the line number *)
begin
with pie do begin
brheader(thefile,theline,hea);
brreanum(thefile,theline,mapbeg);
brconfig(thefile,theline,coocon);
brdirect(thefile,theline,coodir);
brnumber(thefile,theline,coobeg);
brnumber(thefile,theline,cooend);
brconfig(thefile,theline,piecon);
brdirect(thefile,theline,piedir);
brnumber(thefile,theline,piebeg);
brnumber(thefile,theline,pieend);
end
end;
(* end module book.brpiekey version = 7.56; {of delmod.p 2000 Nov 21} *)
(* begin module book.brdna *)
procedure brdna(var thefile: text; var theline: integer; var dna: dnaptr);
(* read in dna from thefile, track the line *)
(* note: if the dna were circularized, by linking the last dnastring
to the first, then the cleardna routine could not clear properly,
and would loop forever... there is no reason to do that, since a simple
mod function will allow one to access the circle. *)
var
ch: char;
workdna: dnaptr;
begin
getdna(dna);
workdna:=dna;
ch:=getto(thefile,theline,['d']);
readln(thefile);
theline := succ(theline);
read(thefile,ch); (* skipstar *)
while (ch = '*') do
begin
read(thefile,ch); (* skip blank *)
repeat
read(thefile,ch);
if ch in ['a','c','g','t'] then begin
if workdna^.length=dnamax then begin
getdna(workdna^.next);
workdna:=workdna^.next
end;
workdna^.length:=succ(workdna^.length);
workdna^.part[workdna^.length]:=chartobase(ch)
end
until eoln(thefile);
readln(thefile); (* go to next line *)
theline := succ(theline);
read(thefile,ch); (* ch is either '*' or 'd' *)
end;
readln(thefile); (* read past the d *)
theline := succ(theline);
end;
(* end module book.brdna version = 7.56; {of delmod.p 2000 Nov 21} *)
(* begin module book.brpiece *)
procedure brpiece(var thefile: text; var theline: integer; var pie: pieceptr);
(* read in a piece, change theline to reflect the lines traversed *)
begin
{
readln(thefile); (* move past the word 'piece' - new definition 1999 Mar 13 *)
theline := succ(theline); (* BUG: was below! *)
bbb}
brpiekey(thefile,theline,pie^.key);
if numbered or (not skipunnum)
then brdna(thefile,theline,pie^.dna);
readln(thefile); (* move past the word 'piece' - new definition 1999 Mar 13 *)
theline := succ(theline);
end;
(* end module book.brpiece version = 7.56; {of delmod.p 2000 Nov 21} *)
(* begin module book.brinit *)
procedure brinit(var book: text; var theline: integer);
(* check that the book is ok to read, and
set up the global variables for br routines *)
begin (* brinit *)
(* halt if the book is bad (first word is 'halt') or the first
character is not * *)
reset(book);
if not eof(book) then begin
(* check for the date line *)
if book^ <> '*' then begin
if book^ <> 'h'
then writeln(output, ' this is not the first line of a book:')
else writeln(output, ' bad book:');
write(output, ' ');
while not (eoln(book) or eof(book)) do begin
write(output, book^);
get(book)
end;
writeln(output);
halt
end
end
else begin
writeln(output, ' book is empty');
halt
end;
(* initialize free storage *)
freeline:=nil;
freedna:=nil;
readnumber:=true; (* usually we read in numbers for items *)
number:=0; (* arbitrary value *)
numbered:=false; (* the piece has no number (none yet read in) *)
skipunnum:=false;
theline := 1;
end; (* brinit *)
(* end module book.brinit version = 7.56; {of delmod.p 2000 Nov 21} *)
(* ************************************************************************ *)
(* end module package.brpiece version = 7.56; {of delmod.p 2000 Nov 21} *)
(* begin module book.getpiece *)
procedure getpiece(var thefile: text; var theline: integer; var pie: pieceptr);
(* move to and read in the next piece in the book *)
var ch: char;
begin
ch:=getto(thefile,theline,['p']); (* get to the next p(iece) in the book *)
if ch<>' ' then begin
brpiece(thefile,theline,pie);
{
1999 june 2: removed this:
ch:=getto(thefile,theline,['p']); (* read to end of p *)
}
{
bbb - now done in brpiece
readln(thefile); (* read past piece *)
theline := succ(theline);
}
end
else clearpiece(pie);
end;
(* end module book.getpiece version = 7.56; {of delmod.p 2000 Nov 21} *)
(* ************************************************************************ *)
(* end module package.getpiece version = 7.56; {of delmod.p 2000 Nov 21} *)
(* begin module findblank *)
procedure findblank(var afile: text);
(* read a file to find the next blank character *)
var ch: char;
begin
repeat read(afile,ch) until ch = ' '
end;
(* end module findblank version = 7.56; {of delmod.p 2000 Nov 21} *)
(* begin module findnonblank *)
procedure findnonblank(var afile: text; var ch: char);
(* find the next non blank character in a file, return it in ch. *)
begin
ch:=' ';
while (not eof(afile)) and
(ch = ' ')
do begin
read(afile,ch);
if eoln(afile) then readln(afile)
end
end;
(* end module findnonblank version = 7.56; {of delmod.p 2000 Nov 21} *)
(* begin module align.align *)
procedure align(var inst, book: text; var theline: integer;
var pie: pieceptr;
var length, alignedbase: integer);
(* documentation on align is in module info.align and
delman.use.aligned.books.
1996 Sep 12: The routine now uses the trigger functions found
in prgmod. The bug in the oldalign routine (that it misses the
end of comments that end in a series of astrisks) has been fixed.
It now checks that the piece corresponds to the book. *)
const maximumrange = 10000; (* if the alignment point is more than this
distance from the piece ends, the program halts in an attempt to catch
the alignment bug... 1991 Jan 11 It appears that the rewrite of the
code has removed the bug, but the check will be kept. *)
semicolon = ';'; (* end of delila instruction *)
var
ch: char; (* a character in inst *)
p: integer; (* index to a piece name *)
p1: integer; (* another index to a piece name *)
done: boolean; (* done finding an aligning get *)
thebase: integer; (* the base read in *)
indefault: boolean; (* true when within a default statement. These
can contain the word 'piece', which must be ignored. *)
gettrigger: trigger; (* trigger to find 'get' *)
defaulttrigger: trigger; (* trigger to find 'default' *)
nametrigger: trigger; (* trigger to find 'name' *)
piecetrigger: trigger; (* trigger to find 'piece' *)
settrigger: trigger; (* trigger to find 'set' *)
begincomment: trigger; (* trigger to find '(-*' (ignore the dash!) *)
endcomment: trigger; (* trigger to find '*-)' (ignore the dash!) *)
begincurly: trigger; (* trigger to find comments: '{' *)
endcurly: trigger; (* trigger to find comments: '}' *)
quote1trigger: trigger; (* trigger to find single quote ' *)
quote2trigger: trigger; (* trigger to find double quote " *)
{
procedure rd(var f: text; var ch: char);
(* read ch from f allowing inspection of the result *)
begin
read(f,ch);
write(output,ch);
write(list,ch);
write(output,'<',ch,'>');
end;
procedure rdln(var f: text);
(* readln f allowing inspection of the result *)
begin
readln(f);
writeln(output);
writeln(list);
end;
}
procedure skipcomment(var f: text);
(* skip an entire comment *)
var
comment: boolean; (* true means we are inside a comment *)
begin
(* skip to end of comment *)
resettrigger(endcomment);
comment := true;
while comment do begin
if eof(f) then begin
writeln(output,'A comment does not end!');
halt
end;
if eoln(f)
then readln(f)
{ rdln(f) }
else begin
{write(output,'<');
rd(f,ch);
write(output,'>');}
read(f,ch);
testfortrigger(ch, endcomment);
if endcomment.found
then comment := false;
end
end
end;
procedure skipcurly(var f: text);
(* skip an entire comment made by {}*)
var
comment: boolean; (* true means we are inside a comment *)
begin
(* skip to end of comment *)
resettrigger(endcurly);
comment := true;
while comment do begin
if eof(f) then begin
writeln(output,'A comment does not end!');
halt
end;
if eoln(f)
then readln(f)
{ rdln(f) }
else begin
{write(output,'<');
rd(f,ch);
write(output,'>');}
read(f,ch);
testfortrigger(ch, endcurly);
if endcurly.found
then comment := false;
end
end
end;
procedure skipquote(quote: trigger);
(* skip an entire quote of either the ' or " persuasion *)
var
kind: char; (* the kind of quote, ' or " *)
begin
kind := quote.seek.letters[1];
{writeln(output,'skipquote ',kind);}
repeat
findnonblank(inst,ch); (* get to the quote *)
until (ch = kind) or eof(inst);
if ch <> kind then begin
writeln(output,'end of quote starting with ',kind,' not found');
halt;
end;
end;
begin
filltrigger(defaulttrigger,'default');
filltrigger(gettrigger,'get ');
filltrigger(nametrigger,'name ');
filltrigger(piecetrigger,'piece ');
filltrigger(settrigger,'set ');
filltrigger(begincomment,'(* ');
filltrigger(endcomment,'*) ');
filltrigger(begincurly,'{ ');
filltrigger(endcurly,'} ');
filltrigger(quote1trigger,''' ');
filltrigger(quote2trigger,'" ');
resettrigger(defaulttrigger);
resettrigger(gettrigger);
resettrigger(nametrigger);
resettrigger(piecetrigger);
resettrigger(settrigger);
resettrigger(begincomment);
resettrigger(begincurly);
resettrigger(quote1trigger);
resettrigger(quote2trigger);
indefault := false;
if not eof(book) then begin (* if there is still more to the book ... *)
getpiece(book,theline,pie); (* read in the piece *)
if not eof(book) then begin (* if we found a piece ... *)
length:=pietoint(pie^.key.pieend,pie); (* calculate piece length *)
(* now find in inst the next occurance of 'get' *)
done := false;
while not done do begin
if eof(inst) then begin (* no instructions? *)
alignedbase := 1; (* simply align by the first base *)
done := true
end
else begin
if eoln(inst)
then readln(inst)
{then rdln(inst)}
else begin
{rd(inst,ch);}
read(inst,ch);
testfortrigger(ch, begincomment);
testfortrigger(ch, begincurly);
if begincomment.found or begincurly.found
then begin
if ch = '*' then begin
skipcomment(inst);
resettrigger(begincomment);
end
else begin
resettrigger(begincurly);
skipcurly(inst);
end
end
else begin (* we are not inside a comment *)
testfortrigger(ch, gettrigger);
if gettrigger.found then begin
findnonblank(inst,ch); (* get to "from" *)
findblank(inst); (* get past "from" *)
read(inst,thebase); (* read in the alignedbase *)
{writeln(output);writeln(output,'thebase = ',thebase:1);}
alignedbase:=pietoint(thebase,pie);
{writeln(output,'alignedbase=',alignedbase:1);}
done := true
end;
testfortrigger(ch, quote1trigger);
if quote1trigger.found then begin
skipquote(quote1trigger);
end;
testfortrigger(ch, quote2trigger);
if quote2trigger.found then begin
skipquote(quote2trigger);
end;
testfortrigger(ch, defaulttrigger);
if defaulttrigger.found then begin
indefault := true;
resettrigger(defaulttrigger)
end;
if ch = semicolon then indefault := false;
testfortrigger(ch, settrigger);
if settrigger.found then begin
indefault := true;
resettrigger(settrigger)
end;
if ch = semicolon then indefault := false;
(* check that piece names are correct *)
testfortrigger(ch, piecetrigger);
if not indefault then if piecetrigger.found then begin
skipblanks(inst); (* get to name *)
with pie^.key.hea.keynam do begin
for p := 1 to length do begin
read(inst,ch);
if letters[p] <> ch then begin
writeln(output,'The piece name in the book: ');
writeln(output,letters:length);
writeln(output,'does not match',
' the inst file name:');
(* write the letters that matched: *)
for p1 := 1 to p-1 do write(output,letters[p]);
(* write the offending letter: *)
write(output, ch);
(* get the rest of the name and show it: *)
done := eoln(inst);
while not done do begin
done := eoln(inst);
if not done then begin
read(inst,ch);
if (ch = ' ') or (ch = ';') then done := true;
if not done then write(output,ch);
end;
end;
writeln(output);
(* mark the first letter that does not match: *)
for p1 := 1 to p-1 do write(output,' ');
write(output,'^');
writeln(output);
halt
end;
end
end;
end;
end
end
end
end;
if (alignedbase <= -maximumrange) or
(alignedbase > length + maximumrange) then begin
writeln(output,' In procedure align:');
writeln(output,' read in base was ',thebase:1);
writeln(output,' in internal coordinates: ',alignedbase:1);
writeln(output,' maximum range was ',maximumrange:1);
writeln(output,' piece length was ',length:1);
with pie^.key.hea.keynam
do writeln(output,' piece name: ',letters:length);
writeln(output,' piece number: ',number:1);
writeln(output,' aligned base is too far away... see the code');
halt
end
end
end
end;
(* end module align.align version = 7.56; {of delmod.p 2000 Nov 21} *)
(* ************************************************************************ *)
(* end module package.align version = 7.56; {of delmod.p 2000 Nov 21} *)
(* begin module flfill *)
procedure flfill (var h, dh, ddh: fltable; n: integer);
(* fills the array h with -f*log(f) for f = i/n, i running from 0 to n;
fills the array dh with h[i+1]-h[i], ddh with h[i+2]-h[i] *)
var freq, (* frequency *)
nreal, (* real value of n *)
ln2: real; (* ln(2) *)
i: integer; (* loop counter *)
begin
h[0] := 0.0;
ln2 := ln(2);
nreal := n;
for i := 1 to n do begin
freq := i/nreal;
h[i] := -freq * ln(freq)/ln2;
dh[i-1] := h[i] - h[i-1];
if i>1 then ddh[i-2] := h[i] - h[i-2];
end;
end;
(* end module flfill *)
(* begin module shift.random.31 *)
procedure random(var rand: real; var iseed: integer);
(* This random number generator is based on a shift register with a single bit
of feedback, as described in Electronics for Neurobiologists, by Brown, Maxfield
and Moraff, MIT press 1973, referencing Random Process Simulation and
Measurement by Korn, McGraw-Hill 1966. A 31-bit shift register is maintained in
the integer iseed, which should initially be given any positive value. This
31-bit number is shifted right one bit and the output of the last (31st) bit and
the 13th bit are added modulo 2 (exclusive orred) and fed back into the new
first bit. This is done between 4 and 11 times, depending on the last 3 bits of
the original number. The result is converted to a real number between 0 and 1
and returned as the value of rand. The 31-bit shift register goes through all
2**31-1 values before repeating; the repetition frequency of this algorithm
could be less or greater depending on the seed, because of the random number of
multiple shifts per call. *)
const pow18=262144; pow19=524288; pow30=1073741824; (* powers of 2 *)
pow31m=2147483647.0; (* 2**31-1 *)
var i, nrep: integer; (* index, number of times to do shift *)
begin (* random *)
nrep := 4 + iseed mod 8; (* do it 4 to 11 times based on last 3 bits *)
for i:= 1 to nrep do
(* if last bit and 9th bit are equal, feed back a 0, otherwise a 1 *)
if( odd(iseed) = ((iseed mod pow19) >= pow18))
then iseed := iseed div 2
else iseed := (iseed div 2) + pow30;
rand := iseed/pow31m;
end; (* random *)
(* end module shift.random.31 *)
(* begin module randbase *)
procedure randbase (var nuc: base; var iseed: integer);
(* generate a random base, with p=0.25 for each base *)
begin
random (rand, iseed);
if rand<0.25 then nuc:=a
else begin
if rand<0.5 then nuc:=c
else begin
if rand<0.75 then nuc:=g
else nuc:=t;
end;
end;
end;
(* end module randbase *)
(* begin module readparam *)
procedure readparam(var pin: text);
(* read and check parameters from parameter file pin *)
begin
reset(pin);
readln(pin, winleft, winright);
readln(pin, shiftmin, shiftmax);
readln(pin, iseedparam);
readln(pin, nranseq);
readln(pin, nshuffle);
readln(pin, ifpaired);
readln(pin, standout);
readln(pin, npassout);
readln(pin, nshiftout);
readln(pin, tolerance);
readln(pin, ntolpass);
if not eof(pin) then begin
readln(pin, alignmenttype);
if not (alignmenttype in ['f','i','b']) then begin
writeln(output,'alignment type must be f, b, or i');
halt
end;
end
else alignmenttype := 'i';
window := winright+1 - winleft;
(* window must be >0 and < maxlen but can be anywhere *)
if (window<=0) or (window>=maxlen) then begin
write(output, 'parameter error: '); writeln(output,
'window is',window,', must be between 1 and',maxlen-1);
halt;
end;
if (shiftmin>0) or (shiftmax<0) then begin
write(output, 'parameter error: '); writeln(output,
'shiftmin must be < or = 0, shiftmax must be > or = 0');
halt;
end;
iseed := iseedparam;
iseed2 := iseed;
if iseed<=0 then begin
write(output, 'parameter error: '); writeln(output,
'random seed must be > 0');
halt;
end;
paired := (ifpaired>0); (* convert ifpaired to boolean *)
if (tolerance<=0.0) or (ntolpass<=0) then begin
write(output, 'parameter error: '); writeln(output,
'tolerance and # of passes below tolerance must be > 0');
halt;
end;
end;
(* end module readparam *)
(* begin module writeparam *)
procedure writeparam(var fout: text);
(* writes standard header of user parameters to an output file *)
begin
rewrite(fout); (* initialize file *)
writeln(fout, '* malign',version:5:2); (* program name and version *)
writeln(fout,'*');
reset(book); (* copy first line of book *)
copyaline(book, fout);
reset(book);
writeln(fout,'*');
writeln(fout, '* user specified parameters:');
writeln(fout,'* ',winleft:5, winright:5,
' window for calculating uncertainty, relative to aligned base');
writeln(fout,'* ',shiftmin:5, shiftmax:5,
' minimum and maximum shift of aligned base');
writeln(fout,'* ',iseedparam, ' random seed');
writeln(fout,'* ',nranseq:6,
' number of random sequences (0 for sequences in book)');
writeln(fout,'* ',nshuffle:6, ' number of runs after random shuffle');
writeln(fout,'* ',ifpaired:6,
' independent sequences (0), or pairs of complementary sequences (1)');
writeln(fout,'* ',standout:6,
' no (-1), limited (0) or full (1) standard output');
writeln(fout,'* ',npassout:6,
' period at which to output H and alignment');
writeln(fout,'* ',nshiftout:6,' period at which to output L and H(L)');
writeln(fout,'* ',tolerance:10:6,' tolerance in change of H');
writeln(fout,'* ',ntolpass:6,
' maximum number of passes below tolerance');
writeln(fout,'*');
end;
(* end module writeparam *)
(* begin module readaligned *)
procedure readaligned(var inst, book: text);
(* read aligned sequences from book and inst files, place in linked list *)
var pie: pieceptr; (* standard pointer to piece *)
commonshift, (* common limit on shifting of paired sequences *)
length, (* length of piece *)
alignedbase, (* relative number of aligned base *)
almax, (* a maximum alignment *)
i: integer; (* array index *)
begin
reset(book); (* reset files *)
reset(inst);
new(pie); (* create space for pieces *)
numseq := 0; (* zero # of sequences *)
while not eof(book) do begin
case alignmenttype of
'i': align(inst, book, theline, pie, length, alignedbase);
'b','f': begin
getpiece(book, theline, pie); (* read in the piece *)
length := piecelength(pie);
alignedbase := -inttopie(1, pie);
end;
end;
if not eof(book) then begin (* I assume there's no sequence if eof *)
numseq := numseq +1; (* increment sequence count *)
if numseq>maxnseq then begin
writeln(output,'procedure readaligned error:');
writeln(output,'number of sequences greater than limit of ',
maxnseq:1);
halt;
end;
if length>maxlen then begin
writeln(output,'procedure readaligned error:');
writeln(output,'sequence ',numseq,', length', length,
', greater than limit of',maxlen);
halt;
end;
new(sep); (* create new space *)
if numseq = 1 then sepfirst := sep (* set up first pointer *)
else seplast^.next := sep; (* or link from last record *)
for i := 1 to length do (* transfer bases *)
sep^.alseq[i] := pie^.dna^.part[i];
sep^.length := length;
(* transfer the name from pie into the data structure *)
sep^.piecename.length := pie^.key.hea.keynam.length;
for i := 1 to sep^.piecename.length
do sep^.piecename.letters[i] := pie^.key.hea.keynam.letters[i];
sep^.alignread := alignedbase;
sep^.alignstart := alignedbase;
sep^.aligncurrent := alignedbase;
sep^.alignmin := alignedbase+shiftmin;
sep^.alignmax := alignedbase+shiftmax;
if alignmenttype <> 'b' then begin
almax := length - winright;
(*
if sep^.alignmax > almax then writeln(output,'modify alignmax!')
else writeln(output,'max nope!');
if sep^.alignmin < (1-winleft) then writeln(output,'modify alignmin!')
else writeln(output,'min nope!');
*)
(* desired minimum alignment; may not be lower than 1-winleft *)
if sep^.alignmin < (1-winleft) then sep^.alignmin := 1-winleft;
(* desired maximum alignment; may not be higher than almax *)
almax := length - winright;
if sep^.alignmax > almax then sep^.alignmax := almax;
end;
if (sep^.alignmaxalignedbase)
then begin
writeln(output,'procedure readaligned error:');
writeln(output,'In sequence ',numseq:1 ,
', the aligned base is too close to one end:');
if (sep^.alignmaxalignedbase)
then writeln(output,'sep^.alignmin>alignedbase');
writeln(output,'alignedbase = ',alignedbase:1);
writeln(output,'shiftmin = ',shiftmin:3);
writeln(output,'shiftmax = ',shiftmax:3);
writeln(output,'winleft = ',winleft:3);
writeln(output,'winright = ',winright:3);
writeln(output,'sep^.alignmin = alignedbase+shiftmin = ',
sep^.alignmin:1);
writeln(output,'sep^.alignmax = alignedbase+shiftmax = ',
sep^.alignmax:1);
if sep^.alignmin <> alignedbase + shiftmin then begin
writeln(output,'EXCEPT that sep^.alignmin < (1-winleft)');
writeln(output,'SO sep^.alignmin = 1-winleft');
writeln(output,'WHERE winleft = ',winleft:1);
end;
halt;
end;
(* if reading second sequence of pair, make shifts mutually limiting *)
if paired and (not odd(numseq)) then begin
(* calculate minimum of allowed shift of last to left and
this one to right, impose that as limit on both shifts *)
commonshift := seplast^.alignread-seplast^.alignmin;
if commonshift > (sep^.alignmax-alignedbase)
then commonshift := sep^.alignmax-alignedbase;
seplast^.alignmin := seplast^.alignread - commonshift;
sep^.alignmax := alignedbase + commonshift;
(* calculate minimum of allowed shift of last to right and
this one to left, impose that as limit on both shifts *)
commonshift := seplast^.alignmax-seplast^.alignread;
if commonshift > (alignedbase- sep^.alignmin)
then commonshift := alignedbase-sep^.alignmin;
seplast^.alignmax := seplast^.alignread + commonshift;
sep^.alignmin := alignedbase - commonshift;
end;
seplast := sep; (* save pointer to put in link on next round *)
end;
end; (* end of while loop searching through book *)
sep^.next := nil;
if numseq = 0 then begin
writeln(output,'no sequences found in the book');
halt;
end;
if paired and odd(numseq) then begin
writeln(output,'since the sequences should be paired, there should');
writeln(output,'be an odd number of them. There are ',numseq:1,'.');
halt;
end;
dispose(pie);
end;
(* end module readaligned *)
(* begin module genrandseqs *)
procedure genrandseqs;
(* generates random sequences and puts them into linked list *)
var seqno, i, (* indices *)
len: integer; (* length of sequences *)
begin
numseq := nranseq; (* number of sequences *)
len := 1+(winright-winleft) + (shiftmax - shiftmin);
for seqno := 1 to numseq do begin
new(sep); (* create new space *)
if seqno = 1 then sepfirst := sep (* set up first pointer *)
else seplast^.next := sep; (* or link from last record *)
for i := 1 to len do randbase(sep^.alseq[i], iseed);
sep^.length := len;
sep^.alignmin := 1-winleft;
sep^.alignread := sep^.alignmin-shiftmin;
sep^.alignmax := sep^.alignread +shiftmax;
sep^.alignstart := sep^.alignread;
sep^.aligncurrent := sep^.alignread;
seplast := sep; (* save pointer to put in link on next round *)
end;
sep^.next := nil;
end;
(* end module genrandseqs *)
(* begin module fillnbl *)
procedure fillnbl;
(* computes the n(B,L) array from the linked list of sequences *)
var b: base; (* temporary for base *)
i, (* array index *)
seqno, (* sequence counter *)
offset: integer; (* offset into sequence *)
begin
sep := sepfirst; (* initialize pointer *)
for b := a to t do (* zero the matrix *)
for i := 1 to window do nbl[b,i] := 0;
for seqno := 1 to numseq do begin
offset := sep^.aligncurrent+winleft-1; (* offset to get this
sequence in the window *)
for i := 1 to window do begin
b := sep^.alseq[i + offset]; (* base at this position *)
nbl[b,i] := nbl[b,i] + 1; (* increment matrix entry *)
end;
sep := sep^.next; (* point to next sequence *)
end;
end;
(* end module fillnbl *)
(* begin module hcalc *)
procedure hcalc(var nbl: matrix; var flogf: fltable; var hl: poslist;
var htot: real; window: integer);
(* calculates H(L) and total H from n(B,L) matrix, using flogf table *)
var i: integer; (* index to positions *)
b: base; (* index to bases *)
temp: real; (* a temporary real (what else?) *)
begin
htot := 0; (* zero total *)
for i := 1 to window do begin
temp := 0; (* add up the flogf's in the ith column *)
for b := a to t do temp := temp + flogf[nbl[b,i]];
htot := htot + temp; (* add to grand total *)
hl[i] := temp;
end;
end;
(* end module hcalc *)
(* begin module changenbl *)
procedure changenbl(var sep: seqrecptr; winlen, increment: integer);
(* subtracts or adds sequence pointed to by sep to the nbl array, up to
the length specified by winlen *)
var i, offset: integer; (* index, offset into sequence for given alignment *)
b: base; (* temporary base *)
begin
with sep^ do begin
offset := aligncurrent+winleft-1; (* offset into sequence *)
for i := 1 to winlen do begin
b := alseq[i+offset]; (* base at this window position *)
nbl[b,i] := nbl[b,i]+increment; (* change nbl for this base *)
end;
end;
end;
(* end module changenbl *)
(* begin module shiftseq *)
procedure shiftseq;
(* shift sequence between allowed alignments and find alignment with minimum
H value *)
var alatmin: array[1..maxlen] of integer; (* alignments with minimum dh *)
dh, (* current value of dh for particular alignment *)
dhcur, (* dh for current alignment *)
dhmin: real; (* running minimum dh value *)
numbermin, (* number of minimum dh's hit *)
alcur, (* working alignment *)
i, offset: integer; (* index, offset into sequence for given alignment *)
(* b: base; (@ temporary base *)
begin
with sep^ do begin
(* subtract off the sequence from the nbl array *)
changenbl(sep, window, -1);
dhmin := 1.0e10;
(* run through each allowed alignment calculating dh *)
for alcur := alignmin to alignmax do begin
dh:= 0.0; (* accumulate dh here *)
offset := alcur +winleft -1;
(* this is the core of the entire program,
determine if the uncertainty h would be lower with
this alignment, by summing the differences from the
current h. That is, calculate dh. *)
for i := 1 to window do
dh:= dh+ fldiff[nbl[alseq[i+offset],i]];
(* save dh if we are at the current alignment
this saves us the trouble of calculating it separately *)
if alcur = aligncurrent then dhcur := dh;
if dh<=dhmin then begin (* if at or below minimum *)
if dh 1 then begin (* resolve ties for minimum if they exist *)
random(rand,iseed2); (* ie make numbermin be index to randomly
selected alignment on list *)
numbermin := trunc(0.99999+(numbermin*rand));
if numbermin = 0 then numbermin := 1;
(*
writeln(output,'See there really are ties! (shiftseq)');
*)
end;
if alatmin[numbermin] = aligncurrent (* if no change in alignment *)
then notchanged := notchanged +1 (* it's true of one more sequence *)
else begin
notchanged := 0; (* otherwise zero count of unchanged sequences *)
hcurrent := hcurrent + (dhmin-dhcur); (* adjust current h value *)
end;
aligncurrent := alatmin[numbermin];
(* add the sequence back to the nbl array at new alignment*)
changenbl(sep, window, 1);
end;
end;
(* end module shiftseq *)
(* begin module doubleshift *)
procedure doubleshift;
(* shift sequence between allowed alignments and find alignment with minimum
H value, for paired complementary sequences *)
var alatmin: array[1..maxlen] of integer; (* alignments with minimum dh *)
dh, (* current value of dh for particular alignment *)
dhcol, (* dh for one column *)
dhcur, (* dh for current alignment *)
dhmin: real; (* running minimum dh value *)
numbermin, (* number of minimum dh's hit *)
alcur, (* working alignment *)
halfwindow, (* size of window actually being calculated for *)
offsetnex, (* offset into next sequence *)
i, offset: integer; (* index, offset into sequence for given alignment *)
b, bn: base; (* base in this and next sequence *)
nxp: seqrecptr; (* pointer to paired sequence *)
begin
halfwindow := (window+1) div 2; (* length of unique part of window *)
nxp := sep^.next; (* point to next sequence *)
(* subtract off the sequences from the nbl array *)
(* NOTE WELL: the right half of nbl goes to pot during this process *)
changenbl(sep, halfwindow, -1);
changenbl(nxp, halfwindow, -1);
dhmin := 1.0e10;
(* run through each allowed alignment calculating dh *)
for alcur := sep^.alignmin to sep^.alignmax do begin
dh:= 0.0; (* accumulate dh here *)
offset := alcur +winleft -1;
offsetnex := (nxp^.aligncurrent+sep^.aligncurrent-alcur)+winleft-1;
for i := 1 to halfwindow do begin
b := sep^.alseq[i+offset];
bn := nxp^.alseq[i+offsetnex];
(* if bases are equal dh for the column is fldoubl value,
otherwise it's the sum of fldiff's for the two bases *)
if b=bn then dhcol := fldoubl[nbl[b,i]]
else dhcol := fldiff[nbl[b,i]] + fldiff[nbl[bn,i]];
dh:= dh+ dhcol;
end;
(* dh is twice this for the whole window, minus dh for the last
column for an odd window *)
if odd(window) then dh := dh+dh-dhcol else dh := dh+dh;
(* save dh if it's the current alignment *)
if alcur = sep^.aligncurrent then dhcur := dh;
if dh<=dhmin then begin (* if at or below minimum *)
if dh 1 then begin (* resolve ties for minimum if they exist *)
random(rand,iseed2); (* ie make numbermin be index to randomly
selected alignment on list *)
numbermin := trunc(0.99999+(numbermin*rand));
if numbermin = 0 then numbermin := 1;
(*
writeln(output,'See there really are ties! (doubleshift)');
*)
end;
if alatmin[numbermin] = sep^.aligncurrent (* if no change in alignment *)
then notchanged := notchanged +2 (* it's true of two more sequences *)
else begin
notchanged := 0; (* otherwise zero count of unchanged sequences *)
hcurrent := hcurrent + (dhmin-dhcur); (* adjust current h value *)
end;
sep^.aligncurrent := alatmin[numbermin];
nxp^.aligncurrent := nxp^.alignread+sep^.alignread-sep^.aligncurrent;
(* add the sequences back to the nbl array at new alignment*)
changenbl(sep, halfwindow, +1);
changenbl(nxp, halfwindow, +1);
end;
(* end module doubleshift *)
(* begin module hloutput *)
procedure hloutput (var fout:text; npass, shiftno: integer);
(* output uncertainty as a function of position, H(L), for pass npass and
shift number shiftno *)
(* modificiation for xyplo program format: all data are in columns *)
var i: integer; (* index to positions *)
begin
for i := 1 to window do begin
(* pass number, shift number *)
write (fout, ' ',npass:5, ' ',shiftno:5);
(* position L, Hs(L) *)
writeln(fout,' ',(winleft+i-1):3,' ',hl[i]:10:6);
end
end;
(* end module hloutput *)
(* begin module alignoutput *)
procedure alignoutput (var fout:text; npass: integer);
(* outputs current H, pass number npass, and alignments *)
var seqno: integer; (* index to sequences *)
begin
sep := sepfirst;
writeln(fout, 'pass', npass:6,', H =',hcurrent:12:7,
' relative alignments:');
for seqno := 1 to numseq do begin
write(fout, sep^.aligncurrent-sep^.alignread:4);
sep := sep^.next;
if (seqno mod 20) = 0 then writeln(fout);
end;
if(numseq mod 20) <> 0 then writeln(fout);
end;
(* end module alignoutput *)
(* begin module albestout *)
procedure albestout(var fout: text; oap: optlistptr;
numperline, field: integer);
(* outputs the array oap^.albest, numperline per line, field width = field *)
var seqno: integer; (* index to sequences *)
begin
for seqno := 1 to numseq do begin
write(fout, oap^.albest[seqno]:field);
if(seqno mod numperline)=0 then writeln(fout);
end;
if (numseq mod numperline) <>0 then writeln(fout);
end;
(* end module albestout *)
(* begin module instoutput *)
procedure instoutput(var fout: text; oap: optlistptr);
(* outputs a new inst file for alignment pointed to by oap*)
var seqno: integer; (* index to sequences *)
i: integer; (* index to each piece name *)
thesign: integer; (* -1 or +1, defining the orientation of
the Delila instruction *)
function sign(i: integer): char;
begin
if i >= 0
then sign := '+'
else sign := '-'
end;
begin
rewrite(fout);
writeln(fout,'title "malign ',version:4:2,'";');
writeln(fout,'(* begin module numbering *)');
writeln(fout,'(* number only the pieces, starting at 1 *)');
writeln(fout,'default numbering piece;');
writeln(fout,'default numbering 1;');
writeln(fout,'default out-of-range reduce-range;');
writeln(fout,'(* end module numbering *)');
writeln(fout,'(* data on this alignment:');
writeln(fout,'The H value: ', oap^.hbest:infofield:infodecim);
writeln(fout,'Number of times achieved: ',oap^.num:1);
writeln(fout,'*)');
writeln(fout);
writeln(fout,'organism org;');
writeln(fout,'chromosome chr;');
sep := sepfirst;
thesign := +1; (* if not paired, sign is always +1 *)
for seqno := 1 to numseq do begin
write(fout,'piece ');
for i := 1 to sep^.piecename.length
do write(fout,sep^.piecename.letters[i]);
sep := sep^.next;
write(fout,';');
writeln(fout,' (* piece ',seqno:1,' *)');
(*
writeln(fout,'get from', oap^.albest[seqno]:10,' -x',
' to ', oap^.albest[seqno]:10,' +y;');
*)
if paired and (not odd(seqno))
then thesign := -1
else thesign := +1;
writeln(fout,'get from', oap^.albest[seqno]:10,' ',
sign(thesign*winleft),abs(winleft):1,
' to ', 'same' ,' ',
sign(thesign*winright),abs(winright):1,';')
(* old method: before 'same' command:
' to ', oap^.albest[seqno]:10,' ',
sign(thesign*winright),abs(winright):1,';')
*)
end;
end;
(* end module instoutput *)
(* begin module optalignout *)
procedure optalignout(var fout: text);
(* outputs relative alignments of the set of optimal alignments in a format
usable to the user *)
var optno, (* counter of optimal alignments *)
seqno: integer; (* index to sequences *)
begin
writeparam(fout); (* initialize, write params to file *)
writeln(fout,
'* relative aligned bases for the set of optimal alignments');
writeln(fout);
writeln(fout, numseq:5,' sequences,',numopt:5,' optimal alignments in',
shuffle:6,' runs');
writeln(fout, horiginal:12:7,' = H for original alignment');
oap := oapfirst;
for optno := 1 to numopt do begin (* do each optimal alignment in turn *)
writeln(fout); (* write blank line then n and H *)
writeln(fout,optno:4,')',oap^.num:6,' occurrences, H =',
oap^.hbest:12:7, ', relative aligned bases:');
sep := sepfirst;
(* run through the sequences computing relative alignments as
optimal alignment minus original alignment *)
for seqno := 1 to numseq do begin
write(fout, oap^.albest[seqno]-sep^.alignread:4);
if(seqno mod 20)=0 then writeln(fout);
sep := sep^.next;
end;
if (numseq mod 20) <>0 then writeln(fout);
oap := oap^.next;
end;
end;
(* end module optalignout *)
(* begin module optinstout *)
procedure optinstout(var fout: text);
(* outputs optimal alignments in absolute DNA coordinates for conversion
to new inst files later *)
(* should use inttopie routines: does it? *)
var seqno: integer; (* index to sequences *)
begin
rewrite(fout); (* initialize file *)
writeln(fout,numseq,' ',numopt); (* first line: # of sequences, aligns *)
oap := oapfirst;
repeat (* for each optimal alignment *)
writeln(fout,oap^.num:5,' ',oap^.hbest:12:7); (* write num and H *)
for seqno := 1 to numseq do begin
write(fout, ' ',oap^.albest[seqno]:8); (* these are absolute now! *)
if (seqno mod 10) = 0 then writeln(fout);
end;
if(numseq mod 10) <> 0 then writeln(fout);
oap := oap^.next;
until oap = nil;
end;
(* end module optinstout *)
(* begin module optoutput *)
procedure optoutput;
(* calls procedure to output optimal alignments in readable relative form,
then converts all optimal alignments to absolute DNA coordinates and
calls procedures to output them *)
var seqno: integer; (* index to sequences *)
pie: pieceptr; (* pointer for piece *)
length, alignedbase: integer; (* for call to align only *)
{ oapprevious: optlistptr; (* the previous alignment *)}
begin
optalignout(optalign); (* output readable relative optimal alignments *)
reset(book); (* prepare to read book and inst again *)
reset(inst);
new(pie);
for seqno := 1 to numseq do begin (* read each sequence in turn *)
case alignmenttype of
'i': align(inst, book, theline, pie, length, alignedbase);
'b','f': begin
getpiece(book, theline, pie); (* read in the piece *)
length := piecelength(pie);
alignedbase := -inttopie(1, pie);
end;
end;
oap := oapfirst;
(* then for each optimal alignment, convert the internal alignment
for that sequence to absolute DNA coordinate *)
repeat
oap^.albest[seqno] := inttopie(oap^.albest[seqno],pie);
oap := oap^.next;
until oap = nil;
end;
optinstout(optinst); (* output file of absolute alignments *)
{
instoutput(bestinst, oapfirst); (* file of new inst for best alignment *)
}
end;
(* end module optoutput *)
(* begin module calehnb *)
procedure calehnb(n: integer;
gna,gnc,gng,gnt: integer;
var hg, ehnb, varhnb: real
(* ; debugging: boolean *));
(* calculate e(hnb) in bits/bp (ehnb) for a number (n) of example sequence
sites. gna to gnt are the composition to use for the genome probabilities
of a to t. the genomic entropy hg and the variance var(hnb)
(=varhnb) are also calculated.
if the variable debugging is passed to the procedure then the individual
combinations of hnb are displayed.
note: this procedure should not be broken into smaller
procedures so that it remains efficient.
version = 3.02; of procedure calehnb 1983 nov 23 *)
const
maxsize = 200; (* the largest n allowed *)
accuracy = 10000; (* less than (1/accuracy) bits error is demanded
for the sum of pnb (see variable 'total') at the end of the procedure *)
var
log2: real; (* natural log of 2, used to find log base 2 *)
logn: real; (* log of n *)
nlog2: real; (* n * log2 *)
gn: integer; (* sum of gna..gnt *)
logpa,logpc,logpg,logpt: real; (* logs of genome probabilities *)
(* log of n factorial is the sum of i=1 to n of log(i).
the array below represents these logs up to n *)
logfact: array[0..maxsize] of real;
(* precalculated values of -p*log2(p), where p=nb/n for
nb = 0 .. n. m stands for minus *)
mplog2p: array[0..maxsize] of real;
i: integer; (* index for logfact and mplog2p *)
logi: real; (* natural log of i *)
na, nc, ng, nt: integer; (* numbers of bases in a site *)
done: boolean; (* true when the loop is completed *)
pnb: real; (* multinomial probability of a combination
of na, nc, ng, nt *)
hnb: real; (* entropy for a combination of na..nt *)
pnbhnb: real; (* pnb*hnb, an intermediate result *)
sshnb: real; (* sum of squares of hnb *)
(* variables for testing program correctness: *)
total: real; (* sum of pnb over all combinations of na..nt
if this is not 1.00, the program is in error *)
counter: integer; (* counts the number of times through
the loop *)
begin (* calehnb *)
(* prevent access to outside the arrays: *)
if n > maxsize then begin
writeln(output,' procedure calehnb: n > maxsize (',
n:1,'>',maxsize:1,')');
halt
end;
counter := 0;
total := 0.0;
log2 := ln(2);
logn := ln(n);
nlog2 := n * log2;
(* get logs of genome probabilities *)
gn := gna + gnc + gng + gnt;
logpa := ln(gna/gn);
logpc := ln(gnc/gn);
logpg := ln(gng/gn);
logpt := ln(gnt/gn);
(* find genomic entropy *)
hg := -(gna*logpa + gnc*logpc + gng*logpg + gnt*logpt)/(gn*log2);
ehnb := 0.0; (* start error entropy at zero *)
sshnb := 0.0;
(* make table of log of n factorial up to n
and entropies for nb/n *)
logfact[0] := 0.0; (* factorial(0) = 0 *)
mplog2p[0] := 0.0;
for i := 1 to n do begin
logi := ln(i);
logfact[i] := logfact[i - 1] + logi;
mplog2p[i] := - i * (logi - logn) / nlog2
end;
(* begin by looking at the combination with all a: na = n *)
na := n;
nc := 0;
ng := 0;
nt := 0;
(* the following loop simulates a number of nested loops
of the form:
for b1=a to t do
for b2=b1 to t do
for b3=b2 to t do
...
for bn=b(n-1) to t do ...
the resulting set of variables increase in alphabetic order
since no inner loop variable can have a value less than any
outer loop. the number of times through the inner-most loop
is given by:
o = (n + 1)*(n + 2)*(n + 3)/6
in the case where there are four symbols (a,c,g,t) and n is
the number of nested loops.
a recursive set of loops would be possible, but it
would use up too much memory in practical cases (up to n=150
or higher). a second algorithm sequests the loop variables
into an array and increments them there. however, the goal
is to get all possible combinations for na, nc, ng, nt, where
the sum of these is n. the nested loops provide all the
combinations in alphabetic order, assuring that there can not
be any duplicates. to find nb (one of na..nt) one would look
at which of the variables b1 to bn were of value b. this is
a wasteful operation.
the loop below simulates the array of control variables
by changing each nb directly.
*)
done := false;
repeat
(* pnb is calculated by taking the log of the expression
fact(n) na nc ng nt
pnb = ------------------- pa * pc * pg * pt .
fact(na).. fact(nt)
log(pnb) generates a series of sums, allowing
the calculation to proceed by addition and
multiplication rather than multiplication and
exponentiation. the factorials become tractable
in this way *)
pnb := exp(logfact[n] (* n factorial *)
- logfact[na]
- logfact[nc]
- logfact[ng]
- logfact[nt]
+ na * logpa
+ nc * logpc
+ ng * logpg
+ nt * logpt);
hnb := mplog2p[na]
+ mplog2p[nc]
+ mplog2p[ng]
+ mplog2p[nt];
pnbhnb := pnb * hnb;
ehnb := ehnb + pnbhnb;
sshnb := sshnb + pnbhnb * hnb; (* sum of squares of hnb *)
(* the following section keeps track of the calculation
and writes out the current set of nb. *)
counter := succ(counter);
(* if debugging then begin
write(output,' ',counter:2,' ');
for i := 1 to na do write(output,'a');
for i := 1 to nc do write(output,'c');
for i := 1 to ng do write(output,'g');
for i := 1 to nt do write(output,'t');
write(output,' ',na:3,nc:3,ng:3,nt:3);
writeln(output,' pnb = ',pnb:10:5);
end; *)
total := total + pnb;
(* the remaining portion of this repeat loop generates
the values of na, nc, ng and nt. notice that
there are 7 possibilities at each loop increment.
other than the stop, in each case the sum of
na+nc+ng+nt remains constant (=n). *)
if nt > 0
then begin (* ending on a t - do outer loops *)
if ng > 0
then begin (* turn g into t *)
ng := ng - 1;
nt := nt + 1
end
else if nc > 0
then begin (* turn one c into g,
and all t to g (note ng = 0 initially) *)
nc := nc - 1;
ng := nt + 1;
nt := 0
end
else if na > 0
then begin (* turn one a into c and
all g and t to c. (note ng=nc=0 initially) *)
na := na - 1;
nc := nt + 1;
nt := 0
end
else done := true (* since nt = n *)
end
else begin (* no t - increment innermost loop *)
if ng > 0
then begin (* turn g into t *)
ng := ng - 1;
nt := nt + 1
end
else if nc > 0
then begin (* turn c into g *)
nc := nc - 1;
ng := ng + 1
end
else begin (* na > 0; turn a into c *)
na := na - 1;
nc := nc + 1
end
end
until done;
(* final adjustment: we only have the sum of squares so far *)
varhnb := sshnb - ehnb*ehnb;
(* if this message appears, there is either a bug in the code or
the computer cannot be as accurate as requested *)
if accuracy <> round(accuracy * total) then begin
writeln(output,' procedure calehnb: the sum of probabilities is');
writeln(output,' not accurate to one part in ',accuracy:1);
writeln(output,' the sum of the probabilities is ',total:10:8);
end;
(* if this message appear, then there is an error in the
repeat-until loop: it did not repeat as many times as
is expected from the algorithm *)
if counter <> round((n+1)*(n+2)*(n+3)/6) then begin
writeln(output,' procedure calehnb: program error, the number of');
writeln(output,' calculations is in error');
halt
end;
(* writeln(output, ' total: ',total:10:5);
writeln(output,' count = ',counter:1);
writeln(output,' (n+1)*(n+2)*(n+3)/6 = ',
round((n+1)*(n+2)*(n+3)/6):1); *)
end; (* calehnb *)
(* end module calehnb version = 5.35; (@ of rseq.p 1995 Feb 21 *)
(* begin module calaehnb *)
procedure calaehnb(n: integer;
gna,gnc,gng,gnt: integer;
var hg, aehnb, avarhnb: real);
(* calculate ae(hnb) in bits/bp (=aehnb) for a number (n) of example sequence
sites. gna to gnt are the composition to use for the genome probabilities
of a to t. the genomic entropy (=hg) and the variance avar(hnb) (=avarhnb) are
also calculated.
this procedure gives approximations for e(hnb) and var(hnb).
it is based on a formula derived by jeff haemer.
see also:
g. p. basharin
theory probability appl. 4(3): 333-336 (1959)
'on a statistical estimate for the entropy of a sequence
of independent random variables'
version = 3.01; of procedure calaehnb 1983 nov 23 *)
var
log2: real; (* natural log of 2 *)
gn: integer; (* sum of gna..gnt *)
pa, pc, pg, pt: real; (* genomic probabilities *)
e: real; (* the approximate sampling error *)
begin (* calaehnb *)
log2 := ln(2);
gn := gna + gnc + gng + gnt;
pa := gna/gn;
pc := gnc/gn;
pg := gng/gn;
pt := gnt/gn;
hg := -( (pa*ln(pa))
+(pc*ln(pc))
+(pg*ln(pg))
+(pt*ln(pt)))/log2;
e := 3/(2*log2*n);
aehnb := hg - e;
avarhnb := e * e
end; (* calaehnb *)
(* end module calaehnb version = 5.35; (@ of rseq.p 1995 Feb 21 *)
(* begin module xyoutput *)
procedure xyoutput(var malignxyin: text);
(* produce the malignxyin file *)
var
Hbefore: real; (* Hbefore for calculating information content *)
rank: integer; (* ranking of the alignment *)
Ehnb: real; (* small sample correction: 2 - e(n) *)
gna,gnc,gng,gnt: integer; (* genomic composition *)
hg: real; (* genomic uncertainty *)
varhnb: real; (* var(hnb) *)
begin
rewrite(malignxyin);
writeln(malignxyin,'* malign',version:5:2);
writeln(malignxyin,'* columns definitions:');
writeln(malignxyin,'* 1: rank: from 1 to the number of alignment classes');
writeln(malignxyin,'* 2: occurrences: how many times the class was found');
writeln(malignxyin,'* 3: H: the uncertainty of the alignment, in bits');
writeln(malignxyin,'* 4: R: the information content of the alignment, in bits');
oap := oapfirst;
rank := 0;
gna := 1000; gnc := 1000; gng := 1000; gnt := 1000;
(* compute sample size correction for numseq *)
if numseq <= kickover
then begin (* not using approximation *)
calehnb (numseq, gna,gnc,gng,gnt, hg, Ehnb, varhnb);
end
else begin (* using approximation *)
calaehnb(numseq, gna,gnc,gng,gnt, hg, Ehnb, varhnb);
end;
Hbefore := Ehnb * (winright - winleft + 1);
repeat
rank := succ(rank);
write(malignxyin,' ',rank:10);
write(malignxyin,' ',oap^.num:10);
write(malignxyin,' ',oap^.hbest:infofield:infodecim);
write(malignxyin,' ',(Hbefore - oap^.hbest):infofield:infodecim);
writeln(malignxyin);
oap := oap^.next;
until oap = nil;
end;
(* end module xyoutput *)
(* begin module realign *)
procedure realign;
(* shifts the window on all sequences so that as many sequences as possible
keep their original alignment *)
var alhist: array[1..maxchange] of integer; (* histogram of align changes *)
histmax, (* maximum value in histogram *)
binmax, (* bin in histogram with max value *)
bin, (* index to bins *)
alshift, (* alignment shift temporary value *)
alshiftmin, alshiftmax: integer; (* min and max allowed grand shifts *)
begin
for bin := 1 to maxchange do alhist[bin] := 0;
alshiftmin := -10000; (* get min and max possible changes *)
alshiftmax := 10000;
sep := sepfirst;
repeat (* form a histogram of changes from original alignment *)
bin := maxchange div 2 + sep^.alignread - sep^.aligncurrent;
if (bin>0) and (bin<=maxchange) then alhist[bin] := alhist[bin]+1;
alshift := sep^.alignmin-sep^.aligncurrent;
if alshift>alshiftmin then alshiftmin := alshift;
alshift := sep^.alignmax-sep^.aligncurrent;
if alshifthistmax) then begin
histmax := alhist[bin];
binmax := bin;
end;
if alhist[binmax]>minmaxforchange then begin
globshift := binmax-maxchange div 2; (* convert to alignment shift *)
if globshift>alshiftmax then globshift := alshiftmax;
if globshift0) and (sep<>nil) do begin
sep^.aligncurrent:= sep^.aligncurrent+globshift;
sep := sep^.next;
end;
end;
end;
(* end module realign *)
(* begin module findbestalignment *)
procedure findbestalignment;
(* does repeated passes through the set of sequences until "convergence" *)
var shiftno, (* shift number *)
npass: integer; (* number of passes *)
htemp: real; (* temporary h value *)
ifhlout: boolean; (* flag to do H(L) output this run *)
dohlout: boolean; (* flag to do H(L) output this shift *)
begin
ifhlout := (shuffle>nshuffle) and (nshiftout>=0);
(* do H(L) output if this is the last run and nshiftout >= 0 *)
if ifhlout then begin
(* writeparam(uncert); (@ initialize and write params to file *)
writeln(uncert,
'* Position, L, and uncertainty, H(L), as the alignment improves');
writeln(uncert, '*');
writeln(uncert, '* ',window, ' positions');
writeln(uncert,'* pass number, shift number, position L, H(L)');
hloutput(uncert, 0, 0); (* output starting H(L) *)
end;
if npassout>=0 then begin (* if outputing new alignments *)
writeln(newalign); (* blank line *)
writeln(newalign, 'run number:', shuffle:6,', original random seed:',
iseedsave:15);
alignoutput(newalign, 0); (* output starting alignments *)
end;
notchanged := 0; (* # of consecutive sequences with no change *)
intolerance := 0; (* # of passes with change in H less than tolerance *)
npass := 0;
repeat
npass := npass +1; (* increment pass number *)
hpass := hcurrent; (* save h at beginning of pass *)
sep := sepfirst;
shiftno := 0;
repeat (* shift sequences until all done or no change in one pass *)
shiftno := shiftno+1; (* increment shift number *)
if paired
then begin (* for pairs, do shift every other time *)
if odd(shiftno) then doubleshift;
end
else shiftseq;
sep := sep^.next;
(* if doing H(L) output, do it if at the end of a pass of (or?) if
nshiftout shifts have been done *)
if ifhlout then begin
dohlout := (shiftno=numseq);
if nshiftout > 0
then dohlout := dohlout or ((shiftno mod nshiftout) = 0);
if dohlout then begin
if paired then fillnbl; (* fix nbl if doing paired *)
hcalc(nbl, flogf, hl, htemp, window); (* get H and H(L) *)
hloutput(uncert, npass, shiftno);
end;
end;
until (sep=nil) or ((notchanged>numseq)and(globshift=0));
(* terminate in the middle of a pass only if the last realignment
attempt made no global shift of window on sequences *)
if (hpass-hcurrent)0 then writeln(output,shuffle:6, npass:6, hcurrent:13:7);
if npassout>0
then if (npass mod npassout) = 0
then alignoutput(newalign, npass);
until (intolerance>ntolpass) or (notchanged>=numseq);
if standout=0 then writeln(output,shuffle:6, npass:6, hcurrent:13:7);
(* 2005 Feb 7: TDS. The following code finally crashed because
mod of a negative value is undefined. Duh.
Follow the definition!!! Geepers.
if npassout=0
then alignoutput(newalign, npass)
else if (npass mod npassout) <> 0
then alignoutput(newalign, npass);
*)
if npassout=0
then alignoutput(newalign, npass)
else if npassout > 0 then begin
if (npass mod npassout) <> 0
then alignoutput(newalign, npass);
end;
end;
(* end module findbestalignment *)
(* begin module randomalign *)
procedure randomalign;
(* generates a new random starting alignment *)
var altrunc, (* randomly selected value with range of alignments *)
alrange: integer; (* range of allowed alignments *)
begin
sep := sepfirst;
repeat
random(rand,iseed);
alrange := sep^.alignmax-sep^.alignmin; (* range of alignments *)
altrunc := trunc(rand*(alrange+1));
if(altrunc>alrange) then altrunc := alrange;
sep^.alignstart := sep^.alignmin + altrunc;
sep^.aligncurrent := sep^.alignstart;
sep := sep^.next;
if paired then begin (* if paired adjust next sequence by reverse *)
sep^.alignstart := sep^.alignmax-altrunc;
sep^.aligncurrent := sep^.alignstart;
sep := sep^.next;
end;
until sep=nil;
end;
(* end module randomalign *)
(* begin module optcheckadd *)
procedure optcheckadd;
(* checks if current best alignment is on list of optimal alignments; if so,
increments count for that alignment; if not, adds it to list *)
var match, (* flag if match existing one on list *)
termloop: boolean; (* flag to terminate loop *)
seqno: integer; (* index to sequences *)
begin
match := false; (* flag for match to existing ones on list *)
termloop := false; (* flag to terminate loop *)
oap := oapfirst; (* initialize to go through list *)
oaplast := nil; (* keep this pointing to last record *)
(* go through list until there's a match or until oap points to first
alignment with h greater than hcurrent *)
if oap<>nil then if hcurrent>=oap^.hbest then repeat
if hcurrent = oap^.hbest then begin (* if h's match then *)
match := true; (* it's a provisional match *)
sep := sepfirst;
seqno := 1;
(* run through alignments until one doesn't match *)
while match and (seqno<=numseq) do begin
if oap^.albest[seqno]<>sep^.aligncurrent then match := false;
seqno := seqno+1;
sep := sep^.next;
end;
(* if still match after check, increment number *)
if match then oap^.num := oap^.num+1;
end;
oaplast := oap;
oap := oap^.next;
if oap=nil then termloop := true else
if hcurrent=0 then begin
writeparam(newalign); (* write params to alignment file *)
writeln(newalign, '* relative alignments at start, end of each run',
' and during run if npassout>0');
end;
if standout>=0 then writeln(output,' run pass uncertainty');
shuffle := 0;
oapfirst := nil; (* pointer to first entry of list of optimal aligns *)
numopt := 0; (* number of optimal alignments *)
iseedsave := iseed;
repeat
shuffle := shuffle +1;
fillnbl; (* fill the nbl array for the current alignment *)
hcalc(nbl, flogf, hl, hcurrent, window); (* calculate H and H(L) *)
if shuffle=1 then horiginal := hcurrent; (* save original h *)
findbestalignment;
optcheckadd; (* add alignment to list if it's unique new one *)
iseedsave := iseed;
randomalign;
until shuffle > nshuffle;
optoutput; (* output files associated with optimum alignments *)
xyoutput(malignxyin);
1: end.