program fitgauss(histog, fitgaussp, gfit, output); (* fitgauss: fit a gaussian distribution to a histogram Tom Schneider NCI/FCRDC Bldg 469. Room 144 P.O. Box B Frederick, MD 21702-1201 (301) 846-5581 (-5532 for messages) permanent email: toms@alum.mit.edu toms@ncifcrf.gov http://www.lecb.ncifcrf.gov/~toms/ National Cancer Institute Laboratory of Experimental and Computational Biology *) label 1; (* end of program *) const (* begin module version *) version = 1.07; (* of fitgauss.p 1999 December 4 origin 1999 Dec 3 *) updateversion = 1.00; (* defines lowest acceptable current parameter file *) (* end module version *) (* begin module describe.fitgauss *) (* name fitgauss: fit a gaussian distribution to a histogram synopsis fitgauss(histog: in, fitgaussp: in, gfit: out, output: out) files histog: the output of the genhis program, consisting of comment lines that begin with "*" and data lines that have pairs of bin position and number of items at that bin position. gfit: Same format as histog, but a gaussian distribution that is a least squares fit to the histog data. fitgaussp: parameters to control the program. The file must contain the following parameters, one per line: parameterversion: The version number of the program. This allows the user to be warned if an old parameter file is used. range (integer): The range to fit over. output: messages to the user description The fitgauss program fits a gaussian distribution to a histogram. examples documentation see also genhis.p, smoothis.p fitgaussp author Thomas Dana Schneider bugs technical notes memmax is a constant that defines the largest allowed range. *) (* end module describe.fitgauss *) (* begin module fitgauss.const *) wid = 10; (* width of numbers to use for data *) dec = 5; (* number of decimal places to use for data *) memmax = 10000; (* largest allowed gfiting range *) (* end module fitgauss.const *) var histog, (* file used by this program *) fitgaussp, (* file used by this program *) gfit: text; (* file used by this program *) (* begin module halt *) procedure halt; (* stop the program. the procedure performs a goto to the end of the program. you must have a label: label 1; declared, and also the end of the program must have this label: 1: end. examples are in the module libraries. this is the only goto in the delila system. *) begin writeln(output,' program halt.'); goto 1 end; (* end module halt version = 'delmod 6.16 84 mar 12 tds/gds'; *) (* begin module copyaline *) procedure copyaline(var fin, fout: text); (* copy a line from file fin to file fout *) begin (* copyaline *) while not eoln(fin) do begin fout^ := fin^; put(fout); get(fin) end; readln(fin); writeln(fout); end; (* copyaline *) (* end module copyaline version = 'delmod 6.51 85 apr 17 tds/gds' *) (* begin module fitgauss.themain *) procedure themain(var histog, fitgaussp, gfit: text); (* the main procedure of the program *) const debug = false; (* set to true to see debugging output *) debugwid = 5; (* width of numbers to use for debugging data *) debugdec = 2; (* number of decimal places to use for debugging data *) countmax = 10; (* how many data pieces to look at before halting while debugging *) pi = 3.14159265358974; (* used in calculating the gaussian curve *) var mean: real; (* the mean of values in the memory *) count: integer; (* counter for limiting debug output *) points: integer; (* number of points in the histogram *) memory: record (* the histogram to work with *) position: array[1..memmax] of real; (* positions of numbers *) value: array[1..memmax] of real; (* memory of numbers *) gauss: array[1..memmax] of real; (* memory of gaussian curve *) end; p: integer; (* index to the position in the histogram *) parameterversion: real; (* parameter version number *) range: integer; (* range to average over *) total: real; (* current total in the memory *) sumsquare: real; (* sum of squares *) stdev: real; (* sum of squares *) sumrange: real; (* sum range values *) sumsqrange: real; (* sum square range values *) weighted: real; (* partially computed weighting value *) s: real; (* partially computed weighting value *) (* stuff for gaussian: *) d1, (* first denominator, = 2*sd**2 *) d2, (* second denominator, = sd*sqrt(2*pi) *) x, (* the position for which the expectation is calculated *) ex: real; (* probability of getting a value in the interval *) number: real; (* predicted gaussian value *) interval: real; (* interval for computing probabilty *) entries: real; (* number of original data points *) printing: boolean; (* whether to print *) done: boolean; (* done finding best fit *) deviation: real; (* deviation from best fit *) newdeviation: real; (* newly computed deviation *) currentmean: real; (* current mean *) currentstdev: real; (* current stdev *) currententries: real; (* current entries *) fractionstep: real; (* the fractional step *) watchprogress: boolean; (* watch the gaussian curve converge? *) tolerance: real; (* fractional tolerance for convergence *) delta: real; (* current fractional tolerance for convergence *) maxsteps: integer; (* maximum number of steps to converge *) {zzz} procedure vary(var p: real; fraction: real); (* vary the variable p by some fractional amount *) var seed: real; (* a random number *) variation: real; (* the amount of variation *) begin seed := random(0); variation := 2.0 * (seed - 0.5) * fraction; { writeln(output,'seed = ',seed:1:dec, ' variation = ',variation:1:dec); } p := p + p * variation; end; procedure computegauss; (* compute gaussian curve, store in memory *) var p: integer; (* index to the position in the histogram *) begin d1 := 2 * stdev*stdev; d2 := stdev * sqrt(2 * pi); interval := memory.position[2] - memory.position[1]; for p := 1 to points do begin ex := interval * exp( -(memory.position[p] - mean) * (memory.position[p] - mean)/d1) / d2; memory.gauss[p] := ex * entries; end; end; procedure printgauss; (* print gaussian curve from memory *) var p: integer; (* index to the position in the histogram *) begin for p := 1 to points do begin writeln(gfit, memory.position[p]:wid:dec, ' ',memory.gauss[p]:wid:dec, ' ',memory.value[p]:wid:dec); end; end; procedure computedeviation(var deviation: real); (* compute the least squares deviation of the gaussian curve from the observed values *) var p: integer; (* index to the position in the histogram *) diff: real; (* a single difference *) sum: real; (* sum of differences squared *) begin deviation := 0.0; for p := 1 to points do begin diff := memory.value[p] - memory.gauss[p]; deviation := deviation + diff * diff; end; { writeln(output,'deviation is: ',deviation:wid:dec); } end; {zzz} begin writeln(output,'fitgauss ',version:4:2); reset(fitgaussp); readln(fitgaussp, parameterversion); if parameterversion < updateversion then begin writeln(output, 'You have an old parameter file!'); halt end; { readln(fitgaussp, range); if range < 1 then begin writeln(output,'range must be positive'); halt end; if range > memmax then begin writeln(output,'range must be less than memmax (',memmax:1,')'); halt end; } reset(histog); rewrite(gfit); writeln(gfit,'* fitgauss ',version:4:2); { writeln(output,'range is ',range:1); } points := 0; total := 0; sumsquare := 0; sumrange := 0; sumsqrange := 0; if debug then count := 0; (* read in the histogram *) while not eof(histog) do begin if (eoln(histog)) or (histog^ = '*') then begin copyaline(histog, gfit); end else begin (* pull in the new value *) points := succ(points); if points > memmax then begin writeln(output,'histogram must be shorter than memmax (', memmax:1,')'); halt end; readln(histog, memory.position[points], memory.value[points]); (* add in the new value to the running total: *) weighted := memory.value[points]*memory.position[points]; total := total + weighted; entries := entries + memory.value[points]; sumrange := sumrange + memory.value[points]; { sumsquare := sumsquare + (weighted * weighted); sumsqrange := sumsqrange + memory.value[points]*memory.value[points]; } if debug then writeln(output, memory.position[points]:wid:dec, ' ',memory.value[points]:wid:dec); if debug then write(output, ' nverage: ', mean:debugwid:debugdec, ' total: ',total:debugwid:debugdec); {zzz} if debug then writeln(output); {zzz} if debug then count := succ(count); if debug then if count >= countmax then halt; end end; mean := total / sumrange; { stdev := (sumsquare/sumsqrange) - (mean * mean); } writeln(output,'mean = ',mean:wid:dec); writeln(output,'entries = ',entries:wid:dec); { writeln(output,'stdev = ',stdev:wid:dec); } (* figure out an initial standard deviation. I can't seem to get it right the simpler way ... *) sumsquare := 0.0; for p := 1 to points do begin s := (memory.value[p]*memory.position[p] - mean); sumsquare := sumsquare + s; end; stdev := (sumsquare/sumrange); writeln(output,'stdev = ',stdev:wid:dec); watchprogress := false; tolerance := 0.00005; fractionstep := 0.05; maxsteps := 10000; computegauss; if watchprogress then printgauss; (* now vary the mean, the stdev and the entries to find the least squares fit to the memory curve *) currentmean := mean; currentstdev := stdev; currententries := entries; done := false; count := 0; computedeviation(deviation); while not done do begin vary(mean, fractionstep); vary(stdev, fractionstep); vary(entries, fractionstep); computegauss; computedeviation(newdeviation); if newdeviation < deviation then begin delta := (deviation - newdeviation)/deviation; if delta < tolerance then begin done := true; writeln(output,'reached tolerance: ',delta:wid:dec); end; { writeln(output,'accepted!'); } currentmean := mean; currentstdev := stdev; currententries := entries; deviation := newdeviation; if watchprogress then printgauss; end else begin { writeln(output,'failed!'); } (* reset the parameters: *) mean := currentmean; stdev := currentstdev; entries := currententries; end; count := succ(count); { writeln(output,count:1); } if count > maxsteps then done := true; end; if not watchprogress then printgauss; end; (* end module fitgauss.themain *) begin themain(histog, fitgaussp, gfit); 1: end.