# To unbundle, sh this file echo README 1>&2 sed 's/.//' >README <<'//GO.SYSIN DD README' - l2fit -- Least Squares Fit - -SYNOPSIS -l2fit [option ...] expr [startvals ...] [filename] - -DESCRIPTION -l2fit fits the function given in the expression to x, y pairs -in a data file (or on the standard input). Suppose, for instance, -that datafile contains pairs in which y is a quadratic function -of x plus (perhaps) some other ``error'' or ``noise''. This -command finds the three quadratic coefficients p(1), p(2) and p(3): - l2fit 'p(1)*x**2 + p(2)*x + p(3)' datafile -The values of the three parameters appear in order on the standard -output, and as a side effect l2fit writes the troff output file -l2fit.out in the current directory. That file is a one-page summary -of the fit that includes both numbers (such as standard errors) -and graphs of the function and the residuals. One should almost -always study the l2fit.out file before believing the data summarized -on standard output. - -The program assumes that the x, y input pairs are modeled by the -equation y = f(x, p); it attempts to find a vector p that minimizes -the sum of the squares of the residuals y[i] - f(x[i], p). The function -f is given as the Fortran expression expr; it may contain only the -variables x and p(1), p(2), .... l2fit recognizes ``^'' as an -abbreviation for ``**'', ``a'' and ``p1'' for ``p(1)'', ``b'' and -``p2'' for ``p(2)'', etc. Thus if the program gen.data created the -data in datafile, the above command could be rewritten as - gen.data | l2fit 'a*x^2 + b*x + c' - -The main work of l2fit is performed by David Gay's ``dn2f'' routine -from the port(3) library. It starts with an initial guess of the -vector p, and refines the guess by Gauss-Newton iteration until it -arrives at a local minimum (which may not be a global minimum). -The l2fit program uses the default starting vector of zero. -If the algorithm is unable to find an optimal value for p starting -from zero, the user might need to provide a starting vector -by giving its floating-point components following expr. - -By default, l2fit uses unweighted regression. If there are at least -two distinct y values for each x value, then a weighted least squares -regression invoked by -w uses the sample variance at each x value to -increase the accuracy of the regression. l2fit recognizes several -options presented as flags: - -w weighted regression - -t save all temporary files; see FILES below - -i no grap output in l2fit.out (invisible) - -lx log x scale in grap output - -ly log y scale in grap output - -lxy log both scales in grap output - -xText description of x variable - -yText description of y variable - -cText comment: text ignored - -troff l2fit.out is ready to be run through troff - -EXAMPLES -l2fit -lx -w -t 'a + b*x^c' 20 -1 -.5 searchdata.d - Fit the data in searchdata.d to the function a + b*x^c, - using the starting values a=20, b=-1, c=-.5. Because each - x value has many y values, we use a weighted regression (-w). - The graphs in the output file l2fit.out have logarithmic x - scales (-lx), and the temporary files l2temp.* are not - deleted (-t). - -FILES -l2fit uses several files in the current directory of the form l2temp.*; -the -t flag causes these temporary files not to be deleted. -The most useful of these files are: - l2temp.f This is the Fortran program that calls the optimization - routine. If your function will not fit in a single - expression, you might consider modifying this file - to solve your problem. - l2temp.g This is the Grap file (including some Troff input) - that eventually makes l2fit.out - l2temp.p This is the file produced by the Port routine dn2f. - It can be helpful for studying details of the least - squares computation (it is expecially handy when the - program fails to converge). See the Port documentation - for details. - -SEE ALSO -P. A. Fox, The PORT Mathematical Subroutine Library, AT&T Bell Laboratories, - May 8, 1984. - -BUGS -There are many contexts in which l2fit fails to find the ``right'' answer. -Sometimes the port(3) dn2f routine observes convergence problems; in that -case an error message is produced on stderr and in l2fit.out. In other -cases the routine converges to a local minimum that is a poor fit to the -function; the graphs of the function fit and residuals usually provide -evidence of this. Problems such as these are remedied by supplying either -a more appropriate function or better starting values. - -The program requires a lot of CPU cycles. On a VAX-8550, a 100-line -data file requires about 5 seconds to find optimal values and -about 3 seconds to prepare l2fit.out (the -i flag avoids the output and -the CPU time). Good starting values can decrease the optimization time, -and poor values can greatly increase it. A single run on a fast machine -is quite acceptable, but many runs on a heavily loaded slower machine -could become bothersome. Beware. - -INSTALLATION -The l2fit bundle contains this README file, the Awk l2fit program, and the -program gen.data. To install the program, move l2fit to a suitable -bin directory and make it executable. The first line in the Awk program -is an assignment to the variable ``schemafile''; change that to reflect -the appropriate directory, for instance: - schemafile = "/usr/you/bin/l2fit" -To test the program, make gen.data executable and try - gen.data | l2fit 'a*x^2 + b*x + c' -The output values should be near 2, -10, 12. The graphs in l2fit.out -should show that the function is a close fit to the data, and the residuals -are a sine times a uniform. - -The l2fit program requires up-to-date versions of the following programs; - Awk It needs the functions in the 1985 version. - f77 - port - grap It needs the ^ operator installed in Feb 1989. - pic - troff -If l2fit does not work when you try to install it, check to make sure that -your system has current versions. //GO.SYSIN DD README echo l2fit 1>&2 sed 's/.//' >l2fit <<'//GO.SYSIN DD l2fit' -# l2fit: least squares fit a data file to an expression -# The next line is a hack to include the text file with the awk program. -cat >/dev/null <<'ENDSCHEMAS' -@@@ FORTRAN -c Schema for Fortran l2fit program, with following subs made: -c N: Number of x, y observations -c NP: Number of p parameters -c EXP: Expression to be fit -c SP: Number of start p parameters at front of file -c NV: Size of v vector, from p. 2 of port n2f documentation - integer i, iv(2000), liv, lty, lv, ui(1), lp - double precision v(@NV@) - external dummy, resid - double precision ty(@N@,3), p(@NP@) - data liv /2000/, lv /@NV@/ - data lty /@N@/, lp /@NP@/ - do 10 i = 1, lp - p(i) = 0.0 - 10 continue - do 20 i = 1, @N@ - read (*, *) ty(i,1), ty(i,2), ty(i,3) - 20 continue - if (@SP@ .le. 0) goto 40 - do 30 i = 1, @SP@ - read (*, *) p(i) - 30 continue - 40 continue - ui(1) = lty - iv(1) = 0 - call dn2f(lty, lp, p, resid, iv, liv, lv, v, ui, ty, dummy) - stop - end -c - subroutine dummy - return - end -c - subroutine resid(n, lp, p, nf, r, lty, ty, uf) - integer n, lp, nf, lty - double precision p(lp), r(n), ty(lty, 3) - external uf - integer i - double precision x - do 10 i = 1, n - x = ty(i, 1) - r(i) = ty(i,3)*(ty(i, 2) - (@EXP@)) - 10 continue - return - end -@@@ GRAP -.\" Schema for grap summary of l2fit, with these parameters -.\" DFILE: data file name -.\" IFILE: input file name -.\" GEXP: grap expression -.\" IEXP: input expression -.\" CEXP: canonical expression -.\" LOGX: log x -.\" LOGY: log y -.\" OPARS: output parameters -.\" SPARS: starting parameters -.\" PARSTR: names of parameters -.\" ALGTERM: algorithm termination -.\" ERRS: estimates of standard errors -.\" WEIGHT: weighted or unweighted -.\" CLINE: command line arguments -.\" XTEXT: description of x variable -.\" YTEXT: description of y variable -.\" YBLANKS: same length as YTEXT, but blanks -.\" RESID: residual description -.\" SHRINK: shrink factor for graphs -.sp 1 -.nf -.ce 1 -L2FIT SUMMARY -.sp 1 -Command line: \f(CW@CLINE@\fP -Least squares regression type: @WEIGHT@ -Input data file: @IFILE@ -.EQ -delim %% -.EN -Input expression: @IEXP@ % @EEXP@ % - Canonical form: @CEXP@ % @ECEXP@ % -.ta 1.2i 2.0i 2.8i 3.6i 4.4i 5.2i 6.0i 6.8i 7.6i -Parameters%@PARSTR@% - Initial:@SPARS@ - Final:@OPARS@ - Standard errors:@ERRS@ -Algorithm termination: @ALGTERM@ -.sp 1 -.G1 -define glog X (log($1)/log(2.718281828459045)) X -define gexp X (2.718281828459045^($1)) X -define abs X (max(($1), 0-($1))) X -frame ht 3.0*@SHRINK@ wid 4.5*@SHRINK@ -label bot "Circles: input %(x,y)% pairs. Line: least squares fit %y ~=~ f(x,p)%. @XTEXT@" -coord @LOGX@ @LOGY@ -label right "@YTEXT@" -draw solid -lastx = 1e30 -copy "@DFILE@" thru { - tx = $1 - circle at tx, $2 - if lastx != tx then { next at tx, (@GEXP@) } - lastx = tx -} -.G2 -.sp 1 -.G1 -frame ht 2.0*@SHRINK@ wid 4.5*@SHRINK@ -label bot "@RESID@" -label right "@YBLANKS@" -coord @LOGX@ -draw solid -line1 = 1 -copy "@DFILE@" thru { - maxx = tx = $1 - if line1 == 1 then { line1 = 0; minx = tx } - circle at $1, ($2-(@GEXP@))/$3 -} -line from minx, 0 to maxx, 0 -.G2 -@@@ -ENDSCHEMAS -# -# Now comes the Awk program that does the work -# -awk ' -BEGIN { - # Initialize variables - schemafile = "/usr/jlb/l2fit/l2fit" ##### FIX THIS FILE NAME - squote = "\047" - progname = "l2fit" - grapoutfile = "l2fit.out" - tempdir = "" - # To make files unique: "echo $$" | getline pid - fitfile = tempdir "l2temp.p" - fortfile = tempdir "l2temp.f" - objfile = tempdir "l2temp.o" - exfile = tempdir "l2temp.x" - grapfile = tempdir "l2temp.g" - errfile = tempdir "l2temp.e" - initfile = tempdir "l2temp.ip" # Initialization Parameters - idatafile = tempdir "l2temp.id" # Input Data: all x, each y, 1 - fdatafile = tempdir "l2temp.fd" # Fortran Data: unique x, mean y, weight - gdatafile = tempdir "l2temp.gd" # Grap Data: all x, each y, stddev(y) - sortfile = tempdir "l2temp.sd" # Sorted data, gdatafile format - usage = "usage: l2fit <-flags>* * ?" - numre = "^[+-]?([0-9]+[.]?[0-9]*|[.][0-9]+)([dDeE][+-]?[0-9]+)?$" - savefiles = 0 - weighted = 0 - wantgrap = 1 - wanttroff = 1 - # Get arguments from command line - EOFSTR = "EOFeofEOF" - initlex() - for (i = 1; i <= ARGC; i++) { - s = ARGV[i] - if (s ~ /[ \t]/) s = squote s squote - cmdline = cmdline " " s - } - while (substr(token, 1, 1) == "-") { - s = substr(token, 2) - if (s ~ /^t$/) savefiles = 1 - else if (s ~ /^i$/) wantgrap = 0 - else if (s ~ /^w$/) weighted = 1 - else if (s ~ /^troff$/) wanttroff = 0 - else if (s ~ /^lx$/) { logx = "log x" } - else if (s ~ /^ly$/) { logy = "log y" } - else if (s ~ /^lxy$/) { logx = "log x"; logy = "log y" } - else if (s ~ /^x/) { xtext = substr(s, 2) } - else if (s ~ /^y/) { ytext = substr(s, 2) } - else if (s ~ /^s/) { shrink = 0+substr(s, 2) } - else if (s ~ /^c/) { } # Comment - else warn("unrecognized option: -" s) - nexttoken() - } - if (shrink <= 0 || shrink > 2) shrink = 1 - if (token == EOFSTR) error(usage) - inexpr = token - nexttoken() - startm = 0 - while (token ~ numre) { - startp[++startm] = float(token, "in argument list") - nexttoken() - } - if (token == EOFSTR) - inputfile = "-" - else { - inputfile = token - nexttoken() - if (token != EOFSTR) error(usage) - } - ARGV[1] = inputfile - ARGC = 2 - # Put expression in canonical form - canexpr = tofortran(inexpr) - # Compute m = p(i) elements in expr - m = parmcount(canexpr) - if (startm > m) { - warn("too many start values; extras discarded") - startm = m - } - for (i = 1; i <= m; i++) - startstr = startstr "\t" (0 + startp[i]) - # Read data from input into idatafile - while (getline > 0) { - if (NF != 2) warn("data line " NR ": does not have 2 fields") - x = float($1, "in data line " NR) - y = float($2, "in data line " NR) - print x, y, 1 >idatafile - cnt[x]++ - sigy[x] += y - sigy2[x] += y*y - ++n - } - close(idatafile) - # Write fdatafile, for input to Fortran program - if (weighted == 0) { - fdatafile = idatafile - } else { - mins2 = HUGE = 1e30 - distinctx = 0 - for (x in cnt) { - distinctx++ - tn = cnt[x] - ts2 = sigy2[x]/tn - sigy[x]*sigy[x]/(tn*tn) - s2[x] = ts2 - if (ts2 > 0 && ts2 < mins2) - mins2 = ts2 - } - if (mins2 == HUGE) mins2 = 1.0 - for (x in cnt) { - ts2 = s2[x] - if (ts2 <= 0.0) { - warn("zero variance at x=" x \ - "; min variance substituted") - ts2 = s2[x] = mins2 - } - print x, sigy[x]/cnt[x], sqrt(cnt[x]/ts2) > fdatafile - } - close(fdatafile) - } - # Write initfile - if (startm > 0) { - for (i = 1; i <= startm; i++) - print startp[i] >initfile - close(initfile) - } else { - initfile = "" - } - # Write fortfile - progn = weighted ? distinctx : n - subarr["N"] = progn - subarr["NP"] = m - subarr["EXP"] = canexpr - subarr["SP"] = startm - subarr["NV"] = 200 + m*(progn + 2*m + 20) + 2*progn - doschema(schemafile, "FORTRAN", fortfile, subarr) - for (i in subarr) delete subarr[i] - # Compile and execute program - if (system("f77 " fortfile " -lport -o " exfile " 2>" errfile)) { - system("cat " errfile"; rm " errfile) - error("cannot compile (bad expression?)") - } - if (system("cat " fdatafile " " initfile " | " exfile " >" fitfile)) - error("execution error (computed number out of range?)") - # Extract answers - state = 0 - while (getline 0) { - if (state == 0 && $2 == "FINAL") state = 1 - else if (state == 1 && NF == 0) state = 2 - else if (state == 2 && NF == 0) state = 3 - else if (state == 2 && NF == 4) p[$1] = float($2, "in port output") - if ($1 == "*****") conv = $0 - if ($1 == "ROW") estr = estr "\t" sqrt(float($NF, "in port output")) - } - s = pstr = "" - for (i = 1; i <= m; i++) { - pstr = pstr "\t" p[i] - s = s " " p[i] - } - print substr(s, 2) - gsub(/\*/, "", conv) - sub(/^[ \t]+/, "", conv) - for (i = 1; i <= 26; i++) - gsub(substr("ABCDEFGHIJKLMNOPQRSTUVWXYZ", i, 1), - substr("abcdefghijklmnopqrstuvwxyz", i, 1), conv) - if (conv !~ /convergence/ || conv ~ /singular|false/) { - warn("convergence problem: " conv) - conv = "CONVERGENCE FAILURE: " conv - } - # Prepare grap summary, if desired - if (wantgrap) { - # Make grap data file gdatafile - if (weighted) { - while (getline 0) - print $1, $2, sqrt(s2[0+$1]) >gdatafile - close(gdatafile) - } else { - gdatafile = idatafile - } - # Sort data file for grap - system("sort -n " gdatafile " > " sortfile) - # Write grapfile from gschemafile - parstr = "" - for (i = 1; i <= m; i++) - parstr = parstr "\t" substr("abcdefghi", i, 1) "=p(" i ")" - gsub(/\t-/, "\t\\-", pstr) - gsub(/\t-/, "\t\\-", startstr) - subarr["DFILE"] = sortfile - subarr["IFILE"] = inputfile - subarr["GEXP"] = tograp(canexpr) - subarr["IEXP"] = inexpr - subarr["CEXP"] = canexpr - subarr["EEXP"] = toeqn(inexpr) - subarr["ECEXP"] = toeqn(canexpr) - subarr["LOGX"] = logx - subarr["LOGY"] = logy - subarr["OPARS"] = pstr - subarr["SPARS"] = startstr - subarr["PARSTR"] = toeqn(parstr) - subarr["ALGTERM"] = conv - subarr["ERRS"] = estr - subarr["WEIGHT"] = weighted ? "Weighted" : "Unweighted" - subarr["CLINE"] = cmdline - subarr["SHRINK"] = shrink - subarr["RESID"] = "Circles: weighted residuals %[y ~-~ f(x,p)]/stddev(x)%." - if (xtext) xtext = "x: " xtext - subarr["XTEXT"] = xtext - if (ytext) ytext = "y: " ytext - subarr["YTEXT"] = ytext - yblanks = ytext - gsub(/./, " ", yblanks) - subarr["YBLANKS"] = yblanks - if (weighted == 0) - subarr["RESID"] = "Circles: residuals %y ~-~ f(x,p)%." - doschema(schemafile, "GRAP", grapfile, subarr) - for (i in subarr) delete subarr[i] - # Compile grapfile - troffstr = "" - if (wanttroff) troffstr = " | troff" - system("grap " grapfile " | pic | eqn" troffstr " > " grapoutfile) - # Clean up files - rmstr = sortfile " " grapfile - if (weighted) rmstr = rmstr " " gdatafile - if (savefiles == 0) - system("rm " rmstr) - } - # Clean up files - rmstr = exfile " " fortfile " " initfile " " idatafile " " \ - objfile " " fitfile " " errfile - if (weighted) rmstr = rmstr " " fdatafile - if (savefiles == 0) - system("rm " rmstr) - exit -} - -# Support functions for numerical interfaces - -# FUNCTIONS TO CONVERT AMONG EQN, GRAP, FORTRAN, ETC. - -function toeqn(s) { - while (match(s, /exp\(/) > 0) { - b = RSTART+RLENGTH-1 - n = matchpar(s, b) - s = substr(s,1,n-1) "}" substr(s,n+1) - sub(/exp\(/, " e sup {", s) - } - while (match(s, /(\*\*|\^)\(/) > 0) { - b = RSTART+RLENGTH-1 - n = matchpar(s, b) - s = substr(s,1,n-1) "}" substr(s,n+1) - sub(/(\*\*|\^)\(/, " sup {", s) - } - gsub(/\([0-9]+\)/, " sub & ", s) # this still leaves () around them - for (i = 1; i <= 9; i++) - gsub("\\(" i "\\)", i, s) - gsub(/\(/, "{(", s) - gsub(/\)/, ")}", s) - gsub(/\*\*|\^/, " sup ", s) - gsub(/\*/, "", s) - gsub(/log/, " log ", s) - gsub(/\t/, "\"\\t\"", s) - return s -} - -function matchpar(s, n, i, k, c) { # find matching paren to one at s[n] - k = 0 - for (i = n; (c=substr(s,i,1)) != ""; i++) { - if (c == "(") - k++ - else if (c == ")") { - if (--k <= 0) - return i - } - } - return n -} - -function tograp(inputs, s) { - s = inputs - gsub(/[ \t]/, "", s) - for (i = 1; i <= m; i++) - gsub("p\\(" i "\\)", "(" p[i] ")", s) - gsub(/\*\*/, "^", s) - s = " " s " " - gsub(/[^a-zA-Z0-9]+/, " & ", s) - gsub(/ log /, " glog ", s) - gsub(/ exp /, " gexp ", s) - gsub(/ x /, " tx ", s) - gsub(/ /, "", s) - return s -} - -function tofortran(inputs, s) { - s = inputs - gsub(/[ \t]/, "", s) - gsub(/[^a-zA-Z0-9]+/, " & ", s) - s = " " s " " - for (i = 1; i <= 9; i++) { - gsub(" " substr("abcdefghi", i, 1) " ", " p(" i ") ", s) - gsub(" p" i " ", " p(" i ") ", s) - } - gsub(/\^/, "**", s) - gsub(/ /, "", s) - return s -} - -function toawk(inputs, s) { - s = inputs - gsub(/[ \t]/, "", s) - gsub(/\*\*/, "^", s) - return s -} - -function parmcount(expr, s, seen, i, n) { - s = expr - n = 0 - gsub(/[ \t]/, "", s) - while (match(s, /p\([0-9]+\)/)) { - i = 0 + substr(s, RSTART+2, RLENGTH-3) - seen[i] = 1 - if (i > n) n = i - s = substr(s, RSTART+RLENGTH) - } - for (i = 1; i < n; i++) - if (!(i in seen)) - warn("p(" i ") not in expression") - return n -} - -# LEXICAL ANALYSIS FUNCTIONS - -function initlex() { - if (ARGC == 2) error(usage) # ARGV[1] is prog name - else if (ARGC == 3) lexfile = ARGV[2] # ARGV[1] is prog name - else lexfile = "" - argptr = 1 - nexttoken() -} - -function nexttoken( status) { - if (lexfile != "") { - status = (getline token = ARGC) token = EOFSTR - } - sub(/#.*/, "", token) - sub(/^[ \t]+/, "", token) - if (token == "") nexttoken() -} - -# PROCESS SCHEMA FILE - -function doschema(schemafile, marker, outfile, subarr, temp, i) { - while (getline 0) - if ($1 == "@@@" && $2 == marker) break - while (getline 0) { - if ($1 == "@@@") break - temp = $0 - if (temp ~ /@.*@/) - for (i in subarr) - if (index(temp,i)) - gsub("@" i "@", subarr[i], temp) - print temp >outfile - } - close(schemafile) - close(outfile) -} - -# MISC SUPPORT FUNCTIONS - -function float(s, where) { - if (s ~ numre) { - sub(/[dD]/, "e", s) - } else { - warn("unrecognized float: " s " " where) - } - return 0+s -} - -function error(s) { - print " " progname " fatal error: " s | "cat 1>&2" - exit 1 -} - -function warn(s) { - print " " progname " warning: " s | "cat 1>&2" -} -' $0 "$@" //GO.SYSIN DD l2fit echo gen.data 1>&2 sed 's/.//' >gen.data <<'//GO.SYSIN DD gen.data' -awk ' -func f(x) { return 2*x^2 -10*x + 12 + rand()*sin(x/8) } -BEGIN { - for (i = -50; i <= 50; i++) - for (j = 1; j <= 1; j++) - print i, f(i) -} -' //GO.SYSIN DD gen.data