diff options
Diffstat (limited to 'poiss.c')
-rw-r--r-- | poiss.c | 658 |
1 files changed, 658 insertions, 0 deletions
@@ -0,0 +1,658 @@ +///////////////////////// + +#include <iostream> +using namespace std; + +void usage() { + cout << R"EoF(Program to fit to Poisson distributed data. +Uses actual Poisson probability (i.e. not Gaussians, and +therefore not least squares). + +Options include: + -h # print this message (and immediately exit) + -ifile $fn # input file, date for -fit + -gfile $fn # log information about initial guesses for -fit + -fit $fn # do the fit; send results to $fn + +# the rest are for testing and background investigations: + -dkcurve $fn # smooth curves and scattered data at fake abscissas + -entropy $fn # entropy of the Poisson distro + # as a function of the intensity (lambda) + -seed $str # seed the RNG with $str + -seed -- # seed the RNG from /dev/random + +To write to stdout, use "-" for the $fn. + +See ./doit for a working example, including the SS --meld stanzas. +)EoF"; +} + +#include <iomanip> /* for setw */ +#include <vector> +#include <nlopt.hpp> /* the minimum-finder */ +#include "arg_parser.h" +#include <string> +#include <sstream> +#include <map> +#include "parse_csv.h" +#include "poissonier.h" + +// Used for equal weighting: +vector<double> const u5(5, 1.0); + +// Class to describe a sitch, i.e. a situation, i.e. +// everything the objective function needs to know: +struct sitcher { + vector<double> norm; // keep the adjustable parameters close to unity + vector<double> t1; + vector<double> t2; + vector<int> obs; // actual observed counts in the interval [t1, t2] +}; + +// Instantaneous rate at time ttt. +// Takes parameters two at a time: (amplitude, decay rate). +// If there are an odd number of parameters, +// the last one is the baseline, i.e. independent of time, +// equivalent to a decay with the rate locked at zero. +// Decay rates are in nats per second. +double inst_multi_exp(vector<double> const& prm, + double const ttt, vector<double> const norm) { + double rslt(0); + int nn = prm.size(); + for (int ii = 0; ii < nn; ii+=2){ + double amp(prm[ii] * norm[ii]); + double rate(1+ii < nn ? prm[1+ii] * norm[1+ii] : 0.); + if (rate) { + double rate(prm[1+ii]); +// use the negative rate: + rslt += amp * exp(- rate * ttt); + } else { + rslt += amp; + } + } + return rslt; +} + +// Format all the elements of a C++ vector, +// creating a string suitable for printing. +template<class T> +string dump(vector<T> const arg) { + stringstream rslt; + string sep = ""; + for (T val : arg) { + rslt << sep << val; + sep = ", "; + } + return rslt.str(); +} + +// Format all the elements of a counted array, +// creating a string suitable for printing. +template<class T> +string dump(int const nn, T const arg[]) { + stringstream rslt; + string sep = ""; + for (int ii = 0; ii < nn; ii++) { + rslt << sep << arg[ii]; + sep = ","; + } + return rslt.str(); +} + +// numerically accurate sinh(arg)/arg +template<class T> +inline T sinhc(T arg) { + if (abs(arg) < 1e-4) return 1 + arg*arg/6; + return sinh(arg)/arg; +} + +struct evendor { + double t; // time + double n; // number of events +}; + +// Intensity, usually denoted lambda, +// i.e. integral of rate(t) dt over the given interval. +double inty_multi_exp(vector<double> const& prm, + double const t1, double const t2, vector<double> const norm) { + double rslt(0); + int nn = prm.size(); + for (int ii = 0; ii < nn; ii+=2){ + double amp(prm[ii] * norm[ii]); + double rate(1+ii < nn ? prm[1+ii] * norm[1+ii] : 0.); + double mid((t2 + t1) / 2.); + double tau(t2 - t1); + rslt += amp * exp(-rate*mid) * tau * sinhc(rate*tau/2.); + } + return rslt; +} + +// the actual objective function +// returns total log probability, summed over all data points +// +// _sitch is unchanged, +// but cannot be declared const, because bobyqa wouldn't like that. +double limp(unsigned int const nprm, double const* _prm, + double * _grad, void * _sitch) { + sitcher& sitch(*(sitcher*)(_sitch)); + if (_grad != 0) throw invalid_argument("grad"); + if (sitch.norm.size() != nprm) throw invalid_argument("norm"); + vector<double> prm(_prm, _prm+nprm); + if (prm.size() != nprm) throw logic_error("vector ctor"); + double rslt(0); + int npts(sitch.t1.size()); + for (int ii = 0; ii < npts; ii++){ + double inty = inty_multi_exp(prm, sitch.t1[ii], sitch.t2[ii], sitch.norm); + rslt -= poissonier::lpmd(inty, sitch.obs[ii]); + } + return rslt; +} + +// Scan the data array, +// looking for records that look like headers +// (as opposed to numerical data). +int count_header(vector<vector<string> > const aoa){ + int NR = aoa.size(); + int hh = 0; + for (; hh < NR; hh++) { + if (aoa[hh].size() == 0) continue; + string word = aoa[hh][0]; + size_t where; + where = word.find_first_not_of(" "); + if (where == string::npos) continue; + word = word.substr(where); + try { + stod(word, &where); // don't care about return value + } + catch (exception& ee) { + continue; + } + word = word.substr(where); + where = word.find_first_not_of(" "); + if (where == string::npos) break; + } + return hh; +} + +// Some tricky heuristics. +// Guess a starting point for the fitting process +// (i.e. minimization process). +// A good guess speeds things up and greatly increases +// the chance of finding the global minimum (as opposed +// to getting stuck in a worthless local minimum, or +// diverging to arrant nonsense). +struct guesser { + int hh; + int npts; + vector<evendor> evt; + vector<double> model; + sitcher sitch; + + guesser() : hh(0), npts(0) {} + + void setup(vector<vector<string> > const aoa) { + int NR = aoa.size(); // includes header row + if (NR == 0) throw invalid_argument("Zero-length input file"); + hh = count_header(aoa); + npts = NR - hh; + if (0) cout << "NR: " << NR + << " npts: " << npts + << endl; + evt = vector<evendor>(npts); + double minute(60); + for (int ii = 0; ii < npts; ii++) { + // file times are in minutes; convert to SI units here: + evt[ii].t = stod(aoa[hh+ii][0]) * minute; + evt[ii].n = stod(aoa[hh+ii][1]); + } + + double max_t = evt[npts-1].t; + double max_n = evt[npts-1].n; + + // starting point of the baseline estimate, + // as a fraction of the total span of the data, + // assuming data starts at zero: + double tail_frac = 0.9; + int tail_i = -1; + + // half life (in seconds) of the fast component: + double fast_h = 307.2; + double fast_a = M_LN2 / fast_h; + int fast_i = -1; + + // half life (in seconds) of the slow component: + double slow_h = 45720; + double slow_a = M_LN2 / slow_h; + + double dead(5); // 2**-5 = 3% + + for (int ii = 0; ii < npts; ii++) { + double time = evt[ii].t; + if (time < tail_frac * max_t) tail_i = ii; + if (time < fast_h * dead) fast_i = ii; + } + + if (0) cout << "npts: " << npts + << " : " << max_t + << "," << max_n + << endl; + + double tail_t = evt[tail_i].t; + double tail_n = evt[tail_i].n; + double tail_r = (max_n - tail_n) / (max_t - tail_t); + + if (0) cout << "tail_i: " << tail_i + << " : " << tail_t + << "," << tail_n + << " " << tail_r + << endl; + + double fast_t = evt[fast_i].t; + double fast_n = evt[fast_i].n; + if (0) cout << "fast_i: " << fast_i + << " : " << fast_t + << "," << fast_n + // << " " << fast_r + << endl; + + if (tail_i < 0) throw invalid_argument("wtf?"); + if (fast_i < 0) throw invalid_argument("wtf?"); + + // beginning times: + double slow_tb = evt[fast_i].t; + double slow_nb = evt[fast_i].n; + + double slow_dt = max_t - slow_tb; + double slow_dn = max_n - slow_nb; + double slow_mag = slow_dn - slow_dt * tail_r; + slow_mag /= (1 - exp(-slow_a * slow_dt) * (1 + slow_dt * slow_a)); + // valid at time slow_tb: + double slow_rx = slow_mag * slow_a; + // valid at time zero: + double slow_r = slow_rx * exp(slow_a * slow_tb); + + double slow_rem = slow_r * exp(-slow_a * max_t); + double bl_r = tail_r - slow_rem; + + if (0) cout << "slow_r: " << slow_r + << " slow_rx: " << slow_rx + << " slow_mag: " << slow_mag + << " slow_rem: " << slow_rem + << " tail_r: " << tail_r + << " bl_r: " << bl_r + << endl; + + vector<double> slow_model{slow_r, slow_a, bl_r}; + + double fast_t0 = evt[0].t; + double fast_n0 = evt[0].n; + double fast_t1 = evt[fast_i].t; + double fast_n1 = evt[fast_i].n; + double model_inty = inty_multi_exp(slow_model, fast_t0, fast_t1, u5); + double fast_tot = fast_n1 - fast_n0; + double fast_dn = fast_tot - model_inty; + double fast_r = fast_dn * fast_a; + if (0) cout << "fast_r: " << fast_r + << " fast_dn: " << fast_dn + << " fast_tot: " << fast_tot + << " model_inty: " << model_inty + << endl; + + model = vector<double>{fast_r, fast_a}; + model.insert(model.end(), slow_model.begin(), slow_model.end()); + } + +// Reformat the data, to make it more useful to +// the objective function: + void more() { + for (int ii = 1; ii < npts; ii++) { + sitch.t1.push_back(evt[ii-1].t); + sitch.t2.push_back(evt[ii ].t); + sitch.obs.push_back(evt[ii].n - evt[ii-1].n); + } + } + +// Show the guessed parameters. +// Also do a sweep of one variable, as a qualitative sanity check. + void show(string const ofile) { + if (ofile == "") return; + ofstream xxout; + if (ofile != "-") { + xxout.open(ofile); + } + ostream& xout(ofile != "-" ? xxout : cout); + + size_t ss = sitch.t1.size(); + xout << ss << endl; + xout << sitch.t1[ss-1] + << " " << sitch.t2[ss-1] + << " " << sitch.obs[ss-1] + << endl; + + vector<double> prm(u5); + + xout << dump(model) << endl; + + xout << "bl/norm,bl,limp" << endl; + double bl0 = prm[4]; + void* context = (void*)(&sitch); + int nprm(prm.size()); + + for (double bl = bl0*.9; bl <= bl0*1.1; bl += bl/100.) { + prm[4] = bl; + double limpy = limp(nprm, prm.data(), 0, context); + xout << bl + << "," << bl*sitch.norm[4] + << "," << limpy << endl; + } + } +}; + +// Decode result codes returned by nlopt functions. +// This is documented to be part of the nlopt library +// but is absent from the version I have. +const char *nlopt_result_to_string(nlopt_result result) +{ + switch(result) + { + case NLOPT_FAILURE: return "FAILURE"; + case NLOPT_INVALID_ARGS: return "INVALID_ARGS"; + case NLOPT_OUT_OF_MEMORY: return "OUT_OF_MEMORY"; + case NLOPT_ROUNDOFF_LIMITED: return "ROUNDOFF_LIMITED"; + case NLOPT_FORCED_STOP: return "FORCED_STOP"; + case NLOPT_SUCCESS: return "SUCCESS"; + case NLOPT_STOPVAL_REACHED: return "STOPVAL_REACHED"; + case NLOPT_FTOL_REACHED: return "FTOL_REACHED"; + case NLOPT_XTOL_REACHED: return "XTOL_REACHED"; + case NLOPT_MAXEVAL_REACHED: return "MAXEVAL_REACHED"; + case NLOPT_MAXTIME_REACHED: return "MAXTIME_REACHED"; +///??????? case NLOPT_NUM_RESULTS: return NULL; + } + return NULL; +} + +// Add something to a particular component of a vector. +template<class T> +vector<T> goose(vector<T> const& vvv, int const ii, T const delta) { + vector<T> rslt(vvv); + rslt[ii] += delta; + return rslt; +} + +void do_fit(poissonier& poi, string const ifile, + string const ofile, string const gfile) { + if (ifile == "") { + if (ofile != "") throw invalid_argument("do_fit needs an input file"); + return; + } + + ifstream inx; + if (ifile != "-") { + inx.open(ifile); + if (! inx.good()) { + cerr << "Cannot open input '" << ifile << "'" << endl; + exit(1); + } + } + + istream& in(ifile == "-" ? cin : inx); + vector<vector<string> > aoa; + aoa = readCSV<string>(in); + guesser ggg; + ggg.setup(aoa); + ggg.more(); + ggg.sitch.norm = ggg.model; + ggg.show(gfile); + + vector<double> prm = u5; + int nprm(prm.size()); + vector<double> const lower(nprm, 0.5); + vector<double> const upper(nprm, 1.5); + + nlopt_result rslt = NLOPT_FORCED_STOP; // avoid "unused" warning + double limp_end(-9e99); + vector<double> found(prm); // will get modified in place + int OK(0); + try { + void* context = (void*)(&ggg.sitch); + nlopt_opt oppy = nlopt_create(NLOPT_LN_BOBYQA, nprm); + rslt = nlopt_set_lower_bounds1(oppy, 0.5); + rslt = nlopt_set_upper_bounds1(oppy, 1.5); + rslt = nlopt_set_min_objective(oppy, limp, context); + rslt = nlopt_set_xtol_rel(oppy, 1e-7); + rslt = nlopt_optimize(oppy, found.data(), &limp_end); + OK = 1; + } + catch (exception& eee) { + cout << "Fitting bombed out: " << eee.what() << endl; + } + + if (OK && ofile != "") { + ofstream xxout; + if (ofile != "-") { + xxout.open(ofile); + } + ostream& xout(ofile != "-" ? xxout : cout); + + double DoF(ggg.npts - nprm); + xout << ifile << endl; + cout << "Fit returns:, " << rslt + << ", i.e., " << nlopt_result_to_string(rslt) + << ", limp:, " << limp_end + << ", perdof:, " << limp_end / (ggg.npts - nprm) + << endl; + xout << "Fit returns:, " << rslt + << ", i.e., " << nlopt_result_to_string(rslt) + << ", limp:, " << limp_end + << ", perdof:, " << limp_end / DoF + << endl; + xout << endl; + xout << ",fast.amp, fast.dk, slow.amp, slow.dk, bl" << endl; + xout << "Normed:, " << dump(found) << endl; + vector<double> combi(nprm); + for (int ii = 0; ii < nprm; ii++) { + combi[ii] = found[ii] * ggg.sitch.norm[ii]; + } + xout << "SI:, " << dump(combi) << endl; + vector<double> flip(combi); + flip[1] = M_LN2/combi[1]; + flip[3] = M_LN2/combi[3]; + xout << "½life:, " << dump(flip) << endl; + +// Calculate the uncertainties. +// In particular, calculate the Mahalanobis metric +// i.e. the second derivative (Hessian) of the log improbability. + vector<vector<double> > covar(nprm, vector<double>(nprm)); + vector<double> probe(u5); + sitcher sitch(ggg.sitch); + sitch.norm = combi; + void* context = (void*)(&sitch); + double delta(0.01); + xout << endl; + for (int ii = 0; ii < nprm; ii++) { + string sep = ""; + for (int jj = 0; jj < nprm; jj++) { + double maha; + if (ii != jj) { + maha = limp(nprm, goose(goose( + probe, ii, delta), jj, delta).data(), 0, context) + + limp(nprm, goose(goose( + probe, ii, -delta), jj, -delta).data(), 0, context) + - limp(nprm, goose(goose( + probe, ii, delta), jj, -delta).data(), 0, context) + - limp(nprm, goose(goose( + probe, ii, -delta), jj, delta).data(), 0, context); + maha = maha/4./delta/delta; + } else { + maha = limp(nprm, goose(probe, ii, delta).data(), 0, context) + + limp(nprm, goose(probe, ii, -delta).data(), 0, context) + -2.0*limp(nprm, probe.data(), 0, context); + maha = maha/delta/delta; + } + xout << sep << setprecision(3) << setw(11) << fixed << maha; + sep = ", "; + } + xout << endl; + } + } +} + +// Crude reconnaissance. +// Calculate the decay curves using made-up parameters. +// Not in the critical path. +void do_dk(poissonier& poi, string const ofile) { + if (ofile == "") return; + ofstream xxout; + if (ofile != "-") { + xxout.open(ofile); + } + ostream& xout(ofile != "-" ? xxout : cout); + sitcher sitch; // selected abscissas + sitcher smooth; // all abscissas + + double step(1); + vector<double> prm( {30, .1, 10, .01, 1} ); + + int dt(5); + for (int ii = 0; ii < 500; ii+= dt) { + double tt1(step*ii); + double tt2(tt1 + dt); + double intensity(inty_multi_exp(prm, tt1, tt2, u5)); + int obs = poi.sample(intensity); + sitch.t1.push_back(tt1); + sitch.t2.push_back(tt2); + sitch.obs.push_back(obs); + } + + for (int ii = 0; ii < 500; ii++) { + smooth.t1.push_back(step*ii); + } + +// here with both sitchers fully constructed. + + xout << "Seed:," << poi.graine << endl; +// output both the smooth curves +// and the observations (to the extent they exist) + for (int ii = 0; ii < 500; ii++) { + double tt1(smooth.t1[ii]); + double sminty(inst_multi_exp(prm, tt1, u5)); + xout << tt1 << "," << sminty; + if (ii < (int) sitch.t1.size()) { + double fake(max(0.+sitch.obs[ii], 0.2)); // to facilitate log axes + double tt1(sitch.t1[ii]); + double tt2(sitch.t2[ii]); + double minty(inty_multi_exp(prm, tt1, tt2, u5)); + xout << "," << tt1 + << "," << tt2 + << ",x" + << "," << minty + << "," << minty + << ",x" + << "," << fake + << "," << fake + << ",x"; + } + xout << endl; + } +} + +// Class used by do_entropy. +// Use a class rather than a plain function, +// because it returns multiple results. +struct entroper { + double ptot, stot, ii; + void go(double const rw_obs, double const rw_mod) { + ptot = 0; + stot = 0; + int ii; + int lim = int(ceil(rw_obs)); + lim *= 10; + for (ii = 0; ii < lim; ii++) { + if (ptot > .999999) break; + double pobs = poissonier::pmd(rw_obs, ii); + if (pobs != 0) { + double lpmod = poissonier::lpmd(rw_mod, ii); + ptot += pobs; + stot -= pobs * lpmod; + } else { + // don't do a calculation that could + // possibly multiply zero by -infinity. + } + } + } +}; + +// Out of curiosity, calculate the entropy of the Poisson distribution +// as a function of intensity (lambda). +// Not in the critical path. +void do_entropy(string const ofile) { + if (ofile == "") return; + ofstream xxout; + if (ofile != "-") { + xxout.open(ofile); + } + ostream& xout(ofile != "-" ? xxout : cout); + + double Delta(1.01); + xout << "Delta," << Delta << endl; + xout << "rw, ii, ptot, Scat, Sdog, Semu" << endl; + double ratio(pow(10., .005)); + entroper cat, dog, emu; + for (double rw = 0.01; rw < 1500; rw *= ratio) { + cat.go(rw, rw); + dog.go(rw, rw/Delta); + emu.go(rw, rw*Delta); + xout << rw + << "," << cat.ptot + << "," << cat.ii + << "," << cat.stot + << "," << dog.stot + << "," << emu.stot + << endl; + } +} + +int main(int argc, char * const * argv) { + poissonier poi; + string seed; + string ifile; + arg_parser arg(argc, argv); + arg.fold_case = 1; + string progname = arg.nextRaw(); + string dkfile, entfile, fitfile, gfile; + + for (;;){ + string raw_arg = arg.nextRaw(); // get next keyword + if (arg.fail()) break; + int where = arg.cur.length(); + if (where){ + if (arg.cur[where-1] == ':') arg.cur = arg.cur.substr(0,where-1); + } + if (0) {} + else if (arg.prefix("help") || arg.prefix("-h")) { + usage(); + exit(0); + } + else if (arg.prefix("-ifile")) arg >> ifile; + else if (arg.prefix("-seed")) arg >> seed; + else if (arg.prefix("-entropy")) arg >> entfile; + else if (arg.prefix("-dkcurve")) arg >> dkfile; + else if (arg.prefix("-gfile")) arg >> gfile; + else if (arg.prefix("-fit")) arg >> fitfile; + else { + cerr << "Unrecognized command '" << raw_arg << "'" << endl; + exit(1); + } + } + if (seed == "--") seed = poi.randomize(); + else if (seed != "") poi.seed(seed); + + if (!( fitfile.length() + || dkfile.length() + || entfile.length() + )) + dkfile = "-"; + do_dk(poi, dkfile); + do_fit(poi, ifile, fitfile, gfile); + do_entropy(entfile); +} |