///////////////////////// #include using namespace std; void usage() { cout << R"EoF(Program to fit to Poisson distributed data. Uses actual Poisson probability (i.e. not Gaussians, and therefore not least squares). Typical usage: :; ./poiss -if my_data.csv -fit cu.csv Options include: -h # print this message (and immediately exit) -ifile $fn # input file, date for -fit -gfile $fn # log information about initial guesses for -fit -fit $fn # do the fit; send results to $fn # the rest are for testing and background investigations: -dkcurve $fn # smooth curves and scattered data at fake abscissas -entropy $fn # entropy of the Poisson distro # as a function of the intensity (lambda) -seed $str # seed the RNG with $str -seed -- # seed the RNG from /dev/random To write to stdout, use "-" for the $fn. See ./doit for a working example, including the SS --meld stanzas. )EoF"; } #include /* for setw */ #include #include /* the minimum-finder */ #include "arg_parser.h" #include #include #include #include "parse_csv.h" #include "poissonier.h" // Used for equal weighting: vector uvec(int const ddd) { return vector(ddd, 1.0); } // Class to describe a sitch, i.e. a situation, i.e. // everything the objective function needs to know: struct sitcher { vector norm; // keep the adjustable parameters close to unity vector t1; vector t2; vector obs; // actual observed counts in the interval [t1, t2] }; // Instantaneous rate at time ttt. // Takes parameters two at a time: (amplitude, decay rate). // If there are an odd number of parameters, // the last one is the baseline, i.e. independent of time, // equivalent to a decay with the rate locked at zero. // Decay rates are in nats per second. double inst_multi_exp(vector const& prm, double const ttt, vector const norm) { double rslt(0); int nn = prm.size(); for (int ii = 0; ii < nn; ii+=2){ double amp(prm[ii] * norm[ii]); double rate(1+ii < nn ? prm[1+ii] * norm[1+ii] : 0.); if (rate) { double rate(prm[1+ii]); // use the negative rate: rslt += amp * exp(- rate * ttt); } else { rslt += amp; } } return rslt; } // Format all the elements of a C++ vector, // creating a string suitable for printing. template string dump(vector const arg) { stringstream rslt; string sep = ""; for (T val : arg) { rslt << sep << val; sep = ", "; } return rslt.str(); } // Format all the elements of a counted array, // creating a string suitable for printing. template string dump(int const nn, T const arg[]) { stringstream rslt; string sep = ""; for (int ii = 0; ii < nn; ii++) { rslt << sep << arg[ii]; sep = ","; } return rslt.str(); } // numerically accurate sinh(arg)/arg template inline T sinhc(T arg) { if (abs(arg) < 1e-4) return 1 + arg*arg/6; return sinh(arg)/arg; } struct evendor { double t; // time double n; // number of events }; // Intensity, usually denoted lambda, // i.e. integral of rate(t) dt over the given interval. double inty_multi_exp(vector const& prm, double const t1, double const t2, vector const norm) { double rslt(0); int nn = prm.size(); for (int ii = 0; ii < nn; ii+=2){ double amp(prm[ii] * norm[ii]); double rate(1+ii < nn ? prm[1+ii] * norm[1+ii] : 0.); double mid((t2 + t1) / 2.); double tau(t2 - t1); rslt += amp * exp(-rate*mid) * tau * sinhc(rate*tau/2.); } return rslt; } // the actual objective function // returns total log probability, summed over all data points // // _sitch is unchanged, // but cannot be declared const, because bobyqa wouldn't like that. double limp(unsigned int const nprm, double const* _prm, double * _grad, void * _sitch) { sitcher& sitch(*(sitcher*)(_sitch)); if (_grad != 0) throw invalid_argument("grad"); if (sitch.norm.size() != nprm) throw invalid_argument("norm"); vector prm(_prm, _prm+nprm); if (prm.size() != nprm) throw logic_error("vector ctor"); double rslt(0); int npts(sitch.t1.size()); for (int ii = 0; ii < npts; ii++){ double inty = inty_multi_exp(prm, sitch.t1[ii], sitch.t2[ii], sitch.norm); rslt -= poissonier::lpmd(inty, sitch.obs[ii]); } return rslt; } // Some tricky heuristics. // Guess a starting point for the fitting process // (i.e. minimization process). // A good guess speeds things up and greatly increases // the chance of finding the global minimum (as opposed // to getting stuck in a worthless local minimum, or // diverging to arrant nonsense). struct guesser { int hh; int npts; vector evt; vector model; sitcher sitch; guesser() : hh(0), npts(0) {} void setup(vector > const aoa) { int NR = aoa.size(); // includes header row if (NR == 0) throw invalid_argument("Zero-length input file"); hh = count_header(aoa); npts = NR - hh; if (0) cout << "NR: " << NR << " npts: " << npts << endl; evt = vector(npts); double minute(60); for (int ii = 0; ii < npts; ii++) { // file times are in minutes; convert to SI units here: evt[ii].t = stod(aoa[hh+ii][0]) * minute; evt[ii].n = stod(aoa[hh+ii][1]); } double max_t = evt[npts-1].t; double max_n = evt[npts-1].n; // starting point of the baseline estimate, // as a fraction of the total span of the data, // assuming data starts at zero: double tail_frac = 0.9; int tail_i = -1; // half life (in seconds) of the fast component: double fast_h = 307.2; double fast_a = M_LN2 / fast_h; int fast_i = -1; // half life (in seconds) of the slow component: double slow_h = 45720; double slow_a = M_LN2 / slow_h; double dead(5); // 2**-5 = 3% for (int ii = 0; ii < npts; ii++) { double time = evt[ii].t; if (time < tail_frac * max_t) tail_i = ii; if (time < fast_h * dead) fast_i = ii; } if (0) cout << "npts: " << npts << " : " << max_t << "," << max_n << endl; double tail_t = evt[tail_i].t; double tail_n = evt[tail_i].n; double tail_r = (max_n - tail_n) / (max_t - tail_t); if (0) cout << "tail_i: " << tail_i << " : " << tail_t << "," << tail_n << " " << tail_r << endl; double fast_t = evt[fast_i].t; double fast_n = evt[fast_i].n; if (0) cout << "fast_i: " << fast_i << " : " << fast_t << "," << fast_n // << " " << fast_r << endl; if (tail_i < 0) throw invalid_argument("wtf?"); if (fast_i < 0) throw invalid_argument("wtf?"); // beginning times: double slow_tb = evt[fast_i].t; double slow_nb = evt[fast_i].n; double slow_dt = max_t - slow_tb; double slow_dn = max_n - slow_nb; double slow_mag = slow_dn - slow_dt * tail_r; slow_mag /= (1 - exp(-slow_a * slow_dt) * (1 + slow_dt * slow_a)); // valid at time slow_tb: double slow_rx = slow_mag * slow_a; // valid at time zero: double slow_r = slow_rx * exp(slow_a * slow_tb); double slow_rem = slow_r * exp(-slow_a * max_t); double bl_r = tail_r - slow_rem; if (0) cout << "slow_r: " << slow_r << " slow_rx: " << slow_rx << " slow_mag: " << slow_mag << " slow_rem: " << slow_rem << " tail_r: " << tail_r << " bl_r: " << bl_r << endl; vector slow_model{slow_r, slow_a, bl_r}; int nprm = slow_model.size(); double fast_t0 = evt[0].t; double fast_n0 = evt[0].n; double fast_t1 = evt[fast_i].t; double fast_n1 = evt[fast_i].n; double model_inty = inty_multi_exp(slow_model, fast_t0, fast_t1, uvec(nprm)); double fast_tot = fast_n1 - fast_n0; double fast_dn = fast_tot - model_inty; double fast_r = fast_dn * fast_a; if (0) cout << "fast_r: " << fast_r << " fast_dn: " << fast_dn << " fast_tot: " << fast_tot << " model_inty: " << model_inty << endl; model = vector{fast_r, fast_a}; model.insert(model.end(), slow_model.begin(), slow_model.end()); } // Reformat the data, to make it more useful to // the objective function: void more() { for (int ii = 1; ii < npts; ii++) { sitch.t1.push_back(evt[ii-1].t); sitch.t2.push_back(evt[ii ].t); sitch.obs.push_back(evt[ii].n - evt[ii-1].n); } } // Show the guessed parameters. // Also do a sweep of one variable, as a qualitative sanity check. void show(string const ofile) { if (ofile == "") return; ofstream xxout; if (ofile != "-") { xxout.open(ofile); } ostream& xout(ofile != "-" ? xxout : cout); size_t nprm = sitch.t1.size(); xout << nprm << endl; xout << sitch.t1[nprm-1] << " " << sitch.t2[nprm-1] << " " << sitch.obs[nprm-1] << endl; vector prm(uvec(nprm)); xout << dump(model) << endl; xout << "bl/norm,bl,limp" << endl; double bl0 = prm[4]; void* context = (void*)(&sitch); for (double bl = bl0*.9; bl <= bl0*1.1; bl += bl/100.) { prm[4] = bl; double limpy = limp(nprm, prm.data(), 0, context); xout << bl << "," << bl*sitch.norm[4] << "," << limpy << endl; } } }; // Decode result codes returned by nlopt functions. // This is documented to be part of the nlopt library // but is absent from the version I have. const char *nlopt_result_to_string(nlopt_result result) { switch(result) { case NLOPT_FAILURE: return "FAILURE"; case NLOPT_INVALID_ARGS: return "INVALID_ARGS"; case NLOPT_OUT_OF_MEMORY: return "OUT_OF_MEMORY"; case NLOPT_ROUNDOFF_LIMITED: return "ROUNDOFF_LIMITED"; case NLOPT_FORCED_STOP: return "FORCED_STOP"; case NLOPT_SUCCESS: return "SUCCESS"; case NLOPT_STOPVAL_REACHED: return "STOPVAL_REACHED"; case NLOPT_FTOL_REACHED: return "FTOL_REACHED"; case NLOPT_XTOL_REACHED: return "XTOL_REACHED"; case NLOPT_MAXEVAL_REACHED: return "MAXEVAL_REACHED"; case NLOPT_MAXTIME_REACHED: return "MAXTIME_REACHED"; ///??????? case NLOPT_NUM_RESULTS: return NULL; } return NULL; } // Add something to a particular component of a vector. template vector goose(vector const& vvv, int const ii, T const delta) { vector rslt(vvv); rslt[ii] += delta; return rslt; } void do_fit(poissonier& poi, string const ifile, string const ofile, string const gfile) { if (ifile == "") { if (ofile != "") throw invalid_argument("do_fit needs an input file"); return; } ifstream inx; if (ifile != "-") { inx.open(ifile); if (! inx.good()) { cerr << "Cannot open input '" << ifile << "'" << endl; exit(1); } } istream& in(ifile == "-" ? cin : inx); vector > aoa; aoa = readCSV(in); guesser ggg; ggg.setup(aoa); ggg.more(); ggg.sitch.norm = ggg.model; ggg.show(gfile); int nprm = ggg.model.size(); vector prm = uvec(nprm); vector const lower(nprm, 0.5); vector const upper(nprm, 1.5); nlopt_result rslt = NLOPT_FORCED_STOP; // avoid "unused" warning double limp_end(-9e99); vector found(prm); // will get modified in place int OK(0); try { void* context = (void*)(&ggg.sitch); nlopt_opt oppy = nlopt_create(NLOPT_LN_BOBYQA, nprm); rslt = nlopt_set_lower_bounds1(oppy, 0.5); rslt = nlopt_set_upper_bounds1(oppy, 1.5); rslt = nlopt_set_min_objective(oppy, limp, context); rslt = nlopt_set_xtol_rel(oppy, 1e-7); rslt = nlopt_optimize(oppy, found.data(), &limp_end); OK = 1; } catch (exception& eee) { cout << "Fitting bombed out: " << eee.what() << endl; } if (OK && ofile != "") { string meta = "fast.amp, fast.dk, slow.amp, slow.dk, bl"; ofstream xxout; if (ofile != "-") { xxout.open(ofile); } ostream& xout(ofile != "-" ? xxout : cout); double DoF(ggg.npts - nprm); xout << ifile << endl; cout << "Fit returns:, " << rslt << ", i.e., " << nlopt_result_to_string(rslt) << ", limp:, " << limp_end << ", perdof:, " << limp_end / (ggg.npts - nprm) << endl; xout << "Fit returns:, " << rslt << ", i.e., " << nlopt_result_to_string(rslt) << ", limp:, " << limp_end << ", perdof:, " << limp_end / DoF << endl; xout << endl; xout << "," << meta << endl; xout << "Normed:, " << dump(found) << endl; vector combi(nprm); for (int ii = 0; ii < nprm; ii++) { combi[ii] = found[ii] * ggg.sitch.norm[ii]; } xout << "SI:, " << dump(combi) << endl; vector flip(combi); flip[1] = M_LN2/combi[1]; flip[3] = M_LN2/combi[3]; xout << "½life:, " << dump(flip) << endl; // Calculate the uncertainties. // In particular, calculate the Mahalanobis metric // i.e. the second derivative (Hessian) of the log improbability. vector > covar(nprm, vector(nprm)); vector probe(uvec(nprm)); sitcher sitch(ggg.sitch); sitch.norm = combi; void* context = (void*)(&sitch); double delta(0.01); xout << endl; xout << meta << endl; for (int ii = 0; ii < nprm; ii++) { string sep = ""; for (int jj = 0; jj < nprm; jj++) { double maha; if (ii != jj) { maha = limp(nprm, goose(goose( probe, ii, delta), jj, delta).data(), 0, context) + limp(nprm, goose(goose( probe, ii, -delta), jj, -delta).data(), 0, context) - limp(nprm, goose(goose( probe, ii, delta), jj, -delta).data(), 0, context) - limp(nprm, goose(goose( probe, ii, -delta), jj, delta).data(), 0, context); maha = maha/4./delta/delta; } else { maha = limp(nprm, goose(probe, ii, delta).data(), 0, context) + limp(nprm, goose(probe, ii, -delta).data(), 0, context) -2.0*limp(nprm, probe.data(), 0, context); maha = maha/delta/delta; } xout << sep << setprecision(3) << setw(11) << fixed << maha; sep = ", "; } xout << endl; } } } // Crude reconnaissance. // Calculate the decay curves using made-up parameters. // Not in the critical path. void do_dk(poissonier& poi, string const ofile) { if (ofile == "") return; ofstream xxout; if (ofile != "-") { xxout.open(ofile); } ostream& xout(ofile != "-" ? xxout : cout); sitcher sitch; // selected abscissas sitcher smooth; // all abscissas double step(1); vector prm( {30, .1, 10, .01, 1} ); int nprm(prm.size()); int dt(5); for (int ii = 0; ii < 500; ii+= dt) { double tt1(step*ii); double tt2(tt1 + dt); double intensity(inty_multi_exp(prm, tt1, tt2, uvec(nprm))); int obs = poi.sample(intensity); sitch.t1.push_back(tt1); sitch.t2.push_back(tt2); sitch.obs.push_back(obs); } for (int ii = 0; ii < 500; ii++) { smooth.t1.push_back(step*ii); } // here with both sitchers fully constructed. xout << "Seed:," << poi.graine << endl; // output both the smooth curves // and the observations (to the extent they exist) for (int ii = 0; ii < 500; ii++) { double tt1(smooth.t1[ii]); double sminty(inst_multi_exp(prm, tt1, uvec(nprm))); xout << tt1 << "," << sminty; if (ii < (int) sitch.t1.size()) { double fake(max(0.+sitch.obs[ii], 0.2)); // to facilitate log axes double tt1(sitch.t1[ii]); double tt2(sitch.t2[ii]); double minty(inty_multi_exp(prm, tt1, tt2, uvec(nprm))); xout << "," << tt1 << "," << tt2 << ",x" << "," << minty << "," << minty << ",x" << "," << fake << "," << fake << ",x"; } xout << endl; } } // Class used by do_entropy. // Use a class rather than a plain function, // because it returns multiple results. struct entroper { double ptot, stot, ii; void go(double const rw_obs, double const rw_mod) { ptot = 0; stot = 0; int ii; int lim = int(ceil(rw_obs)); lim *= 10; for (ii = 0; ii < lim; ii++) { if (ptot > .999999) break; double pobs = poissonier::pmd(rw_obs, ii); if (pobs != 0) { double lpmod = poissonier::lpmd(rw_mod, ii); ptot += pobs; stot -= pobs * lpmod; } else { // don't do a calculation that could // possibly multiply zero by -infinity. } } } }; // Out of curiosity, calculate the entropy of the Poisson distribution // as a function of intensity (lambda). // Not in the critical path. void do_entropy(string const ofile) { if (ofile == "") return; ofstream xxout; if (ofile != "-") { xxout.open(ofile); } ostream& xout(ofile != "-" ? xxout : cout); double Delta(1.01); xout << "Delta," << Delta << endl; xout << "rw, ii, ptot, Scat, Sdog, Semu" << endl; double ratio(pow(10., .005)); entroper cat, dog, emu; for (double rw = 0.01; rw < 1500; rw *= ratio) { cat.go(rw, rw); dog.go(rw, rw/Delta); emu.go(rw, rw*Delta); xout << rw << "," << cat.ptot << "," << cat.ii << "," << cat.stot << "," << dog.stot << "," << emu.stot << endl; } } int main(int argc, char * const * argv) { poissonier poi; string seed; string ifile; arg_parser arg(argc, argv); arg.fold_case = 1; string progname = arg.nextRaw(); string dkfile, entfile, fitfile, gfile; for (;;){ string raw_arg = arg.nextRaw(); // get next keyword if (arg.fail()) break; int where = arg.cur.length(); if (where){ if (arg.cur[where-1] == ':') arg.cur = arg.cur.substr(0,where-1); } if (0) {} else if (arg.prefix("help") || arg.prefix("-h")) { usage(); exit(0); } else if (arg.prefix("-ifile")) arg >> ifile; else if (arg.prefix("-seed")) arg >> seed; else if (arg.prefix("-entropy")) arg >> entfile; else if (arg.prefix("-dkcurve")) arg >> dkfile; else if (arg.prefix("-gfile")) arg >> gfile; else if (arg.prefix("-fit")) arg >> fitfile; else { cerr << "Unrecognized command '" << raw_arg << "'" << endl; exit(1); } } if (seed == "--") seed = poi.randomize(); else if (seed != "") poi.seed(seed); if (!( fitfile.length() || dkfile.length() || entfile.length() )) dkfile = "-"; do_dk(poi, dkfile); do_fit(poi, ifile, fitfile, gfile); do_entropy(entfile); }