aboutsummaryrefslogtreecommitdiff
path: root/poiss.c
diff options
context:
space:
mode:
authorJohn Denker <jsd@av8n.com>2021-10-17 10:10:18 -0700
committerJohn Denker <jsd@av8n.com>2021-10-17 11:09:24 -0700
commit74ddd0381aa1b1a90eb0d5300fa576cb2348eeac (patch)
tree72a9dded6f800467d52e479eb37574e6de5f2e6c /poiss.c
parent634d365a03cb0581a062cd3cf4db9ae69f1cde26 (diff)
basically functional, but still a work in progress
Diffstat (limited to 'poiss.c')
-rw-r--r--poiss.c658
1 files changed, 658 insertions, 0 deletions
diff --git a/poiss.c b/poiss.c
new file mode 100644
index 0000000..70a20b9
--- /dev/null
+++ b/poiss.c
@@ -0,0 +1,658 @@
+/////////////////////////
+
+#include <iostream>
+using namespace std;
+
+void usage() {
+ cout << R"EoF(Program to fit to Poisson distributed data.
+Uses actual Poisson probability (i.e. not Gaussians, and
+therefore not least squares).
+
+Options include:
+ -h # print this message (and immediately exit)
+ -ifile $fn # input file, date for -fit
+ -gfile $fn # log information about initial guesses for -fit
+ -fit $fn # do the fit; send results to $fn
+
+# the rest are for testing and background investigations:
+ -dkcurve $fn # smooth curves and scattered data at fake abscissas
+ -entropy $fn # entropy of the Poisson distro
+ # as a function of the intensity (lambda)
+ -seed $str # seed the RNG with $str
+ -seed -- # seed the RNG from /dev/random
+
+To write to stdout, use "-" for the $fn.
+
+See ./doit for a working example, including the SS --meld stanzas.
+)EoF";
+}
+
+#include <iomanip> /* for setw */
+#include <vector>
+#include <nlopt.hpp> /* the minimum-finder */
+#include "arg_parser.h"
+#include <string>
+#include <sstream>
+#include <map>
+#include "parse_csv.h"
+#include "poissonier.h"
+
+// Used for equal weighting:
+vector<double> const u5(5, 1.0);
+
+// Class to describe a sitch, i.e. a situation, i.e.
+// everything the objective function needs to know:
+struct sitcher {
+ vector<double> norm; // keep the adjustable parameters close to unity
+ vector<double> t1;
+ vector<double> t2;
+ vector<int> obs; // actual observed counts in the interval [t1, t2]
+};
+
+// Instantaneous rate at time ttt.
+// Takes parameters two at a time: (amplitude, decay rate).
+// If there are an odd number of parameters,
+// the last one is the baseline, i.e. independent of time,
+// equivalent to a decay with the rate locked at zero.
+// Decay rates are in nats per second.
+double inst_multi_exp(vector<double> const& prm,
+ double const ttt, vector<double> const norm) {
+ double rslt(0);
+ int nn = prm.size();
+ for (int ii = 0; ii < nn; ii+=2){
+ double amp(prm[ii] * norm[ii]);
+ double rate(1+ii < nn ? prm[1+ii] * norm[1+ii] : 0.);
+ if (rate) {
+ double rate(prm[1+ii]);
+// use the negative rate:
+ rslt += amp * exp(- rate * ttt);
+ } else {
+ rslt += amp;
+ }
+ }
+ return rslt;
+}
+
+// Format all the elements of a C++ vector,
+// creating a string suitable for printing.
+template<class T>
+string dump(vector<T> const arg) {
+ stringstream rslt;
+ string sep = "";
+ for (T val : arg) {
+ rslt << sep << val;
+ sep = ", ";
+ }
+ return rslt.str();
+}
+
+// Format all the elements of a counted array,
+// creating a string suitable for printing.
+template<class T>
+string dump(int const nn, T const arg[]) {
+ stringstream rslt;
+ string sep = "";
+ for (int ii = 0; ii < nn; ii++) {
+ rslt << sep << arg[ii];
+ sep = ",";
+ }
+ return rslt.str();
+}
+
+// numerically accurate sinh(arg)/arg
+template<class T>
+inline T sinhc(T arg) {
+ if (abs(arg) < 1e-4) return 1 + arg*arg/6;
+ return sinh(arg)/arg;
+}
+
+struct evendor {
+ double t; // time
+ double n; // number of events
+};
+
+// Intensity, usually denoted lambda,
+// i.e. integral of rate(t) dt over the given interval.
+double inty_multi_exp(vector<double> const& prm,
+ double const t1, double const t2, vector<double> const norm) {
+ double rslt(0);
+ int nn = prm.size();
+ for (int ii = 0; ii < nn; ii+=2){
+ double amp(prm[ii] * norm[ii]);
+ double rate(1+ii < nn ? prm[1+ii] * norm[1+ii] : 0.);
+ double mid((t2 + t1) / 2.);
+ double tau(t2 - t1);
+ rslt += amp * exp(-rate*mid) * tau * sinhc(rate*tau/2.);
+ }
+ return rslt;
+}
+
+// the actual objective function
+// returns total log probability, summed over all data points
+//
+// _sitch is unchanged,
+// but cannot be declared const, because bobyqa wouldn't like that.
+double limp(unsigned int const nprm, double const* _prm,
+ double * _grad, void * _sitch) {
+ sitcher& sitch(*(sitcher*)(_sitch));
+ if (_grad != 0) throw invalid_argument("grad");
+ if (sitch.norm.size() != nprm) throw invalid_argument("norm");
+ vector<double> prm(_prm, _prm+nprm);
+ if (prm.size() != nprm) throw logic_error("vector ctor");
+ double rslt(0);
+ int npts(sitch.t1.size());
+ for (int ii = 0; ii < npts; ii++){
+ double inty = inty_multi_exp(prm, sitch.t1[ii], sitch.t2[ii], sitch.norm);
+ rslt -= poissonier::lpmd(inty, sitch.obs[ii]);
+ }
+ return rslt;
+}
+
+// Scan the data array,
+// looking for records that look like headers
+// (as opposed to numerical data).
+int count_header(vector<vector<string> > const aoa){
+ int NR = aoa.size();
+ int hh = 0;
+ for (; hh < NR; hh++) {
+ if (aoa[hh].size() == 0) continue;
+ string word = aoa[hh][0];
+ size_t where;
+ where = word.find_first_not_of(" ");
+ if (where == string::npos) continue;
+ word = word.substr(where);
+ try {
+ stod(word, &where); // don't care about return value
+ }
+ catch (exception& ee) {
+ continue;
+ }
+ word = word.substr(where);
+ where = word.find_first_not_of(" ");
+ if (where == string::npos) break;
+ }
+ return hh;
+}
+
+// Some tricky heuristics.
+// Guess a starting point for the fitting process
+// (i.e. minimization process).
+// A good guess speeds things up and greatly increases
+// the chance of finding the global minimum (as opposed
+// to getting stuck in a worthless local minimum, or
+// diverging to arrant nonsense).
+struct guesser {
+ int hh;
+ int npts;
+ vector<evendor> evt;
+ vector<double> model;
+ sitcher sitch;
+
+ guesser() : hh(0), npts(0) {}
+
+ void setup(vector<vector<string> > const aoa) {
+ int NR = aoa.size(); // includes header row
+ if (NR == 0) throw invalid_argument("Zero-length input file");
+ hh = count_header(aoa);
+ npts = NR - hh;
+ if (0) cout << "NR: " << NR
+ << " npts: " << npts
+ << endl;
+ evt = vector<evendor>(npts);
+ double minute(60);
+ for (int ii = 0; ii < npts; ii++) {
+ // file times are in minutes; convert to SI units here:
+ evt[ii].t = stod(aoa[hh+ii][0]) * minute;
+ evt[ii].n = stod(aoa[hh+ii][1]);
+ }
+
+ double max_t = evt[npts-1].t;
+ double max_n = evt[npts-1].n;
+
+ // starting point of the baseline estimate,
+ // as a fraction of the total span of the data,
+ // assuming data starts at zero:
+ double tail_frac = 0.9;
+ int tail_i = -1;
+
+ // half life (in seconds) of the fast component:
+ double fast_h = 307.2;
+ double fast_a = M_LN2 / fast_h;
+ int fast_i = -1;
+
+ // half life (in seconds) of the slow component:
+ double slow_h = 45720;
+ double slow_a = M_LN2 / slow_h;
+
+ double dead(5); // 2**-5 = 3%
+
+ for (int ii = 0; ii < npts; ii++) {
+ double time = evt[ii].t;
+ if (time < tail_frac * max_t) tail_i = ii;
+ if (time < fast_h * dead) fast_i = ii;
+ }
+
+ if (0) cout << "npts: " << npts
+ << " : " << max_t
+ << "," << max_n
+ << endl;
+
+ double tail_t = evt[tail_i].t;
+ double tail_n = evt[tail_i].n;
+ double tail_r = (max_n - tail_n) / (max_t - tail_t);
+
+ if (0) cout << "tail_i: " << tail_i
+ << " : " << tail_t
+ << "," << tail_n
+ << " " << tail_r
+ << endl;
+
+ double fast_t = evt[fast_i].t;
+ double fast_n = evt[fast_i].n;
+ if (0) cout << "fast_i: " << fast_i
+ << " : " << fast_t
+ << "," << fast_n
+ // << " " << fast_r
+ << endl;
+
+ if (tail_i < 0) throw invalid_argument("wtf?");
+ if (fast_i < 0) throw invalid_argument("wtf?");
+
+ // beginning times:
+ double slow_tb = evt[fast_i].t;
+ double slow_nb = evt[fast_i].n;
+
+ double slow_dt = max_t - slow_tb;
+ double slow_dn = max_n - slow_nb;
+ double slow_mag = slow_dn - slow_dt * tail_r;
+ slow_mag /= (1 - exp(-slow_a * slow_dt) * (1 + slow_dt * slow_a));
+ // valid at time slow_tb:
+ double slow_rx = slow_mag * slow_a;
+ // valid at time zero:
+ double slow_r = slow_rx * exp(slow_a * slow_tb);
+
+ double slow_rem = slow_r * exp(-slow_a * max_t);
+ double bl_r = tail_r - slow_rem;
+
+ if (0) cout << "slow_r: " << slow_r
+ << " slow_rx: " << slow_rx
+ << " slow_mag: " << slow_mag
+ << " slow_rem: " << slow_rem
+ << " tail_r: " << tail_r
+ << " bl_r: " << bl_r
+ << endl;
+
+ vector<double> slow_model{slow_r, slow_a, bl_r};
+
+ double fast_t0 = evt[0].t;
+ double fast_n0 = evt[0].n;
+ double fast_t1 = evt[fast_i].t;
+ double fast_n1 = evt[fast_i].n;
+ double model_inty = inty_multi_exp(slow_model, fast_t0, fast_t1, u5);
+ double fast_tot = fast_n1 - fast_n0;
+ double fast_dn = fast_tot - model_inty;
+ double fast_r = fast_dn * fast_a;
+ if (0) cout << "fast_r: " << fast_r
+ << " fast_dn: " << fast_dn
+ << " fast_tot: " << fast_tot
+ << " model_inty: " << model_inty
+ << endl;
+
+ model = vector<double>{fast_r, fast_a};
+ model.insert(model.end(), slow_model.begin(), slow_model.end());
+ }
+
+// Reformat the data, to make it more useful to
+// the objective function:
+ void more() {
+ for (int ii = 1; ii < npts; ii++) {
+ sitch.t1.push_back(evt[ii-1].t);
+ sitch.t2.push_back(evt[ii ].t);
+ sitch.obs.push_back(evt[ii].n - evt[ii-1].n);
+ }
+ }
+
+// Show the guessed parameters.
+// Also do a sweep of one variable, as a qualitative sanity check.
+ void show(string const ofile) {
+ if (ofile == "") return;
+ ofstream xxout;
+ if (ofile != "-") {
+ xxout.open(ofile);
+ }
+ ostream& xout(ofile != "-" ? xxout : cout);
+
+ size_t ss = sitch.t1.size();
+ xout << ss << endl;
+ xout << sitch.t1[ss-1]
+ << " " << sitch.t2[ss-1]
+ << " " << sitch.obs[ss-1]
+ << endl;
+
+ vector<double> prm(u5);
+
+ xout << dump(model) << endl;
+
+ xout << "bl/norm,bl,limp" << endl;
+ double bl0 = prm[4];
+ void* context = (void*)(&sitch);
+ int nprm(prm.size());
+
+ for (double bl = bl0*.9; bl <= bl0*1.1; bl += bl/100.) {
+ prm[4] = bl;
+ double limpy = limp(nprm, prm.data(), 0, context);
+ xout << bl
+ << "," << bl*sitch.norm[4]
+ << "," << limpy << endl;
+ }
+ }
+};
+
+// Decode result codes returned by nlopt functions.
+// This is documented to be part of the nlopt library
+// but is absent from the version I have.
+const char *nlopt_result_to_string(nlopt_result result)
+{
+ switch(result)
+ {
+ case NLOPT_FAILURE: return "FAILURE";
+ case NLOPT_INVALID_ARGS: return "INVALID_ARGS";
+ case NLOPT_OUT_OF_MEMORY: return "OUT_OF_MEMORY";
+ case NLOPT_ROUNDOFF_LIMITED: return "ROUNDOFF_LIMITED";
+ case NLOPT_FORCED_STOP: return "FORCED_STOP";
+ case NLOPT_SUCCESS: return "SUCCESS";
+ case NLOPT_STOPVAL_REACHED: return "STOPVAL_REACHED";
+ case NLOPT_FTOL_REACHED: return "FTOL_REACHED";
+ case NLOPT_XTOL_REACHED: return "XTOL_REACHED";
+ case NLOPT_MAXEVAL_REACHED: return "MAXEVAL_REACHED";
+ case NLOPT_MAXTIME_REACHED: return "MAXTIME_REACHED";
+///??????? case NLOPT_NUM_RESULTS: return NULL;
+ }
+ return NULL;
+}
+
+// Add something to a particular component of a vector.
+template<class T>
+vector<T> goose(vector<T> const& vvv, int const ii, T const delta) {
+ vector<T> rslt(vvv);
+ rslt[ii] += delta;
+ return rslt;
+}
+
+void do_fit(poissonier& poi, string const ifile,
+ string const ofile, string const gfile) {
+ if (ifile == "") {
+ if (ofile != "") throw invalid_argument("do_fit needs an input file");
+ return;
+ }
+
+ ifstream inx;
+ if (ifile != "-") {
+ inx.open(ifile);
+ if (! inx.good()) {
+ cerr << "Cannot open input '" << ifile << "'" << endl;
+ exit(1);
+ }
+ }
+
+ istream& in(ifile == "-" ? cin : inx);
+ vector<vector<string> > aoa;
+ aoa = readCSV<string>(in);
+ guesser ggg;
+ ggg.setup(aoa);
+ ggg.more();
+ ggg.sitch.norm = ggg.model;
+ ggg.show(gfile);
+
+ vector<double> prm = u5;
+ int nprm(prm.size());
+ vector<double> const lower(nprm, 0.5);
+ vector<double> const upper(nprm, 1.5);
+
+ nlopt_result rslt = NLOPT_FORCED_STOP; // avoid "unused" warning
+ double limp_end(-9e99);
+ vector<double> found(prm); // will get modified in place
+ int OK(0);
+ try {
+ void* context = (void*)(&ggg.sitch);
+ nlopt_opt oppy = nlopt_create(NLOPT_LN_BOBYQA, nprm);
+ rslt = nlopt_set_lower_bounds1(oppy, 0.5);
+ rslt = nlopt_set_upper_bounds1(oppy, 1.5);
+ rslt = nlopt_set_min_objective(oppy, limp, context);
+ rslt = nlopt_set_xtol_rel(oppy, 1e-7);
+ rslt = nlopt_optimize(oppy, found.data(), &limp_end);
+ OK = 1;
+ }
+ catch (exception& eee) {
+ cout << "Fitting bombed out: " << eee.what() << endl;
+ }
+
+ if (OK && ofile != "") {
+ ofstream xxout;
+ if (ofile != "-") {
+ xxout.open(ofile);
+ }
+ ostream& xout(ofile != "-" ? xxout : cout);
+
+ double DoF(ggg.npts - nprm);
+ xout << ifile << endl;
+ cout << "Fit returns:, " << rslt
+ << ", i.e., " << nlopt_result_to_string(rslt)
+ << ", limp:, " << limp_end
+ << ", perdof:, " << limp_end / (ggg.npts - nprm)
+ << endl;
+ xout << "Fit returns:, " << rslt
+ << ", i.e., " << nlopt_result_to_string(rslt)
+ << ", limp:, " << limp_end
+ << ", perdof:, " << limp_end / DoF
+ << endl;
+ xout << endl;
+ xout << ",fast.amp, fast.dk, slow.amp, slow.dk, bl" << endl;
+ xout << "Normed:, " << dump(found) << endl;
+ vector<double> combi(nprm);
+ for (int ii = 0; ii < nprm; ii++) {
+ combi[ii] = found[ii] * ggg.sitch.norm[ii];
+ }
+ xout << "SI:, " << dump(combi) << endl;
+ vector<double> flip(combi);
+ flip[1] = M_LN2/combi[1];
+ flip[3] = M_LN2/combi[3];
+ xout << "½life:, " << dump(flip) << endl;
+
+// Calculate the uncertainties.
+// In particular, calculate the Mahalanobis metric
+// i.e. the second derivative (Hessian) of the log improbability.
+ vector<vector<double> > covar(nprm, vector<double>(nprm));
+ vector<double> probe(u5);
+ sitcher sitch(ggg.sitch);
+ sitch.norm = combi;
+ void* context = (void*)(&sitch);
+ double delta(0.01);
+ xout << endl;
+ for (int ii = 0; ii < nprm; ii++) {
+ string sep = "";
+ for (int jj = 0; jj < nprm; jj++) {
+ double maha;
+ if (ii != jj) {
+ maha = limp(nprm, goose(goose(
+ probe, ii, delta), jj, delta).data(), 0, context)
+ + limp(nprm, goose(goose(
+ probe, ii, -delta), jj, -delta).data(), 0, context)
+ - limp(nprm, goose(goose(
+ probe, ii, delta), jj, -delta).data(), 0, context)
+ - limp(nprm, goose(goose(
+ probe, ii, -delta), jj, delta).data(), 0, context);
+ maha = maha/4./delta/delta;
+ } else {
+ maha = limp(nprm, goose(probe, ii, delta).data(), 0, context)
+ + limp(nprm, goose(probe, ii, -delta).data(), 0, context)
+ -2.0*limp(nprm, probe.data(), 0, context);
+ maha = maha/delta/delta;
+ }
+ xout << sep << setprecision(3) << setw(11) << fixed << maha;
+ sep = ", ";
+ }
+ xout << endl;
+ }
+ }
+}
+
+// Crude reconnaissance.
+// Calculate the decay curves using made-up parameters.
+// Not in the critical path.
+void do_dk(poissonier& poi, string const ofile) {
+ if (ofile == "") return;
+ ofstream xxout;
+ if (ofile != "-") {
+ xxout.open(ofile);
+ }
+ ostream& xout(ofile != "-" ? xxout : cout);
+ sitcher sitch; // selected abscissas
+ sitcher smooth; // all abscissas
+
+ double step(1);
+ vector<double> prm( {30, .1, 10, .01, 1} );
+
+ int dt(5);
+ for (int ii = 0; ii < 500; ii+= dt) {
+ double tt1(step*ii);
+ double tt2(tt1 + dt);
+ double intensity(inty_multi_exp(prm, tt1, tt2, u5));
+ int obs = poi.sample(intensity);
+ sitch.t1.push_back(tt1);
+ sitch.t2.push_back(tt2);
+ sitch.obs.push_back(obs);
+ }
+
+ for (int ii = 0; ii < 500; ii++) {
+ smooth.t1.push_back(step*ii);
+ }
+
+// here with both sitchers fully constructed.
+
+ xout << "Seed:," << poi.graine << endl;
+// output both the smooth curves
+// and the observations (to the extent they exist)
+ for (int ii = 0; ii < 500; ii++) {
+ double tt1(smooth.t1[ii]);
+ double sminty(inst_multi_exp(prm, tt1, u5));
+ xout << tt1 << "," << sminty;
+ if (ii < (int) sitch.t1.size()) {
+ double fake(max(0.+sitch.obs[ii], 0.2)); // to facilitate log axes
+ double tt1(sitch.t1[ii]);
+ double tt2(sitch.t2[ii]);
+ double minty(inty_multi_exp(prm, tt1, tt2, u5));
+ xout << "," << tt1
+ << "," << tt2
+ << ",x"
+ << "," << minty
+ << "," << minty
+ << ",x"
+ << "," << fake
+ << "," << fake
+ << ",x";
+ }
+ xout << endl;
+ }
+}
+
+// Class used by do_entropy.
+// Use a class rather than a plain function,
+// because it returns multiple results.
+struct entroper {
+ double ptot, stot, ii;
+ void go(double const rw_obs, double const rw_mod) {
+ ptot = 0;
+ stot = 0;
+ int ii;
+ int lim = int(ceil(rw_obs));
+ lim *= 10;
+ for (ii = 0; ii < lim; ii++) {
+ if (ptot > .999999) break;
+ double pobs = poissonier::pmd(rw_obs, ii);
+ if (pobs != 0) {
+ double lpmod = poissonier::lpmd(rw_mod, ii);
+ ptot += pobs;
+ stot -= pobs * lpmod;
+ } else {
+ // don't do a calculation that could
+ // possibly multiply zero by -infinity.
+ }
+ }
+ }
+};
+
+// Out of curiosity, calculate the entropy of the Poisson distribution
+// as a function of intensity (lambda).
+// Not in the critical path.
+void do_entropy(string const ofile) {
+ if (ofile == "") return;
+ ofstream xxout;
+ if (ofile != "-") {
+ xxout.open(ofile);
+ }
+ ostream& xout(ofile != "-" ? xxout : cout);
+
+ double Delta(1.01);
+ xout << "Delta," << Delta << endl;
+ xout << "rw, ii, ptot, Scat, Sdog, Semu" << endl;
+ double ratio(pow(10., .005));
+ entroper cat, dog, emu;
+ for (double rw = 0.01; rw < 1500; rw *= ratio) {
+ cat.go(rw, rw);
+ dog.go(rw, rw/Delta);
+ emu.go(rw, rw*Delta);
+ xout << rw
+ << "," << cat.ptot
+ << "," << cat.ii
+ << "," << cat.stot
+ << "," << dog.stot
+ << "," << emu.stot
+ << endl;
+ }
+}
+
+int main(int argc, char * const * argv) {
+ poissonier poi;
+ string seed;
+ string ifile;
+ arg_parser arg(argc, argv);
+ arg.fold_case = 1;
+ string progname = arg.nextRaw();
+ string dkfile, entfile, fitfile, gfile;
+
+ for (;;){
+ string raw_arg = arg.nextRaw(); // get next keyword
+ if (arg.fail()) break;
+ int where = arg.cur.length();
+ if (where){
+ if (arg.cur[where-1] == ':') arg.cur = arg.cur.substr(0,where-1);
+ }
+ if (0) {}
+ else if (arg.prefix("help") || arg.prefix("-h")) {
+ usage();
+ exit(0);
+ }
+ else if (arg.prefix("-ifile")) arg >> ifile;
+ else if (arg.prefix("-seed")) arg >> seed;
+ else if (arg.prefix("-entropy")) arg >> entfile;
+ else if (arg.prefix("-dkcurve")) arg >> dkfile;
+ else if (arg.prefix("-gfile")) arg >> gfile;
+ else if (arg.prefix("-fit")) arg >> fitfile;
+ else {
+ cerr << "Unrecognized command '" << raw_arg << "'" << endl;
+ exit(1);
+ }
+ }
+ if (seed == "--") seed = poi.randomize();
+ else if (seed != "") poi.seed(seed);
+
+ if (!( fitfile.length()
+ || dkfile.length()
+ || entfile.length()
+ ))
+ dkfile = "-";
+ do_dk(poi, dkfile);
+ do_fit(poi, ifile, fitfile, gfile);
+ do_entropy(entfile);
+}