diff options
| author | John Denker <jsd@av8n.com> | 2021-10-17 10:10:18 -0700 | 
|---|---|---|
| committer | John Denker <jsd@av8n.com> | 2021-10-17 11:09:24 -0700 | 
| commit | 74ddd0381aa1b1a90eb0d5300fa576cb2348eeac (patch) | |
| tree | 72a9dded6f800467d52e479eb37574e6de5f2e6c /poiss.c | |
| parent | 634d365a03cb0581a062cd3cf4db9ae69f1cde26 (diff) | |
basically functional, but still a work in progress
Diffstat (limited to 'poiss.c')
| -rw-r--r-- | poiss.c | 658 | 
1 files changed, 658 insertions, 0 deletions
| @@ -0,0 +1,658 @@ +///////////////////////// + +#include <iostream> +using namespace std; + +void usage() { +  cout << R"EoF(Program to fit to Poisson distributed data. +Uses actual Poisson probability (i.e. not Gaussians, and +therefore not least squares). + +Options include: +  -h                    # print this message (and immediately exit) +  -ifile   $fn          # input file, date for -fit +  -gfile   $fn          # log information about initial guesses for -fit +  -fit     $fn          # do the fit; send results to $fn + +# the rest are for testing and background investigations: +  -dkcurve $fn          # smooth curves and scattered data at fake abscissas +  -entropy $fn          # entropy of the Poisson distro +                        #  as a function of the intensity (lambda) +  -seed    $str         # seed the RNG with $str +  -seed    --           # seed the RNG from /dev/random + +To write to stdout, use "-" for the $fn. + +See ./doit for a working example, including the SS --meld stanzas. +)EoF"; +} + +#include <iomanip>      /* for setw */ +#include <vector> +#include <nlopt.hpp>    /* the minimum-finder */ +#include "arg_parser.h" +#include <string> +#include <sstream> +#include <map> +#include "parse_csv.h" +#include "poissonier.h" + +// Used for equal weighting: +vector<double> const u5(5, 1.0); + +// Class to describe a sitch, i.e. a situation, i.e. +// everything the objective function needs to know: +struct sitcher { +  vector<double> norm;    // keep the adjustable parameters close to unity +  vector<double> t1; +  vector<double> t2; +  vector<int> obs;      // actual observed counts in the interval [t1, t2] +}; + +// Instantaneous rate at time ttt. +// Takes parameters two at a time: (amplitude, decay rate). +// If there are an odd number of parameters, +// the last one is the baseline, i.e. independent of time, +// equivalent to a decay with the rate locked at zero. +// Decay rates are in nats per second. +double inst_multi_exp(vector<double> const& prm, +         double const ttt, vector<double> const norm) { +  double rslt(0); +  int nn = prm.size(); +  for (int ii = 0; ii < nn; ii+=2){ +    double amp(prm[ii] * norm[ii]); +    double rate(1+ii < nn ? prm[1+ii] * norm[1+ii] : 0.); +    if (rate) { +      double rate(prm[1+ii]); +// use the negative rate: +      rslt += amp * exp(- rate * ttt); +    } else { +      rslt += amp; +    } +  } +  return rslt; +} + +// Format all the elements of a C++ vector, +// creating a string suitable for printing. +template<class T> +string dump(vector<T> const arg) { +  stringstream rslt; +  string sep = ""; +  for (T val : arg) { +    rslt << sep << val; +    sep = ", "; +  } +  return rslt.str(); +} + +// Format all the elements of a counted array, +// creating a string suitable for printing. +template<class T> +string dump(int const nn, T const arg[]) { +  stringstream rslt; +  string sep = ""; +  for (int ii = 0; ii < nn; ii++) { +    rslt << sep << arg[ii]; +    sep = ","; +  } +  return rslt.str(); +} + +// numerically accurate sinh(arg)/arg +template<class T> +inline T sinhc(T arg) { +  if (abs(arg) < 1e-4) return 1 + arg*arg/6; +  return sinh(arg)/arg; +} + +struct evendor { +  double t;               // time +  double n;               // number of events +}; + +// Intensity, usually denoted lambda, +// i.e. integral of rate(t) dt over the given interval. +double inty_multi_exp(vector<double> const& prm, +        double const t1, double const t2, vector<double> const norm) { +  double rslt(0); +  int nn = prm.size(); +  for (int ii = 0; ii < nn; ii+=2){ +    double amp(prm[ii] * norm[ii]); +    double rate(1+ii < nn ? prm[1+ii] * norm[1+ii] : 0.); +    double mid((t2 + t1) / 2.); +    double tau(t2 - t1); +    rslt += amp * exp(-rate*mid) * tau * sinhc(rate*tau/2.); +  } +  return rslt; +} + +// the actual objective function +// returns total log probability, summed over all data points +// +// _sitch is unchanged, +// but cannot be declared const, because bobyqa wouldn't like that. +double limp(unsigned int const nprm, double const* _prm, +        double * _grad, void * _sitch) { +  sitcher& sitch(*(sitcher*)(_sitch)); +  if (_grad != 0) throw invalid_argument("grad"); +  if (sitch.norm.size() != nprm) throw invalid_argument("norm"); +  vector<double> prm(_prm, _prm+nprm); +  if (prm.size() != nprm) throw logic_error("vector ctor"); +  double rslt(0); +  int npts(sitch.t1.size()); +  for (int ii = 0; ii < npts; ii++){ +    double inty = inty_multi_exp(prm, sitch.t1[ii], sitch.t2[ii], sitch.norm); +    rslt -= poissonier::lpmd(inty, sitch.obs[ii]); +  } +  return rslt; +} + +// Scan the data array, +// looking for records that look like headers +// (as opposed to numerical data). +int count_header(vector<vector<string> > const aoa){ +  int NR = aoa.size(); +  int hh = 0; +  for (; hh < NR; hh++) { +    if (aoa[hh].size() == 0) continue; +    string word = aoa[hh][0]; +    size_t where; +    where = word.find_first_not_of(" "); +    if (where == string::npos) continue; +    word = word.substr(where); +    try { +      stod(word, &where);       // don't care about return value +    } +    catch (exception& ee) { +      continue; +    } +    word = word.substr(where); +    where = word.find_first_not_of(" "); +    if (where == string::npos) break; +  } +  return hh; +} + +// Some tricky heuristics. +// Guess a starting point for the fitting process +// (i.e. minimization process). +// A good guess speeds things up and greatly increases +// the chance of finding the global minimum (as opposed +// to getting stuck in a worthless local minimum, or +// diverging to arrant nonsense). +struct guesser { +  int hh; +  int npts; +  vector<evendor> evt; +  vector<double> model; +  sitcher sitch; + +  guesser() : hh(0), npts(0) {} + +  void setup(vector<vector<string> > const aoa) { +    int NR = aoa.size();         // includes header row +    if (NR == 0) throw invalid_argument("Zero-length input file"); +    hh = count_header(aoa); +    npts = NR - hh; +    if (0) cout << "NR: " << NR +         << "  npts: " << npts +         << endl; +    evt = vector<evendor>(npts); +    double minute(60); +    for (int ii = 0; ii < npts; ii++) { +  // file times are in minutes; convert to SI units here: +      evt[ii].t = stod(aoa[hh+ii][0]) * minute; +      evt[ii].n = stod(aoa[hh+ii][1]); +    } + +    double max_t = evt[npts-1].t; +    double max_n = evt[npts-1].n; + +  // starting point of the baseline estimate, +  // as a fraction of the total span of the data, +  // assuming data starts at zero: +    double tail_frac = 0.9; +    int tail_i = -1; + +  // half life (in seconds) of the fast component: +    double fast_h = 307.2; +    double fast_a = M_LN2 / fast_h; +    int fast_i = -1; + +  // half life (in seconds) of the slow component: +    double slow_h = 45720; +    double slow_a = M_LN2 / slow_h; + +    double dead(5);       // 2**-5 = 3% + +    for (int ii = 0; ii < npts; ii++) { +      double time = evt[ii].t; +      if (time < tail_frac * max_t) tail_i = ii; +      if (time < fast_h * dead) fast_i = ii; +    } + +    if (0) cout << "npts: " << npts +          << " : " << max_t +          << "," << max_n +          << endl; + +    double tail_t = evt[tail_i].t; +    double tail_n = evt[tail_i].n; +    double tail_r = (max_n - tail_n) / (max_t - tail_t); + +    if (0) cout << "tail_i:   " << tail_i +          << " : " << tail_t +          << ","   << tail_n +          << "    " << tail_r +          << endl; + +    double fast_t = evt[fast_i].t; +    double fast_n = evt[fast_i].n; +    if (0) cout << "fast_i:   " << fast_i +          << " : " << fast_t +          << ","   << fast_n +  //        << "    " << fast_r +          << endl; + +    if (tail_i < 0) throw invalid_argument("wtf?"); +    if (fast_i < 0) throw invalid_argument("wtf?"); + +  // beginning times: +    double slow_tb = evt[fast_i].t; +    double slow_nb = evt[fast_i].n; + +    double slow_dt = max_t - slow_tb; +    double slow_dn = max_n - slow_nb; +    double slow_mag = slow_dn - slow_dt * tail_r; +    slow_mag /= (1 - exp(-slow_a * slow_dt) * (1 + slow_dt * slow_a)); +  // valid at time slow_tb: +    double slow_rx = slow_mag * slow_a; +  // valid at time zero: +    double slow_r = slow_rx * exp(slow_a * slow_tb); + +    double slow_rem = slow_r * exp(-slow_a * max_t); +    double bl_r = tail_r - slow_rem; + +    if (0) cout << "slow_r: " << slow_r +          << "  slow_rx: " << slow_rx +          << "  slow_mag: " << slow_mag +          << "  slow_rem: " << slow_rem +          << "  tail_r: " << tail_r +          << "  bl_r: " << bl_r +          << endl; + +    vector<double> slow_model{slow_r, slow_a, bl_r}; + +    double fast_t0 = evt[0].t; +    double fast_n0 = evt[0].n; +    double fast_t1 = evt[fast_i].t; +    double fast_n1 = evt[fast_i].n; +    double model_inty  = inty_multi_exp(slow_model, fast_t0, fast_t1, u5); +    double fast_tot = fast_n1 - fast_n0; +    double fast_dn = fast_tot - model_inty; +    double fast_r = fast_dn * fast_a; +    if (0) cout << "fast_r: " << fast_r +           << " fast_dn: " << fast_dn +          << "  fast_tot: " << fast_tot +          << "  model_inty: " << model_inty +          << endl; + +    model = vector<double>{fast_r, fast_a}; +    model.insert(model.end(), slow_model.begin(), slow_model.end()); +  } + +// Reformat the data, to make it more useful to +// the objective function: +  void more() { +    for (int ii = 1; ii < npts; ii++) { +      sitch.t1.push_back(evt[ii-1].t); +      sitch.t2.push_back(evt[ii  ].t); +      sitch.obs.push_back(evt[ii].n - evt[ii-1].n); +    } +  } + +// Show the guessed parameters. +// Also do a sweep of one variable, as a qualitative sanity check. +  void show(string const ofile) { +    if (ofile == "") return; +    ofstream xxout; +    if (ofile != "-") { +      xxout.open(ofile); +    } +    ostream& xout(ofile != "-" ? xxout : cout); + +    size_t ss = sitch.t1.size(); +    xout << ss << endl; +    xout << sitch.t1[ss-1] +          << "  " << sitch.t2[ss-1] +          << "  " << sitch.obs[ss-1] +          << endl; + +    vector<double> prm(u5); + +    xout << dump(model) << endl; + +    xout << "bl/norm,bl,limp" << endl; +    double bl0 = prm[4]; +    void* context = (void*)(&sitch); +    int nprm(prm.size()); + +    for (double bl = bl0*.9; bl <= bl0*1.1; bl += bl/100.) { +      prm[4] = bl; +      double limpy = limp(nprm, prm.data(), 0, context); +      xout << bl +          << "," << bl*sitch.norm[4] +          << "," << limpy << endl; +    } +  } +}; + +// Decode result codes returned by nlopt functions. +// This is documented to be part of the nlopt library +// but is absent from the version I have. +const char *nlopt_result_to_string(nlopt_result result) +{ +  switch(result) +  { +    case NLOPT_FAILURE: return "FAILURE"; +    case NLOPT_INVALID_ARGS: return "INVALID_ARGS"; +    case NLOPT_OUT_OF_MEMORY: return "OUT_OF_MEMORY"; +    case NLOPT_ROUNDOFF_LIMITED: return "ROUNDOFF_LIMITED"; +    case NLOPT_FORCED_STOP: return "FORCED_STOP"; +    case NLOPT_SUCCESS: return "SUCCESS"; +    case NLOPT_STOPVAL_REACHED: return "STOPVAL_REACHED"; +    case NLOPT_FTOL_REACHED: return "FTOL_REACHED"; +    case NLOPT_XTOL_REACHED: return "XTOL_REACHED"; +    case NLOPT_MAXEVAL_REACHED: return "MAXEVAL_REACHED"; +    case NLOPT_MAXTIME_REACHED: return "MAXTIME_REACHED"; +///???????    case NLOPT_NUM_RESULTS: return NULL; +  } +  return NULL; +} + +// Add something to a particular component of a vector. +template<class T> +vector<T> goose(vector<T> const& vvv, int const ii, T const delta) { +  vector<T> rslt(vvv); +  rslt[ii] += delta; +  return rslt; +} + +void do_fit(poissonier& poi, string const ifile, +        string const ofile, string const gfile) { +  if (ifile == "") { +    if (ofile != "") throw invalid_argument("do_fit needs an input file"); +    return; +  } + +  ifstream inx; +  if (ifile != "-") { +    inx.open(ifile); +    if (! inx.good()) { +      cerr << "Cannot open input '" << ifile << "'" << endl; +      exit(1); +    } +  } + +  istream& in(ifile == "-" ? cin : inx); +  vector<vector<string> > aoa; +  aoa = readCSV<string>(in); +  guesser ggg; +  ggg.setup(aoa); +  ggg.more(); +  ggg.sitch.norm = ggg.model; +  ggg.show(gfile); + +  vector<double> prm = u5; +  int nprm(prm.size()); +  vector<double> const lower(nprm, 0.5); +  vector<double> const upper(nprm, 1.5); + +  nlopt_result rslt = NLOPT_FORCED_STOP; // avoid "unused" warning +  double limp_end(-9e99); +  vector<double> found(prm);       // will get modified in place +  int OK(0); +  try { +    void* context = (void*)(&ggg.sitch); +    nlopt_opt oppy = nlopt_create(NLOPT_LN_BOBYQA, nprm); +    rslt = nlopt_set_lower_bounds1(oppy, 0.5); +    rslt = nlopt_set_upper_bounds1(oppy, 1.5); +    rslt = nlopt_set_min_objective(oppy, limp, context); +    rslt = nlopt_set_xtol_rel(oppy, 1e-7); +    rslt = nlopt_optimize(oppy, found.data(), &limp_end); +    OK = 1; +  } +  catch (exception& eee) { +    cout << "Fitting bombed out: " << eee.what() << endl; +  } + +  if (OK && ofile != "") { +    ofstream xxout; +    if (ofile != "-") { +      xxout.open(ofile); +    } +    ostream& xout(ofile != "-" ? xxout : cout); + +    double DoF(ggg.npts - nprm); +    xout << ifile << endl; +    cout << "Fit returns:, " << rslt +          << ", i.e., " << nlopt_result_to_string(rslt) +          << ",   limp:, " << limp_end +          << ",   perdof:, " << limp_end / (ggg.npts - nprm) +          << endl; +    xout << "Fit returns:, " << rslt +          << ", i.e., " << nlopt_result_to_string(rslt) +          << ",   limp:, " << limp_end +          << ",   perdof:, " << limp_end / DoF +          << endl; +    xout << endl; +    xout << ",fast.amp, fast.dk, slow.amp, slow.dk, bl" << endl; +    xout << "Normed:, " << dump(found) << endl; +    vector<double> combi(nprm); +    for (int ii = 0; ii < nprm; ii++) { +      combi[ii] = found[ii] * ggg.sitch.norm[ii]; +    } +    xout << "SI:,     " << dump(combi) << endl; +    vector<double> flip(combi); +    flip[1] = M_LN2/combi[1]; +    flip[3] = M_LN2/combi[3]; +    xout << "½life:,  " << dump(flip) << endl; + +// Calculate the uncertainties. +// In particular, calculate the Mahalanobis metric +// i.e. the second derivative (Hessian) of the log improbability. +    vector<vector<double> > covar(nprm, vector<double>(nprm)); +    vector<double> probe(u5); +    sitcher sitch(ggg.sitch); +    sitch.norm = combi; +    void* context = (void*)(&sitch); +    double delta(0.01); +    xout << endl; +    for (int ii = 0; ii < nprm; ii++) { +      string sep = ""; +      for (int jj = 0; jj < nprm; jj++) { +        double maha; +        if (ii != jj) { +          maha = limp(nprm, goose(goose( +            probe, ii,  delta), jj,  delta).data(), 0, context) +              + limp(nprm, goose(goose( +            probe, ii, -delta), jj, -delta).data(), 0, context) +              - limp(nprm, goose(goose( +            probe, ii,  delta), jj, -delta).data(), 0, context) +              - limp(nprm, goose(goose( +            probe, ii, -delta), jj,  delta).data(), 0, context); +            maha = maha/4./delta/delta; +        } else { +          maha = limp(nprm, goose(probe, ii,  delta).data(), 0, context) +              + limp(nprm, goose(probe, ii, -delta).data(), 0, context) +              -2.0*limp(nprm, probe.data(), 0, context); +          maha = maha/delta/delta; +        } +        xout << sep << setprecision(3) << setw(11) << fixed << maha; +        sep = ", "; +      } +      xout << endl; +    } +  } +} + +// Crude reconnaissance. +// Calculate the decay curves using made-up parameters. +// Not in the critical path. +void do_dk(poissonier& poi, string const ofile) { +  if (ofile == "") return; +  ofstream xxout; +  if (ofile != "-") { +    xxout.open(ofile); +  } +  ostream& xout(ofile != "-" ? xxout : cout); +  sitcher sitch;        // selected abscissas +  sitcher smooth;       // all abscissas + +  double step(1); +  vector<double> prm( {30, .1, 10, .01, 1} ); + +  int dt(5); +  for (int ii = 0; ii < 500; ii+= dt) { +    double tt1(step*ii); +    double tt2(tt1 + dt); +    double intensity(inty_multi_exp(prm, tt1, tt2, u5)); +    int obs = poi.sample(intensity); +    sitch.t1.push_back(tt1); +    sitch.t2.push_back(tt2); +    sitch.obs.push_back(obs); +  } + +  for (int ii = 0; ii < 500; ii++) { +    smooth.t1.push_back(step*ii); +  } + +// here with both sitchers fully constructed. + +  xout << "Seed:," << poi.graine << endl; +// output both the smooth curves +// and the observations (to the extent they exist) +  for (int ii = 0; ii < 500; ii++) { +    double tt1(smooth.t1[ii]); +    double sminty(inst_multi_exp(prm, tt1, u5)); +    xout << tt1 << "," << sminty; +    if (ii < (int) sitch.t1.size()) { +      double fake(max(0.+sitch.obs[ii], 0.2));   // to facilitate log axes +      double tt1(sitch.t1[ii]); +      double tt2(sitch.t2[ii]); +      double minty(inty_multi_exp(prm, tt1, tt2, u5)); +      xout << "," << tt1 +           << "," << tt2 +           << ",x" +           << "," << minty +           << "," << minty +           << ",x" +           << "," << fake +           << "," << fake +           << ",x"; +    } +    xout << endl; +  } +} + +// Class used by do_entropy. +// Use a class rather than a plain function, +// because it returns multiple results. +struct entroper { + double ptot, stot, ii; + void go(double const rw_obs, double const rw_mod) { +    ptot = 0; +    stot = 0; +    int ii; +    int lim = int(ceil(rw_obs)); +    lim *= 10; +    for (ii = 0; ii < lim; ii++) { +      if (ptot > .999999) break; +      double pobs = poissonier::pmd(rw_obs, ii); +      if (pobs != 0) { +        double lpmod = poissonier::lpmd(rw_mod, ii); +        ptot += pobs; +        stot -= pobs * lpmod; +      } else { +        // don't do a calculation that could +        // possibly multiply zero by -infinity. +      } +    } +  } +}; + +// Out of curiosity, calculate the entropy of the Poisson distribution +// as a function of intensity (lambda). +// Not in the critical path. +void do_entropy(string const ofile) { +  if (ofile == "") return; +  ofstream xxout; +  if (ofile != "-") { +    xxout.open(ofile); +  } +  ostream& xout(ofile != "-" ? xxout : cout); + +  double Delta(1.01); +  xout << "Delta," << Delta << endl; +  xout << "rw, ii, ptot, Scat, Sdog, Semu" << endl; +  double ratio(pow(10., .005)); +  entroper cat, dog, emu; +  for (double rw = 0.01; rw < 1500; rw *= ratio) { +    cat.go(rw, rw); +    dog.go(rw, rw/Delta); +    emu.go(rw, rw*Delta); +    xout << rw +        << "," << cat.ptot +        << "," << cat.ii +        << "," << cat.stot +        << "," << dog.stot +        << "," << emu.stot +        << endl; +  } +} + +int main(int argc, char * const * argv) { +  poissonier poi; +  string seed; +  string ifile; +  arg_parser arg(argc, argv); +  arg.fold_case = 1; +  string progname = arg.nextRaw(); +  string dkfile, entfile, fitfile, gfile; + +  for (;;){ +    string raw_arg = arg.nextRaw();         // get next keyword +    if (arg.fail()) break; +    int where = arg.cur.length(); +    if (where){ +      if (arg.cur[where-1] == ':') arg.cur = arg.cur.substr(0,where-1); +    } +    if (0) {} +    else if (arg.prefix("help") || arg.prefix("-h")) { +      usage(); +      exit(0); +    } +    else if (arg.prefix("-ifile")) arg >> ifile; +    else if (arg.prefix("-seed")) arg >> seed; +    else if (arg.prefix("-entropy")) arg >> entfile; +    else if (arg.prefix("-dkcurve")) arg >> dkfile; +    else if (arg.prefix("-gfile")) arg >> gfile; +    else if (arg.prefix("-fit")) arg >> fitfile; +    else { +      cerr << "Unrecognized command '" << raw_arg << "'" << endl; +      exit(1); +    } +  } +  if (seed == "--") seed = poi.randomize(); +  else if (seed != "") poi.seed(seed); + +  if (!(   fitfile.length() +        || dkfile.length() +        || entfile.length() +     )) +         dkfile = "-"; +  do_dk(poi, dkfile); +  do_fit(poi, ifile, fitfile, gfile); +  do_entropy(entfile); +} | 
