new libskrewt-based code is now stable and in use;

let it be the basis for further developments
author: John Denker <jsd@av8n.com> 2012-11-24 09:03:18 -0800
committer: John Denker <jsd@av8n.com> 2012-11-24 09:03:18 -0800
commit: 7abce5d60408c7713181249ba3f23b72a40aa326 (patch)
tree: f89f7c75bdd955015849d2ac793169b9d2cdf27c /tools/skrewt.c
parent: 3043e470e483da2452943bd810256ea0ff8807b0 (diff)
1 files changed, 21 insertions, 477 deletions
diff --git a/tools/skrewt.c b/tools/skrewt.c
index 63c6be6..abea289 100644
--- a/tools/skrewt.c
+++ b/tools/skrewt.c
@@ -8,7 +8,6 @@
 #include <stdlib.h>             /* for exit() */
 #include <string>               /* for strcmp() */
 #include <ctype.h>              /* toupper */
-#include <signal.h>
 
 #include <stdio.h>              /* perror */
 #include <sstream>
@@ -37,33 +36,8 @@ void usage(const int sts){
   exit(sts);
 }
 
-#include "qq_exit_codes.h"
+#include "libskrewt.h"
 #include "utils.h"
-#include "sepofra.h"
-
-void maybe_exeunt(const int sts, const int really){
-  if (!really) return;
-  if (sts == ex_good) exit(sts);
-
-  const char* foo = getenv("HI_Q_GROUP");
-  if (!foo) exit(sts);
-
-// No point in signalling ourself:
-  sighandler_t rslt = signal(SIGUSR1, SIG_IGN);
-  if (rslt == SIG_ERR) {
-    cerr << "error setting signal" << endl;
-  }
-  int k = kill(-atoi(foo), SIGUSR1);
-  if (k) {
-    cerr << "kill failed on group " << atoi(foo) << " ... ";
-    perror(0);
-  }
-  exit(sts);
-}
-
-void exeunt(const int sts){
-  maybe_exeunt(sts, 1);
-}
 
 string progname, progid;
 int mypid;
@@ -72,87 +46,6 @@ int mypid;
 /* Content-Type: multipart/mixed; boundary="1170861315-1262462055-1341954763=:92165"    */
 //
 
-void parse_content(const string type_spec_line,
-        string &maintype, string &boundary) {
-  //xxx cerr << "parser called with: " << type_spec_line << endl;
-  string get_type(type_spec_line);
-
-  size_t where = get_type.find_first_of(" \t;\n");
-  string rest;
-  if (where == string::npos) {
-    // keep whole string
-  }
-  else {
-    rest = get_type.substr(where+1);
-    get_type = get_type.substr(0,where);
-  }
-  where = get_type.find("/");
-  if (where == string::npos){
-    maintype = "";
-    cerr << "could not find / in " << get_type << endl;
-  } else {
-    maintype = get_type.substr(0, where);
-  }
-
-// now need to find boundary
-
-  string srch = "boundary=";
-  where = rest.find(srch);
-  if (where != string::npos) {
-    where += srch.length();
-    boundary = rest.substr(where);
-    if (boundary[0] == '"') {
-      boundary = boundary.substr(1);
-      where = boundary.find_first_of("\"");
-    } else {
-      where = boundary.find_first_of(" \t;\n");
-    }
-    if (where == string::npos) {
-      /* do nothing, boundary=boundary as a whole */
-    } else {
-      boundary = boundary.substr(0, where);
-    }
-  } else {
-    //xxxxxxx cerr << "boundary= not found in " << type_spec_line << endl;
-  }
-}
-
-class skrewt{
-public:
-  string received_from;         // envelope HELO among other things
-    string proximta_HELO;
-    string proximta_rDNS;
-    string proximta_IP;
-    string proximta_AuthUser;
-  string return_path;           // envelope MAIL FROM
-  string boundary;
-  string to;
-  string from;
-  string subject;
-  string date;
-  string message_id;
-  string content_type;
-  string delivered_to;
-  int msgsize;
-  vector<string> bigbuf;
-  int saw_blank_line;
-  int recno;
-
-  int maxsize;
-  int error_exit;
-  int mid_required;
-
-  // constructor
-  skrewt()
-  : boundary("x-xx-x"), msgsize(0), saw_blank_line(0), recno(0),
-    maxsize(1000*1000), error_exit(0), mid_required(0)
-  {}
-
-  int headers();
-  int interstage();
-  int body();
-  int krunch_rfrom();
-};
 
 #if 0   /* typical "Received: from" lines */
 Received: from lists.sourceforge.net (216.34.181.88)
@@ -178,360 +71,6 @@ Received: from ip68-231-191-153.tc.ph.cox.net (HELO asclepias.av8n.net) (smtp@68
 /home/jsd/Maildir/cur/1342363199.24320.cloud:2,
 #endif
 
-int skrewt::krunch_rfrom(){
-  stringstream parse;
-  parse.str(received_from);
-  string word;
-  parse >> word;
-  if (word != "from") {
-    cerr << progid << " bad 'Received: from' line ... '"
-        << word << "'" << endl;
-    return ex_syserr;
-  }
-  parse >> proximta_rDNS;
-  parse >> word;
-  if (word == "(HELO") {
-    parse >> proximta_HELO;
-    proximta_HELO = rtrim(proximta_HELO, "()");
-    parse >> word;
-  } else {
-    proximta_HELO = proximta_rDNS;
-  }
-  size_t len = word.length();
-  if (len<2 || word[0] != '(' || word[len-1] != ')') {
-    cerr << progid << " bad 'Received: from' line ;;; '"
-        << word << "'" << endl;
-    return ex_syserr;
-  }
-  proximta_IP = word.substr(1, len-2);
-  size_t where = proximta_IP.find("@");
-  if (where != string::npos){
-    proximta_AuthUser = proximta_IP.substr(0, where);
-    proximta_IP = proximta_IP.substr(1+where);
-  }
-
-  return 0;
-}
-
-int skrewt::headers(){
-  //xxxx cerr << progid << " begins" << endl;
-  for (;;){             // outer loop over all records in the header
-    if (cin.eof()) break;
-    if (cin.bad()) return 1;
-
-    string line;
-// on fail, go back to top of outer loop and check for eof versus bad
-    if (getline(cin, line).fail()) continue;
-    msgsize += line.length()+1;
-    if (msgsize > maxsize) {
-      cerr << progid << " rejection: bigger than " << maxsize << endl;
-      exeunt(ex_spam);
-    }
-    cout << line << endl;
-    bigbuf.push_back(line);
-    string headrec = noCR(line);       // for a folded record, this is the first line
-
-    for (;;) {        // inner loop to build a multi-line record e.g. folded record:
-      if (cin.eof()) break;
-      if (cin.bad()) return 1;
-      char ch;
-      if (cin.get(ch).fail()) continue;
-      cin.putback(ch);
-      if (ch != ' ' && ch != '\t') break;
-      string line;
-// on fail, go back to top of inner loop and check for eof versus bad
-      if (getline(cin, line).fail()) continue;
-      msgsize += line.length()+1;
-      if (msgsize > maxsize) {
-        cerr << progid << " rejection: bigger than " << maxsize << endl;
-        exeunt(ex_spam);
-      }
-      cout << line << endl;
-      bigbuf.push_back(line);
-      headrec += "\n" + noCR(line);
-    }
-// here with a fully assembled header record
-// headrec (unlike line) contains no DOS CR characters
-    int len = headrec.length();
-    if (len == 0) {
-      saw_blank_line = 1;
-      break;            // no more headers in this message
-    }
-
-// here if it's a header line
-    string headword;
-    string rest;
-    size_t where = headrec.find(":");
-    if (where != string::npos) {
-      headword = headrec.substr(0, where);
-      rest = ltrim(headrec.substr(1+where));
-    }
-    headword = toLower(headword);
-    if (0){
-    } else if (headword == "from") {
-      from = rest;
-    } else if (headword == "to") {
-      to = rest;
-    } else if (headword == "return-path") {
-      return_path = rest;
-    } else if (headword == "message-id") {
-      message_id = rest;
-    } else if (headword == "received") {
-      if (!received_from.length() && prefix("from ", rest)){
-        received_from = rest;
-      }
-    } else if (headword == "date") {
-      date = rest;
-    } else if (headword == "subject") {
-      subject = rest;
-    } else if (headword == "content-type") {
-      content_type = rest;
-    } else if (headword == "delivered-to") {
-      delivered_to = rest;
-    }
-    //xxxx  cout << headrec.length() << " ... ";
-    recno++;
-    if (0) if (recno <= 6) cerr << progid << "#" << recno
-        << " " << headrec << endl;
-  }
-  return 0;
-}
-
-int skrewt::interstage(){
-  if (saw_blank_line) {/* ignore */}
-// Note that the headers are in reverse-chronological order:
-  cerr << progid <<" Return-path: " << return_path <<endl;
-
-  { // parse the 'Received: from' line:
-    cerr << "        Received: " << received_from <<endl;
-    int rslt = krunch_rfrom();
-    if (rslt) return rslt;
-    cerr << "         rDNS:     " << proximta_rDNS << endl;
-    cerr << "         HELO:     " << proximta_HELO << endl;
-    cerr << "         IP:       "   << proximta_IP << endl;
-    cerr << "         AuthUser: "   << proximta_AuthUser << endl;
-    cerr << "         Mid       '"  << message_id << "'" << endl;
-  }
-
-  sepofra my_spf;
-  try {
-    my_spf.check(proximta_IP,
-        proximta_HELO,
-        return_path,
-        "junk", 0/* verbosity */);
-    cerr << "*** " << my_spf.explain() << endl;
-  } catch (bad_thing foo) {
-    cerr << "Caught bad thing: " << foo.what() << endl;
-    return ex_syserr;
-  }
-
-// The logic here is:  In order:
-// 1:: If whitelisted, accept.  No greylisting, no spam-checking.
-// 2:: If blacklisted, reject.  No greylisting, no spam-checking.
-// 3:: If good reputation, spam-check it and send it on its way.
-// 4:: If no reputation, greylist.
-// 5:: If bad reputation, ????
-
-// Expanding item 3 to the next level of detail:
-//  3a:: If some domain vouches for this sender-IP via SPF,
-//   then the reputation is bound to the domain.
-//  3c:: If some domain vouches for the message vie DKIM,
-//   then the reputation is bound to the domain.
-//  3d:: If no SPF or DKIM, then the reputation attaches
-//   to the sender-IP.
-
-// Expanding item 4 to the next level of detail:
-//  4a:: If the greylisting database says this message is ripe
-//   spam-check it.  If it's OK, use it to count toward reputation.
-//  4b:: If it is previously unseen or too old, start greylisting
-//   timer from scratch.  Reject with temporary error.
-//  4c:: If it is in the "green" state, let the timer
-//   continue from where it is.  Reject with temporary error.
-
-// Note:  Reputation normally attaches to a domain.
-//  With SPF, the domain vouches for the sender at a given IP address
-//   ... and then the sender implicitly vouches for the message.
-//  With DKIM, the domain vouches for an individual message.
-//  With neither SPF nor DKIM, reputation attaches to the sender's
-//    IP address.  The sender vouches for the message.
-//
-// During greylisting, delay applies to the message.  Reputation
-//  applies to the domain (via SPF or DKIM) or to the server
-//  (otherwise).
-
-
-// If you are a medium-sized operator, such that you have one
-// and only one IP address that ever sends email, and it is a
-// static IP address, then you don't have much to gain from
-// DKIM or SPF.  Attaching a reputation to your domain is not
-// much different from attaching a reputation to your IP address.
-
-// In constrast, if you are a low-budget operator with a
-// dynamic IP address, you benefit from SPF and/or DKIM.
-// Your reputation attaches to your domain, and remains
-// stable even as your IP address changes.
-
-// At the other extreme, if you are a big-time operator
-// such as googlegroups.com, you benefit from DKIM and/or
-// SPF.  Your IP addresses are not dynamic, but they are
-// numerous, so you prefer to have your reputation apply
-// to all your email-sending hosts.
-
-#if 0   /* typical Received-SPF line */
- Received-SPF: pass (google.com: domain of rpendarvis@brenau.edu designates 74.125.245.70 as permitted sender) client-ip=74.125.245.70;
-#endif
-
-#if 0   /* SPF users */
- :; mail-scan +received-spf /home/jsd/Maildir/cur[/]*  |
-    sed 's/.*domain of\(.*\).*designates.*/XXX \1 YYY/' |
-    awk '/XXX/{print "<" $2 ">"}' | sort | uniq -c | sort -nr
-     81 <gmail.com>
-     17 <mac.com>
-      8 <gmx.net>
-      8 <bbruner@gmail.com>
-      7 <jsd@av8n.com>
-      6 <kst24@cam.ac.uk>
-      5 <farooq.w@gmail.com>
-      4 <scerri@chem.ucla.edu>
-      4 <comcast.net>
-      4 <c2i.net>
-      3 <gemort2006@gmail.com>
-      2 <rrhake@earthlink.net>
-      2 <hotmail.com>
-      2 <GCC.EDU>
-      1 <us.panasonic.com>
-      1 <sss.pgh.pa.us>
-      1 <scot_wherland@wsu.edu>
-      1 <rpendarvis@brenau.edu>
-      1 <hmperks@gmail.com>
-      1 <btv1==55494f7d7e0==matt.fisher@email.stvincent.edu>
-      1 <arcor.de>
-#endif
-
-#if 0   /* DKIM users */
-     52 d=googlegroups.com;
-     27 d=barackobama.com;
-     10 d=gmail.com;
-      5 d=bronto.com;
-      5 d=bluehornet.com;
-      4 d=news.abebooks.com;
-      2 d=yahoo.co.uk;
-      2 d=sbcglobal.net;
-      2 d=embarqmail.com;
-      2 d=emailms.angieslist.com;
-      1 d=newsletters.sourceforge.net;
-      1 d=members.ebay.com;
-      1 d=info.citibank.com;
-      1 d=ebay.com;
-      1 d=commail1.co.za;
-#endif
-
-  list<string> badnews;
-  int whitelisted(0);
-
-  if (subject.find("sesame") != string::npos
-        && subject.find("swordfish") != string::npos) {
-    whitelisted++;
-  }
-
-  if (delivered_to.length()){
-    cerr << progid <<  " Delivered-to: <<<" << delivered_to << ">>>" << endl;
-  }
-  if (toLower(trim(delivered_to)) == "jean@av8n.com") {
-    badnews.push_back("Looping Delivered-to: " + delivered_to);
-  }
-
-  if (subject.find("-please-bounce-this-") != string::npos) {
-    badnews.push_back("by request");
-  }
-
-  if (!date.length()) {
-    badnews.push_back("no date");
-  }
-
-  if (mid_required && !message_id.length()) {
-    badnews.push_back("no message-id");
-  }
-
-  if (badnews.size() && !whitelisted){
-    cerr << progid << " " << join(", ", badnews) << endl;
-    if (error_exit){
-      cerr << progid << " '" << from
-             << "' to '" << to
-             << "'" << endl;
-      exeunt(ex_spam);
-    }
-  }
-  return 0;
-}
-
-int skrewt::body(){
-  string main_contype;
-  if (content_type.length())
-    parse_content(content_type, main_contype, boundary);
-// some slightly-useful booleans:
-  int currently_text = main_contype == "text";
-  int main_multipart = main_contype == "multipart";
-
-// early-stage thinking has been done.
-// Now spew the rest of the message
-  //xxxx cerr << "body begins: " << main_contype << " " << currently_text << " " << boundary << endl;
-  int in_subheads(0);
-  int textlines(0);
-
-  for (;;){             // outer loop over all lines in the body
-    if (cin.eof()) break;
-    if (cin.bad()) return 1;
-    string line;
-// on fail, go back to top of outer loop and check for eof versus bad
-    if (getline(cin, line).fail()) continue;
-    msgsize += line.length()+1;
-    if (msgsize > maxsize) {
-      cerr << progid << " rejection: bigger than " << maxsize << endl;
-      maybe_exeunt(ex_spam, error_exit);
-    }
-    bigbuf.push_back(line);
-    cout << line << endl;
-    if (in_subheads){
-      if (line == "" || line == "\r") in_subheads = 0;
-    }
-    if (in_subheads){
-        string sub_contype;
-        string junk;
-// in principle could worry about folded headers,
-// but in this application it doesn't actually matter
-        string headword;
-        string rest;
-        size_t where = line.find(":");
-        if (where != string::npos) {
-          headword = line.substr(0, where);
-          rest = ltrim(line.substr(1+where));
-        }
-        headword = toLower(headword);
-        if (headword == "content-type") {
-          parse_content(rest, sub_contype, junk);
-          currently_text = sub_contype == "text";
-          //xxxx cerr << "setting contype '" << sub_contype << "' " << currently_text << " ... " << textlines << endl;
-        }
-    } else {
-      if (main_multipart && line == "--" + boundary) {
-        //xxxx cerr << "found subhead boundary" << endl;
-        in_subheads = 1;
-        continue;
-      }
-      if (currently_text) textlines++;
-    }
-  }
-
-  if (0) cerr << "textlines: " << textlines << endl;
-  if (!textlines) {
-    cerr << progid << " rejection: no text: " << error_exit << endl;
-    maybe_exeunt(ex_spam, error_exit);
-  }
-  cerr << progid << " normal completion" << endl;
-  return(ex_good);
-}
-
 ////////////////////////////////////////////////////////////
 int main(int _argc, const char** _argv){
 
@@ -546,24 +85,22 @@ int main(int _argc, const char** _argv){
   }
 
   skrewt mysk;
+//  cerr << "maxsize: " << mysk.maxsize << endl;
 
-  while (argc) {
-    string arg(*argv); argv++; argc--;
+  argParser ARGS(argc, argv);
+  try {while (ARGS.size()) {
+    string arg = ARGS.next();
     if (arg.substr(0,2) == "--") arg = arg.substr(1);
-    if (prefix(arg, "-help")) {
+    if (ARGS.prefix("-help")) {
       usage(0);
     }
     if (0) {
-    } else if (prefix(arg, "-mid-required")) {
+    } else if (ARGS.prefix("-mid-required")) {
       mysk.mid_required++;
-    } else if (prefix(arg, "-error-exit")) {
+    } else if (ARGS.prefix("-error-exit")) {
       mysk.error_exit++;
-    } else if (prefix(arg, "-maxsize")) {
-      if (!argc) {
-        cerr << "Option -maxsize requires an argument" << endl;
-        exit(ex_usage);
-      }
-      mysk.maxsize = atoi(*argv); argv++; argc--;
+    } else if (ARGS.prefix("-maxsize", 1)) {
+      mysk.maxsize = atoi(ARGS.shift().c_str());
     } else if (arg.substr(0,1) == "-") {
       cerr << "Unrecognized option '" << arg << "'" << endl;
       cerr << "For help, try:  " << progname << " -help" << endl;
@@ -573,10 +110,16 @@ int main(int _argc, const char** _argv){
       cerr << "For help, try:  " << progname << " -help" << endl;
       exit(ex_usage);
     }
+  }}
+  catch (int) {
+    exit(ex_usage);
   }
 
-  int rslt = mysk.headers();
+  int rslt = mysk.headers(cin);
   if (rslt) return rslt;
+  mysk.dump_bigbuf(cout);
+  mysk.headerbuf = mysk.bigbuf;
+  mysk.bigbuf = vector<string>(0);
 
 // Headers are done.
 // Do some early-stage thinking.
@@ -584,7 +127,8 @@ int main(int _argc, const char** _argv){
   rslt = mysk.interstage();
   if (rslt) return rslt;
 
-  rslt = mysk.body();
-  return rslt;
-
+  rslt = mysk.body(cin, cout);
+  if (rslt) return rslt;
+  mysk.dump_bigbuf(cout);
+  return 0;
 }
author	John Denker <jsd@av8n.com>	2012-11-24 09:03:18 -0800
committer	John Denker <jsd@av8n.com>	2012-11-24 09:03:18 -0800
commit	7abce5d60408c7713181249ba3f23b72a40aa326 (patch)
tree	f89f7c75bdd955015849d2ac793169b9d2cdf27c /tools/skrewt.c
parent	3043e470e483da2452943bd810256ea0ff8807b0 (diff)