ward is like skrewt, but new and experimental

author: John Denker <jsd@av8n.com> 2012-11-22 15:56:19 -0800
committer: John Denker <jsd@av8n.com> 2012-11-22 15:56:19 -0800
commit: bf1bf1c43a82ec167ae80f185fe11255cf3a5237 (patch)
tree: d8e92ada612dac7dd032e4dacdb2c6edbce00f90 /tools
parent: c01d3b2d57c2fb2491f664aa6d67f03e514cfbb3 (diff)
1 files changed, 654 insertions, 0 deletions
diff --git a/tools/ward.c b/tools/ward.c
new file mode 100644
index 0000000..70265da
--- /dev/null
+++ b/tools/ward.c
@@ -0,0 +1,654 @@
+//////////////////
+// ward.c  (new, experimental variant of skrewt.c)
+//
+// scrutinize email
+//
+
+#include <iostream>
+#include <stdlib.h>             /* for exit() */
+#include <string>               /* for strcmp() */
+#include <ctype.h>              /* toupper */
+#include <signal.h>
+
+#include <stdio.h>              /* perror */
+#include <sstream>
+#include <vector>
+#include <list>
+
+using namespace std;
+
+// Print the command-line help text and terminate the process.
+// sts selects the output stream (0 -> stdout, nonzero -> stderr) and is
+// also used as the process exit status.  This function does not return.
+void usage(const int sts){
+  (sts ? cerr : cout) <<
+"Usage: skrewt [options]\n"
+"\n"
+"  Scrutinizes email.  Reads stdin, copies it to stdout.\n"
+"  Exit result 0 means good, 21 means rejection (spam).\n"
+"  Writes reason for rejection to stderr.\n"
+"\n"
+"  Typically used as a filter in a pipeline, along with spamc -E\n"
+"  Options\n"
+"    -help              print this msg (and exit immediately).\n"
+"    -maxsize ii        msg size in bytes; anything bigger will be rejected.\n"
+"    -error-exit        exit early if errors have been detected.\n"
+"    -mid-required      treat a missing Message-ID header as an error.\n"
+"\n"
+"  Messages containing the string '-please-bounce-this-' will be rejected.\n"
+"  Messages with no date will be rejected.\n"
+;
+  exit(sts);
+}
+
+#include "qq_exit_codes.h"
+#include "utils.h"
+#include "sepofra.h"
+
+/////////////////////////////////////////////////////////
+// Case insensitive comparison of strings
+
+// Function object giving a case-insensitive "less than" on strings:
+// characters are folded with toupper() before being compared.
+class lessthan_foldcase{
+public:
+  // Returns true when a sorts strictly before b, ignoring letter case.
+  // When one string is a proper prefix of the other, the shorter one
+  // sorts first.  Equal strings (ignoring case) yield false.
+  bool operator() (const std::string& a, const std::string& b) const {
+    const size_t a_len = a.length();
+    const size_t b_len = b.length();
+
+    const size_t lim = a_len < b_len ? a_len : b_len;
+
+    for (size_t i=0; i<lim; ++i)
+    {
+            // toupper() is only defined for EOF and for values that fit in
+            // an unsigned char, so a plain (possibly negative) char has to
+            // be converted before the call.
+            const char cha = toupper(static_cast<unsigned char>(a[i]));
+            const char chb = toupper(static_cast<unsigned char>(b[i]));
+
+            if (cha < chb) return true;
+            if (cha > chb) return false;
+    }
+    // here if one is an extension of the other
+    return a_len < b_len;
+  }
+};
+
+
+// Returns negative if a is less than b in alphabetical order
+// returns 0 if they are the same, or positive if a is greater.
+// Like perl cmp operator, but ignores case.
+// Result magnitudes:  -2 / +2 when the strings differ at some character
+// position (after folding with tolower), -1 / +1 when one string is a
+// proper prefix of the other, 0 when they are equal ignoring case.
+int cmp_casefold(const std::string& a, const std::string& b) {
+  std::string::const_iterator aa = a.begin();
+  std::string::const_iterator bb = b.begin();
+  while (aa != a.end() && bb != b.end()){
+    // tolower() is only defined for EOF and for values that fit in an
+    // unsigned char; convert first so bytes >= 0x80 are handled safely.
+    const char ca = tolower(static_cast<unsigned char>(*aa++));
+    const char cb = tolower(static_cast<unsigned char>(*bb++));
+    if (ca != cb) return ca < cb ? -2 : 2;
+  }
+  if (aa != a.end()) return 1;          // a is longer
+  if (bb != b.end()) return -1;         // b is longer
+  return 0;
+}
+
+// Return a copy of bar with a single trailing carriage return ('\r')
+// removed, if there is one.  Any other input, including the empty
+// string, comes back unchanged.
+string noCR(const string bar){
+  const string::size_type n = bar.size();
+  if (n == 0 || bar[n-1] != '\r') return bar;
+  return bar.substr(0, n-1);
+}
+
+// Leave the process with status sts -- but only when 'really' is nonzero;
+// otherwise return to the caller without doing anything.
+//
+// For a status other than ex_good, if the environment variable
+// HI_Q_GROUP is set, SIGUSR1 is sent to the process group it names
+// before exiting.
+// NOTE(review): presumably that group holds the other stages of the
+// mail-filter pipeline -- confirm against whatever sets HI_Q_GROUP.
+void maybe_exeunt(const int sts, const int really){
+  if (!really) return;
+  if (sts == ex_good) exit(sts);
+
+  const char* foo = getenv("HI_Q_GROUP");
+  if (!foo) exit(sts);
+
+  // kill(-pgid, ...) addresses one specific process group only when
+  // pgid > 1:  kill(0, ...) means "my own group" and kill(-1, ...) means
+  // "every process I am allowed to signal".  atoi() yields 0 for
+  // non-numeric text, so refuse anything that is not a plausible group id.
+  const int group = atoi(foo);
+  if (group <= 1) {
+    cerr << "ignoring invalid HI_Q_GROUP '" << foo << "'" << endl;
+    exit(sts);
+  }
+
+// No point in signalling ourself:
+  sighandler_t rslt = signal(SIGUSR1, SIG_IGN);
+  if (rslt == SIG_ERR) {
+    cerr << "error setting signal" << endl;
+  }
+  int k = kill(-group, SIGUSR1);
+  if (k) {
+    cerr << "kill failed on group " << group << " ... ";
+    perror(0);
+  }
+  exit(sts);
+}
+
+// Unconditional form of maybe_exeunt():  the 'really' flag is fixed at 1,
+// so the process always terminates with status sts.
+void exeunt(const int sts){
+  const int really = 1;
+  maybe_exeunt(sts, really);
+}
+
+// Process identification, filled in at the top of main():
+//   progname -- argv[0] as given on the command line
+//   progid   -- basename(progname) followed by "[pid]"; used as the
+//               prefix of messages written to stderr
+//   mypid    -- result of getpid()
+string progname, progid;
+int mypid;
+
+/* Content-Type: text/plain; charset="us-ascii"                                         */
+/* Content-Type: multipart/mixed; boundary="1170861315-1262462055-1341954763=:92165"    */
+//
+
+// Pick apart the value of a Content-Type header.
+//
+//   type_spec_line  the header value, i.e. the text after "Content-Type:"
+//   maintype        set to the part of the media type before the '/',
+//                   folded to lower case; set to "" (with a complaint on
+//                   stderr) when the type contains no '/'
+//   boundary        set to the value of the boundary= parameter with any
+//                   surrounding double quotes removed; left untouched
+//                   when the parameter is absent
+//
+// Media type names and parameter names are case-insensitive (RFC 2045),
+// so "TEXT/PLAIN" and "Boundary=" are recognized.  The boundary value
+// itself is returned exactly as written.
+void parse_content(const string type_spec_line,
+        string &maintype, string &boundary) {
+  string get_type(type_spec_line);
+
+  // Split "type/subtype" from the parameter list that may follow it.
+  size_t where = get_type.find_first_of(" \t;\n");
+  string rest;
+  if (where != string::npos) {
+    rest = get_type.substr(where+1);
+    get_type = get_type.substr(0,where);
+  }
+  where = get_type.find("/");
+  if (where == string::npos){
+    maintype = "";
+    cerr << "could not find / in " << get_type << endl;
+  } else {
+    maintype = get_type.substr(0, where);
+    for (size_t i = 0; i < maintype.length(); ++i) {
+      maintype[i] = tolower(static_cast<unsigned char>(maintype[i]));
+    }
+  }
+
+// now need to find boundary
+// The search runs on a lower-cased copy so the parameter name matches in
+// any case; offsets are identical in both strings, so the value is then
+// cut out of the original text.
+  string rest_lc(rest);
+  for (size_t i = 0; i < rest_lc.length(); ++i) {
+    rest_lc[i] = tolower(static_cast<unsigned char>(rest_lc[i]));
+  }
+
+  const string srch = "boundary=";
+  where = rest_lc.find(srch);
+  if (where != string::npos) {
+    where += srch.length();
+    boundary = rest.substr(where);
+    if (!boundary.empty() && boundary[0] == '"') {
+      // quoted value:  runs up to the closing quote
+      boundary = boundary.substr(1);
+      where = boundary.find_first_of("\"");
+    } else {
+      // bare token:  runs up to whitespace or the next ';'
+      where = boundary.find_first_of(" \t;\n");
+    }
+    if (where != string::npos) {
+      boundary = boundary.substr(0, where);
+    }
+    // otherwise the value extends to the end of the text
+  }
+}
+
+// Concatenate the items of stuff, in order, with sep between them.
+// The separator is added only once the accumulated text is non-empty,
+// so empty items at the front of the list contribute nothing at all.
+string join(const string sep, const list<string> stuff){
+  string out;
+  list<string>::const_iterator it = stuff.begin();
+  while (it != stuff.end()) {
+    if (!out.empty()) out.append(sep);
+    out.append(*it);
+    ++it;
+  }
+  return out;
+}
+
+// State for scrutinizing one message read from stdin.
+// main() runs the three stages in order:  headers(), interstage(), body().
+class skrewt{
+public:
+  // Value of the first "Received:" header accepted by headers().
+  string received_from;         // envelope HELO among other things
+    // The next four are filled in by krunch_rfrom() from received_from:
+    string proximta_HELO;
+    string proximta_rDNS;
+    string proximta_IP;
+    string proximta_AuthUser;
+  string return_path;           // envelope MAIL FROM
+  // MIME boundary; body() overwrites it via parse_content() when the
+  // Content-Type header carries a boundary= parameter.
+  string boundary;
+  // Header values captured by headers() (text after the colon):
+  string to;
+  string from;
+  string subject;
+  string date;
+  string message_id;
+  string content_type;
+  string delivered_to;
+  // Running size:  each line read adds its length plus one.
+  int msgsize;
+  // Every line read from stdin (header and body) is appended here.
+  vector<string> bigbuf;
+  // Set to 1 when headers() reaches the empty record ending the header.
+  int saw_blank_line;
+  // Number of header records processed by headers().
+  int recno;
+
+  // Settings, adjusted by main() from the command line:
+  int maxsize;                  // -maxsize; compared against msgsize
+  int error_exit;               // -error-exit; nonzero enables exiting on rejection
+  int mid_required;             // -mid-required; missing message-id counts as bad news
+
+  // constructor
+  // Defaults:  placeholder boundary "x-xx-x", size limit 1000*1000,
+  // all counters and flags zero.
+  skrewt()
+  : boundary("x-xx-x"), msgsize(0), saw_blank_line(0), recno(0),
+    maxsize(1000*1000), error_exit(0), mid_required(0)
+  {}
+
+  int headers();                // read + echo the header, capture fields
+  int interstage();             // checks made between header and body
+  int body();                   // read + echo the body, count text lines
+  int krunch_rfrom();           // parse received_from into proximta_*
+};
+
+#if 0   /* typical "Received: from" lines */
+Received: from lists.sourceforge.net (216.34.181.88)
+  by cloud.av8n.com with SMTP; 31 Jul 2012 22:13:48 -0000
+
+Received: from 24-145-119-127-dhcp.gsv.md.atlanticbb.net (HELO mail.phys-l.org) (24.145.119.127)   by cloud.av8n.com with SMTP; 14 Jul 2012 23:56:54 -0000
+
+Received: from ip68-231-191-153.tc.ph.cox.net (HELO asclepias.av8n.net) (smtp@68.231.191.153)   by cloud.av8n.com with SMTP; 15 Jul 2012 14:39:58 -0000
+#endif
+
+#if 0   /* good for testing */
+// random mail from FAA
+/home/jsd/Maildir/cur/1343769926.24228.cloud\:2\,
+
+// has a good SPF result buried inside, at an earlier hop:
+/home/jsd/Maildir/cur/1342372942.24810.cloud:2,
+
+// has a good SPF as delivered to us:
+/home/jsd/Maildir/cur/1343671179.10420.cloud:2,
+
+// The following msg has no message-id, but does have an
+// authorized submitter:
+/home/jsd/Maildir/cur/1342363199.24320.cloud:2,
+#endif
+
+// Break received_from (the text of a "Received: from ..." header) into
+// its whitespace-separated pieces:
+//     from <rDNS> [(HELO <helo>)] ([<authuser>@]<ip>) ...
+// Fills in proximta_rDNS, proximta_HELO, proximta_IP and, when an '@' is
+// present inside the parentheses, proximta_AuthUser.  Without a
+// "(HELO ...)" clause the HELO name is taken to be the rDNS name.
+// Returns 0 on success, or ex_syserr (after a complaint on stderr) when
+// the text does not have this shape.
+int skrewt::krunch_rfrom(){
+  stringstream tokens;
+  tokens.str(received_from);
+
+  string token;
+  tokens >> token;
+  if (token != "from") {
+    cerr << progid << " bad 'Received: from' line ... '"
+        << token << "'" << endl;
+    return ex_syserr;
+  }
+
+  tokens >> proximta_rDNS;
+  tokens >> token;
+  if (token != "(HELO") {
+    proximta_HELO = proximta_rDNS;
+  } else {
+    tokens >> proximta_HELO;
+    proximta_HELO = rtrim(proximta_HELO, "()");
+    tokens >> token;
+  }
+
+  // At this point token should be the parenthesized address.
+  const size_t n = token.length();
+  const bool parenthesized = n >= 2 && token[0] == '(' && token[n-1] == ')';
+  if (!parenthesized) {
+    cerr << progid << " bad 'Received: from' line ;;; '"
+        << token << "'" << endl;
+    return ex_syserr;
+  }
+
+  proximta_IP = token.substr(1, n-2);
+  const size_t at = proximta_IP.find("@");
+  if (at != string::npos){
+    proximta_AuthUser = proximta_IP.substr(0, at);
+    proximta_IP = proximta_IP.substr(at+1);
+  }
+
+  return 0;
+}
+
+// Read the message header from stdin, echoing every line to stdout and
+// appending it to bigbuf.  Continuation lines (those starting with a
+// space or tab) are merged into one record, and the value of each
+// recognized field is stored in the matching member.
+// Stops after the empty record that ends the header, or at end of input.
+// Returns 0 normally and 1 when cin reports a bad state.  Exits via
+// exeunt(ex_spam) as soon as the running size exceeds maxsize.
+int skrewt::headers(){
+  //xxxx cerr << progid << " begins" << endl;
+  for (;;){             // outer loop over all records in the header
+    if (cin.eof()) break;
+    if (cin.bad()) return 1;
+
+    string line;
+// on fail, go back to top of outer loop and check for eof versus bad
+    if (getline(cin, line).fail()) continue;
+    // +1 accounts for the newline that getline() consumed
+    msgsize += line.length()+1;
+    if (msgsize > maxsize) {
+      cerr << progid << " rejection: bigger than " << maxsize << endl;
+      exeunt(ex_spam);
+    }
+    cout << line << endl;
+    bigbuf.push_back(line);
+    string headrec = noCR(line);       // for a folded record, this is the first line
+
+    for (;;) {        // inner loop to build a multi-line record e.g. folded record:
+      if (cin.eof()) break;
+      if (cin.bad()) return 1;
+      // Peek at the first character of the next line:  read it, then
+      // push it straight back so the following getline() sees it.
+      char ch;
+      if (cin.get(ch).fail()) continue;
+      cin.putback(ch);
+      // Only a line starting with space or tab continues this record.
+      if (ch != ' ' && ch != '\t') break;
+      string line;
+// on fail, go back to top of inner loop and check for eof versus bad
+      if (getline(cin, line).fail()) continue;
+      msgsize += line.length()+1;
+      if (msgsize > maxsize) {
+        cerr << progid << " rejection: bigger than " << maxsize << endl;
+        exeunt(ex_spam);
+      }
+      cout << line << endl;
+      bigbuf.push_back(line);
+      headrec += "\n" + noCR(line);
+    }
+// here with a fully assembled header record
+// headrec (unlike line) contains no DOS CR characters
+    int len = headrec.length();
+    if (len == 0) {
+      saw_blank_line = 1;
+      break;            // no more headers in this message
+    }
+
+// here if it's a header line
+    // Split "Name: value" at the first colon.  A record without a colon
+    // leaves both parts empty and therefore matches nothing below.
+    string headword;
+    string rest;
+    size_t where = headrec.find(":");
+    if (where != string::npos) {
+      headword = headrec.substr(0, where);
+      rest = ltrim(headrec.substr(1+where));
+    }
+    headword = toLower(headword);
+    // For most fields a later occurrence overwrites an earlier one.
+    if (0){
+    } else if (headword == "from") {
+      from = rest;
+    } else if (headword == "to") {
+      to = rest;
+    } else if (headword == "return-path") {
+      return_path = rest;
+    } else if (headword == "message-id") {
+      message_id = rest;
+    } else if (headword == "received") {
+      // Keep only the first qualifying Received: line; received_from is
+      // never overwritten once set.
+      if (!received_from.length() && prefix("from ", rest)){
+        received_from = rest;
+      }
+    } else if (headword == "date") {
+      date = rest;
+    } else if (headword == "subject") {
+      subject = rest;
+    } else if (headword == "content-type") {
+      content_type = rest;
+    } else if (headword == "delivered-to") {
+      delivered_to = rest;
+    }
+    //xxxx  cout << headrec.length() << " ... ";
+    recno++;
+    // disabled debugging output
+    if (0) if (recno <= 6) cerr << progid << "#" << recno
+        << " " << headrec << endl;
+  }
+  return 0;
+}
+
+// Checks made after the header has been read and before the body is
+// copied.  In order:
+//   - log the Return-path and the parsed "Received: from" details;
+//   - run an SPF check (sepofra) on IP / HELO / return-path and log
+//     its explanation;
+//   - collect reasons for rejection ("bad news") from the header fields.
+// Returns 0 normally, the nonzero result of krunch_rfrom() if parsing
+// fails, or ex_syserr if the SPF check throws bad_thing.  When bad news
+// exists, the message is not whitelisted and error_exit is set, the
+// process exits via exeunt(ex_spam) instead of returning.
+int skrewt::interstage(){
+  if (saw_blank_line) {/* ignore */}
+// Note that the headers are in reverse-chronological order:
+  cerr << progid <<" Return-path: " << return_path <<endl;
+
+  { // parse the 'Received: from' line:
+    cerr << "        Received: " << received_from <<endl;
+    int rslt = krunch_rfrom();
+    if (rslt) return rslt;
+    cerr << "         rDNS:     " << proximta_rDNS << endl;
+    cerr << "         HELO:     " << proximta_HELO << endl;
+    cerr << "         IP:       "   << proximta_IP << endl;
+    cerr << "         AuthUser: "   << proximta_AuthUser << endl;
+    cerr << "         Mid       '"  << message_id << "'" << endl;
+  }
+
+  // The SPF result is only logged here; nothing below consults it.
+  sepofra my_spf;
+  try {
+    my_spf.check(proximta_IP,
+        proximta_HELO,
+        return_path,
+        "junk", 0/* verbosity */);
+    cerr << "*** " << my_spf.explain() << endl;
+  } catch (bad_thing foo) {
+    cerr << "Caught bad thing: " << foo.what() << endl;
+    return ex_syserr;
+  }
+
+// The logic here is:  In order:
+// 1:: If whitelisted, accept.  No greylisting, no spam-checking.
+// 2:: If blacklisted, reject.  No greylisting, no spam-checking.
+// 3:: If good reputation, spam-check it and send it on its way.
+// 4:: If no reputation, greylist.
+// 5:: If bad reputation, ????
+
+// Expanding item 3 to the next level of detail:
+//  3a:: If some domain vouches for this sender-IP via SPF,
+//   then the reputation is bound to the domain.
+//  3c:: If some domain vouches for the message via DKIM,
+//   then the reputation is bound to the domain.
+//  3d:: If no SPF or DKIM, then the reputation attaches
+//   to the sender-IP.
+
+// Expanding item 4 to the next level of detail:
+//  4a:: If the greylisting database says this message is ripe
+//   spam-check it.  If it's OK, use it to count toward reputation.
+//  4b:: If it is previously unseen or too old, start greylisting
+//   timer from scratch.  Reject with temporary error.
+//  4c:: If it is in the "green" state, let the timer
+//   continue from where it is.  Reject with temporary error.
+
+// Note:  Reputation normally attaches to a domain.
+//  With SPF, the domain vouches for the sender at a given IP address
+//   ... and then the sender implicitly vouches for the message.
+//  With DKIM, the domain vouches for an individual message.
+//  With neither SPF nor DKIM, reputation attaches to the sender's
+//    IP address.  The sender vouches for the message.
+//
+// During greylisting, delay applies to the message.  Reputation
+//  applies to the domain (via SPF or DKIM) or to the server
+//  (otherwise).
+
+
+// If you are a medium-sized operator, such that you have one
+// and only one IP address that ever sends email, and it is a
+// static IP address, then you don't have much to gain from
+// DKIM or SPF.  Attaching a reputation to your domain is not
+// much different from attaching a reputation to your IP address.
+
+// In contrast, if you are a low-budget operator with a
+// dynamic IP address, you benefit from SPF and/or DKIM.
+// Your reputation attaches to your domain, and remains
+// stable even as your IP address changes.
+
+// At the other extreme, if you are a big-time operator
+// such as googlegroups.com, you benefit from DKIM and/or
+// SPF.  Your IP addresses are not dynamic, but they are
+// numerous, so you prefer to have your reputation apply
+// to all your email-sending hosts.
+
+#if 0   /* typical Received-SPF line */
+ Received-SPF: pass (google.com: domain of rpendarvis@brenau.edu designates 74.125.245.70 as permitted sender) client-ip=74.125.245.70;
+#endif
+
+#if 0   /* SPF users */
+ :; mail-scan +received-spf /home/jsd/Maildir/cur[/]*  |
+    sed 's/.*domain of\(.*\).*designates.*/XXX \1 YYY/' |
+    awk '/XXX/{print "<" $2 ">"}' | sort | uniq -c | sort -nr
+     81 <gmail.com>
+     17 <mac.com>
+      8 <gmx.net>
+      8 <bbruner@gmail.com>
+      7 <jsd@av8n.com>
+      6 <kst24@cam.ac.uk>
+      5 <farooq.w@gmail.com>
+      4 <scerri@chem.ucla.edu>
+      4 <comcast.net>
+      4 <c2i.net>
+      3 <gemort2006@gmail.com>
+      2 <rrhake@earthlink.net>
+      2 <hotmail.com>
+      2 <GCC.EDU>
+      1 <us.panasonic.com>
+      1 <sss.pgh.pa.us>
+      1 <scot_wherland@wsu.edu>
+      1 <rpendarvis@brenau.edu>
+      1 <hmperks@gmail.com>
+      1 <btv1==55494f7d7e0==matt.fisher@email.stvincent.edu>
+      1 <arcor.de>
+#endif
+
+#if 0   /* DKIM users */
+     52 d=googlegroups.com;
+     27 d=barackobama.com;
+     10 d=gmail.com;
+      5 d=bronto.com;
+      5 d=bluehornet.com;
+      4 d=news.abebooks.com;
+      2 d=yahoo.co.uk;
+      2 d=sbcglobal.net;
+      2 d=embarqmail.com;
+      2 d=emailms.angieslist.com;
+      1 d=newsletters.sourceforge.net;
+      1 d=members.ebay.com;
+      1 d=info.citibank.com;
+      1 d=ebay.com;
+      1 d=commail1.co.za;
+#endif
+
+  // Reasons for rejection accumulate in badnews; a whitelisted message
+  // is let through regardless of what is collected.
+  list<string> badnews;
+  int whitelisted(0);
+
+  // Whitelist:  the subject must contain both magic words.
+  if (subject.find("sesame") != string::npos
+        && subject.find("swordfish") != string::npos) {
+    whitelisted++;
+  }
+
+  if (delivered_to.length()){
+    cerr << progid <<  "Delivered-to: <<<" << delivered_to << ">>>" << endl;
+  }
+  // A Delivered-to equal to this address (after trimming and folding to
+  // lower case) is reported as a mail loop.
+  if (toLower(trim(delivered_to)) == "jean@av8n.com") {
+    badnews.push_back("Looping Delivered-to: " + delivered_to);
+  }
+
+  if (subject.find("-please-bounce-this-") != string::npos) {
+    badnews.push_back("by request");
+  }
+
+  if (!date.length()) {
+    badnews.push_back("no date");
+  }
+
+  if (mid_required && !message_id.length()) {
+    badnews.push_back("no message-id");
+  }
+
+  // Bad news is always logged, but it only ends the run when error_exit
+  // was requested; otherwise processing continues with the body.
+  if (badnews.size() && !whitelisted){
+    cerr << progid << " " << join(", ", badnews) << endl;
+    if (error_exit){
+      cerr << progid << " '" << from
+             << "' to '" << to
+             << "'" << endl;
+      exeunt(ex_spam);
+    }
+  }
+  return 0;
+}
+
+// Copy the message body from stdin to stdout (and into bigbuf), line by
+// line, while counting the lines that belong to a text part.
+//
+// The top-level Content-Type decides the starting state.  For a
+// multipart message, each "--<boundary>" line starts a block of
+// sub-headers that runs up to the next blank line; a Content-Type among
+// them switches text counting on or off for the part that follows.
+//
+// Returns ex_good on normal completion and 1 when cin reports a bad
+// state.  An oversized message, or one with no text lines at all, is
+// reported on stderr and ends the process via maybe_exeunt() when
+// error_exit is set.
+int skrewt::body(){
+  string main_contype;
+  if (content_type.length())
+    parse_content(content_type, main_contype, boundary);
+// some slightly-useful booleans:
+  int currently_text = main_contype == "text";
+  int main_multipart = main_contype == "multipart";
+
+// early-stage thinking has been done.
+// Now spew the rest of the message
+  int in_subheads(0);
+  int textlines(0);
+
+  // boundary no longer changes from here on (the sub-header parse below
+  // writes into a scratch variable), so build the delimiter once.
+  const string delimiter = "--" + boundary;
+
+  for (;;){             // outer loop over all lines in the body
+    if (cin.eof()) break;
+    if (cin.bad()) return 1;
+    string line;
+// on fail, go back to top of outer loop and check for eof versus bad
+    if (getline(cin, line).fail()) continue;
+    msgsize += line.length()+1;
+    if (msgsize > maxsize) {
+      cerr << progid << " rejection: bigger than " << maxsize << endl;
+      maybe_exeunt(ex_spam, error_exit);
+    }
+    bigbuf.push_back(line);
+    cout << line << endl;
+    if (in_subheads){
+      // a blank line (with or without a DOS CR) ends the sub-headers
+      if (line == "" || line == "\r") in_subheads = 0;
+    }
+    if (in_subheads){
+        string sub_contype;
+        string junk;
+// in principle could worry about folded headers,
+// but in this application it doesn't actually matter
+        string headword;
+        string rest;
+        size_t where = line.find(":");
+        if (where != string::npos) {
+          headword = line.substr(0, where);
+          rest = ltrim(line.substr(1+where));
+        }
+        headword = toLower(headword);
+        if (headword == "content-type") {
+          parse_content(rest, sub_contype, junk);
+          currently_text = sub_contype == "text";
+        }
+    } else {
+      // Compare without a trailing DOS CR, the same way the blank-line
+      // test above and headers() treat CRLF input; otherwise the parts
+      // of a CRLF multipart message would never be recognized.
+      if (main_multipart && noCR(line) == delimiter) {
+        in_subheads = 1;
+        continue;
+      }
+      if (currently_text) textlines++;
+    }
+  }
+
+  if (0) cerr << "textlines: " << textlines << endl;
+  if (!textlines) {
+    cerr << progid << " rejection: no text: " << error_exit << endl;
+    maybe_exeunt(ex_spam, error_exit);
+  }
+  cerr << progid << " normal completion" << endl;
+  return(ex_good);
+}
+
+////////////////////////////////////////////////////////////
+// Entry point:  record the program identity, apply the command-line
+// options to a skrewt object, then run its three stages in order
+// (headers, interstage, body).  The first stage that returns nonzero
+// determines the exit status; otherwise the result of body() is used.
+int main(int _argc, const char** _argv){
+
+  int remaining(_argc);
+  const char **cursor(_argv);
+
+  // argv[0] gives the program name; progid is "<basename>[<pid>]".
+  progname = *cursor++; remaining--;
+  mypid = getpid();
+  {
+    stringstream binder;
+    binder << basename(progname) << "[" << mypid << "]";
+    progid = binder.str();
+  }
+
+  skrewt mysk;
+
+  while (remaining != 0) {
+    string arg(*cursor); cursor++; remaining--;
+    // accept "--option" as a synonym for "-option"
+    if (arg.substr(0,2) == "--") arg = arg.substr(1);
+
+    if (prefix(arg, "-help")) {
+      usage(0);
+    }
+    if (prefix(arg, "-mid-required")) {
+      mysk.mid_required++;
+      continue;
+    }
+    if (prefix(arg, "-error-exit")) {
+      mysk.error_exit++;
+      continue;
+    }
+    if (prefix(arg, "-maxsize")) {
+      if (!remaining) {
+        cerr << "Option -maxsize requires an argument" << endl;
+        exit(ex_usage);
+      }
+      mysk.maxsize = atoi(*cursor); cursor++; remaining--;
+      continue;
+    }
+
+    // Anything else is a usage error; only the wording differs.
+    const char* complaint = arg.substr(0,1) == "-"
+        ? "Unrecognized option '"
+        : "Extraneous verbiage '";
+    cerr << complaint << arg << "'" << endl;
+    cerr << "For help, try:  " << progname << " -help" << endl;
+    exit(ex_usage);
+  }
+
+  int rslt = mysk.headers();
+  if (rslt) return rslt;
+
+// Headers are done.
+// Do some early-stage thinking.
+
+  rslt = mysk.interstage();
+  if (rslt) return rslt;
+
+  return mysk.body();
+
+}
author	John Denker <jsd@av8n.com>	2012-11-22 15:56:19 -0800
committer	John Denker <jsd@av8n.com>	2012-11-22 15:56:19 -0800
commit	bf1bf1c43a82ec167ae80f185fe11255cf3a5237 (patch)
tree	d8e92ada612dac7dd032e4dacdb2c6edbce00f90 /tools
parent	c01d3b2d57c2fb2491f664aa6d67f03e514cfbb3 (diff)