diff options
-rw-r--r-- | tools/ward.c | 654 |
1 files changed, 654 insertions, 0 deletions
diff --git a/tools/ward.c b/tools/ward.c new file mode 100644 index 0000000..70265da --- /dev/null +++ b/tools/ward.c @@ -0,0 +1,654 @@ +////////////////// +// skrewt.c +// +// scrutinize email +// + +#include <iostream> +#include <stdlib.h> /* for exit() */ +#include <string> /* for strcmp() */ +#include <ctype.h> /* toupper */ +#include <signal.h> + +#include <stdio.h> /* perror */ +#include <sstream> +#include <vector> +#include <list> + +using namespace std; + +void usage(const int sts){ + (sts ? cerr : cout) << +"Usage: skrewt [options]\n" +"\n" +" Scrutinizes email. Reads stdin, copies it to stdout.\n" +" Exit result 0 means good, 21 means rejection (spam).\n" +" Writes reason for rejection to stderr.\n" +"\n" +" Typically used as a filter in a pipeline, along with spamc -E\n" +" Options\n" +" -help print this msg (and exit immediately).\n" +" -maxsize ii msg size in bytes; anything bigger will be rejected.\n" +" -error-exit exit early if errors have been detected.\n" +"\n" +" Messages containing the string '-please-bounce-this-' will be rejected.\n" +" Messages with no date will be rejected.\n" +; + exit(sts); +} + +#include "qq_exit_codes.h" +#include "utils.h" +#include "sepofra.h" + +///////////////////////////////////////////////////////// +// Case insensitive comparison of strings + +class lessthan_foldcase{ +public: + bool operator() (const std::string& a, const std::string& b) const { + size_t a_len = a.length(); + size_t b_len = b.length(); + + size_t lim = a_len < b_len ? a_len : b_len; + + for (size_t i=0; i<lim; ++i) + { + char cha = toupper(a[i]); + char chb = toupper(b[i]); + + if (cha < chb) return true; + if (cha > chb) return false; + } + // here if one is an extension of the other + if ( a_len < b_len ) return true; + return false; + } +}; + + +// Returns negative if a is less than b in alphabetical order +// returns 0 if they are the same, or positive if a is greater. +// Like perl cmp operator, but ignores case. +int cmp_casefold(const std::string& a, const std::string& b) { + string::const_iterator aa, bb; + aa = a.begin(); + bb = b.begin(); + while (aa != a.end() && bb != b.end()){ + char ca = tolower(*aa++); + char cb = tolower(*bb++); + if (ca != cb) return ca < cb ? -2 : 2; + } + if (aa != a.end()) return 1; // a is longer + if (bb != b.end()) return -1; // b is longer + return 0; +} + +string noCR(const string bar){ + string foo(bar); + int len = foo.length(); + if (len){ + if (foo[len-1] == '\r') { + foo.erase(len-1); + } + } + return foo; +} + +void maybe_exeunt(const int sts, const int really){ + if (!really) return; + if (sts == ex_good) exit(sts); + + const char* foo = getenv("HI_Q_GROUP"); + if (!foo) exit(sts); + +// No point in signalling ourself: + sighandler_t rslt = signal(SIGUSR1, SIG_IGN); + if (rslt == SIG_ERR) { + cerr << "error setting signal" << endl; + } + int k = kill(-atoi(foo), SIGUSR1); + if (k) { + cerr << "kill failed on group " << atoi(foo) << " ... "; + perror(0); + } + exit(sts); +} + +void exeunt(const int sts){ + maybe_exeunt(sts, 1); +} + +string progname, progid; +int mypid; + +/* Content-Type: text/plain; charset="us-ascii" */ +/* Content-Type: multipart/mixed; boundary="1170861315-1262462055-1341954763=:92165" */ +// + +void parse_content(const string type_spec_line, + string &maintype, string &boundary) { + //xxx cerr << "parser called with: " << type_spec_line << endl; + string get_type(type_spec_line); + + size_t where = get_type.find_first_of(" \t;\n"); + string rest; + if (where == string::npos) { + // keep whole string + } + else { + rest = get_type.substr(where+1); + get_type = get_type.substr(0,where); + } + where = get_type.find("/"); + if (where == string::npos){ + maintype = ""; + cerr << "could not find / in " << get_type << endl; + } else { + maintype = get_type.substr(0, where); + } + +// now need to find boundary + + string srch = "boundary="; + where = rest.find(srch); + if (where != string::npos) { + where += srch.length(); + boundary = rest.substr(where); + if (boundary[0] == '"') { + boundary = boundary.substr(1); + where = boundary.find_first_of("\""); + } else { + where = boundary.find_first_of(" \t;\n"); + } + if (where == string::npos) { + /* do nothing, boundary=boundary as a whole */ + } else { + boundary = boundary.substr(0, where); + } + } else { + //xxxxxxx cerr << "boundary= not found in " << type_spec_line << endl; + } +} + +string join(const string sep, const list<string> stuff){ + string rslt; + for (list<string>::const_iterator ptr = stuff.begin(); + ptr != stuff.end(); ptr++){ + if (rslt.length()) rslt += sep; + rslt += *ptr; + } + return rslt; +} + +class skrewt{ +public: + string received_from; // envelope HELO among other things + string proximta_HELO; + string proximta_rDNS; + string proximta_IP; + string proximta_AuthUser; + string return_path; // envelope MAIL FROM + string boundary; + string to; + string from; + string subject; + string date; + string message_id; + string content_type; + string delivered_to; + int msgsize; + vector<string> bigbuf; + int saw_blank_line; + int recno; + + int maxsize; + int error_exit; + int mid_required; + + // constructor + skrewt() + : boundary("x-xx-x"), msgsize(0), saw_blank_line(0), recno(0), + maxsize(1000*1000), error_exit(0), mid_required(0) + {} + + int headers(); + int interstage(); + int body(); + int krunch_rfrom(); +}; + +#if 0 /* typical "Received: from" lines */ +Received: from lists.sourceforge.net (216.34.181.88) + by cloud.av8n.com with SMTP; 31 Jul 2012 22:13:48 -0000 + +Received: from 24-145-119-127-dhcp.gsv.md.atlanticbb.net (HELO mail.phys-l.org) (24.145.119.127) by cloud.av8n.com with SMTP; 14 Jul 2012 23:56:54 -0000 + +Received: from ip68-231-191-153.tc.ph.cox.net (HELO asclepias.av8n.net) (smtp@68.231.191.153) by cloud.av8n.com with SMTP; 15 Jul 2012 14:39:58 -0000 +#endif + +#if 0 /* good for testing */ +// random mail from FAA +/home/jsd/Maildir/cur/1343769926.24228.cloud\:2\, + +// has a good SPF result buried inside, at an earlier hop: +/home/jsd/Maildir/cur/1342372942.24810.cloud:2, + +// has a good SPF as delivered to us: +/home/jsd/Maildir/cur/1343671179.10420.cloud:2, + +// The following msg has no message-id, but does have an +// authorized submitter: +/home/jsd/Maildir/cur/1342363199.24320.cloud:2, +#endif + +int skrewt::krunch_rfrom(){ + stringstream parse; + parse.str(received_from); + string word; + parse >> word; + if (word != "from") { + cerr << progid << " bad 'Received: from' line ... '" + << word << "'" << endl; + return ex_syserr; + } + parse >> proximta_rDNS; + parse >> word; + if (word == "(HELO") { + parse >> proximta_HELO; + proximta_HELO = rtrim(proximta_HELO, "()"); + parse >> word; + } else { + proximta_HELO = proximta_rDNS; + } + size_t len = word.length(); + if (len<2 || word[0] != '(' || word[len-1] != ')') { + cerr << progid << " bad 'Received: from' line ;;; '" + << word << "'" << endl; + return ex_syserr; + } + proximta_IP = word.substr(1, len-2); + size_t where = proximta_IP.find("@"); + if (where != string::npos){ + proximta_AuthUser = proximta_IP.substr(0, where); + proximta_IP = proximta_IP.substr(1+where); + } + + return 0; +} + +int skrewt::headers(){ + //xxxx cerr << progid << " begins" << endl; + for (;;){ // outer loop over all records in the header + if (cin.eof()) break; + if (cin.bad()) return 1; + + string line; +// on fail, go back to top of outer loop and check for eof versus bad + if (getline(cin, line).fail()) continue; + msgsize += line.length()+1; + if (msgsize > maxsize) { + cerr << progid << " rejection: bigger than " << maxsize << endl; + exeunt(ex_spam); + } + cout << line << endl; + bigbuf.push_back(line); + string headrec = noCR(line); // for a folded record, this is the first line + + for (;;) { // inner loop to build a multi-line record e.g. folded record: + if (cin.eof()) break; + if (cin.bad()) return 1; + char ch; + if (cin.get(ch).fail()) continue; + cin.putback(ch); + if (ch != ' ' && ch != '\t') break; + string line; +// on fail, go back to top of inner loop and check for eof versus bad + if (getline(cin, line).fail()) continue; + msgsize += line.length()+1; + if (msgsize > maxsize) { + cerr << progid << " rejection: bigger than " << maxsize << endl; + exeunt(ex_spam); + } + cout << line << endl; + bigbuf.push_back(line); + headrec += "\n" + noCR(line); + } +// here with a fully assembled header record +// headrec (unlike line) contains no DOS CR characters + int len = headrec.length(); + if (len == 0) { + saw_blank_line = 1; + break; // no more headers in this message + } + +// here if it's a header line + string headword; + string rest; + size_t where = headrec.find(":"); + if (where != string::npos) { + headword = headrec.substr(0, where); + rest = ltrim(headrec.substr(1+where)); + } + headword = toLower(headword); + if (0){ + } else if (headword == "from") { + from = rest; + } else if (headword == "to") { + to = rest; + } else if (headword == "return-path") { + return_path = rest; + } else if (headword == "message-id") { + message_id = rest; + } else if (headword == "received") { + if (!received_from.length() && prefix("from ", rest)){ + received_from = rest; + } + } else if (headword == "date") { + date = rest; + } else if (headword == "subject") { + subject = rest; + } else if (headword == "content-type") { + content_type = rest; + } else if (headword == "delivered-to") { + delivered_to = rest; + } + //xxxx cout << headrec.length() << " ... "; + recno++; + if (0) if (recno <= 6) cerr << progid << "#" << recno + << " " << headrec << endl; + } + return 0; +} + +int skrewt::interstage(){ + if (saw_blank_line) {/* ignore */} +// Note that the headers are in reverse-chronological order: + cerr << progid <<" Return-path: " << return_path <<endl; + + { // parse the 'Received: from' line: + cerr << " Received: " << received_from <<endl; + int rslt = krunch_rfrom(); + if (rslt) return rslt; + cerr << " rDNS: " << proximta_rDNS << endl; + cerr << " HELO: " << proximta_HELO << endl; + cerr << " IP: " << proximta_IP << endl; + cerr << " AuthUser: " << proximta_AuthUser << endl; + cerr << " Mid '" << message_id << "'" << endl; + } + + sepofra my_spf; + try { + my_spf.check(proximta_IP, + proximta_HELO, + return_path, + "junk", 0/* verbosity */); + cerr << "*** " << my_spf.explain() << endl; + } catch (bad_thing foo) { + cerr << "Caught bad thing: " << foo.what() << endl; + return ex_syserr; + } + +// The logic here is: In order: +// 1:: If whitelisted, accept. No greylisting, no spam-checking. +// 2:: If blacklisted, reject. No greylisting, no spam-checking. +// 3:: If good reputation, spam-check it and send it on its way. +// 4:: If no reputation, greylist. +// 5:: If bad reputation, ???? + +// Expanding item 3 to the next level of detail: +// 3a:: If some domain vouches for this sender-IP via SPF, +// then the reputation is bound to the domain. +// 3c:: If some domain vouches for the message vie DKIM, +// then the reputation is bound to the domain. +// 3d:: If no SPF or DKIM, then the reputation attaches +// to the sender-IP. + +// Expanding item 4 to the next level of detail: +// 4a:: If the greylisting database says this message is ripe +// spam-check it. If it's OK, use it to count toward reputation. +// 4b:: If it is previously unseen or too old, start greylisting +// timer from scratch. Reject with temporary error. +// 4c:: If it is in the "green" state, let the timer +// continue from where it is. Reject with temporary error. + +// Note: Reputation normally attaches to a domain. +// With SPF, the domain vouches for the sender at a given IP address +// ... and then the sender implicitly vouches for the message. +// With DKIM, the domain vouches for an individual message. +// With neither SPF nor DKIM, reputation attaches to the sender's +// IP address. The sender vouches for the message. +// +// During greylisting, delay applies to the message. Reputation +// applies to the domain (via SPF or DKIM) or to the server +// (otherwise). + + +// If you are a medium-sized operator, such that you have one +// and only one IP address that ever sends email, and it is a +// static IP address, then you don't have much to gain from +// DKIM or SPF. Attaching a reputation to your domain is not +// much different from attaching a reputation to your IP address. + +// In constrast, if you are a low-budget operator with a +// dynamic IP address, you benefit from SPF and/or DKIM. +// Your reputation attaches to your domain, and remains +// stable even as your IP address changes. + +// At the other extreme, if you are a big-time operator +// such as googlegroups.com, you benefit from DKIM and/or +// SPF. Your IP addresses are not dynamic, but they are +// numerous, so you prefer to have your reputation apply +// to all your email-sending hosts. + +#if 0 /* typical Received-SPF line */ + Received-SPF: pass (google.com: domain of rpendarvis@brenau.edu designates 74.125.245.70 as permitted sender) client-ip=74.125.245.70; +#endif + +#if 0 /* SPF users */ + :; mail-scan +received-spf /home/jsd/Maildir/cur[/]* | + sed 's/.*domain of\(.*\).*designates.*/XXX \1 YYY/' | + awk '/XXX/{print "<" $2 ">"}' | sort | uniq -c | sort -nr + 81 <gmail.com> + 17 <mac.com> + 8 <gmx.net> + 8 <bbruner@gmail.com> + 7 <jsd@av8n.com> + 6 <kst24@cam.ac.uk> + 5 <farooq.w@gmail.com> + 4 <scerri@chem.ucla.edu> + 4 <comcast.net> + 4 <c2i.net> + 3 <gemort2006@gmail.com> + 2 <rrhake@earthlink.net> + 2 <hotmail.com> + 2 <GCC.EDU> + 1 <us.panasonic.com> + 1 <sss.pgh.pa.us> + 1 <scot_wherland@wsu.edu> + 1 <rpendarvis@brenau.edu> + 1 <hmperks@gmail.com> + 1 <btv1==55494f7d7e0==matt.fisher@email.stvincent.edu> + 1 <arcor.de> +#endif + +#if 0 /* DKIM users */ + 52 d=googlegroups.com; + 27 d=barackobama.com; + 10 d=gmail.com; + 5 d=bronto.com; + 5 d=bluehornet.com; + 4 d=news.abebooks.com; + 2 d=yahoo.co.uk; + 2 d=sbcglobal.net; + 2 d=embarqmail.com; + 2 d=emailms.angieslist.com; + 1 d=newsletters.sourceforge.net; + 1 d=members.ebay.com; + 1 d=info.citibank.com; + 1 d=ebay.com; + 1 d=commail1.co.za; +#endif + + list<string> badnews; + int whitelisted(0); + + if (subject.find("sesame") != string::npos + && subject.find("swordfish") != string::npos) { + whitelisted++; + } + + if (delivered_to.length()){ + cerr << progid << "Delivered-to: <<<" << delivered_to << ">>>" << endl; + } + if (toLower(trim(delivered_to)) == "jean@av8n.com") { + badnews.push_back("Looping Delivered-to: " + delivered_to); + } + + if (subject.find("-please-bounce-this-") != string::npos) { + badnews.push_back("by request"); + } + + if (!date.length()) { + badnews.push_back("no date"); + } + + if (mid_required && !message_id.length()) { + badnews.push_back("no message-id"); + } + + if (badnews.size() && !whitelisted){ + cerr << progid << " " << join(", ", badnews) << endl; + if (error_exit){ + cerr << progid << " '" << from + << "' to '" << to + << "'" << endl; + exeunt(ex_spam); + } + } + return 0; +} + +int skrewt::body(){ + string main_contype; + if (content_type.length()) + parse_content(content_type, main_contype, boundary); +// some slightly-useful booleans: + int currently_text = main_contype == "text"; + int main_multipart = main_contype == "multipart"; + +// early-stage thinking has been done. +// Now spew the rest of the message + //xxxx cerr << "body begins: " << main_contype << " " << currently_text << " " << boundary << endl; + int in_subheads(0); + int textlines(0); + + for (;;){ // outer loop over all lines in the body + if (cin.eof()) break; + if (cin.bad()) return 1; + string line; +// on fail, go back to top of outer loop and check for eof versus bad + if (getline(cin, line).fail()) continue; + msgsize += line.length()+1; + if (msgsize > maxsize) { + cerr << progid << " rejection: bigger than " << maxsize << endl; + maybe_exeunt(ex_spam, error_exit); + } + bigbuf.push_back(line); + cout << line << endl; + if (in_subheads){ + if (line == "" || line == "\r") in_subheads = 0; + } + if (in_subheads){ + string sub_contype; + string junk; +// in principle could worry about folded headers, +// but in this application it doesn't actually matter + string headword; + string rest; + size_t where = line.find(":"); + if (where != string::npos) { + headword = line.substr(0, where); + rest = ltrim(line.substr(1+where)); + } + headword = toLower(headword); + if (headword == "content-type") { + parse_content(rest, sub_contype, junk); + currently_text = sub_contype == "text"; + //xxxx cerr << "setting contype '" << sub_contype << "' " << currently_text << " ... " << textlines << endl; + } + } else { + if (main_multipart && line == "--" + boundary) { + //xxxx cerr << "found subhead boundary" << endl; + in_subheads = 1; + continue; + } + if (currently_text) textlines++; + } + } + + if (0) cerr << "textlines: " << textlines << endl; + if (!textlines) { + cerr << progid << " rejection: no text: " << error_exit << endl; + maybe_exeunt(ex_spam, error_exit); + } + cerr << progid << " normal completion" << endl; + return(ex_good); +} + +//////////////////////////////////////////////////////////// +int main(int _argc, const char** _argv){ + + int argc(_argc); + const char **argv(_argv); + { + progname = *argv++; argc--; + mypid = getpid(); + stringstream binder; + binder << basename(progname) << "[" << mypid << "]"; + progid = binder.str(); + } + + skrewt mysk; + + while (argc) { + string arg(*argv); argv++; argc--; + if (arg.substr(0,2) == "--") arg = arg.substr(1); + if (prefix(arg, "-help")) { + usage(0); + } + if (0) { + } else if (prefix(arg, "-mid-required")) { + mysk.mid_required++; + } else if (prefix(arg, "-error-exit")) { + mysk.error_exit++; + } else if (prefix(arg, "-maxsize")) { + if (!argc) { + cerr << "Option -maxsize requires an argument" << endl; + exit(ex_usage); + } + mysk.maxsize = atoi(*argv); argv++; argc--; + } else if (arg.substr(0,1) == "-") { + cerr << "Unrecognized option '" << arg << "'" << endl; + cerr << "For help, try: " << progname << " -help" << endl; + exit(ex_usage); + } else { + cerr << "Extraneous verbiage '" << arg << "'" << endl; + cerr << "For help, try: " << progname << " -help" << endl; + exit(ex_usage); + } + } + + int rslt = mysk.headers(); + if (rslt) return rslt; + +// Headers are done. +// Do some early-stage thinking. + + rslt = mysk.interstage(); + if (rslt) return rslt; + + rslt = mysk.body(); + return rslt; + +} |