summaryrefslogtreecommitdiff
path: root/tools
diff options
context:
space:
mode:
authorJohn Denker <jsd@av8n.com>2012-11-22 15:56:19 -0800
committerJohn Denker <jsd@av8n.com>2012-11-22 15:56:19 -0800
commitbf1bf1c43a82ec167ae80f185fe11255cf3a5237 (patch)
treed8e92ada612dac7dd032e4dacdb2c6edbce00f90 /tools
parentc01d3b2d57c2fb2491f664aa6d67f03e514cfbb3 (diff)
ward is like skrewt, but new and experimental
Diffstat (limited to 'tools')
-rw-r--r--tools/ward.c654
1 files changed, 654 insertions, 0 deletions
diff --git a/tools/ward.c b/tools/ward.c
new file mode 100644
index 0000000..70265da
--- /dev/null
+++ b/tools/ward.c
@@ -0,0 +1,654 @@
+//////////////////
+// ward.c  (new, experimental variant of skrewt.c)
+//
+// scrutinize email
+//
+
+#include <iostream>
+#include <stdlib.h> /* for exit() */
+#include <string> /* for strcmp() */
+#include <ctype.h> /* toupper */
+#include <signal.h>
+
+#include <stdio.h> /* perror */
+#include <sstream>
+#include <vector>
+#include <list>
+
+using namespace std;
+
+// Print the command-line help text, then terminate the process.
+//   sts: exit status.  Nonzero sends the text to stderr (the help is part
+//        of an error report); zero sends it to stdout.
+// Never returns: always finishes by calling exit(sts).
+void usage(const int sts){
+  (sts ? cerr : cout) <<
+"Usage: skrewt [options]\n"
+"\n"
+" Scrutinizes email. Reads stdin, copies it to stdout.\n"
+" Exit result 0 means good, 21 means rejection (spam).\n"
+" Writes reason for rejection to stderr.\n"
+"\n"
+" Typically used as a filter in a pipeline, along with spamc -E\n"
+" Options\n"
+" -help print this msg (and exit immediately).\n"
+" -maxsize ii msg size in bytes; anything bigger will be rejected.\n"
+" -error-exit exit early if errors have been detected.\n"
+" -mid-required treat a missing Message-ID header as an error.\n"
+"\n"
+" Messages containing the string '-please-bounce-this-' will be rejected.\n"
+" Messages with no date will be rejected.\n"
+;
+  exit(sts);
+}
+
+#include "qq_exit_codes.h"
+#include "utils.h"
+#include "sepofra.h"
+
+/////////////////////////////////////////////////////////
+// Case insensitive comparison of strings
+
+// Function object giving a case-insensitive "less than" on strings,
+// e.g. for use as the ordering of a container keyed by std::string.
+// operator()(a, b) is true when a sorts strictly before b once both are
+// folded to upper case; a proper prefix sorts before the longer string.
+class lessthan_foldcase{
+public:
+  bool operator() (const std::string& a, const std::string& b) const {
+    const size_t a_len = a.length();
+    const size_t b_len = b.length();
+
+    const size_t lim = a_len < b_len ? a_len : b_len;
+
+    for (size_t i=0; i<lim; ++i)
+    {
+// toupper() is only defined for arguments representable as unsigned char
+// (or EOF).  Plain char may be signed, so bytes >= 0x80 must be converted
+// first; the results are compared as int so such bytes sort after ASCII.
+      const int cha = toupper(static_cast<unsigned char>(a[i]));
+      const int chb = toupper(static_cast<unsigned char>(b[i]));
+
+      if (cha < chb) return true;
+      if (cha > chb) return false;
+    }
+// here if one is an extension of the other (or they are equal):
+// only a strictly shorter a counts as less
+    return a_len < b_len;
+  }
+};
+
+
+// Three-way, case-insensitive comparison of two strings.
+// Returns negative if a is less than b in alphabetical order,
+// returns 0 if they are the same, or positive if a is greater.
+// Like perl cmp operator, but ignores case.
+// The magnitude says why: +-2 means a differing character was found,
+// +-1 means one string is a proper prefix of the other.
+int cmp_casefold(const std::string& a, const std::string& b) {
+  std::string::const_iterator aa = a.begin();
+  std::string::const_iterator bb = b.begin();
+  while (aa != a.end() && bb != b.end()){
+// tolower() is only defined for arguments representable as unsigned char
+// (or EOF); plain char may be negative for bytes >= 0x80, so convert first
+// and compare the results as int.
+    const int ca = tolower(static_cast<unsigned char>(*aa++));
+    const int cb = tolower(static_cast<unsigned char>(*bb++));
+    if (ca != cb) return ca < cb ? -2 : 2;
+  }
+  if (aa != a.end()) return 1; // a is longer
+  if (bb != b.end()) return -1; // b is longer
+  return 0;
+}
+
+// Return a copy of bar with a single trailing carriage return removed,
+// if it has one (i.e. strip the CR of a DOS-style line ending).
+// Only the last character is examined; anything else is returned as is.
+string noCR(const string bar){
+  const string::size_type n = bar.length();
+  const bool ends_in_cr = (n > 0) && (bar[n-1] == '\r');
+  return ends_in_cr ? bar.substr(0, n-1) : bar;
+}
+
+// Conditionally terminate the process with exit status sts.
+//   sts:    exit status to report.
+//   really: if zero, do nothing at all and return to the caller.
+// When sts is not ex_good and the environment variable HI_Q_GROUP is set,
+// SIGUSR1 is sent to the process group it names before exiting
+// (presumably so the other members of the pipeline notice the failure).
+// Does not return unless really is zero.
+void maybe_exeunt(const int sts, const int really){
+  if (!really) return;
+  if (sts == ex_good) exit(sts);
+
+  const char* foo = getenv("HI_Q_GROUP");
+  if (!foo) exit(sts);
+
+// The value is negated and handed to kill(), so it must be a plausible
+// process-group ID: 0 would mean "my own group", 1 would become
+// kill(-1,...) i.e. every process we may signal, and a negative number
+// would turn into a plain pid.  Refuse all of those (atoi yields 0 for
+// non-numeric text).
+  const int pgid = atoi(foo);
+  if (pgid <= 1) {
+    cerr << "bad HI_Q_GROUP '" << foo << "' ... not signalling" << endl;
+    exit(sts);
+  }
+
+// No point in signalling ourself:
+  sighandler_t rslt = signal(SIGUSR1, SIG_IGN);
+  if (rslt == SIG_ERR) {
+    cerr << "error setting signal" << endl;
+  }
+  int k = kill(-pgid, SIGUSR1);
+  if (k) {
+    cerr << "kill failed on group " << pgid << " ... ";
+    perror(0);
+  }
+  exit(sts);
+}
+
+// Unconditional form of maybe_exeunt(): always acts on sts.
+void exeunt(const int sts){
+  const int really = 1;
+  maybe_exeunt(sts, really);
+}
+
+string progname, progid; // progname: argv[0] as given; progid: "basename[pid]" tag that prefixes log messages (both set in main)
+int mypid; // our own process ID, filled in from getpid() in main
+
+/* Content-Type: text/plain; charset="us-ascii" */
+/* Content-Type: multipart/mixed; boundary="1170861315-1262462055-1341954763=:92165" */
+//
+
+// Pick apart the value of a Content-Type header field.
+//   type_spec_line: the field value, e.g.  multipart/mixed; boundary="xyz"
+//   maintype: (out) the part of the media type before the '/', folded to
+//             lower case; set to "" (with a complaint on stderr) if there
+//             is no '/'.
+//   boundary: (out) value of the boundary= parameter with any surrounding
+//             double quotes removed; left untouched if there is no such
+//             parameter.
+// Media types and parameter names are case-insensitive (RFC 2045), so
+// both are matched without regard to case.  The boundary value itself
+// is case-sensitive and is returned verbatim.
+void parse_content(const string type_spec_line,
+                   string &maintype, string &boundary) {
+  string get_type(type_spec_line);
+
+// split "type/subtype" from whatever parameters follow it
+  size_t where = get_type.find_first_of(" \t;\n");
+  string rest;
+  if (where != string::npos) {
+    rest = get_type.substr(where+1);
+    get_type = get_type.substr(0,where);
+  }
+  where = get_type.find("/");
+  if (where == string::npos){
+    maintype = "";
+    cerr << "could not find / in " << get_type << endl;
+  } else {
+    maintype = get_type.substr(0, where);
+    for (size_t ii = 0; ii < maintype.length(); ii++) {
+      maintype[ii] = static_cast<char>(
+                tolower(static_cast<unsigned char>(maintype[ii])));
+    }
+  }
+
+// now need to find boundary
+// Search in a lower-cased copy; it has the same length as rest, so an
+// offset found in the copy is valid in rest too.
+  string folded(rest);
+  for (size_t ii = 0; ii < folded.length(); ii++) {
+    folded[ii] = static_cast<char>(
+                tolower(static_cast<unsigned char>(folded[ii])));
+  }
+  const string srch = "boundary=";
+  where = folded.find(srch);
+  if (where == string::npos) return;    // no boundary parameter
+
+  boundary = rest.substr(where + srch.length());
+  if (!boundary.empty() && boundary[0] == '"') {
+// quoted value: runs up to the closing quote
+    boundary = boundary.substr(1);
+    where = boundary.find_first_of("\"");
+  } else {
+// bare token: runs up to the next separator
+    where = boundary.find_first_of(" \t;\n");
+  }
+  if (where != string::npos) {
+    boundary = boundary.substr(0, where);
+  }
+// otherwise the value extends to the end of the field
+}
+
+// Concatenate the elements of stuff into one string with sep in between.
+// A separator is inserted only when the text accumulated so far is
+// non-empty, so empty elements at the front contribute nothing at all.
+string join(const string sep, const list<string> stuff){
+  string glued;
+  list<string>::const_iterator item = stuff.begin();
+  while (item != stuff.end()) {
+    if (!glued.empty()) glued.append(sep);
+    glued.append(*item);
+    ++item;
+  }
+  return glued;
+}
+
+class skrewt{ // everything learned about the one message being read from stdin, plus option settings
+public:
+ string received_from; // envelope HELO among other things: value of the first "Received:" header that starts with "from "
+ string proximta_HELO; // HELO name picked out of received_from by krunch_rfrom(); same as proximta_rDNS if none given
+ string proximta_rDNS; // host name that follows the word "from" in received_from
+ string proximta_IP; // contents of the parenthesized item in received_from, minus any "user@" part
+ string proximta_AuthUser; // the part before '@' in that parenthesized item, if there is one
+ string return_path; // envelope MAIL FROM (value of the Return-Path header)
+ string boundary; // multipart boundary; starts as a placeholder, replaced when body() parses content_type
+ string to; // value of the To header
+ string from; // value of the From header
+ string subject; // value of the Subject header
+ string date; // value of the Date header
+ string message_id; // value of the Message-ID header
+ string content_type; // value of the top-level Content-Type header
+ string delivered_to; // value of the Delivered-To header
+ int msgsize; // running total of line lengths read so far, counting 1 extra per line
+ vector<string> bigbuf; // every line read so far, headers and body alike
+ int saw_blank_line; // set to 1 by headers() on reaching the blank line that ends the headers
+ int recno; // number of header records processed by headers()
+
+ int maxsize; // size limit compared against msgsize (option -maxsize)
+ int error_exit; // nonzero: exit on a detected problem rather than carrying on (option -error-exit)
+ int mid_required; // nonzero: a missing Message-ID counts as bad news (option -mid-required)
+
+ // constructor: counters and flags start at zero; maxsize defaults to 1000*1000
+ skrewt()
+ : boundary("x-xx-x"), msgsize(0), saw_blank_line(0), recno(0),
+ maxsize(1000*1000), error_exit(0), mid_required(0)
+ {}
+
+ int headers(); // read and record the header section
+ int interstage(); // checks made between headers and body
+ int body(); // copy the body, counting lines of text
+ int krunch_rfrom(); // split received_from into the proximta_* fields
+};
+
+#if 0 /* typical "Received: from" lines */
+Received: from lists.sourceforge.net (216.34.181.88)
+ by cloud.av8n.com with SMTP; 31 Jul 2012 22:13:48 -0000
+
+Received: from 24-145-119-127-dhcp.gsv.md.atlanticbb.net (HELO mail.phys-l.org) (24.145.119.127) by cloud.av8n.com with SMTP; 14 Jul 2012 23:56:54 -0000
+
+Received: from ip68-231-191-153.tc.ph.cox.net (HELO asclepias.av8n.net) (smtp@68.231.191.153) by cloud.av8n.com with SMTP; 15 Jul 2012 14:39:58 -0000
+#endif
+
+#if 0 /* good for testing */
+// random mail from FAA
+/home/jsd/Maildir/cur/1343769926.24228.cloud\:2\,
+
+// has a good SPF result buried inside, at an earlier hop:
+/home/jsd/Maildir/cur/1342372942.24810.cloud:2,
+
+// has a good SPF as delivered to us:
+/home/jsd/Maildir/cur/1343671179.10420.cloud:2,
+
+// The following msg has no message-id, but does have an
+// authorized submitter:
+/home/jsd/Maildir/cur/1342363199.24320.cloud:2,
+#endif
+
+// Split received_from (the text of a "Received: from ..." header) into
+// proximta_rDNS, proximta_HELO, proximta_IP and proximta_AuthUser.
+// Expected layout:  from <rDNS> [(HELO <name>)] ([<user>@]<ip>) ...
+// Returns 0 on success; on a malformed line, complains on stderr and
+// returns ex_syserr.
+int skrewt::krunch_rfrom(){
+  stringstream fields(received_from);
+  string token;
+
+// first word must be the literal "from"
+  fields >> token;
+  if (token != "from") {
+    cerr << progid << " bad 'Received: from' line ... '"
+         << token << "'" << endl;
+    return ex_syserr;
+  }
+
+// next comes the reverse-DNS name, optionally followed by "(HELO name)"
+  fields >> proximta_rDNS;
+  fields >> token;
+  if (token != "(HELO") {
+    proximta_HELO = proximta_rDNS;
+  } else {
+    fields >> proximta_HELO;
+    proximta_HELO = rtrim(proximta_HELO, "()");
+    fields >> token;
+  }
+
+// whatever token is current now must be the parenthesized address
+  const size_t n = token.length();
+  const bool parenthesized =
+        n >= 2 && token[0] == '(' && token[n-1] == ')';
+  if (!parenthesized) {
+    cerr << progid << " bad 'Received: from' line ;;; '"
+         << token << "'" << endl;
+    return ex_syserr;
+  }
+  proximta_IP = token.substr(1, n-2);
+
+// an authenticated submission looks like user@ip
+  const size_t at = proximta_IP.find("@");
+  if (at == string::npos) return 0;
+  proximta_AuthUser = proximta_IP.substr(0, at);
+  proximta_IP = proximta_IP.substr(at+1);
+  return 0;
+}
+
+// Read the header section of the message from stdin.
+// Every line consumed is echoed to stdout, appended to bigbuf and counted
+// in msgsize; going over maxsize is an immediate rejection via exeunt().
+// Continuation (folded) lines are glued onto the record they belong to,
+// and the fields of interest are stored in the corresponding members.
+// When a field occurs more than once the last occurrence wins, except
+// "Received", where only the first one beginning with "from " is kept.
+// Stops after the blank line that ends the headers (setting
+// saw_blank_line) or at end of input.
+// Returns 0 normally, 1 if the input stream goes bad.
+int skrewt::headers(){
+  for (;;){ // outer loop over all records in the header
+    if (cin.eof()) break;
+    if (cin.bad()) return 1;
+
+    string line;
+// on fail, go back to top of outer loop and check for eof versus bad
+    if (getline(cin, line).fail()) continue;
+    msgsize += line.length()+1;
+    if (msgsize > maxsize) {
+      cerr << progid << " rejection: bigger than " << maxsize << endl;
+      exeunt(ex_spam);
+    }
+    cout << line << endl;
+    bigbuf.push_back(line);
+    string headrec = noCR(line); // for a folded record, this is the first line
+
+// A blank line means no more headers in this message.  This has to be
+// decided before looking for continuation lines: otherwise a body whose
+// first line starts with a space or tab would be glued onto the blank
+// line, and the body would then be parsed as if it were more headers.
+    if (headrec.empty()) {
+      saw_blank_line = 1;
+      break;
+    }
+
+    for (;;) { // inner loop to build a multi-line record e.g. folded record:
+      if (cin.eof()) break;
+      if (cin.bad()) return 1;
+// peek at the next line's first character: whitespace means continuation
+      char ch;
+      if (cin.get(ch).fail()) continue;
+      cin.putback(ch);
+      if (ch != ' ' && ch != '\t') break;
+      string more;
+// on fail, go back to top of inner loop and check for eof versus bad
+      if (getline(cin, more).fail()) continue;
+      msgsize += more.length()+1;
+      if (msgsize > maxsize) {
+        cerr << progid << " rejection: bigger than " << maxsize << endl;
+        exeunt(ex_spam);
+      }
+      cout << more << endl;
+      bigbuf.push_back(more);
+      headrec += "\n" + noCR(more);
+    }
+// here with a fully assembled, non-empty header record
+// headrec (unlike line) contains no DOS CR characters
+
+// split "Name: value"; a record with no colon matches nothing below
+    string headword;
+    string rest;
+    size_t where = headrec.find(":");
+    if (where != string::npos) {
+      headword = headrec.substr(0, where);
+      rest = ltrim(headrec.substr(1+where));
+    }
+    headword = toLower(headword);
+    if (headword == "from") {
+      from = rest;
+    } else if (headword == "to") {
+      to = rest;
+    } else if (headword == "return-path") {
+      return_path = rest;
+    } else if (headword == "message-id") {
+      message_id = rest;
+    } else if (headword == "received") {
+// headers are in reverse-chronological order, so the first one seen
+// is the hop that delivered the message to us
+      if (!received_from.length() && prefix("from ", rest)){
+        received_from = rest;
+      }
+    } else if (headword == "date") {
+      date = rest;
+    } else if (headword == "subject") {
+      subject = rest;
+    } else if (headword == "content-type") {
+      content_type = rest;
+    } else if (headword == "delivered-to") {
+      delivered_to = rest;
+    }
+    recno++;
+  }
+  return 0;
+}
+
+int skrewt::interstage(){ // checks made after headers(), before body(): returns 0 to carry on, nonzero on failure
+ if (saw_blank_line) {/* ignore */}
+// Note that the headers are in reverse-chronological order:
+ cerr << progid <<" Return-path: " << return_path <<endl;
+
+ { // parse the 'Received: from' line:
+ cerr << " Received: " << received_from <<endl;
+ int rslt = krunch_rfrom();
+ if (rslt) return rslt; // malformed Received line: pass the error code up
+ cerr << " rDNS: " << proximta_rDNS << endl;
+ cerr << " HELO: " << proximta_HELO << endl;
+ cerr << " IP: " << proximta_IP << endl;
+ cerr << " AuthUser: " << proximta_AuthUser << endl;
+ cerr << " Mid '" << message_id << "'" << endl;
+ }
+
+ sepofra my_spf; // SPF checker declared in sepofra.h; its result is only logged here
+ try {
+ my_spf.check(proximta_IP,
+ proximta_HELO,
+ return_path,
+ "junk", 0/* verbosity */);
+ cerr << "*** " << my_spf.explain() << endl;
+ } catch (bad_thing foo) {
+ cerr << "Caught bad thing: " << foo.what() << endl;
+ return ex_syserr;
+ }
+
+// The logic here is: In order:
+// 1:: If whitelisted, accept. No greylisting, no spam-checking.
+// 2:: If blacklisted, reject. No greylisting, no spam-checking.
+// 3:: If good reputation, spam-check it and send it on its way.
+// 4:: If no reputation, greylist.
+// 5:: If bad reputation, ????
+
+// Expanding item 3 to the next level of detail:
+// 3a:: If some domain vouches for this sender-IP via SPF,
+// then the reputation is bound to the domain.
+// 3c:: If some domain vouches for the message via DKIM,
+// then the reputation is bound to the domain.
+// 3d:: If no SPF or DKIM, then the reputation attaches
+// to the sender-IP.
+
+// Expanding item 4 to the next level of detail:
+// 4a:: If the greylisting database says this message is ripe
+// spam-check it. If it's OK, use it to count toward reputation.
+// 4b:: If it is previously unseen or too old, start greylisting
+// timer from scratch. Reject with temporary error.
+// 4c:: If it is in the "green" state, let the timer
+// continue from where it is. Reject with temporary error.
+
+// Note: Reputation normally attaches to a domain.
+// With SPF, the domain vouches for the sender at a given IP address
+// ... and then the sender implicitly vouches for the message.
+// With DKIM, the domain vouches for an individual message.
+// With neither SPF nor DKIM, reputation attaches to the sender's
+// IP address. The sender vouches for the message.
+//
+// During greylisting, delay applies to the message. Reputation
+// applies to the domain (via SPF or DKIM) or to the server
+// (otherwise).
+
+
+// If you are a medium-sized operator, such that you have one
+// and only one IP address that ever sends email, and it is a
+// static IP address, then you don't have much to gain from
+// DKIM or SPF. Attaching a reputation to your domain is not
+// much different from attaching a reputation to your IP address.
+
+// In contrast, if you are a low-budget operator with a
+// dynamic IP address, you benefit from SPF and/or DKIM.
+// Your reputation attaches to your domain, and remains
+// stable even as your IP address changes.
+
+// At the other extreme, if you are a big-time operator
+// such as googlegroups.com, you benefit from DKIM and/or
+// SPF. Your IP addresses are not dynamic, but they are
+// numerous, so you prefer to have your reputation apply
+// to all your email-sending hosts.
+
+#if 0 /* typical Received-SPF line */
+ Received-SPF: pass (google.com: domain of rpendarvis@brenau.edu designates 74.125.245.70 as permitted sender) client-ip=74.125.245.70;
+#endif
+
+#if 0 /* SPF users */
+ :; mail-scan +received-spf /home/jsd/Maildir/cur[/]* |
+ sed 's/.*domain of\(.*\).*designates.*/XXX \1 YYY/' |
+ awk '/XXX/{print "<" $2 ">"}' | sort | uniq -c | sort -nr
+ 81 <gmail.com>
+ 17 <mac.com>
+ 8 <gmx.net>
+ 8 <bbruner@gmail.com>
+ 7 <jsd@av8n.com>
+ 6 <kst24@cam.ac.uk>
+ 5 <farooq.w@gmail.com>
+ 4 <scerri@chem.ucla.edu>
+ 4 <comcast.net>
+ 4 <c2i.net>
+ 3 <gemort2006@gmail.com>
+ 2 <rrhake@earthlink.net>
+ 2 <hotmail.com>
+ 2 <GCC.EDU>
+ 1 <us.panasonic.com>
+ 1 <sss.pgh.pa.us>
+ 1 <scot_wherland@wsu.edu>
+ 1 <rpendarvis@brenau.edu>
+ 1 <hmperks@gmail.com>
+ 1 <btv1==55494f7d7e0==matt.fisher@email.stvincent.edu>
+ 1 <arcor.de>
+#endif
+
+#if 0 /* DKIM users */
+ 52 d=googlegroups.com;
+ 27 d=barackobama.com;
+ 10 d=gmail.com;
+ 5 d=bronto.com;
+ 5 d=bluehornet.com;
+ 4 d=news.abebooks.com;
+ 2 d=yahoo.co.uk;
+ 2 d=sbcglobal.net;
+ 2 d=embarqmail.com;
+ 2 d=emailms.angieslist.com;
+ 1 d=newsletters.sourceforge.net;
+ 1 d=members.ebay.com;
+ 1 d=info.citibank.com;
+ 1 d=ebay.com;
+ 1 d=commail1.co.za;
+#endif
+
+ list<string> badnews; // reasons for rejection collected by the checks below
+ int whitelisted(0); // nonzero suppresses the badnews report and exit below
+
+ if (subject.find("sesame") != string::npos
+ && subject.find("swordfish") != string::npos) {
+ whitelisted++;
+ }
+
+ if (delivered_to.length()){
+ cerr << progid << "Delivered-to: <<<" << delivered_to << ">>>" << endl;
+ }
+ if (toLower(trim(delivered_to)) == "jean@av8n.com") {
+ badnews.push_back("Looping Delivered-to: " + delivered_to);
+ }
+
+ if (subject.find("-please-bounce-this-") != string::npos) {
+ badnews.push_back("by request");
+ }
+
+ if (!date.length()) {
+ badnews.push_back("no date");
+ }
+
+ if (mid_required && !message_id.length()) {
+ badnews.push_back("no message-id");
+ }
+
+ if (badnews.size() && !whitelisted){
+ cerr << progid << " " << join(", ", badnews) << endl;
+ if (error_exit){ // without -error-exit the bad news is only logged
+ cerr << progid << " '" << from
+ << "' to '" << to
+ << "'" << endl;
+ exeunt(ex_spam);
+ }
+ }
+ return 0;
+}
+
+// Copy the message body from stdin to stdout, counting lines of text.
+// Each line is echoed, appended to bigbuf and counted in msgsize; going
+// over maxsize, or finishing with no text lines at all, is reported as
+// a rejection, and ends the program if error_exit is set.
+// In a multipart message, the headers that follow each part delimiter
+// are scanned for Content-Type so that only lines inside text parts
+// are counted.
+// Returns ex_good on reaching end of input, 1 if the stream goes bad.
+int skrewt::body(){
+  string main_contype;
+  if (content_type.length())
+    parse_content(content_type, main_contype, boundary);
+// media types are case-insensitive, so fold before comparing
+  main_contype = toLower(main_contype);
+// some slightly-useful booleans:
+  int currently_text = main_contype == "text";
+  int main_multipart = main_contype == "multipart";
+
+// early-stage thinking has been done.
+// Now spew the rest of the message
+  int in_subheads(0);
+  int textlines(0);
+  const string delimiter = "--" + boundary;
+
+  for (;;){ // outer loop over all lines in the body
+    if (cin.eof()) break;
+    if (cin.bad()) return 1;
+    string line;
+// on fail, go back to top of outer loop and check for eof versus bad
+    if (getline(cin, line).fail()) continue;
+    msgsize += line.length()+1;
+    if (msgsize > maxsize) {
+      cerr << progid << " rejection: bigger than " << maxsize << endl;
+      maybe_exeunt(ex_spam, error_exit);
+    }
+    bigbuf.push_back(line);
+    cout << line << endl;
+
+// The line is echoed verbatim above, but all the tests below use a copy
+// without the DOS CR, so that CRLF-terminated messages are recognized
+// just like LF-only ones (the delimiter test used to miss them).
+    const string bare = noCR(line);
+    if (in_subheads){
+      if (bare == "") in_subheads = 0;     // blank line ends the part's headers
+    }
+    if (in_subheads){
+      string sub_contype;
+      string junk;
+// in principle could worry about folded headers,
+// but in this application it doesn't actually matter
+      string headword;
+      string rest;
+      size_t where = bare.find(":");
+      if (where != string::npos) {
+        headword = bare.substr(0, where);
+        rest = ltrim(bare.substr(1+where));
+      }
+      headword = toLower(headword);
+      if (headword == "content-type") {
+        parse_content(rest, sub_contype, junk);
+        currently_text = toLower(sub_contype) == "text";
+      }
+    } else {
+      if (main_multipart && bare == delimiter) {
+// start of a new part: its own headers come next
+        in_subheads = 1;
+        continue;
+      }
+      if (currently_text) textlines++;
+    }
+  }
+
+  if (!textlines) {
+    cerr << progid << " rejection: no text: " << error_exit << endl;
+    maybe_exeunt(ex_spam, error_exit);
+  }
+  cerr << progid << " normal completion" << endl;
+  return(ex_good);
+}
+
+////////////////////////////////////////////////////////////
+// Program entry point.
+// Sets up progname/mypid/progid, parses the command-line options into a
+// skrewt object, then runs its three stages in order: headers(),
+// interstage(), body().  The first stage to return nonzero ends the run,
+// and that value becomes the process exit status.
+// Options may be written with one or two leading dashes.
+int main(int _argc, const char** _argv){
+
+  int argc(_argc);
+  const char **argv(_argv);
+  {
+    progname = *argv++; argc--;
+    mypid = getpid();
+    stringstream binder;
+    binder << basename(progname) << "[" << mypid << "]";
+    progid = binder.str();
+  }
+
+  skrewt mysk;
+
+  while (argc) {
+    string arg(*argv); argv++; argc--;
+// treat "--option" the same as "-option"
+    if (arg.substr(0,2) == "--") arg = arg.substr(1);
+// NOTE(review): prefix(arg, "-xyz") appears to accept any abbreviation
+// of the option name -- confirm against utils.h
+    if (prefix(arg, "-help")) {
+      usage(0);
+    }
+    if (prefix(arg, "-mid-required")) {
+      mysk.mid_required++;
+    } else if (prefix(arg, "-error-exit")) {
+      mysk.error_exit++;
+    } else if (prefix(arg, "-maxsize")) {
+      if (!argc) {
+        cerr << "Option -maxsize requires an argument" << endl;
+        exit(ex_usage);
+      }
+      const char* txt = *argv; argv++; argc--;
+// Insist on a complete, non-negative number that fits in an int.
+// (atoi would silently turn garbage into 0, and a limit of 0
+// rejects every message.)
+      char* endp = 0;
+      const long val = strtol(txt, &endp, 10);
+      const int as_int = static_cast<int>(val);
+      if (endp == txt || *endp != '\0' || val < 0 || as_int != val) {
+        cerr << "Option -maxsize requires a non-negative integer, not '"
+             << txt << "'" << endl;
+        exit(ex_usage);
+      }
+      mysk.maxsize = as_int;
+    } else if (arg.substr(0,1) == "-") {
+      cerr << "Unrecognized option '" << arg << "'" << endl;
+      cerr << "For help, try: " << progname << " -help" << endl;
+      exit(ex_usage);
+    } else {
+      cerr << "Extraneous verbiage '" << arg << "'" << endl;
+      cerr << "For help, try: " << progname << " -help" << endl;
+      exit(ex_usage);
+    }
+  }
+
+  int rslt = mysk.headers();
+  if (rslt) return rslt;
+
+// Headers are done.
+// Do some early-stage thinking.
+
+  rslt = mysk.interstage();
+  if (rslt) return rslt;
+
+  rslt = mysk.body();
+  return rslt;
+
+}