#include #include #include #include #include "parse_csv.h" using namespace std; /************* Format defined: https://tools.ietf.org/html/rfc4180 See also: https://en.wikipedia.org/wiki/Comma-separated_values We depart from the RFC by ignoring \r, so that lines can end with either \r\n (dos style) or simply \n (unix style). We check for malformed quoted fields and throw exceptions. The last line of the input .csv file may or may not contain a newline: :; echo -en "xxx\n" | ./parse_csv_demo - | od -b -Anone 170 170 170 012 :; echo -en "xxx" | ./parse_csv_demo - | od -b -Anone 170 170 170 012 A line containing nothing but the newline has one field with a zero-length datum: :; echo -en "\n" | ./parse_csv_demo - | od -b -Anone 012 A line containing no data /and/ no newline isn't a line at all. No line, no fields, no data: :; echo -en "" | ./parse_csv_demo - | wc 0 0 0 ************/ string const Q("\""); enum class CSVState { Start, UnquotedField, QuotedField, QuoteInQuote }; // Read one row of .csv table. // Return a zero-length vector if we encounter EoF // before seeing any data. // Note that a table row can extend across multiple .csv file lines, // if there are embedded newlines. // // Within quoted fields, we accept commas, quotes, and newlines, // e.g "Hughie, ""Louie"", Dewey". template vector readCSVRow(istream& input) { using namespace std; if (input.eof()) return vector(0); CSVState state = CSVState::Start; vector fields(0); char c; for (;;) { input.read(&c, 1); if (input.eof()) { if (state == CSVState::QuotedField) throw runtime_error("Unterminated quote in .csv file in field# " + to_string(fields.size()) + "\n... namely <" + Q + fields.back() + ">"); break; } if (!input.good()) throw runtime_error("input error in .csv file"); // if we made it this far, the line contains at least one datum: if (fields.size() == 0) fields.push_back(datum()); switch (state) { case CSVState::Start: switch (c) { case ',': // zero-length field fields.push_back(datum()); break; case '"': state = CSVState::QuotedField; // This is no-op if the datum is not of type qstring: set_q(fields.back(), 1); break; case '\r': break; case '\n': goto EndRecord; default: state = CSVState::UnquotedField; fields.back().push_back(c); break; } break; case CSVState::UnquotedField: switch (c) { case ',': // start a new field: state = CSVState::Start; fields.push_back(datum()); break; case '"': throw runtime_error ("Stray quote in .csv file in field# " + to_string(fields.size()) + "\n... namely <" + fields.back() + Q + ">" ); break; case '\r': break; case '\n': goto EndRecord; default: fields.back().push_back(c); break; } break; case CSVState::QuotedField: switch (c) { case '"': state = CSVState::QuoteInQuote; break; // Accept anything except quote here; // that includes comma, \r, and \n. default: fields.back().push_back(c); break; } break; // Previous state saw a quote; we consider what follows: case CSVState::QuoteInQuote: switch (c) { case ',': // comma after closing quote state = CSVState::Start; fields.push_back(datum()); break; case '"': // double ", map to plain " fields.back().push_back('"'); state = CSVState::QuotedField; break; case '\r': break; case '\n': goto EndRecord; default: throw runtime_error ("Extraneous verbiage in .csv file after quoted field# " + to_string(fields.size()) + "\n... namely <" + Q + fields.back() + Q + string(1,c) +">"); break; } break; } } EndRecord:;;;; return fields; } // Read entire CSV file. // Within quoted fields, we accept commas, quotes, and newlines, // e.g "Hughie, ""Louie"", Dewey". template vector > readCSV(istream &in) { using namespace std; vector > table; datum row; while (!in.eof()) { auto fields = readCSVRow(in); if (fields.size() == 0) break; table.push_back(fields); } return table; } // Scan the data array, // looking for records that look like headers // (as opposed to numerical data). template int count_header(vector > const aoa){ int NR = aoa.size(); int hh = 0; for (; hh < NR; hh++) { if (aoa[hh].size() == 0) continue; string word = aoa[hh][0]; size_t where; where = word.find_first_not_of(" "); if (where == string::npos) continue; word = word.substr(where); try { stod(word, &where); // don't care about return value } catch (exception& ee) { continue; } word = word.substr(where); where = word.find_first_not_of(" "); if (where == string::npos) break; } return hh; } // Explicit instantiation for the version. // If you want some other version, you'll have to instantiate it. template vector readCSVRow(istream& input); template vector >readCSV(istream &in); template int count_header(vector > const aoa); #if 1 // Explicit instantiation for the version. template vector readCSVRow(istream& input); template vector >readCSV(istream &in); template int count_header(vector > const aoa); #endif