From 74ddd0381aa1b1a90eb0d5300fa576cb2348eeac Mon Sep 17 00:00:00 2001 From: John Denker Date: Sun, 17 Oct 2021 10:10:18 -0700 Subject: basically functional, but still a work in progress --- parse_csv.c | 184 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 184 insertions(+) create mode 100644 parse_csv.c (limited to 'parse_csv.c') diff --git a/parse_csv.c b/parse_csv.c new file mode 100644 index 0000000..12e98f3 --- /dev/null +++ b/parse_csv.c @@ -0,0 +1,184 @@ +#include +#include +#include +#include +#include "parse_csv.h" +using namespace std; + +/************* + +Format defined: + https://tools.ietf.org/html/rfc4180 +See also: + https://en.wikipedia.org/wiki/Comma-separated_values + +We depart from the RFC by ignoring \r, so that lines can +end with either \r\n (dos style) or simply \n (unix style). + +We check for malformed quoted fields and throw exceptions. + +The last line of the input .csv file may or may not +contain a newline: +:; echo -en "xxx\n" | ./parse_csv_demo - | od -b -Anone + 170 170 170 012 +:; echo -en "xxx" | ./parse_csv_demo - | od -b -Anone + 170 170 170 012 + +A line containing nothing but the newline has one field +with a zero-length datum: +:; echo -en "\n" | ./parse_csv_demo - | od -b -Anone + 012 + +A line containing no data /and/ no newline isn't a line +at all. No line, no fields, no data: +:; echo -en "" | ./parse_csv_demo - | wc + 0 0 0 + +************/ + +string const Q("\""); + +enum class CSVState { + Start, + UnquotedField, + QuotedField, + QuoteInQuote +}; + +// Read one row of .csv table. +// Return a zero-length vector if we encounter EoF +// before seeing any data. +// Note that a table row can extend across multiple .csv file lines, +// if there are embedded newlines. +// +// Within quoted fields, we accept commas, quotes, and newlines, +// e.g "Hughie, ""Louie"", Dewey". +template +vector readCSVRow(istream& input) { + using namespace std; + if (input.eof()) return vector(0); + CSVState state = CSVState::Start; + vector fields(0); + char c; + for (;;) { + input.read(&c, 1); + if (input.eof()) { + if (state == CSVState::QuotedField) + throw runtime_error("Unterminated quote in .csv file in field# " + + to_string(fields.size()) + + "\n... namely <" + + Q + fields.back() + ">"); + break; + } + if (!input.good()) throw runtime_error("input error in .csv file"); + +// if we made it this far, the line contains at least one datum: + if (fields.size() == 0) fields.push_back(datum()); + switch (state) { + case CSVState::Start: + switch (c) { + case ',': // zero-length field + fields.push_back(datum()); + break; + case '"': + state = CSVState::QuotedField; +// This is no-op if the datum is not of type qstring: + set_q(fields.back(), 1); + break; + case '\r': + break; + case '\n': + goto EndRecord; + default: + state = CSVState::UnquotedField; + fields.back().push_back(c); + break; + } + break; + case CSVState::UnquotedField: + switch (c) { + case ',': // start a new field: + state = CSVState::Start; + fields.push_back(datum()); + break; + case '"': throw runtime_error + ("Stray quote in .csv file in field# " + + to_string(fields.size()) + + "\n... namely <" + + fields.back() + Q + ">" + ); + break; + case '\r': break; + case '\n': goto EndRecord; + default: fields.back().push_back(c); + break; + } + break; + case CSVState::QuotedField: + switch (c) { + case '"': state = CSVState::QuoteInQuote; + break; +// Accept anything except quote here; +// that includes comma, \r, and \n. + default: fields.back().push_back(c); + break; + } + break; +// Previous state saw a quote; we consider what follows: + case CSVState::QuoteInQuote: + switch (c) { + case ',': // comma after closing quote + state = CSVState::Start; + fields.push_back(datum()); + break; + case '"': // double ", map to plain " + fields.back().push_back('"'); + state = CSVState::QuotedField; + break; + case '\r': break; + case '\n': goto EndRecord; + default: throw runtime_error + ("Extraneous verbiage in .csv file after quoted field# " + + to_string(fields.size()) + + "\n... namely <" + + Q + fields.back() + Q + + string(1,c) +">"); + break; + } + break; + } + } + EndRecord:;;;; + return fields; +} + +// Read entire CSV file. +// Within quoted fields, we accept commas, quotes, and newlines, +// e.g "Hughie, ""Louie"", Dewey". +template +vector > readCSV(istream &in) { + using namespace std; + vector > table; + datum row; + while (!in.eof()) { + auto fields = readCSVRow(in); + if (fields.size() == 0) break; + table.push_back(fields); + } + return table; +} + +// Explicit instantiation for the version. +// If you want some other version, you'll have to instantiate it. +template vector readCSVRow(istream& input); +template vector >readCSV(istream &in); +// Apparently it's not necessary to mention get_q() here, +// but probably more portable to do it anyway: +////template int get_q(qstring const&); + +#if 1 +// Explicit instantiation for the version. +template vector readCSVRow(istream& input); +template vector >readCSV(istream &in); +/////template int get_q(string const&); +#endif -- cgit v1.2.3