aboutsummaryrefslogtreecommitdiff
path: root/parse_csv.c
diff options
context:
space:
mode:
authorJohn Denker <jsd@av8n.com>2021-10-17 10:10:18 -0700
committerJohn Denker <jsd@av8n.com>2021-10-17 11:09:24 -0700
commit74ddd0381aa1b1a90eb0d5300fa576cb2348eeac (patch)
tree72a9dded6f800467d52e479eb37574e6de5f2e6c /parse_csv.c
parent634d365a03cb0581a062cd3cf4db9ae69f1cde26 (diff)
basically functional, but still a work in progress
Diffstat (limited to 'parse_csv.c')
-rw-r--r--parse_csv.c184
1 files changed, 184 insertions, 0 deletions
diff --git a/parse_csv.c b/parse_csv.c
new file mode 100644
index 0000000..12e98f3
--- /dev/null
+++ b/parse_csv.c
@@ -0,0 +1,184 @@
+#include <istream>
+#include <stdexcept>
+#include <string>
+#include <utility>
+#include <vector>
+#include "parse_csv.h"
+using namespace std;
+
+/*************
+
+Format defined:
+ https://tools.ietf.org/html/rfc4180
+See also:
+ https://en.wikipedia.org/wiki/Comma-separated_values
+
+We depart from the RFC by ignoring \r outside of quoted fields, so
+that lines can end with either \r\n (dos style) or simply \n (unix
+style).  Inside a quoted field, \r is kept as data.
+
+We check for malformed quoted fields and throw exceptions.
+
+The last line of the input .csv file may or may not
+end with a newline:
+:; echo -en "xxx\n" | ./parse_csv_demo - | od -b -Anone
+ 170 170 170 012
+:; echo -en "xxx" | ./parse_csv_demo - | od -b -Anone
+ 170 170 170 012
+
+A line containing nothing but the newline has one field
+with a zero-length datum:
+:; echo -en "\n" | ./parse_csv_demo - | od -b -Anone
+ 012
+
+A line containing no data /and/ no newline isn't a line
+at all. No line, no fields, no data:
+:; echo -en "" | ./parse_csv_demo - | wc
+ 0 0 0
+
+************/
+
+// A one-character string holding a double quote.  It is spliced into
+// the error messages built by readCSVRow() so that the offending field
+// is shown with its quote marks.
+string const Q("\"");
+
+// Parser states used by readCSVRow() while scanning one row.
+enum class CSVState {
+ Start,          // at the beginning of a field
+ UnquotedField,  // inside a field that did not begin with a quote
+ QuotedField,    // inside a quoted field, after its opening quote
+ QuoteInQuote    // just saw a quote while inside a quoted field:
+                 // either the closing quote or the first half of ""
+};
+
+// Parse a single table row from a .csv stream.
+//
+// Characters are consumed one at a time and fed through a four-state
+// machine (see CSVState).  The row ends at a newline seen outside of
+// a quoted field, or at end of input.
+//
+// Returns the fields of the row.  The result is a zero-length vector
+// if end of input is reached before any character has been read.
+// Because quoted fields may contain embedded newlines, one table row
+// can extend across several lines of the .csv file.
+//
+// Inside a quoted field we accept commas, newlines, \r, and doubled
+// quotes, e.g. "Hughie, ""Louie"", Dewey".
+// Outside of quoted fields, \r is skipped.
+//
+// Throws runtime_error when:
+//   -- end of input arrives inside a quoted field;
+//   -- a quote appears in the middle of an unquoted field;
+//   -- a closing quote is followed by anything other than
+//      comma, quote, \r, or \n;
+//   -- the stream fails for a reason other than end of input.
+template <class datum = qstring>
+vector<datum> readCSVRow(istream& input) {
+  using namespace std;
+  vector<datum> row;
+  if (input.eof()) return row;
+
+  CSVState mode = CSVState::Start;
+  char ch;
+  while (true) {
+    input.read(&ch, 1);
+    if (input.eof()) {
+      // End of input is fine anywhere except inside an open quote.
+      if (mode == CSVState::QuotedField) {
+        throw runtime_error("Unterminated quote in .csv file in field# "
+                            + to_string(row.size())
+                            + "\n... namely <"
+                            + Q + row.back() + ">");
+      }
+      return row;
+    }
+    if (!input.good()) throw runtime_error("input error in .csv file");
+
+    // A character was read, so the row holds at least one datum.
+    if (row.empty()) row.push_back(datum());
+
+    if (mode == CSVState::Start) {
+      if (ch == ',') {
+        // zero-length field
+        row.push_back(datum());
+      } else if (ch == '"') {
+        mode = CSVState::QuotedField;
+        // This is no-op if the datum is not of type qstring:
+        set_q(row.back(), 1);
+      } else if (ch == '\n') {
+        return row;
+      } else if (ch != '\r') {
+        // First ordinary character of an unquoted field.
+        mode = CSVState::UnquotedField;
+        row.back().push_back(ch);
+      }
+    } else if (mode == CSVState::UnquotedField) {
+      if (ch == ',') {
+        // start a new field:
+        mode = CSVState::Start;
+        row.push_back(datum());
+      } else if (ch == '"') {
+        throw runtime_error("Stray quote in .csv file in field# "
+                            + to_string(row.size())
+                            + "\n... namely <"
+                            + row.back() + Q + ">");
+      } else if (ch == '\n') {
+        return row;
+      } else if (ch != '\r') {
+        row.back().push_back(ch);
+      }
+    } else if (mode == CSVState::QuotedField) {
+      // Accept anything except quote here;
+      // that includes comma, \r, and \n.
+      if (ch == '"') {
+        mode = CSVState::QuoteInQuote;
+      } else {
+        row.back().push_back(ch);
+      }
+    } else {
+      // CSVState::QuoteInQuote -- the previous character was a quote
+      // inside a quoted field; what follows decides its meaning.
+      if (ch == ',') {
+        // comma after closing quote
+        mode = CSVState::Start;
+        row.push_back(datum());
+      } else if (ch == '"') {
+        // double ", map to plain "
+        row.back().push_back('"');
+        mode = CSVState::QuotedField;
+      } else if (ch == '\n') {
+        return row;
+      } else if (ch != '\r') {
+        throw runtime_error("Extraneous verbiage in .csv file after quoted field# "
+                            + to_string(row.size())
+                            + "\n... namely <"
+                            + Q + row.back() + Q
+                            + string(1,ch) +">");
+      }
+    }
+  }
+}
+
+// Read an entire CSV stream into a table (a vector of rows).
+//
+// Rows are fetched one at a time with readCSVRow<datum>() until the
+// stream reports end of file or readCSVRow returns a zero-length row,
+// which is how it signals that no more data is available.
+// Exceptions thrown by readCSVRow are not caught here.
+//
+// Within quoted fields, we accept commas, quotes, and newlines,
+// e.g. "Hughie, ""Louie"", Dewey".
+template <class datum = qstring>
+vector<vector<datum> > readCSV(istream &in) {
+  using namespace std;
+  vector<vector<datum> > table;
+  while (!in.eof()) {
+    auto fields = readCSVRow<datum>(in);
+    if (fields.empty()) break;
+    // The row is not used again, so hand its storage to the table
+    // instead of copying every field.
+    table.push_back(std::move(fields));
+  }
+  return table;
+}
+
+// Explicit instantiation for the <qstring> version.
+// The template definitions live in this file, so any other datum
+// type needs its own explicit instantiation here as well.
+template vector<qstring> readCSVRow<qstring>(istream& input);
+template vector<vector<qstring> >readCSV<qstring>(istream &in);
+// Apparently it's not necessary to mention get_q() here.
+// It would probably be more portable to do it anyway, but the
+// instantiation below is currently commented out:
+////template int get_q<qstring>(qstring const&);
+
+#if 1
+// Explicit instantiation for the <string> version.
+template vector<string> readCSVRow<string>(istream& input);
+template vector<vector<string> >readCSV<string>(istream &in);
+// Matching get_q() instantiation, likewise commented out:
+/////template int get_q<string>(string const&);
+#endif