It buffers all matching lines from the csv after evaluation of the filter. This buffered table is then sorted using a "natural sort" relation and outputted to stdout. Offset and limit of the outputted region can be given.
This tool is used on the expression parser's web site for an Online CSV Filter Demo: http://idlebox.net/2007/stx-exparser/csvfilter.htt
// $Id: csvtool.cc 59 2007-07-17 14:43:23Z tb $ /* * STX Expression Parser C++ Framework v0.7 * Copyright (C) 2007 Timo Bingmann * * This library is free software; you can redistribute it and/or modify it * under the terms of the GNU Lesser General Public License as published by the * Free Software Foundation; either version 2.1 of the License, or (at your * option) any later version. * * This library is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this library; if not, write to the Free Software Foundation, * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ // Enhanced CSV Parser and Filter using the Expression Parser #include "ExpressionParser.h" #include "strnatcmp.h" #include <iostream> #include <fstream> #include <string> #include <vector> #include <map> #include <boost/lexical_cast.hpp> // use this as the delimiter. this can be changed to ';' or ',' if needed const char delimiter = '\t'; // read one line from instream and split it into tab (or otherwise) delimited // columns. returns the number of columns read, 0 if eof. unsigned int read_csvline(std::istream &instream, std::vector<std::string> &columns) { columns.clear(); // read one line from the input stream std::string line; if (!std::getline(instream, line, '\n').good()) { return 0; } // parse line into tab separated columns, start with inital column columns.push_back(""); for (std::string::const_iterator si = line.begin(); si != line.end(); ++si) { if (*si == delimiter) columns.push_back(""); else // add non-delimiter to last column columns.back() += *si; } return columns.size(); } // subclass stx::BasicSymbolTable and return variable values from the current // csv row. the variable names are defined by the map containing the column // header. class CSVRowSymbolTable : public stx::BasicSymbolTable { public: // maps the column variable name to the vector index const std::map<std::string, unsigned int> &headersmap; // refernce to the reused data row vector. const std::vector<std::string> &datacolumns; CSVRowSymbolTable(const std::map<std::string, unsigned int> &_headersmap, const std::vector<std::string> &_datacolumns) : stx::BasicSymbolTable(), headersmap(_headersmap), datacolumns(_datacolumns) { } virtual stx::AnyScalar lookupVariable(const std::string &varname) const { // look if the variable name is defined by the CSV file std::map<std::string, unsigned int>::const_iterator varfind = headersmap.find(varname); if (varfind == headersmap.end()) { // if not, let BasicSymbolTable check if it knows it return stx::BasicSymbolTable::lookupVariable(varname); } // return the variable value from the current vector. convert it to a // stx::AnyScalar but use the automatic type recognition for input // strings. if(varfind->second < datacolumns.size()) { return stx::AnyScalar().setAutoString( datacolumns[varfind->second] ); } else { return ""; // happens when a data row has too few delimited // fields. } } }; // std::sort order relation functional object struct DataRecordSortRelation { // the column index to sort by unsigned int sortcol; // sort ascending or descending bool descending; inline DataRecordSortRelation(unsigned int _sortcol, bool _descending = 0) : sortcol(_sortcol), descending(_descending) { } // calls the strnatcasecmp on the given column's text or "" if the column // does not exist. sorts in "natural" sort order means numbers and text are // ordered correctly. inline bool operator()(const std::vector<std::string> &recordA, const std::vector<std::string> &recordB) const { if (!descending) { return strnatcasecmp(sortcol < recordA.size() ? recordA[sortcol].c_str() : "", sortcol < recordB.size() ? recordB[sortcol].c_str() : "") < 0; } else { return strnatcasecmp(sortcol < recordA.size() ? recordA[sortcol].c_str() : "", sortcol < recordB.size() ? recordB[sortcol].c_str() : "") > 0; } } }; // trim function from my weblog. static inline std::string string_trim(const std::string& str) { std::string::size_type pos1 = str.find_first_not_of(' '); if (pos1 == std::string::npos) return std::string(); std::string::size_type pos2 = str.find_last_not_of(' '); if (pos2 == std::string::npos) return std::string(); return str.substr(pos1 == std::string::npos ? 0 : pos1, pos2 == std::string::npos ? (str.length() - 1) : (pos2 - pos1 + 1)); } int main(int argc, char *argv[]) { // get progarm argment or reasonable defaults if (argc < 2) { std::cerr << "Usage: " << argv[0] << " <csv-filename> [filter expression] [sort-column] [offset] [limit]" << "\n"; return 0; } std::string csvfilename = argv[1]; std::string exprstring = (argc >= 3) ? string_trim(argv[2]) : ""; std::string sortcolumn = (argc >= 4) ? string_trim(argv[3]) : ""; std::string offsetstring = (argc >= 5) ? string_trim(argv[4]) : ""; std::string limitstring = (argc >= 6) ? string_trim(argv[5]) : ""; // parse expression into a parse tree stx::ParseTree pt; try { if (exprstring.size()) { pt = stx::parseExpression(exprstring); } } catch (stx::ExpressionParserException &e) { std::cerr << "ExpressionParserException: " << e.what() << "\n"; return 0; } // open the CSV file stream, either stdin or the given filename std::ifstream csvfilestream; if (csvfilename != "-") { csvfilestream.open(csvfilename.c_str()); if (!csvfilestream) { std::cerr << "Error opening CSV file " << csvfilename << "\n"; return 0; } } std::istream& csvfile = (csvfilename == "-") ? std::cin : csvfilestream; // read first line of CSV input as column headers std::vector<std::string> headers; if (read_csvline(csvfile, headers) == 0) { std::cerr << "Error read column headers: no input\n"; return 0; } // create a header column lookup map for CSVRowSymbolTable std::map<std::string, unsigned int> headersmap; for(unsigned int headnum = 0; headnum < headers.size(); ++headnum) { headersmap[ headers[headnum] ] = headnum; } // iterate over the data lines of the CSV input and save matching data rows // into "datarecords" unsigned int linesprocessed = 0; bool addedEvalResult = false; std::vector<std::string> datacolumns; // current row CSVRowSymbolTable csvsymboltable(headersmap, datacolumns); // huge table containing copied rows. std::vector< std::vector<std::string> > datarecords; while( read_csvline(csvfile, datacolumns) > 0 ) { // evaluate the expression for each row using the headers/datacolumns // as variables try { linesprocessed++; if (!pt.isEmpty()) { stx::AnyScalar val = pt.evaluate( csvsymboltable ); if (val.isBooleanType()) { if (!val.getBoolean()) continue; } else { // if calculation results in non-boolean value, then save // that value into a column "EvalResult" if (!addedEvalResult) { headers.push_back("EvalResult"); addedEvalResult = true; } while( datacolumns.size() + 1 < headers.size() ) datacolumns.push_back(""); datacolumns.push_back(val.getString()); } } } catch (stx::ExpressionParserException &e) { // save exception text into column "EvalResult" if (!addedEvalResult) { headers.push_back("EvalResult"); addedEvalResult = true; } // add calculation result as last column while( datacolumns.size() + 1 < headers.size() ) datacolumns.push_back(""); datacolumns.push_back(std::string("Exception: ") + e.what()); } datarecords.push_back( datacolumns ); } // add "EvalResult" to headers map to allow sorting by it. if (addedEvalResult) { headersmap[ headers[headers.size() - 1] ] = headers.size() - 1; } // sort the result table if required. if (sortcolumn.size()) { std::map<std::string, unsigned int>::const_iterator colfind = headersmap.find(sortcolumn); if (colfind != headersmap.end()) { // sort ascending std::sort(datarecords.begin(), datarecords.end(), DataRecordSortRelation(colfind->second)); } else { // the the sort column is !header it is sorted descending. if (sortcolumn[0] == '!') { sortcolumn.erase(0, 1); colfind = headersmap.find(sortcolumn); if (colfind != headersmap.end()) { std::sort(datarecords.begin(), datarecords.end(), DataRecordSortRelation(colfind->second, 1)); } else { std::cerr << "Bad sort column: " << sortcolumn << " could not be found.\n"; return 0; } } else { std::cerr << "Bad sort column: " << sortcolumn << " could not be found.\n"; return 0; } } } // determine offset and limit of the outputted data rows. unsigned int offset = 0; unsigned int limit = datarecords.size(); if (offsetstring.size()) { try { offset = boost::lexical_cast<unsigned int>(offsetstring); } catch (boost::bad_lexical_cast &e) { std::cerr << "Bad number in offset: not an integer.\n"; return 0; } } if (limitstring.size()) { try { limit = boost::lexical_cast<unsigned int>(limitstring); } catch (boost::bad_lexical_cast &e) { std::cerr << "Bad number in limit: not an integer.\n"; return 0; } } // write a processing summary to stderr std::cerr << "Processed " << linesprocessed << " lines, " << "copied " << datarecords.size() << " and " << "skipped " << (linesprocessed - datarecords.size()) << " lines" << "\n"; // write column headers to stdout for(std::vector<std::string>::const_iterator coliter = headers.begin(); coliter != headers.end(); ++coliter) { if (coliter != headers.begin()) std::cout << delimiter; std::cout << *coliter; } std::cout << "\n"; // output data rows from "offset" to "offset+limit" for(unsigned int current = offset; current < offset + limit && current < datarecords.size(); ++current) { std::vector<std::string> &currrecord = datarecords[current]; // output this data row to std::cout for(std::vector<std::string>::const_iterator coliter = currrecord.begin(); coliter != currrecord.end(); ++coliter) { if (coliter != currrecord.begin()) std::cout << delimiter; std::cout << *coliter; } std::cout << "\n"; } }