Example Application: Enhanced CSV-File Record Filter and Sorter

The csvtool example program is an extended version of the csvfilter program.

It buffers all matching lines from the csv after evaluation of the filter. This buffered table is then sorted using a "natural sort" relation and outputted to stdout. Offset and limit of the outputted region can be given.

This tool is used on the expression parser's web site for an Online CSV Filter Demo: http://idlebox.net/2007/stx-exparser/csvfilter.htt

Complete Example Source Code

// $Id: csvtool.cc 59 2007-07-17 14:43:23Z tb $

/*
 * STX Expression Parser C++ Framework v0.7
 * Copyright (C) 2007 Timo Bingmann
 *
 * This library is free software; you can redistribute it and/or modify it
 * under the terms of the GNU Lesser General Public License as published by the
 * Free Software Foundation; either version 2.1 of the License, or (at your
 * option) any later version.
 *
 * This library is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 * for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this library; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

// Enhanced CSV Parser and Filter using the Expression Parser
 
#include "ExpressionParser.h"
#include "strnatcmp.h"

#include <iostream>
#include <fstream>
#include <string>
#include <vector>
#include <map>

#include <boost/lexical_cast.hpp>

// use this as the delimiter. this can be changed to ';' or ',' if needed
const char delimiter = '\t';

// read one line from instream and split it into tab (or otherwise) delimited
// columns. returns the number of columns read, 0 if eof.
unsigned int read_csvline(std::istream &instream,
                          std::vector<std::string> &columns)
{
    columns.clear();

    // read one line from the input stream
    std::string line;
    if (!std::getline(instream, line, '\n').good()) {
        return 0;
    }

    // parse line into tab separated columns, start with inital column
    columns.push_back("");

    for (std::string::const_iterator si = line.begin();
         si != line.end(); ++si)
    {
        if (*si == delimiter)
            columns.push_back("");
        else // add non-delimiter to last column
            columns.back() += *si;
    }

    return columns.size();
}

// subclass stx::BasicSymbolTable and return variable values from the current
// csv row. the variable names are defined by the map containing the column
// header.
class CSVRowSymbolTable : public stx::BasicSymbolTable
{
public:
    // maps the column variable name to the vector index
    const std::map<std::string, unsigned int> &headersmap;

    // refernce to the reused data row vector.
    const std::vector<std::string> &datacolumns;

    CSVRowSymbolTable(const std::map<std::string, unsigned int> &_headersmap,
                      const std::vector<std::string> &_datacolumns)
        : stx::BasicSymbolTable(),
          headersmap(_headersmap),
          datacolumns(_datacolumns)
    {
    }

    virtual stx::AnyScalar lookupVariable(const std::string &varname) const
    {
        // look if the variable name is defined by the CSV file
        std::map<std::string, unsigned int>::const_iterator
            varfind = headersmap.find(varname);

        if (varfind == headersmap.end()) {
            // if not, let BasicSymbolTable check if it knows it
            return stx::BasicSymbolTable::lookupVariable(varname);
        }

        // return the variable value from the current vector. convert it to a
        // stx::AnyScalar but use the automatic type recognition for input
        // strings.
        if(varfind->second < datacolumns.size())
        {
            return stx::AnyScalar().setAutoString( datacolumns[varfind->second] );
        }
        else
        {
            return "";  // happens when a data row has too few delimited
                        // fields.
        }
    }
};

// std::sort order relation functional object
struct DataRecordSortRelation
{
    // the column index to sort by
    unsigned int sortcol;

    // sort ascending or descending
    bool        descending;

    inline DataRecordSortRelation(unsigned int _sortcol, bool _descending = 0)
        : sortcol(_sortcol), descending(_descending)
    {
    }

    // calls the strnatcasecmp on the given column's text or "" if the column
    // does not exist. sorts in "natural" sort order means numbers and text are
    // ordered correctly.
    inline bool operator()(const std::vector<std::string> &recordA,
                           const std::vector<std::string> &recordB) const
    {
        if (!descending) {
            return strnatcasecmp(sortcol < recordA.size() ? recordA[sortcol].c_str() : "",
                                 sortcol < recordB.size() ? recordB[sortcol].c_str() : "") < 0;
        }
        else {
            return strnatcasecmp(sortcol < recordA.size() ? recordA[sortcol].c_str() : "",
                                 sortcol < recordB.size() ? recordB[sortcol].c_str() : "") > 0;
        }
    }
};

// trim function from my weblog.
static inline std::string string_trim(const std::string& str)
{
    std::string::size_type pos1 = str.find_first_not_of(' ');
    if (pos1 == std::string::npos) return std::string();

    std::string::size_type pos2 = str.find_last_not_of(' ');
    if (pos2 == std::string::npos) return std::string();

    return str.substr(pos1 == std::string::npos ? 0 : pos1,
                      pos2 == std::string::npos ? (str.length() - 1) : (pos2 - pos1 + 1));
}

int main(int argc, char *argv[])
{
    // get progarm argment or reasonable defaults
    if (argc < 2) {
        std::cerr << "Usage: " << argv[0] << " <csv-filename> [filter expression] [sort-column] [offset] [limit]" << "\n";
        return 0;
    }

    std::string csvfilename = argv[1];
    std::string exprstring = (argc >= 3) ? string_trim(argv[2]) : "";
    std::string sortcolumn = (argc >= 4) ? string_trim(argv[3]) : "";
    std::string offsetstring = (argc >= 5) ? string_trim(argv[4]) : "";
    std::string limitstring = (argc >= 6) ? string_trim(argv[5]) : "";

    // parse expression into a parse tree
    stx::ParseTree pt;
    try
    {
        if (exprstring.size()) {
            pt = stx::parseExpression(exprstring);
        }
    }
    catch (stx::ExpressionParserException &e)
    {
        std::cerr << "ExpressionParserException: " << e.what() << "\n";
        return 0;
    }

    // open the CSV file stream, either stdin or the given filename
    std::ifstream csvfilestream;
    if (csvfilename != "-")
    {
        csvfilestream.open(csvfilename.c_str());
        if (!csvfilestream) {
            std::cerr << "Error opening CSV file " << csvfilename << "\n";
            return 0;
        }
    }

    std::istream& csvfile = (csvfilename == "-") ? std::cin : csvfilestream;

    // read first line of CSV input as column headers
    std::vector<std::string> headers;

    if (read_csvline(csvfile, headers) == 0) {
        std::cerr << "Error read column headers: no input\n";
        return 0;
    }

    // create a header column lookup map for CSVRowSymbolTable 
    std::map<std::string, unsigned int> headersmap;
    for(unsigned int headnum = 0; headnum < headers.size(); ++headnum)
    {
        headersmap[ headers[headnum] ] = headnum;
    }

    // iterate over the data lines of the CSV input and save matching data rows
    // into "datarecords"
    unsigned int linesprocessed = 0;
    bool addedEvalResult = false;

    std::vector<std::string> datacolumns;       // current row
    CSVRowSymbolTable csvsymboltable(headersmap, datacolumns);

    // huge table containing copied rows.
    std::vector< std::vector<std::string> > datarecords;

    while( read_csvline(csvfile, datacolumns) > 0 )
    {
        // evaluate the expression for each row using the headers/datacolumns
        // as variables
        try
        {
            linesprocessed++;
            if (!pt.isEmpty())
            {
                stx::AnyScalar val = pt.evaluate( csvsymboltable );

                if (val.isBooleanType())
                {
                    if (!val.getBoolean()) continue;
                }
                else
                {
                    // if calculation results in non-boolean value, then save
                    // that value into a column "EvalResult"
                    if (!addedEvalResult) {
                        headers.push_back("EvalResult");
                        addedEvalResult = true;
                    }

                    while( datacolumns.size() + 1 < headers.size() )
                        datacolumns.push_back("");
                    datacolumns.push_back(val.getString());
                }
            }
        }
        catch (stx::ExpressionParserException &e)
        {
            // save exception text into column "EvalResult"
            if (!addedEvalResult) {
                headers.push_back("EvalResult");
                addedEvalResult = true;
            }
            // add calculation result as last column
            while( datacolumns.size() + 1 < headers.size() )
                datacolumns.push_back("");

            datacolumns.push_back(std::string("Exception: ") + e.what());
        }

        datarecords.push_back( datacolumns );
    }

    // add "EvalResult" to headers map to allow sorting by it.
    if (addedEvalResult) {
        headersmap[ headers[headers.size() - 1] ] = headers.size() - 1;
    }

    // sort the result table if required.
    if (sortcolumn.size())
    {
        std::map<std::string, unsigned int>::const_iterator
            colfind = headersmap.find(sortcolumn);

        if (colfind != headersmap.end())
        {
            // sort ascending
            std::sort(datarecords.begin(), datarecords.end(),
                      DataRecordSortRelation(colfind->second));
        }
        else 
        {
            // the the sort column is !header it is sorted descending.
            if (sortcolumn[0] == '!')
            {
                sortcolumn.erase(0, 1);
                
                colfind = headersmap.find(sortcolumn);
                if (colfind != headersmap.end())
                {
                    std::sort(datarecords.begin(), datarecords.end(),
                              DataRecordSortRelation(colfind->second, 1));
                }
                else
                {
                    std::cerr << "Bad sort column: " << sortcolumn << " could not be found.\n";
                    return 0;
                }
            }
            else
            {
                std::cerr << "Bad sort column: " << sortcolumn << " could not be found.\n";
                return 0;
            }
        }
    }

    // determine offset and limit of the outputted data rows.

    unsigned int offset = 0;
    unsigned int limit = datarecords.size();

    if (offsetstring.size())
    {
        try {
            offset = boost::lexical_cast<unsigned int>(offsetstring);
        }
        catch (boost::bad_lexical_cast &e) {
            std::cerr << "Bad number in offset: not an integer.\n";
            return 0;
        }
    }

    if (limitstring.size())
    {
        try {
            limit = boost::lexical_cast<unsigned int>(limitstring);
        }
        catch (boost::bad_lexical_cast &e) {
            std::cerr << "Bad number in limit: not an integer.\n";
            return 0;
        }
    }

    // write a processing summary to stderr
    std::cerr << "Processed " << linesprocessed << " lines, "
              << "copied " << datarecords.size() << " and "
              << "skipped " << (linesprocessed - datarecords.size()) << " lines" << "\n";


    // write column headers to stdout
    for(std::vector<std::string>::const_iterator coliter = headers.begin();
        coliter != headers.end(); ++coliter)
    {
        if (coliter != headers.begin()) std::cout << delimiter;
        std::cout << *coliter;
    }
    std::cout << "\n";

    // output data rows from "offset" to "offset+limit"

    for(unsigned int current = offset;
        current < offset + limit && current < datarecords.size();
        ++current)
    {
        std::vector<std::string> &currrecord = datarecords[current];

        // output this data row to std::cout
        for(std::vector<std::string>::const_iterator coliter = currrecord.begin();
            coliter != currrecord.end(); ++coliter)
        {
            if (coliter != currrecord.begin()) std::cout << delimiter;
            std::cout << *coliter;
        }
        std::cout << "\n";
    }
}

Generated on Tue Jul 17 16:51:58 2007 for STX Expression Parser by  doxygen 1.5.2