views:

1111

answers:

7

I have data that looks like this:

AAA 0.3 1.00 foo chr1,100
AAC 0.1 2.00 bar chr2,33
AAT 3.3 2.11     chr3,45
AAG 1.3 3.11 qux chr1,88
ACA 2.3 1.33     chr8,13
ACT 2.3 7.00 bux chr5,122

Note that the lines above are tab-separated. Moreover, a line may sometimes contain 5 fields and sometimes only 4.

What I want to do is capture the 4th field in a variable as "" if it doesn't contain any value.

I have the following code, but somehow it reads the 5th field as the 4th field when the 4th is empty.

What's the correct way to do it?

#include <iostream>
#include <vector>
#include <fstream>
#include <sstream>
using namespace std;

// Opens the file named by the first command-line argument, reads it line by
// line and prints the value extracted into Field4 for every line.
// NOTE(review): arg_vec[1] is used without checking arg_count.
int main  ( int arg_count, char *arg_vec[] ) {
    string line;
    ifstream myfile (arg_vec[1]);

    if (myfile.is_open())
    {
        while (getline(myfile,line) )
        {
            stringstream ss(line);    
            string Tag;  
            double Val1;
            double Val2;
            // NOTE(review): Field4 and Field5 are declared as double although
            // the sample rows carry text ("foo", "chr1,100") in these
            // columns - confirm the intended types.
            double Field4;
            double Field5;

            // operator>> skips any run of whitespace, tabs included, so an
            // empty column between two tabs is not seen as a field of its
            // own; the next non-blank token is consumed instead.
            ss >> Tag >> Val1 >> Val2 >> Field4 >> Field5;
            cout << Field4 << endl;
            //cout << Tag << "," << Val1 << "," << Val2 << "," << Field4 << "," << Field5 << endl;

        }
        myfile.close();
    }
    else { cout << "Unable to open file"; }
    return 0;
}
+6  A: 

Tokenize the line into a vector of strings and then do conversion to an appropriate data type depending on the number of tokens.

If you can use Boost.Spirit, this reduces to a simple problem of defining an appropriate grammar.

dirkgently
Why was this downvoted? It's the correct answer, as far as I can see.
Niki
You can use boost::tokenizer for option 1.
David Rodríguez - dribeas
@dribeas: Good point. I specifically wanted to mention Spirit since I haven't seen this one much here on SO :)
dirkgently
Tokenizer skips empty columns. Do you know how to handle this?
Mykola Golubyev
boost::keep_empty_tokens
tstenner
@tstenner: thanks
Mykola Golubyev
+4  A: 

If you want to give Boost.Spirit a try, start with this. It compiles, and I have tested it a bit. It seems to work fine.

#include <iostream>
#include <vector>
#include <fstream>
#include <sstream>
#include <list>
#include <boost/spirit/core.hpp>
#include <boost/spirit/actor/assign_actor.hpp>

using namespace std;
using namespace boost::spirit;

// One record of the tab-separated input: a tag, two numeric values and two
// free-text columns.
struct OneLine
{
        // The numeric members start at zero so that a record whose parse did
        // not assign them never carries indeterminate values when it is copied.
        OneLine() : val1(0.0), val2(0.0) {}

        std::string tag;     // first column
        double val1;         // second column
        double val2;         // third column
        std::string field4;  // fourth column, may be empty
        std::string field5;  // fifth column
};

int main  ( int arg_count, char *arg_vec[] ) {
    string line;
    ifstream myfile (arg_vec[1]);
    list<OneLine> myList;

    if (myfile.is_open())
    {
        while (getline(myfile,line) )
        {
                OneLine result;
                rule<> good_p(alnum_p|punct_p);
                parse( line.c_str(),
                    (*good_p)[assign_a(result.tag)] >> ch_p('\t') >>
                    real_p[assign_a(result.val1)] >> ch_p('\t') >>
                    real_p[assign_a(result.val2)] >> ch_p('\t') >>
                    (*good_p)[assign_a(result.field4)] >> ch_p('\t') >>
                    (*good_p)[assign_a(result.field5)],
                    ch_p(";") );

                myList.push_back( result );
        }
        myfile.close();
    }
    else { cout << "Unable to open file"; }
    return 0;
}
Benoît
+1 Thanks for the boost spirit example.
Tom Leys
+1  A: 

The simplest thing is just to use two calls to fscanf, scanf or sscanf like so:

// Parse one line with sscanf and branch on how many conversions succeeded.
// NOTE(review): str1, str2, float1 and float2 are not declared in this
// fragment. "%s" needs a writable char buffer (not a std::string) and "%f"
// needs a float* ("%lf" for a double) - TODO confirm the declarations match.
// NOTE(review): a five-field line also satisfies the four-conversion format
// below, so it looks like the five-field attempt should be tested first -
// confirm.
std::string line = /* some line */;
if(sscanf(line.c_str(), "%s %f %f %s", &str1, &float1, &float2, &str2) == 4){
    // 4 parameters
}else if(sscanf(line.c_str(), ...) == 5){
    // 5 parameters
}

Using boost::Spirit seems like overkill, though this isn't the most C++-ish way of doing things.

Jasper Bekkers
The user wants to read into a std::string - you can't do that with scanf()
anon
You can with sscanf, which takes a const char *
Jasper Bekkers
Updated the example to show that.
Jasper Bekkers
I said READ INTO, not READ FROM!
anon
what a mix of C and C++. why not stringstream or something?
Mykola Golubyev
@Neil: Although it's possible to convert the data into std::string after parsing, the OP doesn't mention it.
Jasper Bekkers
yes, he does - "string Tag" is the first thing he wants to read into
anon
@Neil: You're right, I'm still leaving this answer in because of its simplicity. Converting the char* to std::string is trivial.
Jasper Bekkers
+2  A: 

With boost:

int main()
{
    std::ifstream in("parsefile.in");

    if (!in)
        return 1;

    typedef std::istreambuf_iterator<char> InputIterator;
    typedef boost::char_separator<char> Separator;
    typedef boost::tokenizer< Separator, InputIterator > Tokenizer;

    Tokenizer tokens(InputIterator(in),
                     InputIterator(),
                     Separator(",\t\n", "", boost::keep_empty_tokens));

    const std::size_t columnsCount = 6;
    std::size_t columnNumber = 1;
    for(Tokenizer::iterator it = tokens.begin(); 
        it != tokens.end(); 
        ++it)
    {
        const std::string value = *it;

        if ( 2 == columnNumber )
        {
            const double d = convertToDouble(value);
        }

        std::cout << std::setw(10) << value << "|";

        if ( columnsCount == columnNumber )
        {
            std::cout << std::endl;
            columnNumber = 1;
        }
        else
        {
            ++columnNumber;
        }
    }

    return 0;
}

Without boost:

// Reads "parsefile.in" one line at a time, splits each line into five
// tab-separated columns (empty columns are kept) and prints the fourth one.
// Returns 1 when the file cannot be opened, 0 otherwise.
int main()
{
    std::ifstream in("parsefile.in");

    if (!in)
        return 1;

    const std::size_t columnNumber = 5;
    std::string line;

    // Looping on getline itself stops as soon as no further line can be read,
    // so no extra, empty row is printed after the last line.
    while (std::getline(in, line))
    {
        std::vector< std::string > columns(columnNumber);

        // Splitting inside the line keeps a row with fewer tabs from pulling
        // in text that belongs to the following row.
        std::size_t begin = 0;
        for (std::size_t i = 0; i < columnNumber; ++i)
        {
            // The last column takes the remainder of the line, tabs included.
            const std::size_t end = (i + 1 == columnNumber)
                                        ? std::string::npos
                                        : line.find('\t', begin);
            if (end == std::string::npos)
            {
                columns[i] = line.substr(begin);
                break;  // nothing left; remaining columns stay empty
            }
            columns[i] = line.substr(begin, end - begin);
            begin = end + 1;
        }

        std::cout << columns[3] << std::endl;
    }

    return 0;
}

To convert string value to double you can use the following.

// Converts the leading numeric text of `value` to a double.
// Returns 0.0 when `value` is empty or does not start with a number.
double convertToDouble( const std::string& value )
{
    std::stringstream os;
    os << value;
    // Start from a defined value: for empty input the extraction below never
    // writes to `result`.
    double result = 0.0;
    os >> result;
    return result;
}
Mykola Golubyev
+3  A: 

Another C++ only version that just uses the fact that istream must set the failbit if operator>> fails to parse.

// For every line: try to read five fields; when that fails, read the line
// again as four fields and give the missing one (v3) a default value.
while(getline(ss, line))
{
 stringstream sl(line);

 sl >> tag >> v1 >> v2 >> v3 >> v4;

 // Running out of input sets eofbit together with failbit, so the state must
 // be tested as a flag rather than compared for equality with failbit.
 // bad() is excluded to react to parse failures only.
 if(sl.fail() && !sl.bad()) // failed to parse 5 arguments?
 {
  sl.clear();
  sl.seekg(0, ios::beg); // rewind: offset 0 relative to the beginning
  sl >> tag >> v1 >> v2 >> v4; // do it again with 4
  v3 = "EMPTY"; // just a default value
 }


 cout << "tag: " << tag <<std::endl
  << "v1: " << v1 << std::endl
  << "v2: " << v2 << std::endl
  << "v3: " << v3 << std::endl
  << "v4: " << v4 << std::endl << std::endl;
}
Jasper Bekkers
It is hard to change if two of them can be zero.
Mykola Golubyev
@Mykola: Correct, I'm still trying to figure out how to cleanly create some sort of stream manipulator that allows for optional parameters.
Jasper Bekkers
+1  A: 

Yet another version - I think this is the one that involves the least typing!

#include <iostream>
#include <sstream>
#include <string>
using namespace std;

// Reads lines from standard input and prints five fields per line. A line is
// first read as string, number, number, string, number; when that fails it is
// read again without the fourth field, which is then reported as "*".
int main() {

    string f1, f4;
    // Defined start values, so a line that cannot be parsed at all does not
    // print indeterminate numbers.
    // NOTE(review): f5 is a double, while the sample data in the question has
    // text ("chr1,100") in the fifth column - confirm the intended type.
    double f2 = 0.0, f3 = 0.0, f5 = 0.0;

    string line;
    istringstream is;

    while( getline( cin, line ) ) {

     // str() swaps the buffer but keeps the state flags, so the eof/fail bits
     // left by the previous line have to be cleared explicitly.
     is.clear();
     is.str( line );

     if ( ! (is >> f1 >> f2 >> f3 >> f4 >> f5) ) {
      // Same here: without clear() the second attempt would fail at once.
      is.clear();
      is.str( line);
      f4 = "*";
      is >> f1 >> f2 >> f3 >> f5;
     }

     cout << f1 << " " << f2 << " " << f3 << " " << f4 << " " << f5 << endl;
    }
}
anon
operator!() also fails on badbit, which you might not want :-). Other than that, it's pretty similar to my previous solution.
Jasper Bekkers
+1  A: 

One more generic solution to read and handle any text-based table. This solution uses Boost.

// Callback invoked once per table cell with (row, column, cell text).
typedef boost::function< void (int, int, const std::string&) > RecordHandler;

// Reads the table stored in `fileName`, splitting it on the characters of
// `delimiter`, and passes every cell to `handler`.
void readTableFromFile( const std::string& fileName,
                        const std::string& delimiter,
                        RecordHandler handler );

// Writes one table cell to standard output as "[ row, col] value",
// without a trailing newline.
void handler(int row, int col, const std::string& value)
{
    std::ostream& out = std::cout;
    out << "[ " << row << ", " << col << "] ";
    out << value;
}

// Feeds every cell of "parsefile.in" (split on tab and comma) to handler.
int main()
{
    const std::string fileName = "parsefile.in";
    const std::string delimiters = "\t,";

    readTableFromFile(fileName, delimiters, handler);

    return 0;
}

And the Implementation

std::size_t columnsCountInTheFile( const std::string& fileName,
                                   const std::string& delimiter )
{
    typedef boost::char_separator<char> Separator;
    typedef boost::tokenizer< Separator > Tokenizer;

    std::ifstream in(fileName.c_str());

    std::string line;
    std::getline(in, line);

    Tokenizer t(line,
                Separator(delimiter.c_str(), "", boost::keep_empty_tokens));

    return std::distance(t.begin(), t.end());
}

void readTableFromFile( const std::string& fileName,
                        const std::string& delimiter,
                        RecordHandler handler );
{
    std::ifstream in(fileName.c_str());

    if (!in)
        throw std::runtime_error("can't read from " + fileName);

    typedef std::istreambuf_iterator<char> InputIterator;
    typedef boost::char_separator<char> Separator;
    typedef boost::tokenizer< Separator, InputIterator > Tokenizer;

    Tokenizer tokens(InputIterator(in),
                     InputIterator(),
                     Separator((delimiter + "\n").c_str(), "", boost::keep_empty_tokens));

    const std::size_t columnsCount = columnsCountInTheFile(fileName, delimiter);

    std::size_t columnNumber = 1;
    std::size_t rowNumber = 1;
    for(Tokenizer::iterator it = tokens.begin(); 
        it != tokens.end(); 
        ++it)
    {
        handler(rowNumber, columnNumber, *it);

        if ( columnsCount == columnNumber )
        {
            columnNumber = 1;
            ++rowNumber;
        }
        else
        {
            ++columnNumber;
        }
    }
}
Mykola Golubyev