After pointing you at boost::flyweight
in another answer,
I wanted to take a closer look at the relative efficiency
of containers of strings, flyweights and the "nuclear option"
of unioning four chars with a pointer (class "sillystring"
in the code below).
Notes on the code:
- Works on 32-bit Debian/squeeze with g++ 4.3.3 and boost 1.38.0
- Uses
std::deque
instead of std::vector
because vector's
size-doubling behaviour (c.f deques' chunks) gives an
(arguably) misleading impression of inefficiency if the
test case just happened to have doubled recently.
- The "sillystring" uses the LSB of the pointer to
distinguish the pointer use-case from the local chars
case. This should work unless your malloc allocates
on odd-byte boundaries (in which case the code will throw)
(certainly haven't seen this on my system; YMMV).
Before anyone feels the need to point it out, yes, I do
consider this horrible dangerous hacky code and not an
option to be chosen lightly.
Results vary depending on the distribution of word lengths,
but for a distribution "(2D6+1)/2" (so peaked at 4, but with
lengths from 1 to 6), the efficiencies (defined as the ratio
between actual memory consumption and the actual number of
characters needing to be stored) are:
- 12.4%
deque<string>
- 20.9%
deque<flyweight<string> >
- 43.7%
deque<sillystring>
If all your words were 4 characters (change to const int length=4;
in the word generator), which is the ideal case for sillystring, then you get:
- 14.2%
deque<string>
- 77.8%
deque<flyweight<string> >
- 97.0%
deque<sillystring>
So flyweight is certainly a quick improvement, but you can do better by capitalising on the ability of your words to fit into a pointer-sized space and avoid additional heap overhead.
Here's the code:
// Compile with "g++ -O3 -o fly fly.cpp -lpthread"
// run "./fly 0 && ./fly 1 && ./fly 2"
#include <boost/flyweight.hpp>
#include <boost/format.hpp>
#include <boost/random.hpp>
#include <cstring>
#include <deque>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>
#include <sys/types.h>
#include <unistd.h>
#define THROW(X,MSG) throw X(boost::str(boost::format("%1%: %2%") % __PRETTY_FUNCTION__ % MSG))
struct random_word_generator
{
random_word_generator(uint seed)
:_rng(seed),
_length_dist(1,6),
_letter_dist('a','z'),
_random_length(_rng,_length_dist),
_random_letter(_rng,_letter_dist)
{}
std::string operator()()
{
std::string r;
const int length=(_random_length()+_random_length()+1)/2;
for (int i=0;i<length;i++) r+=static_cast<char>(_random_letter());
return r;
}
private:
boost::mt19937 _rng;
boost::uniform_int<> _length_dist;
boost::uniform_int<> _letter_dist;
boost::variate_generator<boost::mt19937&,boost::uniform_int<> >
_random_length;
boost::variate_generator<boost::mt19937&,boost::uniform_int<> >
_random_letter;
};
struct collector
{
collector(){}
virtual ~collector(){}
virtual void insert(const std::string&)
=0;
virtual void dump(const std::string&) const
=0;
};
struct string_collector
: public std::deque<std::string>,
public collector
{
void insert(const std::string& s)
{
push_back(s);
}
void dump(const std::string& f) const
{
std::ofstream out(f.c_str(),std::ios::out);
for (std::deque<std::string>::const_iterator it=begin();it!=end();it++)
out << *it << std::endl;
}
};
struct flyweight_collector
: public std::deque<boost::flyweight<std::string> >,
public collector
{
void insert(const std::string& s)
{
push_back(boost::flyweight<std::string>(s));
}
void dump(const std::string& f) const
{
std::ofstream out(f.c_str(),std::ios::out);
for (std::deque<boost::flyweight<std::string> >::const_iterator it=begin();
it!=end();
it++
)
out << *it << std::endl;
}
};
struct sillystring
{
sillystring()
{
_rep.bits=0;
}
sillystring(const std::string& s)
{
_rep.bits=0;
assign(s);
}
sillystring(const sillystring& s)
{
_rep.bits=0;
assign(s.str());
}
~sillystring()
{
if (is_ptr()) delete [] ptr();
}
sillystring& operator=(const sillystring& s)
{
assign(s.str());
}
void assign(const std::string& s)
{
if (is_ptr()) delete [] ptr();
if (s.size()>4)
{
char*const p=new char[s.size()+1];
if (reinterpret_cast<unsigned int>(p)&0x00000001)
THROW(std::logic_error,"unexpected odd-byte address returned from new");
_rep.ptr.value=(reinterpret_cast<unsigned int>(p)>>1);
_rep.ptr.is_ptr=1;
strcpy(ptr(),s.c_str());
}
else
{
_rep.txt.is_ptr=0;
_rep.txt.c0=(s.size()>0 ? validate(s[0]) : 0);
_rep.txt.c1=(s.size()>1 ? validate(s[1]) : 0);
_rep.txt.c2=(s.size()>2 ? validate(s[2]) : 0);
_rep.txt.c3=(s.size()>3 ? validate(s[3]) : 0);
}
}
std::string str() const
{
if (is_ptr())
{
return std::string(ptr());
}
else
{
std::string r;
if (_rep.txt.c0) r+=_rep.txt.c0;
if (_rep.txt.c1) r+=_rep.txt.c1;
if (_rep.txt.c2) r+=_rep.txt.c2;
if (_rep.txt.c3) r+=_rep.txt.c3;
return r;
}
}
private:
bool is_ptr() const
{
return _rep.ptr.is_ptr;
}
char* ptr()
{
if (!is_ptr())
THROW(std::logic_error,"unexpected attempt to use pointer");
return reinterpret_cast<char*>(_rep.ptr.value<<1);
}
const char* ptr() const
{
if (!is_ptr())
THROW(std::logic_error,"unexpected attempt to use pointer");
return reinterpret_cast<const char*>(_rep.ptr.value<<1);
}
static char validate(char c)
{
if (c&0x80)
THROW(std::range_error,"can only deal with 7-bit characters");
return c;
}
union
{
struct
{
unsigned int is_ptr:1;
unsigned int value:31;
} ptr;
struct
{
unsigned int is_ptr:1;
unsigned int c0:7;
unsigned int :1;
unsigned int c1:7;
unsigned int :1;
unsigned int c2:7;
unsigned int :1;
unsigned int c3:7;
} txt;
unsigned int bits;
} _rep;
};
struct sillystring_collector
: public std::deque<sillystring>,
public collector
{
void insert(const std::string& s)
{
push_back(sillystring(s));
}
void dump(const std::string& f) const
{
std::ofstream out(f.c_str(),std::ios::out);
for (std::deque<sillystring>::const_iterator it=begin();
it!=end();
it++
)
out << it->str() << std::endl;
}
};
// getrusage is useless for this; Debian doesn't fill out memory related fields
// /proc/<PID>/statm is obscure/undocumented
size_t memsize()
{
const pid_t pid=getpid();
std::ostringstream cmd;
cmd << "awk '($1==\"VmData:\"){print $2,$3;}' /proc/" << pid << "/status";
FILE*const f=popen(cmd.str().c_str(),"r");
if (!f)
THROW(std::runtime_error,"popen failed");
int amount;
char units[4];
if (fscanf(f,"%d %3s",&amount,&units[0])!=2)
THROW(std::runtime_error,"fscanf failed");
if (pclose(f)!=0)
THROW(std::runtime_error,"pclose failed");
if (units[0]!='k' || units[1]!='B')
THROW(std::runtime_error,"unexpected input");
return static_cast<size_t>(amount)*static_cast<size_t>(1<<10);
}
int main(int argc,char** argv)
{
if (sizeof(void*)!=4)
THROW(std::logic_error,"64-bit not supported");
if (sizeof(sillystring)!=4)
THROW(std::logic_error,"Compiler didn't produce expected result");
if (argc!=2)
THROW(std::runtime_error,"Expected single command-line argument");
random_word_generator w(23);
std::auto_ptr<collector> c;
switch (argv[1][0])
{
case '0':
std::cout << "Testing container of strings\n";
c=std::auto_ptr<collector>(new string_collector);
break;
case '1':
std::cout << "Testing container of flyweights\n";
c=std::auto_ptr<collector>(new flyweight_collector);
break;
case '2':
std::cout << "Testing container of sillystrings\n";
c=std::auto_ptr<collector>(new sillystring_collector);
break;
default:
THROW(std::runtime_error,"Unexpected command-line argument");
}
const size_t mem0=memsize();
size_t textsize=0;
size_t filelength=0;
while (filelength<(100<<20))
{
const std::string s=w();
textsize+=s.size();
filelength+=(s.size()+1);
c->insert(s);
}
const size_t mem1=memsize();
const ptrdiff_t memused=mem1-mem0;
std::cout
<< "Memory increased by " << memused/static_cast<float>(1<<20)
<< " megabytes for " << textsize/static_cast<float>(1<<20)
<< " megabytes text; efficiency " << (100.0*textsize)/memused << "%"
<< std::endl;
// Enable to verify all containers stored the same thing:
//c->dump(std::string(argv[1])+".txt");
return 0;
}