views:

292

answers:

3

Here's an example of a program, where coroutines really help to simplify the algorithm - imho its hardly possible to implement otherwise. I also tried to choose a useful task for the demo - this utility converts a binary file to a sequence of A-Z symbols (and back), without any significant redundancy, and it has an ability to work with any specified alphabet (see M.Init line). Basically its something like generalized base64. The code is tested and worked with MSC,IntelC and gcc/mingw.

Update: The algorithm is based on precise arithmetic coding, so its not a one-liner by default. It may be possible to cut it in half by using putc/getc file i/o (thus only a modified rangecoder class and do_process() would remain), but then it would be very limited (eg. won't be applicable to decode a memory block or network stream). Actually coroutines are used as a speed optimization here, and its the point of this post. Unfortunately I don't have any simpler application to properly demonstrate this - I could write a context modelling compressor instead, but that would be like 100 lines more.

Questions:
1) How to replace INCLUDE_PROCESS_TEMPLATE macro with proper C++ code?
2) Is there a way to implement this without coroutines?
(but still with in-memory encoding and buffered file i/o)
3) Any fixes/improvements?

#include <io.h>
#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>
#include <string.h>
#include <setjmp.h>
#define NOINLINE __declspec(noinline)

typedef unsigned int   uint;
typedef unsigned char  byte;
typedef unsigned long long int qword;

enum{ STKPAD=1<<16 };
struct coroutine {
  volatile int   state;
  volatile byte* inpptr;
  volatile byte* inpbeg;
  volatile byte* inpend;
  volatile byte* outptr;
  volatile byte* outbeg;
  volatile byte* outend;
  jmp_buf PointA, PointB;
  void yield( int value ) { if( setjmp(PointB)==0 ) { state=value; longjmp(PointA,value); } }
  void chkinp( void ) { if( inpptr>=inpend ) yield(1), inpptr=*&inpptr; }
  void chkout( void ) { if( outptr>=outend ) yield(2); }
  template <int f> byte get( void ) { if( f ) chkinp(); return *inpptr++; }
  template <int f> void put( uint c ) { *outptr++ = c; if( f ) chkout(); }
  void coro_init( void ) { inpptr=inpbeg=inpend=0; outptr=outbeg=outend=0; state=0; }
  void addinp( byte* inp,uint inplen ) { inpbeg=inpptr=inp; inpend=&inp[inplen]; }
  void addout( byte* out,uint outlen ) { outbeg=outptr=out; outend=&out[outlen]; }
};

#define INCLUDE_PROCESS_TEMPLATE \
NOINLINE void call_do_process() { byte stktmp[STKPAD]; state=ptrdiff_t(stktmp); do_process(); } \
int coro_process( void ) { if( setjmp(PointA)==0 ) if( state ) longjmp(PointB,3); else call_do_process(); return state; } 

struct Rangecoder_SH1x : coroutine {
  enum { SCALElog=15, SCALE=1<<SCALElog };
  enum { NUM=4, sTOP=0x01000000U, Thres=0xFF000000U };
  uint f_decode; // 0=encode, 1=decode;
  uint range, Cache, FFNum;
  union {
    struct { uint low; uint Carry; };
    qword lowc;
    uint  code; 
  };
  uint rc_BProcess( uint freq, uint bit ) { 
    uint rnew = (range>>SCALElog)*freq;
    if( f_decode ) bit = (code>=rnew);
    range = ((range-rnew-rnew)&(-bit)) + rnew;
    rnew &= -bit;
    if( f_decode ) code-=rnew; else lowc+=rnew;
    if( f_decode ) while( range<sTOP ) range<<=8, (code<<=8)+=get<1>();
    else while( range<sTOP ) range<<=8, ShiftLow();
    return bit;
  }
  void ShiftLow( void ) {
    if( low<Thres || Carry ) {
      put<1>( Cache+Carry );
      for(; FFNum!=0; FFNum-- ) put<1>( Carry-1 );
      Cache=low>>24; Carry=0;
    } else FFNum++;
    low<<=8;
  }
  void rc_Init( int DECODE ) {
    f_decode=DECODE; range=-1; lowc=FFNum=Cache=0;
    if( f_decode ) for(int _=0; _<NUM+0; _++) (code<<=8)+=get<1>(); 
  }
};

struct Model : Rangecoder_SH1x {
  uint DECODE, f_quit;
  enum{ lCNUM=8, CNUM=1<<lCNUM, ROWSIZE=80 };
  uint count[2*CNUM];
  enum{ inpbufsize=1<<16, outbufsize=1<<16 };
  byte inpbuf[inpbufsize], outbuf[outbufsize];

  void Init( const char* s ) {
    uint i,j;
    uint (&p)[CNUM] = (uint(&)[CNUM])count[CNUM];
    for( i=0; i<2*CNUM; i++) count[i]=0;
    for( i=0; s[i]; i++ ) p[byte(s[i])]++;
    for( j=0; j<lCNUM; j++ ) for( i=(CNUM>>j); i<((CNUM+CNUM)>>j); i++ ) count[i>>1] += count[i];
  }

  INCLUDE_PROCESS_TEMPLATE
  void do_process( void ) {
    uint i,j;
    rc_Init(1-DECODE);
    for( i=0; !f_quit; i++ ) {
      uint c=0, ctx=1;
      if( DECODE ) do c=get<1>(); while( c==10 );
      for( j=lCNUM-1; j!=-1; j-- ) {
        uint freq = count[ctx*2+0]*SCALE/(count[ctx*2+0]+count[ctx*2+1]);
        ctx += ctx + ((freq==0) ? 1 : (freq==SCALE) ? 0 : rc_BProcess(freq,(c>>j)&1));
      }
      if( !DECODE ) put<1>(ctx), (((i+1)%ROWSIZE==0)?put<1>(10),0:0);
    }
    yield(0);
  }

  void ProcessFile( uint Mode, FILE* f, FILE* g ) {
    volatile uint r; volatile qword g_len=0; uint f_len=0;
    DECODE=Mode; f_quit=0;
    if( DECODE ) addout( (byte*)&g_len, sizeof(f_len)+1 ), r=1;
    else f_len=filelength(fileno(f)), addinp( (byte*)&f_len, sizeof(f_len) ),addout(0,0), r=2;
    do {
      if( r==1 ) {
        uint l = fread( inpbuf, 1, inpbufsize, f );
        if( l>0 ) {
          addinp( inpbuf, l );
        } else {
          if( inpbeg==inpbuf+1 ) f_quit=1;
          memset( inpbuf, 0x80, inpbufsize );
          addinp( inpbuf+1, 5 ); 
        }
      } else if( r==2 ) {
        if( outbeg==outbuf ) fwrite( outbuf, 1, outptr-outbeg, g ); else g_len>>=8;
        addout( outbuf, outbufsize );
      }
      r = coro_process(); 
    } while(r);
    fwrite( outbuf, 1,outptr-outbuf, g ); // flush
    if( DECODE==0 ) fputc( 10, g ); else fflush(g), chsize( fileno(g), g_len );
  }

} M;

int main( int argc, char** argv ) {
  if( argc<4 ) return 1;
  int DECODE = argv[1][0]=='d';
  FILE* f = fopen( argv[2], "rb" ); if( f==0 ) return 2;
  FILE* g = fopen( argv[3], "wb" ); if( g==0 ) return 3;
  M.Init( "ABCDEFGHIJKLMNOPQRSTUVWXYZ" );
  M.ProcessFile( DECODE, f, g );
}
+1  A: 

Implementing coroutines portably its a difficult task. Please consider using Boost.coroutine candidate. Here are updates to the library.

I've used it on OS X and Linux quite a bit together with boost::asio and they've proven to be very robustly implemented and a very useful abstraction of threads with the deterministic behavior of a sequential program

I don't know why it hasn't yet been added to the main boost distribution. My guess is there some political argument disguised as a technical one behind that fact, although you are encouraged to take my paranoia with a grain of salt

EDIT: there is a new boost candidate in the boost vault called Boost.Context, and its part of a larger library called Boost.Fiber. It doesn't have a webpage yet so i won't link it here. It seems to have better support

lurscher
Mine is a (relatively) portable implementation (tested on iphone) and really simple (and task-specific). But performance-wise its much better than other implementations I've seen.
Shelwien
+2  A: 

Just for grins, here's a rough idea of how I'd handle the part that just encodes to/decodes from an arbitrary alphabet. As promised, the actual encoding/decoding is around a dozen lines of code. The overall size is larger, largely because I've used templates throughout, so the numbers can be an arbitrary integer type, and the characters can be an arbitrary character type, and it uses iterators for both, so it can read from/write to arbitrary collections (streams, stringstreams, vectors, etc.)

Edit: modified code to read input from a file and write output to a file (and fixed a minor error or two):

#include <iterator>
#include <iostream>
#include <string>
#include <limits>
#include <vector>
#include <fstream>
#include <time.h>
#include <math.h>
#include <stdlib.h>

template <class intT>
intT log2(intT input) { 
    return intT(log10((double)input) / log10(2.0));
}

template <class intT>
class coder { 
    std::string alphabet;   
    size_t range;
    unsigned ratio;
public:
    coder(std::string const &alpha) : alphabet(alpha), range(alpha.size()) {
        ratio = ceil(double(log2(std::numeric_limits<intT>::max())/log2(range)));
    }

    template <class inIt, class outIt>
    void encode(inIt begin, inIt end, outIt out) { 
        while (begin != end) {
            intT val = *begin++;
            for (int i=0; i<ratio; i++) {
                *out++ = alphabet[val % range];
                val /= range;
            }
        }
    }

    template <class inIt, class outIt>
    void decode(inIt begin, inIt end, outIt out) { 
        while (begin != end) {
            int temp = 0;
            for (int i=0; i<ratio; i++)
                temp += alphabet.find(*begin++) * pow((double)range, i);
            *out++ = temp;
        }
    }
};

int main(int argc, char **argv) {
    if (argc != 3) {
        std::cerr << "Usage: encode <infile> <outfile>\n";
        return EXIT_FAILURE;
    }

    coder<unsigned> enc("ABCDEFGHIJKLMNOPQRSTUVWXYZ");
    std::ifstream in(argv[1], std::ios::binary);
    std::ofstream out(argv[2]);
    clock_t start = clock();
    enc.encode(std::istream_iterator<char>(in), 
        std::istream_iterator<char>(), 
        std::ostream_iterator<char>(out, ""));
    clock_t stop = clock();
    std::cerr << "Processing time: " << double(stop-start)/CLOCKS_PER_SEC << "\n";
    return 0;
}

At least for the moment, I've ignored the arithmetic encoding part, but it should (at least IMO) follow a similar structure so you could pretty easily string things together more or less arbitrarily.

As far as comparing speed and size goes, keep in mind that this isn't doing any compression (at all) just the baseX encoding -- that being the case, attempting to compare to something that does compression makes no real sense (except, for example, to get an idea of how effective the compression is -- but if it's effective at all, it'll obviously produce smaller output).

As far as executable size goes, about all I can say is that gcc producing large executables never surprises me. Using MS VC++, I get an executable of 9,728 bytes for the code above.

Jerry Coffin
Hey, I can read this one.
GMan
Can you please fix this to encode bytes from input file to output file, and to use A-Z alphabet, so I'd be able to directly comparethe output size and speed? Presumably I'm not aware of "good C++", so I'd prefer if somebody else could provide a complete source for benchmark. For now I can only say that gcc 4.5 compiles your source to 474112 bytes, and mine to 10752.
Shelwien
With msvcr90.dll or something? That doesn't really count as you'd have to distribute that with your app anyway.Also my code doesn't do any compression either. It just sets up a probability distribution where only symbols A-Z have nonzero (equal) probabilities, then applies arithmetic _decoding_ to produce a file with only A-Z in it. And as to output size - the actual problem with your approach is that with not power-of-2 sized alphabets (eg. 26 symbols like my example) you'd have significant redundancy after every few bytes.
Shelwien
As to your code, first it produced a 30k output from its own 474k binary, then (after i added ios::binary flags to streams), it suddenly became 3M on output. Can you please make another effort and fix it to actually work?
Shelwien
@Shelwein: You are right that it should have used `ios::binary` when opening the file. Beyond that, probably not -- the idea was to show a general *structure* that can work and people will stand a chance of reading and understanding. The algorithm itself is intended as pretty much a place-holder, showing where to plug in the code to implement the algorithm if your choice.
Jerry Coffin
@Shelwein: as far as the algorithm itself goes, I'd ask about that in a separate question that includes an understandable description of what it's supposed to do (IMO, the code in this question doesn't qualify).
Jerry Coffin
Not that it matters per se, but I think `coder<unsigned> enc("ABCDEFGHIJKLMNOPQRSTUVWXYZ");` would go after you check if there are enough arguments. And maybe return `EXIT_FAILURE` instead of 0 when the parameter count is wrong.
GMan
@GMan: those are certainly reasonable suggestions -- done.
Jerry Coffin
1. It still doesn't work - the same 3M output from 474k original binary. The reason is likely <char> in input iterator,but replacing it with unsigned doesn't make it work for me either.2. output should have the binary flag too - why can't I set the alphabet to "\r\n"?3. I agree that its a separate question, but you're not answering my actual question (read the post), and its impossible to even compare the performance of your version vs mine (while coroutine and buffering are speed optimizations in my case as i said).
Shelwien
A: 

Ok, here's what I actually asked about (see [1]) - the trick to statically call a function from child class. tl;dr is apparently a mighty power, so here's your readable standard coroutine fibonacci generator this time. There's a small difference though - we don't really need coroutines to generate these numbers, but its really hard (if possible) to make a faster implementation of my first program without coroutines.

#include <stdio.h>
#include <stddef.h>
#include <setjmp.h>

// without noinline some compilers tend to allocate the array before setjmp()
#ifdef __GNUC__
 #define NOINLINE __attribute__((noinline))
#else
 #define NOINLINE __declspec(noinline)
#endif

enum{ STKPAD=1<<16 };
struct coroutine {
  volatile unsigned state;
  jmp_buf PointA, PointB;
  void yield( int value ) { if( setjmp(PointB)==0 ) { state=value; longjmp(PointA,value); } }
  template <typename T> NOINLINE void call_do_process() {
    char stktmp[STKPAD]; state=ptrdiff_t(stktmp); ((T*)this)->do_process();
  }
  template <typename T> unsigned coro_process( T* ) {
    if( setjmp(PointA)==0 ) if( state ) longjmp(PointB,3); else call_do_process<T>();
    return state;
  }
};

struct fibonacci : coroutine {

  void do_process( void ) {
    unsigned a=0,b=1;
    while(1) {
      yield( b );
      b = b + a;
      a = b - a;
    }
  }

  unsigned get( void ) {
    return coro_process(this); 
  }

} F;

int main( int argc, char** argv ) {

  for( int i=0; i<20; i++ ) {
    printf( "%i ", F.get() );
  } printf( "\n" );

  return 0;
}

And since Jerry Coffin's alternative version still fails to produce sensible results, here're some simpler stream benchmarks. Its a pity, as I'd expect it to be even slower with iterators.

In fact I've tested all kinds of approaches with arithmetic coders - plain getc/putc, virtual methods, plain functions pointers, iterator-like classes, and its clear that there's a large difference. For now, coroutines proved to be the best way for this - there's no complex logic incapsulated into byte i/o calls (unlike iterators), and the processing doesn't have to care about i/o details. Sure, there're even further optimizations, but I really only tried to demonstrate the benefits of coroutine approach here...

#define _CRT_SECURE_NO_DEPRECATE
#define _CRT_DISABLE_PERFCRIT_LOCKS

#include <stdio.h>
#include <time.h>
#include <fstream>

int main( int argc, char** argv ) {
  if( argc<3 ) return 1;

  { 
    clock_t start = clock();
    FILE* f = fopen( argv[1], "rb" ); if( f==0 ) return 2;
    FILE* g = fopen( argv[2], "wb" ); if( g==0 ) return 3;
    while(1) {
      int c = getc(f);
      if( c<0 ) break;
      putc(c,g);
    }
    fclose(f);
    fclose(g);
    clock_t stop = clock();
    printf( "            File copy via stdio getc/putc - %7.3fs\n", float(stop-start)/CLOCKS_PER_SEC );
  }

  {
    clock_t start = clock();
    FILE* f = fopen( argv[1], "rb" ); if( f==0 ) return 2;
    FILE* g = fopen( argv[2], "wb" ); if( g==0 ) return 3;
    while(1) {
      static char buf[1<<16];
      int l = fread( buf, 1,sizeof(buf), f ); if( l<=0 ) break;
      fwrite( buf, 1,l, g ); if( l<sizeof(buf) ) break;
    }
    fclose(f);
    fclose(g);
    clock_t stop = clock();
    printf( "     File copy via stdio 64k fread/fwrite - %7.3fs\n", float(stop-start)/CLOCKS_PER_SEC );
  }

  {
    clock_t start = clock();
    std::ifstream f(argv[1],std::ios::in|std::ios::binary); if( !f.is_open() ) return 2;
    std::ofstream g(argv[2],std::ios::out|std::ios::binary); if( !g.is_open() ) return 3;
    while(1) {
      int c = f.get();
      if( c<0 ) break;
      g.put(c);
    }
    f.close();
    g.close();
    clock_t stop = clock();
    printf( "File copy via ifstream::get/ofstream::put - %.3fs\n", float(stop-start)/CLOCKS_PER_SEC );
  }

}
----- 100,000,000 byte file ----- 
[ GCC 4.5 ]
            File copy via stdio getc/putc -   0.546s
     File copy via stdio 64k fread/fwrite -   0.188s
File copy via ifstream::get/ofstream::put -  10.578s

[ IntelC 11.1 / VS 2005 ]
            File copy via stdio getc/putc -   0.500s
     File copy via stdio 64k fread/fwrite -   0.156s
File copy via ifstream::get/ofstream::put -  14.656s

[ MSC 14.0 / VS 2005 ]
            File copy via stdio getc/putc -   0.609s
     File copy via stdio 64k fread/fwrite -   0.156s
File copy via ifstream::get/ofstream::put -  19.063s

----- 1,000,000,000 byte file ----- 
[ GCC 4.5 ]
            File copy via stdio getc/putc -   7.468s
     File copy via stdio 64k fread/fwrite -   1.828s
File copy via ifstream::get/ofstream::put - 109.891s

[ IntelC 11.1 / VS 2005 ]
            File copy via stdio getc/putc -   6.718s
     File copy via stdio 64k fread/fwrite -   1.672s
File copy via ifstream::get/ofstream::put - 145.500s

[ MSC 14.0 / VS 2005 ]
            File copy via stdio getc/putc -   6.453s
     File copy via stdio 64k fread/fwrite -   1.609s
File copy via ifstream::get/ofstream::put - 191.031s
Shelwien