c99, penalty score=29587 (277*100+1887)
1887 character with the addition of code to reduce the representation of small integers, making the compressed form 277
1785 characters in this version. Still many to lose.
1822 characters in the version (not yet very golfed, there are many that can be removed, but I want to preserve the ungolfed source in a readable form for editing...golfing comes later), and it actually makes the test input 3 characters longer in the coded form.
Moderately golfed version
#include <stdio.h>
#include <ctype.h>
#include <string.h>
#define E case
#define G default:
#define H switch
#define I stdin
#define J break;
#define P printf
#define R return
#define W while
#define F(a) a=getchar()
#define U(a) ungetc(a,I)
#define Q(a) putc(a,stdout)
#define L(a) do{F(a);}W(isspace(a)&&(a!=EOF))
long long int N,M;
int A,B,C,D,S[99],s;
int eat(int a){
W(isalpha(F(B)));
H(B){
E EOF: a+=4;
E ':': a+=2;
E ',':
J
G
U(B);
a+=4;
}
R a;
}
int str(int a){
W((F(B))!=a && B<0x80 && B>0)Q(B);
R B;
}
int sep(){
L(B);
H (B){
E ',':
R 0;
E ':':
R 2;
E EOF:
R 6;
G
U(B);
R 4;
}
}
int num(){
double e=1.0, d;
A = (C-'-')?1:-1;
N=isdigit(C)?(C-'0'):0;
W((F(B))!=EOF){
if (isdigit(B)){
N=N*10+(B-'0');
continue;
}
if (B=='.')goto Z;
J
}
U(B);
M=127LL;
B = 1;
W ( (N>M) ) {
for (C=B;C>0;--C) {
M<<=8;
M+=0xFFLL;
};
B = B*2;
}
N*=A;
R 0x80 + (((B-1)&7)<<3);
Z:
d=N;
W((F(B))!=EOF){
if (!isdigit(B)) J
e/=10;
d+=e*(B-'0');
}
d*=1.0*A;
U(B);
bcopy(&d,&N,8);
R 0x78;
}
int e(){
L(C);
W (C != EOF){
H (C){
E '[':
E '{':
A=( C & 0x70 ) | 0x1 ;
J
E ']':
E '}':
A=0xC0 + sep();
J
E 'f':
A=eat(0x00);
J
E 't':
A=eat(0x08);
J
E 'n':
A=eat(0x10);
J
E '\'':
Q(0x20);
str(C);
A=0x80 + sep();
J
G
A=num()+sep();
Q(A);
A=((A&0x38)>>3)+1;
for(B=A-1;B>0;B--){
C = (N>>(B*8))&0xFFLL;
Q(C);
}
A=(N & 0xFFLL);
J
}
Q(A);
L(C);
}
}
int f(){
F(C);
O:
B=(C&0x70);
Q((S[s++]=B)+0xB);
T:
F(C);
if (C&1)goto O;
B=C&0xC0;
if (B==0xC0)goto C;
H(C&0xF0){
E 0x00:
P(C&8?"true":"false");
J
E 0x10:
P("null");
J
E 0x20:
Q('\'');
str(0x80);
Q('\'');
C=B;
J
G
N=0LL;
B=((C&0x38)>>3)+1;
for(A=7;A>=(8-B);--A){
F(D);
N|=((D&0xFFLL)<<(A*8));
}
N>>=((8-B)*8);
if (C&0x80) {
P("%lld",N);
} else {
double d;
bcopy(&N,&d,8);
P("%.2f",d);
}
J
}
Z:
H(C&6){
E 0:
Q(',');
Q(' ');
J
E 2:
Q(':');
Q(' ');
J
E 6:
R EOF;
G
L(C);
goto C;
}
goto T;
C:
B=S[--s];
Q(B+0xD);
goto Z;
}
int main(int c, char** v){
R (*v[1]-'i')?f():e();
}
Validation:
$ gcc bincode.define.c
$ ./a.out i <input.test>output.test
$ ls -l output.test
-rw-r--r-- 1 dmckee staff 277 Aug 31 23:20 output.test
$ ./a.out d <output.test
[-1, 0.10, 0, 'Foo', {'b': 8388608.25, 'e': -0.99, 'c': 14}, [1, 256, false, 32768, 0.00, 128, -127, -65535, 'Use jQuery for this one!'], 127, null, 2147483648, -32769.40, ['Hello World', -2147483647, [-65535.22, 42]], 2147483648.12, -19, {'l': -2147483647.99, 'o': {'p': [-128, 'Blub']}}, [['Bar'], [false, ['Test', {}, [null], [[1, 2], 1337], true], 4]], 'The cake is a lie...', -8388607.12]
and regression test
$ ./a.out i < input.test | ./a.out d > recon.test
$ ./a.out i < recon.test | ./a.out d > recon2.test
$ cmp recon.test recon2.test
Rather more readable version
/* bincode.c */
/* */
/* A minimal binary en-/de-coder. */
/* */
/* 29 Aug. 2010 dmckee */
/* */
/* Defects and Assumptions */
/* */
/* ** Strings are assumed to consist of characters 0-127 only. */
/* ** The char type is assumed to be signed */
/* ** ':' is treated like any other separator, so the code does not consider */
/* "[1:2:true:false]" to be a problem and does not insist that {} type */
/* entities contain only X:Y entries. */
/* ** long long int is assumed to be 8 bytes as is double */
/* ** Assumes that encode and decode will be done on machines with the same */
/* endianess */
/* ** Currently all number are coded in 8 bytes */
/* ** No compression on strings */
/* ** Absolutely no input validation */
/* */
/* */
/* Standard: */
/* */
/* Each data object is coded with one or more bytes. */
/* */
/* A compound object [] or {} is indicated with the LSB (bit 1) set, and has */
/* the high nibble of the character ('[' or '{' that indicates the start of */
/* the object. Bit three is ignores (set to zero) */
/* */
/* Each object has a separator field which indicated what separator */
/* (',', ':', a compound object closes after, EOF) follows. Separators are */
/* coded into bits 2 and 3 as (00:',' 01:':' 10:close 11:EOF). */
/* */
/* The beginning of a string is indicated with a byte 0x20 (the separator */
/* is coded in the close byte), followed by the literal byte of the string, */
/* which must be 7-bit ASCII, and a close byte in the form b10000ss0, where */
/* the s's are the separator coding. */
/* */
/* Booleans are coded b0000vss0 where v indicates the value. */
/* */
/* Nulls are coded b00010ss0. */
/* */
/* Integers are coded b10lllss0 and floats are b01lllss0. In both cases the */
/* l-bits are reserved for indicating the length, a feature currently used */
/* only for integers. For the numeric types the next L+1 bytes hold the in */
/* memory representation of the value in MSB first order. */
/* */
/* A closing bracket for a compound entity is coded b11000ss0. The type of */
/* bracket is not indicated and is handled by stacking the open brackets. */
/* */
#include <stdio.h>
#include <ctype.h>
#include <string.h>
/* debugging only libraries */
/* #include <stdlib.h> */
#define fetch(a) a=getchar()
#define unfetch(a) ungetc(a,stdin)
#define say(a) putc(a,stdout)
#define lex(a) do{fetch(a);}while(isspace(a)&&(a!=EOF))
long long int N,M; /* buffer for int and float coding */
int A,B,C,D,S[99],s;/* object symbol stack and index*/
/* Consume non-spearating bytes, leaving the separator on the input, */
/* and returning EOF in the event that the end of nput is reached */
int eat(int a){
/* fprintf(stderr,"eat:In: %02x (%c)\n",a,a); */
while(isalpha(fetch(B)))/*{ fprintf(stderr,"eat: %02x (%c)\n",B,B); }*/;
/* fprintf(stderr,"eat: %02x (%c)\n",B,B); */
switch(B){
case EOF: a+=4;
case ':': a+=2;
case ',':
break;
default:
unfetch(B);
a+=4;
}
/* fprintf(stderr,"eat:Out: %02x (%c)\n",a,a); */
return a;
}
/* copy input bytes to the output until the argument or EOF is */
/* reached. Return EOF if the end of the input occurs, otherwise stick */
/* the argument back on the input */
int str(int a){
/* fprintf(stderr,"str:In: %02x (%c)\n",a,a); */
while((fetch(B))!=a && B<0x80 && B>0)say(B);
/* fprintf(stderr,"str:Out: %02x (%c)\n",B,B); */
return B;
}
/* Fetch one input byte and code the separator type from it. If it is */
/* non-separator, non-EOF, (i.e. a close bracket) put it back on the */
/* input */
int sep(){
lex(B);
switch (B){
case ',':
return 0;
case ':':
return 2;
case EOF:
return 6;
default:
unfetch(B);
return 4;
}
}
/* attempt to parse numeric input */
int num(){
double e=1.0, d;
/* C holds the first charater */
A = (C-'-')?1:-1; /* pick off a sign if needed */
N=isdigit(C)?(C-'0'):0; /* C might be '-' or '+' */
while((fetch(B))!=EOF){
if (isdigit(B)){
N=N*10+(B-'0');
continue;
}
if (B=='.')goto Z;
break;
}
/* push back the first non digit */
unfetch(B);
/* determine how many bytes are need to represent N */
M=127LL;
B = 1;
while ( (N>M) ) {
for (C=B;C>0;--C) {
M<<=8;
M+=0xFFLL;
};
B = B*2;
}
/* apply the sign */
N*=A;
return 0x80 + (((B-1)&7)<<3);
Z:
d=N;
/* going to leave floats at 8 bytes for now */
while((fetch(B))!=EOF){
if (!isdigit(B)) break;
e/=10;
d+=e*(B-'0');
}
d*=1.0*A;
unfetch(B);
bcopy(&d,&N,8);
return 0x78;
}
int encode(){
/* fprintf(stderr,"Encoding...\n"); */
lex(C);
while (C != EOF){
switch (C){
case '[':
case '{':
/* emit and object byte coding for this bracket */
A=( C & 0x70 ) | 0x1 ;
break;
case ']':
case '}':
/* emit a close byte; no matching enforced! */
A=0xC0 + sep();
break;
case 'f': /* false boolean */
A=eat(0x00);
break;
case 't': /* true boolean */
A=eat(0x08);
break;
case 'n': /* null */
A=eat(0x10);
break;
case '\'': /* string case */
say(0x20);
str(C);
A=0x80 + sep(); /* string close is signaled by a negative value */
break;
default:
/* a numeric value */
A=num()+sep();
say(A);
A=((A&0x38)>>3)+1; /* decode the length to use */
for(B=A-1;B>0;B--){
C = (N>>(B*8))&0xFFLL;
say(C);
}
A=(N & 0xFFLL);
break;
}
say(A);
lex(C);
}
}
int decode(){
/* fprintf(stderr,"Decoding...\n"); */
fetch(C);
O:
B=(C&0x70);
say((S[s++]=B)/* Push to the symbol stack */+0xB);
T:
fetch(C);
if (C&1)goto O;
B=C&0xC0;
if (B==0xC0)goto C;
switch(C&0xF0){
case 0x00:
printf(C&8?"true":"false");
break;
case 0x10:
printf("null");
break;
case 0x20: /* string case */
say('\'');
str(0x80);
say('\'');
C=B;
break;
default: /* int or float case */
/* C holds the number indicator with int/float and sep info */
N=0LL;
B=((C&0x38)>>3)+1; /* decode the length to use */
/* we need to force sign extension on small values, */
/* so we put the bytes in high */
for(A=7;A>=(8-B);--A){
fetch(D);
N|=((D&0xFFLL)<<(A*8));
}
/* then shift down wholesale */
N>>=((8-B)*8);
if (C&0x80) {
printf("%lld",N);
} else {
double d;
bcopy(&N,&d,8);
printf("%.2f",d);
}
break;
}
Z:
switch(C&6){
case 0:
say(',');
say(' ');
break;
case 2:
say(':');
say(' ');
break;
case 6:
return EOF;
default:
lex(C);
goto C;
}
goto T;
/* Close an object bracket */
C:
/* C currently holds the close record, with the necessary separator, */
/* so don't overwrite it */
B=S[--s];/* Pop from symbol stack */
say(B+0xD);
goto Z;
}
int main(int c, char** v){
return (*v[1]-'i')?decode():encode();
}
The difficulty with the coding arises because this versions codes are floating point values into 8 bytes and does not compress strings at all.
Philosophical discussion: In detail c is ill suited to the problem, because lacking reflection makes general serialization...uhm...tricky. This code actually woks on the test input as a string. In that sense a plain string compression is almost certainly better. But I've gone ahead ad treated this as a structured parsing problem, and the resulting compresssed format has substrings that are fully valid inputs to the decoder.