One approach - partition into nibbles then use switch to select the bits from the nibble. Use templates so the bit selected is known at compile time, and to help unwind the code.
template < int i, int x >
struct process_bit {
inline static void apply ( int y ) { };
};
template < int x >
struct process_bit < 1, x > {
inline static void apply ( int y ) {
evaluate ( x, y );
}
};
template < int x, int n >
inline void process_nibble_bits ( int y ) {
process_bit < x & 1, n >::apply( y );
process_bit < ( x >> 1 ) & 1, n + 1 > ::apply( y );
process_bit < ( x >> 2 ) & 1, n + 2 > ::apply( y );
process_bit < ( x >> 3 ) & 1, n + 3 > ::apply( y );
}
template < int n >
inline void process_nibble ( uint64_t xbits, int y ) {
uint64_t nibble = ( xbits >> n ) & 0xf;
if ( nibble ) {
switch ( nibble ) {
case 0:
process_nibble_bits < 0, n > ( y );
break;
case 1:
process_nibble_bits < 1, n > ( y );
break;
case 2:
process_nibble_bits < 2, n > ( y );
break;
case 3:
process_nibble_bits < 3, n > ( y );
break;
case 4:
process_nibble_bits < 4, n > ( y );
break;
case 5:
process_nibble_bits < 5, n > ( y );
break;
case 6:
process_nibble_bits < 6, n > ( y );
break;
case 7:
process_nibble_bits < 7, n > ( y );
break;
case 8:
process_nibble_bits < 8, n > ( y );
break;
case 9:
process_nibble_bits < 9, n > ( y );
break;
case 10:
process_nibble_bits < 10, n > ( y );
break;
case 11:
process_nibble_bits < 11, n > ( y );
break;
case 12:
process_nibble_bits < 12, n > ( y );
break;
case 13:
process_nibble_bits < 13, n > ( y );
break;
case 14:
process_nibble_bits < 14, n > ( y );
break;
case 15:
process_nibble_bits < 15, n > ( y );
break;
}
}
}
template < int i, int n >
struct bit_tree {
inline static void apply ( uint64_t xbits, int y ) {
// each call to here represents scan of bits in [ n, n + 2i )
bit_tree < i >> 1, n > ::apply(xbits, y);
bit_tree < i >> 1, n + i > ::apply(xbits, y);
};
};
template < int i, int n >
struct bit_tree_with_guard {
inline static void apply ( uint64_t xbits, int y ) {
// each call to here represents scan of bits in [ n, n + 2i )
// so this branch to execute if any in [ n, n + i ) are set
if ( xbits & ( ( ( ( ( uint64_t ) 1LL ) << i ) - 1 ) << n ) )
bit_tree < i >> 1, n > ::apply(xbits, y);
if ( xbits & ( ( ( ( ( uint64_t ) 1LL ) << i ) - 1 ) << ( n + i) ) )
bit_tree < i >> 1, n + i > ::apply(xbits, y);
};
};
// put guards on 8 and 16 bit blocks ( for some reason using inheritance is slower )
template < int n >
struct bit_tree < 8, n > {
inline static void apply ( uint64_t xbits, int y ) {
bit_tree_with_guard < 8, n > ::apply ( xbits, y );
}
};
template < int n >
struct bit_tree < 16, n > {
inline static void apply ( uint64_t xbits, int y ) {
bit_tree_with_guard < 16, n > ::apply ( xbits, y );
}
};
template < int n >
struct bit_tree < 2, n > {
inline static void apply ( uint64_t xbits, int y ) {
process_nibble < n > ( xbits, y );
}
};
void template_nibbles(int height) {
for (int y = 0; y < height; y++) {
uint64_t xbits = ~board[y];
bit_tree< 32, 0>::apply ( xbits, y );
}
}
Running it's not as fast as the ffs version, but it is a touch better than the other portable ones, and appears to be consistent with them in results:
$ bin\bit_twiddle_micro_opt.exe
testing will_while()... 3375000 usecs (check 1539404233,1539597930)
testing will_ffs()... 2890625 usecs (check 675191567,1001386403)
testing alphaneo_unrolled_8()... 3296875 usecs (check 1539404233,1539597930)
testing template_nibbles()... 3046875 usecs (check 1539404233,1539597930)
Using a tree all the way doesn't seem to give any gain; not using the switch for the nibble is slower. Anyone know a way of not having to write the 16 cases by hand using only C++?