Try
total += (-((i & mask) != 0)) & value[j];
in place of
total += ((i & mask) != 0) * value[j];
This avoids the multiply. Whether there will be a branch or not is up to whether the compiler is clever enough to find branch-free code for -(foo != 0). (Which is possible, but I'd be a bit surprised.)
(Of course, this depends on two's-complement representation; the C standard is agnostic on that.)
You might help out the compiler like so, assuming 32-bit ints and that signed >> propagates the sign bit:
total += (((int)((i & mask) << (31 - j))) >> 31) & value[j];
That is, shift the possibly-set bit left to the most-significant position, cast as signed int, then right all the way back to the least-significant position, yielding either all 0's or all 1's, under the above implementation-defined assumptions. (I haven't tested this.)
Another possibility: consider blocks of (say) 4 bits at a time. There are 16 different addition sequences; you can dispatch to unrolled code for each of them, with no tests at all within each code block. The hope here is for one indirect jump to cost less than 4 test-and-branches.
Update: Using Jonathan Leffler's scaffolding, the 4-bits-at-a-time method is fastest by a wide margin on my MacBook. Negate-and turns out to be about the same as multiply. I wonder if the processor multiplies special cases like 0 and 1 faster (or not such a special case if it's faster for most-bits-clear or most-bits-set multiplicands in general).
I didn't code up the accepted answer since it's unlikely to be fastest on this particular benchmark (it should get most of its benefit from enumerating only the set bits, doing best on sparse sets, but fully half of the bits are set in this benchmark). Here are my changes to Leffler's code, in case anyone else is strangely motivated to spend time on this:
#include <stdio.h>
#include <time.h>
static int value[] =
{
12, 36, 79, 21, 31, 93, 24, 15,
56, 63, 20, 47, 62, 88, 9, 36,
};
static int test_1(int i)
{
int total = 0;
for (unsigned short mask = 0x0001, j = 0; mask != 0; mask <<= 1, j++)
{
if (i & mask)
total += value[j];
}
return(total);
}
static int test_2(int i)
{
int total = 0;
for (unsigned short mask = 0x0001, j = 0; mask != 0; mask <<= 1, j++)
{
total += ((i & mask) != 0) * value[j];
}
return(total);
}
static int test_3(int i)
{
int total = 0;
for (unsigned mask = i & 0xFFFF, j = 0; mask != 0; mask >>= 1, j++)
{
total += (mask & 0x0001) * value[j];
}
return(total);
}
static int test_4(int i)
{
int total = 0;
for (unsigned mask = i & 0xFFFF, j = 0; mask != 0; mask >>= 1, j++)
{
total += -(mask & 0x0001) & value[j];
}
return(total);
}
static int test_5(int i)
{
int total = 0;
const int *p = value;
for (unsigned mask = i & 0xFFFF; mask != 0; mask >>= 4, p += 4)
{
switch (mask & 0xF)
{
case 0x0: break;
case 0x1: total += p[0]; break;
case 0x2: total += p[1]; break;
case 0x3: total += p[1] + p[0]; break;
case 0x4: total += p[2]; break;
case 0x5: total += p[2] + p[0]; break;
case 0x6: total += p[2] + p[1]; break;
case 0x7: total += p[2] + p[1] + p[0]; break;
case 0x8: total += p[3]; break;
case 0x9: total += p[3] + p[0]; break;
case 0xA: total += p[3] + p[1]; break;
case 0xB: total += p[3] + p[1] + p[0]; break;
case 0xC: total += p[3] + p[2]; break;
case 0xD: total += p[3] + p[2] + p[0]; break;
case 0xE: total += p[3] + p[2] + p[1]; break;
case 0xF: total += p[3] + p[2] + p[1] + p[0]; break;
}
}
return(total);
}
typedef int(*func_pointer)(int);
static func_pointer test[] = { test_1, test_2, test_3, test_4, test_5 };
#define DIM(x)(sizeof(x)/sizeof(*(x)))
int main()
{
int i, j, k;
for (i = 0; i < DIM(test); i++)
{
long sum = 0;
clock_t start = clock();
for (j = 0; j <= 0xFFFF; j += 13)
{
int rv;
for (k = 0; k < 1000; k++)
rv = (*test[i])(j);
sum += rv;
}
clock_t stop = clock();
printf("(sum = %ld) Test %d: %8.6f s\n", sum, i + 1,
(stop - start) / (1.0 * CLOCKS_PER_SEC));
}
}
Results (gcc -O4 -std=c99 branchmult2.c
):
(sum = 1744366) Test 1: 0.225497 s
(sum = 1744366) Test 2: 0.221127 s
(sum = 1744366) Test 3: 0.126301 s
(sum = 1744366) Test 4: 0.124750 s
(sum = 1744366) Test 5: 0.064877 s
Edit 2: I decided the test would be more realistic without the volatile
qualifier.