If you don't consider SIMD operations cheating, you can usually write SIMD assembly that performs much better than what your compiler's autovectorizer produces (if it even has autovectorization!)
Here's a very basic SSE (one of x86's SIMD instruction sets) tutorial. It's for Visual C++ in-line assembly.
Edit: Here's a small pair of functions if you want to try it for yourself. Both compute an n-length dot product. One uses SSE2 instructions in-line (GCC in-line syntax); the other is very basic C.
It's very simple, and I'd be surprised if a good compiler couldn't vectorize the plain C loop, but if it doesn't, you should see a speedup with the SSE2 version. The SSE2 version could probably be faster if I used more registers, but I don't want to stretch my very weak SSE skills :).
float dot_asm(float *a, float *b, int n)
{
    float ans = 0;
    int i;
    /* No handling here for arrays whose length isn't a multiple of 8. */
    while (n > 0) {
        float tmp[4] __attribute__ ((aligned(16)));
        /* One asm block, so xmm0 stays live from the adds to the store. */
        __asm__ __volatile__(
            "xorps %%xmm0, %%xmm0\n\t"  /* zero the accumulator */
            "movups (%1), %%xmm1\n\t"   /* a[0..3], unaligned load */
            "movups 16(%1), %%xmm2\n\t" /* a[4..7] */
            "movups (%2), %%xmm3\n\t"   /* b[0..3] */
            "movups 16(%2), %%xmm4\n\t" /* b[4..7] */
            "add $32, %1\n\t"           /* advance both pointers 8 floats */
            "add $32, %2\n\t"
            "mulps %%xmm3, %%xmm1\n\t"  /* a[0..3] * b[0..3] */
            "mulps %%xmm4, %%xmm2\n\t"  /* a[4..7] * b[4..7] */
            "addps %%xmm2, %%xmm1\n\t"
            "addps %%xmm1, %%xmm0\n\t"
            "movaps %%xmm0, %0"         /* spill the 4 partial sums */
            : "=m" (tmp), "+r" (a), "+r" (b)
            :
            : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "memory");
        for (i = 0; i < 4; i++) {       /* horizontal add of the 4 lanes */
            ans += tmp[i];
        }
        n -= 8;
    }
    return ans;
}
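As an aside, if GCC's in-line assembly syntax puts you off, the same loop can be written with SSE intrinsics from <xmmintrin.h>, which both GCC and Visual C++ accept. Here's a rough sketch (the dot_intrin name is just for illustration, and it makes the same n % 8 == 0 assumption as the assembly version):

#include <xmmintrin.h>  /* SSE intrinsics; portable across GCC and Visual C++ */

float dot_intrin(float *a, float *b, int n)
{
    __m128 sum = _mm_setzero_ps();
    float tmp[4];
    int i;
    /* Same assumption as above: n is a multiple of 8. */
    for (i = 0; i < n; i += 8) {
        __m128 a0 = _mm_loadu_ps(a + i);     /* unaligned loads, like movups */
        __m128 a1 = _mm_loadu_ps(a + i + 4);
        __m128 b0 = _mm_loadu_ps(b + i);
        __m128 b1 = _mm_loadu_ps(b + i + 4);
        sum = _mm_add_ps(sum, _mm_mul_ps(a0, b0));
        sum = _mm_add_ps(sum, _mm_mul_ps(a1, b1));
    }
    _mm_storeu_ps(tmp, sum);                 /* spill the 4 partial sums */
    return tmp[0] + tmp[1] + tmp[2] + tmp[3];
}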
float dot_c(float *a, float *b, int n)
{
    float ans = 0;
    int i;
    for (i = 0; i < n; i++) {
        ans += a[i] * b[i];
    }
    return ans;
}
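And a quick driver to check the versions against each other (the array size and fill values here are just something I made up). Don't expect the results to match bit for bit: the SSE version adds the products in a different order, so the low bits can differ.

#include <stdio.h>

int main(void)
{
    /* 1024 is a multiple of 8, which dot_asm requires. */
    static float a[1024], b[1024];
    int i;
    for (i = 0; i < 1024; i++) {
        a[i] = i * 0.001f;
        b[i] = (1024 - i) * 0.001f;
    }
    printf("asm: %f\n", dot_asm(a, b, 1024));
    printf("c:   %f\n", dot_c(a, b, 1024));
    return 0;
}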