I have an application where part of the inner loop was basically:
double sum = 0;
for (int i = 0; i != N; ++i, ++data, ++x) sum += *data * x;
If x is an unsigned int, then the code takes 3 times as long as with int!
This was part of a larger code-base, but I got it down to the essentials:
#include <iostream>
#include <cstdlib>
#include <vector>
#include <time.h>
typedef unsigned char uint8;
template<typename T>
double moments(const uint8* data, int N, T wrap) {
T pos = 0;
double sum = 0.;
for (int i = 0; i != N; ++i, ++data) {
sum += *data * pos;
++pos;
if (pos == wrap) pos = 0;
}
return sum;
}
template<typename T>
const char* name() { return "unknown"; }
template<>
const char* name<int>() { return "int"; }
template<>
const char* name<unsigned int>() { return "unsigned int"; }
const int Nr_Samples = 10 * 1000;
template<typename T>
void measure(const std::vector<uint8>& data) {
const uint8* dataptr = &data[0];
double moments_results[Nr_Samples];
time_t start, end;
time(&start);
for (int i = 0; i != Nr_Samples; ++i) {
moments_results[i] = moments<T>(dataptr, data.size(), 128);
}
time(&end);
double avg = 0.0;
for (int i = 0; i != Nr_Samples; ++i) avg += moments_results[i];
avg /= Nr_Samples;
std::cout << "With " << name<T>() << ": " << avg << " in " << (end - start) << "secs" << std::endl;
}
int main() {
std::vector<uint8> data(128*1024);
for (int i = 0; i != data.size(); ++i) data[i] = std::rand();
measure<int>(data);
measure<unsigned int>(data);
measure<int>(data);
return 0;
}
Compiling with no optimization:
luispedro@oakeshott:/home/luispedro/tmp/so §g++ test.cpp
luispedro@oakeshott:/home/luispedro/tmp/so §./a.out
With int: 1.06353e+09 in 9secs
With unsigned int: 1.06353e+09 in 14secs
With int: 1.06353e+09 in 9secs
With optimization:
luispedro@oakeshott:/home/luispedro/tmp/so §g++ -O3 test.cpp
luispedro@oakeshott:/home/luispedro/tmp/so §./a.out
With int: 1.06353e+09 in 3secs
With unsigned int: 1.06353e+09 in 12secs
With int: 1.06353e+09 in 4secs
I don't understand why such a large difference in speed. I tried figuring it out from the generated assembly, but I got nowhere. Anyone have any thoughts?
Is this something to do with the hardware or is it a limitation of gcc's optimisation machinery? I'm betting the second.
My machine is an Intel 32 bit running Ubuntu 9.10.
Edit: Since Stephen asked, here is the de-compiled source (from a -O3 compilation). I believe I got the main loops:
int version:
40: 0f b6 14 0b movzbl (%ebx,%ecx,1),%edx
sum += *data * pos;
44: 0f b6 d2 movzbl %dl,%edx
47: 0f af d0 imul %eax,%edx
++pos;
4a: 83 c0 01 add $0x1,%eax
sum += *data * pos;
4d: 89 95 54 c7 fe ff mov %edx,-0x138ac(%ebp)
++pos;
if (pos == wrap) pos = 0;
53: 31 d2 xor %edx,%edx
55: 3d 80 00 00 00 cmp $0x80,%eax
5a: 0f 94 c2 sete %dl
T pos = 0;
double sum = 0.;
for (int i = 0; i != N; ++i, ++data) {
5d: 83 c1 01 add $0x1,%ecx
sum += *data * pos;
60: db 85 54 c7 fe ff fildl -0x138ac(%ebp)
++pos;
if (pos == wrap) pos = 0;
66: 83 ea 01 sub $0x1,%edx
69: 21 d0 and %edx,%eax
T pos = 0;
double sum = 0.;
for (int i = 0; i != N; ++i, ++data) {
6b: 39 f1 cmp %esi,%ecx
sum += *data * pos;
6d: de c1 faddp %st,%st(1)
T pos = 0;
double sum = 0.;
for (int i = 0; i != N; ++i, ++data) {
6f: 75 cf jne 40
unsigned version:
50: 0f b6 34 13 movzbl (%ebx,%edx,1),%esi
sum += *data * pos;
54: 81 e6 ff 00 00 00 and $0xff,%esi
5a: 31 ff xor %edi,%edi
5c: 0f af f0 imul %eax,%esi
++pos;
5f: 83 c0 01 add $0x1,%eax
if (pos == wrap) pos = 0;
62: 3d 80 00 00 00 cmp $0x80,%eax
67: 0f 94 c1 sete %cl
T pos = 0;
double sum = 0.;
for (int i = 0; i != N; ++i, ++data) {
6a: 83 c2 01 add $0x1,%edx
sum += *data * pos;
6d: 89 bd 54 c7 fe ff mov %edi,-0x138ac(%ebp)
73: 89 b5 50 c7 fe ff mov %esi,-0x138b0(%ebp)
++pos;
if (pos == wrap) pos = 0;
79: 89 ce mov %ecx,%esi
7b: 81 e6 ff 00 00 00 and $0xff,%esi
sum += *data * pos;
81: df ad 50 c7 fe ff fildll -0x138b0(%ebp)
++pos;
if (pos == wrap) pos = 0;
87: 83 ee 01 sub $0x1,%esi
8a: 21 f0 and %esi,%eax
for (int i = 0; i != N; ++i, ++data) {
8c: 3b 95 34 c7 fe ff cmp -0x138cc(%ebp),%edx
sum += *data * pos;
92: de c1 faddp %st,%st(1)
for (int i = 0; i != N; ++i, ++data) {
94: 75 ba jne 50
This is the -O3 version, which is why the source lines jump up and down. Thank you.