views:

302

answers:

7

OK, the question title is a bit crappy, but I didn't really know how to phrase this better.

The problem I have is that given a std::vector<T> vs. a T* + size_t count my compiler (Visual Studio 2005 / VC++ 8) will actually generate worse code when looping over the pointer than when looping over the vector.

That is, I have a test struct containing a vector and another one containing a pointer + count. Now, when writing the semantically exact same looping construct, the version with the std::vector is significantly (which is to say > 10%) faster than the version with the pointer.

Below you will find the code as well as the generated assembly. It would be great if someone could explain what's going on here.

If you look at the assembly, you can note how the raw pointer version generates slightly more instructions. It would already be a very nice answer if anyone could explain how these versions differ semantically on the assembly level.

And please refrain from answers telling me I shouldn't care, premature optimization, root of all evil, etc. In this specific case I do care and anyway I think it is a rather interesting puzzle! :-)


Compiler settings:

  • Full Optimization (/Ox)
  • Whole Program Opt. = NO

Here comes the code:

stdafx.h

// Disable secure STL stuff!
#define _SECURE_SCL 0
#define _SECURE_SCL_THROWS 0
#include <iostream>
#include <iomanip>
#include <vector>
#include <mmsystem.h>

header file

// loop1.h
typedef int PodType;

const size_t container_size = 3;
extern volatile size_t g_read_size;

void side_effect();

struct RawX {
    PodType* pData;
    PodType wCount;

    RawX()
    : pData(NULL)
    , wCount(0)
    { }

    ~RawX() {
        delete[] pData;
        pData = NULL;
        wCount = 0;
    }

    void Resize(PodType n) {
        delete[] pData;
        wCount = n;
        pData = new PodType[wCount];
    }
private:
    RawX(RawX const&);
    RawX& operator=(RawX const&);
};

struct VecX {
    std::vector<PodType> vData;
};

void raw_loop(const int n, RawX* obj);
void raw_iterator_loop(const int n, RawX* obj);
void vector_loop(const int n, VecX* obj);
void vector_iterator_loop(const int n, VecX* obj);

implementation file

// loop1.cpp
void raw_loop(const int n, RawX* obj)
{
    for(int i=0; i!=n; ++i) {
        side_effect();
        for(int j=0, e=obj->wCount; j!=e; ++j) {
            g_read_size = obj->pData[j];
            side_effect();
        }
        side_effect();
    }
}

void raw_iterator_loop(const int n, RawX* obj)
{
    for(int i=0; i!=n; ++i) {
        side_effect();
        for(PodType *j=obj->pData, *e=obj->pData+size_t(obj->wCount); j!=e; ++j) {
            g_read_size = *j;
            side_effect();
        }
        side_effect();
    }
}

void vector_loop(const int n, VecX* obj)
{
    for(int i=0; i!=n; ++i) {
        side_effect();
        for(size_t j=0, e=obj->vData.size(); j!=e; ++j) {
            g_read_size = obj->vData[j];
            side_effect();
        }
        side_effect();
    }
}

void vector_iterator_loop(const int n, VecX* obj)
{
    for(int i=0; i!=n; ++i) {
        side_effect();
        for(std::vector<PodType>::const_iterator j=obj->vData.begin(), e=obj->vData.end(); j!=e; ++j) {
            g_read_size = *j;
            side_effect();
        }
        side_effect();      
    }
}

test main file

using namespace std;

volatile size_t g_read_size;
void side_effect()
{
    g_read_size = 0;
}

typedef size_t Value;

template<typename Container>
Value average(Container const& c)
{
    const Value sz = c.size();
    Value sum = 0;
    for(Container::const_iterator i=c.begin(), e=c.end(); i!=e; ++i)
        sum += *i;
    return sum/sz;

}

void take_timings()
{
    const int x = 10;
    const int n = 10*1000*1000;

    VecX vobj;
    vobj.vData.resize(container_size);
    RawX robj;
    robj.Resize(container_size);

    std::vector<DWORD> raw_times;
    std::vector<DWORD> vec_times;
    std::vector<DWORD> rit_times;
    std::vector<DWORD> vit_times;

    for(int i=0; i!=x; ++i) {
        const DWORD t1 = timeGetTime();
        raw_loop(n, &robj);
        const DWORD t2 = timeGetTime();
        vector_loop(n, &vobj);
        const DWORD t3 = timeGetTime();
        raw_iterator_loop(n, &robj);
        const DWORD t4 = timeGetTime();
        vector_iterator_loop(n, &vobj);
        const DWORD t5 = timeGetTime();
        raw_times.push_back(t2-t1);
        vec_times.push_back(t3-t2);
        rit_times.push_back(t4-t3);
        vit_times.push_back(t5-t4);
    }

    cout << "Average over " << x << " iterations for loops with count " << n << " ...\n";
    cout << "The PodType is '" << typeid(PodType).name() << "'\n";
    cout << "raw_loop: " << setw(10) << average(raw_times) << " ms \n";
    cout << "vec_loop: " << setw(10) << average(vec_times) << " ms \n";
    cout << "rit_loop: " << setw(10) << average(rit_times) << " ms \n";
    cout << "vit_loop: " << setw(10) << average(vit_times) << " ms \n";
}

int main()
{
    take_timings();
    return 0;
}

Here comes the generated assembly as displayed by the visual studio debugger (for the 2 functions with the "iterators".

*raw_iterator_loop*

void raw_iterator_loop(const int n, RawX* obj)
{
    for(int i=0; i!=n; ++i) {
00  mov         eax,dword ptr [esp+4] 
00  test        eax,eax 
00  je          raw_iterator_loop+53h (4028C3h) 
00  push        ebx  
00  mov         ebx,dword ptr [esp+0Ch] 
00  push        ebp  
00  push        esi  
00  push        edi  
00  mov         ebp,eax 
        side_effect();
00  call        side_effect (401020h) 
        for(PodType *j=obj->pData, *e=obj->pData+size_t(obj->wCount); j!=e; ++j) {
00  movzx       eax,word ptr [ebx+4] 
00  mov         esi,dword ptr [ebx] 
00  lea         edi,[esi+eax*2] 
00  cmp         esi,edi 
00  je          raw_iterator_loop+45h (4028B5h) 
00  jmp         raw_iterator_loop+30h (4028A0h) 
00  lea         esp,[esp] 
00  lea         ecx,[ecx] 
            g_read_size = *j;
00  movzx       ecx,word ptr [esi] 
00  mov         dword ptr [g_read_size (4060B0h)],ecx 
            side_effect();
00  call        side_effect (401020h) 
00  add         esi,2 
00  cmp         esi,edi 
00  jne         raw_iterator_loop+30h (4028A0h) 
        }
        side_effect();
00  call        side_effect (401020h) 
00  sub         ebp,1 
00  jne         raw_iterator_loop+12h (402882h) 
00  pop         edi  
00  pop         esi  
00  pop         ebp  
00  pop         ebx  
    }
}
00  ret              

*vector_iterator_loop*

void vector_iterator_loop(const int n, VecX* obj)
{
    for(int i=0; i!=n; ++i) {
00  mov         eax,dword ptr [esp+4] 
00  test        eax,eax 
00  je          vector_iterator_loop+43h (402813h) 
00  push        ebx  
00  mov         ebx,dword ptr [esp+0Ch] 
00  push        ebp  
00  push        esi  
00  push        edi  
00  mov         ebp,eax 
        side_effect();
00  call        side_effect (401020h) 
        for(std::vector<PodType>::const_iterator j=obj->vData.begin(), e=obj->vData.end(); j!=e; ++j) {
00  mov         esi,dword ptr [ebx+4] 
00  mov         edi,dword ptr [ebx+8] 
00  cmp         esi,edi 
00  je          vector_iterator_loop+35h (402805h) 
            g_read_size = *j;
00  movzx       eax,word ptr [esi] 
00  mov         dword ptr [g_read_size (4060B0h)],eax 
            side_effect();
00  call        side_effect (401020h) 
00  add         esi,2 
00  cmp         esi,edi 
00  jne         vector_iterator_loop+21h (4027F1h) 
        }
        side_effect();      
00  call        side_effect (401020h) 
00  sub         ebp,1 
00  jne         vector_iterator_loop+12h (4027E2h) 
00  pop         edi  
00  pop         esi  
00  pop         ebp  
00  pop         ebx  
    }
}
00  ret          
A: 

Looking at the assembly generated for the inner loops, they are essentially identical except for one register change. I wouldn't expect any timing difference on that basis.

            g_read_size = *j; 
00  movzx       ecx,word ptr [esi]  
00  mov         dword ptr [g_read_size (4060B0h)],ecx  
            side_effect(); 
00  call        side_effect (401020h)  
00  add         esi,2  
00  cmp         esi,edi  
00  jne         raw_iterator_loop+30h (4028A0h)  


            g_read_size = *j;  
00  movzx       eax,word ptr [esi]   
00  mov         dword ptr [g_read_size (4060B0h)],eax   
            side_effect();  
00  call        side_effect (401020h)   
00  add         esi,2   
00  cmp         esi,edi   
00  jne         vector_iterator_loop+21h (4027F1h)   

I had a similar question on code timing where I couldn't explain a difference in two pieces of code. I never got a definitive answer on that one, although in the end I convinced myself that it was a case of code alignment. http://stackoverflow.com/questions/688325/how-can-adding-code-to-a-loop-make-it-faster

Mark Ransom
A: 

Try changing PodType from int to size_t. The conversion there complicates the loop initialization code.

ybungalobill
+1  A: 

The compiler has no way to know that side_effect() doesn't change obj->pData. Storing things that cannot change in local variables can speed up tight loops, especially when functions that the compiler cannot analyze are called inside the loop.

p.S.: this affects raw_loop and vector_loop. raw_loop can be "fixed" by using local variables, vector_loop can't. It can't, because there will be an array-pointer inside the vector, and the compiler also has no way of knowing that nothing will change the array-pointer inside the vector. After all, side_effect could call e.g. push_back(). Of course, if the compiler can inline any of the loop-functions, it could possibly optimize better. E.g. if the used vector is a local variable who's address isn't known outside the function, and only passed to the loop functions. Then the compiler could again know that side_effect cannot change the vector (because it cannot know it's address), and optimize better. -> Make sure the compiler cannot inline the function if you want to optimize it for non-inlining cases.

pgroke
A: 

I would have expected the optimizer to be smart enough to remove any differences, but typically a decrement-and-compare-to-zero is faster than a comparison to a non-zero pointer.

e.g.

void countdown_loop(int n, RawX* obj)
{
    if (!n) return;
    size_t const wc = obj->wCount;
    if (!wc) return;
    PodType* const first = obj->pData;
    do {
        side_effect();
        size_t i = wc;
        PodType* p = first;
        do {
            g_read_size = *p;
            side_effect();
            ++p;
        } while (--i);
        side_effect();
    } while (--n);
}
Ben Voigt
+1  A: 

One concrete reason for the difference in instructions generated is that Visual C++ vector has members _Myfirst and _Mylast (corresponding to begin() and end()) that simplify the loop setup.

In the raw case, the compiler has to do actual pointer math to set up the required start and end locals.

It's feasible that this complicates register usage enough to make the vector code faster.

Steve Townsend
+6  A: 

While my version of the generated machine code is different from yours (MSVC++ 2005), one difference between the two variants is pretty much the same as in your code:

  • In vector version of the code the "end iterator" value is pre-calculated and stored as a member of std::vector object, so the inner loop simply loads the readily available value.

  • In raw pointer version the "end iterator" value is calculated explicitly in the header of the inner cycle (by a lea instruction used to implement multiplication), meaning that each iteration of the outer cycle performs that calculation again and again.

If you re-implement your raw_iterator_loop as follows (i.e. pull the calculation of the end pointer out of the outer loop)

void raw_iterator_loop(const int n, RawX* obj)
{
    PodType *e = obj->pData+size_t(obj->wCount);

    for(int i=0; i!=n; ++i) {
        side_effect();
        for(PodType *j=obj->pData; j!=e; ++j) {
            g_read_size = *j;
            side_effect();
        }
        side_effect();
    }
}

(or even store and maintain the end pointer in your class) you should end up with a more "fair" comparison.

AndreyT
Yay! That makes sense: *the "end iterator" value is pre-calculated and stored as a member of std::vector object* ... I forgot that the vector stores the end pointer and not the size!
Martin
Marking this as answer because it explains the assembly differences.
Martin
+1  A: 

Your timings might reflect the fact that the initial raw_loop pays the cost for loading the cache. Do you get similar timings if you reorder the tests to do the vector tests first (Or you could throw out the first iteration, or make each test a separate program).

Joe Valenzuela