views: 364
answers: 3

I encountered a strange problem where increasing my occupancy by increasing the number of threads reduced performance.

I created the following program to illustrate the problem:

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cutil.h>

__global__ void less_threads(float * d_out) {
    int num_inliers;
    for (int j=0;j<800;++j) {
        //Do 12 computations
        num_inliers += j*(j+1);
        num_inliers += j*(j+2);
        num_inliers += j*(j+3);
        num_inliers += j*(j+4);
        num_inliers += j*(j+5);
        num_inliers += j*(j+6);
        num_inliers += j*(j+7);
        num_inliers += j*(j+8);
        num_inliers += j*(j+9);
        num_inliers += j*(j+10);
        num_inliers += j*(j+11);
        num_inliers += j*(j+12);
    }

    if (threadIdx.x == -1)
        d_out[threadIdx.x] = num_inliers;
}

__global__ void more_threads(float *d_out) {
    int num_inliers;
    for (int j=0;j<800;++j) {
        // Do 4 computations
        num_inliers += j*(j+1);
        num_inliers += j*(j+2);
        num_inliers += j*(j+3);
        num_inliers += j*(j+4);
    }
    if (threadIdx.x == -1)
        d_out[threadIdx.x] = num_inliers;
}


int main(int argc, char* argv[])
{
    float *d_out = NULL;
    cudaMalloc((void**)&d_out,sizeof(float)*25000);

    more_threads<<<780,128>>>(d_out);
    less_threads<<<780,32>>>(d_out);


    return 0;
}

And the PTX output is:

    .entry _Z12less_threadsPf (
        .param .u32 __cudaparm__Z12less_threadsPf_d_out)
    {
    .reg .u32 %r<35>;
    .reg .f32 %f<3>;
    .reg .pred %p<4>;
    .loc    17  6   0
 //   2  #include <stdlib.h>
 //   3  #include <cuda_runtime.h>
 //   4  #include <cutil.h>
 //   5  
 //   6  __global__ void less_threads(float * d_out) {
$LBB1__Z12less_threadsPf:
    mov.s32     %r1, 0;
    mov.s32     %r2, 0;
    mov.s32     %r3, 0;
    mov.s32     %r4, 0;
    mov.s32     %r5, 0;
    mov.s32     %r6, 0;
    mov.s32     %r7, 0;
    mov.s32     %r8, 0;
    mov.s32     %r9, 0;
    mov.s32     %r10, 0;
    mov.s32     %r11, 0;
    mov.s32     %r12, %r13;
    mov.s32     %r14, 0;
$Lt_0_2562:
 //<loop> Loop body line 6, nesting depth: 1, iterations: 800
    .loc    17  10  0
 //   7     int num_inliers;
 //   8     for (int j=0;j<800;++j) {
 //   9         //Do 12 computations
 //  10         num_inliers += j*(j+1);
    mul.lo.s32  %r15, %r14, %r14;
    add.s32     %r16, %r12, %r14;
    add.s32     %r12, %r15, %r16;
    .loc    17  11  0
 //  11         num_inliers += j*(j+2);
    add.s32     %r17, %r15, %r12;
    add.s32     %r12, %r1, %r17;
    .loc    17  12  0
 //  12         num_inliers += j*(j+3);
    add.s32     %r18, %r15, %r12;
    add.s32     %r12, %r2, %r18;
    .loc    17  13  0
 //  13         num_inliers += j*(j+4);
    add.s32     %r19, %r15, %r12;
    add.s32     %r12, %r3, %r19;
    .loc    17  14  0
 //  14         num_inliers += j*(j+5);
    add.s32     %r20, %r15, %r12;
    add.s32     %r12, %r4, %r20;
    .loc    17  15  0
 //  15         num_inliers += j*(j+6);
    add.s32     %r21, %r15, %r12;
    add.s32     %r12, %r5, %r21;
    .loc    17  16  0
 //  16         num_inliers += j*(j+7);
    add.s32     %r22, %r15, %r12;
    add.s32     %r12, %r6, %r22;
    .loc    17  17  0
 //  17         num_inliers += j*(j+8);
    add.s32     %r23, %r15, %r12;
    add.s32     %r12, %r7, %r23;
    .loc    17  18  0
 //  18         num_inliers += j*(j+9);
    add.s32     %r24, %r15, %r12;
    add.s32     %r12, %r8, %r24;
    .loc    17  19  0
 //  19         num_inliers += j*(j+10);
    add.s32     %r25, %r15, %r12;
    add.s32     %r12, %r9, %r25;
    .loc    17  20  0
 //  20         num_inliers += j*(j+11);
    add.s32     %r26, %r15, %r12;
    add.s32     %r12, %r10, %r26;
    .loc    17  21  0
 //  21         num_inliers += j*(j+12);
    add.s32     %r27, %r15, %r12;
    add.s32     %r12, %r11, %r27;
    add.s32     %r14, %r14, 1;
    add.s32     %r11, %r11, 12;
    add.s32     %r10, %r10, 11;
    add.s32     %r9, %r9, 10;
    add.s32     %r8, %r8, 9;
    add.s32     %r7, %r7, 8;
    add.s32     %r6, %r6, 7;
    add.s32     %r5, %r5, 6;
    add.s32     %r4, %r4, 5;
    add.s32     %r3, %r3, 4;
    add.s32     %r2, %r2, 3;
    add.s32     %r1, %r1, 2;
    mov.u32     %r28, 1600;
    setp.ne.s32     %p1, %r1, %r28;
    @%p1 bra    $Lt_0_2562;
    cvt.u32.u16     %r29, %tid.x;
    mov.u32     %r30, -1;
    setp.ne.u32     %p2, %r29, %r30;
    @%p2 bra    $Lt_0_3074;
    .loc    17  25  0
 //  22     }
 //  23  
 //  24     if (threadIdx.x == -1)
 //  25         d_out[threadIdx.x] = num_inliers;
    cvt.rn.f32.s32  %f1, %r12;
    ld.param.u32    %r31, [__cudaparm__Z12less_threadsPf_d_out];
    mul24.lo.u32    %r32, %r29, 4;
    add.u32     %r33, %r31, %r32;
    st.global.f32   [%r33+0], %f1;
$Lt_0_3074:
    .loc    17  26  0
 //  26  }
    exit;
$LDWend__Z12less_threadsPf:
    } // _Z12less_threadsPf

    .entry _Z12more_threadsPf (
        .param .u32 __cudaparm__Z12more_threadsPf_d_out)
    {
    .reg .u32 %r<19>;
    .reg .f32 %f<3>;
    .reg .pred %p<4>;
    .loc    17  28  0
 //  27  
 //  28  __global__ void more_threads(float *d_out) {
$LBB1__Z12more_threadsPf:
    mov.s32     %r1, 0;
    mov.s32     %r2, 0;
    mov.s32     %r3, 0;
    mov.s32     %r4, %r5;
    mov.s32     %r6, 0;
$Lt_1_2562:
 //<loop> Loop body line 28, nesting depth: 1, iterations: 800
    .loc    17  32  0
 //  29     int num_inliers;
 //  30     for (int j=0;j<800;++j) {
 //  31         // Do 4 computations
 //  32         num_inliers += j*(j+1);
    mul.lo.s32  %r7, %r6, %r6;
    add.s32     %r8, %r4, %r6;
    add.s32     %r4, %r7, %r8;
    .loc    17  33  0
 //  33         num_inliers += j*(j+2);
    add.s32     %r9, %r7, %r4;
    add.s32     %r4, %r1, %r9;
    .loc    17  34  0
 //  34         num_inliers += j*(j+3);
    add.s32     %r10, %r7, %r4;
    add.s32     %r4, %r2, %r10;
    .loc    17  35  0
 //  35         num_inliers += j*(j+4);
    add.s32     %r11, %r7, %r4;
    add.s32     %r4, %r3, %r11;
    add.s32     %r6, %r6, 1;
    add.s32     %r3, %r3, 4;
    add.s32     %r2, %r2, 3;
    add.s32     %r1, %r1, 2;
    mov.u32     %r12, 1600;
    setp.ne.s32     %p1, %r1, %r12;
    @%p1 bra    $Lt_1_2562;
    cvt.u32.u16     %r13, %tid.x;
    mov.u32     %r14, -1;
    setp.ne.u32     %p2, %r13, %r14;
    @%p2 bra    $Lt_1_3074;
    .loc    17  38  0
 //  36     }
 //  37     if (threadIdx.x == -1)
 //  38         d_out[threadIdx.x] = num_inliers;
    cvt.rn.f32.s32  %f1, %r4;
    ld.param.u32    %r15, [__cudaparm__Z12more_threadsPf_d_out];
    mul24.lo.u32    %r16, %r13, 4;
    add.u32     %r17, %r15, %r16;
    st.global.f32   [%r17+0], %f1;
$Lt_1_3074:
    .loc    17  39  0
 //  39  }
    exit;
$LDWend__Z12more_threadsPf:
    } // _Z12more_threadsPf

Note that both kernels should do the same amount of work in total (the if (threadIdx.x == -1) test is a trick to stop the compiler from optimising everything away and leaving an empty kernel). The work should be the same because more_threads uses 4 times as many threads, but with each thread doing a quarter of the work.

Significant results from the profiler are as follows:

                      more_threads   less_threads
    GPU runtime (us)          1474            921
    reg per thread               6             14
    occupancy                    1           0.25
    branch                   83746          20956
    divergent_branch            26             26
    instructions            584065         312663
    gst request            1084552         677381

As I said previously, the kernel using more threads has a longer run time; this could be due to the increased number of instructions.

Why are there more instructions?

Why is there any branching, let alone divergent branching, considering there is no conditional code?

Why are there any gst requests when there is no global memory access?

What is going on here!

Thanks

Update

Added PTX code and fixed CUDA C so it should compile

A: 
  1. The two functions have a different number of code lines, so a different number of instructions.

  2. The for loop is implemented using branches; the last branch of the code is always divergent.

  3. A global store request is not the same as a global store: the operation is set up, but never committed.

aaa
+4  A: 

Since your code has only arithmetic instructions, you don't need very high occupancy to hide the latency of the arithmetic units. Indeed, even if you do have memory instructions you can maximise performance with ~50% occupancy provided your reads/writes are efficient. See the recorded Advanced CUDA C presentation for more information on occupancy and performance.
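
For reference, newer toolkits can report the theoretical occupancy of a given block size directly. This is only a sketch: the occupancy API below was added in CUDA 6.5 (the toolkit used in the question would rely on the occupancy calculator spreadsheet instead), and it assumes the kernels from the question are defined in the same file.

#include <stdio.h>
#include <cuda_runtime.h>

__global__ void more_threads(float *d_out);   // kernel from the question

int main()
{
    int blocks_per_sm = 0;
    // How many 128-thread blocks of more_threads can be resident on one SM?
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocks_per_sm, more_threads, 128, 0);

    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    float occupancy = (float)(blocks_per_sm * 128) / prop.maxThreadsPerMultiProcessor;
    printf("resident blocks/SM = %d, theoretical occupancy = %.2f\n",
           blocks_per_sm, occupancy);
    return 0;
}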

In your case, given that your kernel doesn't need high occupancy to saturate the arithmetic units, you will have better performance using fewer, larger blocks than more, smaller blocks, since there is a cost to launching blocks. In general, however, the cost of launching blocks is negligible compared with the time taken to actually run the code.

Why are there more instructions?

Remember that the counters do not count per block (aka CTA) but per SM (Streaming Multiprocessor) or per TPC (Texture Processing Cluster), which is a group of two or three SMs depending on your device. The instruction count is per SM.

It is fair to expect the less_threads kernel to have fewer instructions per thread; however, more_threads launches four times as many warps per block, which means each SM executes the (shorter) more_threads code approximately four times as often. Taking the shorter kernel body into account, your measurement doesn't seem unreasonable.
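
To put rough numbers on that (counting the loop bodies in the PTX above; the profiler also counts prologue/epilogue instructions and depends on how many blocks land on the profiled SM, so treat these as estimates):

  • less_threads: ~40 instructions per iteration × 800 iterations × 1 warp per block ≈ 32,000 warp instructions per block
  • more_threads: ~16 instructions per iteration × 800 iterations × 4 warps per block ≈ 51,200 warp instructions per block

That estimated ratio of roughly 1.6 is in the same ballpark as the measured 584065 / 312663 ≈ 1.9.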

Why is there any branching?

Actually you do have conditional code:

for (int j=0;j<800;++j)

This has a condition; however, all threads within a warp execute the same path, so it is not divergent. My guess is that the divergence is in the administration code somewhere; you could look at the PTX code to analyse this if you were worried. 26 is very low compared with the number of instructions executed, so it will not affect your performance.

Why are there any gst requests?

In your code you have:

if (threadIdx.x == -1)
  d_out[blockIdx.x*blockDim.x+threadIdx.x] = num_inliers;

This will be handled by the load/store unit and hence counted, even though it results in no actual transaction. The gst_32/gst_64/gst_128 counters indicate actual memory transfers (your device has compute capability 1.2 or 1.3; older devices have different sets of counters).
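
If you want to convince yourself that the store never actually commits, a small host-side check (a sketch meant to drop into the question's main() after the cudaMalloc) is to zero the buffer, launch, and read a few values back:

    float readback[4];
    cudaMemset(d_out, 0, sizeof(float) * 25000);      // sentinel: all zeros
    less_threads<<<780, 32>>>(d_out);
    cudaMemcpy(readback, d_out, sizeof(readback), cudaMemcpyDeviceToHost);  // implicit sync
    for (int i = 0; i < 4; ++i)
        printf("d_out[%d] = %f\n", i, readback[i]);   // expected: all 0.0, the store never committed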

Tom
Thanks for the informative answers. I am still unsure about the number of instructions. As you noted, less_threads does 4 times as many num_inliers += j*(j+n) computations but uses 4 times fewer threads. The number of instructions is extrapolated from a single SM or TPC, but overall shouldn't the numbers be equal?
zenna
From examining the PTX it is apparent that, due to the overhead of the for loop, more_threads is actually executing fewer instructions. Could this account for the difference? If it did, then proportionally increasing the work done within the loops of both kernels would diminish the for-loop overhead, but in my tests it does not.
zenna
I have seen the Advanced CUDA tutorial and I understand performance may not increase above ~50% occupancy, but what we are seeing here is a decrease in performance. The number of blocks launched is the same; only the number of threads per block in less_threads is smaller.
zenna
+2  A: 

The two functions are not doing the same amount of work.

more_threads<<<780, 128>>>():

  • 780 blocks
  • 128 threads per block
  • 4 mul per loop
  • 8 add per loop
  • 780*128*800*(4+8) = 958,464,000 operations

less_threads<<<780, 32>>>():

  • 780 blocks
  • 32 threads per block
  • 12 mul per loop
  • 24 add per loop
  • 780*32*800*(12+24) = 718,848,000 operations

So, more_threads is doing more work than less_threads, which is why the number of instructions goes up and why more_threads is slower. To fix more_threads, do only 3 computations inside the loop: 780*128*800*(3+6) = 718,848,000.
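
For illustration, a work-matched version of more_threads might look like this (a sketch; only the number of multiply-add lines per iteration matters):

__global__ void more_threads_matched(float *d_out) {
    int num_inliers = 0;
    for (int j = 0; j < 800; ++j) {
        // Do 3 computations: 780*128*800*(3+6) = 718,848,000 operations,
        // the same total as less_threads<<<780,32>>>
        num_inliers += j*(j+1);
        num_inliers += j*(j+2);
        num_inliers += j*(j+3);
    }
    if (threadIdx.x == -1)                 // same trick as in the question to keep
        d_out[threadIdx.x] = num_inliers;  // the compiler from removing the loop
}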

mch