I encountered a strange problem where increasing my occupancy by increasing the number of threads reduced performance.
I created the following program to illustrate the problem:
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cutil.h>
// Kernel doing 12 multiply-add accumulations per loop iteration.
// Intended to be launched with 1/4 the threads of more_threads so the
// total work across the grid is the same.
// NOTE: the `threadIdx.x == -1` guard is deliberately never true
// (threadIdx.x is unsigned and < blockDim.x <= 1024, while -1 converts
// to UINT_MAX); it exists only to keep the compiler from optimising the
// whole loop away. d_out is therefore never actually written.
__global__ void less_threads(float *d_out) {
    // BUG FIX: num_inliers was read before being initialized (undefined
    // behavior; the generated PTX showed it copied from an uninitialized
    // register). Start the accumulator at 0.
    int num_inliers = 0;
    for (int j = 0; j < 800; ++j) {
        // Do 12 computations
        num_inliers += j*(j+1);
        num_inliers += j*(j+2);
        num_inliers += j*(j+3);
        num_inliers += j*(j+4);
        num_inliers += j*(j+5);
        num_inliers += j*(j+6);
        num_inliers += j*(j+7);
        num_inliers += j*(j+8);
        num_inliers += j*(j+9);
        num_inliers += j*(j+10);
        num_inliers += j*(j+11);
        num_inliers += j*(j+12);
    }
    // Never-taken branch: prevents dead-code elimination of the loop.
    if (threadIdx.x == -1)
        d_out[threadIdx.x] = num_inliers;
}
// Kernel doing 4 multiply-add accumulations per loop iteration.
// Intended to be launched with 4x the threads of less_threads so the
// total work across the grid is the same.
// NOTE: the `threadIdx.x == -1` guard is deliberately never true
// (threadIdx.x is unsigned, so -1 converts to UINT_MAX); it exists only
// to keep the compiler from optimising the loop away. d_out is never
// actually written.
__global__ void more_threads(float *d_out) {
    // BUG FIX: num_inliers was read before being initialized (undefined
    // behavior). Start the accumulator at 0.
    int num_inliers = 0;
    for (int j = 0; j < 800; ++j) {
        // Do 4 computations
        num_inliers += j*(j+1);
        num_inliers += j*(j+2);
        num_inliers += j*(j+3);
        num_inliers += j*(j+4);
    }
    // Never-taken branch: prevents dead-code elimination of the loop.
    if (threadIdx.x == -1)
        d_out[threadIdx.x] = num_inliers;
}
// Host driver: allocates the (never actually written) output buffer and
// launches both kernels with launch configs sized so total work matches:
// more_threads uses 4x the threads per block, each doing 1/4 the work.
int main(int argc, char* argv[])
{
    float *d_out = NULL;
    // Check the allocation instead of silently passing a NULL/stale
    // pointer to the kernels on failure.
    cudaError_t err = cudaMalloc((void**)&d_out, sizeof(float) * 25000);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    more_threads<<<780, 128>>>(d_out);
    // Kernel launches are asynchronous and don't return errors directly;
    // catch bad launch configurations here.
    err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "more_threads launch failed: %s\n",
                cudaGetErrorString(err));
    }

    less_threads<<<780, 32>>>(d_out);
    err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "less_threads launch failed: %s\n",
                cudaGetErrorString(err));
    }

    // Wait for both kernels and surface any asynchronous execution error
    // before tearing down (the original returned while work was in flight).
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        fprintf(stderr, "kernel execution failed: %s\n",
                cudaGetErrorString(err));
    }

    // Release the device allocation (was leaked in the original).
    cudaFree(d_out);
    return 0;
}
And the PTX output is:
.entry _Z12less_threadsPf (
.param .u32 __cudaparm__Z12less_threadsPf_d_out)
{
.reg .u32 %r<35>;
.reg .f32 %f<3>;
.reg .pred %p<4>;
.loc 17 6 0
// 2 #include <stdlib.h>
// 3 #include <cuda_runtime.h>
// 4 #include <cutil.h>
// 5
// 6 __global__ void less_threads(float * d_out) {
$LBB1__Z12less_threadsPf:
mov.s32 %r1, 0;
mov.s32 %r2, 0;
mov.s32 %r3, 0;
mov.s32 %r4, 0;
mov.s32 %r5, 0;
mov.s32 %r6, 0;
mov.s32 %r7, 0;
mov.s32 %r8, 0;
mov.s32 %r9, 0;
mov.s32 %r10, 0;
mov.s32 %r11, 0;
mov.s32 %r12, %r13;
mov.s32 %r14, 0;
$Lt_0_2562:
//<loop> Loop body line 6, nesting depth: 1, iterations: 800
.loc 17 10 0
// 7 int num_inliers;
// 8 for (int j=0;j<800;++j) {
// 9 //Do 12 computations
// 10 num_inliers += j*(j+1);
mul.lo.s32 %r15, %r14, %r14;
add.s32 %r16, %r12, %r14;
add.s32 %r12, %r15, %r16;
.loc 17 11 0
// 11 num_inliers += j*(j+2);
add.s32 %r17, %r15, %r12;
add.s32 %r12, %r1, %r17;
.loc 17 12 0
// 12 num_inliers += j*(j+3);
add.s32 %r18, %r15, %r12;
add.s32 %r12, %r2, %r18;
.loc 17 13 0
// 13 num_inliers += j*(j+4);
add.s32 %r19, %r15, %r12;
add.s32 %r12, %r3, %r19;
.loc 17 14 0
// 14 num_inliers += j*(j+5);
add.s32 %r20, %r15, %r12;
add.s32 %r12, %r4, %r20;
.loc 17 15 0
// 15 num_inliers += j*(j+6);
add.s32 %r21, %r15, %r12;
add.s32 %r12, %r5, %r21;
.loc 17 16 0
// 16 num_inliers += j*(j+7);
add.s32 %r22, %r15, %r12;
add.s32 %r12, %r6, %r22;
.loc 17 17 0
// 17 num_inliers += j*(j+8);
add.s32 %r23, %r15, %r12;
add.s32 %r12, %r7, %r23;
.loc 17 18 0
// 18 num_inliers += j*(j+9);
add.s32 %r24, %r15, %r12;
add.s32 %r12, %r8, %r24;
.loc 17 19 0
// 19 num_inliers += j*(j+10);
add.s32 %r25, %r15, %r12;
add.s32 %r12, %r9, %r25;
.loc 17 20 0
// 20 num_inliers += j*(j+11);
add.s32 %r26, %r15, %r12;
add.s32 %r12, %r10, %r26;
.loc 17 21 0
// 21 num_inliers += j*(j+12);
add.s32 %r27, %r15, %r12;
add.s32 %r12, %r11, %r27;
add.s32 %r14, %r14, 1;
add.s32 %r11, %r11, 12;
add.s32 %r10, %r10, 11;
add.s32 %r9, %r9, 10;
add.s32 %r8, %r8, 9;
add.s32 %r7, %r7, 8;
add.s32 %r6, %r6, 7;
add.s32 %r5, %r5, 6;
add.s32 %r4, %r4, 5;
add.s32 %r3, %r3, 4;
add.s32 %r2, %r2, 3;
add.s32 %r1, %r1, 2;
mov.u32 %r28, 1600;
setp.ne.s32 %p1, %r1, %r28;
@%p1 bra $Lt_0_2562;
cvt.u32.u16 %r29, %tid.x;
mov.u32 %r30, -1;
setp.ne.u32 %p2, %r29, %r30;
@%p2 bra $Lt_0_3074;
.loc 17 25 0
// 22 }
// 23
// 24 if (threadIdx.x == -1)
// 25 d_out[threadIdx.x] = num_inliers;
cvt.rn.f32.s32 %f1, %r12;
ld.param.u32 %r31, [__cudaparm__Z12less_threadsPf_d_out];
mul24.lo.u32 %r32, %r29, 4;
add.u32 %r33, %r31, %r32;
st.global.f32 [%r33+0], %f1;
$Lt_0_3074:
.loc 17 26 0
// 26 }
exit;
$LDWend__Z12less_threadsPf:
} // _Z12less_threadsPf
.entry _Z12more_threadsPf (
.param .u32 __cudaparm__Z12more_threadsPf_d_out)
{
.reg .u32 %r<19>;
.reg .f32 %f<3>;
.reg .pred %p<4>;
.loc 17 28 0
// 27
// 28 __global__ void more_threads(float *d_out) {
$LBB1__Z12more_threadsPf:
mov.s32 %r1, 0;
mov.s32 %r2, 0;
mov.s32 %r3, 0;
mov.s32 %r4, %r5;
mov.s32 %r6, 0;
$Lt_1_2562:
//<loop> Loop body line 28, nesting depth: 1, iterations: 800
.loc 17 32 0
// 29 int num_inliers;
// 30 for (int j=0;j<800;++j) {
// 31 // Do 4 computations
// 32 num_inliers += j*(j+1);
mul.lo.s32 %r7, %r6, %r6;
add.s32 %r8, %r4, %r6;
add.s32 %r4, %r7, %r8;
.loc 17 33 0
// 33 num_inliers += j*(j+2);
add.s32 %r9, %r7, %r4;
add.s32 %r4, %r1, %r9;
.loc 17 34 0
// 34 num_inliers += j*(j+3);
add.s32 %r10, %r7, %r4;
add.s32 %r4, %r2, %r10;
.loc 17 35 0
// 35 num_inliers += j*(j+4);
add.s32 %r11, %r7, %r4;
add.s32 %r4, %r3, %r11;
add.s32 %r6, %r6, 1;
add.s32 %r3, %r3, 4;
add.s32 %r2, %r2, 3;
add.s32 %r1, %r1, 2;
mov.u32 %r12, 1600;
setp.ne.s32 %p1, %r1, %r12;
@%p1 bra $Lt_1_2562;
cvt.u32.u16 %r13, %tid.x;
mov.u32 %r14, -1;
setp.ne.u32 %p2, %r13, %r14;
@%p2 bra $Lt_1_3074;
.loc 17 38 0
// 36 }
// 37 if (threadIdx.x == -1)
// 38 d_out[threadIdx.x] = num_inliers;
cvt.rn.f32.s32 %f1, %r4;
ld.param.u32 %r15, [__cudaparm__Z12more_threadsPf_d_out];
mul24.lo.u32 %r16, %r13, 4;
add.u32 %r17, %r15, %r16;
st.global.f32 [%r17+0], %f1;
$Lt_1_3074:
.loc 17 39 0
// 39 }
exit;
$LDWend__Z12more_threadsPf:
} // _Z12more_threadsPf
Note that both kernels should do the same amount of work in total (the `if (threadIdx.x == -1)` guard is a trick to stop the compiler optimising everything away and leaving an empty kernel). The total work should be the same because more_threads uses 4 times as many threads, but each thread does a quarter of the work.
The significant results from the profiler are as follows:
more_threads: GPU runtime = 1474 us,reg per thread = 6,occupancy=1,branch=83746,divergent_branch = 26,instructions = 584065,gst request=1084552
less_threads: GPU runtime = 921 us,reg per thread = 14,occupancy=0.25,branch=20956,divergent_branch = 26,instructions = 312663,gst request=677381
As I said above, the runtime of the kernel using more threads is longer; this could be due to the increased number of instructions.
Why are there more instructions?
Why is there any branching, let alone divergent branching, considering there is no conditional code?
Why are there any gst (global store) requests when there is no global memory access?
What is going on here?
Thanks
Update
Added PTX code and fixed CUDA C so it should compile