I have two arrays ffcorr_d and ref_d each having 19600 values. The first kernel simple_multiply does a multiply operation along with sum reduction.
I instantiate this kernel with 49 blocks and 400 threads.
simplemultiply <<< nblocksn, blocksize >>> (ffcorr_d, ref_d, out1_d, out2_d, d_indices);
const int threads = 400;
__global__ void simplemultiply(float *a, float *b, float *c, float *sumr, int* index)
{
// Declare arrays to be in shared memory.
__shared__ float prod[threads];
int nTotalThreads = blockDim.x; // Total number of active threads
float max_val;
int max_index;
// Calculate which element this thread reads from memory
int arrayIndex = gridDim.x*blockDim.x*blockIdx.y + blockDim.x*blockIdx.x + threadIdx.x;
c[arrayIndex] = a[arrayIndex]*b[arrayIndex];
prod[threadIdx.x] = (threadIdx.x < 400) ? c[arrayIndex+threadIdx.x] : 0;
max_val = prod[threadIdx.x];
max_index = (threadIdx.x < 400) ? arrayIndex+threadIdx.x : 0;
__syncthreads();
while(nTotalThreads > 1)
{
int halfPoint = (nTotalThreads >> 1);
if (threadIdx.x < halfPoint)
{
// prod[threadIdx.x] += prod[threadIdx.x + halfPoint];
prod[threadIdx.x] += (threadIdx.x < 200) ? prod[threadIdx.x + halfPoint] : 0;
max_val = prod[threadIdx.x];
}
__syncthreads();
nTotalThreads = (nTotalThreads >> 1); // divide by two.
}
int i = threadIdx.x;
if (threadIdx.x == 0)
{
sumr[blockIdx.x] = prod[threadIdx.x];
}
if(sumr[blockIdx.x] == max_val )
{
index[blockIdx.x] = max_index;
}
}
But here prod[threadIdx.x] is wrong . I checked the values before reducing and they are correct. I am not sure if the reduction is incorrect because the threads are not a power of 2. Suggestions?? Thanks in advance.