I am trying to compare cross-correlation using FFT vs using windowing method.
My Matlab code is:
isize = 20;
n = 7;
for i = 1:n %%7x7 xcorr
for j = 1:n
xcout(i,j) = sum(sum(ffcorr1 .* ref(i:i+isize-1,j:j+isize-1))); %%ref is 676 element array and ffcorr1 is a 400 element array
end
end
similar CUDA kernel:
__global__ void xc_corr(double* in_im, double* ref_im, int pix3, int isize, int n, double* out1, double* temp1, double* sum_temp1)
{
int p = blockIdx.x * blockDim.x + threadIdx.x;
int q = 0;
int i = 0;
int j = 0;
int summ = 0;
for(i = 0; i < n; ++i)
{
for(j = 0; j < n; ++j)
{
summ = 0; //force update
for(p = 0; p < pix1; ++p)
{
for(q = 0; q < pix1; ++q)
{
temp1[((i*n+j)*pix1*pix1)+p*pix1+q] = in_im[p*pix1+q] * ref_im[(p+i)*pix1+(q+j)];
sum_temp1[((i*n+j)*pix1*pix1)+p*pix1+q] += temp1[((i*n+j)*pix1*pix1)+p*pix1+q];
out1[i*n+j] = sum_temp1[((i*n+j)*pix1*pix1)+p*pix1+q];
}
}
}
}
I have called this in my kernel as
int blocksize = 64; //multiple of 32
int nblocks = (pix3+blocksize-1)/blocksize; //round to max pix3 = 400
xc_corr <<< nblocks,blocksize >>> (ffcorr1, ref_d, pix3, isize, npix, xcout, xc_partial);
cudaThreadSynchronize();
Somehow, when I do a diff on the output file, I see that the CUDA kernel computes for only the first 400 elements.
What is the correct way to write this kernel??
Also, what is the difference in declaring i,j as shown below in my kernel??
int i = blockIdx.x * blockDim.y + threadIdx.x * threadIdx.y;
int j = blockIdx.y * blockDim.x + threadIdx.x * threadIdx.y;