Hi,
I'm facing a problem with OpenCL and I hope someone will have a hint on what the cause might be. Following is a version of the program, reduced to the problem. I have an input int array of size 4000. In my kernel, I am doing a scan. Obviously, there are nice ways to do this in parallel, but to reproduce the problem, only one thread is doing the entire computation. Before the scan, the input (result_mask) has only values 0 or 1.
__kernel void
sel_a(__global db_tuple * input,
      __global int * result_mask,
      __global int * result_count,
      const unsigned int max_id)
{
    /* BUG FIX: `gid` was used below but never defined in this kernel,
     * which would not compile. Derive it from the NDRange index space. */
    int gid = get_global_id(0);

    /* NOTE(review): mem_fence() only orders the memory operations of the
     * work-item that executes it; it is NOT a synchronization point
     * between work-groups. OpenCL provides no global barrier inside a
     * single kernel, so work-item 0 below may read result_mask[] entries
     * that other work-groups have not yet written -- this is the likely
     * source of the nondeterministic 0/1 blocks. For a reliable result,
     * run the parallel mask update and this sequential scan as two
     * separate kernel launches on an in-order command queue. */
    mem_fence(CLK_GLOBAL_MEM_FENCE);

    if(gid == 0)
    {
        /* i is unsigned so the comparison with the unsigned max_id does
         * not mix signedness. */
        unsigned int i;
        int c = 0;
        for(i = 0; i < max_id; i++)
        {
            /* Count entries flagged non-zero ... */
            if(result_mask[i] != 0)
                c++;
            /* ... then overwrite. Both branches of the original stored
             * the same value, so the duplicated assignment is merged. */
            result_mask[i] = 5;
        }
        *result_count = c;
    }
}
The expected result would be the number of elements that initially had a value different than 0 and nothing but 5's in the result mask. However, that is not the case. The output looks like this:
...
5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
5 5 5 5 5 5 5 5 5 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0
0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 5 5 5 5 5 5 5 5 5 5 5
5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
...
I get this block of 80 elements somewhere after approx. 3200 elements. It's not always at the same positions, but it's always the same number of elements - 80. And it gets even weirder - if I change the first line to if(gid == 2000) the problem is gone. However, after playing around with the thread id, I've come to the conclusion that the problem isn't gone, it just moved. Using thread 1425, I get the problem half the time and when I get it, the buggy block is at the end of the array. Hence I assume, when I don't have the 0s and 1s, the block has "moved" further back. For some more excitement - when I increase the input size to 5000, the output consists entirely of 0s. Furthermore, the following code won't work:
/* NOTE(review): both of these blocks read the entire result_mask[] that
 * other work-items wrote earlier in the kernel. Without a global barrier
 * (which OpenCL kernels do not have), either reader can observe stale
 * data regardless of which gid executes it. */
if(gid == 0)
{
int i, c = 0;
for(i = 0; i < max_id; i++)
{
if(result_mask[i]!=0)
{
c++;
result_mask[i] = 5;
}
else
{
result_mask[i] = 5;
}
}
*result_count = c;
}
/* Second sequential pass by a different work-item; races with the one
 * above for the same reason, and both write *result_count. */
if(gid == 3999)
{
int i, c = 0;
for(i = 0; i < max_id; i++)
{
if(result_mask[i]!=0)
{
c++;
result_mask[i] = 5;
}
else
{
result_mask[i] = 5;
}
}
*result_count = c;
}
whereas only
/* NOTE(review): a single sequential reader; it still observes other
 * work-items' writes without any cross-work-group synchronization, so
 * "working" here is timing luck, not a guarantee. */
if(gid == 3999)
{
int i, c = 0;
for(i = 0; i < max_id; i++)
{
if(result_mask[i]!=0)
{
c++;
result_mask[i] = 5;
}
else
{
result_mask[i] = 5;
}
}
*result_count = c;
}
will work (again, probably with a larger input it might not work). Following are some details on the device:
Device name: GeForce 9600M GT
Device vendor: NVIDIA
Clock frequency: 1250 MHz
Max compute units: 4
Global memory size: 256 MB
Local memory size: 16 KB
Max memory allocation size: 128 MB
Max work group size: 512
Obviously, I am missing out something big here. My first thought was it's some memory conflict, where the block of 80 elements is overridden by another 'thread'. But the more I think about it, the less sense it makes.
I'll be very grateful for any hints! Thanks.
EDIT: Sorry for the late response. So I've modified the code, reducing it to a minimum to reproduce the problem. Following is the c-code of the program:
#include <stdio.h>
#include <stdlib.h>
#include <OpenCL/openCL.h>
#define INPUTSIZE (200)
/* Bundle of the OpenCL objects the host program needs: one device, the
 * context bound to it, an in-order command queue, and the built program. */
typedef struct tag_openCL
{
cl_device_id device;      /* GPU device selected by clGetDeviceIDs */
cl_context ctx;           /* context created for `device` */
cl_command_queue queue;   /* command queue on `ctx` (in-order) */
cl_program program;       /* program built from the kernel source */
} openCL;
int main(void)
{
int err;
openCL* cl_ctx = malloc(sizeof(openCL));
if(!cl_ctx)
exit(1);
err = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_GPU, 1, &cl_ctx->device, NULL);
cl_ctx->ctx = clCreateContext(0, 1, &cl_ctx->device, clLogMessagesToStdoutAPPLE, NULL, &err);
cl_ctx->queue = clCreateCommandQueue(cl_ctx->ctx, cl_ctx->device, CL_QUEUE_PROFILING_ENABLE, &err);
printf("Successfully created context and queue for openCL device. \n");
/* Build program */
char * kernel_source = "__kernel void \
sel(__global int * input, \
__global int * result_mask, \
const unsigned int max_id) \
{ \
int gid = get_global_id(0); \
\
result_mask[gid] = input[gid] % 2 == 0; \
result_mask[gid] &= (input[gid] + 1) % 3 == 0; \
\
if(gid == 0) { \
int i; \
for(i = 0; i < max_id; i++) { \
if(result_mask[i]) { \
result_mask[i] = 5; \
} \
else { \
result_mask[i] = 5; \
} \
} \
} \
}";
cl_program prog = clCreateProgramWithSource(cl_ctx->ctx, 1, (const char**)&kernel_source, NULL, &err);
cl_ctx->program = prog;
err = clBuildProgram(cl_ctx->program, 0, NULL, NULL, NULL, NULL);
cl_kernel kernel = clCreateKernel(cl_ctx->program, "sel", &err);
/* create dummy input data */
int * input = calloc(sizeof(int), INPUTSIZE);
int k;
for(k = 0; k < INPUTSIZE; k++)
{
input[k] = abs((k % 5) - (k % 3))+ k % 2;
}
cl_mem source, intermediate;
unsigned int problem_size = INPUTSIZE;
source = clCreateBuffer(cl_ctx->ctx, CL_MEM_READ_WRITE, problem_size * sizeof(int), NULL, NULL);
clEnqueueWriteBuffer(cl_ctx->queue, source, CL_TRUE, 0, problem_size * sizeof(int), (void*) input, 0, NULL, NULL);
intermediate = clCreateBuffer(cl_ctx->ctx, CL_MEM_READ_WRITE, problem_size * sizeof(int), NULL, NULL);
int arg = 0;
clSetKernelArg(kernel, arg++, sizeof(cl_mem), &source);
clSetKernelArg(kernel, arg++, sizeof(cl_mem), &intermediate);
clSetKernelArg(kernel, arg++, sizeof(unsigned int), &problem_size);
size_t global_work_size = problem_size;
size_t local_work_size = 1;
clEnqueueNDRangeKernel(cl_ctx->queue, kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL);
clFinish(cl_ctx->queue);
// read results
int * result = calloc(sizeof(int), problem_size );
clEnqueueReadBuffer(cl_ctx->queue, intermediate, CL_TRUE, 0, problem_size * sizeof(int), result, 0, NULL, NULL);
clFinish(cl_ctx->queue);
int j;
for(j=1; j<=problem_size; j++)
{
printf("%i \t", result[j-1]);
if(j%10 ==0 && j>0)
printf("\n");
}
return EXIT_SUCCESS;
}
The result is still non-deterministic, I get 0's and 1's at random positions in the output. For a local workgroup size of 1, they are in the first half of the array, for a size of 2 - in the second half, for a size of 4 it looks okay for 200 elements, but there are again 0's and 1's for a problem size of 400. Furthermore, for a global work group size of 1, all works fine. That is, if I use two kernels - one to do the parallel computation with a global work group size of [problem size] and a second one with a global work group size of 1, everything works great. Again, I am perfectly aware that this is not the way to do it (a kernel running such sequential code), however, I'd like to know why it's not working, since it looks like I'm missing something.
Thanks, Vassil