tags:

views:

336

answers:

2

I'm currently using the CUDA.NET library by GASS. I need to initialize CUDA arrays (actually cuBLAS vectors, but that doesn't matter) in one CPU thread and use them in another CPU thread. But the CUDA context, which holds all initialized arrays and loaded functions, can be attached to only one CPU thread.

There is a mechanism called the context migration API for detaching a context from one thread and attaching it to another. But I don't know how to use it properly in CUDA.NET.

I tried something like this:

/// <summary>
/// Attempt from the question: creates a CUDA context and a cuBLAS vector on the
/// main thread, then passes the context to a second thread that tries to use it.
/// </summary>
class Program
{
    // Host-side buffers: vector1 is uploaded to the device, vector2 receives the download.
    private static float[] vector1, vector2;
    private static CUDA cuda;
    private static CUBLAS cublas;

    // Device pointer returned by cublas.Allocate in AllocateVectors.
    private static CUdeviceptr ptr;

    /// <summary>
    /// Creates the context, uploads vector1, then releases the context from this
    /// thread and hands it to a newly started thread.
    /// </summary>
    static void Main(string[] args)
    {
        cuda = new CUDA(false);
        cublas = new CUBLAS(cuda);
        cuda.Init();
        cuda.CreateContext(0);
        AllocateVectors();
        // NOTE(review): if DetachContext wraps the driver's cuCtxDetach, that call
        // decrements the context's usage count and destroys the context when the
        // count reaches zero - in which case the handle popped on the next line
        // would no longer be usable. Confirm against the CUDA.NET documentation.
        cuda.DetachContext();
        CUcontext context = cuda.PopCurrentContext();
        GetVectorFromDeviceAsync(context);
    }

    /// <summary>
    /// Fills vector1 on the host, allocates a device buffer of the same length,
    /// copies vector1 into it, and creates the empty host buffer vector2.
    /// </summary>
    private static void AllocateVectors()
    {
        vector1 = new float[]{1f, 2f, 3f, 4f, 5f};
        ptr = cublas.Allocate(vector1.Length, sizeof (float));
        cublas.SetVector(vector1, ptr);

        vector2 = new float[5];
    }


    /// <summary>
    /// Thread entry point: pushes and attaches the received context, modifies
    /// vector1, uploads it, downloads the device buffer into vector2, and pops
    /// the context again.
    /// </summary>
    /// <param name="objContext">
    /// The CUcontext boxed as object; it arrives through Thread.Start(object).
    /// </param>
    private static void GetVectorFromDevice(object objContext)
    {
        CUcontext localContext = (CUcontext) objContext;
        // NOTE(review): the context is both pushed and attached here; whether both
        // calls are needed (or compatible) for migration is not visible from this
        // code - confirm against the CUDA.NET / driver API documentation.
        cuda.PushCurrentContext(localContext);
        cuda.AttachContext(localContext);

        //change vector somehow
        vector1[0] = -1;
        //copy changed vector to device
        cublas.SetVector(vector1, ptr);
        cublas.GetVector(ptr, vector2);
        // Uses the raw driver binding here, whereas Main pops through cuda.PopCurrentContext().
        CUDADriver.cuCtxPopCurrent(ref localContext);
    }

    /// <summary>
    /// Starts GetVectorFromDevice on a new foreground thread and returns without
    /// waiting for it to finish.
    /// </summary>
    private static void GetVectorFromDeviceAsync(CUcontext cUcontext)
    {
        Thread thread = new Thread(GetVectorFromDevice);
        // Foreground thread: the process stays alive until it completes.
        thread.IsBackground = false;
        thread.Start(cUcontext);
    }
}

But execution fails on the attempt to copy the changed vector to the device, because the context is not attached. Other causes are unlikely, because it works fine in single-threaded mode. Any ideas how I can get it to work?

+1  A: 

I still have not found a solution to this problem, but I did come up with a workaround. The point is to execute all the functions that have anything to do with CUDA in one CPU thread. For example, you can do it like this:

/// <summary>
/// Workaround demo: every cuBLAS call is routed through CudaManager.CallMethod,
/// even though the threads requesting the work are different.
/// </summary>
class Program
{
    // Signalled by each helper thread once its CudaManager call has returned.
    private static readonly AutoResetEvent autoResetEvent = new AutoResetEvent(false);

    private static CUDA cuda;
    private static CUBLAS cublas;

    // Device buffer shared by the upload and download steps.
    private static CUdeviceptr ptr;

    // vector1 holds the host data sent to the device; vector2 receives the data read back.
    private static float[] vector1, vector2;

    /// <summary>
    /// Allocates the device vector, overwrites it from a second thread, reads it
    /// back from a third thread and prints the result.
    /// </summary>
    static void Main()
    {
        cuda = new CUDA(true);
        cublas = new CUBLAS(cuda);

        // Requested by the main thread but executed through CudaManager.
        CudaManager.CallMethod(AllocateVectors);

        // Each step runs on its own thread; Main resumes only after the step signals completion.
        RunOnNewThreadAndWait(ChangeVectorOnDevice_ThreadRun);
        RunOnNewThreadAndWait(GetVectorFromDevice_ThreadRun);

        float[] result = vector2;
        Console.WriteLine("({0}, {1}, {2}, {3}, {4})", result[0], result[1], result[2], result[3], result[4]);

        Console.ReadKey(true);
    }

    /// <summary>
    /// Starts <paramref name="entryPoint"/> on a new foreground thread and blocks
    /// until that thread sets autoResetEvent.
    /// </summary>
    private static void RunOnNewThreadAndWait(ThreadStart entryPoint)
    {
        Thread worker = new Thread(entryPoint);
        worker.IsBackground = false;
        worker.Start();

        autoResetEvent.WaitOne();
    }

    /// <summary>
    /// Creates both host arrays, allocates the device buffer and uploads vector1 to it.
    /// </summary>
    private static void AllocateVectors()
    {
        float[] initialValues = new float[] { 1f, 2f, 3f, 4f, 5f };
        vector1 = initialValues;
        vector2 = new float[5];

        ptr = cublas.Allocate(initialValues.Length, sizeof(float));
        cublas.SetVector(initialValues, ptr);
    }

    /// <summary>Copies the device buffer into vector2.</summary>
    private static void GetVectorFromDevice()
    {
        cublas.GetVector(ptr, vector2);
    }

    /// <summary>Replaces vector1 with negated values and uploads it to the device buffer.</summary>
    private static void ChangeVectorOnDevice()
    {
        float[] replacement = new float[] { -1f, -2f, -3f, -4f, -5f };
        vector1 = replacement;
        cublas.SetVector(replacement, ptr);
    }

    /// <summary>Thread body: performs the upload through CudaManager, then wakes Main.</summary>
    private static void ChangeVectorOnDevice_ThreadRun()
    {
        CudaManager.CallMethod(ChangeVectorOnDevice);
        autoResetEvent.Set();
    }

    /// <summary>Thread body: performs the download through CudaManager, then wakes Main.</summary>
    private static void GetVectorFromDevice_ThreadRun()
    {
        CudaManager.CallMethod(GetVectorFromDevice);
        autoResetEvent.Set();
    }
}

/// <summary>
/// Executes delegates on one dedicated background thread, so that all work
/// submitted through <see cref="CallMethod"/> runs on the same CPU thread.
/// </summary>
public static class CudaManager
{
    /// <summary>The delegate most recently handed to the worker thread.</summary>
    public static Action WorkMethod { get; private set; }

    // Signalled by CallMethod when WorkMethod holds a new delegate to run.
    private static readonly AutoResetEvent actionRecived = new AutoResetEvent(false);
    // Signalled by the worker when the delegate has finished (successfully or not).
    private static readonly AutoResetEvent callbackEvent = new AutoResetEvent(false);

    // Serializes callers so only one delegate is in flight at a time.
    private static readonly object mutext = new object();
    // Guards worker start-up; kept separate from mutext, which is held for the
    // whole duration of a call.
    private static readonly object startLock = new object();
    private static bool isCudaThreadRunning;

    // The worker thread, remembered so nested calls made from it can be detected.
    private static Thread cudaThread;
    // Exception thrown by the last delegate, handed back to the waiting caller.
    private static Exception workError;

    /// <summary>
    /// Worker loop: waits for a delegate, runs it, and always releases the caller.
    /// </summary>
    private static void ThreadRun()
    {
        while (actionRecived.WaitOne())
        {
            try
            {
                WorkMethod.Invoke();
            }
            catch (Exception ex)
            {
                // Keep the worker alive and report the failure to the caller
                // instead of letting it escape on this thread.
                workError = ex;
            }
            finally
            {
                // Release the caller even on failure so it never blocks forever.
                callbackEvent.Set();
            }
        }
    }

    static CudaManager()
    {
        Run();
    }

    /// <summary>
    /// Starts the worker thread if it is not running yet. Further calls do nothing.
    /// </summary>
    public static void Run()
    {
        lock (startLock)
        {
            if (isCudaThreadRunning)
            {
                return;
            }

            Thread thread = new Thread(ThreadRun);
            thread.IsBackground = true;
            // Publish the thread and the flag before starting it.
            cudaThread = thread;
            isCudaThreadRunning = true;
            thread.Start();
        }
    }

    /// <summary>
    /// Runs <paramref name="method"/> on the worker thread and blocks until it has finished.
    /// </summary>
    /// <param name="method">The work to execute; must not be null.</param>
    /// <exception cref="ArgumentNullException"><paramref name="method"/> is null.</exception>
    /// <exception cref="InvalidOperationException">
    /// <paramref name="method"/> threw on the worker thread; the original exception
    /// is available as <see cref="Exception.InnerException"/>.
    /// </exception>
    public static void CallMethod(Action method)
    {
        if (method == null)
        {
            throw new ArgumentNullException("method");
        }

        // A delegate already running on the worker may call back in here. Queuing
        // it would deadlock (the worker would wait for itself), so run it inline.
        if (Thread.CurrentThread == cudaThread)
        {
            method();
            return;
        }

        Exception error;
        lock (mutext)
        {
            workError = null;
            WorkMethod = method;
            // Wake the worker loop.
            actionRecived.Set();
            // Block the caller until the delegate invocation is complete.
            callbackEvent.WaitOne();
            error = workError;
            workError = null;
        }

        if (error != null)
        {
            throw new InvalidOperationException(
                "The method executed on the CUDA thread threw an exception; see InnerException.",
                error);
        }
    }
}

I hope it's gonna help someone.

Vyacheslav
+1  A: 

Check out the CUDAContextSynchronizer class in the GASS documentation.

Yevgeny
This class is from CUDA.NET 3.0, but I'm using version 2.3. Thanks for the answer anyway. I'll check it out.
Vyacheslav