Click here to Skip to main content
15,893,668 members
Articles / Programming Languages / CUDA

Using Cudafy for GPGPU Programming in .NET

Rate me:
Please Sign up or sign in to vote.
4.95/5 (59 votes)
16 Sep 2013 | LGPL3 | 13 min read | 377.3K | 5.4K | 141
An introduction to using Cudafy.NET to perform processing on a GPU
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Diagnostics;
using Cudafy;
using Cudafy.Host;
using Cudafy.Translator;
namespace CudafyIntroduction
{
    /// <summary>
    /// Console walkthrough of Cudafy.NET basics. <see cref="Main"/> runs five demos in sequence on the
    /// first CUDA device: an empty kernel, adding two integers, writing a string, adding two vectors,
    /// and processing a 3D array of structs. The methods marked [Cudafy] are the GPU-side functions.
    /// </summary>
    class Program
    {
        // Length of the vectors in the "Add vectors" demo; also the number of blocks launched for it.
        private const int N = 1024;

        // Dimensions of the ComplexFloat array used in the "Structs and 3D arrays" demo.
        private const int XSIZE = 4;
        private const int YSIZE = 8;
        private const int ZSIZE = 16;

        // Device handle; assigned once at the start of Main and used by every demo region.
        private static GPGPU _gpu;

        /// <summary>
        /// Translates and loads the GPU code, runs each demo region and prints the results.
        /// Cudafy language, compile and host exceptions are reported through
        /// <see cref="HandleException"/>; any other exception type is not caught here.
        /// The method blocks on <see cref="Console.ReadLine()"/> before returning.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            try
            {
                // This 'smart' method will Cudafy all members with the Cudafy attribute in the calling type (i.e. Program)
                CudafyModule km = CudafyTranslator.Cudafy();
                // If cudafying will not work for you (CUDA SDK + VS not set up right) then comment out above and
                // uncomment below. Remember to also comment out the Structs and 3D arrays region below.
                // CUDA 5.5 SDK must be installed and cl.exe (VC++ compiler) must be in path.
                //CudafyModule km = CudafyModule.Deserialize(typeof(Program).Name);

                // Get the first CUDA device and load our module
                _gpu = CudafyHost.GetDevice(eGPUType.Cuda);
                _gpu.LoadModule(km);

                #region Simplest GPU function possible
                // Call the kernel method (which does nothing useful, but does it on the GPU)
                // We use .NET 4.0 Dynamics to resolve the method. We could also use _gpu.Launch(1, 1, "kernel");
                _gpu.Launch().kernel();
                #endregion

                #region Add two numbers on GPU
                // Next we will add together some numbers. First we need to allocate memory on GPU for result (one int).
                // Then we launch our method and then read our results back again.
                int result;
                int[] dev_result = _gpu.Allocate<int>();
                _gpu.Launch().add(2, 7, dev_result); // or _gpu.Launch(1, 1, "add", 2, 7, dev_result);
                _gpu.CopyFromDevice(dev_result, out result);
                Console.WriteLine("2 + 7 = {0}", result);
                // NOTE: Debug.Assert calls are compiled out of builds without the DEBUG symbol,
                // so these checks only run in debug builds.
                Debug.Assert(result == 9);
                #endregion

                #region Hello, world
                // Write Hello, world on GPU: the kernel fills a device buffer, which is copied back
                // into a host char array and turned into a string for comparison.
                string str = "Hello, world";
                char[] dev_str = _gpu.Allocate<char>(str.Length);
                char[] char_array = new char[str.Length];
                _gpu.Launch(1, 1, "WriteHelloWorldOnGPU", dev_str);
                _gpu.CopyFromDevice(dev_str, char_array);
                string host_str = new string(char_array);
                Console.WriteLine(host_str);
                Debug.Assert(str == host_str);
                #endregion

                #region Add vectors
                // Add vectors - GPUs are best at algorithms like working on matrices and large vectors
                // where lots of calculations can be done independently in parallel.
                int[] a = new int[N];
                int[] b = new int[N];
                int[] c = new int[N];
                // fill the arrays 'a' and 'b' on the CPU
                for (int i = 0; i < N; i++)
                {
                    a[i] = -i;
                    b[i] = i * i;
                }
                // copy the arrays 'a' and 'b' to the GPU - these overloads automatically allocate GPU memory
                int[] dev_a = _gpu.CopyToDevice(a);
                int[] dev_b = _gpu.CopyToDevice(b);
                // allocate memory on the GPU for the result - this allocates enough memory to hold a vector the
                // same length as vector c - it does not copy vector c (same as _gpu.Allocate<int>(c.Length);)
                int[] dev_c = _gpu.Allocate<int>(c);
                // Threads are grouped in Blocks. Blocks are grouped in a Grid. Here we launch N Blocks where
                // each block contains 1 thread. Note addVector contains a GThread arg - no need to pass this.
                // GThread is the Cudafy equivalent of the built-in CUDA variables. Use it to identify thread id.
                _gpu.Launch(N, 1).addVector(dev_a, dev_b, dev_c);
                // copy the array 'c' back from the GPU to the CPU
                _gpu.CopyFromDevice(dev_c, c);
                for (int i = 0; i < N; i++)
                {
                    Debug.Assert(a[i] + b[i] == c[i]);
                }
                Console.WriteLine("We just added {0} elements of our two vectors in {0} parallel threads.", N);
                // This used a bit more precious GPU memory than the earlier examples, so let's free it
                _gpu.FreeAll();
                #endregion

                #region Structs and 3D arrays
                // Here we will cudafy a .NET struct and use a 3D array - let's make a new module and this time
                // we will explicitly state what types to cudafy.
                km = CudafyTranslator.Cudafy(typeof(ComplexFloat), typeof(Struct3D)); // see Struct3D.cs
                _gpu.LoadModule(km, false); // don't unload existing loaded module so now there are two modules loaded
                Debug.Assert(_gpu.GetFunctionNames().Count() > 1);// prove it

                ComplexFloat[, ,] host_array3DS = new ComplexFloat[XSIZE, YSIZE, ZSIZE];
                ComplexFloat[, ,] result_array3DS = new ComplexFloat[XSIZE, YSIZE, ZSIZE];
                // Element number ctr (counting in x, y, z iteration order) is initialised to (ctr * 2, ctr).
                int ctr = 0;
                for (int x = 0; x < XSIZE; x++)
                    for (int y = 0; y < YSIZE; y++)
                        for (int z = 0; z < ZSIZE; z++, ctr++)
                            host_array3DS[x, y, z] = new ComplexFloat(ctr * 2, ctr);
                ComplexFloat[, ,] dev_array3DS = _gpu.CopyToDevice(host_array3DS);

                // Let's launch old school sans dynamic. XSIZE*YSIZE blocks of 1 thread each.
                _gpu.Launch(new dim3(XSIZE, YSIZE), 1, "struct3D", dev_array3DS);
                _gpu.CopyFromDevice(dev_array3DS, result_array3DS);

                // Verify on the host: every element is expected to come back as the original value added
                // to itself. (The struct3D kernel is defined outside this file - see Struct3D.cs.)
                // All three loops stop as soon as one mismatch is found.
                bool pass = true;
                ctr = 0;
                for (int x = 0; x < XSIZE && pass; x++)
                {
                    for (int y = 0; y < YSIZE && pass; y++)
                    {
                        for (int z = 0; z < ZSIZE && pass; z++, ctr++)
                        {
                            ComplexFloat expected = new ComplexFloat(ctr * 2, ctr).Add(new ComplexFloat(ctr * 2, ctr));
                            ComplexFloat res = result_array3DS[x, y, z];
                            pass = res.Real == expected.Real && res.Imag == expected.Imag;
                        }
                    }
                }
                Console.WriteLine(pass ? "Pass" : "Fail");
                // Release the 3D array on the device, mirroring the clean-up done after the vector demo.
                _gpu.FreeAll();
                #endregion

                Console.WriteLine("Done!");
            }
            catch (CudafyLanguageException cle)
            {
                HandleException(cle);
            }
            catch (CudafyCompileException cce)
            {
                HandleException(cce);
            }
            catch (CudafyHostException che)
            {
                HandleException(che);
            }

            // Keep the console window open until the user presses Enter.
            Console.ReadLine();
        }

        /// <summary>
        /// Empty GPU function; launched by Main only to show the simplest possible kernel call.
        /// </summary>
        [Cudafy]
        public static void kernel()
        {
        }

        /// <summary>
        /// GPU function that stores <paramref name="a"/> + <paramref name="b"/> in the first element of
        /// <paramref name="c"/>.
        /// </summary>
        /// <param name="a">First addend.</param>
        /// <param name="b">Second addend.</param>
        /// <param name="c">Output buffer; only index 0 is written, so it needs at least one element.</param>
        [Cudafy]
        public static void add(int a, int b, int[] c)
        {
            c[0] = a + b;
        }

        /// <summary>
        /// GPU function that writes the twelve characters of "Hello, world" into
        /// <paramref name="c"/>, one element at a time.
        /// </summary>
        /// <param name="c">Output buffer; indices 0 to 11 are written, so it needs at least 12 elements.</param>
        [Cudafy]
        public static void WriteHelloWorldOnGPU(char[] c)
        {
            c[0] = 'H';
            c[1] = 'e';
            c[2] = 'l';
            c[3] = 'l';
            c[4] = 'o';
            c[5] = ',';
            c[6] = ' ';
            c[7] = 'w';
            c[8] = 'o';
            c[9] = 'r';
            c[10] = 'l';
            c[11] = 'd';
        }

        /// <summary>
        /// GPU function that adds one pair of elements: the element index is taken from the x block
        /// index, and <c>c[tid] = a[tid] + b[tid]</c> is computed when that index is inside
        /// <paramref name="a"/>.
        /// </summary>
        /// <param name="thread">Thread information; not passed explicitly by the host-side launch in Main.</param>
        /// <param name="a">First input vector.</param>
        /// <param name="b">Second input vector. NOTE(review): only a.Length is checked, so b is assumed to be at least as long as a.</param>
        /// <param name="c">Output vector. NOTE(review): likewise assumed to be at least as long as a.</param>
        [Cudafy]
        public static void addVector(GThread thread, int[] a, int[] b, int[] c)
        {
            // Get the id of the thread. addVector is called N times in parallel, so we need 
            // to know which one we are dealing with.
            int tid = thread.blockIdx.x;
            // To prevent reading beyond the end of the array we check that the id is less than Length
            if (tid < a.Length)
                c[tid] = a[tid] + b[tid];
        }

        /// <summary>
        /// Reports an exception by writing its message to the console.
        /// </summary>
        /// <param name="ex">The exception caught in Main.</param>
        private static void HandleException(Exception ex)
        {
            Console.WriteLine(ex.Message);
        }
    }
}

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article, along with any associated source code and files, is licensed under the GNU Lesser General Public License (LGPLv3).


Written By
Systems Engineer Hybrid DSP Systems
Netherlands
Nick is co-owner of Hybrid DSP, a company specializing in high-speed data acquisition, processing and storage.

CUDAfy.NET took considerable effort to develop and we ask nothing in return from users of the LGPL library other than that you please consider donating to Harmony through Education. This small charity helps handicapped children in developing countries by providing suitable schooling.

Comments and Discussions