Click here to Skip to main content
15,885,720 members
Articles / Programming Languages / CUDA

High Performance Queries: GPU vs. PLINQ vs. LINQ

Rate me:
Please Sign up or sign in to vote.
4.94/5 (102 votes)
16 Sep 2013 | LGPL3 | 10 min read | 143.9K | 5K | 195
How to get a 30x performance increase for queries by using your Graphics Processing Unit (GPU) instead of LINQ and PLINQ.
/* 
 * This software is based upon the book CUDA By Example by Sanders and Kandrot
 * and source code provided by NVIDIA Corporation.
 * It is a good idea to read the book while studying the examples!
*/
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using Cudafy;
using Cudafy.Host;
using Cudafy.Atomics;
using Cudafy.Translator;

namespace CudafyByExample
{
    /// <summary>
    /// Histogram example: counts how often each byte value occurs in a large
    /// random buffer. The counting is done on the GPU, with each thread block
    /// accumulating into its own shared-memory histogram via atomic adds before
    /// merging into the global result. The result is then verified on the CPU.
    /// </summary>
    public class hist_gpu_shmem_atomics
    {
        /// <summary>Number of input bytes that are histogrammed (100 * 1024 * 1024).</summary>
        public const int SIZE = 100 * 1024 * 1024;

        // One bin per possible byte value. Used on the host side only; the kernel
        // keeps the literal 256 so the translated device code stays unchanged.
        private const int HistogramBins = 256;

        // The kernel clears and publishes exactly one bin per thread, so the block
        // size has to equal the number of bins.
        private const int ThreadsPerBlock = 256;

        /// <summary>
        /// Device kernel. Every block builds a private histogram in shared memory
        /// and afterwards adds it to the global histogram.
        /// </summary>
        /// <remarks>
        /// Must be launched with exactly 256 threads per block: each thread clears
        /// one shared bin and later publishes that same bin, so fewer threads would
        /// leave bins untouched and more threads would index past the 256 entries.
        /// </remarks>
        /// <param name="thread">Thread/block/grid indices of the calling GPU thread.</param>
        /// <param name="buffer">Input bytes to count.</param>
        /// <param name="size">Number of entries of <paramref name="buffer"/> to process.</param>
        /// <param name="histo">Global histogram with 256 bins; counts are added to its current contents.</param>
        [Cudafy]
        public static void histo_kernel(GThread thread, byte[] buffer, int size, uint[] histo)
        {
            int tid = thread.threadIdx.x;

            // Per-block accumulation buffer in shared memory. With 256 threads
            // per block it is cleared with a single write per thread.
            uint[] temp = thread.AllocateShared<uint>("temp", 256);
            temp[tid] = 0;
            // Every bin must be zero before any thread of the block starts counting.
            thread.SyncThreads();

            // Start at this thread's global index and advance by the total number
            // of threads in the grid, so that the grid as a whole covers the buffer
            // regardless of how many blocks were launched.
            int i = tid + thread.blockIdx.x * thread.blockDim.x;
            int stride = thread.blockDim.x * thread.gridDim.x;
            while (i < size)
            {
                // Several threads of the block may hit the same bin concurrently.
                thread.atomicAdd(ref temp[buffer[i]], 1);
                i += stride;
            }

            // Wait until the whole block has finished counting, then merge the
            // block-local bins into the global histogram. Other blocks do the same
            // concurrently, hence the atomic add. Again one write per thread.
            thread.SyncThreads();

            thread.atomicAdd(ref (histo[tid]), temp[tid]);
        }

        /// <summary>
        /// Creates a buffer of <paramref name="size"/> random bytes covering the
        /// full byte range 0..255.
        /// </summary>
        /// <param name="size">Number of bytes to generate; must not be negative.</param>
        /// <returns>A newly allocated array filled with random bytes.</returns>
        static byte[] big_random_block(int size)
        {
            byte[] data = new byte[size];

            // NextBytes fills every element with a value in [0, 255].
            // The previous rand.Next(Byte.MaxValue) has an exclusive upper bound
            // and therefore never produced 255, leaving the last bin untested.
            // The default constructor also avoids the old millisecond seed, which
            // allowed only 1000 distinct sequences.
            new Random().NextBytes(data);

            return data;
        }

        /// <summary>
        /// Runs the example: generates random input, computes its histogram on the
        /// GPU, prints the elapsed time and the histogram sum, and checks the GPU
        /// result against a histogram computed on the CPU.
        /// </summary>
        /// <returns>
        /// -1 when the target is a CUDA device with compute capability below 1.2;
        /// otherwise 0. Verification mismatches are reported on the console only.
        /// </returns>
        public static int Execute()
        {
            CudafyModule km = CudafyTranslator.Cudafy();

            GPGPU gpu = CudafyHost.GetDevice(CudafyModes.Target, CudafyModes.DeviceId);
            if (gpu is CudaGPU && gpu.GetDeviceProperties().Capability < new Version(1, 2))
            {
                Console.WriteLine("Compute capability 1.2 or higher required for atomics.");
                return -1;
            }
            gpu.LoadModule(km);

            byte[] buffer = big_random_block(SIZE);

            // The extended property query needs cudart.dll; fall back to the basic
            // query when that library cannot be found.
            GPGPUProperties prop;
            try
            {
                prop = gpu.GetDeviceProperties(true);
            }
            catch (DllNotFoundException)
            {
                prop = gpu.GetDeviceProperties(false);
            }

            uint[] histo = new uint[HistogramBins];
            try
            {
                // Start timing before the transfers so that the measurement covers
                // all GPU-related work, not just the kernel. The original author
                // reports 74 ms in total versus 15 ms for the kernel alone.
                gpu.StartTimer();

                // Copy the input to the device and prepare the device histogram.
                // NOTE(review): Set is expected to zero the array (the counterpart
                // of cudaMemset in the CUDA original) - confirm against CUDAfy docs.
                byte[] dev_buffer = gpu.CopyToDevice(buffer);
                uint[] dev_histo = gpu.Allocate<uint>(HistogramBins);
                gpu.Set(dev_histo);

                // Kernel launch - 2x the number of multiprocessors gave best timing.
                // Fall back to 16 when the property query reports no count.
                int blocks = prop.MultiProcessorCount;
                if (blocks == 0)
                {
                    blocks = 16;
                }
                Console.WriteLine("Processors: {0}", blocks);
                gpu.Launch(blocks * 2, ThreadsPerBlock).histo_kernel(dev_buffer, SIZE, dev_histo);

                gpu.CopyFromDevice(dev_histo, histo);

                // Get stop time, and display the timing results.
                float elapsedTime = gpu.StopTimer();
                Console.WriteLine("Time to generate: {0} ms", elapsedTime);
            }
            finally
            {
                // Release the device allocations even when a transfer or the launch
                // throws; previously they were only freed on the success path.
                gpu.FreeAll();
            }

            long histoCount = 0;
            for (int i = 0; i < HistogramBins; i++)
            {
                histoCount += histo[i];
            }
            Console.WriteLine("Histogram Sum:  {0}", histoCount);

            // Verify that we have the same counts via CPU. A separate reference
            // histogram is compared bin by bin instead of decrementing the GPU
            // result in place: that keeps the GPU result intact and does not rely
            // on unsigned wrap-around, which throws in a checked build.
            uint[] expected = new uint[HistogramBins];
            for (int i = 0; i < buffer.Length; i++)
            {
                expected[buffer[i]]++;
            }
            for (int i = 0; i < HistogramBins; i++)
            {
                if (histo[i] != expected[i])
                {
                    Console.WriteLine("Failure at {0}!", i);
                }
            }

            return 0;
        }
    }
}

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article, along with any associated source code and files, is licensed under The GNU Lesser General Public License (LGPLv3)


Written By
Systems Engineer Hybrid DSP Systems
Netherlands
Nick is co-owner of Hybrid DSP, a company specializing in high-speed data acquisition, processing and storage.

CUDAfy.NET took considerable effort to develop and we ask nothing in return from users of the LGPL library other than that you please consider donating to Harmony through Education. This small charity helps handicapped children in developing countries by providing suitable schooling.

Comments and Discussions