using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.IO;
using System.Diagnostics;
using System.Threading;
using System.Threading.Tasks;
using Cudafy;
using Cudafy.Host;
using Cudafy.Types;
using Cudafy.Translator;
namespace Hybrid.DSP
{
/// <summary>
/// Contains routines for performing conversions on a GPU via Cudafy.
/// The constructor pre-allocates a fixed pool of device buffers and pinned
/// host staging buffers; call <see cref="Free"/> or <see cref="Dispose"/>
/// when finished with the instance.
/// </summary>
public class GConvert : IDisposable
{
    /// <summary>
    /// Initializes a new instance of the <see cref="GConvert"/> class.
    /// Loads the CUDA module (building and caching it on first use) and
    /// allocates all device and pinned-host buffers used by the converters.
    /// </summary>
    /// <param name="gpu">The gpu to run on.</param>
    public GConvert(GPGPU gpu)
    {
        _gpu = gpu;
        string moduleName = typeof(GPUConvertCUDA).Name;
        // If module is not already loaded try to load from file.
        if (!_gpu.IsModuleLoaded(moduleName))
        {
            var mod = CudafyModule.TryDeserialize(moduleName);
            // If file does not exist or the checksum does not match then re-Cudafy.
            if (mod == null || !mod.TryVerifyChecksums())
            {
                Debug.WriteLine("Cudafying...");
                mod = CudafyTranslator.Cudafy(ePlatform.Auto, eArchitecture.sm_12, typeof(GPUConvertCUDA));
                // Save the module to file for future use.
                mod.Serialize(moduleName);
            }
            _gpu.LoadModule(mod);
        }
        // Instantiate arrays. _dev arrays will ultimately be on the GPU.
        _inArrays_dev = new byte[MAXCHUNKS][];
        _outArrays_dev = new char[MAXCHUNKS][];
        _outArrays = new char[MAXCHUNKS][];
        _inStages = new IntPtr[MAXCHUNKS];
        _outStages = new IntPtr[MAXCHUNKS];
        // Make MAXCHUNKS number of each array. Input is bytes, output is chars.
        // Base64 yields 4 output chars per 3 input bytes, hence the 4/3 sizing.
        for (int c = 0; c < MAXCHUNKS; c++)
        {
            _inArrays_dev[c] = _gpu.Allocate<byte>(MAXCHUNKSIZE);
            _outArrays_dev[c] = _gpu.Allocate<char>((MAXCHUNKSIZE * 4) / 3);
            // HostAllocate returns pinned (page-locked) host memory, required
            // for the asynchronous copies used in ToBase64.
            _inStages[c] = _gpu.HostAllocate<byte>(MAXCHUNKSIZE);
            _outStages[c] = _gpu.HostAllocate<char>((MAXCHUNKSIZE * 4) / 3);
            _outArrays[c] = new char[(MAXCHUNKSIZE * 4) / 3];
        }
    }

    /// <summary>
    /// Releases unmanaged resources and performs other cleanup operations before the
    /// <see cref="GConvert"/> is reclaimed by garbage collection.
    /// </summary>
    ~GConvert()
    {
        // A finalizer must never let an exception escape (it would terminate
        // the process). Free() rethrows on failure, so swallow and log here.
        try
        {
            Free();
        }
        catch (Exception ex)
        {
            Debug.WriteLine(ex.Message);
        }
    }

    /// <summary>
    /// Releases all GPU and pinned host memory and suppresses finalization.
    /// Equivalent to calling <see cref="Free"/>.
    /// </summary>
    public void Dispose()
    {
        Free();
        GC.SuppressFinalize(this);
    }

    /// <summary>
    /// Frees all temporary memory allocated on host and device. Safe to call
    /// multiple times: only the first successful call releases memory.
    /// </summary>
    public void Free()
    {
        try
        {
            lock (_lock)
            {
                if (!_disposed)
                {
                    // NOTE(review): FreeAll/HostFreeAll release every allocation
                    // made through this GPGPU instance, not just this object's
                    // buffers — fine if the GPGPU is not shared; verify callers.
                    _gpu.FreeAll();
                    _gpu.HostFreeAll();
                    // Mark disposed so a later call (or the finalizer) does not
                    // attempt to free the same memory again.
                    _disposed = true;
                }
            }
        }
        catch (Exception ex)
        {
            Debug.WriteLine(ex.Message);
            throw;
        }
    }

    private readonly GPGPU _gpu;
    private readonly object _lock = new object();
    private bool _disposed = false;
    // Maximum bytes handled per GPU launch (a multiple of 3, so full chunks
    // never split a base64 triple).
    private const int MAXCHUNKSIZE = 1024 * 1536;
    // Number of chunks processed in flight before results are drained.
    private const int MAXCHUNKS = 9;
    private IntPtr[] _inStages;        // pinned host input staging buffers
    private IntPtr[] _outStages;       // pinned host output staging buffers
    private byte[][] _inArrays_dev;    // device-side input buffers
    private char[][] _outArrays_dev;   // device-side output buffers
    private char[][] _outArrays;       // managed output buffers for writing

    /// <summary>
    /// Converts the supplied byte array to base64, overlapping host/device
    /// copies, kernel launches and stream writing via per-chunk CUDA streams.
    /// </summary>
    /// <param name="inArray">The input array.</param>
    /// <param name="outStream">The out stream.</param>
    public void ToBase64(byte[] inArray, Stream outStream)
    {
        int totalBytes = inArray.Length;
        int ctr = 0;
        int chunkIndex = 0;
        int threadsPerBlock = 256;
        StreamWriter sw = new StreamWriter(outStream);
        int[] outChunkSizes = new int[MAXCHUNKS]; // keep track of output chunk sizes.
        // NOTE(review): delegate BeginInvoke/EndInvoke is .NET Framework-only;
        // a Task-based rewrite would be needed on .NET Core/.NET 5+.
        BeginWriteDelegate beginWrite = new BeginWriteDelegate(BeginWrite);
        IAsyncResult res = null;
        while (totalBytes > 0)
        {
            // Split into chunks.
            int chunkSize = Math.Min(totalBytes, MAXCHUNKSIZE);
            // NOTE(review): for a final chunk that is not a multiple of 3 this
            // truncates; the trailing remainder bytes are re-encoded on the
            // host after the loop — confirm the overlap is exact.
            outChunkSizes[chunkIndex] = (chunkSize * 4) / 3;
            // Copy the managed array to the pinned unmanaged staging buffer.
            GPGPU.CopyOnHost(inArray, ctr, _inStages[chunkIndex], 0, chunkSize);
            // Copy from pinned buffer to GPU asynchronously (stream id = chunkIndex + 1).
            _gpu.CopyToDeviceAsync(_inStages[chunkIndex], 0, _inArrays_dev[chunkIndex], 0, chunkSize, chunkIndex + 1);
            // Each thread handles 3 input bytes; round blocks up to cover chunkSize.
            int blocksPerGrid = (chunkSize + (threadsPerBlock * 3) - 1) / (threadsPerBlock * 3);
            // Launch the kernel asynchronously on the same stream so it queues
            // behind the copy above.
            _gpu.LaunchAsync(blocksPerGrid, threadsPerBlock, chunkIndex + 1, "ToBase64String", _inArrays_dev[chunkIndex], _outArrays_dev[chunkIndex]);
            // Queue the device-to-host copy on the same stream.
            _gpu.CopyFromDeviceAsync(_outArrays_dev[chunkIndex], 0, _outStages[chunkIndex], 0, outChunkSizes[chunkIndex], chunkIndex + 1);
            // Advance to the next chunk.
            chunkIndex++;
            totalBytes -= chunkSize;
            ctr += chunkSize;
            // Once all in-flight slots are used, drain them in order.
            if (chunkIndex == MAXCHUNKS)
            {
                res = null;
                for (int c = 0; c < MAXCHUNKS; c++)
                {
                    // Synchronize stream c + 1: blocks until its copy/kernel/copy
                    // sequence has completed.
                    _gpu.SynchronizeStream(c + 1);
                    // Copy the pinned buffer into a managed array.
                    GPGPU.CopyOnHost(_outStages[c], 0, _outArrays[c], 0, outChunkSizes[c]);
                    // Wait for the previous asynchronous write to complete.
                    if (res != null)
                        beginWrite.EndInvoke(res);
                    // Begin writing the managed buffer to the stream asynchronously.
                    res = beginWrite.BeginInvoke(sw, _outArrays[c], 0, outChunkSizes[c], null, null);
                }
                chunkIndex = 0; // Reset chunkIndex.
            }
        }
        // Write any remaining chunks to the stream; chunkIndex is the count left.
        for (int c = 0; c < chunkIndex; c++)
        {
            _gpu.SynchronizeStream(c + 1);
            GPGPU.CopyOnHost(_outStages[c], 0, _outArrays[c], 0, outChunkSizes[c]);
            if (res != null)
                beginWrite.EndInvoke(res);
            res = beginWrite.BeginInvoke(sw, _outArrays[c], 0, outChunkSizes[c], null, null);
        }
        // Wait for the last chunk to be written.
        if (res != null)
            beginWrite.EndInvoke(res);
        // If the total byte count was not a multiple of 3, encode the trailing
        // 1-2 bytes on the host and strip the chars the GPU already emitted.
        int remainder = inArray.Length % 3;
        if (remainder != 0)
        {
            string s = Convert.ToBase64String(inArray, inArray.Length - remainder, remainder).Remove(0, remainder);
            sw.Write(s);
        }
        sw.Flush();
    }

    /// <summary>
    /// Converts the supplied byte array to base64 using simple synchronous
    /// copies and launches (no stream overlap); the baseline implementation.
    /// </summary>
    /// <param name="inArray">The input array.</param>
    /// <param name="outStream">The out stream.</param>
    public void ToBase64Naive(byte[] inArray, Stream outStream)
    {
        int totalBytes = inArray.Length;
        int ctr = 0;
        int chunkIndex = 0;
        int threadsPerBlock = 256;
        StreamWriter sw = new StreamWriter(outStream);
        BeginWriteDelegate beginWrite = new BeginWriteDelegate(BeginWrite);
        IAsyncResult res = null;
        while (totalBytes > 0)
        {
            // Split into chunks.
            int chunkSize = Math.Min(totalBytes, MAXCHUNKSIZE);
            int outChunkSize = (chunkSize * 4) / 3;
            // Copy the data to the GPU (blocking).
            _gpu.CopyToDevice(inArray, ctr, _inArrays_dev[chunkIndex], 0, chunkSize);
            // Each thread handles 3 input bytes; round blocks up to cover chunkSize.
            int blocksPerGrid = (chunkSize + (threadsPerBlock * 3) - 1) / (threadsPerBlock * 3);
            // Launch the kernel (blocking launch, default stream).
            _gpu.Launch(blocksPerGrid, threadsPerBlock, "ToBase64String", _inArrays_dev[chunkIndex], _outArrays_dev[chunkIndex]);
            // Copy the result back from the GPU (blocking).
            _gpu.CopyFromDevice(_outArrays_dev[chunkIndex], 0, _outArrays[chunkIndex], 0, outChunkSize);
            // End any pending write.
            if (res != null)
                beginWrite.EndInvoke(res);
            // Begin writing the managed buffer to the stream asynchronously.
            res = beginWrite.BeginInvoke(sw, _outArrays[chunkIndex], 0, outChunkSize, null, null);
            // Advance to the next chunk, cycling through the buffer pool.
            chunkIndex++;
            totalBytes -= chunkSize;
            ctr += chunkSize;
            if (chunkIndex == MAXCHUNKS)
                chunkIndex = 0;
        }
        // Wait for the last chunk to be written.
        if (res != null)
            beginWrite.EndInvoke(res);
        // If the total byte count was not a multiple of 3, encode the trailing
        // 1-2 bytes on the host and strip the chars the GPU already emitted.
        int remainder = inArray.Length % 3;
        if (remainder != 0)
        {
            string s = Convert.ToBase64String(inArray, inArray.Length - remainder, remainder).Remove(0, remainder);
            sw.Write(s);
        }
        sw.Flush();
    }

    private delegate void BeginWriteDelegate(StreamWriter sw, char[] buffer, int offset, int count);

    /// <summary>
    /// Writes a range of a character buffer to the writer; invoked
    /// asynchronously via <see cref="BeginWriteDelegate"/>.
    /// </summary>
    public void BeginWrite(StreamWriter sw, char[] buffer, int offset, int count)
    {
        sw.Write(buffer, offset, count);
    }
}
/// <summary>
/// Class containing GPGPU code. Methods marked with [Cudafy] are translated
/// to CUDA C by the Cudafy translator and run on the device, so their bodies
/// must stay within the C# subset the translator supports.
/// </summary>
public class GPUConvertCUDA
{
    /// <summary>
    /// Global GPU function that will convert 3 bytes. Each thread reads one
    /// 3-byte group from <paramref name="input"/> and writes the 4 base64
    /// characters for it into <paramref name="output"/> (no '=' padding;
    /// trailing bytes are handled by the host caller).
    /// </summary>
    /// <param name="thread">The thread.</param>
    /// <param name="input">The input.</param>
    /// <param name="output">The output.</param>
    [Cudafy]
    public static void ToBase64String(GThread thread, byte[] input, char[] output)
    {
        // Get the id of the current thread.
        int tid = Gettid(thread);
        // Input id is 3 times the thread id.
        int itid = tid * 3;
        // Output id is 4 times the thread id.
        int otid = tid * 4;
        // Since we always launch a fixed number of threads per block we do not want a thread to try
        // accessing an out of range index.
        // NOTE(review): input.Length is the length of the pre-allocated device
        // buffer, not the active chunk size, so threads beyond the chunk may
        // encode stale buffer bytes; the caller appears to copy back only the
        // valid prefix — confirm against GConvert.ToBase64.
        if (itid + 2 < input.Length)
        {
            byte a0 = 0;
            byte a1 = 0;
            byte a2 = 0;
            byte b0 = 0;
            byte b1 = 0;
            byte b2 = 0;
            byte b3 = 0;
            a0 = input[itid];
            a1 = input[itid + 1];
            a2 = input[itid + 2];
            // Do the bit shuffling that's the core of base64 encoding:
            // split 3 bytes (24 bits) into four 6-bit values b0..b3.
            b0 = (byte)((a0 & 0xfc) >> 2);
            b1 = (byte)(((a0 & 0x03) << 4) + ((a1 & 0xf0) >> 4));
            b2 = (byte)(((a1 & 0x0f) << 2) + ((a2 & 0xc0) >> 6));
            b3 = (byte)(a2 & 0x3f);
            // Set the four output chars by indexing the standard base64
            // alphabet with each 6-bit value. The literal is repeated inline,
            // presumably because the Cudafy translator requires it — confirm
            // before refactoring into a shared constant.
            output[otid] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[b0];
            output[otid + 1] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[b1];
            output[otid + 2] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[b2];
            output[otid + 3] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[b3];
        }
    }

    /// <summary>
    /// Device GPU function that gets the overall id of the thread.
    /// </summary>
    /// <param name="thread">The thread.</param>
    /// <returns>Thread id in terms of position in grid and block.</returns>
    [Cudafy]
    public static int Gettid(GThread thread)
    {
        // Linear global thread index: block offset plus position within block.
        int tid = thread.blockIdx.x * thread.blockDim.x + thread.threadIdx.x;
        return tid;
    }
}
}