Click here to Skip to main content
15,892,480 members
Articles / Programming Languages / CUDA

Base64 Encoding on a GPU

Rate me:
Please Sign up or sign in to vote.
4.89/5 (26 votes)
16 Sep 2013 · LGPL3 · 10 min read · 64K · 1.8K · 56
Performing base64 encoding on a graphics processing unit using CUDAfy.NET (CUDA in .NET).
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.IO;
using System.Diagnostics;
using System.Threading;
using System.Threading.Tasks;
using Cudafy;
using Cudafy.Host;
using Cudafy.Types;
using Cudafy.Translator;

namespace Hybrid.DSP
{
    /// <summary>
    /// Contains routines for performing conversions on a GPU.
    /// </summary>
    /// <remarks>
    /// An instance owns a fixed set of device buffers, host staging buffers and managed
    /// output arrays that are reused by every conversion call. Because those buffers are
    /// shared, the conversion methods must not be called concurrently on one instance.
    /// Call <see cref="Dispose"/> (or <see cref="Free"/>) when finished.
    /// </remarks>
    public class GConvert : IDisposable
    {
        /// <summary>
        /// Initializes a new instance of the <see cref="GConvert"/> class.
        /// Loads (or generates and caches) the GPU module and allocates all working buffers.
        /// </summary>
        /// <param name="gpu">The gpu.</param>
        /// <exception cref="ArgumentNullException"><paramref name="gpu"/> is null.</exception>
        public GConvert(GPGPU gpu)
        {
            if (gpu == null)
            {
                throw new ArgumentNullException("gpu");
            }
            _gpu = gpu;

            string moduleName = typeof(GPUConvertCUDA).Name;
            // If module is not already loaded try to load from file.
            if (!_gpu.IsModuleLoaded(moduleName))
            {
                var mod = CudafyModule.TryDeserialize(moduleName);
                // If file does not exist or the checksum does not match then re-Cudafy.
                if (mod == null || !mod.TryVerifyChecksums())
                {
                    Debug.WriteLine("Cudafying...");
                    mod = CudafyTranslator.Cudafy(ePlatform.Auto, eArchitecture.sm_12, typeof(GPUConvertCUDA));
                    // Save the module to file for future use.
                    mod.Serialize(moduleName);
                }
                _gpu.LoadModule(mod);
            }

            // Instantiate arrays. _dev arrays will ultimately be on the GPU.
            _inArrays_dev = new byte[MAXCHUNKS][];
            _outArrays_dev = new char[MAXCHUNKS][];
            _outArrays = new char[MAXCHUNKS][];
            _inStages = new IntPtr[MAXCHUNKS];
            _outStages = new IntPtr[MAXCHUNKS];

            // Make MAXCHUNKS number of each array. Input is bytes, output is chars.
            // The output array will be 4/3 the size of the input.
            for (int c = 0; c < MAXCHUNKS; c++)
            {
                _inArrays_dev[c] = _gpu.Allocate<byte>(MAXCHUNKSIZE);
                _outArrays_dev[c] = _gpu.Allocate<char>(MAXOUTCHUNKSIZE);
                _inStages[c] = _gpu.HostAllocate<byte>(MAXCHUNKSIZE);
                _outStages[c] = _gpu.HostAllocate<char>(MAXOUTCHUNKSIZE);
                _outArrays[c] = new char[MAXOUTCHUNKSIZE];
            }
        }

        /// <summary>
        /// Releases unmanaged resources and performs other cleanup operations before the
        /// <see cref="GConvert"/> is reclaimed by garbage collection.
        /// </summary>
        ~GConvert()
        {
            // An exception escaping a finalizer terminates the process, so clean-up
            // failures are logged and swallowed here.
            try
            {
                Free();
            }
            catch (Exception ex)
            {
                Debug.WriteLine(ex.Message);
            }
        }

        /// <summary>
        /// Releases the host and device memory held by this instance and suppresses finalization.
        /// </summary>
        public void Dispose()
        {
            Free();
            GC.SuppressFinalize(this);
        }

        /// <summary>
        /// Frees all temporary memory allocated on host and device.
        /// Safe to call more than once; only the first call attempts to free memory.
        /// </summary>
        /// <remarks>
        /// NOTE(review): this calls <c>FreeAll</c> and <c>HostFreeAll</c> on the supplied GPGPU
        /// instance, which presumably also releases allocations made by other users of that
        /// instance - confirm against the CUDAfy documentation.
        /// </remarks>
        public void Free()
        {
            try
            {
                lock (_lock)
                {
                    if (!_disposed)
                    {
                        // Mark first so that the memory is never freed twice, e.g. by an
                        // explicit Free() followed later by the finalizer.
                        _disposed = true;
                        // _gpu is null only if the constructor rejected its argument.
                        if (_gpu != null)
                        {
                            _gpu.FreeAll();
                            _gpu.HostFreeAll();
                        }
                    }
                }
            }
            catch (Exception ex)
            {
                Debug.WriteLine(ex.Message);
                throw;
            }
        }

        private readonly GPGPU _gpu;

        private readonly object _lock = new object();

        private bool _disposed = false;

        // Maximum number of input bytes handled per chunk.
        private const int MAXCHUNKSIZE = 1024 * 1536;

        // Number of output chars needed for a full chunk (4 chars per 3 bytes).
        private const int MAXOUTCHUNKSIZE = (MAXCHUNKSIZE * 4) / 3;

        // Number of chunks (and GPU streams) that are in flight at once.
        private const int MAXCHUNKS = 9;

        // Threads per block used for every kernel launch.
        private const int THREADSPERBLOCK = 256;

        private readonly IntPtr[] _inStages;

        private readonly IntPtr[] _outStages;

        private readonly byte[][] _inArrays_dev;

        private readonly char[][] _outArrays_dev;

        private readonly char[][] _outArrays;

        /// <summary>
        /// Converts the supplied byte array to base64.
        /// Chunks are queued on separate GPU streams so that copies and kernel launches can
        /// overlap; results are written to the stream in order.
        /// </summary>
        /// <param name="inArray">The input array.</param>
        /// <param name="outStream">The out stream. It is flushed but left open.</param>
        /// <exception cref="ArgumentNullException">An argument is null.</exception>
        /// <exception cref="ObjectDisposedException">The instance has been freed.</exception>
        public void ToBase64(byte[] inArray, Stream outStream)
        {
            if (inArray == null)
            {
                throw new ArgumentNullException("inArray");
            }
            if (outStream == null)
            {
                throw new ArgumentNullException("outStream");
            }
            ThrowIfDisposed();

            int totalBytes = inArray.Length;
            int ctr = 0;
            int chunkIndex = 0;

            // The writer is deliberately not disposed: that would close the caller's stream.
            StreamWriter sw = new StreamWriter(outStream);
            int[] outChunkSizes = new int[MAXCHUNKS]; // keep track of output chunk sizes.
            BeginWriteDelegate beginWrite = new BeginWriteDelegate(BeginWrite);
            IAsyncResult res = null;
            while (totalBytes > 0)
            {
                // Split into chunks
                int chunkSize = Math.Min(totalBytes, MAXCHUNKSIZE);
                outChunkSizes[chunkIndex] = (chunkSize * 4) / 3;
                int streamId = chunkIndex + 1;

                // Copy the managed array to unmanaged buffer.
                GPGPU.CopyOnHost(inArray, ctr, _inStages[chunkIndex], 0, chunkSize);
                // Copy the data from unmanaged buffer to GPU asynchronously.
                _gpu.CopyToDeviceAsync(_inStages[chunkIndex], 0, _inArrays_dev[chunkIndex], 0, chunkSize, streamId);
                // Launch the function ToBase64String asynchronously (same stream id as previous GPU command - they are in same queue).
                _gpu.LaunchAsync(BlocksPerGrid(chunkSize), THREADSPERBLOCK, streamId, "ToBase64String", _inArrays_dev[chunkIndex], _outArrays_dev[chunkIndex]);
                // Copy the data from GPU to an unmanaged buffer asynchronously (same stream id as previous GPU command - they are in same queue).
                _gpu.CopyFromDeviceAsync(_outArrays_dev[chunkIndex], 0, _outStages[chunkIndex], 0, outChunkSizes[chunkIndex], streamId);

                // Increment the chunkIndex, decrement totalBytes by chunkSize and increase our offset counter.
                chunkIndex++;
                totalBytes -= chunkSize;
                ctr += chunkSize;
                // Once every buffer is in use, drain them all to the stream before reusing them.
                if (chunkIndex == MAXCHUNKS)
                {
                    // The pending write from the previous batch is passed along (not discarded)
                    // so that it is completed before the next write to the same writer starts.
                    res = FlushChunks(chunkIndex, outChunkSizes, sw, beginWrite, res);
                    chunkIndex = 0; // Reset chunkIndex.
                }
            }
            // Write any remaining chunks to the stream. The number remaining will be equal to chunkIndex.
            res = FlushChunks(chunkIndex, outChunkSizes, sw, beginWrite, res);
            // Wait for last chunk to be written.
            if (res != null)
            {
                beginWrite.EndInvoke(res);
            }
            WriteRemainder(inArray, sw);
            sw.Flush();
        }

        /// <summary>
        /// Converts the supplied byte array to base64 using blocking copies and launches,
        /// one chunk at a time.
        /// </summary>
        /// <param name="inArray">The input array.</param>
        /// <param name="outStream">The out stream. It is flushed but left open.</param>
        /// <exception cref="ArgumentNullException">An argument is null.</exception>
        /// <exception cref="ObjectDisposedException">The instance has been freed.</exception>
        public void ToBase64Naive(byte[] inArray, Stream outStream)
        {
            if (inArray == null)
            {
                throw new ArgumentNullException("inArray");
            }
            if (outStream == null)
            {
                throw new ArgumentNullException("outStream");
            }
            ThrowIfDisposed();

            int totalBytes = inArray.Length;
            int ctr = 0;
            int chunkIndex = 0;
            // The writer is deliberately not disposed: that would close the caller's stream.
            StreamWriter sw = new StreamWriter(outStream);
            BeginWriteDelegate beginWrite = new BeginWriteDelegate(BeginWrite);
            IAsyncResult res = null;
            while (totalBytes > 0)
            {
                // Split into chunks
                int chunkSize = Math.Min(totalBytes, MAXCHUNKSIZE);
                int outChunkSize = (chunkSize * 4) / 3;

                // Copy the data to GPU
                _gpu.CopyToDevice(inArray, ctr, _inArrays_dev[chunkIndex], 0, chunkSize);
                // Launch the function ToBase64String.
                _gpu.Launch(BlocksPerGrid(chunkSize), THREADSPERBLOCK, "ToBase64String", _inArrays_dev[chunkIndex], _outArrays_dev[chunkIndex]);
                // Copy the data from GPU
                _gpu.CopyFromDevice(_outArrays_dev[chunkIndex], 0, _outArrays[chunkIndex], 0, outChunkSize);

                // End any pending write
                if (res != null)
                {
                    beginWrite.EndInvoke(res);
                }
                // Begin writing the managed buffer to the stream asynchronously.
                res = beginWrite.BeginInvoke(sw, _outArrays[chunkIndex], 0, outChunkSize, null, null);

                // Increment the chunkIndex, decrement totalBytes by chunkSize and increase our offset counter.
                chunkIndex++;
                totalBytes -= chunkSize;
                ctr += chunkSize;
                if (chunkIndex == MAXCHUNKS)
                {
                    chunkIndex = 0;
                }
            }
            // Wait for last chunk to be written.
            if (res != null)
            {
                beginWrite.EndInvoke(res);
            }
            WriteRemainder(inArray, sw);
            sw.Flush();
        }

        private delegate void BeginWriteDelegate(StreamWriter sw, char[] buffer, int offset, int count);

        /// <summary>
        /// Writes a section of a character buffer to the writer. Used as the target of the
        /// asynchronous write delegate.
        /// </summary>
        /// <param name="sw">The writer.</param>
        /// <param name="buffer">The buffer to write from.</param>
        /// <param name="offset">Index of the first char to write.</param>
        /// <param name="count">Number of chars to write.</param>
        public void BeginWrite(StreamWriter sw, char[] buffer, int offset, int count)
        {
            sw.Write(buffer, offset, count);
        }

        /// <summary>
        /// Number of blocks needed so that, with each thread handling 3 bytes, the whole chunk is covered.
        /// </summary>
        private static int BlocksPerGrid(int chunkSize)
        {
            int bytesPerBlock = THREADSPERBLOCK * 3;
            return (chunkSize + bytesPerBlock - 1) / bytesPerBlock;
        }

        /// <summary>
        /// Waits for the first <paramref name="count"/> streams in turn, copies each result to its
        /// managed array and starts writing it, finishing the previous write before starting the next.
        /// </summary>
        /// <returns>The write that is still pending on return, or <paramref name="pending"/> if nothing was written.</returns>
        private IAsyncResult FlushChunks(int count, int[] outChunkSizes, StreamWriter sw, BeginWriteDelegate beginWrite, IAsyncResult pending)
        {
            for (int c = 0; c < count; c++)
            {
                // Synchronize stream with index c + 1. This will block.
                _gpu.SynchronizeStream(c + 1);
                // Copy the unmanaged buffer to a managed array.
                GPGPU.CopyOnHost(_outStages[c], 0, _outArrays[c], 0, outChunkSizes[c]);
                // Wait for previous invoke to complete.
                if (pending != null)
                {
                    beginWrite.EndInvoke(pending);
                }
                // Begin writing the managed buffer to the stream asynchronously.
                pending = beginWrite.BeginInvoke(sw, _outArrays[c], 0, outChunkSizes[c], null, null);
            }
            return pending;
        }

        /// <summary>
        /// If the input length is not a multiple of 3, encodes the trailing 1 or 2 bytes on the CPU.
        /// </summary>
        /// <remarks>
        /// The chunk output length, (chunkSize * 4) / 3, already includes the first
        /// <c>remainder</c> characters of the final partial group, so that many leading
        /// characters are removed from the CPU-encoded tail before it is written.
        /// </remarks>
        private static void WriteRemainder(byte[] inArray, StreamWriter sw)
        {
            int remainder = inArray.Length % 3;
            if (remainder != 0)
            {
                string s = Convert.ToBase64String(inArray, inArray.Length - remainder, remainder).Remove(0, remainder);
                sw.Write(s);
            }
        }

        /// <summary>
        /// Throws if the buffers have already been released by <see cref="Free"/>.
        /// </summary>
        private void ThrowIfDisposed()
        {
            lock (_lock)
            {
                if (_disposed)
                {
                    throw new ObjectDisposedException(typeof(GConvert).Name);
                }
            }
        }
    }

    /// <summary>
    /// Holds the routines that are marked for translation by CUDAfy.
    /// </summary>
    public class GPUConvertCUDA
    {
        /// <summary>
        /// Global GPU function: each thread encodes one group of 3 input bytes
        /// into 4 base64 characters.
        /// </summary>
        /// <param name="thread">The thread.</param>
        /// <param name="input">The bytes to encode.</param>
        /// <param name="output">Receives the encoded characters.</param>
        [Cudafy]
        public static void ToBase64String(GThread thread, byte[] input, char[] output)
        {
            // Work out which 3-byte group / 4-char group belongs to this thread.
            int index = Gettid(thread);
            int src = index * 3;
            int dst = index * 4;

            // A fixed number of threads is launched per block, so threads whose
            // group would run past the end of the input simply do nothing.
            if (src + 2 >= input.Length)
            {
                return;
            }

            byte first = input[src];
            byte second = input[src + 1];
            byte third = input[src + 2];

            // Split the 24 input bits into four 6-bit values.
            byte s0 = (byte)((first & 0xfc) >> 2);
            byte s1 = (byte)(((first & 0x03) << 4) | ((second & 0xf0) >> 4));
            byte s2 = (byte)(((second & 0x0f) << 2) | ((third & 0xc0) >> 6));
            byte s3 = (byte)(third & 0x3f);

            // Each 6-bit value indexes into the base64 alphabet.
            output[dst] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[s0];
            output[dst + 1] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[s1];
            output[dst + 2] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[s2];
            output[dst + 3] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[s3];
        }

        /// <summary>
        /// Device GPU function that gets the overall id of the thread.
        /// </summary>
        /// <param name="thread">The thread.</param>
        /// <returns>Block index times block size, plus the thread's index within its block (x dimension).</returns>
        [Cudafy]
        public static int Gettid(GThread thread)
        {
            return thread.blockIdx.x * thread.blockDim.x + thread.threadIdx.x;
        }
    }
}

By viewing downloads associated with this article you agree to the Terms of Service and the article's licence.

If a file you wish to view isn't highlighted, and is a text file (not binary), please let us know and we'll add colourisation support for it.

License

This article, along with any associated source code and files, is licensed under The GNU Lesser General Public License (LGPLv3)


Written By
Systems Engineer Hybrid DSP Systems
Netherlands Netherlands
Nick is co-owner of Hybrid DSP, a company specializing in high-speed data acquisition, processing and storage.

CUDAfy.NET took considerable effort to develop and we ask nothing in return from users of the LGPL library other than that you please consider donating to Harmony through Education. This small charity helps handicapped children in developing countries by providing suitable schooling.

Comments and Discussions