using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.IO;
using System.Diagnostics;
using System.Threading;
using System.Threading.Tasks;
using Cudafy;
using Cudafy.Host;
using Cudafy.Types;
using Cudafy.Translator;
namespace Hybrid.DSP
{
/// <summary>
/// Contains routines for performing conversions on a GPU via Cudafy.
/// The constructor pre-allocates a fixed pool of device buffers and pinned
/// host staging buffers; call <see cref="Free"/> or <see cref="Dispose"/>
/// when finished with the instance.
/// </summary>
public class GConvert : IDisposable
{
    /// <summary>
    /// Initializes a new instance of the <see cref="GConvert"/> class.
    /// Loads the CUDA module (building and caching it on first use) and
    /// allocates all device and pinned-host buffers used by the converters.
    /// </summary>
    /// <param name="gpu">The gpu to run on.</param>
    public GConvert(GPGPU gpu)
    {
        _gpu = gpu;
        string moduleName = typeof(GPUConvertCUDA).Name;
        // If module is not already loaded try to load from file.
        if (!_gpu.IsModuleLoaded(moduleName))
        {
            var mod = CudafyModule.TryDeserialize(moduleName);
            // If file does not exist or the checksum does not match then re-Cudafy.
            if (mod == null || !mod.TryVerifyChecksums())
            {
                Debug.WriteLine("Cudafying...");
                mod = CudafyTranslator.Cudafy(ePlatform.Auto, eArchitecture.sm_12, typeof(GPUConvertCUDA));
                // Save the module to file for future use.
                mod.Serialize(moduleName);
            }
            _gpu.LoadModule(mod);
        }
        // Instantiate arrays. _dev arrays will ultimately be on the GPU.
        _inArrays_dev = new byte[MAXCHUNKS][];
        _outArrays_dev = new char[MAXCHUNKS][];
        _outArrays = new char[MAXCHUNKS][];
        _inStages = new IntPtr[MAXCHUNKS];
        _outStages = new IntPtr[MAXCHUNKS];
        // Make MAXCHUNKS number of each array. Input is bytes, output is chars.
        // Base64 yields 4 output chars per 3 input bytes, hence the 4/3 sizing.
        for (int c = 0; c < MAXCHUNKS; c++)
        {
            _inArrays_dev[c] = _gpu.Allocate<byte>(MAXCHUNKSIZE);
            _outArrays_dev[c] = _gpu.Allocate<char>((MAXCHUNKSIZE * 4) / 3);
            // HostAllocate returns pinned (page-locked) host memory, required
            // for the asynchronous copies used in ToBase64.
            _inStages[c] = _gpu.HostAllocate<byte>(MAXCHUNKSIZE);
            _outStages[c] = _gpu.HostAllocate<char>((MAXCHUNKSIZE * 4) / 3);
            _outArrays[c] = new char[(MAXCHUNKSIZE * 4) / 3];
        }
    }

    /// <summary>
    /// Releases unmanaged resources and performs other cleanup operations before the
    /// <see cref="GConvert"/> is reclaimed by garbage collection.
    /// </summary>
    ~GConvert()
    {
        // A finalizer must never let an exception escape (it would terminate
        // the process). Free() rethrows on failure, so swallow and log here.
        try
        {
            Free();
        }
        catch (Exception ex)
        {
            Debug.WriteLine(ex.Message);
        }
    }

    /// <summary>
    /// Releases all GPU and pinned host memory and suppresses finalization.
    /// Equivalent to calling <see cref="Free"/>.
    /// </summary>
    public void Dispose()
    {
        Free();
        GC.SuppressFinalize(this);
    }

    /// <summary>
    /// Frees all temporary memory allocated on host and device. Safe to call
    /// multiple times: only the first successful call releases memory.
    /// </summary>
    public void Free()
    {
        try
        {
            lock (_lock)
            {
                if (!_disposed)
                {
                    // NOTE(review): FreeAll/HostFreeAll release every allocation
                    // made through this GPGPU instance, not just this object's
                    // buffers — fine if the GPGPU is not shared; verify callers.
                    _gpu.FreeAll();
                    _gpu.HostFreeAll();
                    // Mark disposed so a later call (or the finalizer) does not
                    // attempt to free the same memory again.
                    _disposed = true;
                }
            }
        }
        catch (Exception ex)
        {
            Debug.WriteLine(ex.Message);
            throw;
        }
    }

    private readonly GPGPU _gpu;
    private readonly object _lock = new object();
    private bool _disposed = false;
    // Maximum bytes handled per GPU launch (a multiple of 3, so full chunks
    // never split a base64 triple).
    private const int MAXCHUNKSIZE = 1024 * 1536;
    // Number of chunks processed in flight before results are drained.
    private const int MAXCHUNKS = 9;
    private IntPtr[] _inStages;        // pinned host input staging buffers
    private IntPtr[] _outStages;       // pinned host output staging buffers
    private byte[][] _inArrays_dev;    // device-side input buffers
    private char[][] _outArrays_dev;   // device-side output buffers
    private char[][] _outArrays;       // managed output buffers for writing

    /// <summary>
    /// Converts the supplied byte array to base64, overlapping host/device
    /// copies, kernel launches and stream writing via per-chunk CUDA streams.
    /// </summary>
    /// <param name="inArray">The input array.</param>
    /// <param name="outStream">The out stream.</param>
    public void ToBase64(byte[] inArray, Stream outStream)
    {
        int totalBytes = inArray.Length;
        int ctr = 0;
        int chunkIndex = 0;
        int threadsPerBlock = 256;
        StreamWriter sw = new StreamWriter(outStream);
        int[] outChunkSizes = new int[MAXCHUNKS]; // keep track of output chunk sizes.
        // NOTE(review): delegate BeginInvoke/EndInvoke is .NET Framework-only;
        // a Task-based rewrite would be needed on .NET Core/.NET 5+.
        BeginWriteDelegate beginWrite = new BeginWriteDelegate(BeginWrite);
        IAsyncResult res = null;
        while (totalBytes > 0)
        {
            // Split into chunks.
            int chunkSize = Math.Min(totalBytes, MAXCHUNKSIZE);
            // NOTE(review): for a final chunk that is not a multiple of 3 this
            // truncates; the trailing remainder bytes are re-encoded on the
            // host after the loop — confirm the overlap is exact.
            outChunkSizes[chunkIndex] = (chunkSize * 4) / 3;
            // Copy the managed array to the pinned unmanaged staging buffer.
            GPGPU.CopyOnHost(inArray, ctr, _inStages[chunkIndex], 0, chunkSize);
            // Copy from pinned buffer to GPU asynchronously (stream id = chunkIndex + 1).
            _gpu.CopyToDeviceAsync(_inStages[chunkIndex], 0, _inArrays_dev[chunkIndex], 0, chunkSize, chunkIndex + 1);
            // Each thread handles 3 input bytes; round blocks up to cover chunkSize.
            int blocksPerGrid = (chunkSize + (threadsPerBlock * 3) - 1) / (threadsPerBlock * 3);
            // Launch the kernel asynchronously on the same stream so it queues
            // behind the copy above.
            _gpu.LaunchAsync(blocksPerGrid, threadsPerBlock, chunkIndex + 1, "ToBase64String", _inArrays_dev[chunkIndex], _outArrays_dev[chunkIndex]);
            // Queue the device-to-host copy on the same stream.
            _gpu.CopyFromDeviceAsync(_outArrays_dev[chunkIndex], 0, _outStages[chunkIndex], 0, outChunkSizes[chunkIndex], chunkIndex + 1);
            // Advance to the next chunk.
            chunkIndex++;
            totalBytes -= chunkSize;
            ctr += chunkSize;
            // Once all in-flight slots are used, drain them in order.
            if (chunkIndex == MAXCHUNKS)
            {
                res = null;
                for (int c = 0; c < MAXCHUNKS; c++)
                {
                    // Synchronize stream c + 1: blocks until its copy/kernel/copy
                    // sequence has completed.
                    _gpu.SynchronizeStream(c + 1);
                    // Copy the pinned buffer into a managed array.
                    GPGPU.CopyOnHost(_outStages[c], 0, _outArrays[c], 0, outChunkSizes[c]);
                    // Wait for the previous asynchronous write to complete.
                    if (res != null)
                        beginWrite.EndInvoke(res);
                    // Begin writing the managed buffer to the stream asynchronously.
                    res = beginWrite.BeginInvoke(sw, _outArrays[c], 0, outChunkSizes[c], null, null);
                }
                chunkIndex = 0; // Reset chunkIndex.
            }
        }
        // Write any remaining chunks to the stream; chunkIndex is the count left.
        for (int c = 0; c < chunkIndex; c++)
        {
            _gpu.SynchronizeStream(c + 1);
            GPGPU.CopyOnHost(_outStages[c], 0, _outArrays[c], 0, outChunkSizes[c]);
            if (res != null)
                beginWrite.EndInvoke(res);
            res = beginWrite.BeginInvoke(sw, _outArrays[c], 0, outChunkSizes[c], null, null);
        }
        // Wait for the last chunk to be written.
        if (res != null)
            beginWrite.EndInvoke(res);
        // If the total byte count was not a multiple of 3, encode the trailing
        // 1-2 bytes on the host and strip the chars the GPU already emitted.
        int remainder = inArray.Length % 3;
        if (remainder != 0)
        {
            string s = Convert.ToBase64String(inArray, inArray.Length - remainder, remainder).Remove(0, remainder);
            sw.Write(s);
        }
        sw.Flush();
    }

    /// <summary>
    /// Converts the supplied byte array to base64 using simple synchronous
    /// copies and launches (no stream overlap); the baseline implementation.
    /// </summary>
    /// <param name="inArray">The input array.</param>
    /// <param name="outStream">The out stream.</param>
    public void ToBase64Naive(byte[] inArray, Stream outStream)
    {
        int totalBytes = inArray.Length;
        int ctr = 0;
        int chunkIndex = 0;
        int threadsPerBlock = 256;
        StreamWriter sw = new StreamWriter(outStream);
        BeginWriteDelegate beginWrite = new BeginWriteDelegate(BeginWrite);
        IAsyncResult res = null;
        while (totalBytes > 0)
        {
            // Split into chunks.
            int chunkSize = Math.Min(totalBytes, MAXCHUNKSIZE);
            int outChunkSize = (chunkSize * 4) / 3;
            // Copy the data to the GPU (blocking).
            _gpu.CopyToDevice(inArray, ctr, _inArrays_dev[chunkIndex], 0, chunkSize);
            // Each thread handles 3 input bytes; round blocks up to cover chunkSize.
            int blocksPerGrid = (chunkSize + (threadsPerBlock * 3) - 1) / (threadsPerBlock * 3);
            // Launch the kernel (blocking launch, default stream).
            _gpu.Launch(blocksPerGrid, threadsPerBlock, "ToBase64String", _inArrays_dev[chunkIndex], _outArrays_dev[chunkIndex]);
            // Copy the result back from the GPU (blocking).
            _gpu.CopyFromDevice(_outArrays_dev[chunkIndex], 0, _outArrays[chunkIndex], 0, outChunkSize);
            // End any pending write.
            if (res != null)
                beginWrite.EndInvoke(res);
            // Begin writing the managed buffer to the stream asynchronously.
            res = beginWrite.BeginInvoke(sw, _outArrays[chunkIndex], 0, outChunkSize, null, null);
            // Advance to the next chunk, cycling through the buffer pool.
            chunkIndex++;
            totalBytes -= chunkSize;
            ctr += chunkSize;
            if (chunkIndex == MAXCHUNKS)
                chunkIndex = 0;
        }
        // Wait for the last chunk to be written.
        if (res != null)
            beginWrite.EndInvoke(res);
        // If the total byte count was not a multiple of 3, encode the trailing
        // 1-2 bytes on the host and strip the chars the GPU already emitted.
        int remainder = inArray.Length % 3;
        if (remainder != 0)
        {
            string s = Convert.ToBase64String(inArray, inArray.Length - remainder, remainder).Remove(0, remainder);
            sw.Write(s);
        }
        sw.Flush();
    }

    private delegate void BeginWriteDelegate(StreamWriter sw, char[] buffer, int offset, int count);

    /// <summary>
    /// Writes a range of a character buffer to the writer; invoked
    /// asynchronously via <see cref="BeginWriteDelegate"/>.
    /// </summary>
    public void BeginWrite(StreamWriter sw, char[] buffer, int offset, int count)
    {
        sw.Write(buffer, offset, count);
    }
}
/// <summary>
/// Class containing GPGPU code. Methods marked with [Cudafy] are translated
/// to CUDA C by the Cudafy translator and run on the device, so their bodies
/// must stay within the C# subset the translator supports.
/// </summary>
public class GPUConvertCUDA
{
    /// <summary>
    /// Global GPU function that will convert 3 bytes. Each thread reads one
    /// 3-byte group from <paramref name="input"/> and writes the 4 base64
    /// characters for it into <paramref name="output"/> (no '=' padding;
    /// trailing bytes are handled by the host caller).
    /// </summary>
    /// <param name="thread">The thread.</param>
    /// <param name="input">The input.</param>
    /// <param name="output">The output.</param>
    [Cudafy]
    public static void ToBase64String(GThread thread, byte[] input, char[] output)
    {
        // Get the id of the current thread.
        int tid = Gettid(thread);
        // Input id is 3 times the thread id.
        int itid = tid * 3;
        // Output id is 4 times the thread id.
        int otid = tid * 4;
        // Since we always launch a fixed number of threads per block we do not want a thread to try
        // accessing an out of range index.
        // NOTE(review): input.Length is the length of the pre-allocated device
        // buffer, not the active chunk size, so threads beyond the chunk may
        // encode stale buffer bytes; the caller appears to copy back only the
        // valid prefix — confirm against GConvert.ToBase64.
        if (itid + 2 < input.Length)
        {
            byte a0 = 0;
            byte a1 = 0;
            byte a2 = 0;
            byte b0 = 0;
            byte b1 = 0;
            byte b2 = 0;
            byte b3 = 0;
            a0 = input[itid];
            a1 = input[itid + 1];
            a2 = input[itid + 2];
            // Do the bit shuffling that's the core of base64 encoding:
            // split 3 bytes (24 bits) into four 6-bit values b0..b3.
            b0 = (byte)((a0 & 0xfc) >> 2);
            b1 = (byte)(((a0 & 0x03) << 4) + ((a1 & 0xf0) >> 4));
            b2 = (byte)(((a1 & 0x0f) << 2) + ((a2 & 0xc0) >> 6));
            b3 = (byte)(a2 & 0x3f);
            // Set the four output chars by indexing the standard base64
            // alphabet with each 6-bit value. The literal is repeated inline,
            // presumably because the Cudafy translator requires it — confirm
            // before refactoring into a shared constant.
            output[otid] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[b0];
            output[otid + 1] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[b1];
            output[otid + 2] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[b2];
            output[otid + 3] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[b3];
        }
    }

    /// <summary>
    /// Device GPU function that gets the overall id of the thread.
    /// </summary>
    /// <param name="thread">The thread.</param>
    /// <returns>Thread id in terms of position in grid and block.</returns>
    [Cudafy]
    public static int Gettid(GThread thread)
    {
        // Linear global thread index: block offset plus position within block.
        int tid = thread.blockIdx.x * thread.blockDim.x + thread.threadIdx.x;
        return tid;
    }
}
}