Hi,
I have the following CUDA code, which calculates the sum of the squares of the elements of an array. I am able to measure the clock ticks spent on the core. Any suggestions on how I can measure the time taken to calculate the sum?
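One idea I had was to convert those core clock ticks into milliseconds using the SM clock rate reported by cudaGetDeviceProperties (clockRate is given in kHz, so ticks divided by clockRate should be milliseconds). This is just an untested sketch that would use the min_start/max_end values computed in run_kernel below. Is that a valid way to do it?

// Untested sketch: convert the measured device clock ticks to milliseconds.
// Assumes cudaDeviceProp::clockRate (the SM clock, in kHz) matches the ticks
// returned by clock() in the kernel.
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);
double elapsed_ms = (double)(max_end - min_start) / prop.clockRate;  // ticks / kHz = ms
printf("kernel time (from clock ticks): %f ms\n", elapsed_ms);

The complete code is below: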
#include <stdio.h>
#include <cutil_inline.h>
#include <thrust/version.h>
#include <thrust/generate.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/transform_reduce.h>
#define BLOCK_NUM 32
#define THREAD_NUM 512
template <typename T>
struct square
{
    __host__ __device__
    T operator() (const T & x) const   // const so the functor can also be used with Thrust algorithms
    {
        return x * x;
    }
};
__global__ static void sumOfSquares(int * num, int * result, clock_t * time, int DATA_SIZE)
{
    extern __shared__ int shared[];
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;
    // thread 0 of each block records the block's start clock
    if (tid == 0) time[bid] = clock();
    shared[tid] = 0;
    // grid-stride loop: each thread accumulates a partial sum of squares
    for (int i = bid * THREAD_NUM + tid; i < DATA_SIZE; i += BLOCK_NUM * THREAD_NUM) {
        shared[tid] += num[i] * num[i];
    }
    __syncthreads();
    // tree reduction in shared memory (THREAD_NUM must be a power of two)
    int offset = THREAD_NUM / 2;
    while (offset > 0) {
        if (tid < offset) {
            shared[tid] += shared[tid + offset];
        }
        offset >>= 1;
        __syncthreads();
    }
    // thread 0 writes the block's partial sum and end clock
    if (tid == 0) {
        result[bid] = shared[0];
        time[bid + BLOCK_NUM] = clock();
    }
}
extern "C"
int run_kernel(int array[],int nelements)
{
int * gpudata, * result;
clock_t * time;
cudaMalloc((void **) &gpudata, sizeof(int) * nelements);
cudaMalloc((void **) &result, sizeof(int) * THREAD_NUM * BLOCK_NUM);
cudaMalloc((void **) &time, sizeof(clock_t) * BLOCK_NUM * 2);
cudaMemcpy(gpudata, array, sizeof(int) * nelements, cudaMemcpyHostToDevice);
int sum[BLOCK_NUM];
sumOfSquares<<<BLOCK_NUM, THREAD_NUM, THREAD_NUM * sizeof(int)>>>(gpudata, result, time, nelements);
cudaMemcpy(&sum, result, sizeof(int) * BLOCK_NUM, cudaMemcpyDeviceToHost);
int final_sum = 0;
for (int i = 0; i < BLOCK_NUM; i++) {
final_sum += sum[i];
}
cudaFree(gpudata);
cudaFree(result);
clock_t time_used[BLOCK_NUM * 2];
cudaMemcpy(&time_used, time, sizeof(clock_t) * BLOCK_NUM * 2, cudaMemcpyDeviceToHost);
cudaFree(time);
clock_t min_start, max_end;
min_start = time_used[0];
max_end = time_used[BLOCK_NUM];
for (int i = 1; i < BLOCK_NUM; i++) {
if (min_start > time_used[i])
min_start = time_used[i];
if (max_end < time_used[i + BLOCK_NUM])
max_end = time_used[i + BLOCK_NUM];
}
printf("sum (on GPU): %d (core clocks: %d)\n", final_sum, max_end - min_start);
return final_sum;
}
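For wall-clock timing on the host side, I was also considering wrapping the kernel launch in CUDA events, roughly like the sketch below (untested; it would replace the plain launch inside run_kernel). Would cudaEventElapsedTime be the right tool here?

// Untested sketch: time the kernel launch with CUDA events.
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
sumOfSquares<<<BLOCK_NUM, THREAD_NUM, THREAD_NUM * sizeof(int)>>>(gpudata, result, time, nelements);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);                       // wait for the kernel and the stop event to finish
float elapsed_ms = 0.0f;
cudaEventElapsedTime(&elapsed_ms, start, stop);   // elapsed time in milliseconds
printf("kernel time (cudaEvent): %f ms\n", elapsed_ms);
cudaEventDestroy(start);
cudaEventDestroy(stop);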
Also, any suggestions for improving the performance?
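In particular, since I already include the Thrust headers and have the square functor defined above, I was wondering whether letting thrust::transform_reduce do the whole sum of squares would be faster than my hand-written kernel. Something like this sketch (not benchmarked):

// Untested sketch: sum of squares with Thrust instead of the custom kernel.
thrust::device_vector<int> d_vec(array, array + nelements);        // copy host data to the device
int thrust_sum = thrust::transform_reduce(d_vec.begin(), d_vec.end(),
                                          square<int>(),           // unary op: x -> x*x
                                          0,                       // initial value of the reduction
                                          thrust::plus<int>());    // binary op: addition

Is that a reasonable alternative to benchmark against, or should a hand-tuned reduction kernel be faster?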