Hi,
I have the following CUDA code, which calculates the sum of the squares of the array elements.

I am able to measure the clock ticks of the core.

Any suggestions on how I can measure the time taken to calculate the sum?

```
#include <stdio.h>
#include <cutil_inline.h>

#include <thrust/version.h>

#include <thrust/generate.h>

#include <thrust/host_vector.h>

#include <thrust/device_vector.h>

#include <thrust/functional.h>

#include <thrust/transform_reduce.h>

#define BLOCK_NUM	32

// Unary functor that maps a value to its square.
// Callable from both host and device code; T only needs to support operator*.
template <typename T>
struct square
{
    __host__ __device__
    T operator() (T x)
    {
        const T squared = x * x;
        return squared;
    }
};

// Computes per-block partial sums of num[i] * num[i] for i in [0, DATA_SIZE).
//
// Launch requirements:
//   - 1D grid, 1D block.
//   - blockDim.x must be a power of two (the tree reduction halves the
//     active range each step and would drop elements otherwise).
//   - Dynamic shared memory: blockDim.x * sizeof(int) bytes.
//
// Parameters:
//   num       - device pointer to DATA_SIZE input ints.
//   result    - device pointer to at least gridDim.x ints; result[b] receives
//               the partial sum produced by block b.
//   time      - device pointer to at least 2 * gridDim.x clock_t values;
//               time[b] holds block b's start clock() reading and
//               time[b + gridDim.x] its end reading.
//   DATA_SIZE - number of elements in num.
//
// Arithmetic is plain int, so large inputs can overflow exactly as int
// multiplication/addition does.
__global__ static void sumOfSquares(int * num, int * result, clock_t * time,int DATA_SIZE)
{
    extern __shared__ int shared[];

    const int tid = threadIdx.x;
    const int bid = blockIdx.x;
    // Use the actual launch configuration instead of compile-time macros so
    // the kernel indexes consistently with however it was launched.
    const int threads = blockDim.x;
    const int blocks = gridDim.x;
    const int stride = blocks * threads;

    if (tid == 0) time[bid] = clock();

    // Grid-stride loop: each thread accumulates its share in a register and
    // writes to shared memory once.
    int partial = 0;
    for (int i = bid * threads + tid; i < DATA_SIZE; i += stride) {
        partial += num[i] * num[i];
    }
    shared[tid] = partial;

    // Every thread's partial must be visible before any thread reads a
    // neighbour's slot in the reduction below.
    __syncthreads();

    // Tree reduction in shared memory. The loop condition is uniform across
    // the block, so the barrier is reached by all threads on every iteration.
    for (int offset = threads / 2; offset > 0; offset >>= 1) {
        if (tid < offset) {
            shared[tid] += shared[tid + offset];
        }
        __syncthreads();
    }

    if (tid == 0) {
        result[bid] = shared[0];
        time[bid + blocks] = clock();
    }
}

// NOTE(review): THREAD_NUM is used by this file but no definition is visible
// here. The fallback below only applies if it is not defined elsewhere; 256 is
// an assumed value (must be a power of two for the kernel's reduction) —
// confirm against the intended configuration.
#ifndef THREAD_NUM
#define THREAD_NUM 256
#endif

// Reports a failed CUDA runtime call on stderr.
// Returns true when status is cudaSuccess, false otherwise.
static bool runKernelCudaOk(cudaError_t status, const char * what)
{
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "run_kernel: %s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Computes the sum of squares of array[0 .. nelements-1] on the GPU.
//
// Copies the input to the device, launches sumOfSquares with BLOCK_NUM blocks
// of THREAD_NUM threads, adds up the per-block partial sums on the host and
// prints the sum together with the span of device clock ticks between the
// earliest block start and the latest block end.
//
// Parameters:
//   array     - host pointer to nelements ints.
//   nelements - number of elements; must be non-negative.
//
// Returns the sum (plain int arithmetic). If nelements is negative or any
// CUDA call fails, a diagnostic is written to stderr, nothing is printed to
// stdout and 0 is returned.
//
// NOTE: the printed figure is in device clock cycles, not seconds. For
// wall-clock timing, bracket the launch with cudaEventRecord and read the
// interval with cudaEventElapsedTime.
extern "C"
int run_kernel(int array[],int nelements)
{
    if (nelements < 0) {
        // A negative count would otherwise wrap to a huge size_t below.
        fprintf(stderr, "run_kernel: invalid element count %d\n", nelements);
        return 0;
    }

    int * gpudata = NULL;
    int * result = NULL;
    clock_t * d_time = NULL;

    int sum[BLOCK_NUM] = {0};
    clock_t time_used[BLOCK_NUM * 2] = {0};

    // The kernel writes one partial sum per block, so BLOCK_NUM ints suffice.
    bool ok =
        runKernelCudaOk(cudaMalloc((void **) &gpudata, sizeof(int) * nelements),
                        "cudaMalloc(gpudata)") &&
        runKernelCudaOk(cudaMalloc((void **) &result, sizeof(int) * BLOCK_NUM),
                        "cudaMalloc(result)") &&
        runKernelCudaOk(cudaMalloc((void **) &d_time, sizeof(clock_t) * BLOCK_NUM * 2),
                        "cudaMalloc(time)") &&
        runKernelCudaOk(cudaMemcpy(gpudata, array, sizeof(int) * nelements,
                                   cudaMemcpyHostToDevice),
                        "cudaMemcpy(host -> device)");

    if (ok) {
        sumOfSquares<<<BLOCK_NUM, THREAD_NUM, THREAD_NUM * sizeof(int)>>>(gpudata, result, d_time, nelements);

        // Launch-configuration errors only show up via cudaGetLastError();
        // execution errors surface at the blocking copies that follow.
        ok = runKernelCudaOk(cudaGetLastError(), "sumOfSquares launch") &&
             runKernelCudaOk(cudaMemcpy(sum, result, sizeof(int) * BLOCK_NUM,
                                        cudaMemcpyDeviceToHost),
                             "cudaMemcpy(result -> host)") &&
             runKernelCudaOk(cudaMemcpy(time_used, d_time, sizeof(clock_t) * BLOCK_NUM * 2,
                                        cudaMemcpyDeviceToHost),
                             "cudaMemcpy(time -> host)");
    }

    // Release device memory on every path; cudaFree(NULL) is a no-op.
    cudaFree(gpudata);
    cudaFree(result);
    cudaFree(d_time);

    if (!ok) {
        return 0;
    }

    int final_sum = 0;
    for (int i = 0; i < BLOCK_NUM; i++) {
        final_sum += sum[i];
    }

    // time_used[0 .. BLOCK_NUM-1] are block start ticks,
    // time_used[BLOCK_NUM .. 2*BLOCK_NUM-1] are block end ticks.
    clock_t min_start = time_used[0];
    clock_t max_end = time_used[BLOCK_NUM];
    for (int i = 1; i < BLOCK_NUM; i++) {
        if (time_used[i] < min_start) {
            min_start = time_used[i];
        }
        if (time_used[i + BLOCK_NUM] > max_end) {
            max_end = time_used[i + BLOCK_NUM];
        }
    }

    // clock_t is not int; cast explicitly so the format specifier matches.
    printf("sum (on GPU): %d  (core clocks: %ld)\n", final_sum, (long) (max_end - min_start));

    return final_sum;
}
```

Also, any suggestions for improving the performance?
