Hi,
I have the following CUDA code, which calculates the sum of the squares of the elements of an array.

I am able to measure the clock ticks of the cores via clock() inside the kernel.

Any suggestions on how I can measure the time taken to calculate the sum? (One common approach is sketched after the listing.)

#include <stdio.h>

// The Thrust headers and the square functor are not used by the kernel
// below; they matter only for the transform_reduce alternative sketched
// after this listing.
#include <thrust/version.h>
#include <thrust/generate.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/transform_reduce.h>

#define BLOCK_NUM   32
#define THREAD_NUM  512

template <typename T>
struct square
{
    __host__ __device__
    T operator()(const T &x) const
    {
        return x * x;
    }
};

// Each block accumulates partial sums of squares in shared memory and
// reduces them to one result per block. Thread 0 samples clock() at the
// start and end to record the block's execution span in core clock ticks.
__global__ static void sumOfSquares(int *num, int *result, clock_t *time, int DATA_SIZE)
{
    extern __shared__ int shared[];
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;

    if (tid == 0) time[bid] = clock();

    // Grid-stride loop: each thread sums the squares of its slice of the input.
    shared[tid] = 0;
    for (int i = bid * THREAD_NUM + tid; i < DATA_SIZE; i += BLOCK_NUM * THREAD_NUM) {
        shared[tid] += num[i] * num[i];
    }

    // Tree reduction in shared memory down to shared[0].
    __syncthreads();
    int offset = THREAD_NUM / 2;
    while (offset > 0) {
        if (tid < offset) {
            shared[tid] += shared[tid + offset];
        }
        offset >>= 1;
        __syncthreads();
    }

    if (tid == 0) {
        result[bid] = shared[0];
        time[bid + BLOCK_NUM] = clock();
    }
}

extern "C"
int run_kernel(int array[], int nelements)
{
    int *gpudata, *result;
    clock_t *time;

    cudaMalloc((void **) &gpudata, sizeof(int) * nelements);
    // One partial sum per block is enough; the original code allocated
    // THREAD_NUM * BLOCK_NUM ints, of which only BLOCK_NUM were used.
    cudaMalloc((void **) &result, sizeof(int) * BLOCK_NUM);
    cudaMalloc((void **) &time, sizeof(clock_t) * BLOCK_NUM * 2);
    cudaMemcpy(gpudata, array, sizeof(int) * nelements, cudaMemcpyHostToDevice);

    int sum[BLOCK_NUM];
    sumOfSquares<<<BLOCK_NUM, THREAD_NUM, THREAD_NUM * sizeof(int)>>>(gpudata, result, time, nelements);
    cudaMemcpy(sum, result, sizeof(int) * BLOCK_NUM, cudaMemcpyDeviceToHost);

    // Final reduction of the per-block partial sums on the host.
    int final_sum = 0;
    for (int i = 0; i < BLOCK_NUM; i++) {
        final_sum += sum[i];
    }
    cudaFree(gpudata);
    cudaFree(result);

    clock_t time_used[BLOCK_NUM * 2];
    cudaMemcpy(time_used, time, sizeof(clock_t) * BLOCK_NUM * 2, cudaMemcpyDeviceToHost);
    cudaFree(time);

    // Earliest start and latest end across all blocks. Note that clock()
    // is a per-multiprocessor counter, so this span is only approximate.
    clock_t min_start = time_used[0];
    clock_t max_end = time_used[BLOCK_NUM];
    for (int i = 1; i < BLOCK_NUM; i++) {
        if (min_start > time_used[i])
            min_start = time_used[i];
        if (max_end < time_used[i + BLOCK_NUM])
            max_end = time_used[i + BLOCK_NUM];
    }

    printf("sum (on GPU): %d  (core clocks: %ld)\n", final_sum, (long)(max_end - min_start));

    return final_sum;
}
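For wall-clock time of the kernel itself (as opposed to raw core clock ticks), the usual tool is the CUDA event API. A minimal sketch, assuming it replaces the bare launch inside run_kernel, with gpudata, result, time, and nelements as defined there:

// Sketch: measuring kernel execution time in milliseconds with CUDA events.
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start, 0);   // enqueue 'start' before the kernel
sumOfSquares<<<BLOCK_NUM, THREAD_NUM, THREAD_NUM * sizeof(int)>>>(gpudata, result, time, nelements);
cudaEventRecord(stop, 0);    // enqueue 'stop' after the kernel
cudaEventSynchronize(stop);  // block the host until 'stop' has been reached

float elapsed_ms = 0.0f;
cudaEventElapsedTime(&elapsed_ms, start, stop);  // time between the two events
printf("kernel time: %.3f ms\n", elapsed_ms);

cudaEventDestroy(start);
cudaEventDestroy(stop);

Unlike clock(), which reads a per-multiprocessor cycle counter whose values are not directly comparable across SMs, events measure end-to-end wall-clock time for everything enqueued between them.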

Also, any suggestions for improving the performance? (A Thrust-based alternative is sketched below.)
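Since the Thrust headers and the square functor are already in the listing, one alternative worth benchmarking is thrust::transform_reduce, which fuses the squaring and the reduction into a single pass over the data. A sketch of an equivalent host-side entry point (the function name sum_of_squares_thrust is mine, not part of the original code):

#include <thrust/device_vector.h>
#include <thrust/transform_reduce.h>
#include <thrust/functional.h>

// Same functor as in the listing above.
template <typename T>
struct square
{
    __host__ __device__
    T operator()(const T &x) const { return x * x; }
};

// Sketch: sum of squares via Thrust; squaring and reduction happen
// in one fused pass, with launch parameters chosen by the library.
int sum_of_squares_thrust(const int *array, int nelements)
{
    thrust::device_vector<int> d_vec(array, array + nelements); // host -> device copy
    return thrust::transform_reduce(d_vec.begin(), d_vec.end(),
                                    square<int>(),        // unary op: x -> x*x
                                    0,                    // initial value of the sum
                                    thrust::plus<int>()); // binary reduction op
}

This avoids the hand-written shared-memory reduction and the extra host-side pass over per-block partial sums, and it is a useful baseline to compare the custom kernel against.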