__global__ fmultiply(float* A, float *B, float *C) { int idx = blockIdx.x*blockDim.x + threadIdx.x; //B[idx] gives Garbage Value here.. C[idx] = A[idx]*B[idx]; }
int N = 10; //Array Containing Maximum of 10 elements size_t size = N*sizeof(float); ... cudaMalloc((**void)&a_d, size); cudaMalloc((**void)&b_d, size); cudaMalloc((**void)&c_d, size); ... cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice); cudaMemcpy(b_d, b_h, size, cudaMemcpyHostToDevice); int threadsPerBlock = 256; int noOfBlocks = (N/threadsPerBlock); //Calling Kernel Function fmultiply<<<threadsPerBlock, noOfBlock>>>(a_d, b_d, c_d); cudaMemcpy(c_d, c_h, size, cudaMemcpyDeviceToHost); ...... cudaFree(a_d); cudaFree(b_d); cudaFree(c_d);
b_d
fmultiply(..)
critical section
B
__global__ fmultiply(float* A, float *B, float *C) { int idx = blockIdx.x*blockDim.x + threadIdx.x; if(idx<N) C[idx] = A[idx]*B[idx]; }
var
This content, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)