Can anyone help me in doing matrix addition in Cuda C.Matrix should be square as well as non square and block dimension should be 2D.Here is the code what i have done.But it wont work for matrix above 2*2 matrix.Can anyone help me in solving this.....

#include <iostream>
#include <cuda.h>
#define blocksize 16
texture<float,> texVecA;
texture<float,> texVecB;
__constant__ int x;
__constant__ int y;
__global__ void MatrixAdd_d(float *C)
{
int N=x*y;
int i = blockIdx.x*blockDim.x + threadIdx.x;
int j = blockIdx.y*blockDim.y + threadIdx.y;
int index = i*N + j;
float flValA = tex1Dfetch(texVecA, index);
float flValB = tex1Dfetch(texVecB, index);
if(i<n mode="hold" /> {
C[index]=flValA +flValB;
}
}
int main()
{
float *a_h, *b_h, *c_h; float *a_d, *b_d, *c_d; int n,m, i, j, index;
printf("Enter dimension of matrix\n");
scanf("%d%d",&n,&m);
int N=m*n;
a_h = (float *)malloc(sizeof(float)*n*m);
b_h = (float *)malloc(sizeof(float)*n*m);
c_h = (float *)malloc(sizeof(float)*n*m);
cudaMalloc((void **)&a_d,m*n*sizeof(float));
cudaMalloc((void **)&b_d,m*n*sizeof(float));
cudaMalloc((void **)&c_d,m*n*sizeof(float));
printf("Enter elements of first Matrix:\n");
for(int i=0;i<n;i++)>
{
for(int j=0;j<m;j++)>
{
scanf("%f",&a_h[i * m + j]);
}
}
printf("Enter elements of second matrix:\n");
for(int i=0;i<n;i++)>
{
for(int j=0;j<m;j++)>
{
scanf_s("%f",&b_h[i * m + j]);
}
}
cudaMemcpy(a_d,a_h,N*sizeof(float),cudaMemcpyHostToDevice);
cudaMemcpy(b_d,b_h,N*sizeof(float),cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(x, &m, sizeof(int), 0);
cudaMemcpyToSymbol(y, &n, sizeof(int), 0);
cudaBindTexture(0, texVecA, a_d, (N * sizeof(float)));
cudaBindTexture(0, texVecB, b_d, (N * sizeof(float)));
dim3 dimBlock( blocksize, blocksize );
dim3 dimGrid( ceil(float(n)/float(dimBlock.x)), ceil(float(n)/float(dimBlock.y)) );
MatrixAdd_d<<<dimgrid,>>>(c_d);
cudaMemcpy(c_h,c_d,N*sizeof(float),cudaMemcpyDeviceToHost);
cudaThreadSynchronize();
for(j=0;j<n;j++)>
{
for(i=0;i<m;i++)>
{
index = j*m+i;
printf("A + B = C: %d %d %f + %f = %f\n",i,j,a_h[index],b_h[index],c_h[index]);
}
}
cudaUnbindTexture(texVecA);
cudaUnbindTexture(texVecB);
free(a_h);
free(b_h);
free(c_h);
cudaFree(a_d);
cudaFree(b_d);
cudaFree(c_d);
return(0);
}</cuda.h></iostream>