My Cuda kernel code is not working? -


I am trying to write a small piece of code that generates numbers and returns the result in an array, but when I run the code it does not work. I tried to use the Nsight debugger to understand the problem, but it freezes and then closes immediately. I don't know what the problem is.

Could someone please help me understand the problem in my code?

thanks

// Kernel from the question: writes decimal digits derived from the block and
// thread indices into p and q, then copies p into rows of out.
//
//   pf_tmp, qf_tmp : copied into the first i*2 entries of p / q when i > 0
//   pl_tmp, ql_tmp : copied into p / q starting at index len - i*2 when i > 0
//   m              : m[0] is stored in p[i*2+2], m[1] in q[i*2+2]
//   p, q           : digit buffers; the slots written depend only on i, so
//                    every thread writes the same slots
//   i              : offset multiplier; the fixed slots start at index i*2
//   N, n           : not used in the body. The listing spelled both as `n`,
//                    which cannot compile, so the pointer is named N here.
//   out            : rows 0..9 receive p[0..4] from threads whose id < 10
__global__ void mykernel(int* pf_tmp, int* pl_tmp, int* qf_tmp, int* ql_tmp,
                         int m[2], int p[5], int q[5], int i, int* N,
                         int out[10][5], int n)
{
    int id = blockDim.x * blockIdx.x + threadIdx.x;

    int idx = blockIdx.x;
    int idy = blockIdx.y;

    int w = idx / 100;
    int x = idx % 100;
    int y = idy;

    int z = threadIdx.x;

    int len = ((i * 2) + 5);

    // fill pf_tmp & qf_tmp
    if (i > 0) {
        for (int k = 0; k < (i * 2); k++)
        {
            p[k] = pf_tmp[k];
            q[k] = qf_tmp[k];
        }
    }

    // fill x
    if (x > 10)
    {
        p[(i * 2)] = (x - (x % 10)) / 10;
        p[(i * 2) + 1] = x % 10;
    } else {
        p[(i * 2)] = 0;
        p[(i * 2) + 1] = x;
    }

    // fill y
    if (y > 10)
    {
        q[(i * 2)] = (y - (y % 10)) / 10;
        q[(i * 2) + 1] = y % 10;
    } else {
        q[(i * 2)] = 0;
        q[(i * 2) + 1] = y;
    }

    // fill m
    p[(i * 2) + 2] = m[0];
    q[(i * 2) + 2] = m[1];

    // fill w
    if (w > 10)
    {
        p[(i * 2) + 3] = (w - (w % 10)) / 10;
        p[(i * 2) + 4] = w % 10;
    } else {
        p[(i * 2) + 3] = 0;
        p[(i * 2) + 4] = w;
    }

    // fill z
    if (z > 10)
    {
        q[(i * 2) + 3] = (z - (z % 10)) / 10;
        q[(i * 2) + 4] = z % 10;
    } else {
        q[(i * 2) + 3] = 0;
        q[(i * 2) + 4] = z;
    }

    // fill pl_tmp & ql_tmp
    if (i > 0)
    {
        for (int k = 0; k < (i * 2); k++)
        {
            p[(len - (i * 2)) + k] = pl_tmp[k];
            q[(len - (i * 2)) + k] = ql_tmp[k];
        }
    }

    if (id < 10)
    {
        for (int k = 0; k < 5; k++)
            out[id][k] = p[k];
    }
}

int main()
{
    cudaError_t err;

    dim3 blocks(10000, 100);
    dim3 threads(100);

    int m[2] = {4, 5};
    int hst_out[10][5];
    int p[5];
    int q[5];

    // NOTE: this is the code as asked about, kept faulty on purpose. p, q and
    // hst_out are host arrays, yet they are handed to cudaMalloc as if they
    // were pointers, and the sizes (5, 5, 50) are byte counts rather than
    // element counts times sizeof(int). The answer below addresses this.
    err = cudaMalloc((void **)&p, 5);
    err = cudaMalloc((void **)&q, 5);
    err = cudaMalloc((void **)&hst_out, 50);

    mykernel<<<blocks, threads>>>(NULL, NULL, NULL, NULL, m, p, q, 0, NULL, hst_out, 100000000);

    return 0;
}

The error is obvious; it is a basic C programming mistake.

When you declare:

         // Fixed-size arrays with automatic storage, as declared in the question.
         int m[2] = {4, 5};
         int hst_out[10][5];
         int p[5];
         int q[5];

Now hst_out, p, and q are arrays, not pointers, yet later they are used as if they were pointers:

         // Faulty calls quoted from the question: &p, &q and &hst_out are
         // addresses of arrays, and the sizes are plain byte counts.
         err = cudaMalloc((void **)&p, 5);
         err = cudaMalloc((void **)&q, 5);
         err = cudaMalloc((void **)&hst_out, 50);

So you should have declared them as pointers instead, e.g.:

         // Declared as a pointer rather than an array, so its address can be
         // handed to an allocation call that stores a pointer through it.
         int* p;

and used it this way:

         // The size argument is in bytes: room for 5 ints, not 5 bytes.
         err = cudaMalloc((void **)&p, 5 * sizeof(int));

Also notice the size: you requested only 5 bytes, whereas it should have been 5*sizeof(int) bytes.

For more examples, see:

http://cuda-programming.blogspot.sg/2013/03/how-to-avoid-uses-of-cudamalloc-in.html


Comments

Popular posts from this blog

shopping cart - Page redirect not working PHP -

php - How to modify a menu to show sub-menus -

python - Installing PyDev in eclipse is failed -