CUDA 2d Array Mapping -


#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>

/*
 * Evaluate a CUDA runtime call and abort with file/line and the runtime's
 * error string if it did not return cudaSuccess.
 */
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

/* Fill ip[0..size-1] with the values 0, 1, ..., size-1. */
void initialint(int *ip, int size)
{
    for (int i = 0; i < size; i++)
        ip[i] = i;
}

/*
 * Print a row-major matrix of nx columns by ny rows stored in c.
 * Element (column j, row i) is read from c[j + nx * i].
 */
void printmatrix(int *c, const int nx, const int ny)
{
    int *ic = c;
    printf("\n matrix: (%d.%d) \n", nx, ny);
    for (int i = 0; i < ny; i++) {
        for (int j = 0; j < nx; j++) {
            printf("%3d", ic[j + nx * i]);
        }
        printf("\n");
    }
    printf("\n");
}

/*
 * Each thread prints its thread index, block index, global (ix, iy)
 * coordinate, the linear offset idx = ix + iy * nx, and the value a[idx].
 *
 * Expects a 2D grid of 2D blocks. Threads whose coordinate falls outside the
 * nx-by-ny matrix return without touching a, so the grid may over-cover the
 * matrix.
 */
__global__ void printthreadindex(int *a, const int nx, const int ny)
{
    int ix = threadIdx.x + blockIdx.x * blockDim.x;
    int iy = threadIdx.y + blockIdx.y * blockDim.y;

    /* Guard against grids that do not divide the matrix evenly. */
    if (ix >= nx || iy >= ny)
        return;

    /* Row-major offset: iy selects the row (nx elements each), ix the column. */
    unsigned int idx = ix + iy * nx;

    printf("thread_id (%d,%d) block_id (%d,%d) coordinate (%d,%d) global index %2d  ival %2d \n",
           threadIdx.x, threadIdx.y, blockIdx.x, blockIdx.y, ix, iy, idx, a[idx]);
}

int main()
{
    int nx = 8, ny = 6;
    int nxy = nx * ny;
    /* The buffer holds int elements, so size it with sizeof(int). */
    size_t nbytes = (size_t)nxy * sizeof(int);

    int *h_a = (int *)malloc(nbytes);
    if (h_a == NULL) {
        fprintf(stderr, "host allocation of %lu bytes failed\n",
                (unsigned long)nbytes);
        return EXIT_FAILURE;
    }

    initialint(h_a, nxy);
    printmatrix(h_a, nx, ny);

    int *d_mata = NULL;
    CUDA_CHECK(cudaMalloc((void **)&d_mata, nbytes));
    CUDA_CHECK(cudaMemcpy(d_mata, h_a, nbytes, cudaMemcpyHostToDevice));

    dim3 block(4, 2);
    /* Ceil-divide so the grid always covers the matrix; yields (2,3) here. */
    dim3 grid((nx + block.x - 1) / block.x, (ny + block.y - 1) / block.y);

    printthreadindex<<<grid, block>>>(d_mata, nx, ny);
    /* A launch returns no status; launch-configuration errors show up here. */
    CUDA_CHECK(cudaGetLastError());
    /* Wait for the kernel so its printf output is flushed and faults surface. */
    CUDA_CHECK(cudaDeviceSynchronize());

    CUDA_CHECK(cudaFree(d_mata));
    free(h_a);

#ifdef _WIN32
    /* Keep the console window open when started from the IDE (Windows only). */
    system("pause");
#endif
    return 0;
}

output:

matrix: (8.6)   0  1  2  3  4  5  6  7   8  9 10 11 12 13 14 15  16 17 18 19 20 21 22 23  24 25 26 27 28 29 30 31  32 33 34 35 36 37 38 39  40 41 42 43 44 45 46 47  thread_id (0,0) block_id (1,0) coordinate (4,0) global index  4  ival  4 thread_id (1,0) block_id (1,0) coordinate (5,0) global index  5  ival  5 thread_id (2,0) block_id (1,0) coordinate (6,0) global index  6  ival  6 thread_id (3,0) block_id (1,0) coordinate (7,0) global index  7  ival  7 thread_id (0,1) block_id (1,0) coordinate (4,1) global index 12  ival 12 thread_id (1,1) block_id (1,0) coordinate (5,1) global index 13  ival 13 thread_id (2,1) block_id (1,0) coordinate (6,1) global index 14  ival 14 thread_id (3,1) block_id (1,0) coordinate (7,1) global index 15  ival 15 thread_id (0,0) block_id (1,1) coordinate (4,2) global index 20  ival 20 thread_id (1,0) block_id (1,1) coordinate (5,2) global index 21  ival 21 thread_id (2,0) block_id (1,1) coordinate (6,2) global index 22  ival 22 thread_id (3,0) block_id (1,1) coordinate (7,2) global index 23  ival 23 thread_id (0,1) block_id (1,1) coordinate (4,3) global index 28  ival 28 thread_id (1,1) block_id (1,1) coordinate (5,3) global index 29  ival 29 thread_id (2,1) block_id (1,1) coordinate (6,3) global index 30  ival 30 thread_id (3,1) block_id (1,1) coordinate (7,3) global index 31  ival 31 thread_id (0,0) block_id (0,2) coordinate (0,4) global index 32  ival 32 thread_id (1,0) block_id (0,2) coordinate (1,4) global index 33  ival 33 thread_id (2,0) block_id (0,2) coordinate (2,4) global index 34  ival 34 thread_id (3,0) block_id (0,2) coordinate (3,4) global index 35  ival 35 thread_id (0,1) block_id (0,2) coordinate (0,5) global index 40  ival 40 thread_id (1,1) block_id (0,2) coordinate (1,5) global index 41  ival 41 thread_id (2,1) block_id (0,2) coordinate (2,5) global index 42  ival 42 thread_id (3,1) block_id (0,2) coordinate (3,5) global index 43  ival 43 thread_id (0,0) block_id (1,2) coordinate (4,4) global index 36  ival 36 thread_id 
(1,0) block_id (1,2) coordinate (5,4) global index 37  ival 37 thread_id (2,0) block_id (1,2) coordinate (6,4) global index 38  ival 38 thread_id (3,0) block_id (1,2) coordinate (7,4) global index 39  ival 39 thread_id (0,1) block_id (1,2) coordinate (4,5) global index 44  ival 44 thread_id (1,1) block_id (1,2) coordinate (5,5) global index 45  ival 45 thread_id (2,1) block_id (1,2) coordinate (6,5) global index 46  ival 46 thread_id (3,1) block_id (1,2) coordinate (7,5) global index 47  ival 47 thread_id (0,0) block_id (0,1) coordinate (0,2) global index 16  ival 16 thread_id (1,0) block_id (0,1) coordinate (1,2) global index 17  ival 17 thread_id (2,0) block_id (0,1) coordinate (2,2) global index 18  ival 18 thread_id (3,0) block_id (0,1) coordinate (3,2) global index 19  ival 19 thread_id (0,1) block_id (0,1) coordinate (0,3) global index 24  ival 24 thread_id (1,1) block_id (0,1) coordinate (1,3) global index 25  ival 25 thread_id (2,1) block_id (0,1) coordinate (2,3) global index 26  ival 26 thread_id (3,1) block_id (0,1) coordinate (3,3) global index 27  ival 27 thread_id (0,0) block_id (0,0) coordinate (0,0) global index  0  ival  0 thread_id (1,0) block_id (0,0) coordinate (1,0) global index  1  ival  1 thread_id (2,0) block_id (0,0) coordinate (2,0) global index  2  ival  2 thread_id (3,0) block_id (0,0) coordinate (3,0) global index  3  ival  3 thread_id (0,1) block_id (0,0) coordinate (0,1) global index  8  ival  8 thread_id (1,1) block_id (0,0) coordinate (1,1) global index  9  ival  9 thread_id (2,1) block_id (0,0) coordinate (2,1) global index 10  ival 10 thread_id (3,1) block_id (0,0) coordinate (3,1) global index 11  ival 11 

Hi, the above code is an example from a CUDA book that tries to explain how a 2D array is mapped to CUDA grids and blocks; it prints the matrix coordinates and the offset in global memory for each thread.

I am a bit confused about how the threads are mapped, in particular by the statement "idx=ix+iy*nx". I tried to interchange the indices and the values of nx, ny, and to change the statement to "idx=iy+ix*ny", but that did not seem to work.

Also, the matrix elements are mapped to threads as:

block(0,0) -0,1,2,3,8,9,10,11 block(1,0)-4,5,6,7,12,13,14,15 .....

If I want a mapping like:

block(0,0) -0,1,2,3,4,5,6,7 block(0,1)-8,9,10,11,12,13,14,15 ....

how should I modify the parameters and launch the kernel?

P.S. I am using an i7 processor with a GTX 860M and VS 2012 on Windows 8.1.

thanks.

nx=8, ny=6. 

idx=ix+iy*nx: take the thread with coordinates (5,2) as an example. Here ix=5 and iy=2, so idx=5+2*8=21, and because the matrix was filled with a[i]=i, ival is also 21.

In order to launch the kernel with different parameters, you should change the dim3 block(a,b) and dim3 grid(c,d) instructions.

For example, to achieve the mapping in your example, you should use:

dim3 block(8,1); dim3 grid(1,6); 

Comments

Popular posts from this blog

shopping cart - Page redirect not working PHP -

php - How to modify a menu to show sub-menus -

python - Installing PyDev in eclipse is failed -