CUDA 2d Array Mapping -
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>

/* Abort with a readable message when a CUDA runtime call fails. */
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

/*
 * Fill ip[0..size-1] with 0, 1, 2, ... so that every element's value equals
 * its own linear index.
 */
void initialint(int *ip, int size)
{
    for (int i = 0; i < size; i++)
        ip[i] = i;
}

/*
 * Print a row-major matrix stored in c: ny rows of nx values each.
 * Element (column j, row i) lives at c[j + nx * i].
 */
void printmatrix(int *c, const int nx, const int ny)
{
    int *ic = c;
    printf("\n matrix: (%d.%d) \n", nx, ny);
    for (int i = 0; i < ny; i++) {
        for (int j = 0; j < nx; j++) {
            printf("%3d", ic[j + nx * i]);
        }
        printf("\n");
    }
    printf("\n");
}

/*
 * Each thread prints its thread/block indices, its 2D matrix coordinate
 * (ix, iy), the row-major offset idx = ix + iy * nx, and the value a[idx].
 *
 * Launch layout: 2D grid of 2D blocks; x indexes columns, y indexes rows.
 * a must point to at least nx * ny ints of device memory.
 */
__global__ void printthreadindex(int *a, const int nx, const int ny)
{
    int ix = threadIdx.x + blockIdx.x * blockDim.x;   /* column */
    int iy = threadIdx.y + blockIdx.y * blockDim.y;   /* row    */

    /* Skip threads that fall outside the matrix when the grid over-covers it. */
    if (ix >= nx || iy >= ny)
        return;

    int idx = ix + iy * nx;   /* row-major offset into a */
    printf("thread_id (%d,%d) block_id (%d,%d) coordinate (%d,%d) global index %2d ival %2d \n",
           threadIdx.x, threadIdx.y, blockIdx.x, blockIdx.y, ix, iy, idx, a[idx]);
}

int main()
{
    int nx = 8, ny = 6;
    int nxy = nx * ny;
    int nbytes = nxy * sizeof(int);   /* the buffer holds ints, not floats */

    /* Host matrix, initialised so that h_a[i] == i. */
    int *h_a = (int *)malloc(nbytes);
    if (h_a == NULL) {
        fprintf(stderr, "host allocation of %d bytes failed\n", nbytes);
        return EXIT_FAILURE;
    }
    initialint(h_a, nxy);
    printmatrix(h_a, nx, ny);

    /* Device copy of the matrix. */
    int *d_mata = NULL;
    CUDA_CHECK(cudaMalloc((void **)&d_mata, nbytes));
    CUDA_CHECK(cudaMemcpy(d_mata, h_a, nbytes, cudaMemcpyHostToDevice));

    /* 2 x 3 blocks of 4 x 2 threads cover the 8 x 6 matrix exactly. */
    dim3 block(4, 2);
    dim3 grid(2, 3);
    printthreadindex<<<grid, block>>>(d_mata, nx, ny);
    CUDA_CHECK(cudaGetLastError());        /* launch-configuration errors */
    CUDA_CHECK(cudaDeviceSynchronize());   /* wait for the kernel; flushes device printf */

    CUDA_CHECK(cudaFree(d_mata));
    free(h_a);

    system("pause");   /* Windows-only: keeps the console window open */
    return 0;
}
// output:
matrix: (8.6) 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 thread_id (0,0) block_id (1,0) coordinate (4,0) global index 4 ival 4 thread_id (1,0) block_id (1,0) coordinate (5,0) global index 5 ival 5 thread_id (2,0) block_id (1,0) coordinate (6,0) global index 6 ival 6 thread_id (3,0) block_id (1,0) coordinate (7,0) global index 7 ival 7 thread_id (0,1) block_id (1,0) coordinate (4,1) global index 12 ival 12 thread_id (1,1) block_id (1,0) coordinate (5,1) global index 13 ival 13 thread_id (2,1) block_id (1,0) coordinate (6,1) global index 14 ival 14 thread_id (3,1) block_id (1,0) coordinate (7,1) global index 15 ival 15 thread_id (0,0) block_id (1,1) coordinate (4,2) global index 20 ival 20 thread_id (1,0) block_id (1,1) coordinate (5,2) global index 21 ival 21 thread_id (2,0) block_id (1,1) coordinate (6,2) global index 22 ival 22 thread_id (3,0) block_id (1,1) coordinate (7,2) global index 23 ival 23 thread_id (0,1) block_id (1,1) coordinate (4,3) global index 28 ival 28 thread_id (1,1) block_id (1,1) coordinate (5,3) global index 29 ival 29 thread_id (2,1) block_id (1,1) coordinate (6,3) global index 30 ival 30 thread_id (3,1) block_id (1,1) coordinate (7,3) global index 31 ival 31 thread_id (0,0) block_id (0,2) coordinate (0,4) global index 32 ival 32 thread_id (1,0) block_id (0,2) coordinate (1,4) global index 33 ival 33 thread_id (2,0) block_id (0,2) coordinate (2,4) global index 34 ival 34 thread_id (3,0) block_id (0,2) coordinate (3,4) global index 35 ival 35 thread_id (0,1) block_id (0,2) coordinate (0,5) global index 40 ival 40 thread_id (1,1) block_id (0,2) coordinate (1,5) global index 41 ival 41 thread_id (2,1) block_id (0,2) coordinate (2,5) global index 42 ival 42 thread_id (3,1) block_id (0,2) coordinate (3,5) global index 43 ival 43 thread_id (0,0) block_id (1,2) coordinate (4,4) global index 36 ival 36 thread_id (1,0) block_id (1,2) coordinate (5,4) global 
index 37 ival 37 thread_id (2,0) block_id (1,2) coordinate (6,4) global index 38 ival 38 thread_id (3,0) block_id (1,2) coordinate (7,4) global index 39 ival 39 thread_id (0,1) block_id (1,2) coordinate (4,5) global index 44 ival 44 thread_id (1,1) block_id (1,2) coordinate (5,5) global index 45 ival 45 thread_id (2,1) block_id (1,2) coordinate (6,5) global index 46 ival 46 thread_id (3,1) block_id (1,2) coordinate (7,5) global index 47 ival 47 thread_id (0,0) block_id (0,1) coordinate (0,2) global index 16 ival 16 thread_id (1,0) block_id (0,1) coordinate (1,2) global index 17 ival 17 thread_id (2,0) block_id (0,1) coordinate (2,2) global index 18 ival 18 thread_id (3,0) block_id (0,1) coordinate (3,2) global index 19 ival 19 thread_id (0,1) block_id (0,1) coordinate (0,3) global index 24 ival 24 thread_id (1,1) block_id (0,1) coordinate (1,3) global index 25 ival 25 thread_id (2,1) block_id (0,1) coordinate (2,3) global index 26 ival 26 thread_id (3,1) block_id (0,1) coordinate (3,3) global index 27 ival 27 thread_id (0,0) block_id (0,0) coordinate (0,0) global index 0 ival 0 thread_id (1,0) block_id (0,0) coordinate (1,0) global index 1 ival 1 thread_id (2,0) block_id (0,0) coordinate (2,0) global index 2 ival 2 thread_id (3,0) block_id (0,0) coordinate (3,0) global index 3 ival 3 thread_id (0,1) block_id (0,0) coordinate (0,1) global index 8 ival 8 thread_id (1,1) block_id (0,0) coordinate (1,1) global index 9 ival 9 thread_id (2,1) block_id (0,0) coordinate (2,1) global index 10 ival 10 thread_id (3,1) block_id (0,0) coordinate (3,1) global index 11 ival 11
Hi, the above code is an example from a CUDA book. It tries to explain how a 2D array is mapped onto CUDA grids and blocks, and it prints the matrix coordinates and the offset in global memory for each thread.
I am a bit confused about how the threads are mapped, in particular by the statement `idx = ix + iy*nx`. I tried interchanging the values of nx and ny and changing the statement to `idx = iy + ix*ny`, but that did not seem to work.
Also, the matrix elements are currently mapped to threads as:
block(0,0) -0,1,2,3,8,9,10,11 block(1,0)-4,5,6,7,12,13,14,15 .....
If I want a mapping like:
block(0,0) -0,1,2,3,4,5,6,7 block(0,1)-8,9,10,11,12,13,14,15 ....
how should I modify the parameters and launch the kernel?
P.S. I am using an i7 processor with a GTX 860M and VS 2012 on Windows 8.1.
thanks.
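Here nx=8 and ny=6, and idx = ix + iy*nx is the row-major offset of element (ix, iy). Take as an example the thread with coordinates (5,2): ix=5, iy=2, so idx = 5 + 2*8 = 21. ival is also 21, because the array was initialized with a[i] = i.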
In order to launch the kernel with different parameters, you should change the `dim3 block(a,b)` and `dim3 grid(c,d)` instructions.
For example, to achieve the mapping in your example, you should use:
dim3 block(8,1); dim3 grid(1,6);
Comments
Post a Comment