One of the things that can be very confusing when first learning CUDA is the way the work is parcelled up into grids, blocks and threads. Even though I am soon to be giving a talk about CUDA, I suddenly discovered today that my understanding of it was a bit wrong.
To see what was happening under the hood, I wrote a simple CUDA utility program to help me understand how grids, blocks and threads map onto a simple one-dimensional array.
The program uses an array of GridInfo structs to record the block and thread indexes that are used when the array is passed to a CUDA kernel. With the array length set to 20, the number of blocks set to 10 and the number of threads per block set to 3, the program outputs the following:
0: blockIdx: 0 threadIdx: 0 blockDimx: 3 1: blockIdx: 0 threadIdx: 1 blockDimx: 3 2: blockIdx: 0 threadIdx: 2 blockDimx: 3 3: blockIdx: 1 threadIdx: 0 blockDimx: 3 4: blockIdx: 1 threadIdx: 1 blockDimx: 3 5: blockIdx: 1 threadIdx: 2 blockDimx: 3 6: blockIdx: 2 threadIdx: 0 blockDimx: 3 7: blockIdx: 2 threadIdx: 1 blockDimx: 3 8: blockIdx: 2 threadIdx: 2 blockDimx: 3 9: blockIdx: 3 threadIdx: 0 blockDimx: 3 10: blockIdx: 3 threadIdx: 1 blockDimx: 3 11: blockIdx: 3 threadIdx: 2 blockDimx: 3 12: blockIdx: 4 threadIdx: 0 blockDimx: 3 13: blockIdx: 4 threadIdx: 1 blockDimx: 3 14: blockIdx: 4 threadIdx: 2 blockDimx: 3 15: blockIdx: 5 threadIdx: 0 blockDimx: 3 16: blockIdx: 5 threadIdx: 1 blockDimx: 3 17: blockIdx: 5 threadIdx: 2 blockDimx: 3 18: blockIdx: 6 threadIdx: 0 blockDimx: 3 19: blockIdx: 6 threadIdx: 1 blockDimx: 3
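Here it can be seen that if we did not ignore the last thread in the last block that is actually used (blockIdx: 6 threadIdx: 2) then we would overrun the array (tid would be 20). The same `tid<n` check in the kernel also skips every thread in blocks 7 to 9, whose tid values would be 21 to 29.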
I hope somebody out there might find this useful. I certainly have.
The source code is shown below:
#include <stdio.h>
#include <stdlib.h>
#include "GridInfo.h"

#define ARRAY_SIZE 20
#define N_BLOCKS 10
#define N_THREADS 3

/*
 * Evaluates a CUDA runtime call and, if it did not return cudaSuccess,
 * prints the file, line and error string to stderr and exits.
 */
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

/*
 * Records, for each array element, the launch coordinates of the thread
 * that handled it.
 *
 * Uses 1D indexing only (blockIdx.x, blockDim.x, threadIdx.x).
 *
 * gridInfo  device array with room for at least n GridInfo records
 * n         number of records in gridInfo
 *
 * Threads whose flat index is >= n do nothing, so the launch may contain
 * more threads than there are elements.
 */
__global__ void generateGridInfo( GridInfo *gridInfo, int n )
{
    int tid = blockIdx.x*blockDim.x + threadIdx.x;

    /* Guard against the surplus threads at the tail of the grid. */
    if ( tid<n )
    {
        GridInfo *gi = &gridInfo[tid];
        gi->blockDimX = blockDim.x;
        gi->blockIdX = blockIdx.x;
        gi->threadIdX = threadIdx.x;
        gi->tid = tid;
    }
}

/*
 * Sets every field of the first n records of gridInfo to -1, so any entry
 * the kernel never writes keeps -1 in all of its fields.
 *
 * gridInfo  host array with room for at least n GridInfo records
 * n         number of records to initialise
 */
void initialiseGrid( GridInfo *gridInfo, int n)
{
    for ( int i=0; i<n; i++ )
    {
        GridInfo gi;
        gi.blockDimX = -1;
        gi.blockIdX = -1;
        gi.threadIdX = -1;
        gi.tid = -1;
        gridInfo[i] = gi;
    }
}

/*
 * Fills a host array with -1 markers, copies it to the device, launches
 * generateGridInfo with N_BLOCKS blocks of N_THREADS threads, copies the
 * result back and prints one line per array element.
 *
 * Returns 0 on success; exits with EXIT_FAILURE if the host allocation or
 * any CUDA runtime call fails.
 */
int main( void )
{
    GridInfo *gridHost;
    GridInfo *gridDev;
    size_t gridSize = ARRAY_SIZE*sizeof(GridInfo);

    gridHost = (GridInfo *) malloc( gridSize );
    if ( gridHost == NULL )
    {
        fprintf(stderr, "Failed to allocate %lu bytes of host memory\n",
                (unsigned long) gridSize);
        return EXIT_FAILURE;
    }

    CUDA_CHECK( cudaMalloc( (void **) &gridDev, gridSize ) );

    initialiseGrid( gridHost, ARRAY_SIZE );
    CUDA_CHECK( cudaMemcpy( gridDev, gridHost, gridSize, cudaMemcpyHostToDevice) );

    /* N_BLOCKS*N_THREADS is larger than ARRAY_SIZE; the kernel's
       bounds check discards the surplus threads. */
    generateGridInfo<<<N_BLOCKS,N_THREADS>>>( gridDev, ARRAY_SIZE );
    /* A kernel launch returns no status itself; fetch launch errors here. */
    CUDA_CHECK( cudaGetLastError() );

    CUDA_CHECK( cudaMemcpy( gridHost, gridDev, gridSize, cudaMemcpyDeviceToHost) );

    for ( int i=0;i<ARRAY_SIZE;i++ )
    {
        GridInfo gi = gridHost[i];
        printf("%i: blockIdx: %i threadIdx: %i blockDimx: %i\n",gi.tid,gi.blockIdX,gi.threadIdX,gi.blockDimX);
    }

    free( gridHost );
    CUDA_CHECK( cudaFree( gridDev ) );
    return 0;
}
/*
 * One record per array element, holding the launch coordinates written
 * by the generateGridInfo kernel for that element.
 */
struct GridInfo
{
    int threadIdX;   /* threadIdx.x of the thread that wrote the record */
    int blockIdX;    /* blockIdx.x of that thread's block */
    int blockDimX;   /* blockDim.x at launch (threads per block) */
    int tid;         /* flat index: blockIdx.x*blockDim.x + threadIdx.x */
};


