__global__
It tells the CUDA C++ compiler that this function runs on the GPU and can be called from CPU code.
/**
* CUDA kernel function to add the elements of 2 arrays on the GPU
*/
__global__
void add(int n, float *x, float *y)
{
    for (int i = 0; i < n; i++)
    {
        y[i] = x[i] + y[i];
    }
}
// Launch the add() kernel, which invokes it on the GPU
add<<<1, 1>>>(N, x, y);
malloc() allocates a memory block of the given size (in bytes) and returns a pointer to the beginning of the block.
malloc() doesn’t initialize the allocated memory.
void* malloc(size_t size);
arr = (int*)malloc(5 * sizeof(int)); // 5 * 4 bytes = 20 bytes (assuming a 4-byte int)
calloc() allocates the memory and also initializes the allocated memory block to zero.
void* calloc(size_t num, size_t size);
arr = (int*)calloc(5, sizeof(int));
Return Value:
On successful allocation, malloc() and calloc() return a pointer to the block of memory; otherwise they return NULL, which indicates that the allocation failed.
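For example, the return value can be checked before the memory is used (a minimal sketch, assuming it runs inside main() with <stdio.h> and <stdlib.h> included; the error handling shown is illustrative):
int *arr = (int*)malloc(5 * sizeof(int));
if (arr == NULL) {
    fprintf(stderr, "allocation failed\n"); // handle the failure before arr is used
    exit(EXIT_FAILURE);
}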
// Deallocates memory previously allocated
free(arr);
C++ supports the calloc() and malloc() functions and also provides the two operators new and delete, which allocate and free memory in a simpler, type-aware way.
int N = 1<<20; // 1M elements
float *x = new float[N];
int *p = new int(25); // Initialize with 25
int *q = NULL;
q = new int;
// Free memory
delete [] x;
delete p;
delete q;
Unified Memory
A single memory space accessible by all GPUs and CPUs in the system.
// Allocate Unified Memory -- accessible from CPU or GPU
float *x, *y;
cudaMallocManaged(&x, N*sizeof(float));
cudaMallocManaged(&y, N*sizeof(float));
// Free memory
cudaFree(x);
cudaFree(y);
A race condition can occur when multiple parallel threads read and write the same memory locations, so care has to be taken or the output may be incorrect.
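For instance, when many threads accumulate into the same location, one common fix is to make the conflicting update atomic (a minimal sketch; the kernel name sumAll and the single-block launch are illustrative, not from the original):
__global__
void sumAll(int n, float *x, float *sum)
{
    // Each thread processes a strided subset of the elements
    for (int i = threadIdx.x; i < n; i += blockDim.x)
        atomicAdd(sum, x[i]); // atomic update: concurrent writes to *sum are serialized, not lost
}
// Usage sketch: allocate sum with cudaMallocManaged, set *sum = 0, then launch
// sumAll<<<1, 256>>>(N, x, sum);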
Although CUDA kernel launches are asynchronous, all GPU-related tasks placed in one stream (which is the default behavior) are executed sequentially.
// kernel1 starts executing; the CPU continues to the next statement
kernel1<<<X,Y>>>(...);
// kernel2 is placed in the queue and will start only after kernel1 finishes; the CPU continues to the next statement
kernel2<<<X,Y>>>(...);
// CPU blocks until the memory copy completes; the copy starts only after kernel2 finishes
cudaMemcpy(...);
Using cudaDeviceSynchronize() is appropriate when you have several CUDA streams running and you would like them to exchange some information.
// Wait for GPU to finish before accessing on host
cudaDeviceSynchronize();
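A small sketch of that multi-stream case (kernel names and launch parameters are illustrative): independent work is issued to two streams, and cudaDeviceSynchronize() makes the host wait for everything before the results are combined.
cudaStream_t s1, s2;
cudaStreamCreate(&s1);
cudaStreamCreate(&s2);
// Kernels issued to different streams may overlap on the GPU
kernelA<<<numBlocks, blockSize, 0, s1>>>(...);
kernelB<<<numBlocks, blockSize, 0, s2>>>(...);
// Block the host until work in all streams has finished
cudaDeviceSynchronize();
// Results produced in both streams can now be read safely on the host
cudaStreamDestroy(s1);
cudaStreamDestroy(s2);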
nvcc add.cu -o add_cuda
nvprof ./add_cuda
Execution configuration: it tells the CUDA runtime how many thread blocks to launch and how many threads to use per block.
<<<no_of_thread_blocks,threads_in_thread_block>>>
// For 256 parallel threads.
add<<<1, 256>>>(N, x, y);
__global__
void add(int n, float *x, float *y)
{
    int index = threadIdx.x;
    int stride = blockDim.x; // 256
    for (int i = index; i < n; i += stride)
        y[i] = x[i] + y[i];
}
CUDA GPUs have many parallel processors grouped into Streaming Multiprocessors, or SMs. Each SM can run multiple concurrent thread blocks.
int blockSize = 256;
int numBlocks = (N + blockSize - 1) / blockSize;
add<<<numBlocks, blockSize>>>(N, x, y);
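With N = 1<<20 = 1,048,576 elements and blockSize = 256, this gives numBlocks = (1,048,576 + 255) / 256 = 4096. Adding blockSize - 1 before the integer division rounds up, so every element is covered even when N is not an exact multiple of blockSize.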
index = blockIdx.x * blockDim.x + threadIdx.x;
e.g., for threadIdx.x = 3 in block blockIdx.x = 2 with blockDim.x = 256: index = 2 * 256 + 3 = 515
__global__
void add(int n, float *x, float *y)
{
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x; // total number of threads in the grid
    for (int i = index; i < n; i += stride)
        y[i] = x[i] + y[i];
}
Number of blocks: choose a multiple of the number of Streaming Multiprocessors (SMs) on the device to balance utilization.
int nDevices;
cudaGetDeviceCount(&nDevices);
int numSMs;
// deviceId is the index of the device the kernel will run on (e.g. 0)
cudaDeviceGetAttribute(&numSMs, cudaDevAttrMultiProcessorCount, deviceId);
add<<<32 * numSMs, 256>>>(1<<20, x, y);
Serial implementation, i.e. a single GPU thread (generally useful for debugging)
add<<<1,1>>>(1<<20,x,y);
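Putting the pieces above together, a complete program looks roughly like this (a minimal sketch assembled from the snippets shown; error checking is omitted for brevity). It compiles and profiles with the nvcc/nvprof commands shown earlier.
#include <iostream>
#include <cmath>

__global__
void add(int n, float *x, float *y)
{
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;
    for (int i = index; i < n; i += stride)
        y[i] = x[i] + y[i];
}

int main(void)
{
    int N = 1<<20; // 1M elements

    // Allocate Unified Memory -- accessible from CPU or GPU
    float *x, *y;
    cudaMallocManaged(&x, N*sizeof(float));
    cudaMallocManaged(&y, N*sizeof(float));

    // Initialize x and y arrays on the host
    for (int i = 0; i < N; i++) {
        x[i] = 1.0f;
        y[i] = 2.0f;
    }

    // Launch the kernel with enough blocks to cover all N elements
    int blockSize = 256;
    int numBlocks = (N + blockSize - 1) / blockSize;
    add<<<numBlocks, blockSize>>>(N, x, y);

    // Wait for GPU to finish before accessing on host
    cudaDeviceSynchronize();

    // Check the result (every element should now be 3.0f)
    float maxError = 0.0f;
    for (int i = 0; i < N; i++)
        maxError = std::fmax(maxError, std::fabs(y[i] - 3.0f));
    std::cout << "Max error: " << maxError << std::endl;

    // Free memory
    cudaFree(x);
    cudaFree(y);

    return 0;
}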
int nDevices, i, numSMs;
cudaGetDeviceCount(&nDevices);
for (i = 0; i < nDevices; i++) {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, i);
    cudaDeviceGetAttribute(&numSMs, cudaDevAttrMultiProcessorCount, i);
    std::cout << "\n" << prop.name << "\n Memory Clock Rate (kHz): " << prop.memoryClockRate
              << "\n Memory Bus Width (bits): " << prop.memoryBusWidth << "\n Num of SMs: " << numSMs << "\n\n";
}
int i, numSMs, nDevices, totalSMs = 0;
cudaGetDeviceCount(&nDevices);
for (i = 0; i < nDevices; i++)
{
    cudaDeviceGetAttribute(&numSMs, cudaDevAttrMultiProcessorCount, i);
    totalSMs += numSMs;
}
std::cout << "\n Total SMs: " << totalSMs << "\n";
The NVIDIA GPU architecture is built around a scalable array of multithreaded Streaming Multiprocessors (SMs). When a CUDA program on the host CPU invokes a kernel grid, the blocks of the grid are enumerated and distributed to multiprocessors with available execution capacity. The threads of a thread block execute concurrently on one multiprocessor, and multiple thread blocks can execute concurrently on one multiprocessor. As thread blocks terminate, new blocks are launched on the vacated multiprocessors.
A block of threads is actually divided into sub-groups called "warps"; the warp size is the number of threads in a warp.
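Inside a kernel the built-in warpSize variable exposes this value, so a thread can work out which warp it belongs to and its lane within that warp (a small illustrative sketch; the kernel name whoAmI is hypothetical):
__global__
void whoAmI()
{
    int warpId = threadIdx.x / warpSize; // which warp within the block
    int laneId = threadIdx.x % warpSize; // position within the warp (0 .. warpSize-1)
    printf("block %d, warp %d, lane %d\n", blockIdx.x, warpId, laneId);
}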
Device | Compute Capability | Micro-architecture | Warp Size | SMs | Maximum number of threads per block |
---|---|---|---|---|---|
Tesla K40 | 3.5 | Kepler | 32 | 15 | 1024 |
GeForce 940M | 5.0 | Maxwell | 32 | | 1024 |
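The values in this table can also be queried at runtime from cudaDeviceProp, in the same way as the device loop above (a small sketch for device 0):
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0); // device 0
std::cout << prop.name
          << "\n Compute Capability: " << prop.major << "." << prop.minor
          << "\n Warp Size: " << prop.warpSize
          << "\n SMs: " << prop.multiProcessorCount
          << "\n Max threads per block: " << prop.maxThreadsPerBlock << "\n";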