__global__

It tells the CUDA C++ compiler that this function runs on the GPU and can be called from CPU code.

/**
* CUDA kernel function to add the elements of 2 arrays on the GPU
*/
__global__
void add(int n, float *x, float *y)
{
    for(int i=0;i<n;i++)
    {
        y[i] = x[i] + y[i];
    }
}

// Launch the add() kernel on the GPU
add<<<1, 1>>>(N, x, y);

Memory Allocation in C

malloc()

calloc()

Return Value:

On successful allocation, both malloc() and calloc() return a pointer to the allocated block of memory; otherwise they return NULL, which indicates that the allocation failed.

    // Deallocates memory previously allocated 
    free(arr); 
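
A minimal sketch of both allocators (the array length n = 10 is an arbitrary example value): malloc() leaves the block uninitialized, while calloc() zero-initializes it.

    #include <stdlib.h>

    int n = 10;                                      // example length
    float *a = (float*)malloc(n * sizeof(float));    // uninitialized block
    float *b = (float*)calloc(n, sizeof(float));     // zero-initialized block

    if (a == NULL || b == NULL) {
        // handle allocation failure
    }

    free(a);
    free(b);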

Memory Allocation in C++

C++ supports the malloc() and calloc() functions, and it also provides two operators, new and delete, that allocate and free memory in a cleaner, type-safe way.

    int N = 1<<20; // 1M elements
    float *x = new float[N];
    int *p = new int(25); // Initialize with 25

    int *q = NULL; 
    q = new int;   

    // Free memory
    delete [] x;
    delete p;
    delete q;

Memory Allocation in CUDA

Unified Memory
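
Unified Memory gives a single address space that both the CPU and the GPU can access. In CUDA it is allocated with cudaMallocManaged() and released with cudaFree(). A minimal sketch, reusing the add() kernel from above (the cudaDeviceSynchronize() call is explained in the next subsection):

    int N = 1<<20;
    float *x, *y;

    // Allocate Unified Memory -- accessible from CPU or GPU
    cudaMallocManaged(&x, N * sizeof(float));
    cudaMallocManaged(&y, N * sizeof(float));

    // ... initialize x and y on the host, then launch the kernel ...
    add<<<1, 256>>>(N, x, y);

    // Wait for the GPU to finish before accessing the data on the host
    cudaDeviceSynchronize();

    // Free Unified Memory
    cudaFree(x);
    cudaFree(y);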

cudaDeviceSynchronize()

Although CUDA kernel launches are asynchronous, all GPU-related tasks placed in one stream (which is the default behavior) are executed sequentially.

    // kernel1 starts executing; the CPU continues to the next statement
    kernel1<<<X,Y>>>(...); 
    // kernel2 is queued and starts only after kernel1 finishes; the CPU continues
    kernel2<<<X,Y>>>(...);
    // the CPU blocks until the copy completes; the copy itself starts only after kernel2 finishes
    cudaMemcpy(...); 

A case where using cudaDeviceSynchronize() explicitly is appropriate is when you have several CUDA streams running and you would like them to exchange some information.

// Wait for GPU to finish before accessing on host
cudaDeviceSynchronize();
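
As a sketch of that multi-stream case, written in the same placeholder style as the snippet above (X, Y, kernel1, and kernel2 stand in for real launch parameters and kernels): the fourth launch parameter selects the stream, and cudaDeviceSynchronize() waits for work in every stream before the host uses the combined results.

    cudaStream_t s1, s2;
    cudaStreamCreate(&s1);
    cudaStreamCreate(&s2);

    // Kernels issued into different streams may overlap on the GPU
    kernel1<<<X, Y, 0, s1>>>(...);
    kernel2<<<X, Y, 0, s2>>>(...);

    // Block the host until work in all streams has finished,
    // so the results of both kernels can be combined safely
    cudaDeviceSynchronize();

    cudaStreamDestroy(s1);
    cudaStreamDestroy(s2);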

Important Info

Compilation

nvcc add.cu -o add_cuda

Profiling

nvprof ./add_cuda

Threads

Execution configuration: it tells the CUDA runtime how many thread blocks and how many parallel threads per block to use for the kernel launch.

<<<no_of_thread_blocks,threads_in_thread_block>>>

Important

CUDA GPUs run kernels using blocks of threads whose size is a multiple of 32, so 256 threads per block is a reasonable choice.

Examples

    // For 256 parallel threads
    add<<<1, 256>>>(N, x, y);

    __global__
    void add(int n, float *x, float *y)
    {
        int index = threadIdx.x;
        int stride = blockDim.x; // 256
        for (int i = index; i < n; i += stride)
            y[i] = x[i] + y[i];
    }

CUDA GPUs have many parallel processors grouped into Streaming Multiprocessors, or SMs. Each SM can run multiple concurrent thread blocks.

    int blockSize = 256;
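    // Round up so there are enough blocks to cover all N elements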
    int numBlocks = (N + blockSize - 1) / blockSize;
    add<<<numBlocks, blockSize>>>(N, x, y);

Each thread computes its global index from its block and thread indices:

index = blockIdx.x * blockDim.x + threadIdx.x;

For example, with blockDim.x = 256, the thread with threadIdx.x = 3 in the block with blockIdx.x = 2 gets:

index = 2 * 256 + 3 = 515

    __global__
    void add(int n, float *x, float *y)
    {
        int index = blockIdx.x * blockDim.x + threadIdx.x;
        int stride = blockDim.x * gridDim.x; // total number of threads in the grid
        for (int i = index; i < n; i += stride)
            y[i] = x[i] + y[i];
    }

Scalability and Thread Use

Number of blocks: choose a multiple of the number of Streaming Multiprocessors (SMs) on the device to balance utilization across the SMs.

int nDevices;
cudaGetDeviceCount(&nDevices);

int numSMs, deviceId = 0; // deviceId = 0 queries the first device
cudaDeviceGetAttribute(&numSMs, cudaDevAttrMultiProcessorCount, deviceId);
add<<<32 * numSMs, 256>>>(1 << 20, x, y);

Serial Implementation on a Single GPU Thread (Generally for Debugging)

add<<<1,1>>>(1<<20,x,y);

Querying device properties

int nDevices, numSMs;
cudaGetDeviceCount(&nDevices);
for (int i = 0; i < nDevices; i++) {

    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, i);
    cudaDeviceGetAttribute(&numSMs, cudaDevAttrMultiProcessorCount, i);
    std::cout << "\n" << prop.name
              << "\n Memory Clock Rate (KHz): " << prop.memoryClockRate
              << "\n Memory Bus Width (bits): " << prop.memoryBusWidth
              << "\n Num of SMs: " << numSMs << "\n\n";

}

Calculating the total number of SMs in a system.

int numSMs, nDevices, totalSMs = 0;
cudaGetDeviceCount(&nDevices);
for (int i = 0; i < nDevices; i++)
{
    cudaDeviceGetAttribute(&numSMs, cudaDevAttrMultiProcessorCount, i);
    totalSMs += numSMs;
}
std::cout << "\n Total SMs: " << totalSMs << "\n";

Hardware Implementation

The NVIDIA GPU architecture is built around a scalable array of multithreaded Streaming Multiprocessors (SMs). When a CUDA program on the host CPU invokes a kernel grid, the blocks of the grid are enumerated and distributed to multiprocessors with available execution capacity. The threads of a thread block execute concurrently on one multiprocessor, and multiple thread blocks can execute concurrently on one multiprocessor. As thread blocks terminate, new blocks are launched on the vacated multiprocessors.

Warp Size

Warp size is the number of threads in a warp. A block of threads is divided into sub-groups called "warps", which the SM schedules and executes together.
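
As a small sketch, the warp size of a device can be read from cudaDeviceProp (device 0 is an arbitrary choice here):

    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    std::cout << "Warp size: " << prop.warpSize << "\n"; // 32 on current NVIDIA GPUs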

Device Info

Device        Compute Capability  Micro-architecture  Warp Size  SMs  Max Threads per Block
Tesla K40     3.5                 Kepler              32         15   1024
GeForce 940M  5.0                 Maxwell             32         -    1024