CUDA Programming

A comprehensive reference for CUDA C/C++ GPU programming, covering kernels, memory management, synchronization, and optimization techniques.

Quick Start

Installation & Setup

# Check CUDA installation
nvcc --version
nvidia-smi

# Compile CUDA program
nvcc program.cu -o program

# Compile with debugging info
nvcc -g -G program.cu -o program

# Compile for specific architecture
nvcc -arch=sm_80 program.cu -o program

Basic Program Structure

#include <cuda_runtime.h>
#include <stdio.h>

// Kernel function (runs on GPU)
__global__ void myKernel(float* data, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        data[idx] *= 2.0f;
    }
}

int main() {
    // Host code
    int n = 1024;
    size_t size = n * sizeof(float);

    // Allocate host memory
    float* h_data = (float*)malloc(size);

    // Allocate device memory
    float* d_data;
    cudaMalloc(&d_data, size);

    // Copy data to device
    cudaMemcpy(d_data, h_data, size, cudaMemcpyHostToDevice);

    // Launch kernel
    int blockSize = 256;
    int gridSize = (n + blockSize - 1) / blockSize;
    myKernel<<<gridSize, blockSize>>>(d_data, n);

    // Copy result back
    cudaMemcpy(h_data, d_data, size, cudaMemcpyDeviceToHost);

    // Cleanup
    cudaFree(d_data);
    free(h_data);

    return 0;
}

Core Concepts

Execution Model

// Grid -> Blocks -> Threads hierarchy
__global__ void kernel() {
    // Thread indices
    int tid = threadIdx.x;                    // Thread within block
    int bid = blockIdx.x;                     // Block within grid
    int gid = bid * blockDim.x + tid;         // Global thread index

    // Multi-dimensional indexing
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int idx = row * width + col;
}

// Launch configuration
dim3 gridSize(16, 16);      // 16x16 blocks
dim3 blockSize(32, 32);     // 32x32 threads per block
kernel<<<gridSize, blockSize>>>();

Function Qualifiers

__global__ void kernelFunction() {    // Runs on GPU, called from host
    // Kernel code
}

__device__ void deviceFunction() {    // Runs on GPU, called from GPU
    // Device function code
}

__host__ void hostFunction() {        // Runs on CPU (default)
    // Host function code
}

__host__ __device__ void bothFunction() {  // Can run on both
    // Code that works on both host and device
}

Variable Qualifiers

__global__ void kernel() {
    __shared__ float sharedArray[256];    // Shared memory
    __constant__ float constValue;        // Constant memory
    __device__ float deviceGlobal;        // Global device memory

    int localVar;                         // Register/local memory
}

Memory Management

Basic Memory Operations

// Device memory allocation
float* d_array;
size_t size = n * sizeof(float);
cudaError_t err = cudaMalloc(&d_array, size);

// Memory copy operations
cudaMemcpy(d_dst, h_src, size, cudaMemcpyHostToDevice);
cudaMemcpy(h_dst, d_src, size, cudaMemcpyDeviceToHost);
cudaMemcpy(d_dst, d_src, size, cudaMemcpyDeviceToDevice);

// Memory initialization
cudaMemset(d_array, 0, size);

// Free memory
cudaFree(d_array);

Asynchronous Memory Operations

// Create streams
cudaStream_t stream1, stream2;
cudaStreamCreate(&stream1);
cudaStreamCreate(&stream2);

// Asynchronous operations
cudaMemcpyAsync(d_data, h_data, size, cudaMemcpyHostToDevice, stream1);
kernel<<<grid, block, 0, stream1>>>(d_data);
cudaMemcpyAsync(h_result, d_result, size, cudaMemcpyDeviceToHost, stream1);

// Synchronization
cudaStreamSynchronize(stream1);
cudaDeviceSynchronize();  // Wait for all operations

// Cleanup
cudaStreamDestroy(stream1);
cudaStreamDestroy(stream2);

Pinned Memory

// Allocate pinned memory (faster transfers)
float* h_pinned;
cudaHostAlloc(&h_pinned, size, cudaHostAllocDefault);

// Or use cudaMallocHost
cudaMallocHost(&h_pinned, size);

// Free pinned memory
cudaFreeHost(h_pinned);

Unified Memory (Managed Memory)

// Allocate unified memory
float* unified_data;
cudaMallocManaged(&unified_data, size);

// Use directly in kernel
kernel<<<grid, block>>>(unified_data);

// Prefetch to device (optional optimization)
cudaMemPrefetchAsync(unified_data, size, 0);  // Device 0

// Free unified memory
cudaFree(unified_data);

Memory Types & Hierarchy

Global Memory

// Allocated with cudaMalloc
// Accessible by all threads
// Highest latency, largest capacity
__global__ void kernel(float* global_mem) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    global_mem[idx] = idx;  // Global memory access
}

Shared Memory

__global__ void sharedMemoryExample(float* input, float* output) {
    __shared__ float tile[TILE_SIZE][TILE_SIZE];  // Static allocation

    // Dynamic shared memory (specified at kernel launch)
    extern __shared__ float dynamic_tile[];

    int tid = threadIdx.x;

    // Load data to shared memory
    tile[threadIdx.y][threadIdx.x] = input[...];

    // Synchronize threads in block
    __syncthreads();

    // Use shared memory data
    float result = tile[threadIdx.y][threadIdx.x] * 2.0f;
    output[...] = result;
}

// Launch with dynamic shared memory
int sharedMemSize = TILE_SIZE * TILE_SIZE * sizeof(float);
kernel<<<grid, block, sharedMemSize>>>(input, output);

Constant Memory

// Declare constant memory (at file scope)
__constant__ float const_array[1024];

// Copy to constant memory
float host_array[1024];
cudaMemcpyToSymbol(const_array, host_array, sizeof(host_array));

__global__ void useConstant() {
    float value = const_array[threadIdx.x];  // Fast broadcast read
}

Texture Memory

// Texture object (modern approach)
texture<float, 2, cudaReadModeElementType> tex;

__global__ void textureKernel() {
    float x = blockIdx.x * blockDim.x + threadIdx.x;
    float y = blockIdx.y * blockDim.y + threadIdx.y;

    // Read from texture (with interpolation)
    float value = tex2D(tex, x + 0.5f, y + 0.5f);
}

// Setup texture (host code)
cudaArray* cuArray;
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
cudaMallocArray(&cuArray, &channelDesc, width, height);
cudaBindTextureToArray(tex, cuArray, channelDesc);

Kernel Launch & Configuration

Launch Configuration

// 1D configuration
int blockSize = 256;
int gridSize = (n + blockSize - 1) / blockSize;
kernel<<<gridSize, blockSize>>>(data);

// 2D configuration
dim3 blockSize(16, 16);     // 256 threads per block
dim3 gridSize((width + 15) / 16, (height + 15) / 16);
kernel2D<<<gridSize, blockSize>>>(data);

// With shared memory
int sharedMemSize = blockSize.x * blockSize.y * sizeof(float);
kernel<<<gridSize, blockSize, sharedMemSize>>>(data);

// With streams
kernel<<<gridSize, blockSize, sharedMemSize, stream>>>(data);

Dynamic Parallelism

__global__ void parentKernel(float* data, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    if (idx < n && needsProcessing(data[idx])) {
        // Launch child kernel from device
        childKernel<<<1, 32>>>(data, idx);

        // Synchronize child kernel
        cudaDeviceSynchronize();
    }
}

__global__ void childKernel(float* data, int offset) {
    int idx = threadIdx.x;
    // Process data[offset + idx]
}

Occupancy Calculation

int blockSize;      // The launch configurator will suggest a block size
int minGridSize;    // The minimum grid size needed to achieve max occupancy
int gridSize;       // The actual grid size

// Calculate optimal block size
cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, myKernel, 0, 0);

// Round up according to array size
gridSize = (n + blockSize - 1) / blockSize;

myKernel<<<gridSize, blockSize>>>(data);

Thread Synchronization

Block-Level Synchronization

__global__ void syncExample() {
    __shared__ float shared_data[256];

    int tid = threadIdx.x;

    // Load data
    shared_data[tid] = input[blockIdx.x * blockDim.x + tid];

    // Wait for all threads in block to finish loading
    __syncthreads();

    // Now all threads can safely access shared_data
    float result = shared_data[255 - tid];  // Reverse access

    // Another sync before writing results
    __syncthreads();

    output[blockIdx.x * blockDim.x + tid] = result;
}

Warp-Level Synchronization

#include <cooperative_groups.h>

__global__ void warpExample() {
    auto warp = cooperative_groups::tiled_partition<32>(
        cooperative_groups::this_thread_block());

    int lane_id = warp.thread_rank();
    int value = lane_id;

    // Warp-level reduction
    for (int delta = 16; delta > 0; delta /= 2) {
        value += warp.shfl_down_sync(0xffffffff, value, delta);
    }

    if (lane_id == 0) {
        // Thread 0 of each warp has the sum
        atomicAdd(result, value);
    }
}

Atomic Operations

// Basic atomic operations
__global__ void atomicsExample(int* counter, float* sum, int* histogram) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Atomic increment
    int old_val = atomicAdd(counter, 1);

    // Atomic floating point operations (CC >= 6.0)
    atomicAdd(sum, data[idx]);

    // Atomic compare and swap
    int expected = 0;
    int desired = idx;
    int old = atomicCAS(&flag, expected, desired);

    // Histogram example
    int bin = (int)(data[idx] * NUM_BINS);
    atomicAdd(&histogram[bin], 1);
}

// Custom atomic operations using CAS
__device__ float atomicMaxFloat(float* address, float val) {
    int* address_as_int = (int*)address;
    int old = *address_as_int, assumed;
    do {
        assumed = old;
        old = atomicCAS(address_as_int, assumed,
            __float_as_int(fmaxf(val, __int_as_float(assumed))));
    } while (assumed != old);
    return __int_as_float(old);
}

Cooperative Groups

#include <cooperative_groups.h>
using namespace cooperative_groups;

__global__ void cooperativeKernel() {
    // Thread block group
    thread_block block = this_thread_block();

    // Warp-sized tile
    auto tile32 = tiled_partition<32>(block);

    // Grid group (requires cooperative launch)
    grid_group grid = this_grid();

    // Synchronize at different levels
    block.sync();     // Block-level sync
    tile32.sync();    // Warp-level sync
    grid.sync();      // Grid-level sync (cooperative kernels only)
}

// Launch cooperative kernel
cudaLaunchCooperativeKernel((void*)cooperativeKernel, gridSize, blockSize, args);

Error Handling

Basic Error Checking

#define CUDA_CHECK(call) \
    do { \
        cudaError_t err = call; \
        if (err != cudaSuccess) { \
            fprintf(stderr, "CUDA error at %s:%d - %s\n", \
                    __FILE__, __LINE__, cudaGetErrorString(err)); \
            exit(EXIT_FAILURE); \
        } \
    } while(0)

// Usage
CUDA_CHECK(cudaMalloc(&d_data, size));
CUDA_CHECK(cudaMemcpy(d_data, h_data, size, cudaMemcpyHostToDevice));

// Check kernel launch errors
kernel<<<grid, block>>>(data);
CUDA_CHECK(cudaGetLastError());
CUDA_CHECK(cudaDeviceSynchronize());

Advanced Error Handling

// Error checking function
void checkCudaErrors(cudaError_t result) {
    if (result != cudaSuccess) {
        fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
        assert(result == cudaSuccess);
    }
}

// Async error checking
cudaError_t asyncCheckErrors() {
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        printf("Async kernel error: %s\n", cudaGetErrorString(err));
        return err;
    }

    // Check for execution errors
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        printf("Sync error: %s\n", cudaGetErrorString(err));
        return err;
    }

    return cudaSuccess;
}

Performance Optimization

Memory Coalescing

// Bad: Non-coalesced access
__global__ void badAccess(float* data, int width) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int row = idx / width;
    int col = idx % width;

    // Strided access pattern (bad)
    float value = data[col * width + row];  // Column-major access
}

// Good: Coalesced access
__global__ void goodAccess(float* data, int width) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Sequential access pattern (good)
    float value = data[row * width + col];  // Row-major access
}

Shared Memory Bank Conflicts

// Bad: Bank conflicts
__global__ void badSharedAccess() {
    __shared__ float shared[32][32];
    int tid = threadIdx.x;

    // All threads access same bank (conflict)
    float value = shared[0][tid];
}

// Good: No bank conflicts
__global__ void goodSharedAccess() {
    __shared__ float shared[32][33];  // Padding to avoid conflicts
    int tid = threadIdx.x;

    // Diagonal access pattern
    float value = shared[tid][(tid + offset) % 32];
}

Loop Unrolling

// Manual unrolling for small, known loop counts
__global__ void unrolledKernel(float* data) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    float sum = 0.0f;

    // Unroll small loops
    #pragma unroll 4
    for (int i = 0; i < 4; i++) {
        sum += data[idx * 4 + i];
    }

    data[idx] = sum;
}

Memory Bandwidth Optimization

// Vectorized memory access
__global__ void vectorizedAccess(float4* input, float4* output, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    if (idx < n) {
        float4 data = input[idx];  // Load 4 floats at once

        // Process each component
        data.x *= 2.0f;
        data.y *= 2.0f;
        data.z *= 2.0f;
        data.w *= 2.0f;

        output[idx] = data;  // Store 4 floats at once
    }
}

Instruction-Level Optimizations

__global__ void optimizedKernel(float* data, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    if (idx < n) {
        float x = data[idx];

        // Use fast math functions
        float result = __sinf(x) + __cosf(x);  // Fast sin/cos

        // Fast division (less accurate)
        result = __fdividef(result, 3.14159f);

        // Fast reciprocal square root
        result *= __frsqrt_rn(result);

        data[idx] = result;
    }
}

Common Patterns

Reduction

// Efficient block-level reduction
__global__ void blockReduce(float* input, float* output, int n) {
    __shared__ float sdata[256];

    int tid = threadIdx.x;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Load data
    sdata[tid] = (idx < n) ? input[idx] : 0.0f;
    __syncthreads();

    // Reduction in shared memory
    for (int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s) {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }

    // Write result
    if (tid == 0) {
        output[blockIdx.x] = sdata[0];
    }
}

Matrix Multiplication (Tiled)

#define TILE_SIZE 16

__global__ void matmulTiled(float* A, float* B, float* C, int N) {
    __shared__ float As[TILE_SIZE][TILE_SIZE];
    __shared__ float Bs[TILE_SIZE][TILE_SIZE];

    int row = blockIdx.y * TILE_SIZE + threadIdx.y;
    int col = blockIdx.x * TILE_SIZE + threadIdx.x;

    float sum = 0.0f;

    for (int tile = 0; tile < (N + TILE_SIZE - 1) / TILE_SIZE; tile++) {
        // Load tiles into shared memory
        if (row < N && tile * TILE_SIZE + threadIdx.x < N)
            As[threadIdx.y][threadIdx.x] = A[row * N + tile * TILE_SIZE + threadIdx.x];
        else
            As[threadIdx.y][threadIdx.x] = 0.0f;

        if (col < N && tile * TILE_SIZE + threadIdx.y < N)
            Bs[threadIdx.y][threadIdx.x] = B[(tile * TILE_SIZE + threadIdx.y) * N + col];
        else
            Bs[threadIdx.y][threadIdx.x] = 0.0f;

        __syncthreads();

        // Compute partial result
        for (int k = 0; k < TILE_SIZE; k++) {
            sum += As[threadIdx.y][k] * Bs[k][threadIdx.x];
        }

        __syncthreads();
    }

    // Write result
    if (row < N && col < N) {
        C[row * N + col] = sum;
    }
}

Scan (Prefix Sum)

__global__ void scanBlock(float* input, float* output, int n) {
    __shared__ float temp[2 * BLOCK_SIZE];

    int tid = threadIdx.x;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Load data
    temp[2 * tid] = (2 * idx < n) ? input[2 * idx] : 0;
    temp[2 * tid + 1] = (2 * idx + 1 < n) ? input[2 * idx + 1] : 0;

    __syncthreads();

    // Up-sweep phase
    for (int stride = 1; stride < 2 * BLOCK_SIZE; stride *= 2) {
        int index = (tid + 1) * stride * 2 - 1;
        if (index < 2 * BLOCK_SIZE) {
            temp[index] += temp[index - stride];
        }
        __syncthreads();
    }

    // Clear last element
    if (tid == 0) temp[2 * BLOCK_SIZE - 1] = 0;
    __syncthreads();

    // Down-sweep phase
    for (int stride = BLOCK_SIZE; stride > 0; stride /= 2) {
        int index = (tid + 1) * stride * 2 - 1;
        if (index < 2 * BLOCK_SIZE) {
            float t = temp[index];
            temp[index] += temp[index - stride];
            temp[index - stride] = t;
        }
        __syncthreads();
    }

    // Write results
    if (2 * idx < n) output[2 * idx] = temp[2 * tid];
    if (2 * idx + 1 < n) output[2 * idx + 1] = temp[2 * tid + 1];
}

Debugging & Profiling

CUDA-GDB

# Compile with debugging info
nvcc -g -G -O0 program.cu -o program

# Run with cuda-gdb
cuda-gdb ./program

# Common commands
(cuda-gdb) break main
(cuda-gdb) break kernel_name
(cuda-gdb) run
(cuda-gdb) cuda thread (0,0,0)  # Switch to specific thread
(cuda-gdb) cuda block (0,0)     # Switch to specific block
(cuda-gdb) print variable_name
(cuda-gdb) continue

Profiling with Nsight (nsys/ncu)

# Nsight Systems (nsys) for system-wide performance analysis
nsys profile --stats=true ./program

# Nsight Compute (ncu) for detailed kernel analysis
ncu --set full ./program

# Legacy profiling (nvprof) - deprecated
nvprof ./program

Printf Debugging

__global__ void debugKernel(float* data, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Debug specific threads
    if (blockIdx.x == 0 && threadIdx.x < 5) {
        printf("Block %d, Thread %d: data[%d] = %f\n",
               blockIdx.x, threadIdx.x, idx, data[idx]);
    }

    // Conditional debugging
    if (data[idx] < 0) {
        printf("Negative value at index %d: %f\n", idx, data[idx]);
    }
}

Device Management

Device Properties

void queryDevice() {
    int deviceCount;
    cudaGetDeviceCount(&deviceCount);

    for (int dev = 0; dev < deviceCount; dev++) {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, dev);

        printf("Device %d: %s\n", dev, prop.name);
        printf("  Compute capability: %d.%d\n", prop.major, prop.minor);
        printf("  Total global memory: %zu MB\n", prop.totalGlobalMem / (1024*1024));
        printf("  Shared memory per block: %zu KB\n", prop.sharedMemPerBlock / 1024);
        printf("  Max threads per block: %d\n", prop.maxThreadsPerBlock);
        printf("  Warp size: %d\n", prop.warpSize);
        printf("  Memory bus width: %d bits\n", prop.memoryBusWidth);
        printf("  Memory clock rate: %d MHz\n", prop.memoryClockRate / 1000);
    }
}

void setDevice() {
    int device = 0;  // Use device 0
    cudaSetDevice(device);

    // Verify device was set
    int currentDevice;
    cudaGetDevice(&currentDevice);
    printf("Using device %d\n", currentDevice);
}

Multi-GPU Programming

void multiGPU() {
    int deviceCount;
    cudaGetDeviceCount(&deviceCount);

    // Allocate data on each device
    float** d_data = new float*[deviceCount];
    cudaStream_t* streams = new cudaStream_t[deviceCount];

    for (int dev = 0; dev < deviceCount; dev++) {
        cudaSetDevice(dev);
        cudaMalloc(&d_data[dev], dataSize);
        cudaStreamCreate(&streams[dev]);

        // Launch work on this device
        kernel<<<grid, block, 0, streams[dev]>>>(d_data[dev]);
    }

    // Synchronize all devices
    for (int dev = 0; dev < deviceCount; dev++) {
        cudaSetDevice(dev);
        cudaStreamSynchronize(streams[dev]);
        cudaFree(d_data[dev]);
        cudaStreamDestroy(streams[dev]);
    }
}

Quick Reference

Memory Copy Directions

cudaMemcpyHostToDevice    // CPU → GPU
cudaMemcpyDeviceToHost    // GPU → CPU
cudaMemcpyDeviceToDevice  // GPU → GPU
cudaMemcpyHostToHost      // CPU → CPU (rarely used)

Built-in Variables

// Thread indexing
threadIdx.x, threadIdx.y, threadIdx.z  // Thread index within block
blockIdx.x, blockIdx.y, blockIdx.z     // Block index within grid
blockDim.x, blockDim.y, blockDim.z     // Block dimensions
gridDim.x, gridDim.y, gridDim.z        // Grid dimensions

// Warp information
warpSize                                // Usually 32

Mathematical Functions

// Standard math functions (slower, double precision available)
sin(), cos(), tan(), log(), exp(), sqrt(), pow()

// Fast math functions (faster, single precision only)
__sinf(), __cosf(), __tanf(), __logf(), __expf()
__fsqrt_rn(), __fdividef()

// Integer functions
__clz()      // Count leading zeros
__popc()     // Population count (number of set bits)
__brev()     // Bit reverse

Warp Shuffle Operations

// Warp shuffle functions (CC >= 3.0)
__shfl_sync(mask, var, srcLane)        // Direct lane access
__shfl_up_sync(mask, var, delta)       // Shift up
__shfl_down_sync(mask, var, delta)     // Shift down  
__shfl_xor_sync(mask, var, laneMask)   // XOR-based shuffle

// Example: Warp reduction
int value = threadIdx.x;
for (int offset = 16; offset > 0; offset /= 2) {
    value += __shfl_down_sync(0xffffffff, value, offset);
}

Common Gotchas & Best Practices

Memory Access Patterns

Always aim for coalesced memory access - threads in a warp should access consecutive memory addresses
Use shared memory for repeated data access - 100x faster than global memory
Avoid bank conflicts in shared memory - pad arrays or use different access patterns
Prefer AoS over SoA for coalesced access in most cases

Thread Management

Choose block sizes that are multiples of warp size (32) for best efficiency
Aim for high occupancy but not at all costs - sometimes fewer blocks with more shared memory is better
Use __syncthreads() carefully - all threads in the block must reach it
Avoid divergent branches within warps when possible

Performance Tips

Use fast math functions (__sinf, __expf, etc.) when precision allows
Minimize register usage to increase occupancy
Use texture memory for spatial locality in read-only data
Consider using unified memory for easier development, but profile performance
Profile your code with nsys/ncu to identify bottlenecks

Error Handling

Always check CUDA API return values using error checking macros
Use cudaGetLastError() after kernel launches to catch errors
Synchronize before error checking for kernel execution errors
Use CUDA-GDB for debugging device code when needed

This cheat sheet covers the essential CUDA programming concepts and patterns. For the most current information, always refer to the official CUDA documentation.