CUDA Programming
A comprehensive reference for CUDA C/C++ GPU programming, covering kernels, memory management, synchronization, and optimization techniques.
Quick Start
Installation & Setup
# Check CUDA installation
nvcc --version
nvidia-smi
# Compile CUDA program
nvcc program.cu -o program
# Compile with debugging info
nvcc -g -G program.cu -o program
# Compile for specific architecture
nvcc -arch=sm_80 program.cu -o program
Basic Program Structure
#include <cuda_runtime.h>
#include <stdio.h>
// Kernel function (runs on GPU)
__global__ void myKernel(float* data, int n) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < n) {
data[idx] *= 2.0f;
}
}
int main() {
// Host code
int n = 1024;
size_t size = n * sizeof(float);
// Allocate host memory
float* h_data = (float*)malloc(size);
for (int i = 0; i < n; i++) h_data[i] = (float)i; // Initialize host data
// Allocate device memory
float* d_data;
cudaMalloc(&d_data, size);
// Copy data to device
cudaMemcpy(d_data, h_data, size, cudaMemcpyHostToDevice);
// Launch kernel
int blockSize = 256;
int gridSize = (n + blockSize - 1) / blockSize;
myKernel<<<gridSize, blockSize>>>(d_data, n);
// Copy result back
cudaMemcpy(h_data, d_data, size, cudaMemcpyDeviceToHost);
// Cleanup
cudaFree(d_data);
free(h_data);
return 0;
}
Core Concepts
Execution Model
// Grid -> Blocks -> Threads hierarchy
__global__ void kernel() {
// Thread indices
int tid = threadIdx.x; // Thread within block
int bid = blockIdx.x; // Block within grid
int gid = bid * blockDim.x + tid; // Global thread index
// Multi-dimensional indexing
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
int idx = row * width + col;
}
// Launch configuration
dim3 gridSize(16, 16); // 16x16 blocks
dim3 blockSize(32, 32); // 32x32 threads per block
kernel<<<gridSize, blockSize>>>();
Function Qualifiers
__global__ void kernelFunction() { // Runs on GPU, called from host
// Kernel code
}
__device__ void deviceFunction() { // Runs on GPU, called from GPU
// Device function code
}
__host__ void hostFunction() { // Runs on CPU (default)
// Host function code
}
__host__ __device__ void bothFunction() { // Can run on both
// Code that works on both host and device
}
Variable Qualifiers
// __constant__ and __device__ variables must be declared at file scope
__constant__ float constValue; // Constant memory
__device__ float deviceGlobal; // Global device memory
__global__ void kernel() {
__shared__ float sharedArray[256]; // Shared memory (per block)
int localVar; // Register/local memory
}
Memory Management
Basic Memory Operations
// Device memory allocation
float* d_array;
size_t size = n * sizeof(float);
cudaError_t err = cudaMalloc(&d_array, size);
// Memory copy operations
cudaMemcpy(d_dst, h_src, size, cudaMemcpyHostToDevice);
cudaMemcpy(h_dst, d_src, size, cudaMemcpyDeviceToHost);
cudaMemcpy(d_dst, d_src, size, cudaMemcpyDeviceToDevice);
// Memory initialization
cudaMemset(d_array, 0, size);
// Free memory
cudaFree(d_array);
Asynchronous Memory Operations
// Create streams
cudaStream_t stream1, stream2;
cudaStreamCreate(&stream1);
cudaStreamCreate(&stream2);
// Asynchronous operations
cudaMemcpyAsync(d_data, h_data, size, cudaMemcpyHostToDevice, stream1);
kernel<<<grid, block, 0, stream1>>>(d_data);
cudaMemcpyAsync(h_data, d_data, size, cudaMemcpyDeviceToHost, stream1);
// Synchronization
cudaStreamSynchronize(stream1);
cudaDeviceSynchronize(); // Wait for all operations
// Cleanup
cudaStreamDestroy(stream1);
cudaStreamDestroy(stream2);
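Streams pay off when transfers overlap with compute. A minimal pipelining sketch, assuming an array streams[] of nStreams created streams, pinned h_data (next section), a chunk count that divides n evenly, and myKernel from the Quick Start:
int chunk = n / nStreams;
for (int i = 0; i < nStreams; i++) {
    int offset = i * chunk;
    // Each stream copies its chunk in, processes it, and copies it back;
    // chunks in different streams overlap on the copy engines and SMs
    cudaMemcpyAsync(&d_data[offset], &h_data[offset], chunk * sizeof(float),
                    cudaMemcpyHostToDevice, streams[i]);
    myKernel<<<(chunk + 255) / 256, 256, 0, streams[i]>>>(&d_data[offset], chunk);
    cudaMemcpyAsync(&h_data[offset], &d_data[offset], chunk * sizeof(float),
                    cudaMemcpyDeviceToHost, streams[i]);
}
cudaDeviceSynchronize();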
Pinned Memory
// Allocate pinned memory (faster transfers)
float* h_pinned;
cudaHostAlloc(&h_pinned, size, cudaHostAllocDefault);
// Or use cudaMallocHost
cudaMallocHost(&h_pinned, size);
// Free pinned memory
cudaFreeHost(h_pinned);
Unified Memory (Managed Memory)
// Allocate unified memory
float* unified_data;
cudaMallocManaged(&unified_data, size);
// Use directly in kernel
kernel<<<grid, block>>>(unified_data);
// Prefetch to device (optional optimization)
cudaMemPrefetchAsync(unified_data, size, 0); // Device 0
// Free unified memory
cudaFree(unified_data);
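Usage hints can reduce page migration for managed allocations. A hedged sketch, assuming the data should live on device 0 and is mostly read after initialization:
cudaMemAdvise(unified_data, size, cudaMemAdviseSetPreferredLocation, 0);
cudaMemAdvise(unified_data, size, cudaMemAdviseSetReadMostly, 0);
cudaMemPrefetchAsync(unified_data, size, 0); // prefetching still helps for the first touch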
Memory Types & Hierarchy
Global Memory
// Allocated with cudaMalloc
// Accessible by all threads
// Highest latency, largest capacity
__global__ void kernel(float* global_mem) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
global_mem[idx] = idx; // Global memory access
}
Shared Memory
#define TILE_SIZE 16
__global__ void sharedMemoryExample(float* input, float* output, int width) {
__shared__ float tile[TILE_SIZE][TILE_SIZE]; // Static allocation
// Dynamic shared memory (size specified at kernel launch)
extern __shared__ float dynamic_tile[];
int row = blockIdx.y * TILE_SIZE + threadIdx.y;
int col = blockIdx.x * TILE_SIZE + threadIdx.x;
// Load data to shared memory
tile[threadIdx.y][threadIdx.x] = input[row * width + col];
// Synchronize threads in block
__syncthreads();
// Use shared memory data
float result = tile[threadIdx.y][threadIdx.x] * 2.0f;
output[row * width + col] = result;
}
// Launch with dynamic shared memory
int sharedMemSize = TILE_SIZE * TILE_SIZE * sizeof(float);
sharedMemoryExample<<<grid, block, sharedMemSize>>>(input, output, width);
Constant Memory
// Declare constant memory (at file scope)
__constant__ float const_array[1024];
// Copy to constant memory
float host_array[1024];
cudaMemcpyToSymbol(const_array, host_array, sizeof(host_array));
__global__ void useConstant() {
float value = const_array[blockIdx.x]; // Fast broadcast: all threads in the warp read the same address
}
Texture Memory
// Texture object (modern approach; the old texture<> reference API is removed in CUDA 12)
__global__ void textureKernel(cudaTextureObject_t tex, int width, int height) {
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
// Read from texture (with optional hardware interpolation)
float value = tex2D<float>(tex, x + 0.5f, y + 0.5f);
}
// Setup texture object (host code)
cudaArray* cuArray;
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
cudaMallocArray(&cuArray, &channelDesc, width, height);
cudaResourceDesc resDesc = {};
resDesc.resType = cudaResourceTypeArray;
resDesc.res.array.array = cuArray;
cudaTextureDesc texDesc = {};
texDesc.addressMode[0] = cudaAddressModeClamp;
texDesc.addressMode[1] = cudaAddressModeClamp;
texDesc.filterMode = cudaFilterModeLinear;
texDesc.readMode = cudaReadModeElementType;
cudaTextureObject_t texObj = 0;
cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);
Kernel Launch & Configuration
Launch Configuration
// 1D configuration
int blockSize = 256;
int gridSize = (n + blockSize - 1) / blockSize;
kernel<<<gridSize, blockSize>>>(data);
// 2D configuration
dim3 blockSize(16, 16); // 256 threads per block
dim3 gridSize((width + 15) / 16, (height + 15) / 16);
kernel2D<<<gridSize, blockSize>>>(data);
// With shared memory
int sharedMemSize = blockSize.x * blockSize.y * sizeof(float);
kernel<<<gridSize, blockSize, sharedMemSize>>>(data);
// With streams
kernel<<<gridSize, blockSize, sharedMemSize, stream>>>(data);
Dynamic Parallelism
// Child kernel must be declared before the parent that launches it
__global__ void childKernel(float* data, int offset) {
int idx = threadIdx.x;
// Process data[offset + idx]
}
__global__ void parentKernel(float* data, int n) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < n && needsProcessing(data[idx])) { // needsProcessing: user-defined __device__ predicate
// Launch child kernel from device
childKernel<<<1, 32>>>(data, idx);
// Note: device-side cudaDeviceSynchronize() is deprecated since CUDA 11.6
// and removed in CUDA 12; child grids complete before the parent grid does
}
}
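Dynamic parallelism requires relocatable device code at compile time; a typical invocation (architecture flag shown as an example):
# Compile with relocatable device code for dynamic parallelism
nvcc -rdc=true -arch=sm_80 program.cu -o program -lcudadevrt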
Occupancy Calculation
int blockSize; // The launch configurator will suggest a block size
int minGridSize; // The minimum grid size needed to achieve max occupancy
int gridSize; // The actual grid size
// Calculate optimal block size
cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, myKernel, 0, 0);
// Round up according to array size
gridSize = (n + blockSize - 1) / blockSize;
myKernel<<<gridSize, blockSize>>>(data);
Thread Synchronization
Block-Level Synchronization
__global__ void syncExample() {
__shared__ float shared_data[256];
int tid = threadIdx.x;
// Load data
shared_data[tid] = input[blockIdx.x * blockDim.x + tid];
// Wait for all threads in block to finish loading
__syncthreads();
// Now all threads can safely access shared_data
float result = shared_data[255 - tid]; // Reverse access
// Another sync before writing results
__syncthreads();
output[blockIdx.x * blockDim.x + tid] = result;
}
Warp-Level Synchronization
#include <cooperative_groups.h>
__global__ void warpExample(int* result) {
auto warp = cooperative_groups::tiled_partition<32>(
cooperative_groups::this_thread_block());
int lane_id = warp.thread_rank();
int value = lane_id;
// Warp-level reduction (tile shuffles take no mask; the tile supplies it)
for (int delta = 16; delta > 0; delta /= 2) {
value += warp.shfl_down(value, delta);
}
if (lane_id == 0) {
// Thread 0 of each warp has the sum
atomicAdd(result, value);
}
}
Atomic Operations
// Basic atomic operations
__global__ void atomicsExample(int* counter, float* sum, int* histogram,
int* flag, float* data, int n) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= n) return;
// Atomic increment
int old_val = atomicAdd(counter, 1);
// Atomic float add (CC >= 2.0; double atomicAdd needs CC >= 6.0)
atomicAdd(sum, data[idx]);
// Atomic compare and swap
int expected = 0;
int desired = idx;
int old = atomicCAS(flag, expected, desired);
// Histogram example (assumes data values in [0, 1) and NUM_BINS defined)
int bin = (int)(data[idx] * NUM_BINS);
atomicAdd(&histogram[bin], 1);
}
// Custom atomic operations using CAS
__device__ float atomicMaxFloat(float* address, float val) {
int* address_as_int = (int*)address;
int old = *address_as_int, assumed;
do {
assumed = old;
old = atomicCAS(address_as_int, assumed,
__float_as_int(fmaxf(val, __int_as_float(assumed))));
} while (assumed != old);
return __int_as_float(old);
}
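A hedged usage sketch for the custom atomic, assuming *d_max was initialized to -FLT_MAX (from <float.h>) before launch:
__global__ void arrayMax(const float* data, float* d_max, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    // Every thread folds its element into the running maximum
    if (idx < n) atomicMaxFloat(d_max, data[idx]);
}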
Cooperative Groups
#include <cooperative_groups.h>
using namespace cooperative_groups;
__global__ void cooperativeKernel() {
// Thread block group
thread_block block = this_thread_block();
// Warp-sized tile
auto tile32 = tiled_partition<32>(block);
// Grid group (requires cooperative launch)
grid_group grid = this_grid();
// Synchronize at different levels
block.sync(); // Block-level sync
tile32.sync(); // Warp-level sync
grid.sync(); // Grid-level sync (cooperative kernels only)
}
// Launch cooperative kernel (required for grid.sync())
void* args[] = { /* pointers to the kernel's arguments, if any */ };
cudaLaunchCooperativeKernel((void*)cooperativeKernel, gridSize, blockSize, args);
Error Handling
Basic Error Checking
#define CUDA_CHECK(call) \
do { \
cudaError_t err = call; \
if (err != cudaSuccess) { \
fprintf(stderr, "CUDA error at %s:%d - %s\n", \
__FILE__, __LINE__, cudaGetErrorString(err)); \
exit(EXIT_FAILURE); \
} \
} while(0)
// Usage
CUDA_CHECK(cudaMalloc(&d_data, size));
CUDA_CHECK(cudaMemcpy(d_data, h_data, size, cudaMemcpyHostToDevice));
// Check kernel launch errors
kernel<<<grid, block>>>(data);
CUDA_CHECK(cudaGetLastError());
CUDA_CHECK(cudaDeviceSynchronize());
Advanced Error Handling
// Error checking function
void checkCudaErrors(cudaError_t result) {
if (result != cudaSuccess) {
fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
assert(result == cudaSuccess);
}
}
// Async error checking
cudaError_t asyncCheckErrors() {
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf("Async kernel error: %s\n", cudaGetErrorString(err));
return err;
}
// Check for execution errors
err = cudaDeviceSynchronize();
if (err != cudaSuccess) {
printf("Sync error: %s\n", cudaGetErrorString(err));
return err;
}
return cudaSuccess;
}
Performance Optimization
Memory Coalescing
// Bad: Non-coalesced access
__global__ void badAccess(float* data, int width) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int row = idx / width;
int col = idx % width;
// Strided access pattern (bad)
float value = data[col * width + row]; // Column-major access
}
// Good: Coalesced access
__global__ void goodAccess(float* data, int width) {
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
// Sequential access pattern (good)
float value = data[row * width + col]; // Row-major access
}
Shared Memory Bank Conflicts
// Bad: Bank conflicts
__global__ void badSharedAccess() {
__shared__ float shared[32][32];
int tid = threadIdx.x;
// Column access: consecutive threads are 32 floats apart,
// so the whole warp hits the same bank (32-way conflict)
float value = shared[tid][0];
}
// Good: No bank conflicts
__global__ void goodSharedAccess() {
__shared__ float shared[32][33]; // Pad each row by one element
int tid = threadIdx.x;
// The same column access now maps each thread to a different bank
float value = shared[tid][0];
}
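Padding pays off in kernels like matrix transpose, where the shared tile is read by row and written by column. A sketch, assuming the kernel is launched with TILE_DIM x TILE_DIM blocks (TILE_DIM is a name introduced here):
#define TILE_DIM 32
__global__ void transposeTiled(const float* in, float* out, int width, int height) {
    __shared__ float tile[TILE_DIM][TILE_DIM + 1]; // +1 padding avoids bank conflicts
    int x = blockIdx.x * TILE_DIM + threadIdx.x;
    int y = blockIdx.y * TILE_DIM + threadIdx.y;
    if (x < width && y < height)
        tile[threadIdx.y][threadIdx.x] = in[y * width + x]; // Coalesced read
    __syncthreads();
    // Swap block coordinates so the write is also coalesced
    x = blockIdx.y * TILE_DIM + threadIdx.x;
    y = blockIdx.x * TILE_DIM + threadIdx.y;
    if (x < height && y < width)
        out[y * height + x] = tile[threadIdx.x][threadIdx.y]; // Conflict-free thanks to padding
}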
Loop Unrolling
// Manual unrolling for small, known loop counts
__global__ void unrolledKernel(float* data) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
float sum = 0.0f;
// Unroll small loops
#pragma unroll 4
for (int i = 0; i < 4; i++) {
sum += data[idx * 4 + i];
}
data[idx] = sum;
}
Memory Bandwidth Optimization
// Vectorized memory access
__global__ void vectorizedAccess(float4* input, float4* output, int n) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < n) {
float4 data = input[idx]; // Load 4 floats at once
// Process each component
data.x *= 2.0f;
data.y *= 2.0f;
data.z *= 2.0f;
data.w *= 2.0f;
output[idx] = data; // Store 4 floats at once
}
}
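A hedged host-side usage sketch (d_in and d_out are assumed device allocations; cudaMalloc returns pointers aligned for float4, and n is assumed to be a multiple of 4):
int n4 = n / 4;
int block = 256;
vectorizedAccess<<<(n4 + block - 1) / block, block>>>(
    reinterpret_cast<float4*>(d_in), reinterpret_cast<float4*>(d_out), n4);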
Instruction-Level Optimizations
__global__ void optimizedKernel(float* data, int n) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < n) {
float x = data[idx];
// Use fast math functions
float result = __sinf(x) + __cosf(x); // Fast sin/cos
// Fast division (less accurate)
result = __fdividef(result, 3.14159f);
// Fast reciprocal square root
result *= __frsqrt_rn(result);
data[idx] = result;
}
}
Common Patterns
Reduction
// Efficient block-level reduction
__global__ void blockReduce(float* input, float* output, int n) {
__shared__ float sdata[256];
int tid = threadIdx.x;
int idx = blockIdx.x * blockDim.x + threadIdx.x;
// Load data
sdata[tid] = (idx < n) ? input[idx] : 0.0f;
__syncthreads();
// Reduction in shared memory
for (int s = blockDim.x / 2; s > 0; s >>= 1) {
if (tid < s) {
sdata[tid] += sdata[tid + s];
}
__syncthreads();
}
// Write result
if (tid == 0) {
output[blockIdx.x] = sdata[0];
}
}
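Each launch leaves one partial sum per block, so the reduction must be finished with another pass (or on the host). A sketch, assuming d_in, d_partial, and d_result are preallocated and the first pass yields at most 256 partials:
int blocks = (n + 255) / 256;
blockReduce<<<blocks, 256>>>(d_in, d_partial, n);       // one partial sum per block
blockReduce<<<1, 256>>>(d_partial, d_result, blocks);   // valid while blocks <= 256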
Matrix Multiplication (Tiled)
#define TILE_SIZE 16
__global__ void matmulTiled(float* A, float* B, float* C, int N) {
__shared__ float As[TILE_SIZE][TILE_SIZE];
__shared__ float Bs[TILE_SIZE][TILE_SIZE];
int row = blockIdx.y * TILE_SIZE + threadIdx.y;
int col = blockIdx.x * TILE_SIZE + threadIdx.x;
float sum = 0.0f;
for (int tile = 0; tile < (N + TILE_SIZE - 1) / TILE_SIZE; tile++) {
// Load tiles into shared memory
if (row < N && tile * TILE_SIZE + threadIdx.x < N)
As[threadIdx.y][threadIdx.x] = A[row * N + tile * TILE_SIZE + threadIdx.x];
else
As[threadIdx.y][threadIdx.x] = 0.0f;
if (col < N && tile * TILE_SIZE + threadIdx.y < N)
Bs[threadIdx.y][threadIdx.x] = B[(tile * TILE_SIZE + threadIdx.y) * N + col];
else
Bs[threadIdx.y][threadIdx.x] = 0.0f;
__syncthreads();
// Compute partial result
for (int k = 0; k < TILE_SIZE; k++) {
sum += As[threadIdx.y][k] * Bs[k][threadIdx.x];
}
__syncthreads();
}
// Write result
if (row < N && col < N) {
C[row * N + col] = sum;
}
}
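Host-side launch configuration for the tiled kernel (d_A, d_B, d_C assumed to be N x N device allocations):
dim3 block(TILE_SIZE, TILE_SIZE);
dim3 grid((N + TILE_SIZE - 1) / TILE_SIZE, (N + TILE_SIZE - 1) / TILE_SIZE);
matmulTiled<<<grid, block>>>(d_A, d_B, d_C, N);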
Scan (Prefix Sum)
#define BLOCK_SIZE 256 // Threads per block; each thread handles two elements
__global__ void scanBlock(float* input, float* output, int n) {
__shared__ float temp[2 * BLOCK_SIZE];
int tid = threadIdx.x;
int idx = blockIdx.x * blockDim.x + threadIdx.x;
// Load data
temp[2 * tid] = (2 * idx < n) ? input[2 * idx] : 0;
temp[2 * tid + 1] = (2 * idx + 1 < n) ? input[2 * idx + 1] : 0;
__syncthreads();
// Up-sweep phase
for (int stride = 1; stride < 2 * BLOCK_SIZE; stride *= 2) {
int index = (tid + 1) * stride * 2 - 1;
if (index < 2 * BLOCK_SIZE) {
temp[index] += temp[index - stride];
}
__syncthreads();
}
// Clear last element
if (tid == 0) temp[2 * BLOCK_SIZE - 1] = 0;
__syncthreads();
// Down-sweep phase
for (int stride = BLOCK_SIZE; stride > 0; stride /= 2) {
int index = (tid + 1) * stride * 2 - 1;
if (index < 2 * BLOCK_SIZE) {
float t = temp[index];
temp[index] += temp[index - stride];
temp[index - stride] = t;
}
__syncthreads();
}
// Write results
if (2 * idx < n) output[2 * idx] = temp[2 * tid];
if (2 * idx + 1 < n) output[2 * idx + 1] = temp[2 * tid + 1];
}
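Note that this is an exclusive (Blelloch) scan over a single block's 2 * BLOCK_SIZE elements. To scan larger arrays, also store each block's total, scan those block sums with the same kernel, then add the scanned sums back to the corresponding blocks in a final pass.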
Debugging & Profiling
CUDA-GDB
# Compile with debugging info
nvcc -g -G -O0 program.cu -o program
# Run with cuda-gdb
cuda-gdb ./program
# Common commands
(cuda-gdb) break main
(cuda-gdb) break kernel_name
(cuda-gdb) run
(cuda-gdb) cuda thread (0,0,0) # Switch to specific thread
(cuda-gdb) cuda block (0,0) # Switch to specific block
(cuda-gdb) print variable_name
(cuda-gdb) continue
Profiling with Nsight (nsys/ncu)
# Nsight Systems (nsys) for system-wide performance analysis
nsys profile --stats=true ./program
# Nsight Compute (ncu) for detailed kernel analysis
ncu --set full ./program
# Legacy profiling (nvprof) - deprecated
nvprof ./program
Printf Debugging
__global__ void debugKernel(float* data, int n) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
// Debug specific threads
if (blockIdx.x == 0 && threadIdx.x < 5) {
printf("Block %d, Thread %d: data[%d] = %f\n",
blockIdx.x, threadIdx.x, idx, data[idx]);
}
// Conditional debugging
if (data[idx] < 0) {
printf("Negative value at index %d: %f\n", idx, data[idx]);
}
}
Device Management
Device Properties
void queryDevice() {
int deviceCount;
cudaGetDeviceCount(&deviceCount);
for (int dev = 0; dev < deviceCount; dev++) {
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, dev);
printf("Device %d: %s\n", dev, prop.name);
printf(" Compute capability: %d.%d\n", prop.major, prop.minor);
printf(" Total global memory: %zu MB\n", prop.totalGlobalMem / (1024*1024));
printf(" Shared memory per block: %zu KB\n", prop.sharedMemPerBlock / 1024);
printf(" Max threads per block: %d\n", prop.maxThreadsPerBlock);
printf(" Warp size: %d\n", prop.warpSize);
printf(" Memory bus width: %d bits\n", prop.memoryBusWidth);
printf(" Memory clock rate: %d MHz\n", prop.memoryClockRate / 1000);
}
}
void setDevice() {
int device = 0; // Use device 0
cudaSetDevice(device);
// Verify device was set
int currentDevice;
cudaGetDevice(&currentDevice);
printf("Using device %d\n", currentDevice);
}
Multi-GPU Programming
void multiGPU(size_t dataSize, dim3 grid, dim3 block) {
int deviceCount;
cudaGetDeviceCount(&deviceCount);
// Allocate data on each device
float** d_data = new float*[deviceCount];
cudaStream_t* streams = new cudaStream_t[deviceCount];
for (int dev = 0; dev < deviceCount; dev++) {
cudaSetDevice(dev);
cudaMalloc(&d_data[dev], dataSize);
cudaStreamCreate(&streams[dev]);
// Launch work on this device
kernel<<<grid, block, 0, streams[dev]>>>(d_data[dev]);
}
// Synchronize all devices
for (int dev = 0; dev < deviceCount; dev++) {
cudaSetDevice(dev);
cudaStreamSynchronize(streams[dev]);
cudaFree(d_data[dev]);
cudaStreamDestroy(streams[dev]);
}
delete[] d_data;
delete[] streams;
}
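When devices can address each other (NVLink or the same PCIe root complex), peer-to-peer access avoids staging through the host. A hedged sketch for devices 0 and 1:
int canAccess = 0;
cudaDeviceCanAccessPeer(&canAccess, 0, 1); // can device 0 access device 1?
if (canAccess) {
    cudaSetDevice(0);
    cudaDeviceEnablePeerAccess(1, 0); // flags argument must be 0
    // Kernels on device 0 may now dereference device 1 pointers, and
    // cudaMemcpyPeer(dst, 1, src, 0, size) copies directly between GPUs
}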
Quick Reference
Memory Copy Directions
cudaMemcpyHostToDevice // CPU → GPU
cudaMemcpyDeviceToHost // GPU → CPU
cudaMemcpyDeviceToDevice // GPU → GPU
cudaMemcpyHostToHost // CPU → CPU (rarely used)
Built-in Variables
// Thread indexing
threadIdx.x, threadIdx.y, threadIdx.z // Thread index within block
blockIdx.x, blockIdx.y, blockIdx.z // Block index within grid
blockDim.x, blockDim.y, blockDim.z // Block dimensions
gridDim.x, gridDim.y, gridDim.z // Grid dimensions
// Warp information
warpSize // Usually 32
Mathematical Functions
// Standard math functions (slower, double precision available)
sin(), cos(), tan(), log(), exp(), sqrt(), pow()
// Fast math functions (faster, single precision only)
__sinf(), __cosf(), __tanf(), __logf(), __expf()
__fsqrt_rn(), __fdividef()
// Integer functions
__clz() // Count leading zeros
__popc() // Population count (number of set bits)
__brev() // Bit reverse
Warp Shuffle Operations
// Warp shuffle functions (CC >= 3.0; the _sync variants require CUDA 9+)
__shfl_sync(mask, var, srcLane) // Direct lane access
__shfl_up_sync(mask, var, delta) // Shift up
__shfl_down_sync(mask, var, delta) // Shift down
__shfl_xor_sync(mask, var, laneMask) // XOR-based shuffle
// Example: Warp reduction
int value = threadIdx.x;
for (int offset = 16; offset > 0; offset /= 2) {
value += __shfl_down_sync(0xffffffff, value, offset);
}
Common Gotchas & Best Practices
Memory Access Patterns
- Always aim for coalesced memory access - threads in a warp should access consecutive memory addresses
- Use shared memory for repeated data access - it has far lower latency and far higher bandwidth than global memory
- Avoid bank conflicts in shared memory - pad arrays or use different access patterns
- Prefer SoA (structure of arrays) over AoS (array of structures) for coalesced access in most cases, as shown in the sketch below
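A hypothetical illustration: with AoS, each thread's read of one field is strided by the struct size; with SoA, reads of one field across a warp are contiguous.
struct Particle  { float x, y, z, w; };       // AoS: strided per-field access
struct Particles { float *x, *y, *z, *w; };   // SoA: coalesced per-field access
__global__ void aosKernel(Particle* p, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) p[idx].x *= 2.0f;   // 16-byte stride between consecutive threads
}
__global__ void soaKernel(Particles p, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) p.x[idx] *= 2.0f;   // consecutive threads touch consecutive floats
}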
Thread Management
- Choose block sizes that are multiples of warp size (32) for best efficiency
- Aim for high occupancy but not at all costs - sometimes fewer blocks with more shared memory is better
- Use __syncthreads() carefully - all threads in the block must reach it
- Avoid divergent branches within warps when possible (see the sketch below)
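A hedged illustration of divergence: branching on a per-thread value splits the warp, while branching on a warp-uniform value does not.
__global__ void divergent(float* data) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx % 2 == 0) data[idx] *= 2.0f;   // even/odd lanes take different paths,
    else              data[idx] *= 0.5f;   // so the warp executes both branches serially
}
__global__ void uniform(float* data) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (blockIdx.x % 2 == 0) data[idx] *= 2.0f; // whole warp takes the same path
    else                     data[idx] *= 0.5f;
}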
Performance Tips
- Use fast math functions (__sinf, __expf, etc.) when precision allows
- Minimize register usage to increase occupancy
- Use texture memory for spatial locality in read-only data
- Consider using unified memory for easier development, but profile performance
- Profile your code with nsys/ncu to identify bottlenecks
Error Handling
- Always check CUDA API return values using error checking macros
- Use cudaGetLastError() after kernel launches to catch errors
- Synchronize before error checking for kernel execution errors
- Use CUDA-GDB for debugging device code when needed
This cheat sheet covers the essential CUDA programming concepts and patterns. For the most current information, always refer to the official CUDA documentation.