PyTorch
Installation
# CPU only
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
# CUDA (check https://pytorch.org for your CUDA version)
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
# AMD ROCm
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm5.6
# Intel GPU (XPU)
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/xpu
# Development version (nightly)
pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu
Import Essentials
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import torchvision
import torchvision.transforms as transforms
import numpy as np
Tensor Basics
Creating Tensors
# From data (dtype is inferred from the values unless given explicitly)
x = torch.tensor([1, 2, 3])
x = torch.tensor([[1, 2], [3, 4]], dtype=torch.float32)
# Zeros and ones
x = torch.zeros(3, 4)
x = torch.ones(2, 3)
x = torch.eye(3)  # identity matrix
# Random tensors
x = torch.randn(2, 3)  # standard normal distribution
x = torch.rand(2, 3)  # uniform [0, 1)
x = torch.randint(0, 10, (2, 3))  # random integers in [0, 10); upper bound is exclusive
# From numpy (the tensor shares memory with the array; no copy is made)
numpy_array = np.array([1, 2, 3])
x = torch.from_numpy(numpy_array)
# Ranges
x = torch.arange(0, 10, 2)  # [0, 2, 4, 6, 8]; end is exclusive
x = torch.linspace(0, 1, 5)  # [0, 0.25, 0.5, 0.75, 1.0]; end is inclusive
Tensor Properties
x = torch.randn(3, 4, 5)
print(x.shape) # torch.Size([3, 4, 5])
print(x.size()) # torch.Size([3, 4, 5])
print(x.dtype) # torch.float32
print(x.device) # cpu or cuda:0
print(x.ndim) # 3
print(x.numel()) # 60 (total elements)
Tensor Operations
# Arithmetic
x = torch.tensor([1, 2, 3])
y = torch.tensor([4, 5, 6])
z = x + y  # or torch.add(x, y)
z = x - y  # or torch.sub(x, y)
z = x * y  # element-wise multiplication
z = x / y  # element-wise true division (float result even for integer inputs)
z = x @ y  # dot product, because both operands are 1-D
z = torch.matmul(x, y)  # same operation as x @ y; a matrix product when operands are 2-D
# In-place operations (end with _)
x.add_(1)  # adds 1 to x in-place
x.mul_(2)  # multiplies x by 2 in-place
Reshaping and Indexing
x = torch.randn(4, 4)
# Reshaping
x = x.view(16)  # or x.view(-1)
x = x.view(2, 8)
x = x.reshape(4, 4)  # more flexible than view
x = x.unsqueeze(0)  # add dimension at index 0 -> shape (1, 4, 4)
x = x.squeeze(0)  # remove that size-1 dimension again -> shape (4, 4)
# Indexing (x is 2-D here, so the first index is the row and the second the column)
x[0, 1]  # element at row 0, col 1
x[:, 1]  # all rows, column 1
x[1, :]  # row 1, all columns
x[0:2, 1:3]  # submatrix
# Advanced indexing
mask = x > 0
x[mask]  # elements where mask is True (returned as a 1-D tensor)
Device Management
# Pick the GPU when one is usable, otherwise stay on the CPU
cuda_ok = torch.cuda.is_available()
device = torch.device('cuda') if cuda_ok else torch.device('cpu')
print(f"Using device: {device}")
# Move an existing tensor to the chosen device
x = torch.randn(3, 3).to(device)
# Equivalent explicit form using .cuda()
if torch.cuda.is_available():
    x = x.cuda()
# Allocate a tensor on the device from the start
x = torch.randn(3, 3, device=device)
Neural Networks
Basic Neural Network
class SimpleNet(nn.Module):
    """Two-layer fully connected network: Linear -> ReLU -> Dropout(0.2) -> Linear."""

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        """Map inputs with ``input_size`` features to ``output_size`` outputs."""
        hidden = self.dropout(self.relu(self.fc1(x)))
        return self.fc2(hidden)


# Instantiate the network and show its layers
model = SimpleNet(784, 128, 10)
print(model)
Convolutional Neural Network
class ConvNet(nn.Module):
    """Small CNN: two conv+pool stages followed by two fully connected layers.

    The first linear layer is sized for 32 * 8 * 8 features. The 3x3
    convolutions use padding=1 and each 2x2 pool halves the spatial size, so
    the network expects 3-channel 32x32 inputs.

    Args:
        num_classes: Number of output logits.
    """

    def __init__(self, num_classes=10):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 16, 3, padding=1)
        self.conv2 = nn.Conv2d(16, 32, 3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        self.fc1 = nn.Linear(32 * 8 * 8, 128)
        self.fc2 = nn.Linear(128, num_classes)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        """Return ``(batch, num_classes)`` logits for a ``(batch, 3, 32, 32)`` input."""
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        # Flatten per sample instead of view(-1, 32 * 8 * 8): with a wrong input
        # size the old form silently folded extra features into the batch
        # dimension, whereas this form fails loudly in fc1.
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return x
Common Layers
# Constructor reference. Lowercase names such as in_features, out_channels or
# num_features are placeholders that are not defined in this snippet;
# substitute concrete values before running.
# Linear layers
nn.Linear(in_features, out_features)
nn.Linear(784, 10, bias=False)  # no additive bias term
# Convolutional layers
nn.Conv1d(in_channels, out_channels, kernel_size)
nn.Conv2d(1, 32, 3, stride=1, padding=1)  # positional args: in_channels, out_channels, kernel_size
nn.Conv3d(1, 16, 3)
# Pooling layers
nn.MaxPool2d(kernel_size=2)
nn.AvgPool2d(kernel_size=2, stride=2)
nn.AdaptiveAvgPool2d((1, 1))  # argument is the target output size
# Normalization
nn.BatchNorm1d(num_features)
nn.BatchNorm2d(num_features)
nn.LayerNorm(normalized_shape)
# Activation functions
nn.ReLU()
nn.LeakyReLU(negative_slope=0.01)
nn.Sigmoid()
nn.Tanh()
nn.Softmax(dim=1)  # normalizes along dimension 1
# Regularization
nn.Dropout(p=0.5)
nn.Dropout2d(p=0.5)  # zeroes entire channels rather than single elements
Loss Functions
# Each assignment below is an alternative; a later one replaces the earlier one.
# Classification
criterion = nn.CrossEntropyLoss()  # multi-class; takes raw logits (applies log-softmax internally)
criterion = nn.BCELoss()  # binary cross entropy on probabilities (apply sigmoid first)
criterion = nn.BCEWithLogitsLoss()  # sigmoid + BCE in one step; takes raw logits
# Regression
criterion = nn.MSELoss()  # mean squared error
criterion = nn.L1Loss()  # mean absolute error
criterion = nn.SmoothL1Loss()  # smooth L1; equals the Huber loss at the default beta=1.0
# Custom loss example
class CustomLoss(nn.Module):
    """Example custom criterion: mean of the squared prediction error."""

    def __init__(self):
        super().__init__()

    def forward(self, predictions, targets):
        """Return the mean over all elements of ``(predictions - targets) ** 2``."""
        residual = predictions - targets
        return residual.pow(2).mean()
Optimizers
# Common optimizers (alternatives; each assignment replaces the previous one)
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9, weight_decay=1e-4)
optimizer = optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.999))
optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.01)
optimizer = optim.RMSprop(model.parameters(), lr=0.01, alpha=0.9)
# Learning rate schedulers (alternatives; each wraps the optimizer chosen above)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)  # lr *= gamma every step_size scheduler steps
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=10)  # needs scheduler.step(metric) with the monitored value
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=50)
Training Loop
def train_model(model, train_loader, criterion, optimizer, num_epochs, device):
    """Train ``model`` in place for ``num_epochs`` passes over ``train_loader``.

    Args:
        model: Module to optimize; switched to training mode.
        train_loader: Sized iterable yielding ``(data, targets)`` batches.
        criterion: Callable ``criterion(outputs, targets)`` returning a scalar loss.
        optimizer: Optimizer holding the model's parameters.
        num_epochs: Number of passes over ``train_loader``.
        device: Device the batches are moved to before the forward pass.

    Progress is printed every 100 batches and the mean batch loss is printed
    after each epoch. An empty loader reports ``nan`` as the epoch average
    instead of raising ``ZeroDivisionError``.
    """
    model.train()
    num_batches = len(train_loader)  # invariant across epochs, so evaluate once
    for epoch in range(num_epochs):
        running_loss = 0.0
        for batch_idx, (data, targets) in enumerate(train_loader):
            # Move data to device
            data, targets = data.to(device), targets.to(device)
            # Zero gradients left over from the previous step
            optimizer.zero_grad()
            # Forward pass
            outputs = model(data)
            loss = criterion(outputs, targets)
            # Backward pass and parameter update
            loss.backward()
            optimizer.step()
            # Read the scalar once; it is used for both the total and the log line
            batch_loss = loss.item()
            running_loss += batch_loss
            if batch_idx % 100 == 0:
                print(f'Epoch {epoch+1}/{num_epochs}, '
                      f'Batch {batch_idx}/{num_batches}, '
                      f'Loss: {batch_loss:.4f}')
        # Optional: step scheduler
        # scheduler.step()
        epoch_loss = running_loss / num_batches if num_batches else float('nan')
        print(f'Epoch {epoch+1} average loss: {epoch_loss:.4f}')
# Usage
train_model(model, train_loader, criterion, optimizer, num_epochs=10, device=device)
Evaluation
def evaluate_model(model, test_loader, device):
    """Compute classification accuracy and mean cross-entropy over ``test_loader``.

    Args:
        model: Module producing per-class scores; switched to eval mode.
        test_loader: Iterable yielding ``(data, targets)`` batches.
        device: Device the batches are moved to before the forward pass.

    Returns:
        ``(accuracy, avg_loss)`` where ``accuracy`` is a percentage and
        ``avg_loss`` is the summed cross-entropy divided by the sample count.
        Both are ``nan`` when the loader yields no samples, instead of raising
        ``ZeroDivisionError``.
    """
    model.eval()
    correct = 0
    total = 0
    test_loss = 0.0
    with torch.no_grad():
        for data, targets in test_loader:
            data, targets = data.to(device), targets.to(device)
            outputs = model(data)
            # Sum (not mean) per batch so the final average is per sample
            test_loss += F.cross_entropy(outputs, targets, reduction='sum').item()
            # argmax replaces torch.max(outputs.data, 1); .data is legacy and
            # unnecessary inside no_grad
            predicted = outputs.argmax(dim=1)
            total += targets.size(0)
            correct += (predicted == targets).sum().item()
    if total == 0:
        accuracy = avg_loss = float('nan')
    else:
        accuracy = 100 * correct / total
        avg_loss = test_loss / total
    print(f'Test Accuracy: {accuracy:.2f}%')
    print(f'Test Loss: {avg_loss:.4f}')
    return accuracy, avg_loss
Data Loading
Custom Dataset
class CustomDataset(Dataset):
    """Map-style dataset pairing ``data[idx]`` with ``labels[idx]``.

    An optional ``transform`` callable is applied to the sample (not the
    label) each time an item is fetched.
    """

    def __init__(self, data, labels, transform=None):
        self.data, self.labels = data, labels
        self.transform = transform

    def __len__(self):
        """Return the number of samples, taken from ``data``."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(sample, label)`` for ``idx``, transforming the sample if configured."""
        item, target = self.data[idx], self.labels[idx]
        if not self.transform:
            return item, target
        return self.transform(item), target
# Usage
dataset = CustomDataset(data, labels, transform=transforms.ToTensor())
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, num_workers=4)
from torchvision import transforms
# Training pipeline: resize, random augmentation, tensor conversion, per-channel normalization
train_steps = [
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomRotation(degrees=10),
    transforms.ColorJitter(brightness=0.2, contrast=0.2),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
]
transform = transforms.Compose(train_steps)
# Evaluation pipeline: same resize and normalization, no random augmentation
eval_steps = [
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
]
test_transform = transforms.Compose(eval_steps)
Model Saving and Loading
# Save model
torch.save(model.state_dict(), 'model_weights.pth')  # weights only
torch.save(model, 'complete_model.pth')  # pickles the whole module object
# Save checkpoint
checkpoint = {
    'epoch': epoch,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'loss': loss,
}
torch.save(checkpoint, 'checkpoint.pth')
# Load model
# weights_only=True restricts unpickling to tensors and plain containers,
# which is all a state_dict needs.
model.load_state_dict(torch.load('model_weights.pth', map_location=device, weights_only=True))
# A fully pickled module cannot be read with weights_only=True (the default in
# recent PyTorch releases), so it must be disabled explicitly.
# NOTE: this executes arbitrary pickled code - only load files you trust.
model = torch.load('complete_model.pth', map_location=device, weights_only=False)
# Load checkpoint
checkpoint = torch.load('checkpoint.pth', map_location=device, weights_only=True)
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
epoch = checkpoint['epoch']
loss = checkpoint['loss']
Autograd and Gradients
# Ask autograd to track operations on this tensor
x = torch.randn(3, requires_grad=True)
# Forward pass
y = x.sum()
# Backward pass
y.backward()
print(x.grad)  # gradient of sum() with respect to x: one per element
# Gradient context managers
with torch.no_grad():
    # Operations here won't track gradients
    # (model is not defined in this snippet; it refers to a module built earlier)
    y = model(x)
# Temporarily enable gradients
with torch.enable_grad():
    # Operations here will track gradients
    pass
# Manual gradient computation
def custom_backward(x):
    """Backpropagate through ``x ** 2`` and return the gradient stored on ``x``.

    A tensor of ones is supplied as the upstream gradient, so non-scalar
    inputs are supported. The result is read from ``x.grad``.
    """
    x.retain_grad()
    squared = x ** 2
    upstream = torch.ones_like(squared)
    squared.backward(upstream)
    return x.grad
Mixed Precision Training
from torch.amp import autocast, GradScaler
# Initialize scaler
scaler = GradScaler()
def train_with_amp(model, train_loader, criterion, optimizer, device):
model.train()
for data, targets in train_loader:
data, targets = data.to(device), targets.to(device)
optimizer.zero_grad()
# Use autocast for forward pass
with autocast(device_type='cuda'):
outputs = model(data)
loss = criterion(outputs, targets)
# Scale loss and backward
scaler.scale(loss).backward()
scaler.step(optimizer)
scaler.update()
torch.compile (PyTorch 2.0+)
# Optimize model with torch.compile (returns a wrapped, compiled module)
model = torch.compile(model)
# With specific backend
model = torch.compile(model, backend="inductor")
# Decorator form: compiles a plain function rather than a module
@torch.compile
def inference_function(x):
    return torch.sin(x).cos()
# Disable compilation for debugging
model = torch.compile(model, disable=True)
Model Utilities
Parameter Counting
def count_parameters(model):
    """Return the total element count of parameters that have ``requires_grad`` set."""
    trainable = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable += param.numel()
    return trainable
print(f"Model has {count_parameters(model):,} trainable parameters")
Model Summary
def _shape_of(value):
    """Return the size list of a tensor, nested lists for tuples/lists, else None."""
    if isinstance(value, torch.Tensor):
        return list(value.size())
    if isinstance(value, (tuple, list)):
        return [_shape_of(item) for item in value]
    return None


def model_summary(model, input_size):
    """Run one random forward pass and record per-module shapes and parameter counts.

    Args:
        model: Module to inspect. Its current train/eval mode is left as is.
        input_size: Full shape of the dummy input, including the batch dimension.

    Returns:
        Dict keyed ``"<ClassName>-<n>"`` in forward-call order. Each value has
        ``input_shape`` (shape of the first positional input), ``output_shape``
        and ``nb_params`` (parameter elements of that module, including its
        children). ``nn.Sequential`` and ``nn.ModuleList`` containers are skipped.

    Hooks are removed even when the forward pass raises. The pass runs under
    ``no_grad`` because no gradients are needed. Modules returning tuples or
    lists get nested shape lists, and a model without parameters is run on
    the CPU.
    """
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")
    summary = {}
    hooks = []

    def hook(module, inputs, output):
        m_key = f"{type(module).__name__}-{len(summary) + 1}"
        summary[m_key] = {
            "input_shape": _shape_of(inputs[0]) if inputs else None,
            "output_shape": _shape_of(output),
            "nb_params": sum(param.nelement() for param in module.parameters()),
        }

    def register_hook(module):
        if not isinstance(module, (nn.Sequential, nn.ModuleList)):
            hooks.append(module.register_forward_hook(hook))

    model.apply(register_hook)
    try:
        # Make a forward pass
        x = torch.randn(*input_size).to(device)
        with torch.no_grad():
            model(x)
    finally:
        # Remove hooks so a failed pass does not leave them attached
        for h in hooks:
            h.remove()
    return summary
Transfer Learning
import torchvision.models as models
# Load pretrained model. The weights enum replaces the deprecated
# pretrained=True flag; IMAGENET1K_V1 is the checkpoint that flag selected.
model = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)
# Freeze parameters
for param in model.parameters():
    param.requires_grad = False
# Replace final layer (a freshly created layer is trainable by default).
# num_classes is not defined in this snippet; set it to the number of target classes.
num_features = model.fc.in_features
model.fc = nn.Linear(num_features, num_classes)
# Only train final layer
optimizer = optim.Adam(model.fc.parameters(), lr=0.001)
# Fine-tuning: unfreeze some layers
for param in model.layer4.parameters():
    param.requires_grad = True
# Rebuild the optimizer: the one above only holds model.fc's parameters, so
# the newly unfrozen layer4 weights would otherwise never be updated.
optimizer = optim.Adam((p for p in model.parameters() if p.requires_grad), lr=0.001)
Common Patterns
Early Stopping
class EarlyStopping:
    """Signal when validation loss has stopped improving.

    Args:
        patience: Number of consecutive non-improving calls before stopping.
        min_delta: Minimum decrease in loss that counts as an improvement.
        restore_best_weights: If true, load the best recorded weights back
            into the model when stopping is triggered.
    """

    def __init__(self, patience=7, min_delta=0, restore_best_weights=True):
        self.patience = patience
        self.min_delta = min_delta
        self.restore_best_weights = restore_best_weights
        self.best_loss = None
        self.counter = 0
        self.best_weights = None

    def __call__(self, val_loss, model):
        """Record ``val_loss`` and return True when training should stop."""
        if self.best_loss is None:
            self.best_loss = val_loss
            self.save_checkpoint(model)
        elif val_loss < self.best_loss - self.min_delta:
            self.best_loss = val_loss
            self.counter = 0
            self.save_checkpoint(model)
        else:
            self.counter += 1
        if self.counter >= self.patience:
            if self.restore_best_weights:
                model.load_state_dict(self.best_weights)
            return True
        return False

    def save_checkpoint(self, model):
        """Snapshot the model's current state_dict as the best weights."""
        # state_dict() tensors share storage with the live parameters, and
        # dict.copy() is shallow, so each tensor must be cloned; otherwise the
        # "best" weights silently follow every later optimizer step.
        self.best_weights = {
            key: value.detach().clone() for key, value in model.state_dict().items()
        }
Gradient Clipping
# During training: clip after backward() has produced the gradients and
# before step() applies them
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # rescales gradients in place to a total norm of at most max_norm
optimizer.step()
Learning Rate Finding
def find_lr(model, train_loader, optimizer, criterion, device, *,
            start_lr=1e-7, end_lr=1.0, factor=1.1):
    """Sweep the learning rate geometrically and record the loss at each step.

    One optimizer step is taken per batch, and the learning rate is multiplied
    by ``factor`` after every batch. The sweep ends when the loader is
    exhausted or the next rate would exceed ``end_lr``.

    Note that the model's weights and the optimizer's learning rate are
    modified by the sweep and are not restored afterwards.

    Args:
        model: Module being trained during the sweep.
        train_loader: Iterable yielding ``(data, targets)`` batches.
        optimizer: Optimizer whose learning rate is overwritten. Every
            parameter group is set, not only the first.
        criterion: Callable ``criterion(outputs, targets)`` returning a scalar loss.
        device: Device the batches are moved to.
        start_lr: First learning rate tried.
        end_lr: Upper bound; the sweep stops once the rate exceeds it.
        factor: Multiplicative increase applied after each batch.

    Returns:
        ``(lrs, losses)``: parallel lists of the rates tried and losses observed.
    """
    lrs = []
    losses = []
    lr = start_lr
    for data, targets in train_loader:
        data, targets = data.to(device), targets.to(device)
        # Apply the rate to all groups so multi-group optimizers are swept consistently
        for group in optimizer.param_groups:
            group['lr'] = lr
        optimizer.zero_grad()
        outputs = model(data)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        lrs.append(lr)
        losses.append(loss.item())
        lr *= factor
        if lr > end_lr:
            break
    return lrs, losses
Debugging Tips
Check for NaN/Inf
def check_for_nan(tensor, name="tensor"):
    """Print a message for each of NaN and Inf that occurs anywhere in ``tensor``."""
    nan_found = torch.isnan(tensor).any()
    inf_found = torch.isinf(tensor).any()
    if nan_found:
        print(f"NaN detected in {name}")
    if inf_found:
        print(f"Inf detected in {name}")
Monitor Gradients
def monitor_gradients(model):
    """Print the gradient norm of every named parameter that currently has a gradient."""
    for name, param in model.named_parameters():
        grad = param.grad
        if grad is None:
            # No backward pass has populated this parameter yet
            continue
        grad_norm = grad.norm()
        print(f"{name}: {grad_norm:.4f}")
Memory Usage
def print_gpu_memory():
    """Print allocated and reserved CUDA memory in MB; do nothing without CUDA."""
    if not torch.cuda.is_available():
        return
    bytes_per_mb = 1024**2
    allocated_mb = torch.cuda.memory_allocated() / bytes_per_mb
    reserved_mb = torch.cuda.memory_reserved() / bytes_per_mb
    print(f"GPU memory allocated: {allocated_mb:.1f} MB")
    print(f"GPU memory cached: {reserved_mb:.1f} MB")
- Use `torch.compile()` for PyTorch 2.0+ performance gains
- Use mixed precision training with AMP for faster training
- Set `torch.backends.cudnn.benchmark = True` when input sizes are consistent
- Use `pin_memory=True` in DataLoader for faster GPU transfer
- Use an appropriate `num_workers` in DataLoader (typically 2-4x the number of GPUs)
- Use `torch.no_grad()` during inference to save memory
- Consider using `torch.jit.script()` for model optimization
- Use `torch.utils.checkpoint` for memory-efficient training of large models