@KohakuBlueleaf
Created May 4, 2025 14:44
A minimal reproduction of the "CUDA error: an illegal memory access was encountered" error with NCCL + PyTorch DDP model training.
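To reproduce, run the script below on a machine with two or more GPUs; main() spawns one training process per visible GPU via mp.spawn. With BACKEND = "nccl" the training loop hits the illegal memory access, while switching BACKEND to "gloo" lets training run to completion.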
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torch.distributed as dist
import torch.multiprocessing as mp
import torchvision.models as models
from torch.utils.data import Dataset
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data.distributed import DistributedSampler

# NCCL results in "CUDA error: an illegal memory access was encountered";
# only the gloo backend works.
BACKEND = "nccl"
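
# Optional debugging aids (not part of the original repro; these are standard
# CUDA / NCCL / PyTorch environment variables). Uncommenting them makes the
# illegal memory access easier to localize at the cost of slower execution;
# they must be set before CUDA and the process group are initialized.
# os.environ["CUDA_LAUNCH_BLOCKING"] = "1"          # surface the error at the failing launch
# os.environ["NCCL_DEBUG"] = "INFO"                 # log NCCL communicator / collective activity
# os.environ["TORCH_DISTRIBUTED_DEBUG"] = "DETAIL"  # extra consistency checks inside DDP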


class DummyDataset(Dataset):
    """Dummy dataset for testing DDP training."""

    def __init__(self, size=1000, img_size=224, num_classes=10):
        self.size = size
        self.img_size = img_size
        self.num_classes = num_classes

    def __len__(self):
        return self.size

    def __getitem__(self, idx):
        # Create a dummy image and label
        img = torch.randn(3, self.img_size, self.img_size)
        label = torch.tensor(idx % self.num_classes, dtype=torch.long)
        return img, label


def create_model(num_classes=10):
    model = models.vit_b_16(weights=None)
    model.heads.head = nn.Linear(model.heads.head.in_features, num_classes)
    return model


def setup(rank, world_size):
    """Initialize the default process group for this rank."""
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = "12355"
    dist.init_process_group(backend=BACKEND, rank=rank, world_size=world_size)
    torch.cuda.set_device(rank)
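

def setup_device_first(rank, world_size):
    """Variant of setup() (a sketch, not from the original gist): call
    torch.cuda.set_device(rank) *before* dist.init_process_group(), the order
    commonly recommended for NCCL so each rank's communicator is bound to the
    intended GPU. The function name is hypothetical and nothing below calls it."""
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = "12355"
    torch.cuda.set_device(rank)
    dist.init_process_group(backend=BACKEND, rank=rank, world_size=world_size)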


def cleanup():
    """Clean up the distributed environment."""
    dist.destroy_process_group()


def train(rank, world_size):
    print(f"Setup DDP for rank {rank} with world size {world_size}")
    setup(rank, world_size)

    print("Creating Model")
    model = create_model()
    model = model.to(rank)
    model = DDP(
        model,
        device_ids=[rank],
        output_device=rank,
    )

    print("Creating Loss and Optimizer")
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.AdamW(model.parameters(), lr=1e-3)

    print("Creating Dataset and DataLoader")
    train_dataset = DummyDataset(size=32768)
    train_sampler = DistributedSampler(
        train_dataset, num_replicas=world_size, rank=rank, shuffle=True
    )
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=32,
        sampler=train_sampler,
    )

    print("Starting Training")
    num_epochs = 2
    model.train()
    for epoch in range(num_epochs):
        train_sampler.set_epoch(epoch)
        for i, (images, labels) in enumerate(train_loader):
            images, labels = images.to(rank), labels.to(rank)
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            if i % 10 == 0 and rank == 0:
                print(
                    f"Epoch [{epoch+1}/{num_epochs}], "
                    f"Step [{i+1}/{len(train_loader)}], "
                    f"Loss: {loss.item():.4f}"
                )
    cleanup()


def main():
    world_size = torch.cuda.device_count()
    print(f"Using {world_size} GPUs")
    mp.spawn(train, args=(world_size,), nprocs=world_size, join=True)


if __name__ == "__main__":
    main()