A minimal reproduction of the "CUDA error: an illegal memory access was encountered" error in NCCL + PyTorch DDP model training.
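Before digging in, it can help to surface more diagnostics than the bare CUDA error. Below is a minimal launcher sketch, assuming the script further down is saved as repro.py (a hypothetical name); the environment variables are set before spawning so the mp.spawn workers inherit them. NCCL_DEBUG, CUDA_LAUNCH_BLOCKING, and TORCH_DISTRIBUTED_DEBUG are standard NCCL/CUDA/PyTorch switches, not part of this gist.

import os

# NCCL_DEBUG=INFO prints communicator setup and collective activity;
# CUDA_LAUNCH_BLOCKING=1 makes kernel launches synchronous so the illegal
# access is reported at the offending call instead of at a later sync point;
# TORCH_DISTRIBUTED_DEBUG=DETAIL enables extra DDP-level consistency checks.
os.environ["NCCL_DEBUG"] = "INFO"
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
os.environ["TORCH_DISTRIBUTED_DEBUG"] = "DETAIL"

import repro  # hypothetical module name for the script below

repro.main()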
import os

import torch
import torch.nn as nn
import torch.optim as optim
import torch.distributed as dist
import torch.multiprocessing as mp
import torchvision.models as models
from torch.utils.data import Dataset
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data.distributed import DistributedSampler

# NCCL results in "CUDA error: an illegal memory access was encountered";
# only gloo works.
BACKEND = "nccl"

class DummyDataset(Dataset):
    """Dummy dataset for testing DDP training."""

    def __init__(self, size=1000, img_size=224, num_classes=10):
        self.size = size
        self.img_size = img_size
        self.num_classes = num_classes

    def __len__(self):
        return self.size

    def __getitem__(self, idx):
        # Create dummy image and label
        img = torch.randn(3, self.img_size, self.img_size)
        label = torch.tensor(idx % self.num_classes, dtype=torch.long)
        return img, label


def create_model(num_classes=10):
    model = models.vit_b_16(weights=None)
    model.heads.head = nn.Linear(model.heads.head.in_features, num_classes)
    return model


def setup(rank, world_size):
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = "12355"
    dist.init_process_group(backend=BACKEND, rank=rank, world_size=world_size)
    torch.cuda.set_device(rank)


def cleanup():
    """Clean up the distributed environment."""
    dist.destroy_process_group()


def train(rank, world_size):
    print(f"Setup DDP for rank {rank} with world size {world_size}")
    setup(rank, world_size)

    print("Creating Model")
    model = create_model()
    model = model.to(rank)
    model = DDP(
        model,
        device_ids=[rank],
        output_device=rank,
    )

    print("Creating Loss and Optimizer")
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.AdamW(model.parameters(), lr=1e-3)

    print("Creating Dataset and DataLoader")
    train_dataset = DummyDataset(size=32768)
    train_sampler = DistributedSampler(
        train_dataset, num_replicas=world_size, rank=rank, shuffle=True
    )
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=32,
        sampler=train_sampler,
    )

    print("Starting Training")
    num_epochs = 2
    model.train()
    for epoch in range(num_epochs):
        train_sampler.set_epoch(epoch)
        for i, (images, labels) in enumerate(train_loader):
            images, labels = images.to(rank), labels.to(rank)
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            if i % 10 == 0 and rank == 0:
                print(
                    f"Epoch [{epoch+1}/{num_epochs}], "
                    f"Step [{i+1}/{len(train_loader)}], "
                    f"Loss: {loss.item():.4f}"
                )

    cleanup()

def main():
    world_size = torch.cuda.device_count()
    print(f"Using {world_size} GPUs")
    mp.spawn(train, args=(world_size,), nprocs=world_size, join=True)


if __name__ == "__main__":
    main()
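As the comment at the top of the script notes, the crash is backend-specific: gloo completes training while nccl hits the illegal memory access. One commonly suggested variant of setup(), sketched below under the assumption that the fault comes from NCCL binding its communicator before the process is pinned to its GPU, is to call torch.cuda.set_device(rank) before init_process_group and, on PyTorch 2.3 or newer, to bind the process group to the device eagerly via the device_id argument. This is a sketch of that ordering, not a confirmed fix for this reproduction.

def setup_pinned(rank, world_size):
    """Alternative setup(): pin the GPU before creating the process group."""
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = "12355"
    torch.cuda.set_device(rank)  # pin this process to its GPU first
    dist.init_process_group(
        backend=BACKEND,
        rank=rank,
        world_size=world_size,
        device_id=torch.device(f"cuda:{rank}"),  # assumption: PyTorch >= 2.3
    )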