Skip to content

Instantly share code, notes, and snippets.

@BIGBALLON
Created April 24, 2025 11:14
Show Gist options
  • Save BIGBALLON/19cd2b7e1c7e8fdb3105aedb239b3fe1 to your computer and use it in GitHub Desktop.
Save BIGBALLON/19cd2b7e1c7e8fdb3105aedb239b3fe1 to your computer and use it in GitHub Desktop.
This script is useful for developers, researchers, or system administrators who want to ensure that a PyTorch environment is correctly installed and fully operational for deep learning tasks, both on CPU and GPU.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import sys
# Function to print a formatted section header
def print_header(msg):
    """Print ``msg`` framed by '=' rules of matching length, after a blank line."""
    rule = "=" * len(msg)
    # A leading empty field plus sep="\n" yields: blank line, rule, msg, rule.
    print("", rule, msg, rule, sep="\n")
# 1. Test whether PyTorch can be imported
def test_import():
    """Check that the ``torch`` package can be imported.

    Returns:
        tuple: ``(True, version)`` with ``torch.__version__`` on success,
        ``(False, None)`` if anything raised while importing or reporting.
    """
    print_header("1. Import Test")
    outcome = (False, None)
    try:
        import torch

        installed = torch.__version__
        print(f"[PASS] torch version: {installed}")
        outcome = (True, installed)
    except Exception as err:
        # Catch broadly on purpose: any failure here is reported as a failed
        # check instead of aborting the remaining diagnostics.
        print(f"[FAIL] Cannot import torch: {err}")
    return outcome
# 2. Test basic tensor operations on CPU
def test_tensor_ops():
    """Run a basic element-wise addition on two random 2x3 tensors.

    The operands and their sum are printed, followed by a ``[PASS]`` line so
    the success path is tagged the same way as the other checks in this file.

    Returns:
        bool: True if the tensors were created and added without raising,
        False otherwise.
    """
    print_header("2. Tensor Operation Test")
    try:
        import torch

        a = torch.rand(2, 3)
        b = torch.rand(2, 3)
        c = a + b  # element-wise addition
        print("Tensor a =\n", a)
        print("Tensor b =\n", b)
        print("a + b =\n", c)
        print(f"[PASS] element-wise addition output shape: {tuple(c.shape)}")
        return True
    except Exception as e:
        print(f"[FAIL] Tensor operation error: {e}")
        return False
# 3. Test CUDA availability and perform a GPU operation within a memory limit
def test_cuda(max_mem_mb=500):
    """Check CUDA availability and run a matrix multiplication on GPU 0.

    Three square float32 matrices (two operands and their product) are sized
    so that together they stay within ``max_mem_mb``.

    Args:
        max_mem_mb: Memory budget in MiB for the three matrices. Must be large
            enough to hold at least a 1x1 matrix each; otherwise the check
            fails instead of passing vacuously on empty tensors.

    Returns:
        bool: True if CUDA is unavailable (the GPU part is skipped) or the
        multiplication produced a finite result; False on any error, an
        unusable budget, or a non-finite result.
    """
    print_header("3. CUDA Test")
    try:
        import math

        import torch

        cuda_avail = torch.cuda.is_available()
        print(f"torch.cuda.is_available() -> {cuda_avail}")
        if not cuda_avail:
            print("[INFO] CUDA not available, skipping GPU operations")
            return True

        ngpu = torch.cuda.device_count()
        print(f"Number of CUDA devices: {ngpu}")

        # Size the matrices so that x, y and z together fit in the budget.
        dtype = torch.float32
        dtype_size = torch.tensor([], dtype=dtype).element_size()
        max_bytes = max_mem_mb * 1024**2
        if max_bytes < 3 * dtype_size:
            # Covers zero, negative and too-small budgets with a clear message
            # (reported through the handler below).
            raise ValueError(
                f"max_mem_mb={max_mem_mb} is too small to allocate a non-empty test matrix"
            )
        nelements = max_bytes // (3 * dtype_size)
        D = int(nelements ** 0.5)
        actual_bytes = 3 * (D * D) * dtype_size
        print(f"Allocating {D}x{D} tensors (~{actual_bytes / 1024**2:.2f} MB, limit {max_mem_mb} MB)")

        # Allocate and multiply on GPU 0.
        x = torch.randn(D, D, device="cuda:0", dtype=dtype)
        y = torch.randn(D, D, device="cuda:0", dtype=dtype)
        z = x @ y
        norm = z.norm().item()
        if not math.isfinite(norm):
            # A NaN/inf norm means the GPU computation produced garbage; do not
            # report that as a pass.
            print(f"[FAIL] GPU matrix multiplication produced a non-finite norm: {norm}")
            return False
        print(f"[PASS] GPU matrix multiplication norm: {norm:.4f}")
        return True
    except Exception as e:
        print(f"[FAIL] CUDA test error: {e}")
        return False
# 4. Test cuDNN backend availability
def test_cudnn():
    """Check cuDNN availability and run a small convolution on the GPU.

    ``torch.backends.cudnn.is_available()`` reflects cuDNN support in the
    PyTorch build and does not by itself guarantee a usable CUDA device, so
    CUDA availability is checked as well. Without a device the convolution is
    skipped with an ``[INFO]`` message, matching how the CUDA check treats a
    missing GPU.

    Returns:
        bool: True if the convolution ran or was skipped because cuDNN/CUDA is
        unavailable; False if an error was raised.
    """
    print_header("4. cuDNN Test")
    try:
        import torch

        cudnn_avail = torch.backends.cudnn.is_available()
        print(f"torch.backends.cudnn.is_available() -> {cudnn_avail}")
        if not cudnn_avail:
            print("[INFO] cuDNN not available, skipping convolution test")
            return True
        if not torch.cuda.is_available():
            # The convolution below needs a CUDA device; skip rather than fail.
            print("[INFO] CUDA not available, skipping convolution test")
            return True

        # Simple convolution on the GPU to exercise the backend.
        import torch.nn as nn

        conv = nn.Conv2d(3, 16, kernel_size=3, padding=1).cuda()
        inp = torch.randn(1, 3, 64, 64, device="cuda")
        out = conv(inp)
        print(f"[PASS] cuDNN convolution output shape: {tuple(out.shape)}")
        return True
    except Exception as e:
        print(f"[FAIL] cuDNN test error: {e}")
        return False
# 5. Test autograd (gradient computation)
def test_autograd():
    """Check that autograd computes the expected gradient for a tiny graph.

    Builds ``z = mean(2 * x)`` for a 5x5 input and backpropagates. Every
    element of ``dz/dx`` must equal ``2 / x.numel()``, so the gradient is
    compared against that value rather than only being printed.

    Returns:
        bool: True if the gradient exists and matches the analytic value,
        False on a mismatch, a missing gradient, or any raised error.
    """
    print_header("5. Autograd Test")
    try:
        import torch

        x = torch.randn(5, 5, requires_grad=True)
        y = x * 2
        z = y.mean()
        z.backward()

        if x.grad is None:
            print("[FAIL] Autograd test error: gradient was not populated")
            return False

        # d(mean(2x))/dx is the constant 2/N for each of the N elements.
        expected = torch.full_like(x.grad, 2.0 / x.numel())
        if not torch.allclose(x.grad, expected):
            print("[FAIL] Autograd test error: gradient does not match the expected value")
            return False

        grad_norm = x.grad.norm().item()
        print(f"[PASS] gradient norm: {grad_norm:.4f}")
        return True
    except Exception as e:
        print(f"[FAIL] Autograd test error: {e}")
        return False
# Main function to execute all tests and summarize the results
def main():
    """Run every diagnostic, print a per-test summary, and exit.

    Exits the process with status 0 when all checks returned a truthy result
    and 1 otherwise.
    """
    import_ok, _version = test_import()
    # Dict displays evaluate their values left to right, so the checks still
    # run in the listed order.
    outcomes = {
        'import': import_ok,
        'tensor_ops': test_tensor_ops(),
        'cuda': test_cuda(max_mem_mb=500),
        'cudnn': test_cudnn(),
        'autograd': test_autograd(),
    }

    print_header("Summary")
    for label, succeeded in outcomes.items():
        print(f"{label:12s}: {'PASS' if succeeded else 'FAIL'}")

    if all(outcomes.values()):
        verdict, exit_code = "All tests passed ✅", 0
    else:
        verdict, exit_code = "Some tests failed ❌", 1
    print(f"\nOverall status: {verdict}")
    sys.exit(exit_code)
# Entry point of the script
if __name__ == "__main__":
    # Run the diagnostics only when executed as a script, not when imported.
    main()
"""
pytorch_diagnostics.py is a lightweight diagnostic script that verifies the integrity and functionality of a PyTorch installation. It performs the following checks in order:
- PyTorch import verification
- Basic tensor operations on CPU
- CUDA availability and GPU-based matrix multiplication within a memory limit
- cuDNN backend support and convolution test
- Autograd functionality and gradient backpropagation test
"""
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment