Created
April 24, 2025 11:14
-
-
Save BIGBALLON/19cd2b7e1c7e8fdb3105aedb239b3fe1 to your computer and use it in GitHub Desktop.
This script is useful for developers, researchers, or system administrators who want to ensure that a PyTorch environment is correctly installed and fully operational for deep learning tasks, both on CPU and GPU.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# -*- coding: utf-8 -*- | |
import sys | |
# Print a section title framed by '=' rules
def print_header(msg):
    """Print ``msg`` between two rules of '=' characters.

    Each rule is exactly as wide as ``msg``. A blank line is emitted before
    the top rule so consecutive sections are separated in the output.
    """
    rule = "=" * len(msg)
    print("\n" + rule, msg, rule, sep="\n")
# 1. Check that the torch package can be imported
def test_import():
    """Try to import PyTorch and report the installed version.

    Returns:
        A ``(ok, version)`` tuple: ``(True, torch.__version__)`` when the
        import succeeds, ``(False, None)`` when anything in the attempt
        raises.
    """
    print_header("1. Import Test")
    try:
        import torch

        installed = torch.__version__
        print(f"[PASS] torch version: {installed}")
        outcome = (True, installed)
    except Exception as err:
        # Broad on purpose: a failed import is reported, not propagated.
        print(f"[FAIL] Cannot import torch: {err}")
        outcome = (False, None)
    return outcome
# 2. Test basic tensor operations on CPU
def test_tensor_ops():
    """Add two random 2x3 tensors and verify the result.

    The operands and their sum are printed, then the sum is checked for the
    expected shape and values so that a silently wrong result is reported
    as a failure instead of passing unnoticed.

    Returns:
        True when the addition runs and the result checks out; False when
        the check fails or any exception is raised.
    """
    print_header("2. Tensor Operation Test")
    try:
        import torch

        a = torch.rand(2, 3)  # Create a random 2x3 tensor
        b = torch.rand(2, 3)  # Create another random 2x3 tensor
        c = a + b  # Element-wise addition
        print("Tensor a =\n", a)
        print("Tensor b =\n", b)
        print("a + b =\n", c)

        # Subtracting one operand must give back the other. The absolute
        # tolerance absorbs the rounding of the sum, which can exceed
        # allclose's default tolerance when an element of b is very small.
        if c.shape != a.shape or not torch.allclose(c - a, b, atol=1e-6):
            print("[FAIL] a + b has an unexpected shape or values")
            return False

        print("[PASS] tensor addition verified")
        return True
    except Exception as e:
        print(f"[FAIL] Tensor operation error: {e}")
        return False
# 3. Probe CUDA and run a matrix multiplication on GPU 0 within a memory budget
def test_cuda(max_mem_mb=500):
    """Report CUDA availability and, if present, multiply two matrices on GPU 0.

    Args:
        max_mem_mb: Budget, in units of 1024**2 bytes, for the combined size
            of three square float32 matrices (the two operands and their
            product). The matrix side length is derived from this budget.

    Returns:
        True when CUDA is unavailable (the GPU work is skipped) or when the
        GPU multiplication completes; False when any exception is raised.
    """
    print_header("3. CUDA Test")
    try:
        import torch

        has_cuda = torch.cuda.is_available()
        print(f"torch.cuda.is_available() -> {has_cuda}")
        if not has_cuda:
            print("[INFO] CUDA not available, skipping GPU operations")
            return True

        device_count = torch.cuda.device_count()
        print(f"Number of CUDA devices: {device_count}")

        # Split the byte budget across three equally sized square matrices.
        elem_dtype = torch.float32
        bytes_per_elem = torch.tensor([], dtype=elem_dtype).element_size()
        budget_bytes = max_mem_mb * 1024**2
        elems_per_matrix = budget_bytes // (3 * bytes_per_elem)
        side = int(elems_per_matrix ** 0.5)
        used_bytes = 3 * (side * side) * bytes_per_elem
        print(f"Allocating {side}x{side} tensors (~{used_bytes / 1024**2:.2f} MB, limit {max_mem_mb} MB)")

        # Operands live on GPU 0; .item() copies the scalar norm back to the host.
        gpu0 = "cuda:0"
        lhs = torch.randn(side, side, device=gpu0, dtype=elem_dtype)
        rhs = torch.randn(side, side, device=gpu0, dtype=elem_dtype)
        product = lhs @ rhs
        product_norm = product.norm().item()
        print(f"[PASS] GPU matrix multiplication norm: {product_norm:.4f}")
        return True
    except Exception as err:
        print(f"[FAIL] CUDA test error: {err}")
        return False
# 4. Test cuDNN backend availability
def test_cudnn():
    """Report cuDNN availability and, when a GPU is usable, run a convolution.

    The convolution is executed on a CUDA device, so it is attempted only
    when both cuDNN and CUDA are reported as available. When cuDNN is
    reported but CUDA is not, the convolution is skipped with an [INFO]
    message instead of failing on the ``.cuda()`` call.

    Returns:
        True when the convolution runs or is skipped; False when any
        exception is raised.
    """
    print_header("4. cuDNN Test")
    try:
        import torch

        cudnn_avail = torch.backends.cudnn.is_available()
        print(f"torch.backends.cudnn.is_available() -> {cudnn_avail}")
        # The cuDNN flag alone does not guarantee a usable CUDA device, and
        # the tensors below are placed on one, so check CUDA as well.
        cuda_avail = torch.cuda.is_available()
        if cudnn_avail and cuda_avail:
            # Simple convolution to test cuDNN
            import torch.nn as nn

            conv = nn.Conv2d(3, 16, kernel_size=3, padding=1).cuda()
            inp = torch.randn(1, 3, 64, 64, device="cuda")
            out = conv(inp)
            print(f"[PASS] cuDNN convolution output shape: {tuple(out.shape)}")
        elif cudnn_avail:
            print("[INFO] cuDNN reported available but CUDA is not, skipping convolution test")
        else:
            print("[INFO] cuDNN not available, skipping convolution test")
        return True
    except Exception as e:
        print(f"[FAIL] cuDNN test error: {e}")
        return False
# 5. Check that autograd can backpropagate through a small graph
def test_autograd():
    """Backpropagate the mean of ``2 * x`` and report the gradient norm.

    ``x`` is a random 5x5 tensor created with ``requires_grad=True``. After
    ``backward()`` the norm of ``x.grad`` is printed.

    Returns:
        True when the backward pass and the norm computation complete;
        False when any exception is raised.
    """
    print_header("5. Autograd Test")
    try:
        import torch

        leaf = torch.randn(5, 5, requires_grad=True)
        loss = (leaf * 2).mean()
        loss.backward()
        # Reading leaf.grad raises here if backward() left it unset.
        print(f"[PASS] gradient norm: {leaf.grad.norm().item():.4f}")
        succeeded = True
    except Exception as err:
        print(f"[FAIL] Autograd test error: {err}")
        succeeded = False
    return succeeded
# Run every check, print a summary table and exit with a status code
def main():
    """Run all diagnostic checks in order and exit the process.

    Each check's boolean outcome is collected under a short name, printed
    as PASS/FAIL in a summary, and folded into the exit status: 0 when
    every check passed, 1 otherwise.
    """
    import_ok, _version = test_import()
    # Values in a dict literal are evaluated in order, so the checks run in
    # the sequence listed here.
    results = {
        'import': import_ok,
        'tensor_ops': test_tensor_ops(),
        'cuda': test_cuda(max_mem_mb=500),
        'cudnn': test_cudnn(),
        'autograd': test_autograd(),
    }

    print_header("Summary")
    for name, passed in results.items():
        print(f"{name:12s}: {'PASS' if passed else 'FAIL'}")

    all_ok = all(results.values())
    if all_ok:
        overall = "All tests passed ✅"
    else:
        overall = "Some tests failed ❌"
    print(f"\nOverall status: {overall}")
    sys.exit(0 if all_ok else 1)
# Run the diagnostics only when this file is executed as a script
if __name__ == "__main__":
    main()
""" | |
pytorch_diagnostics.py is a lightweight diagnostic script designed to verify the integrity and functionality of a PyTorch installation. It performs a series of systematic checks, including:
- PyTorch import verification
- Basic tensor operations on CPU
- CUDA availability and GPU-based matrix multiplication within a memory limit
- cuDNN backend support and a convolution test
- Autograd functionality and a gradient backpropagation test
""" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment