BIGBALLON · April 24, 2025 11:14
diff --git a/pytorch_diagnostics.py b/pytorch_diagnostics.py
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-

 import sys

 # Function to print a formatted section header
 def print_header(msg):
    """Print ``msg`` as a section banner on standard output.

    The banner is a blank line, a rule of ``=`` characters exactly as long
    as ``msg``, the message itself, and the same rule again.
    """
    rule = "=" * len(msg)
    print("\n" + rule, msg, rule, sep="\n")

 # 1. Test whether PyTorch can be imported
 def test_import():
    """Try to import PyTorch and report the installed version.

    Returns:
        tuple: ``(True, version)`` when the import and the version lookup
        succeed, ``(False, None)`` when either of them raises.
    """
    print_header("1. Import Test")
    try:
        import torch

        installed = torch.__version__  # version string reported by the package
        print(f"[PASS] torch version: {installed}")
    except Exception as err:
        # Any failure is reported instead of propagated so the remaining
        # diagnostics can still run.
        print(f"[FAIL] Cannot import torch: {err}")
        return False, None
    return True, installed

 # 2. Test basic tensor operations on CPU
 def test_tensor_ops():
    """Create two random 2x3 tensors, add them, and print all three.

    Returns:
        bool: ``True`` when every step completes, ``False`` if any step
        raises (the exception is printed, not propagated).
    """
    print_header("2. Tensor Operation Test")
    try:
        import torch

        lhs = torch.rand(2, 3)
        rhs = torch.rand(2, 3)
        total = lhs + rhs  # element-wise sum
        labelled = (
            ("Tensor a =\n", lhs),
            ("Tensor b =\n", rhs),
            ("a + b =\n", total),
        )
        for label, tensor in labelled:
            print(label, tensor)
    except Exception as err:
        print(f"[FAIL] Tensor operation error: {err}")
        return False
    return True

 # 3. Test CUDA availability and perform a GPU operation within a memory limit
 def test_cuda(max_mem_mb=500):
    """Report CUDA availability and, if present, run a matmul on ``cuda:0``.

    The matrix side length ``D`` is chosen so that three ``D x D`` float32
    tensors (the two operands and their product) fit within ``max_mem_mb``
    mebibytes.

    Args:
        max_mem_mb: Memory budget, in MiB, used to size the three tensors.

    Returns:
        bool: ``True`` when the multiplication succeeds or CUDA is not
        available (the GPU part is skipped), ``False`` if any step raises.
    """
    print_header("3. CUDA Test")
    try:
        import torch

        has_cuda = torch.cuda.is_available()
        print(f"torch.cuda.is_available() -> {has_cuda}")
        if not has_cuda:
            print("[INFO] CUDA not available, skipping GPU operations")
            return True

        device_total = torch.cuda.device_count()
        print(f"Number of CUDA devices: {device_total}")

        # Size the square matrices so that all three fit in the budget.
        dtype = torch.float32
        bytes_per_elem = torch.tensor([], dtype=dtype).element_size()
        budget_bytes = max_mem_mb * 1024**2
        elems_per_tensor = budget_bytes // (3 * bytes_per_elem)
        side = int(elems_per_tensor ** 0.5)
        used_bytes = 3 * (side * side) * bytes_per_elem
        print(f"Allocating {side}x{side} tensors (~{used_bytes / 1024**2:.2f} MB, limit {max_mem_mb} MB)")

        # Operands and product all live on the first GPU.
        lhs = torch.randn(side, side, device="cuda:0", dtype=dtype)
        rhs = torch.randn(side, side, device="cuda:0", dtype=dtype)
        product = lhs @ rhs
        norm = product.norm().item()
        print(f"[PASS] GPU matrix multiplication norm: {norm:.4f}")
        return True
    except Exception as err:
        print(f"[FAIL] CUDA test error: {err}")
        return False

 # 4. Test cuDNN backend availability
 def test_cudnn():
    """Report cuDNN support and, when a GPU is usable, run one convolution.

    The convolution is placed on the GPU, so it is attempted only when both
    ``torch.backends.cudnn.is_available()`` and ``torch.cuda.is_available()``
    are true. The cuDNN flag alone does not guarantee a usable CUDA device;
    without the second check, a host that has a cuDNN-enabled build but no
    GPU would be reported as a failure instead of a skip.

    Returns:
        bool: ``True`` if the convolution ran or the test was skipped,
        ``False`` if any step raised an exception.
    """
    print_header("4. cuDNN Test")
    try:
        import torch
        cudnn_avail = torch.backends.cudnn.is_available()
        print(f"torch.backends.cudnn.is_available() -> {cudnn_avail}")
        if not cudnn_avail:
            print("[INFO] cuDNN not available, skipping convolution test")
        elif not torch.cuda.is_available():
            # Mirror test_cuda: a missing GPU is a skip, not a failure.
            print("[INFO] CUDA not available, skipping convolution test")
        else:
            # Simple convolution to test cuDNN
            import torch.nn as nn
            conv = nn.Conv2d(3, 16, kernel_size=3, padding=1).cuda()
            inp = torch.randn(1, 3, 64, 64, device="cuda")
            out = conv(inp)
            print(f"[PASS] cuDNN convolution output shape: {tuple(out.shape)}")
        return True
    except Exception as e:
        print(f"[FAIL] cuDNN test error: {e}")
        return False

 # 5. Test autograd (gradient computation)
 def test_autograd():
    """Backpropagate through ``mean(2 * x)`` and print the gradient norm.

    ``x`` is a random 5x5 tensor created with ``requires_grad=True``.

    Returns:
        bool: ``True`` when the backward pass completes and ``x.grad`` can
        be read, ``False`` if any step raises.
    """
    print_header("5. Autograd Test")
    try:
        import torch

        leaf = torch.randn(5, 5, requires_grad=True)
        doubled = leaf * 2
        loss = doubled.mean()
        loss.backward()
        # Reading .grad fails (and is caught below) if no gradient was stored.
        grad_norm = leaf.grad.norm().item()
        print(f"[PASS] gradient norm: {grad_norm:.4f}")
    except Exception as err:
        print(f"[FAIL] Autograd test error: {err}")
        return False
    return True

 # Main function to execute all tests and summarize the results
 def main():
    """Run every diagnostic, print a PASS/FAIL summary, and exit.

    Exits the process through ``sys.exit`` with status 0 when all tests
    passed and 1 otherwise.
    """
    import_ok, _version = test_import()

    # Dict literals evaluate in order, so the tests run in the listed order.
    results = {
        'import': import_ok,
        'tensor_ops': test_tensor_ops(),
        'cuda': test_cuda(max_mem_mb=500),
        'cudnn': test_cudnn(),
        'autograd': test_autograd(),
    }

    print_header("Summary")
    for name, passed in results.items():
        print(f"{name:12s}: {'PASS' if passed else 'FAIL'}")

    all_ok = all(results.values())
    if all_ok:
        overall = "All tests passed ✅"
    else:
        overall = "Some tests failed ❌"
    print(f"\nOverall status: {overall}")
    sys.exit(0 if all_ok else 1)

 # Entry point of the script: run the diagnostics only when this file is
 # executed directly, not when it is imported as a module.
 if __name__ == "__main__":
    main()

 """
 pytorch_diagnostics.py is a lightweight diagnostic script designed to verify the integrity and functionality of a PyTorch installation. It performs a series of systematic checks, including:

 PyTorch import verification
 Basic tensor operations on CPU
 CUDA availability and GPU-based matrix multiplication within a memory limit
 cuDNN backend support and convolution test
 Autograd functionality and gradient backpropagation test
 """
	#!/usr/bin/env python3
	# -- coding: utf-8 --

	import sys

	# Function to print a formatted section header
	def print_header(msg):
	    """Print ``msg`` as a section banner on standard output.

	    The banner is a blank line, a rule of ``=`` characters exactly as long
	    as ``msg``, the message itself, and the same rule again.
	    """
	    print("\n" + "=" * len(msg))
	    print(msg)
	    print("=" * len(msg))

	# 1. Test whether PyTorch can be imported
	def test_import():
	    """Try to import PyTorch and report the installed version.

	    Returns:
	        tuple: ``(True, version)`` when the import and the version lookup
	        succeed, ``(False, None)`` when either of them raises.
	    """
	    print_header("1. Import Test")
	    try:
	        import torch
	        version = torch.__version__  # Get the installed PyTorch version
	        print(f"[PASS] torch version: {version}")
	        return True, version
	    except Exception as e:
	        # Report instead of propagating so later diagnostics can still run.
	        print(f"[FAIL] Cannot import torch: {e}")
	        return False, None

	# 2. Test basic tensor operations on CPU
	def test_tensor_ops():
	    """Create two random 2x3 tensors, add them, and print all three.

	    Returns:
	        bool: ``True`` when every step completes, ``False`` if any step
	        raises (the exception is printed, not propagated).
	    """
	    print_header("2. Tensor Operation Test")
	    try:
	        import torch
	        a = torch.rand(2, 3)  # Create a random 2x3 tensor
	        b = torch.rand(2, 3)  # Create another random 2x3 tensor
	        c = a + b             # Element-wise addition
	        print("Tensor a =\n", a)
	        print("Tensor b =\n", b)
	        print("a + b =\n", c)
	        return True
	    except Exception as e:
	        print(f"[FAIL] Tensor operation error: {e}")
	        return False

	# 3. Test CUDA availability and perform a GPU operation within a memory limit
	def test_cuda(max_mem_mb=500):
	    """Report CUDA availability and, if present, run a matmul on ``cuda:0``.

	    The matrix side length ``D`` is chosen so that three ``D x D`` float32
	    tensors (the two operands and their product) fit within ``max_mem_mb``
	    mebibytes.

	    Args:
	        max_mem_mb: Memory budget, in MiB, used to size the three tensors.

	    Returns:
	        bool: ``True`` when the multiplication succeeds or CUDA is not
	        available (the GPU part is skipped), ``False`` if any step raises.
	    """
	    print_header("3. CUDA Test")
	    try:
	        import torch
	        cuda_avail = torch.cuda.is_available()  # Check if CUDA is available
	        print(f"torch.cuda.is_available() -> {cuda_avail}")
	        if cuda_avail:
	            # Check number of GPUs
	            ngpu = torch.cuda.device_count()
	            print(f"Number of CUDA devices: {ngpu}")
	            # Estimate tensor size to fit within the specified memory limit
	            dtype = torch.float32
	            dtype_size = torch.tensor([], dtype=dtype).element_size()
	            max_bytes = max_mem_mb * 1024**2
	            nelements = max_bytes // (3 * dtype_size)
	            D = int(nelements ** 0.5)
	            actual_bytes = 3 * (D * D) * dtype_size
	            print(f"Allocating {D}x{D} tensors (~{actual_bytes / 1024**2:.2f} MB, limit {max_mem_mb} MB)")
	            # Allocate and perform matrix multiplication on GPU 0
	            x = torch.randn(D, D, device="cuda:0", dtype=dtype)
	            y = torch.randn(D, D, device="cuda:0", dtype=dtype)
	            z = x @ y
	            norm = z.norm().item()
	            print(f"[PASS] GPU matrix multiplication norm: {norm:.4f}")
	        else:
	            print("[INFO] CUDA not available, skipping GPU operations")
	        return True
	    except Exception as e:
	        print(f"[FAIL] CUDA test error: {e}")
	        return False

	# 4. Test cuDNN backend availability
	def test_cudnn():
	    """Report cuDNN support and, when a GPU is usable, run one convolution.

	    The convolution is placed on the GPU, so it is attempted only when both
	    ``torch.backends.cudnn.is_available()`` and ``torch.cuda.is_available()``
	    are true. The cuDNN flag alone does not guarantee a usable CUDA device;
	    without the second check, a host that has a cuDNN-enabled build but no
	    GPU would be reported as a failure instead of a skip.

	    Returns:
	        bool: ``True`` if the convolution ran or the test was skipped,
	        ``False`` if any step raised an exception.
	    """
	    print_header("4. cuDNN Test")
	    try:
	        import torch
	        cudnn_avail = torch.backends.cudnn.is_available()
	        print(f"torch.backends.cudnn.is_available() -> {cudnn_avail}")
	        if not cudnn_avail:
	            print("[INFO] cuDNN not available, skipping convolution test")
	        elif not torch.cuda.is_available():
	            # A missing GPU is a skip, not a failure.
	            print("[INFO] CUDA not available, skipping convolution test")
	        else:
	            # Simple convolution to test cuDNN
	            import torch.nn as nn
	            conv = nn.Conv2d(3, 16, kernel_size=3, padding=1).cuda()
	            inp = torch.randn(1, 3, 64, 64, device="cuda")
	            out = conv(inp)
	            print(f"[PASS] cuDNN convolution output shape: {tuple(out.shape)}")
	        return True
	    except Exception as e:
	        print(f"[FAIL] cuDNN test error: {e}")
	        return False

	# 5. Test autograd (gradient computation)
	def test_autograd():
	    """Backpropagate through ``mean(2 * x)`` and print the gradient norm.

	    ``x`` is a random 5x5 tensor created with ``requires_grad=True``.

	    Returns:
	        bool: ``True`` when the backward pass completes and ``x.grad`` can
	        be read, ``False`` if any step raises.
	    """
	    print_header("5. Autograd Test")
	    try:
	        import torch
	        x = torch.randn(5, 5, requires_grad=True)
	        y = x * 2
	        z = y.mean()
	        z.backward()
	        # Check that gradient is populated
	        grad_norm = x.grad.norm().item()
	        print(f"[PASS] gradient norm: {grad_norm:.4f}")
	        return True
	    except Exception as e:
	        print(f"[FAIL] Autograd test error: {e}")
	        return False

	# Main function to execute all tests and summarize the results
	def main():
	    """Run every diagnostic, print a PASS/FAIL summary, and exit.

	    Exits the process through ``sys.exit`` with status 0 when all tests
	    passed and 1 otherwise.
	    """
	    results = {}

	    # Only the success flag is used; the version was already printed.
	    ok, _ = test_import()
	    results['import'] = ok

	    results['tensor_ops'] = test_tensor_ops()
	    results['cuda'] = test_cuda(max_mem_mb=500)
	    results['cudnn'] = test_cudnn()
	    results['autograd'] = test_autograd()

	    # Print summary of all tests
	    print_header("Summary")
	    for name, passed in results.items():
	        status = "PASS" if passed else "FAIL"
	        print(f"{name:12s}: {status}")

	    # Determine overall success status
	    all_ok = all(results.values())
	    overall = "All tests passed ✅" if all_ok else "Some tests failed ❌"
	    print(f"\nOverall status: {overall}")
	    sys.exit(0 if all_ok else 1)

	# Entry point of the script
	if __name__ == "__main__":
	    # Run the diagnostics only when executed directly, not on import.
	    main()

	"""
	pytorch_diagnostics.py is a lightweight diagnostic script designed to verify the integrity and functionality of a PyTorch installation. It performs a series of systematic checks, including:

	PyTorch import verification
	Basic tensor operations on CPU
	CUDA availability and GPU-based matrix multiplication within a memory limit
	cuDNN backend support and convolution test
	Autograd functionality and gradient backpropagation test
	"""