qpwo · March 18, 2025 11:17
diff --git a/hello.py b/hello.py
 import torch
 from torch.utils.cpp_extension import load_inline

 # C++ translation unit: defines the CPU op and declares the CUDA op.
 # load_inline generates the Python bindings inside this unit, so every name
 # listed in `functions=` must be declared here even if it is defined in the
 # CUDA source.
 cpp_source = """
 #include <torch/extension.h>

 torch::Tensor add_one(torch::Tensor input) {
     return input + 1;
 }

 // Forward declaration of the CUDA function
 torch::Tensor add_one_cuda(torch::Tensor input);
 """

 # CUDA translation unit: a float32 element-wise kernel plus its host launcher.
 cuda_source = """
 #include <torch/extension.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDAException.h>
 #include <c10/cuda/CUDAGuard.h>

 // One thread per element; 64-bit indexing avoids truncating numel() to int.
 __global__ void add_one_kernel(const float* input, float* output, int64_t size) {
     const int64_t idx = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
     if (idx < size) {
         output[idx] = input[idx] + 1.0f;
     }
 }

 torch::Tensor add_one_cuda(torch::Tensor input) {
     // The kernel dereferences raw float device pointers, so reject anything
     // else up front instead of faulting on the GPU.
     TORCH_CHECK(input.is_cuda(), "add_one_cuda: input must be a CUDA tensor");
     TORCH_CHECK(input.scalar_type() == torch::kFloat32,
                 "add_one_cuda: input must be float32");

     // The kernel indexes memory linearly, so strided views are packed first.
     const auto src = input.contiguous();
     auto output = torch::empty_like(src);

     const int64_t size = src.numel();
     if (size == 0) {
         return output;  // a launch with zero blocks is an invalid configuration
     }

     // Launch on the input's device and on PyTorch's current stream so the
     // kernel is ordered with the surrounding tensor operations.
     const c10::cuda::CUDAGuard device_guard(src.device());
     const int threads = 256;
     const auto blocks = static_cast<unsigned int>((size + threads - 1) / threads);

     add_one_kernel<<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
         src.data_ptr<float>(),
         output.data_ptr<float>(),
         size
     );
     C10_CUDA_KERNEL_LAUNCH_CHECK();  // surface launch errors immediately

     return output;
 }
 """

 # Compile the extension.
 # The CUDA source is only built and bound when a GPU is usable: passing
 # cuda_sources makes the build depend on the CUDA toolkit, which would fail
 # on machines without it before the "CUDA not available" branch below runs.
 cuda_available = torch.cuda.is_available()
 add_module = load_inline(
     name="add_extension",
     cpp_sources=cpp_source,
     cuda_sources=cuda_source if cuda_available else None,
     functions=["add_one", "add_one_cuda"] if cuda_available else ["add_one"],
     verbose=True,
 )

 # Test the CPU extension
 tensor = torch.tensor([1, 2, 3])
 result = add_module.add_one(tensor)
 print(f"Input: {tensor}")
 print(f"Output (CPU): {result}")

 # Test the CUDA extension (add_one_cuda is only bound when CUDA is available)
 if cuda_available:
     tensor_cuda = torch.tensor([1, 2, 3], device="cuda", dtype=torch.float32)
     result_cuda = add_module.add_one_cuda(tensor_cuda)
     print(f"Input: {tensor_cuda}")
     print(f"Output (CUDA): {result_cuda}")
 else:
     print("CUDA not available, skipping CUDA test")
	import torch
	from torch.utils.cpp_extension import load_inline

	# C++ translation unit: defines the CPU op and declares the CUDA op.
	# load_inline generates the Python bindings inside this unit, so every name
	# listed in `functions=` must be declared here even if it is defined in the
	# CUDA source.
	cpp_source = """
	#include <torch/extension.h>

	torch::Tensor add_one(torch::Tensor input) {
	    return input + 1;
	}

	// Forward declaration of the CUDA function
	torch::Tensor add_one_cuda(torch::Tensor input);
	"""

	# CUDA translation unit: a float32 element-wise kernel plus its host launcher.
	cuda_source = """
	#include <torch/extension.h>
	#include <ATen/cuda/CUDAContext.h>
	#include <c10/cuda/CUDAException.h>
	#include <c10/cuda/CUDAGuard.h>

	// One thread per element; 64-bit indexing avoids truncating numel() to int.
	__global__ void add_one_kernel(const float* input, float* output, int64_t size) {
	    const int64_t idx = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
	    if (idx < size) {
	        output[idx] = input[idx] + 1.0f;
	    }
	}

	torch::Tensor add_one_cuda(torch::Tensor input) {
	    // The kernel dereferences raw float device pointers, so reject anything
	    // else up front instead of faulting on the GPU.
	    TORCH_CHECK(input.is_cuda(), "add_one_cuda: input must be a CUDA tensor");
	    TORCH_CHECK(input.scalar_type() == torch::kFloat32,
	                "add_one_cuda: input must be float32");

	    // The kernel indexes memory linearly, so strided views are packed first.
	    const auto src = input.contiguous();
	    auto output = torch::empty_like(src);

	    const int64_t size = src.numel();
	    if (size == 0) {
	        return output;  // a launch with zero blocks is an invalid configuration
	    }

	    // Launch on the input's device and on PyTorch's current stream so the
	    // kernel is ordered with the surrounding tensor operations.
	    const c10::cuda::CUDAGuard device_guard(src.device());
	    const int threads = 256;
	    const auto blocks = static_cast<unsigned int>((size + threads - 1) / threads);

	    add_one_kernel<<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
	        src.data_ptr<float>(),
	        output.data_ptr<float>(),
	        size
	    );
	    C10_CUDA_KERNEL_LAUNCH_CHECK();  // surface launch errors immediately

	    return output;
	}
	"""

	# Compile the extension.
	# The CUDA source is only built and bound when a GPU is usable: passing
	# cuda_sources makes the build depend on the CUDA toolkit, which would fail
	# on machines without it before the "CUDA not available" branch below runs.
	cuda_available = torch.cuda.is_available()
	add_module = load_inline(
	    name="add_extension",
	    cpp_sources=cpp_source,
	    cuda_sources=cuda_source if cuda_available else None,
	    functions=["add_one", "add_one_cuda"] if cuda_available else ["add_one"],
	    verbose=True,
	)

	# Test the CPU extension
	tensor = torch.tensor([1, 2, 3])
	result = add_module.add_one(tensor)
	print(f"Input: {tensor}")
	print(f"Output (CPU): {result}")

	# Test the CUDA extension (add_one_cuda is only bound when CUDA is available)
	if cuda_available:
	    tensor_cuda = torch.tensor([1, 2, 3], device="cuda", dtype=torch.float32)
	    result_cuda = add_module.add_one_cuda(tensor_cuda)
	    print(f"Input: {tensor_cuda}")
	    print(f"Output (CUDA): {result_cuda}")
	else:
	    print("CUDA not available, skipping CUDA test")