Skip to content

Instantly share code, notes, and snippets.

@googlefan256
Created June 26, 2025 15:12
Show Gist options
  • Save googlefan256/c4ea9f71ccb4b0a45f52e78b9a684115 to your computer and use it in GitHub Desktop.
Save googlefan256/c4ea9f71ccb4b0a45f52e78b9a684115 to your computer and use it in GitHub Desktop.
tinygrad in rust working example
use std::ffi::c_int;
/// Runs an end-to-end smoke test of the `tinygrad_in_rust` CUDA backend.
///
/// Steps, in order:
/// 1. initialise the backend with hard-coded paths to `libcuda`, `libnvrtc`
///    and `libnvJitLink`;
/// 2. compile a small element-wise `add` kernel and then compile that output
///    (treated as UTF-8 PTX text) into a loadable binary;
/// 3. create a context, stream, allocator and program;
/// 4. upload two input vectors, launch the kernel, read the result back;
/// 5. exercise `transfer` between two device buffers and read that back too;
/// 6. free the buffers and drop the stream and context.
///
/// # Errors
///
/// Returns the first error reported by any backend call, or a UTF-8 error if
/// the first compilation stage does not produce valid UTF-8.
fn inner() -> anyhow::Result<()> {
    use tinygrad_in_rust::backend::cuda;

    /// CUDA C source of the kernel computing `c[i] = a[i] + b[i]` for `i < size`.
    const ADD_KERNEL_SRC: &str = r#"
extern "C" __global__ void add(int *a, int *b, int *c, int size) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < size)
{
c[idx] = a[idx] + b[idx];
}
}
"#;
    /// Number of `c_int` elements in every buffer.
    const N_ELEMS: usize = 1024;
    /// Same count as the kernel's `int size` argument (lossless: 1024 fits in `c_int`).
    const N_ELEMS_INT: c_int = N_ELEMS as c_int;
    /// Threads per block used for the launch.
    const THREADS_PER_BLOCK: u32 = 256;

    cuda::cuda_init("/usr/lib/x86_64-linux-gnu/libcuda.so")?;
    cuda::nvrtc_init("/usr/local/cuda/lib64/libnvrtc.so")?;
    cuda::nvjitlink_init("/usr/local/cuda/lib64/libnvJitLink.so")?;

    // Stage 1: CUDA C -> intermediate output (consumed as PTX text below).
    // NOTE(review): the meaning of the trailing `true` flag is not visible here.
    let ptx = cuda::CuCompiler::new_default()?.compile(ADD_KERNEL_SRC, "test.cu", true)?;
    println!("Compiled successfully, size: {}", ptx.len());

    let arch = cuda::CudaArch::new_least()?;
    println!("Using CUDA architecture: {:?}", arch);

    // Stage 2: PTX text -> binary for the selected architecture.
    let binary = cuda::NvPTXCompiler::new(&arch)?.compile(&String::from_utf8(ptx)?, "test.cu")?;
    println!("PTX compiled to binary successfully, size: {}", binary.len());

    let context = cuda::CudaContext::new()?;
    context.set_current()?;
    let stream = cuda::CudaStream::new(&context)?;
    let alloc = cuda::CudaAllocator::new(&context, &stream);
    // NOTE(review): the meaning of `Some(200)` is not visible here.
    let m = cuda::CudaProgram::new(&binary, "add", Some(200), &stream)?;

    let buf_bytes = N_ELEMS * size_of::<c_int>();

    // Inputs: a[i] = i / 2 and b[i] = 512 - i.
    let host_a: Vec<c_int> = (0..N_ELEMS_INT).map(|i| i / 2).collect();
    let host_b: Vec<c_int> = (0..N_ELEMS_INT).map(|i| 512 - i).collect();

    let mut a = alloc.alloc(buf_bytes, false)?;
    alloc.copy_in(&a, &host_a)?;
    let mut b = alloc.alloc(buf_bytes, false)?;
    alloc.copy_in(&b, &host_b)?;
    let mut o = alloc.alloc(buf_bytes, false)?;

    // One thread per *element*, rounded up to whole blocks. The previous
    // formula divided the element count by `size_of::<c_int>()` and so
    // launched a single block, leaving elements 256..1024 of `o` unwritten.
    let grid_x = (N_ELEMS as u32).div_ceil(THREADS_PER_BLOCK);
    let mut size_arg = N_ELEMS_INT;
    m.run(
        cuda::kernel_args!(&mut a.ptr, &mut b.ptr, &mut o.ptr, &mut size_arg),
        (grid_x, 1, 1),
        (THREADS_PER_BLOCK, 1, 1),
    )?;
    println!("Kernel executed successfully");

    // Read the kernel output back to the host.
    let mut c = vec![c_int::default(); N_ELEMS];
    alloc.copy_out(&o, &mut c)?;
    println!("Copied data from device, first 10 elements: {:?}", &c[..10]);

    // Device-to-device transfer between `o` and `b`, then read `b` back.
    // NOTE(review): presumably `transfer(src, dst, bytes)` — confirm the
    // argument order against the allocator's definition.
    let mut d = vec![c_int::default(); N_ELEMS];
    alloc.transfer(&o, &b, buf_bytes)?;
    alloc.copy_out(&b, &mut d)?;
    println!("Copied data from device, first 10 elements: {:?}", &d[..10]);

    alloc.free(a)?;
    alloc.free(b)?;
    alloc.free(o)?;
    stream.drop()?;
    context.drop()?;
    Ok(())
}
/// Program entry point: runs the CUDA example and reports any failure.
///
/// On error the message is written to stderr as `Error: <message>` and the
/// process exits with status 1, so shells and CI see the failure instead of a
/// success code.
fn main() -> anyhow::Result<()> {
    if let Err(e) = inner() {
        eprintln!("Error: {}", e);
        // Returning `Err` here would print the error a second time through the
        // default `main` error reporting, so exit explicitly instead.
        std::process::exit(1);
    }
    Ok(())
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment