Triton build script for ARM/AArch64

This script verifies that Triton can be built and minimally tested on ARM64 systems, specifically on AWS Graviton2 instances. Instructions:

  1. Create a g5g.xlarge instance using the following AWS CLI command, making sure to request at least 30 GB of storage (the command below provisions a 40 GB gp3 volume):
aws ec2 run-instances \
    --instance-type g5g.xlarge \
    --image-id ami-02dcfe5d1d39baa4e \
    --key-name <key-pair-name> \
    --count 1 \
    --block-device-mappings '[{"DeviceName":"/dev/xvda","Ebs":{"VolumeSize":40,"VolumeType":"gp3"}}]' \
    --region us-east-1
  2. Connect to the host (ssh -i /path/to/<key-pair-name>.pem ec2-user@<hostname>)

  3. Execute the script with source build_triton.sh (it must be sourced so that conda environment activation persists).

  4. The Triton wheel is written to /home/$USER/triton/python/dist/triton-*-linux_aarch64.whl, for example: triton-3.2.0+git781ae0b2-cp39-cp39-linux_aarch64.whl

  5. The resulting system and Python environment has these versions:

system:
  os: Amazon Linux 2023.6.20241212
  instance: g5g.xlarge
  cpu: Graviton2 - Neoverse-N1 (ARM/aarch64)
  gpu: NVIDIA T4G
  gpu-gencode: sm_75
  cuda-version: 12.6
  cuda-driver-version: 560.35.05
python:
  python: 3.9.21
  cuda-cudart: 12.6.77
  cuda-nvrtc: 12.6.85
  cuda-nvtx: 12.6.77
  cuda-version: 12.6
  libtorch: 2.5.1
  pytorch: 2.5.1
  torchvision: 0.20.1
  triton: 3.2.0+git781ae0b2
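
To confirm that a built environment matches the versions above, a few quick checks can be run (a minimal sketch, assuming the triton conda environment is active):

nvidia-smi --query-gpu=name,driver_version --format=csv,noheader
python -c "import torch; print(torch.__version__, torch.version.cuda, torch.cuda.is_available())"
python -c "import triton; print(triton.__version__)"

The script itself (build_triton.sh) follows.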
#!/bin/bash
# Build Script for Triton Language
# ==========================================
#
# Note: This script includes several reboot points. After each reboot:
# 1. Reconnect to your instance using SSH
# 2. Navigate back to the script directory
# 3. Run the script again
set -euxo pipefail
echo "=== Checking system updates ==="
if dnf check-release-update; then
  echo "=== No release update needed, continuing with installation ==="
elif [ $? -eq 100 ]; then
  # dnf uses exit code 100 to signal that updates are available
  echo "=== Release update needed, performing update ==="
  sudo dnf update -y
  sudo dnf upgrade --releasever=latest -y
  echo "=== Updates complete, rebooting system ==="
  echo "After reboot:"
  echo "1. Reconnect to the instance using SSH"
  echo "2. Navigate back to: $(pwd)"
  echo "3. Run this script again to continue the installation"
  sudo reboot
  exit 0
else
  echo "=== Error checking for updates, exiting ==="
  exit 1
fi
# Install base system dependencies
echo "=== Installing base system dependencies ==="
sudo dnf install -y git tmux
echo "=== Installing CUDA system dependencies ==="
sudo dnf install -y dkms kernel-devel kernel-modules-extra \
  vulkan-devel libglvnd-devel \
  elfutils-libelf-devel xorg-x11-server-Xorg
sudo systemctl enable --now dkms
# Check if CUDA is already installed
if [ -f "/usr/local/cuda/bin/nvcc" ] && [ -f "/usr/bin/nvidia-smi" ]; then
echo "=== CUDA already installed, skipping installation ==="
else
echo "=== Installing CUDA ==="
mkdir -p cuda
pushd cuda
CUDA_INSTALLER="cuda_12.6.3_560.35.05_linux_sbsa.run"
if [ ! -f "$CUDA_INSTALLER" ]; then
wget "https://developer.download.nvidia.com/compute/cuda/12.6.3/local_installers/$CUDA_INSTALLER"
fi
chmod +x ./*.run
sudo ./*.run --driver --toolkit --silent --tmpdir=`pwd`
popd
echo "=== CUDA installation complete, rebooting system ==="
echo "After reboot:"
echo "1. Reconnect to the instance using SSH"
echo "2. Navigate back to: $(pwd)"
echo "3. Run this script again to continue the installation"
sudo reboot
exit 0
fi
# Verify CUDA installation
echo "=== Verifying CUDA installation ==="
nvidia-smi
export CUDA_HOME=/usr/local/cuda
export PATH=$CUDA_HOME/bin:$PATH
$CUDA_HOME/bin/nvcc -V
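# Optional sanity check: confirm the visible GPU and its compute capability
# (the T4G should report 7.5, matching sm_75). Assumes the driver supports
# the compute_cap query field, which recent drivers do; made non-fatal in
# case it does not.
nvidia-smi --query-gpu=name,compute_cap --format=csv,noheader || true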
echo "=== Installing Miniforge ==="
MINIFORGE_URL="https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-aarch64.sh"
if [ ! -f "Miniforge3-Linux-aarch64.sh" ]; then
wget $MINIFORGE_URL
else
echo "=== Miniforge3 installer already downloaded, skipping download ==="
fi
if [ ! -d "$HOME/miniforge3" ]; then
bash Miniforge3-Linux-aarch64.sh -b
~/miniforge3/bin/conda init bash
source ~/.bashrc
else
echo "=== Miniforge3 already installed in $HOME/miniforge3, skipping installation ==="
fi
echo "=== Creating Triton development environment ==="
if ! ~/miniforge3/bin/conda env list | grep -q "triton "; then
  ~/miniforge3/bin/conda create -n triton python=3.9 -y
  source ~/miniforge3/bin/activate triton
  mamba install -y \
    pip pybind11 \
    pytest lit scipy numpy pandas matplotlib
  # Install PyTorch with CUDA support
  mamba install -y pytorch torchvision -c pytorch
  pip install ninja cmake build twine wheel setuptools
else
  echo "=== Triton environment already exists, activating ==="
  source ~/miniforge3/bin/activate triton
fi
echo "=== Cloning Triton repository ==="
TRITON_REPO_URL="https://github.com/triton-lang/triton.git"
# Skip the clone if it already exists so the script can be re-run after reboots
if [ ! -d "triton" ]; then
  git clone "$TRITON_REPO_URL"
fi
cd triton
echo "=== Installing Triton system dependencies ==="
sudo dnf install -y cmake ninja-build
echo "=== Setting build environment variables ==="
export TRITON_BUILD_WITH_CLANG_LLD=true
export TRITON_BUILD_WITH_CCACHE=true
# See https://github.com/triton-lang/triton/blob/781ae0b2f6b26c496054624d807fc64c0ed6594c/.github/workflows/wheels.yml#L56-L58
# for commentary on needing to lower MAX_JOBS from default 2 * n_cpus
export MAX_JOBS=3
echo "=== Building Triton ==="
python -m build python/ --wheel --no-isolation --verbose 2>&1 | tee triton_build.log
echo "=== Installing Triton ==="
pip install python/dist/triton*.whl 2>&1 | tee triton_install.log
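# Quick import smoke test of the freshly installed wheel before the longer
# test suites run below.
python -c "import triton; print('triton version:', triton.__version__)"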
echo "=== Running Triton tests ==="
(
  cd python/build/cmake.linux-aarch64-cpython-3.9/ && \
  echo "=== Running ctest ===" && \
  ctest -j32 && \
  echo "=== Running lit tests ===" && \
  lit test
)
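# If a suite fails, individual ctest cases can be re-run from the same build
# directory with a name filter, e.g. (placeholder regex):
#   (cd python/build/cmake.linux-aarch64-cpython-3.9/ && ctest -R <test-name-regex> --output-on-failure)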
echo "=== Creating GPU test file ==="
cat > triton_test.py << 'EOL'
import torch
import triton
import triton.language as tl
import numpy as np
@triton.jit
def add_vectors(
    x_ptr,  # Pointer to first input vector
    y_ptr,  # Pointer to second input vector
    output_ptr,  # Pointer to output vector
    n_elements,  # Size of vectors
    BLOCK_SIZE: tl.constexpr,  # Number of elements per block
):
    # We can add print statements in interpreter mode
    print(f"Processing block of size {BLOCK_SIZE}")
    pid = tl.program_id(axis=0)
    block_start = pid * BLOCK_SIZE
    offsets = block_start + tl.arange(0, BLOCK_SIZE)
    mask = offsets < n_elements
    # Load vectors
    x = tl.load(x_ptr + offsets, mask=mask)
    y = tl.load(y_ptr + offsets, mask=mask)
    # Add them
    output = x + y
    # Store result
    tl.store(output_ptr + offsets, output, mask=mask)


def main():
    # Vector size
    size = 128
    # Create input vectors on GPU
    x = torch.randn(size, device='cuda')
    y = torch.randn(size, device='cuda')
    output = torch.empty_like(x)
    # Launch kernel
    grid = (triton.cdiv(size, 128),)
    add_vectors[grid](x, y, output, size, BLOCK_SIZE=128)
    # Verify result (keeping tensors on GPU)
    torch_output = x + y
    max_diff = torch.max(torch.abs(output - torch_output)).item()
    print(f"Max difference between Triton and Torch: {max_diff}")
    # Assert the results match within a small tolerance
    tolerance = 1e-6
    assert max_diff <= tolerance, f"Results don't match! Max difference ({max_diff}) exceeds tolerance ({tolerance})"
    print("Test passed successfully!")


if __name__ == "__main__":
    main()
EOL
echo "=== Running GPU test ==="
python triton_test.py
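# The same test can also be run without compiling for the GPU via Triton's
# interpreter mode (where the kernel's print statement is most useful),
# assuming this build supports it:
#   TRITON_INTERPRET=1 python triton_test.py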
echo "=== SUCCESS: Installation complete! All tests passed ==="