vLLM full stack multi-platform build script
#!/usr/bin/env bash
set -euo pipefail
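# Usage sketch (illustrative; the filename is assumed): override any of the
# variables below and/or set SKIP_* flags to reuse previously built wheels, e.g.:
#   TORCH_CUDA_ARCH_LIST=8.0 SKIP_ACL=1 bash build-vllm.sh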
# Architecture
export BUILD_ARCH=$(arch)
# Python
export PYTHON_VERSION=${PYTHON_VERSION:-3.12}
# CUDA
export CUDA_VERSION=${CUDA_VERSION:-12.8.90}
export CUDA_SHORT=${CUDA_VERSION%.*}
export CUDA_TAG=cu${CUDA_SHORT//./}
export CUDA_RELEASE=${CUDA_SHORT//./-}
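# Worked example of the expansions above:
#   CUDA_VERSION=12.8.90 -> CUDA_SHORT=12.8, CUDA_TAG=cu128, CUDA_RELEASE=12-8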
# Requires cuda-cupti-dev cuda-nvml-dev libnccl-dev
# sudo apt install -y cuda-cupti-dev-${CUDA_RELEASE} cuda-nvml-dev-${CUDA_RELEASE} libnccl-dev
# Job scaling
export MAX_JOBS=${MAX_JOBS:-$(nproc)}
export NVCC_THREADS=${NVCC_THREADS:-8}
# Cmake build type
export CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release}
# Arch lists
# The 'a' suffix (e.g. 9.0a) is not forward compatible but enables all architecture-specific optimizations
export TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST:-9.0a}
CUDA_ARCHES=${TORCH_CUDA_ARCH_LIST//+PTX/}
IFS=";" read -ra CUDA_ARCHES <<< ${CUDA_ARCHES}
VLLM_FA_ARCH_LIST=$(IFS=";"; echo "${CUDA_ARCHES[*]/%/-real}")
export VLLM_FA_CMAKE_GPU_ARCHES=${VLLM_FA_CMAKE_GPU_ARCHES:-${VLLM_FA_ARCH_LIST//./}}
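# Worked example: TORCH_CUDA_ARCH_LIST="8.0;9.0a+PTX"
#   -> CUDA_ARCHES=(8.0 9.0a) -> VLLM_FA_ARCH_LIST="8.0-real;9.0a-real"
#   -> VLLM_FA_CMAKE_GPU_ARCHES="80-real;90a-real"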
# Prep build venv
uv venv -p ${PYTHON_VERSION} --seed --python-preference only-managed
export VIRTUAL_ENV=${PWD}/.venv
export PATH=${VIRTUAL_ENV}/bin:${PATH}
export CUDA_HOME=/usr/local/cuda
export LD_LIBRARY_PATH=${CUDA_HOME}/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
mkdir -p wheels src
export WHEELS=${PWD}/wheels
# Reset repo helper
# $1 = repo, $2 = ref
reset_repo () {
  # Derive the checkout path from the repo name
  REPO_PATH=$(basename "${1}" .git)
  # If the clone fails, the repo has most likely been cloned already
  if ! git clone "${1}"
  then
    pushd "${REPO_PATH}"
    # Set the remote and fetch
    git remote set-url origin "${1}"
    git fetch origin
    # Get the default branch
    HEAD_BRANCH=$(git remote show origin | awk '/HEAD branch/ {print $3}')
    # Make sure it's checked out and reset
    git reset --hard
    git checkout "${HEAD_BRANCH}"
    git reset --hard
    git pull
    # Delete all other branches
    for branch in $(git branch --format '%(refname:short)')
    do
      # Skip the default branch
      if [[ ${branch} != "${HEAD_BRANCH}" ]]
      then
        git branch -D "${branch}"
      fi
    done
    popd
  fi
  pushd "${REPO_PATH}"
  # Check out the requested ref and prep submodules, if any
  git checkout "${2}"
  git submodule sync --recursive
  git submodule update --init --recursive -j 8
  popd
}
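# Example: `reset_repo https://github.com/pytorch/pytorch.git v2.6.0` clones or
# force-resets ./pytorch, checks out the v2.6.0 tag, and syncs its submodules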
# PyNVML
uv pip install pynvml
# Install build deps that aren't in project requirements files
# Upgrade setuptools, but cap it at 76 to avoid a Triton build bug
uv pip install -U build cmake ninja pybind11 "setuptools<=76" wheel
pushd src
# Build architecture specific differences
if [[ ${BUILD_ARCH} == "x86_64" ]]
then
  uv pip install mkl-static mkl-include
elif [[ ${BUILD_ARCH} == "aarch64" ]]
then
  if ! [[ -v SKIP_ACL ]]
  then
    # Deps for ACL
    uv pip install patchelf scons
    # Optimize ARM linking
    export USE_PRIORITIZED_TEXT_FOR_LD=1
    # Build ARM ComputeLibrary
    export ACL_REPO=https://github.com/ARM-software/ComputeLibrary.git
    export ACL_REF=v24.09
    export ACL_ROOT_DIR=${PWD}/acl
    export ACL_INCLUDE_DIR=${ACL_ROOT_DIR}/include
    export ACL_LIBRARY=${ACL_ROOT_DIR}/build
    export LD_LIBRARY_PATH=${ACL_LIBRARY}:${LD_LIBRARY_PATH}
    mkdir -p acl
    reset_repo ${ACL_REPO} ${ACL_REF}
    pushd ComputeLibrary
    scons Werror=1 -j$(nproc) build_dir=${ACL_LIBRARY} debug=0 neon=1 opencl=0 os=linux openmp=1 cppthreads=0 arch=armv8a multi_isa=1 fixed_format_kernels=1 build=native
    popd
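    # Optional sanity check (assumes scons placed the shared libraries in
    # ${ACL_LIBRARY}); under `set -e` this aborts the build if they are missing
    ls "${ACL_LIBRARY}"/libarm_compute*.so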
  fi
else
  echo "Unsupported architecture: ${BUILD_ARCH}" >&2
  exit 1
fi
# Torch
export TORCH_REPO=${TORCH_REPO:-https://github.com/pytorch/pytorch.git}
export TORCH_REF=${TORCH_REF:-v2.6.0}
export TORCH_BUILD_VERSION=${TORCH_BUILD_VERSION:-${TORCH_REF#v}+${CUDA_TAG}}
export PYTORCH_BUILD_VERSION=${TORCH_BUILD_VERSION:-${TORCH_REF#v}+${CUDA_TAG}}
export PYTORCH_BUILD_NUMBER=0
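# PyTorch's setup.py stamps the wheel from PYTORCH_BUILD_VERSION; with
# PYTORCH_BUILD_NUMBER=0 no build-number suffix is appended, so the wheel
# version comes out as e.g. 2.6.0+cu128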
if ! [[ -v SKIP_TORCH ]]
then
  reset_repo ${TORCH_REPO} ${TORCH_REF}
  pushd pytorch
  if [[ ${BUILD_ARCH} == "aarch64" ]]
  then
    # Use NVPL on ARM64
    export BLAS=NVPL
    # Bump XNNPACK submodule ref to fix compilation bug on ARM64
    pushd third_party/XNNPACK
    git checkout fcc06d1
    popd
  fi
  uv pip install -r requirements.txt
  uv build --wheel --no-build-isolation -o ${WHEELS}
  uv pip install ${WHEELS}/*
  popd
else
  uv pip install ${WHEELS}/*
fi
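# Optional sanity check: confirm the installed torch imports and reports the
# CUDA version it was compiled against (no GPU required at this point)
python -c 'import torch; print(torch.__version__, torch.version.cuda)'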
export AUDIO_REPO=${AUDIO_REPO:-https://github.com/pytorch/audio.git}
export AUDIO_REF=${AUDIO_REF:-v2.6.0}
export AUDIO_BUILD_VERSION=${AUDIO_BUILD_VERSION:-${AUDIO_REF#v}+${CUDA_TAG}}
export BUILD_VERSION=${AUDIO_BUILD_VERSION:-${AUDIO_REF#v}+${CUDA_TAG}}
if ! [[ -v SKIP_AUDIO ]]
then
  reset_repo ${AUDIO_REPO} ${AUDIO_REF}
  pushd audio
  uv build --wheel --no-build-isolation -o ${WHEELS}
  popd
fi
export VISION_REPO=${VISION_REPO:-https://github.com/pytorch/vision.git}
export VISION_REF=${VISION_REF:-v0.21.0}
export VISION_BUILD_VERSION=${VISION_BUILD_VERSION:-${VISION_REF#v}+${CUDA_TAG}}
export BUILD_VERSION=${VISION_BUILD_VERSION:-${VISION_REF#v}+${CUDA_TAG}}
if ! [[ -v SKIP_VISION ]]
then
  reset_repo ${VISION_REPO} ${VISION_REF}
  pushd vision
  uv build --wheel --no-build-isolation -o ${WHEELS}
  popd
fi
export TRITON_REPO=${TRITON_REPO:-https://github.com/triton-lang/triton.git}
export TRITON_REF=${TRITON_REF:-release/3.2.x}
export TRITON_VERSION=${TRITON_VERSION:-3.2.0}
export TRITON_BUILD_VERSION=${TRITON_BUILD_VERSION:-${TRITON_VERSION}+${CUDA_TAG}}
if ! [[ -v SKIP_TRITON ]]
then
  reset_repo ${TRITON_REPO} ${TRITON_REF}
  pushd triton/python
  # Override package version because Triton is silly
  sed -i "s/version=\"${TRITON_VERSION}\".*$/version=\"${TRITON_BUILD_VERSION}\",/" setup.py
  uv build --wheel --no-build-isolation -o ${WHEELS}
  popd
fi
export XFORMERS_REPO=${XFORMERS_REPO:-https://github.com/facebookresearch/xformers.git}
export XFORMERS_REF=${XFORMERS_REF:-v0.0.29.post2}
export XFORMERS_BUILD_VERSION=${XFORMERS_BUILD_VERSION:-${XFORMERS_REF#v}+${CUDA_TAG}}
export BUILD_VERSION=${XFORMERS_BUILD_VERSION:-${XFORMERS_REF#v}+${CUDA_TAG}}
if ! [[ -v SKIP_XFORMERS ]]
then
  reset_repo ${XFORMERS_REPO} ${XFORMERS_REF}
  pushd xformers
  uv build --wheel --no-build-isolation -o ${WHEELS}
  popd
fi
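# Compile FlashInfer kernels ahead of time so they ship in the wheel instead of
# being JIT-compiled on first use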
export FLASHINFER_ENABLE_AOT=1
export FLASHINFER_REPO=${FLASHINFER_REPO:-https://github.com/flashinfer-ai/flashinfer.git}
export FLASHINFER_REF=${FLASHINFER_REF:-v0.2.2.post1}
export FLASHINFER_BUILD_SUFFIX=${FLASHINFER_BUILD_SUFFIX:-${CUDA_TAG}}
export FLASHINFER_LOCAL_VERSION=${FLASHINFER_BUILD_SUFFIX:-}
if ! [[ -v SKIP_FLASHINFER ]]
then
  reset_repo ${FLASHINFER_REPO} ${FLASHINFER_REF}
  pushd flashinfer
  uv build --wheel --no-build-isolation -o ${WHEELS}
  popd
fi
export VLLM_REPO=${VLLM_REPO:-https://github.com/vllm-project/vllm.git}
export VLLM_REF=${VLLM_REF:-main}
if ! [[ -v SKIP_VLLM ]]
then
  reset_repo ${VLLM_REPO} ${VLLM_REF}
  pushd vllm
  python use_existing_torch.py
  uv pip install -r requirements/build.txt
  uv build --wheel --no-build-isolation -o ${WHEELS}
  popd
fi
popd
uv pip install ${WHEELS}/*
# Add additional packages for vLLM
uv pip install accelerate bitsandbytes boto3 hf_transfer modelscope runai-model-streamer runai-model-streamer[s3] tensorizer timm
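# Optional smoke test: confirm vLLM imports cleanly against the locally built stack
python -c 'import vllm; print(vllm.__version__)'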