vLLM full-stack multi-platform build script
#!/usr/bin/env bash
set -euo pipefail
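# Usage sketch (the filename build.sh is illustrative, not part of this gist):
#   ./build.sh                                  # full build with defaults
#   CUDA_VERSION=12.8.90 PYTHON_VERSION=3.12 ./build.sh
#   SKIP_TORCH=1 SKIP_TRITON=1 ./build.sh       # skip stages; reuse wheels already in ./wheels
# Any *_REPO, *_REF, or *_BUILD_VERSION variable below can be overridden the same way;
# SKIP_* variables only need to be set, their values are ignored.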
# Architecture
export BUILD_ARCH=$(arch)
# Python
export PYTHON_VERSION=${PYTHON_VERSION:-3.12}
# CUDA
export CUDA_VERSION=${CUDA_VERSION:-12.8.90}
export CUDA_SHORT=${CUDA_VERSION%.*}
export CUDA_TAG=cu${CUDA_SHORT//./}
export CUDA_RELEASE=${CUDA_SHORT//./-}
# Requires cuda-cupti-dev cuda-nvml-dev libnccl-dev
# sudo apt install -y cuda-cupti-dev-${CUDA_RELEASE} cuda-nvml-dev-${CUDA_RELEASE} libnccl-dev
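# Optional sanity check (assumes the CUDA toolkit's nvcc is on PATH) that the installed
# toolkit matches CUDA_VERSION:
#   nvcc --version | grep "release ${CUDA_SHORT}"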
# Job scaling
export MAX_JOBS=${MAX_JOBS:-$(nproc)}
export NVCC_THREADS=${NVCC_THREADS:-8}
# CMake build type
export CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release}
# Arch lists
# 'a' suffix is not forward compatible but enables all optimizations
export TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST:-9.0a}
CUDA_ARCHES=${TORCH_CUDA_ARCH_LIST//+PTX/}
IFS=";" read -ra CUDA_ARCHES <<< ${CUDA_ARCHES}
VLLM_FA_ARCH_LIST=$(IFS=";"; echo "${CUDA_ARCHES[*]/%/-real}")
export VLLM_FA_CMAKE_GPU_ARCHES=${VLLM_FA_CMAKE_GPU_ARCHES:-${VLLM_FA_ARCH_LIST//./}}
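# Example of the transformation above: with TORCH_CUDA_ARCH_LIST="8.0;9.0a+PTX",
# CUDA_ARCHES becomes (8.0 9.0a), VLLM_FA_ARCH_LIST becomes "8.0-real;9.0a-real",
# and VLLM_FA_CMAKE_GPU_ARCHES defaults to "80-real;90a-real".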
# Prep build venv
uv venv -p ${PYTHON_VERSION} --seed --python-preference only-managed
export VIRTUAL_ENV=${PWD}/.venv
export PATH=${VIRTUAL_ENV}/bin:${PATH}
export CUDA_HOME=/usr/local/cuda
export LD_LIBRARY_PATH=${CUDA_HOME}/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
mkdir -p wheels src
export WHEELS=${PWD}/wheels
# Reset repo helper
# $1 = repo, $2 = ref
reset_repo () {
    # Get checkout path from repo name
    REPO_PATH=$(basename ${1} .git)
    # If the clone fails, the repo has likely been cloned already
    if ! git clone ${1}
    then
        pushd ${REPO_PATH}
        # Set remote and fetch
        git remote set-url origin ${1}
        git fetch origin
        # Get default branch
        HEAD_BRANCH=$(git remote show origin | awk '/HEAD branch/ {print $3}')
        # Make sure it's checked out and reset
        git reset --hard
        git checkout ${HEAD_BRANCH}
        git reset --hard
        git pull
        # Delete other branches
        for branch in $(git branch --format '%(refname:short)')
        do
            # Skip default
            if [[ ${branch} != ${HEAD_BRANCH} ]]
            then
                git branch -D ${branch}
            fi
        done
        popd
    fi
    pushd ${REPO_PATH}
    # Checkout ref and prep submodules if any
    git checkout ${2}
    git submodule sync --recursive
    git submodule update --init --recursive -j 8
    popd
}
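# Example: refresh (or freshly clone) ./pytorch and check out the v2.6.0 tag plus its submodules:
#   reset_repo https://github.com/pytorch/pytorch.git v2.6.0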
# PyNVML
uv pip install pynvml
# Install build deps that aren't in project requirements files
# Make sure to upgrade setuptools to avoid triton build bug
uv pip install -U build cmake ninja pybind11 "setuptools<=76" wheel
pushd src
# Build architecture specific differences
if [[ ${BUILD_ARCH} == "x86_64" ]]
then
    uv pip install mkl-static mkl-include
elif [[ ${BUILD_ARCH} == "aarch64" ]]
then
    if ! [[ -v SKIP_ACL ]]
    then
        # Deps for ACL
        uv pip install patchelf scons
        # Optimize ARM linking
        export USE_PRIORITIZED_TEXT_FOR_LD=1
        # Build ARM ComputeLibrary
        export ACL_REPO=https://github.com/ARM-software/ComputeLibrary.git
        export ACL_REF=v24.09
        export ACL_ROOT_DIR=${PWD}/acl
        export ACL_INCLUDE_DIR=${ACL_ROOT_DIR}/include
        export ACL_LIBRARY=${ACL_ROOT_DIR}/build
        export LD_LIBRARY_PATH=${ACL_LIBRARY}:${LD_LIBRARY_PATH}
        mkdir -p acl
        reset_repo ${ACL_REPO} ${ACL_REF}
        pushd ComputeLibrary
        scons Werror=1 -j$(nproc) build_dir=${ACL_LIBRARY} debug=0 neon=1 opencl=0 os=linux openmp=1 cppthreads=0 arch=armv8a multi_isa=1 fixed_format_kernels=1 build=native
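        # The ACL build is expected to drop libarm_compute*.so into ${ACL_LIBRARY}
        # (assumption based on ComputeLibrary's scons build_dir option); quick check:
        #   ls ${ACL_LIBRARY}/libarm_compute*.so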
        popd
    fi
else
    echo "Unsupported architecture: ${BUILD_ARCH}"
    exit 1
fi
# Torch
export TORCH_REPO=${TORCH_REPO:-https://github.com/pytorch/pytorch.git}
export TORCH_REF=${TORCH_REF:-v2.6.0}
export TORCH_BUILD_VERSION=${TORCH_BUILD_VERSION:-${TORCH_REF#v}+${CUDA_TAG}}
export PYTORCH_BUILD_VERSION=${TORCH_BUILD_VERSION:-${TORCH_REF#v}+${CUDA_TAG}}
export PYTORCH_BUILD_NUMBER=0
if ! [[ -v SKIP_TORCH ]]
then
    reset_repo ${TORCH_REPO} ${TORCH_REF}
    pushd pytorch
    if [[ ${BUILD_ARCH} == "aarch64" ]]
    then
        # Use NVPL on ARM64
        export BLAS=NVPL
        # Bump XNNPACK submodule ref to fix compilation bug on ARM64
        pushd third_party/XNNPACK
        git checkout fcc06d1
        popd
    fi
    uv pip install -r requirements.txt
    uv build --wheel --no-build-isolation -o ${WHEELS}
    uv pip install ${WHEELS}/*
    popd
else
    uv pip install ${WHEELS}/*
fi
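# Optional sanity check on the torch wheel just installed (standard torch attributes;
# torch.cuda.is_available() may be False on a GPU-less build host):
#   python -c "import torch; print(torch.__version__, torch.version.cuda)"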
export AUDIO_REPO=${AUDIO_REPO:-https://github.com/pytorch/audio.git}
export AUDIO_REF=${AUDIO_REF:-v2.6.0}
export AUDIO_BUILD_VERSION=${AUDIO_BUILD_VERSION:-${AUDIO_REF#v}+${CUDA_TAG}}
export BUILD_VERSION=${AUDIO_BUILD_VERSION:-${AUDIO_REF#v}+${CUDA_TAG}}
if ! [[ -v SKIP_AUDIO ]]
then
    reset_repo ${AUDIO_REPO} ${AUDIO_REF}
    pushd audio
    uv build --wheel --no-build-isolation -o ${WHEELS}
    popd
fi
export VISION_REPO=${VISION_REPO:-https://github.com/pytorch/vision.git}
export VISION_REF=${VISION_REF:-v0.21.0}
export VISION_BUILD_VERSION=${VISION_BUILD_VERSION:-${VISION_REF#v}+${CUDA_TAG}}
export BUILD_VERSION=${VISION_BUILD_VERSION:-${VISION_REF#v}+${CUDA_TAG}}
if ! [[ -v SKIP_VISION ]]
then
    reset_repo ${VISION_REPO} ${VISION_REF}
    pushd vision
    uv build --wheel --no-build-isolation -o ${WHEELS}
    popd
fi
export TRITON_REPO=${TRITON_REPO:-https://github.com/triton-lang/triton.git}
export TRITON_REF=${TRITON_REF:-release/3.2.x}
export TRITON_VERSION=${TRITON_VERSION:-3.2.0}
export TRITON_BUILD_VERSION=${TRITON_BUILD_VERSION:-${TRITON_VERSION}+${CUDA_TAG}}
if ! [[ -v SKIP_TRITON ]]
then
    reset_repo ${TRITON_REPO} ${TRITON_REF}
    pushd triton/python
    # Override package version because Triton is silly
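    # e.g. with the defaults above (CUDA_TAG=cu128) the sed below rewrites
    # version="3.2.0" in setup.py to version="3.2.0+cu128",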
    sed -i "s/version=\"${TRITON_VERSION}\".*$/version=\"${TRITON_BUILD_VERSION}\",/" setup.py
    uv build --wheel --no-build-isolation -o ${WHEELS}
    popd
fi
export XFORMERS_REPO=${XFORMERS_REPO:-https://github.com/facebookresearch/xformers.git}
export XFORMERS_REF=${XFORMERS_REF:-v0.0.29.post2}
export XFORMERS_BUILD_VERSION=${XFORMERS_BUILD_VERSION:-${XFORMERS_REF#v}+${CUDA_TAG}}
export BUILD_VERSION=${XFORMERS_BUILD_VERSION:-${XFORMERS_REF#v}+${CUDA_TAG}}
if ! [[ -v SKIP_XFORMERS ]]
then
    reset_repo ${XFORMERS_REPO} ${XFORMERS_REF}
    pushd xformers
    uv build --wheel --no-build-isolation -o ${WHEELS}
    popd
fi
export FLASHINFER_ENABLE_AOT=1
export FLASHINFER_REPO=${FLASHINFER_REPO:-https://github.com/flashinfer-ai/flashinfer.git}
export FLASHINFER_REF=${FLASHINFER_REF:-v0.2.2.post1}
export FLASHINFER_BUILD_SUFFIX=${FLASHINFER_BUILD_SUFFIX:-${CUDA_TAG}}
export FLASHINFER_LOCAL_VERSION=${FLASHINFER_BUILD_SUFFIX:-}
if ! [[ -v SKIP_FLASHINFER ]]
then
    reset_repo ${FLASHINFER_REPO} ${FLASHINFER_REF}
    pushd flashinfer
    uv build --wheel --no-build-isolation -o ${WHEELS}
    popd
fi
export VLLM_REPO=${VLLM_REPO:-https://github.com/vllm-project/vllm.git}
export VLLM_REF=${VLLM_REF:-main}
if ! [[ -v SKIP_VLLM ]]
then
    reset_repo ${VLLM_REPO} ${VLLM_REF}
    pushd vllm
    python use_existing_torch.py
    uv pip install -r requirements/build.txt
    uv build --wheel --no-build-isolation -o ${WHEELS}
    popd
fi
popd
uv pip install ${WHEELS}/*
# Add additional packages for vLLM
uv pip install accelerate bitsandbytes boto3 hf_transfer modelscope runai-model-streamer "runai-model-streamer[s3]" tensorizer timm
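# Optional smoke test of the resulting environment (import names for the wheels built above;
# best run on a host with a compatible GPU):
#   python -c "import torch, torchaudio, torchvision, triton, xformers, flashinfer, vllm; print(vllm.__version__)"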