Skip to content

Instantly share code, notes, and snippets.

@pokutuna
Last active April 20, 2026 03:26
Show Gist options
  • Select an option

  • Save pokutuna/3b53f277e2c9e62cab2afb0e5bb08067 to your computer and use it in GitHub Desktop.

Select an option

Save pokutuna/3b53f277e2c9e62cab2afb0e5bb08067 to your computer and use it in GitHub Desktop.
Cloud Run L4 llama.cpp server with CUDA 12.8 compat
# Cloud Build pipeline: build a llama.cpp server image with the GGUF model
# baked in, push it to Artifact Registry, and deploy it to Cloud Run on an
# NVIDIA L4 GPU.
substitutions:
  # Hugging Face repo and GGUF file downloaded at image-build time (see Dockerfile).
  _MODEL_REPO: 'HauhauCS/Qwen3.6-35B-A3B-Uncensored-HauhauCS-Aggressive'
  _MODEL_FILE: 'Qwen3.6-35B-A3B-Uncensored-HauhauCS-Aggressive-Q4_K_M.gguf'
  _REGION: 'us-central1'
  _REPO: 'd2'           # Artifact Registry repository name
  _SERVICE: 'llamacpp'  # Cloud Run service name
  # Comma-separated extra flags appended to the llama-server container args.
  _LLAMA_EXTRA_ARGS: '--ctx-size,8192,--jinja'
  # Non-empty value makes the image prepend /usr/local/cuda/compat to
  # LD_LIBRARY_PATH (CUDA forward-compat libraries).
  _ENABLE_COMPAT: '1'

steps:
  # 1) Build the server image; the model download happens inside the Dockerfile.
  - name: 'gcr.io/cloud-builders/docker'
    # For gated models: add secretEnv: ['HF_TOKEN'] here and uncomment
    # the availableSecrets block at the bottom of this file.
    args:
      - 'build'
      - '--build-arg=MODEL_REPO=${_MODEL_REPO}'
      - '--build-arg=MODEL_FILE=${_MODEL_FILE}'
      - '--build-arg=ENABLE_COMPAT=${_ENABLE_COMPAT}'
      - '-t=${_REGION}-docker.pkg.dev/$PROJECT_ID/${_REPO}/llama-server:latest'
      - '-f=llama-server/Dockerfile'
      - 'llama-server/'

  # 2) Push the image so Cloud Run can pull it.
  - name: 'gcr.io/cloud-builders/docker'
    args: ['push', '${_REGION}-docker.pkg.dev/$PROJECT_ID/${_REPO}/llama-server:latest']

  # 3) Deploy to Cloud Run ('beta' track for the GPU / IAP flags).
  - name: 'gcr.io/google.com/cloudsdktool/cloud-sdk'
    entrypoint: 'gcloud'
    args:
      - 'beta'
      - 'run'
      - 'deploy'
      - '${_SERVICE}'
      - '--region=${_REGION}'
      - '--project=$PROJECT_ID'
      - '--image=${_REGION}-docker.pkg.dev/$PROJECT_ID/${_REPO}/llama-server:latest'
      - '--cpu=4'
      - '--memory=16Gi'
      - '--gpu=1'
      - '--gpu-type=nvidia-l4'
      - '--no-gpu-zonal-redundancy'
      - '--iap'               # front the service with Identity-Aware Proxy
      - '--no-cpu-throttling'
      - '--cpu-boost'
      - '--timeout=600'
      - '--min-instances=0'   # scale to zero when idle
      - '--max-instances=1'
      - '--port=8080'
      # Allow up to 60 x 10s = 10 minutes for model load before the instance
      # is considered failed.
      - '--startup-probe=httpGet.path=/health,httpGet.port=8080,periodSeconds=10,failureThreshold=60,timeoutSeconds=5'
      # Arguments passed to the container's llama-server entrypoint.
      - '--args=--model,/models/${_MODEL_FILE},--host,0.0.0.0,--port,8080,${_LLAMA_EXTRA_ARGS}'

options:
  env:
    - 'DOCKER_BUILDKIT=1'  # BuildKit needed for --mount=type=secret in the Dockerfile

timeout: '3600s'

# availableSecrets:
#   secretManager:
#     - versionName: projects/$PROJECT_ID/secrets/HF_TOKEN/versions/latest
#       env: 'HF_TOKEN'
# Stage 1: download the GGUF model file from the Hugging Face Hub.
FROM --platform=linux/amd64 ghcr.io/astral-sh/uv:python3.13-bookworm-slim AS downloader
# Repo and file name are supplied by the build (e.g. Cloud Build substitutions).
ARG MODEL_REPO
ARG MODEL_FILE
# Enable the hf_transfer download backend for the Hub client.
ENV HF_HUB_ENABLE_HF_TRANSFER=1
WORKDIR /models
# BuildKit secret mount (optional): when an HF_TOKEN secret is provided
# (gated models), export it to the download command; otherwise the
# `|| true` fallback leaves HF_TOKEN empty and the download is anonymous.
RUN --mount=type=secret,id=HF_TOKEN,required=false \
HF_TOKEN=$(cat /run/secrets/HF_TOKEN 2>/dev/null || true) \
uvx --from 'huggingface_hub[hf_transfer,cli]' \
hf download "${MODEL_REPO}" "${MODEL_FILE}" --local-dir /models
# Stage 2: runtime image — llama.cpp server (CUDA build) with the model copied in.
FROM --platform=linux/amd64 ghcr.io/ggml-org/llama.cpp:server-cuda
COPY --from=downloader /models /models
# Defaults to empty => compat path NOT prepended below.
ARG ENABLE_COMPAT=
# ${ENABLE_COMPAT:+...} expands only when ENABLE_COMPAT is non-empty, so the
# CUDA forward-compat directory is prepended to LD_LIBRARY_PATH conditionally.
ENV LD_LIBRARY_PATH=${ENABLE_COMPAT:+/usr/local/cuda/compat:}/usr/local/cuda/lib64
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment