ARG CUDA_VERSION=12.8.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS vllm_gpt-oss
ARG CUDA_VERSION
ARG PYTHON_VERSION=3.12
### --- Stuff from default Dockerfile ---- ###
# The PyPA get-pip.py script is a self-contained script+zip file that provides
# both the installer script and the pip base85-encoded zip archive. This allows
# bootstrapping pip in environments where a distribution package does not exist.
#
# By parameterizing the URL of the get-pip.py installation script, we allow
# third parties to use their own copy of the script stored in a private mirror.
# We set the default value to the PyPA-owned get-pip.py script.
#
# Reference: https://pip.pypa.io/en/stable/installation/#get-pip-py
ARG GET_PIP_URL="https://bootstrap.pypa.io/get-pip.py"
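# Example (a sketch; the mirror URL below is hypothetical): point the build at
# a private copy of get-pip.py with
#   docker build --build-arg GET_PIP_URL=https://mirror.example.com/get-pip.py .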
# PIP supports fetching the packages from custom indexes, allowing third
# parties to host the packages in private mirrors. The PIP_INDEX_URL and
# PIP_EXTRA_INDEX_URL are standard PIP environment variables to override the
# default indexes. By leaving them empty by default, PIP will use its default
# indexes if the build process doesn't override them.
#
# Uv uses different variables. We set them by default to the same values as
# PIP, but they can be overridden.
ARG PIP_INDEX_URL
ARG PIP_EXTRA_INDEX_URL
ARG UV_INDEX_URL=${PIP_INDEX_URL}
ARG UV_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
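# Example (a sketch; the index URL is hypothetical): since the uv variables
# default to the pip ones, a single build arg routes both tools through a
# private mirror:
#   docker build --build-arg PIP_INDEX_URL=https://pypi.example.com/simple .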
# PyTorch provides its own indexes for standard and nightly builds
ARG PYTORCH_CUDA_INDEX_BASE_URL=https://download.pytorch.org/whl
ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL=https://download.pytorch.org/whl/nightly
# PIP supports multiple authentication schemes, including keyring.
# By parameterizing the PIP_KEYRING_PROVIDER variable and setting it to
# disabled by default, we allow third parties to use keyring authentication for
# their private Python indexes, while not changing the default behavior, which
# is no authentication.
#
# Reference: https://pip.pypa.io/en/stable/topics/authentication/#keyring-support
ARG PIP_KEYRING_PROVIDER=disabled
ARG UV_KEYRING_PROVIDER=${PIP_KEYRING_PROVIDER}
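# Example (a sketch): to authenticate against a private index through the
# keyring CLI, pip's documented "subprocess" provider can be selected at build
# time (assumes a keyring executable is available in the build environment):
#   docker build --build-arg PIP_KEYRING_PROVIDER=subprocess .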
# Flag that enables bundling the built-in KV-connector dependency libraries
# into the Docker image
ARG INSTALL_KV_CONNECTORS=false
# prepare basic build environment
ARG TARGETPLATFORM
ENV DEBIAN_FRONTEND=noninteractive
ARG DEADSNAKES_MIRROR_URL
ARG DEADSNAKES_GPGKEY_URL
ARG GET_PIP_URL
# Install Python and other dependencies
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
    && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
    && apt-get update -y \
    && apt-get install -y ccache software-properties-common git curl sudo \
    && add-apt-repository -y ppa:deadsnakes/ppa \
    && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
    && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
    && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
    && curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION} \
    && python3 --version && python3 -m pip --version
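# Example (a sketch): because the interpreter comes from the deadsnakes PPA,
# a different version can be selected at build time, e.g.
#   docker build --build-arg PYTHON_VERSION=3.11 .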
ARG PIP_INDEX_URL UV_INDEX_URL
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
ARG PYTORCH_CUDA_INDEX_BASE_URL
ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL
ARG PIP_KEYRING_PROVIDER UV_KEYRING_PROVIDER
# Install uv for faster pip installs
RUN --mount=type=cache,target=/root/.cache/uv \
    python3 -m pip install uv
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500
ENV UV_INDEX_STRATEGY="unsafe-best-match"
# Use copy mode to avoid hardlink failures with Docker cache mounts
ENV UV_LINK_MODE=copy
# Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519
# as it was causing spam when compiling the CUTLASS kernels
RUN apt-get install -y gcc-10 g++-10
RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 110 --slave /usr/bin/g++ g++ /usr/bin/g++-10
RUN <<EOF
gcc --version
EOF
# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
# max jobs used by Ninja to build extensions
ARG max_jobs=2
ENV MAX_JOBS=${max_jobs}
# number of threads used by nvcc
ARG nvcc_threads=8
ENV NVCC_THREADS=$nvcc_threads
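# Example (a sketch): on a build host with more cores and RAM, raise the
# parallelism at build time, e.g.
#   docker build --build-arg max_jobs=8 --build-arg nvcc_threads=4 .
# Note that roughly MAX_JOBS * NVCC_THREADS compiler processes can run at
# once, which may exhaust memory on smaller machines.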
### ----------------------------------------------------- ###
### --- Build instructions for GPT-OSS on Ampere --- ###
### Translated from https://github.com/vllm-project/vllm/issues/22290#issuecomment-3162301278 ###
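# Note (an assumption from the steps below): the Docker build context must be
# a vLLM source checkout, since "COPY . /tmp" has to pick up
# use_existing_torch.py and requirements/build.txt from the repo root.
# A sketch (Dockerfile path and image tag are illustrative):
#   git clone https://github.com/vllm-project/vllm.git && cd vllm
#   docker build -f /path/to/this/Dockerfile -t vllm-gpt-oss .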
ARG CCACHE_NOHASHDIR="true"
COPY . /tmp
WORKDIR /tmp
RUN pip install uv
RUN uv pip install --system --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu128
RUN uv pip install --system "transformers[torch]"
RUN python3 use_existing_torch.py
RUN uv pip install --system -r requirements/build.txt
RUN uv pip install --system --no-build-isolation -e . -v
RUN uv pip uninstall --system triton pytorch-triton
RUN uv pip install --system triton==3.4.0 openai_harmony mcp
RUN git clone https://github.com/openai/triton.git
RUN uv pip install --system -e triton/python/triton_kernels --no-deps
# Run
ENV VLLM_ATTENTION_BACKEND=TRITON_ATTN_VLLM_V1
ENTRYPOINT ["vllm"]
Yes, the problem was solved after removing the "--enable-expert-parallel" parameter.
But I have a new problem; maybe you know how to solve this?
(VllmWorker TP0 pid=44) INFO 08-10 09:49:24 [multiproc_executor.py:520] Parent process exited, terminating worker
(VllmWorker TP1 pid=45) INFO 08-10 09:49:24 [multiproc_executor.py:520] Parent process exited, terminating worker
(VllmWorker TP2 pid=46) INFO 08-10 09:49:24 [multiproc_executor.py:520] Parent process exited, terminating worker
(VllmWorker TP3 pid=47) INFO 08-10 09:49:24 [multiproc_executor.py:520] Parent process exited, terminating worker
(APIServer pid=1) Traceback (most recent call last):
(APIServer pid=1)   File "<frozen runpy>", line 198, in _run_module_as_main
(APIServer pid=1)   File "<frozen runpy>", line 88, in _run_code
(APIServer pid=1)   File "/tmp/vllm/entrypoints/openai/api_server.py", line 1895, in <module>
(APIServer pid=1)     uvloop.run(run_server(args))
(APIServer pid=1)   File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 109, in run
(APIServer pid=1)     return __asyncio.run(
(APIServer pid=1)            ^^^^^^^^^^^^^^
(APIServer pid=1)   File "/usr/lib/python3.12/asyncio/runners.py", line 195, in run
(APIServer pid=1)     return runner.run(main)
(APIServer pid=1)            ^^^^^^^^^^^^^^^^
(APIServer pid=1)   File "/usr/lib/python3.12/asyncio/runners.py", line 118, in run
(APIServer pid=1)     return self._loop.run_until_complete(task)
(APIServer pid=1)            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=1)   File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete
(APIServer pid=1)   File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 61, in wrapper
(APIServer pid=1)     return await main
(APIServer pid=1)            ^^^^^^^^^^
(APIServer pid=1)   File "/tmp/vllm/entrypoints/openai/api_server.py", line 1827, in run_server
(APIServer pid=1)     await run_server_worker(listen_address, sock, args, **uvicorn_kwargs)
(APIServer pid=1)   File "/tmp/vllm/entrypoints/openai/api_server.py", line 1855, in run_server_worker
(APIServer pid=1)     await init_app_state(engine_client, vllm_config, app.state, args)
(APIServer pid=1)   File "/tmp/vllm/entrypoints/openai/api_server.py", line 1657, in init_app_state
(APIServer pid=1)     state.openai_serving_responses = OpenAIServingResponses(
(APIServer pid=1)                                      ^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=1)   File "/tmp/vllm/entrypoints/openai/serving_responses.py", line 130, in __init__
(APIServer pid=1)     get_stop_tokens_for_assistant_actions())
(APIServer pid=1)     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=1)   File "/tmp/vllm/entrypoints/harmony_utils.py", line 187, in get_stop_tokens_for_assistant_actions
(APIServer pid=1)     return get_encoding().stop_tokens_for_assistant_actions()
(APIServer pid=1)            ^^^^^^^^^^^^^^
(APIServer pid=1)   File "/tmp/vllm/entrypoints/harmony_utils.py", line 37, in get_encoding
(APIServer pid=1)     _harmony_encoding = load_harmony_encoding(
(APIServer pid=1)                         ^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=1)   File "/usr/local/lib/python3.12/dist-packages/openai_harmony/__init__.py", line 689, in load_harmony_encoding
(APIServer pid=1)     inner: _PyHarmonyEncoding = _load_harmony_encoding(name)
(APIServer pid=1)                                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=1) openai_harmony.HarmonyError: error downloading or loading vocab file: failed to download or load vocab file
I think I got something like this when I tried to use a parallelism mode other than tensor parallelism; I believe only TP is supported right now.
Please check my command line posted above and see what you are doing differently.
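For reference, a minimal sketch of a TP-only invocation (model and TP size are illustrative; note the absence of --enable-expert-parallel):

vllm serve openai/gpt-oss-20b --tensor-parallel-size 4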