Merge branch 'mxfp4'
This commit is contained in:
@@ -99,7 +99,7 @@ ARG FLASHINFER_SHA=f349e52496a72a00d8c4ac02c7a1e38523ff7194
|
|||||||
ARG CUTLASS_SHA=11af7f02ab52c9130e422eeb4b44042fbd60c083
|
ARG CUTLASS_SHA=11af7f02ab52c9130e422eeb4b44042fbd60c083
|
||||||
|
|
||||||
RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
|
RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
|
||||||
uv pip install nvidia-nvshmem-cu13
|
uv pip install nvidia-nvshmem-cu13 "apache-tvm-ffi<0.2"
|
||||||
|
|
||||||
# Clone FlashInfer (cached for faster rebuilds)
|
# Clone FlashInfer (cached for faster rebuilds)
|
||||||
RUN --mount=type=cache,id=git-flashinfer,target=/git-cache/flashinfer \
|
RUN --mount=type=cache,id=git-flashinfer,target=/git-cache/flashinfer \
|
||||||
@@ -144,18 +144,18 @@ WORKDIR /workspace/flashinfer
|
|||||||
RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
|
RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
|
||||||
--mount=type=cache,id=ccache,target=/root/.ccache \
|
--mount=type=cache,id=ccache,target=/root/.ccache \
|
||||||
sed -i -e 's/license = "Apache-2.0"/license = { text = "Apache-2.0" }/' -e '/license-files/d' pyproject.toml && \
|
sed -i -e 's/license = "Apache-2.0"/license = { text = "Apache-2.0" }/' -e '/license-files/d' pyproject.toml && \
|
||||||
uv pip install --no-build-isolation . -v
|
uv build --no-build-isolation --wheel . --out-dir=/workspace/wheels -v
|
||||||
|
|
||||||
# flashinfer-cubin
|
# flashinfer-cubin
|
||||||
RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
|
RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
|
||||||
--mount=type=cache,id=ccache,target=/root/.ccache \
|
--mount=type=cache,id=ccache,target=/root/.ccache \
|
||||||
cd flashinfer-cubin && uv pip install --no-build-isolation . -v
|
cd flashinfer-cubin && uv build --no-build-isolation --wheel . --out-dir=/workspace/wheels -v
|
||||||
|
|
||||||
# flashinfer-jit-cache
|
# flashinfer-jit-cache
|
||||||
RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
|
RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
|
||||||
--mount=type=cache,id=ccache,target=/root/.ccache \
|
--mount=type=cache,id=ccache,target=/root/.ccache \
|
||||||
cd flashinfer-jit-cache && \
|
cd flashinfer-jit-cache && \
|
||||||
uv pip install --no-build-isolation . -v
|
uv build --no-build-isolation --wheel . --out-dir=/workspace/wheels -v
|
||||||
|
|
||||||
# --- VLLM SOURCE CACHE BUSTER ---
|
# --- VLLM SOURCE CACHE BUSTER ---
|
||||||
# Change THIS argument to force a fresh git clone and rebuild of vLLM
|
# Change THIS argument to force a fresh git clone and rebuild of vLLM
|
||||||
@@ -218,7 +218,7 @@ RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
|
|||||||
# across totally separate `docker build` invocations.
|
# across totally separate `docker build` invocations.
|
||||||
RUN --mount=type=cache,id=ccache,target=/root/.ccache \
|
RUN --mount=type=cache,id=ccache,target=/root/.ccache \
|
||||||
--mount=type=cache,id=uv-cache,target=/root/.cache/uv \
|
--mount=type=cache,id=uv-cache,target=/root/.cache/uv \
|
||||||
uv pip install --no-build-isolation . -v
|
uv build --no-build-isolation --wheel . --out-dir=/workspace/wheels -v
|
||||||
|
|
||||||
# =========================================================
|
# =========================================================
|
||||||
# STAGE 4: Runner (Transfers only necessary artifacts)
|
# STAGE 4: Runner (Transfers only necessary artifacts)
|
||||||
@@ -256,8 +256,9 @@ RUN mkdir -p tiktoken_encodings && \
|
|||||||
# Copy artifacts from Builder Stage
|
# Copy artifacts from Builder Stage
|
||||||
# We copy the python packages and executables
|
# We copy the python packages and executables
|
||||||
# No need to copy source code, as it's already in the site-packages
|
# No need to copy source code, as it's already in the site-packages
|
||||||
COPY --from=builder /usr/local/lib/python3.12/dist-packages /usr/local/lib/python3.12/dist-packages
|
COPY --from=builder /workspace/wheels /workspace/wheels
|
||||||
COPY --from=builder /usr/local/bin /usr/local/bin
|
RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
|
||||||
|
uv pip install /workspace/wheels/*.whl
|
||||||
|
|
||||||
# Setup Env for Runtime
|
# Setup Env for Runtime
|
||||||
ENV TORCH_CUDA_ARCH_LIST="12.0;12.1"
|
ENV TORCH_CUDA_ARCH_LIST="12.0;12.1"
|
||||||
@@ -272,4 +273,4 @@ RUN chmod +x $VLLM_BASE_DIR/run-cluster-node.sh
|
|||||||
|
|
||||||
# Final extra deps
|
# Final extra deps
|
||||||
RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
|
RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
|
||||||
uv pip install ray[default]
|
# Quote the extras spec so the shell cannot treat [default] as a glob pattern.
uv pip install "ray[default]" fastsafetensors
|
||||||
|
|||||||
@@ -146,9 +146,11 @@ For periodic maintenance, I recommend using a filter: `docker builder prune --fi
|
|||||||
|
|
||||||
### 2026-01-29
|
### 2026-01-29
|
||||||
|
|
||||||
|
Added a `-e` / `--env` parameter to `launch-cluster.sh` to pass environment variables to the container.
|
||||||
|
|
||||||
Added an experimental build option, optimized for DGX Spark and gpt-oss models by [Christopher Owen](https://github.com/christopherowen/spark-vllm-mxfp4-docker/blob/main/Dockerfile).
|
Added an experimental build option, optimized for DGX Spark and gpt-oss models by [Christopher Owen](https://github.com/christopherowen/spark-vllm-mxfp4-docker/blob/main/Dockerfile).
|
||||||
|
|
||||||
It is currently the fastest way to run GPT-OSS on DGX Spark, achieving 60 t/s on a single Spark and 75 t/s on dual Sparks.
|
It is currently the fastest way to run GPT-OSS on DGX Spark, achieving 60 t/s on a single Spark.
|
||||||
|
|
||||||
To use this build, first build the container with `--exp-mxfp4` flag. I recommend using a separate label as it is currently not recommended to use this build for models other than gpt-oss:
|
To use this build, first build the container with `--exp-mxfp4` flag. I recommend using a separate label as it is currently not recommended to use this build for models other than gpt-oss:
|
||||||
|
|
||||||
@@ -521,7 +523,7 @@ The script attempts to automatically detect:
|
|||||||
You can override the auto-detected values if needed:
|
You can override the auto-detected values if needed:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
./launch-cluster.sh --nodes "10.0.0.1,10.0.0.2" --eth-if enp1s0f1np1 --ib-if rocep1s0f1
|
./launch-cluster.sh --nodes "10.0.0.1,10.0.0.2" --eth-if enp1s0f1np1 --ib-if rocep1s0f1 -e MY_ENV=123
|
||||||
```
|
```
|
||||||
|
|
||||||
| Flag | Description |
|
| Flag | Description |
|
||||||
@@ -531,6 +533,7 @@ You can override the auto-detected values if needed:
|
|||||||
| `--name` | Container name (default: `vllm_node`). |
|
| `--name` | Container name (default: `vllm_node`). |
|
||||||
| `--eth-if` | Ethernet interface name. |
|
| `--eth-if` | Ethernet interface name. |
|
||||||
| `--ib-if` | InfiniBand interface name. |
|
| `--ib-if` | InfiniBand interface name. |
|
||||||
|
| `-e, --env` | Environment variable to pass to the container (e.g. `-e VAR=val`). Can be used multiple times. |
|
||||||
| `--apply-mod` | Apply mods/patches from specified directory. Can be used multiple times to apply multiple mods. |
|
| `--apply-mod` | Apply mods/patches from specified directory. Can be used multiple times to apply multiple mods. |
|
||||||
| `--nccl-debug` | NCCL debug level (e.g., INFO, WARN). Defaults to INFO if flag is present but value is omitted. |
|
| `--nccl-debug` | NCCL debug level (e.g., INFO, WARN). Defaults to INFO if flag is present but value is omitted. |
|
||||||
| `--check-config` | Check configuration and auto-detection without launching. |
|
| `--check-config` | Check configuration and auto-detection without launching. |
|
||||||
|
|||||||
@@ -35,6 +35,7 @@ usage() {
|
|||||||
echo " --name Container name (Optional, default: $DEFAULT_CONTAINER_NAME)"
|
echo " --name Container name (Optional, default: $DEFAULT_CONTAINER_NAME)"
|
||||||
echo " --eth-if Ethernet interface (Optional, auto-detected)"
|
echo " --eth-if Ethernet interface (Optional, auto-detected)"
|
||||||
echo " --ib-if InfiniBand interface (Optional, auto-detected)"
|
echo " --ib-if InfiniBand interface (Optional, auto-detected)"
|
||||||
|
echo " -e, --env Environment variable to pass to container (e.g. -e VAR=val)"
|
||||||
echo " --nccl-debug NCCL debug level (Optional, one of: VERSION, WARN, INFO, TRACE). If no level is provided, defaults to INFO."
|
echo " --nccl-debug NCCL debug level (Optional, one of: VERSION, WARN, INFO, TRACE). If no level is provided, defaults to INFO."
|
||||||
echo " --apply-mod Path to directory or zip file containing run.sh to apply before launch (Can be specified multiple times)"
|
echo " --apply-mod Path to directory or zip file containing run.sh to apply before launch (Can be specified multiple times)"
|
||||||
echo " --check-config Check configuration and auto-detection without launching"
|
echo " --check-config Check configuration and auto-detection without launching"
|
||||||
@@ -52,6 +53,7 @@ while [[ "$#" -gt 0 ]]; do
|
|||||||
--name) CONTAINER_NAME="$2"; shift ;;
|
--name) CONTAINER_NAME="$2"; shift ;;
|
||||||
--eth-if) ETH_IF="$2"; shift ;;
|
--eth-if) ETH_IF="$2"; shift ;;
|
||||||
--ib-if) IB_IF="$2"; shift ;;
|
--ib-if) IB_IF="$2"; shift ;;
|
||||||
|
-e|--env)
    # Forward one environment variable (e.g. VAR=val) to the container by
    # appending a `-e` option to DOCKER_ARGS. May be given multiple times.
    # Fail fast when the value is missing; otherwise a bare `-e` with no
    # value would be appended to DOCKER_ARGS.
    # NOTE(review): DOCKER_ARGS is a flat string, so a value containing
    # whitespace will presumably be word-split where it is expanded —
    # confirm at the call site.
    if [[ -z "${2:-}" ]]; then
        echo "Error: $1 requires an argument (e.g. $1 VAR=val)" >&2
        exit 1
    fi
    DOCKER_ARGS="$DOCKER_ARGS -e $2"; shift ;;
|
||||||
--apply-mod) MOD_PATHS+=("$2"); shift ;;
|
--apply-mod) MOD_PATHS+=("$2"); shift ;;
|
||||||
--nccl-debug)
|
--nccl-debug)
|
||||||
if [[ -n "$2" && "$2" =~ ^(VERSION|WARN|INFO|TRACE)$ ]]; then
|
if [[ -n "$2" && "$2" =~ ^(VERSION|WARN|INFO|TRACE)$ ]]; then
|
||||||
|
|||||||
Reference in New Issue
Block a user