Refactoring, updated README

2026-02-18 15:58:53 -08:00
parent 8873a0d959
commit f09c2c3ac8
4 changed files with 103 additions and 206 deletions
--- a/18
+++ b/18
@@ -173,18 +173,12 @@ RUN if [ -n "$VLLM_PRS" ]; then \
        done; \
    fi

-ARG PRE_TRANSFORMERS=0
-
 # Prepare build requirements
 RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
    python3 use_existing_torch.py && \
    sed -i "/flashinfer/d" requirements/cuda.txt && \
    sed -i '/^triton\b/d' requirements/test.txt && \
    sed -i '/^fastsafetensors\b/d' requirements/test.txt && \
-    if [ "$PRE_TRANSFORMERS" = "1" ]; then \
-        sed -i '/^transformers\b/d' requirements/common.txt; \
-        sed -i '/^transformers\b/d' requirements/test.txt; \
-    fi && \
    uv pip install -r requirements/build.txt

 # Apply Patches
@@ -247,15 +241,17 @@ RUN mkdir -p tiktoken_encodings && \
    wget -O tiktoken_encodings/o200k_base.tiktoken "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken" && \
    wget -O tiktoken_encodings/cl100k_base.tiktoken "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken"

+ARG PRE_TRANSFORMERS=0
+
 # Install wheels from host ./wheels/ (bind-mounted from build context — no layer bloat)
+# With PRE_TRANSFORMERS=1 (--tf5 in build-and-copy.sh): override vLLM's transformers<5 constraint to get transformers>=5
 RUN --mount=type=bind,source=wheels,target=/workspace/wheels \
    --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
-    uv pip install /workspace/wheels/*.whl
-
-ARG PRE_TRANSFORMERS=0
-RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
    if [ "$PRE_TRANSFORMERS" = "1" ]; then \
-        uv pip install -U transformers --pre; \
+        echo "transformers>=5.0.0" > /tmp/tf-override.txt && \
+        uv pip install /workspace/wheels/*.whl --override /tmp/tf-override.txt; \
+    else \
+        uv pip install /workspace/wheels/*.whl; \
    fi

 # Setup environment for runtime
--- a/Dockerfile.wheels
+++ b/Dockerfile.wheels
@@ -1,107 +0,0 @@
-# syntax=docker/dockerfile:1.6
-
-FROM nvidia/cuda:13.1.1-devel-ubuntu24.04
-
-ENV DEBIAN_FRONTEND=noninteractive
-ENV PIP_BREAK_SYSTEM_PACKAGES=1
-ENV VLLM_BASE_DIR=/workspace/vllm
-
-# Just in case if some JIT compilation happens during runtime
-# Limit build parallelism to reduce OOM situations
-ARG BUILD_JOBS=16
-ENV MAX_JOBS=${BUILD_JOBS}
-ENV CMAKE_BUILD_PARALLEL_LEVEL=${BUILD_JOBS}
-ENV NINJAFLAGS="-j${BUILD_JOBS}"
-ENV MAKEFLAGS="-j${BUILD_JOBS}"
-
-# Set pip cache directory
-ENV PIP_CACHE_DIR=/root/.cache/pip
-ENV UV_CACHE_DIR=/root/.cache/uv
-ENV UV_SYSTEM_PYTHON=1
-ENV UV_LINK_MODE=copy
-ENV UV_BREAK_SYSTEM_PACKAGES=1
-
-# Install minimal runtime dependencies (NCCL, Python)
-# Note: "devel" tools like cmake/gcc are NOT installed here to save space
-RUN apt update && apt upgrade -y \
-    && apt install -y --allow-change-held-packages --no-install-recommends \
-    python3 python3-pip python3-dev vim curl git wget jq \
-    libcudnn9-cuda-13 \
-    libnccl-dev libnccl2 libibverbs1 libibverbs-dev rdma-core \
-    libxcb1 \
-    && rm -rf /var/lib/apt/lists/* \
-    && pip install uv
-
-# Set final working directory
-WORKDIR $VLLM_BASE_DIR
-
-# Download Tiktoken files
-RUN mkdir -p tiktoken_encodings && \
-    wget -O tiktoken_encodings/o200k_base.tiktoken "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken" && \
-    wget -O tiktoken_encodings/cl100k_base.tiktoken "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken"
-
-# Cache TEMPORARY PATCH for fastsafetensors loading in cluster setup - tracking https://github.com/vllm-project/vllm/issues/34180
-# COPY fastsafetensors.patch .
-
-# Install fastsafetensors
-RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
-    uv pip install -U fastsafetensors
-
-# --- VLLM SOURCE CACHE BUSTER ---
-# Change THIS argument to force a fresh git clone and rebuild of vLLM
-# without re-installing the dependencies above.
-ARG CACHEBUST_VLLM=1
-ARG WHEELS_FROM_GITHUB_RELEASE=0
-
-# Install vLLM
-# If INSTALL_FROM_GITHUB_RELEASE is 1, install from GitHub releases (specific for aarch64/cu130 as requested)
-# Otherwise, install from nightly wheels
-RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
-    if [ "$WHEELS_FROM_GITHUB_RELEASE" = "1" ]; then \
-        export VLLM_VERSION=$(curl -s https://api.github.com/repos/vllm-project/vllm/releases/latest | jq -r .tag_name | sed 's/^v//') && \
-        uv pip install -U https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu130-cp38-abi3-manylinux_2_35_aarch64.whl --torch-backend=auto; \
-    else \
-        uv pip install -U vllm \
-        --torch-backend=auto \
-        --extra-index-url https://wheels.vllm.ai/nightly/cu130; \
-    fi
-
-# Apply TEMPORARY PATCH for fastsafetensors loading in cluster setup - tracking https://github.com/vllm-project/vllm/issues/34180
-# Apply in site-packages
-# RUN if patch -p1 --dry-run --reverse -d /usr/local/lib/python3.12/dist-packages < ${VLLM_BASE_DIR}/fastsafetensors.patch &>/dev/null; then \
-#         echo "PR #34180 is already applied"; \
-#     else \
-#         patch -p1 -d /usr/local/lib/python3.12/dist-packages < ${VLLM_BASE_DIR}/fastsafetensors.patch; \
-#     fi
-
-ARG FLASHINFER_PRE=""
-
-# Install flashinfer helper packages
-RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
-    uv pip install ${FLASHINFER_PRE} flashinfer-python -U --no-deps --index-url https://flashinfer.ai/whl && \
-    uv pip install ${FLASHINFER_PRE} flashinfer-cubin --index-url https://flashinfer.ai/whl && \
-    uv pip install ${FLASHINFER_PRE} flashinfer-jit-cache --index-url https://flashinfer.ai/whl/cu130
-
-ARG PRE_TRANSFORMERS=0
-RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
-    if [ "$PRE_TRANSFORMERS" = "1" ]; then \
-        uv pip install -U transformers --pre; \
-        uv pip install numpy==2.2.6; \
-    fi
-
-# Setup Env for Runtime
-ARG TORCH_CUDA_ARCH_LIST="12.1a"
-ENV TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}
-ARG FLASHINFER_CUDA_ARCH_LIST="12.1a"
-ENV FLASHINFER_CUDA_ARCH_LIST=${FLASHINFER_CUDA_ARCH_LIST}
-ENV TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas
-ENV TIKTOKEN_ENCODINGS_BASE=$VLLM_BASE_DIR/tiktoken_encodings
-
-# Copy scripts
-COPY run-cluster-node.sh $VLLM_BASE_DIR/
-RUN chmod +x $VLLM_BASE_DIR/run-cluster-node.sh
-
-# Final extra deps
-RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
-    uv pip install ray[default]
-
--- a/README.md
+++ b/README.md
@@ -26,7 +26,7 @@ While it was primarily developed to support multi-node inference, it works just

 This repository is not affiliated with NVIDIA or their subsidiaries. This is a community effort aimed to help DGX Spark users to set up and run the most recent versions of vLLM on Spark cluster or single nodes. 

-The Dockerfile builds from the main branch of VLLM, so depending on when you run the build process, it may not be in fully functioning state. You can target a specific vLLM release by setting `--vllm-ref` parameter or use `--use-wheels release` to install pre-built release wheels.
+The Dockerfile builds from the main branch of VLLM, so depending on when you run the build process, it may not be in fully functioning state. You can target a specific vLLM release by setting `--vllm-ref` parameter.

 ## QUICK START

@@ -41,26 +41,6 @@ cd spark-vllm-docker

 Build the container.

-**ATTENTION!** 
-
-As of February 9th, 2026, wheels build is no longer recommended way to build the container due to a lack of optimizations present in the source build.
-If you still want to use wheels build, please see a note below:
-
-If you are getting the following error (or similar) when building from wheels, you need to build the image from the source instead of using pre-built wheels. To do it, just remove `--use-wheels` parameter from the build command:
-
-```
-0.181 Using Python 3.12.3 environment at: /usr
-0.559   × No solution found when resolving dependencies:
-0.559   ╰─▶ Because only vllm==0.15.0rc2.dev49+g59bcc5b6f.cu130 is available and
-0.559       vllm==0.15.0rc2.dev49+g59bcc5b6f.cu130 has no wheels with a matching
-0.559       platform tag (e.g., `manylinux_2_39_aarch64`), we can conclude that all
-0.559       versions of vllm cannot be used.
-0.559       And because you require vllm, we can conclude that your requirements
-0.559       are unsatisfiable.
-```
-
-This error happens if vLLM nightly build fails for aarch64 platform, but succeeds for x86-64. You can check the status of vLLM nightly wheels at https://wheels.vllm.ai/nightly/cu130/vllm/
-
 **If you have only one DGX Spark:**

 ```bash
@@ -78,7 +58,7 @@ Then run the following command that will build and distribute image across the c
 ./build-and-copy.sh -c
 ```

-An initial build will take around 30 minutes, but subsequent builds will be faster. You can also use precompiled wheels which significantly speed up the build, but source build is recommended because it uses components specifically compiled for DGX Spark.
+An initial build will take around 20-30 minutes, but subsequent builds will be faster. Precompiled vLLM wheels for DGX Spark will also be available soon.

 ### Run

@@ -142,6 +122,8 @@ This will run the model on all available cluster nodes.

 **NOTE:** do not use `--load-format fastsafetensors` if you are loading models that would take >0.8 of available RAM (without KV cache) as it may result in out of memory situation.

+**Also:** You can use any vLLM container that has "bash" as its default entrypoint with the launch script. It was tested with the NGC vLLM container, but may work with others too. To use such a container in the cluster, pass the `--apply-mod use-ngc-vllm` argument to `./launch-cluster.sh`. However, building the container from this repository is recommended for the best compatibility and the most up-to-date features.
+
 ## CHANGELOG

 **IMPORTANT**
@@ -164,6 +146,26 @@ Don't do it every time you rebuild, because it will slow down compilation times.

 For periodic maintenance, I recommend using a filter: `docker builder prune --filter until=72h`

+### 2026-02-18
+
+#### Completely Redesigned Build Process
+
+`build-and-copy.sh` now automatically downloads prebuilt FlashInfer wheels from the [GitHub releases](https://github.com/eugr/spark-vllm-docker/releases/tag/prebuilt-flashinfer-current) before falling back to a local build. This eliminates the need to compile FlashInfer from source on first use, which typically takes around 20 minutes.
+
+The download logic:
+- If prebuilt wheels are available and newer than any locally cached version, they are downloaded automatically.
+- If the download fails (e.g. no network, release not found, or the GPU architecture is not compatible), the script falls back to building locally, or reuses existing local wheels if present.
+- `--rebuild-flashinfer` skips the download entirely and forces a fresh local build.
+
+No new flags are required - the download happens transparently unless `--rebuild-flashinfer` is specified.
+
+All wheels (downloaded or built locally) are cached in the `./wheels` directory for subsequent reuse.
+
+- `--rebuild-flashinfer` will force a FlashInfer rebuild from the FlashInfer `main` branch.
+- `--rebuild-vllm` will force a vLLM rebuild from the vLLM `main` branch or from the specific commit given in `--vllm-ref`.
+
+Please note that specifying `--vllm-ref` or `--apply-vllm-pr` forces a vLLM rebuild every time.
+
 ### 2026-02-17

 #### Non-Privileged Mode Support
@@ -183,6 +185,30 @@ Example usage:

 May result in a slightly reduced performance (within 2%) in exchange for better reliability and stability.

+#### Qwen3-Coder-Next recipe update
+
+Updated `qwen3-coder-next-fp8` recipe: KV cache type changed to `fp8` and maximum context length reduced to 131072 tokens to reliably fit within a single Spark's memory.
+
+### 2026-02-16
+
+#### MiniMax M2.5 AWQ recipe
+
+Added a new recipe `minimax-m2.5-awq` for running MiniMax-Text-01-AWQ (M2.5). Usage:
+
+```bash
+./run-recipe.sh minimax-m2.5-awq
+```
+
+#### GLM-4.7-Flash-AWQ mod extended with vLLM crash fix
+
+The `fix-glm-4.7-flash-AWQ` mod now also applies the fix from [PR #34695](https://github.com/vllm-project/vllm/pull/34695), which addresses a crash in `mla_attention.py` when running GLM models with AWQ quantization. The patch is applied automatically alongside the existing speed fix, and is skipped if it has already been merged into the installed vLLM version.
+
+### 2026-02-13
+
+#### FlashInfer cubin caching
+
+FlashInfer cubins (pre-compiled GPU kernels) are now cached via a Docker bind mount and reused across rebuilds. Previously, all cubins were recompiled from scratch on every FlashInfer rebuild even if unchanged. This significantly reduces FlashInfer rebuild times when only minor source changes are made.
+
 ### 2026-02-12

 Added a mod for Qwen3-Coder-Next-FP8 that fixes:
@@ -553,43 +579,11 @@ Applied patch to enable FastSafeTensors in cluster configuration (EXPERIMENTAL)

 ### Building Manually

-The Dockerfile includes specific **Build Arguments** to allow you to selectively rebuild layers (e.g., update the vLLM source code without re-downloading PyTorch).
-Using a provided build script is recommended, but if you want to build using `docker build` command, here are the supported build arguments:
+Building the container manually is no longer supported due to Dockerfile complexity. Please use the provided build script.

-| Argument | Default | Description |
-| :--- | :--- | :--- |
-| `CACHEBUST_DEPS` | `1` | Change this to force a re-download of PyTorch, FlashInfer, and system dependencies. |
-| `CACHEBUST_VLLM` | `1` | Change this to force a fresh git clone and rebuild of vLLM source code. |
-| `TRITON_REF` | `v3.6.0` | Triton commit SHA, branch, or tag to build - currently ignored. |
-| `VLLM_REF` | `main` | vLLM commit SHA, branch, or tag to build. |
-| `TORCH_CUDA_ARCH_LIST` | `12.1a` | Target GPU architecture list for PyTorch. |
-| `FLASHINFER_CUDA_ARCH_LIST` | `12.1a` | Target GPU architecture list for FlashInfer. |
-| `BUILD_JOBS` | `16` | Number of parallel build jobs (default: 16). |
-| `FLASHINFER_PRE` | `""` | Set to `--pre` to use pre-release versions of FlashInfer. |
-| `PRE_TRANSFORMERS` | `0` | Set to `1` to install pre-release transformers (5.0.0rc or higher). |
+### Using the Build Script

-### Building Manually using Wheels
-
-If you prefer to use pre-built wheels (faster build, smaller image), you can use `Dockerfile.wheels`.
-
-```bash
-docker build -f Dockerfile.wheels -t vllm-node .
-```
-
-Supported build arguments for `Dockerfile.wheels`:
-
-| Argument | Default | Description |
-| :--- | :--- | :--- |
-| `BUILD_JOBS` | `16` | Number of parallel build jobs (default: 16). |
-| `CACHEBUST_VLLM` | `1` | Change this to force a re-download of vLLM wheels. |
-| `WHEELS_FROM_GITHUB_RELEASE` | `0` | Set to `1` to use GitHub release wheels instead of nightly wheels. |
-| `FLASHINFER_PRE` | `""` | Set to `--pre` to use pre-release versions of FlashInfer. |
-| `PRE_TRANSFORMERS` | `0` | Set to `1` to install pre-release transformers (5.0.0rc or higher). |
-| `TORCH_CUDA_ARCH_LIST` | `12.1a` | Target GPU architecture list. |
-
-### Using the Build Script (Recommended)
-
-The `build-and-copy.sh` script automates the build process and optionally copies the image to one or more nodes. This is the recommended method for building and deploying to multiple Spark nodes.
+The `build-and-copy.sh` script automates the build process and optionally copies the image to one or more nodes. This is the officially supported method for building and deploying to multiple Spark nodes.

 **Basic usage (build only):**

@@ -600,7 +594,7 @@ The `build-and-copy.sh` script automates the build process and optionally copies
 **Build with a custom tag:**

 ```bash
-./build-and-copy.sh --tag my-vllm-node
+./build-and-copy.sh -t my-vllm-node
 ```

 **Build and copy to Spark node(s):**
@@ -637,29 +631,24 @@ Using a different username:
 ./build-and-copy.sh --copy-to 192.168.177.12 --user your_username
 ```

-**Force rebuild vLLM source only:**
+**Force rebuild vLLM from source:**

 ```bash
 ./build-and-copy.sh --rebuild-vllm
 ```

-**Force rebuild all dependencies:**
+**Force rebuild FlashInfer from source (skips prebuilt wheel download):**

 ```bash
-./build-and-copy.sh --rebuild-deps
+./build-and-copy.sh --rebuild-flashinfer
 ```

 **Combined example (rebuild vLLM and copy to another node):**

 ```bash
-./build-and-copy.sh --rebuild-vllm --copy-to 192.168.177.12
+./build-and-copy.sh --rebuild-vllm -c 192.168.177.12
 ```

-**Build with specific Triton commit:**
-
-```bash
-./build-and-copy.sh --triton-ref abc123def456
-```
 **Build for specific GPU architecture:**

 ```bash
@@ -676,26 +665,24 @@ Using a different username:

 | Flag | Description |
 | :--- | :--- |
-| `-t, --tag <tag>` | Image tag (default: 'vllm-node') |
-| `--gpu-arch <arch>` | Target GPU architecture (default: '12.1a') |
-| `-t, --tag <tag>` | Image tag (default: 'vllm-node') |
-| `--rebuild-deps` | Force rebuild all dependencies (sets CACHEBUST_DEPS) |
-| `--rebuild-vllm` | Force rebuild vLLM source only (sets CACHEBUST_VLLM) |
-| `--triton-ref <ref>` | Triton commit SHA, branch or tag (default: 'v3.5.1') |
-| `--vllm-ref <ref>` | vLLM commit SHA, branch or tag (default: 'main') |
-| `--pre-tf` | Install pre-release transformers (5.0.0rc or higher). Alias: `--pre-transformers`. |
+| `-t, --tag <tag>` | Image tag (default: `vllm-node`) |
+| `--gpu-arch <arch>` | Target GPU architecture (default: `12.1a`) |
+| `--rebuild-flashinfer` | Skip prebuilt wheel download; force a fresh local FlashInfer build |
+| `--rebuild-vllm` | Force rebuild vLLM from source |
+| `--vllm-ref <ref>` | vLLM commit SHA, branch or tag (default: `main`) |
+| `--apply-vllm-pr <pr-num>` | Apply a vLLM PR patch during build. Can be specified multiple times. |
+| `--tf5` | Install transformers v5 (5.0.0 or higher). Aliases: `--pre-tf`, `--pre-transformers`. |
 | `--exp-mxfp4` | Build with experimental native MXFP4 support. Alias: `--experimental-mxfp4`. |
-| `--use-wheels [mode]` | Use pre-built vLLM wheels. Mode: `nightly` (default) or `release`. |
-| `--pre-flashinfer` | Use pre-release versions of FlashInfer. |
-| `-c, --copy-to <host[,host...] or host host...>` | Host(s) to copy the image to after building (space- or comma-separated list after the flag). |
+| `-c, --copy-to <hosts>` | Host(s) to copy the image to after building (space- or comma-separated). |
 | `--copy-to-host` | Alias for `--copy-to` (backwards compatibility). |
 | `--copy-parallel` | Copy to all specified hosts concurrently. |
-| `-j, --build-jobs <jobs>` | Number of parallel build jobs (default: Dockerfile default) |
+| `-j, --build-jobs <jobs>` | Number of parallel build jobs (default: 16) |
 | `-u, --user <user>` | Username for SSH connection (default: current user) |
+| `--full-log` | Enable full Docker build output (`--progress=plain`) |
 | `--no-build` | Skip building, only copy existing image (requires `--copy-to`) |
 | `-h, --help` | Show help message |

-**IMPORTANT**: When copying to another node, make sure you use the Spark IP assigned to its ConnectX 7 interface (enp1s0f1np1), and not the 10G interface (enP7s7)!
+**IMPORTANT**: When copying to another node, make sure you use the Spark IP assigned to its ConnectX 7 interface (enp1s0f1np1), and not the 10G interface (enP7s7)! If you use `-c` without specifying any addresses, the script will use autodiscovery to detect the proper IP address.

 ### Copying the container to another Spark node (Manual Method)

--- a/build-and-copy.sh
+++ b/build-and-copy.sh
@@ -23,6 +23,8 @@ BUILD_JOBS="16"
 GPU_ARCH_LIST="12.1a"
 WHEELS_REPO="eugr/spark-vllm-docker"
 FLASHINFER_RELEASE_TAG="prebuilt-flashinfer-current"
+# Space-separated list of GPU architectures for which prebuilt wheels are available
+PREBUILT_WHEELS_SUPPORTED_ARCHS="12.1a"

 cleanup() {
    if [ -n "$TMP_IMAGE" ] && [ -f "$TMP_IMAGE" ]; then
@@ -70,6 +72,16 @@ try_download_wheels() {
    local PREFIX="$2"
    local WHEELS_DIR="./wheels"

+    local arch
+    for arch in $PREBUILT_WHEELS_SUPPORTED_ARCHS; do
+        [ "$arch" = "$GPU_ARCH_LIST" ] && break
+        arch=""
+    done
+    if [ -z "$arch" ]; then
+        echo "GPU arch '$GPU_ARCH_LIST' not supported by prebuilt wheels (supported: $PREBUILT_WHEELS_SUPPORTED_ARCHS) — skipping download."
+        return 1
+    fi
+
    local RELEASE_JSON
    RELEASE_JSON=$(curl -sf --connect-timeout 10 \
        "https://api.github.com/repos/$WHEELS_REPO/releases/tags/$TAG") || {
@@ -134,7 +146,7 @@ usage() {
    echo "      --copy-parallel           : Copy to all hosts in parallel instead of serially."
    echo "  -j, --build-jobs <jobs>       : Number of concurrent build jobs (default: ${BUILD_JOBS})"
    echo "  -u, --user <user>             : Username for ssh command (default: \$USER)"
-    echo "  --pre-tf, --pre-transformers  : Install transformers 5.0.0rc0 or higher"
+    echo "  --tf5                         : Install transformers>=5 (aliases: --pre-tf, --pre-transformers)"
    echo "  --exp-mxfp4, --experimental-mxfp4 : Build with experimental native MXFP4 support"
    echo "  --apply-vllm-pr <pr-num>      : Apply a specific PR patch to vLLM source. Can be specified multiple times."
    echo "  --full-log                    : Enable full build logging (--progress=plain)"
@@ -183,7 +195,7 @@ while [[ "$#" -gt 0 ]]; do
        -j|--build-jobs) BUILD_JOBS="$2"; shift ;;
        -u|--user) SSH_USER="$2"; shift ;;
        --copy-parallel) PARALLEL_COPY=true ;;
-        --pre-tf|--pre-transformers) PRE_TRANSFORMERS=true ;;
+        --tf5|--pre-tf|--pre-transformers) PRE_TRANSFORMERS=true ;;
        --exp-mxfp4|--experimental-mxfp4) EXP_MXFP4=true ;;
        --apply-vllm-pr)
            if [ -n "$2" ] && [[ "$2" != -* ]]; then
@@ -213,7 +225,7 @@ fi

 if [ "$EXP_MXFP4" = true ]; then
    if [ "$VLLM_REF_SET" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --vllm-ref"; exit 1; fi
-    if [ "$PRE_TRANSFORMERS" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --pre-transformers"; exit 1; fi
+    if [ "$PRE_TRANSFORMERS" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --tf5"; exit 1; fi
    if [ "$REBUILD_FLASHINFER" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --rebuild-flashinfer"; exit 1; fi
    if [ "$REBUILD_VLLM" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --rebuild-vllm"; exit 1; fi
 fi
@@ -315,9 +327,21 @@ if [ "$NO_BUILD" = false ]; then
            VLLM_WHEELS_EXIST=true
        fi

+        if [ "$VLLM_REF_SET" = true ] || [ -n "$VLLM_PRS" ]; then
+            REBUILD_VLLM=true
+        fi
+
        if [ "$REBUILD_VLLM" = true ] || [ "$VLLM_WHEELS_EXIST" = false ]; then
            if [ "$REBUILD_VLLM" = true ]; then
+                if [ "$VLLM_REF_SET" = true ] && [ -n "$VLLM_PRS" ]; then
+                    echo "Rebuilding vLLM wheels (--vllm-ref and --apply-vllm-pr specified)..."
+                elif [ "$VLLM_REF_SET" = true ]; then
+                    echo "Rebuilding vLLM wheels (--vllm-ref specified)..."
+                elif [ -n "$VLLM_PRS" ]; then
+                    echo "Rebuilding vLLM wheels (--apply-vllm-pr specified)..."
+                else
                    echo "Rebuilding vLLM wheels (--rebuild-vllm specified)..."
+                fi
            else
                echo "No vLLM wheels found in ./wheels/ — building..."
            fi
@@ -344,10 +368,6 @@ if [ "$NO_BUILD" = false ]; then
                VLLM_CMD+=("--build-arg" "VLLM_PRS=$VLLM_PRS")
            fi

-            if [ "$PRE_TRANSFORMERS" = true ]; then
-                echo "Using transformers>=5.0.0..."
-                VLLM_CMD+=("--build-arg" "PRE_TRANSFORMERS=1")
-            fi

            VLLM_CMD+=(".")

@@ -380,6 +400,7 @@ if [ "$NO_BUILD" = false ]; then
            "${COMMON_BUILD_FLAGS[@]}")

        if [ "$PRE_TRANSFORMERS" = true ]; then
+            echo "Using transformers>=5.0.0..."
            RUNNER_CMD+=("--build-arg" "PRE_TRANSFORMERS=1")
        fi