Applied new fastsafetensors fix to mxfp4 build; disabled wheel builds by default

Eugene Rakhmatulin
2026-02-09 23:47:06 -08:00
parent 74876dd442
commit ace16f3a8f
7 changed files with 71 additions and 35 deletions

View File

@@ -215,9 +215,13 @@ RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
     uv pip install -r requirements/build.txt

 # Apply Patches
-# TEMPORARY PATCH for fastsafetensors loading in cluster setup - tracking https://github.com/foundation-model-stack/fastsafetensors/issues/36
+# TEMPORARY PATCH for fastsafetensors loading in cluster setup - tracking https://github.com/vllm-project/vllm/issues/34180
 # COPY fastsafetensors.patch .
-# RUN patch -p1 < fastsafetensors.patch
+# RUN if patch -p1 --dry-run --reverse < fastsafetensors.patch &>/dev/null; then \
+#         echo "PR #34180 is already applied"; \
+#     else \
+#         patch -p1 < fastsafetensors.patch; \
+#     fi

 # Final Compilation
 # We mount the ccache directory here. Ideally, map this to a host volume for persistence
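The commented-out `RUN` uses a standard idempotency check: applying the patch in reverse with `--dry-run` succeeds only if the patch is already present, so the forward apply runs at most once even across cached rebuilds. A minimal standalone sketch of the same pattern, assuming the same `fastsafetensors.patch` sits in the build context:

```bash
#!/usr/bin/env bash
set -euo pipefail

# Patch file name taken from the Dockerfile above; adjust -p1 to your tree layout.
PATCH_FILE=fastsafetensors.patch

# If the patch applies cleanly in reverse, it is already in the tree.
if patch -p1 --dry-run --reverse < "$PATCH_FILE" &>/dev/null; then
    echo "patch already applied; skipping"
else
    patch -p1 < "$PATCH_FILE"
fi
```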

View File

@@ -211,9 +211,9 @@ RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
     uv pip install -r requirements/build.txt

 # Apply Patches
-# TEMPORARY PATCH for fastsafetensors loading in cluster setup - tracking https://github.com/foundation-model-stack/fastsafetensors/issues/36
-#COPY fastsafetensors.patch .
-#RUN patch -p1 < fastsafetensors.patch
+# TEMPORARY PATCH for fastsafetensors loading in cluster setup - tracking https://github.com/vllm-project/vllm/issues/34180
+COPY fastsafetensors_mxfp4.patch .
+RUN patch -p1 < fastsafetensors_mxfp4.patch

 # Final Compilation
 # We mount the ccache directory here. Ideally, map this to a host volume for persistence

View File

@@ -40,7 +40,7 @@ RUN mkdir -p tiktoken_encodings && \
     wget -O tiktoken_encodings/o200k_base.tiktoken "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken" && \
    wget -O tiktoken_encodings/cl100k_base.tiktoken "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken"

-# Cache TEMPORARY PATCH for fastsafetensors loading in cluster setup - tracking https://github.com/foundation-model-stack/fastsafetensors/issues/36
+# Cache TEMPORARY PATCH for fastsafetensors loading in cluster setup - tracking https://github.com/vllm-project/vllm/issues/34180
 # COPY fastsafetensors.patch .

 # Install fastsafetensors
@@ -66,9 +66,13 @@ RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
         --extra-index-url https://wheels.vllm.ai/nightly/cu130; \
     fi

-# Apply TEMPORARY PATCH for fastsafetensors loading in cluster setup - tracking https://github.com/foundation-model-stack/fastsafetensors/issues/36
+# Apply TEMPORARY PATCH for fastsafetensors loading in cluster setup - tracking https://github.com/vllm-project/vllm/issues/34180
 # Apply in site-packages
-# RUN patch -p1 -d /usr/local/lib/python3.12/dist-packages < ${VLLM_BASE_DIR}/fastsafetensors.patch
+# RUN if patch -p1 --dry-run --reverse -d /usr/local/lib/python3.12/dist-packages < ${VLLM_BASE_DIR}/fastsafetensors.patch &>/dev/null; then \
+#         echo "PR #34180 is already applied"; \
+#     else \
+#         patch -p1 -d /usr/local/lib/python3.12/dist-packages < ${VLLM_BASE_DIR}/fastsafetensors.patch; \
+#     fi

 ARG FLASHINFER_PRE=""

View File

@@ -43,7 +43,10 @@ Build the container.

 **ATTENTION!**
-If you are getting the following error (or similar), you need to build the image from the source instead of using pre-built wheels. To do it, just remove `--use-wheels` parameter from the build command:
+As of February 9th, 2026, the wheels build is no longer the recommended way to build the container, because it lacks optimizations that are present in the source build.
+
+If you still want to use the wheels build, see the note below:
+If you are getting the following error (or similar) when building from wheels, build the image from source instead of using pre-built wheels. To do that, remove the `--use-wheels` parameter from the build command:

 ```
 0.181 Using Python 3.12.3 environment at: /usr
@@ -61,7 +64,7 @@ This error happens if vLLM nightly build fails for aarch64 platform, but succeed

 **If you have only one DGX Spark:**
 ```bash
-./build-and-copy.sh --use-wheels
+./build-and-copy.sh
 ```

 **On DGX Spark cluster:**
@@ -72,9 +75,11 @@ You can also check out our new [Networking Guide](docs/NETWORKING.md).

 Then run the following command that will build and distribute image across the cluster.
 ```bash
-./build-and-copy.sh --use-wheels -c
+./build-and-copy.sh -c
 ```
+
+An initial build takes around 30 minutes; subsequent builds are faster. You can also use precompiled wheels, which significantly speed up the build, but the source build is recommended because it uses components compiled specifically for DGX Spark.

 ### Run

 **On a single node**:
@@ -161,7 +166,7 @@ For periodic maintenance, I recommend using a filter: `docker builder prune --fi

 ### 2026-02-09

-- Migrated to a new base image with PyTorch 2.10 compiled with Spark support.
+- Migrated to a new base image with PyTorch 2.10 compiled with Spark support. With this change, the wheels build is no longer recommended; please use a source build instead.
 - Triton 3.6.0 is now default.
 - Removed temporary fastsafetensors patch, as proper fix is now merged into vLLM main branch.
@@ -294,11 +299,9 @@ See (this post on NVIDIA forums)[https://forums.developer.nvidia.com/t/make-glm-

 To use the mod, first build the container with Transformers 5 support (`--pre-tf`) flag, e.g.:
 ```bash
-./build-and-copy.sh -t vllm-node-tf5 --use-wheels --pre-tf -c
+./build-and-copy.sh -t vllm-node-tf5 --pre-tf -c
 ```
-
-Drop `--use-wheels` if you experience an error during build (see the annoucement in the Quick Start section).

 Then, to run on a single node:
 ```bash

View File

@@ -175,6 +175,14 @@ if [ "$EXP_MXFP4" = true ]; then
     if [ "$PRE_TRANSFORMERS" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --pre-transformers"; exit 1; fi
 fi

+if [ -n "$USE_WHEELS_MODE" ]; then
+    read -p "!!! Wheels build is known not to work properly with all models after migration to Torch 2.10! Full build is recommended. Do you want to continue (y/N)? " choice
+    case "$choice" in
+        y|Y ) echo "Continuing...";;
+        * ) echo "Aborting."; exit 1;;
+    esac
+fi
+
 # Validate --no-build usage
 if [ "$NO_BUILD" = true ] && [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
     echo "Error: --no-build requires --copy-to to be specified"

View File

@@ -1,28 +1,12 @@
 diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
-index 0809bdfa9..a7878f44f 100644
+index d43656c4f382..7025efd1c2de 100644
 --- a/vllm/model_executor/model_loader/weight_utils.py
 +++ b/vllm/model_executor/model_loader/weight_utils.py
-@@ -28,6 +28,7 @@ from vllm import envs
- from vllm.config import ModelConfig
- from vllm.config.load import LoadConfig
- from vllm.distributed import get_tensor_model_parallel_rank
-+from vllm.distributed.parallel_state import get_world_group
- from vllm.logger import init_logger
- from vllm.model_executor.layers.quantization import (
-     QuantizationConfig,
-@@ -770,11 +771,13 @@ def fastsafetensors_weights_iterator(
-     """Iterate over the weights in the model safetensor files
-     using fastsafetensor library."""
-     if torch.distributed.is_initialized():
--        pg = torch.distributed.group.WORLD
-+        world = get_world_group()
-+        pg = world.device_group
-+        device = world.device
-     else:
+@@ -826,6 +826,7 @@ def fastsafetensors_weights_iterator(
          pg = SingleGroup()
-+        device = torch.device(f"cuda:{pg.rank()}")
--    device = torch.device(f"cuda:{pg.rank()}")
+         device = torch.device(f"cuda:{current_platform.current_device()}")
++    hf_weights_files = sorted(hf_weights_files, key=_natural_sort_key)
      weight_files_sub_lists = [
          hf_weights_files[i : i + pg.size()]
          for i in range(0, len(hf_weights_files), pg.size())

View File

@@ -0,0 +1,33 @@
--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -8,6 +8,7 @@
 import hashlib
 import json
 import os
+import re
 import tempfile
 import time
 from collections import defaultdict
@@ -786,6 +786,14 @@
     loader.add_filenames(rank_file_map)
     return loader

+def _natural_sort_key(filepath: str) -> list:
+    """Natural sort key for filenames with numeric components, such as
+    model-00001-of-00005.safetensors -> ['model-', 1, '-of-', 5, '.safetensors']"""
+    return [
+        int(s) if s.isdigit() else s
+        for s in re.split(r"(\d+)", os.path.basename(filepath))
+    ]
+
 def fastsafetensors_weights_iterator(
     hf_weights_files: list[str],
@@ -801,6 +809,7 @@
         pg = SingleGroup()
         device = torch.device(f"cuda:{pg.rank()}")

+    hf_weights_files = sorted(hf_weights_files, key=_natural_sort_key)
     weight_files_sub_lists = [
         hf_weights_files[i : i + pg.size()]
         for i in range(0, len(hf_weights_files), pg.size())
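The sort matters because the iterator then slices `hf_weights_files` into per-rank sub-lists in list order, and plain lexicographic ordering mis-sorts unpadded shard numbers. A quick shell illustration of the same idea (file names are made up; GNU `sort -V` is used as a stand-in for what `_natural_sort_key` does in Python):

```bash
# Lexicographic order compares digit runs as strings, so "10" sorts before "2".
printf '%s\n' model-2-of-10.safetensors model-10-of-10.safetensors | sort
#   model-10-of-10.safetensors
#   model-2-of-10.safetensors

# Version sort (-V) compares numeric runs as numbers, like _natural_sort_key.
printf '%s\n' model-2-of-10.safetensors model-10-of-10.safetensors | sort -V
#   model-2-of-10.safetensors
#   model-10-of-10.safetensors
```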