Applied new fastsafetensors fix to mxfp4 build; disabled wheel builds by default

Eugene Rakhmatulin
2026-02-09 23:47:06 -08:00
parent 74876dd442
commit ace16f3a8f
7 changed files with 71 additions and 35 deletions

View File

@@ -215,9 +215,13 @@ RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
     uv pip install -r requirements/build.txt

 # Apply Patches
-# TEMPORARY PATCH for fastsafetensors loading in cluster setup - tracking https://github.com/foundation-model-stack/fastsafetensors/issues/36
+# TEMPORARY PATCH for fastsafetensors loading in cluster setup - tracking https://github.com/vllm-project/vllm/issues/34180
 # COPY fastsafetensors.patch .
-# RUN patch -p1 < fastsafetensors.patch
+# RUN if patch -p1 --dry-run --reverse < fastsafetensors.patch &>/dev/null; then \
+#         echo "PR #34180 is already applied"; \
+#     else \
+#         patch -p1 < fastsafetensors.patch; \
+#     fi

 # Final Compilation
 # We mount the ccache directory here. Ideally, map this to a host volume for persistence
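The commented-out `RUN` uses a standard idempotency check: applying the patch in reverse with `--dry-run` succeeds only if the patch is already present, so the forward apply runs at most once even across cached rebuilds. A minimal standalone sketch of the same pattern, assuming the same `fastsafetensors.patch` sits in the build context:

```bash
#!/usr/bin/env bash
set -euo pipefail

# Patch file name taken from the Dockerfile above; adjust -p1 to your tree layout.
PATCH_FILE=fastsafetensors.patch

# If the patch applies cleanly in reverse, it is already in the tree.
if patch -p1 --dry-run --reverse < "$PATCH_FILE" &>/dev/null; then
    echo "patch already applied; skipping"
else
    patch -p1 < "$PATCH_FILE"
fi
```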

View File

@@ -211,9 +211,9 @@ RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
     uv pip install -r requirements/build.txt

 # Apply Patches
-# TEMPORARY PATCH for fastsafetensors loading in cluster setup - tracking https://github.com/foundation-model-stack/fastsafetensors/issues/36
-#COPY fastsafetensors.patch .
-#RUN patch -p1 < fastsafetensors.patch
+# TEMPORARY PATCH for fastsafetensors loading in cluster setup - tracking https://github.com/vllm-project/vllm/issues/34180
+COPY fastsafetensors_mxfp4.patch .
+RUN patch -p1 < fastsafetensors_mxfp4.patch

 # Final Compilation
 # We mount the ccache directory here. Ideally, map this to a host volume for persistence

View File

@@ -40,7 +40,7 @@ RUN mkdir -p tiktoken_encodings && \
     wget -O tiktoken_encodings/o200k_base.tiktoken "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken" && \
    wget -O tiktoken_encodings/cl100k_base.tiktoken "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken"

-# Cache TEMPORARY PATCH for fastsafetensors loading in cluster setup - tracking https://github.com/foundation-model-stack/fastsafetensors/issues/36
+# Cache TEMPORARY PATCH for fastsafetensors loading in cluster setup - tracking https://github.com/vllm-project/vllm/issues/34180
 # COPY fastsafetensors.patch .

 # Install fastsafetensors
@@ -66,9 +66,13 @@ RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
         --extra-index-url https://wheels.vllm.ai/nightly/cu130; \
     fi

-# Apply TEMPORARY PATCH for fastsafetensors loading in cluster setup - tracking https://github.com/foundation-model-stack/fastsafetensors/issues/36
+# Apply TEMPORARY PATCH for fastsafetensors loading in cluster setup - tracking https://github.com/vllm-project/vllm/issues/34180
 # Apply in site-packages
-# RUN patch -p1 -d /usr/local/lib/python3.12/dist-packages < ${VLLM_BASE_DIR}/fastsafetensors.patch
+# RUN if patch -p1 --dry-run --reverse -d /usr/local/lib/python3.12/dist-packages < ${VLLM_BASE_DIR}/fastsafetensors.patch &>/dev/null; then \
+#         echo "PR #34180 is already applied"; \
+#     else \
+#         patch -p1 -d /usr/local/lib/python3.12/dist-packages < ${VLLM_BASE_DIR}/fastsafetensors.patch; \
+#     fi

 ARG FLASHINFER_PRE=""

View File

@@ -43,7 +43,10 @@ Build the container.

 **ATTENTION!**
-If you are getting the following error (or similar), you need to build the image from the source instead of using pre-built wheels. To do it, just remove `--use-wheels` parameter from the build command:
+As of February 9th, 2026, the wheels build is no longer the recommended way to build the container, because it lacks optimizations that are present in the source build.
+
+If you still want to use the wheels build, see the note below:
+If you are getting the following error (or similar) when building from wheels, build the image from source instead of using pre-built wheels. To do that, remove the `--use-wheels` parameter from the build command:

 ```
 0.181 Using Python 3.12.3 environment at: /usr
@@ -61,7 +64,7 @@ This error happens if vLLM nightly build fails for aarch64 platform, but succeed

 **If you have only one DGX Spark:**
 ```bash
-./build-and-copy.sh --use-wheels
+./build-and-copy.sh
 ```

 **On DGX Spark cluster:**
@@ -72,9 +75,11 @@ You can also check out our new [Networking Guide](docs/NETWORKING.md).

 Then run the following command that will build and distribute image across the cluster.
 ```bash
-./build-and-copy.sh --use-wheels -c
+./build-and-copy.sh -c
 ```
+
+An initial build takes around 30 minutes; subsequent builds are faster. You can also use precompiled wheels, which significantly speed up the build, but the source build is recommended because it uses components compiled specifically for DGX Spark.

 ### Run

 **On a single node**:
@@ -161,7 +166,7 @@ For periodic maintenance, I recommend using a filter: `docker builder prune --fi

 ### 2026-02-09

-- Migrated to a new base image with PyTorch 2.10 compiled with Spark support.
+- Migrated to a new base image with PyTorch 2.10 compiled with Spark support. With this change, the wheels build is no longer recommended; please use a source build instead.
 - Triton 3.6.0 is now default.
 - Removed temporary fastsafetensors patch, as proper fix is now merged into vLLM main branch.
@@ -294,11 +299,9 @@ See (this post on NVIDIA forums)[https://forums.developer.nvidia.com/t/make-glm-

 To use the mod, first build the container with Transformers 5 support (`--pre-tf`) flag, e.g.:
 ```bash
-./build-and-copy.sh -t vllm-node-tf5 --use-wheels --pre-tf -c
+./build-and-copy.sh -t vllm-node-tf5 --pre-tf -c
 ```
-
-Drop `--use-wheels` if you experience an error during build (see the annoucement in the Quick Start section).

 Then, to run on a single node:
 ```bash

View File

@@ -175,6 +175,14 @@ if [ "$EXP_MXFP4" = true ]; then
     if [ "$PRE_TRANSFORMERS" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --pre-transformers"; exit 1; fi
 fi

+if [ -n "$USE_WHEELS_MODE" ]; then
+    read -p "!!! Wheels build is known not to work properly with all models after migration to Torch 2.10! Full build is recommended. Do you want to continue (y/N)? " choice
+    case "$choice" in
+        y|Y ) echo "Continuing...";;
+        * ) echo "Aborting."; exit 1;;
+    esac
+fi
+
 # Validate --no-build usage
 if [ "$NO_BUILD" = true ] && [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
     echo "Error: --no-build requires --copy-to to be specified"

View File

@@ -1,28 +1,12 @@
 diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
-index 0809bdfa9..a7878f44f 100644
+index d43656c4f382..7025efd1c2de 100644
 --- a/vllm/model_executor/model_loader/weight_utils.py
 +++ b/vllm/model_executor/model_loader/weight_utils.py
-@@ -28,6 +28,7 @@ from vllm import envs
- from vllm.config import ModelConfig
- from vllm.config.load import LoadConfig
- from vllm.distributed import get_tensor_model_parallel_rank
-+from vllm.distributed.parallel_state import get_world_group
- from vllm.logger import init_logger
- from vllm.model_executor.layers.quantization import (
-     QuantizationConfig,
-@@ -770,11 +771,13 @@ def fastsafetensors_weights_iterator(
-     """Iterate over the weights in the model safetensor files
-     using fastsafetensor library."""
-     if torch.distributed.is_initialized():
--        pg = torch.distributed.group.WORLD
-+        world = get_world_group()
-+        pg = world.device_group
-+        device = world.device
-     else:
+@@ -826,6 +826,7 @@ def fastsafetensors_weights_iterator(
          pg = SingleGroup()
-+        device = torch.device(f"cuda:{pg.rank()}")
--    device = torch.device(f"cuda:{pg.rank()}")
+         device = torch.device(f"cuda:{current_platform.current_device()}")
++    hf_weights_files = sorted(hf_weights_files, key=_natural_sort_key)
      weight_files_sub_lists = [
          hf_weights_files[i : i + pg.size()]
          for i in range(0, len(hf_weights_files), pg.size())

View File

@@ -0,0 +1,33 @@
--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -8,6 +8,7 @@
 import hashlib
 import json
 import os
+import re
 import tempfile
 import time
 from collections import defaultdict
@@ -786,6 +786,14 @@
     loader.add_filenames(rank_file_map)
     return loader

+def _natural_sort_key(filepath: str) -> list:
+    """Natural sort key for filenames with numeric components, such as
+    model-00001-of-00005.safetensors -> ['model-', 1, '-of-', 5, '.safetensors']"""
+    return [
+        int(s) if s.isdigit() else s
+        for s in re.split(r"(\d+)", os.path.basename(filepath))
+    ]
+
 def fastsafetensors_weights_iterator(
     hf_weights_files: list[str],
@@ -801,6 +809,7 @@
         pg = SingleGroup()
         device = torch.device(f"cuda:{pg.rank()}")

+    hf_weights_files = sorted(hf_weights_files, key=_natural_sort_key)
     weight_files_sub_lists = [
         hf_weights_files[i : i + pg.size()]
         for i in range(0, len(hf_weights_files), pg.size())
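The sort matters because the iterator then slices `hf_weights_files` into per-rank sub-lists in list order, and plain lexicographic ordering mis-sorts unpadded shard numbers. A quick shell illustration of the same idea (file names are made up; GNU `sort -V` is used as a stand-in for what `_natural_sort_key` does in Python):

```bash
# Lexicographic order compares digit runs as strings, so "10" sorts before "2".
printf '%s\n' model-2-of-10.safetensors model-10-of-10.safetensors | sort
#   model-10-of-10.safetensors
#   model-2-of-10.safetensors

# Version sort (-V) compares numeric runs as numbers, like _natural_sort_key.
printf '%s\n' model-2-of-10.safetensors model-10-of-10.safetensors | sort -V
#   model-2-of-10.safetensors
#   model-10-of-10.safetensors
```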