Applied new fastsafetensors fix to mxfp4 build; disabled wheel builds by default
This commit is contained in:
@@ -215,9 +215,13 @@ RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
|
|||||||
uv pip install -r requirements/build.txt
|
uv pip install -r requirements/build.txt
|
||||||
|
|
||||||
# Apply Patches
|
# Apply Patches
|
||||||
# TEMPORARY PATCH for fastsafetensors loading in cluster setup - tracking https://github.com/foundation-model-stack/fastsafetensors/issues/36
|
# TEMPORARY PATCH for fastsafetensors loading in cluster setup - tracking https://github.com/vllm-project/vllm/issues/34180
|
||||||
# COPY fastsafetensors.patch .
|
# COPY fastsafetensors.patch .
|
||||||
# RUN patch -p1 < fastsafetensors.patch
|
# RUN if patch -p1 --dry-run --reverse < fastsafetensors.patch &>/dev/null; then \
|
||||||
|
# echo "PR #34180 is already applied"; \
|
||||||
|
# else \
|
||||||
|
# patch -p1 < fastsafetensors.patch; \
|
||||||
|
# fi
|
||||||
|
|
||||||
# Final Compilation
|
# Final Compilation
|
||||||
# We mount the ccache directory here. Ideally, map this to a host volume for persistence
|
# We mount the ccache directory here. Ideally, map this to a host volume for persistence
|
||||||
|
|||||||
@@ -211,9 +211,9 @@ RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
|
|||||||
uv pip install -r requirements/build.txt
|
uv pip install -r requirements/build.txt
|
||||||
|
|
||||||
# Apply Patches
|
# Apply Patches
|
||||||
# TEMPORARY PATCH for fastsafetensors loading in cluster setup - tracking https://github.com/foundation-model-stack/fastsafetensors/issues/36
|
# TEMPORARY PATCH for fastsafetensors loading in cluster setup - tracking https://github.com/vllm-project/vllm/issues/34180
|
||||||
#COPY fastsafetensors.patch .
|
COPY fastsafetensors_mxfp4.patch .
|
||||||
#RUN patch -p1 < fastsafetensors.patch
|
RUN patch -p1 < fastsafetensors_mxfp4.patch
|
||||||
|
|
||||||
# Final Compilation
|
# Final Compilation
|
||||||
# We mount the ccache directory here. Ideally, map this to a host volume for persistence
|
# We mount the ccache directory here. Ideally, map this to a host volume for persistence
|
||||||
|
|||||||
@@ -40,7 +40,7 @@ RUN mkdir -p tiktoken_encodings && \
|
|||||||
wget -O tiktoken_encodings/o200k_base.tiktoken "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken" && \
|
wget -O tiktoken_encodings/o200k_base.tiktoken "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken" && \
|
||||||
wget -O tiktoken_encodings/cl100k_base.tiktoken "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken"
|
wget -O tiktoken_encodings/cl100k_base.tiktoken "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken"
|
||||||
|
|
||||||
# Cache TEMPORARY PATCH for fastsafetensors loading in cluster setup - tracking https://github.com/foundation-model-stack/fastsafetensors/issues/36
|
# Cache TEMPORARY PATCH for fastsafetensors loading in cluster setup - tracking https://github.com/vllm-project/vllm/issues/34180
|
||||||
# COPY fastsafetensors.patch .
|
# COPY fastsafetensors.patch .
|
||||||
|
|
||||||
# Install fastsafetensors
|
# Install fastsafetensors
|
||||||
@@ -66,9 +66,13 @@ RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
|
|||||||
--extra-index-url https://wheels.vllm.ai/nightly/cu130; \
|
--extra-index-url https://wheels.vllm.ai/nightly/cu130; \
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Apply TEMPORARY PATCH for fastsafetensors loading in cluster setup - tracking https://github.com/foundation-model-stack/fastsafetensors/issues/36
|
# Apply TEMPORARY PATCH for fastsafetensors loading in cluster setup - tracking https://github.com/vllm-project/vllm/issues/34180
|
||||||
# Apply in site-packages
|
# Apply in site-packages
|
||||||
# RUN patch -p1 -d /usr/local/lib/python3.12/dist-packages < ${VLLM_BASE_DIR}/fastsafetensors.patch
|
# RUN if patch -p1 --dry-run --reverse -d /usr/local/lib/python3.12/dist-packages < ${VLLM_BASE_DIR}/fastsafetensors.patch &>/dev/null; then \
|
||||||
|
# echo "PR #34180 is already applied"; \
|
||||||
|
# else \
|
||||||
|
# patch -p1 -d /usr/local/lib/python3.12/dist-packages < ${VLLM_BASE_DIR}/fastsafetensors.patch; \
|
||||||
|
# fi
|
||||||
|
|
||||||
ARG FLASHINFER_PRE=""
|
ARG FLASHINFER_PRE=""
|
||||||
|
|
||||||
|
|||||||
17
README.md
17
README.md
@@ -43,7 +43,10 @@ Build the container.
|
|||||||
|
|
||||||
**ATTENTION!**
|
**ATTENTION!**
|
||||||
|
|
||||||
If you are getting the following error (or similar), you need to build the image from the source instead of using pre-built wheels. To do it, just remove `--use-wheels` parameter from the build command:
|
As of February 9th, 2026, the wheels build is no longer the recommended way to build the container, because it lacks optimizations present in the source build.
|
||||||
|
If you still want to use the wheels build, please see the note below:
|
||||||
|
|
||||||
|
If you are getting the following error (or similar) when building from wheels, you need to build the image from source instead of using pre-built wheels. To do so, just remove the `--use-wheels` parameter from the build command:
|
||||||
|
|
||||||
```
|
```
|
||||||
0.181 Using Python 3.12.3 environment at: /usr
|
0.181 Using Python 3.12.3 environment at: /usr
|
||||||
@@ -61,7 +64,7 @@ This error happens if vLLM nightly build fails for aarch64 platform, but succeed
|
|||||||
**If you have only one DGX Spark:**
|
**If you have only one DGX Spark:**
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
./build-and-copy.sh --use-wheels
|
./build-and-copy.sh
|
||||||
```
|
```
|
||||||
|
|
||||||
**On DGX Spark cluster:**
|
**On DGX Spark cluster:**
|
||||||
@@ -72,9 +75,11 @@ You can also check out our new [Networking Guide](docs/NETWORKING.md).
|
|||||||
Then run the following command that will build and distribute image across the cluster.
|
Then run the following command that will build and distribute image across the cluster.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
./build-and-copy.sh --use-wheels -c
|
./build-and-copy.sh -c
|
||||||
```
|
```
|
||||||
|
|
||||||
|
An initial build will take around 30 minutes, but subsequent builds will be faster. You can also use precompiled wheels, which significantly speed up the build, but the source build is recommended because it uses components specifically compiled for DGX Spark.
|
||||||
|
|
||||||
### Run
|
### Run
|
||||||
|
|
||||||
**On a single node**:
|
**On a single node**:
|
||||||
@@ -161,7 +166,7 @@ For periodic maintenance, I recommend using a filter: `docker builder prune --fi
|
|||||||
|
|
||||||
### 2026-02-09
|
### 2026-02-09
|
||||||
|
|
||||||
- Migrated to a new base image with PyTorch 2.10 compiled with Spark support.
|
- Migrated to a new base image with PyTorch 2.10 compiled with Spark support. With this change, the wheels build is no longer recommended - please use a source build instead.
|
||||||
- Triton 3.6.0 is now default.
|
- Triton 3.6.0 is now default.
|
||||||
- Removed temporary fastsafetensors patch, as proper fix is now merged into vLLM main branch.
|
- Removed temporary fastsafetensors patch, as proper fix is now merged into vLLM main branch.
|
||||||
|
|
||||||
@@ -294,11 +299,9 @@ See (this post on NVIDIA forums)[https://forums.developer.nvidia.com/t/make-glm-
|
|||||||
To use the mod, first build the container with Transformers 5 support (`--pre-tf`) flag, e.g.:
|
To use the mod, first build the container with Transformers 5 support (`--pre-tf`) flag, e.g.:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
./build-and-copy.sh -t vllm-node-tf5 --use-wheels --pre-tf -c
|
./build-and-copy.sh -t vllm-node-tf5 --pre-tf -c
|
||||||
```
|
```
|
||||||
|
|
||||||
Drop `--use-wheels` if you experience an error during build (see the announcement in the Quick Start section).
|
|
||||||
|
|
||||||
Then, to run on a single node:
|
Then, to run on a single node:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
|
|||||||
@@ -175,6 +175,14 @@ if [ "$EXP_MXFP4" = true ]; then
|
|||||||
if [ "$PRE_TRANSFORMERS" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --pre-transformers"; exit 1; fi
|
if [ "$PRE_TRANSFORMERS" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --pre-transformers"; exit 1; fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
if [ -n "$USE_WHEELS_MODE" ]; then
|
||||||
|
read -p "!!! Wheels build is known not to work properly with all models after migration to Torch 2.10! Full build is recommended. Do you want to continue (y/N)? " choice
|
||||||
|
case "$choice" in
|
||||||
|
y|Y ) echo "Continuing...";;
|
||||||
|
* ) echo "Aborting."; exit 1;;
|
||||||
|
esac
|
||||||
|
fi
|
||||||
|
|
||||||
# Validate --no-build usage
|
# Validate --no-build usage
|
||||||
if [ "$NO_BUILD" = true ] && [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
|
if [ "$NO_BUILD" = true ] && [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
|
||||||
echo "Error: --no-build requires --copy-to to be specified"
|
echo "Error: --no-build requires --copy-to to be specified"
|
||||||
|
|||||||
@@ -1,28 +1,12 @@
|
|||||||
diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
|
diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
|
||||||
index 0809bdfa9..a7878f44f 100644
|
index d43656c4f382..7025efd1c2de 100644
|
||||||
--- a/vllm/model_executor/model_loader/weight_utils.py
|
--- a/vllm/model_executor/model_loader/weight_utils.py
|
||||||
+++ b/vllm/model_executor/model_loader/weight_utils.py
|
+++ b/vllm/model_executor/model_loader/weight_utils.py
|
||||||
@@ -28,6 +28,7 @@ from vllm import envs
|
@@ -826,6 +826,7 @@ def fastsafetensors_weights_iterator(
|
||||||
from vllm.config import ModelConfig
|
|
||||||
from vllm.config.load import LoadConfig
|
|
||||||
from vllm.distributed import get_tensor_model_parallel_rank
|
|
||||||
+from vllm.distributed.parallel_state import get_world_group
|
|
||||||
from vllm.logger import init_logger
|
|
||||||
from vllm.model_executor.layers.quantization import (
|
|
||||||
QuantizationConfig,
|
|
||||||
@@ -770,11 +771,13 @@ def fastsafetensors_weights_iterator(
|
|
||||||
"""Iterate over the weights in the model safetensor files
|
|
||||||
using fastsafetensor library."""
|
|
||||||
if torch.distributed.is_initialized():
|
|
||||||
- pg = torch.distributed.group.WORLD
|
|
||||||
+ world = get_world_group()
|
|
||||||
+ pg = world.device_group
|
|
||||||
+ device = world.device
|
|
||||||
else:
|
|
||||||
pg = SingleGroup()
|
pg = SingleGroup()
|
||||||
+ device = torch.device(f"cuda:{pg.rank()}")
|
|
||||||
|
|
||||||
- device = torch.device(f"cuda:{pg.rank()}")
|
device = torch.device(f"cuda:{current_platform.current_device()}")
|
||||||
|
+ hf_weights_files = sorted(hf_weights_files, key=_natural_sort_key)
|
||||||
weight_files_sub_lists = [
|
weight_files_sub_lists = [
|
||||||
hf_weights_files[i : i + pg.size()]
|
hf_weights_files[i : i + pg.size()]
|
||||||
for i in range(0, len(hf_weights_files), pg.size())
|
for i in range(0, len(hf_weights_files), pg.size())
|
||||||
|
|||||||
33
fastsafetensors_mxfp4.patch
Normal file
33
fastsafetensors_mxfp4.patch
Normal file
@@ -0,0 +1,33 @@
|
|||||||
|
--- a/vllm/model_executor/model_loader/weight_utils.py
|
||||||
|
+++ b/vllm/model_executor/model_loader/weight_utils.py
|
||||||
|
@@ -8,6 +8,7 @@
|
||||||
|
import hashlib
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
+import re
|
||||||
|
import tempfile
|
||||||
|
import time
|
||||||
|
from collections import defaultdict
|
||||||
|
@@ -786,6 +786,14 @@
|
||||||
|
loader.add_filenames(rank_file_map)
|
||||||
|
return loader
|
||||||
|
|
||||||
|
+def _natural_sort_key(filepath: str) -> list:
|
||||||
|
+ """Natural sort key for filenames with numeric components, such as
|
||||||
|
+ model-00001-of-00005.safetensors -> ['model-', 1, '-of-', 5, '.safetensors']"""
|
||||||
|
+ return [
|
||||||
|
+ int(s) if s.isdigit() else s
|
||||||
|
+ for s in re.split(r"(\d+)", os.path.basename(filepath))
|
||||||
|
+ ]
|
||||||
|
+
|
||||||
|
|
||||||
|
def fastsafetensors_weights_iterator(
|
||||||
|
hf_weights_files: list[str],
|
||||||
|
@@ -801,6 +809,7 @@
|
||||||
|
pg = SingleGroup()
|
||||||
|
device = torch.device(f"cuda:{pg.rank()}")
|
||||||
|
|
||||||
|
+ hf_weights_files = sorted(hf_weights_files, key=_natural_sort_key)
|
||||||
|
weight_files_sub_lists = [
|
||||||
|
hf_weights_files[i : i + pg.size()]
|
||||||
|
for i in range(0, len(hf_weights_files), pg.size())
|
||||||
Reference in New Issue
Block a user