diff --git a/Dockerfile b/Dockerfile
index 935a48e..27e5bdc 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -215,9 +215,13 @@ RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
     uv pip install -r requirements/build.txt
 
 # Apply Patches
-# TEMPORARY PATCH for fastsafetensors loading in cluster setup - tracking https://github.com/foundation-model-stack/fastsafetensors/issues/36
+# TEMPORARY PATCH for fastsafetensors loading in cluster setup - tracking https://github.com/vllm-project/vllm/issues/34180
 # COPY fastsafetensors.patch .
-# RUN patch -p1 < fastsafetensors.patch
+# RUN if patch -p1 --dry-run --reverse < fastsafetensors.patch &>/dev/null; then \
+#         echo "PR #34180 is already applied"; \
+#     else \
+#         patch -p1 < fastsafetensors.patch; \
+#     fi
 
 # Final Compilation
 # We mount the ccache directory here. Ideally, map this to a host volume for persistence
diff --git a/Dockerfile.mxfp4 b/Dockerfile.mxfp4
index 6863a98..ed7d6d9 100644
--- a/Dockerfile.mxfp4
+++ b/Dockerfile.mxfp4
@@ -211,9 +211,9 @@ RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
     uv pip install -r requirements/build.txt
 
 # Apply Patches
-# TEMPORARY PATCH for fastsafetensors loading in cluster setup - tracking https://github.com/foundation-model-stack/fastsafetensors/issues/36
-#COPY fastsafetensors.patch .
-#RUN patch -p1 < fastsafetensors.patch
+# TEMPORARY PATCH for fastsafetensors loading in cluster setup - tracking https://github.com/vllm-project/vllm/issues/34180
+COPY fastsafetensors_mxfp4.patch .
+RUN patch -p1 < fastsafetensors_mxfp4.patch
 
 # Final Compilation
 # We mount the ccache directory here. Ideally, map this to a host volume for persistence
diff --git a/Dockerfile.wheels b/Dockerfile.wheels
index 9edc7c9..ab48c1f 100644
--- a/Dockerfile.wheels
+++ b/Dockerfile.wheels
@@ -40,7 +40,7 @@ RUN mkdir -p tiktoken_encodings && \
     wget -O tiktoken_encodings/o200k_base.tiktoken "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken" && \
     wget -O tiktoken_encodings/cl100k_base.tiktoken "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken"
 
-# Cache TEMPORARY PATCH for fastsafetensors loading in cluster setup - tracking https://github.com/foundation-model-stack/fastsafetensors/issues/36
+# Cache TEMPORARY PATCH for fastsafetensors loading in cluster setup - tracking https://github.com/vllm-project/vllm/issues/34180
 # COPY fastsafetensors.patch .
 
 # Install fastsafetensors
@@ -66,9 +66,13 @@ RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
         --extra-index-url https://wheels.vllm.ai/nightly/cu130; \
     fi
 
-# Apply TEMPORARY PATCH for fastsafetensors loading in cluster setup - tracking https://github.com/foundation-model-stack/fastsafetensors/issues/36
+# Apply TEMPORARY PATCH for fastsafetensors loading in cluster setup - tracking https://github.com/vllm-project/vllm/issues/34180
 # Apply in site-packages
-# RUN patch -p1 -d /usr/local/lib/python3.12/dist-packages < ${VLLM_BASE_DIR}/fastsafetensors.patch
+# RUN if patch -p1 --dry-run --reverse -d /usr/local/lib/python3.12/dist-packages < ${VLLM_BASE_DIR}/fastsafetensors.patch &>/dev/null; then \
+#         echo "PR #34180 is already applied"; \
+#     else \
+#         patch -p1 -d /usr/local/lib/python3.12/dist-packages < ${VLLM_BASE_DIR}/fastsafetensors.patch; \
+#     fi
 
 
 ARG FLASHINFER_PRE=""
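For reference, the commented-out `RUN` steps above rely on a common idempotency idiom: `patch --reverse --dry-run` exits successfully only when the patch is already applied, so the real `patch` run is skipped on cached rebuilds instead of failing on a second attempt. A minimal standalone sketch of the same pattern (the file name `example.patch` is illustrative, not from this repo):

```bash
#!/usr/bin/env bash
# Apply a patch only when it is not already present.
# `patch --reverse --dry-run` exits 0 if the patch is already applied.
# `>/dev/null 2>&1` is the POSIX spelling of bash's `&>/dev/null`,
# so the check also works when RUN uses plain /bin/sh.
if patch -p1 --dry-run --reverse < example.patch >/dev/null 2>&1; then
    echo "patch already applied, skipping"
else
    patch -p1 < example.patch
fi
```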
diff --git a/README.md b/README.md
index 7a2040d..6af4b37 100644
--- a/README.md
+++ b/README.md
@@ -43,7 +43,10 @@ Build the container.
 
 **ATTENTION!**
 
-If you are getting the following error (or similar), you need to build the image from the source instead of using pre-built wheels. To do it, just remove `--use-wheels` parameter from the build command:
+As of February 9th, 2026, the wheels build is no longer the recommended way to build the container, because it lacks optimizations present in the source build.
+If you still want to use the wheels build, see the note below:
+
+If you are getting the following error (or similar) when building from wheels, you need to build the image from source instead of using pre-built wheels. To do it, just remove the `--use-wheels` parameter from the build command:
 
 ```
 0.181 Using Python 3.12.3 environment at: /usr
@@ -61,7 +64,7 @@ This error happens if vLLM nightly build fails for aarch64 platform, but succeed
 **If you have only one DGX Spark:**
 
 ```bash
-./build-and-copy.sh --use-wheels
+./build-and-copy.sh
 ```
 
 **On DGX Spark cluster:**
@@ -72,9 +75,11 @@ You can also check out our new [Networking Guide](docs/NETWORKING.md).
 Then run the following command that will build and distribute image across the cluster.
 
 ```bash
-./build-and-copy.sh --use-wheels -c
+./build-and-copy.sh -c
 ```
 
+An initial build will take around 30 minutes, but subsequent builds will be faster. You can also use precompiled wheels, which significantly speed up the build, but the source build is recommended because it uses components compiled specifically for DGX Spark.
+
 ### Run
 
 **On a single node**:
@@ -161,7 +166,7 @@ For periodic maintenance, I recommend using a filter: `docker builder prune --fi
 
 ### 2026-02-09
 
-- Migrated to a new base image with PyTorch 2.10 compiled with Spark support.
+- Migrated to a new base image with PyTorch 2.10 compiled with Spark support. With this change, the wheels build is no longer recommended - please use a source build instead.
 - Triton 3.6.0 is now default.
 - Removed temporary fastsafetensors patch, as proper fix is now merged into vLLM main branch.
 
@@ -294,11 +299,9 @@ See (this post on NVIDIA forums)[https://forums.developer.nvidia.com/t/make-glm-
 To use the mod, first build the container with Transformers 5 support (`--pre-tf`) flag, e.g.:
 
 ```bash
-./build-and-copy.sh -t vllm-node-tf5 --use-wheels --pre-tf -c
+./build-and-copy.sh -t vllm-node-tf5 --pre-tf -c
 ```
 
-Drop `--use-wheels` if you experience an error during build (see the annoucement in the Quick Start section).
-
 Then, to run on a single node:
 
 ```bash
diff --git a/build-and-copy.sh b/build-and-copy.sh
index 5e01e76..00456f3 100755
--- a/build-and-copy.sh
+++ b/build-and-copy.sh
@@ -175,6 +175,14 @@ if [ "$EXP_MXFP4" = true ]; then
     if [ "$PRE_TRANSFORMERS" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --pre-transformers"; exit 1; fi
 fi
 
+if [ -n "$USE_WHEELS_MODE" ]; then
+    read -p "!!! The wheels build is known not to work properly with all models after the migration to Torch 2.10! A full build is recommended. Do you want to continue (y/N)? " choice
+    case "$choice" in
+        y|Y ) echo "Continuing...";;
+        * ) echo "Aborting."; exit 1;;
+    esac
+fi
+
 # Validate --no-build usage
 if [ "$NO_BUILD" = true ] && [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
     echo "Error: --no-build requires --copy-to to be specified"
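The guard added to `build-and-copy.sh` above is the standard `read`/`case` confirmation idiom. A minimal standalone sketch (prompt text shortened; the `-r` flag is an extra precaution, not part of the script):

```bash
#!/usr/bin/env bash
# Ask for explicit confirmation; anything but y/Y aborts.
read -r -p "Do you want to continue (y/N)? " choice
case "$choice" in
    y|Y ) echo "Continuing...";;
    * ) echo "Aborting."; exit 1;;
esac
```

In a non-interactive run `read` receives no input, so `choice` stays empty and the guard falls through to the abort branch; piping a confirmation (for example `echo y | ./build-and-copy.sh --use-wheels`) is one way to pre-answer the prompt.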
" choice + case "$choice" in + y|Y ) echo "Continuing...";; + * ) echo "Aborting."; exit 1;; + esac +fi + # Validate --no-build usage if [ "$NO_BUILD" = true ] && [ "${#COPY_HOSTS[@]}" -eq 0 ]; then echo "Error: --no-build requires --copy-to to be specified" diff --git a/fastsafetensors.patch b/fastsafetensors.patch index 1200e0d..0fcdfc5 100644 --- a/fastsafetensors.patch +++ b/fastsafetensors.patch @@ -1,28 +1,12 @@ diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py -index 0809bdfa9..a7878f44f 100644 +index d43656c4f382..7025efd1c2de 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py -@@ -28,6 +28,7 @@ from vllm import envs - from vllm.config import ModelConfig - from vllm.config.load import LoadConfig - from vllm.distributed import get_tensor_model_parallel_rank -+from vllm.distributed.parallel_state import get_world_group - from vllm.logger import init_logger - from vllm.model_executor.layers.quantization import ( - QuantizationConfig, -@@ -770,11 +771,13 @@ def fastsafetensors_weights_iterator( - """Iterate over the weights in the model safetensor files - using fastsafetensor library.""" - if torch.distributed.is_initialized(): -- pg = torch.distributed.group.WORLD -+ world = get_world_group() -+ pg = world.device_group -+ device = world.device - else: +@@ -826,6 +826,7 @@ def fastsafetensors_weights_iterator( pg = SingleGroup() -+ device = torch.device(f"cuda:{pg.rank()}") -- device = torch.device(f"cuda:{pg.rank()}") + device = torch.device(f"cuda:{current_platform.current_device()}") ++ hf_weights_files = sorted(hf_weights_files, key=_natural_sort_key) weight_files_sub_lists = [ hf_weights_files[i : i + pg.size()] for i in range(0, len(hf_weights_files), pg.size()) diff --git a/fastsafetensors_mxfp4.patch b/fastsafetensors_mxfp4.patch new file mode 100644 index 0000000..9a04aef --- /dev/null +++ b/fastsafetensors_mxfp4.patch @@ -0,0 +1,33 @@ +--- a/vllm/model_executor/model_loader/weight_utils.py ++++ b/vllm/model_executor/model_loader/weight_utils.py +@@ -8,6 +8,7 @@ + import hashlib + import json + import os ++import re + import tempfile + import time + from collections import defaultdict +@@ -786,6 +786,14 @@ + loader.add_filenames(rank_file_map) + return loader + ++def _natural_sort_key(filepath: str) -> list: ++ """Natural sort key for filenames with numeric components, such as ++ model-00001-of-00005.safetensors -> ['model-', 1, '-of-', 5, '.safetensors']""" ++ return [ ++ int(s) if s.isdigit() else s ++ for s in re.split(r"(\d+)", os.path.basename(filepath)) ++ ] ++ + + def fastsafetensors_weights_iterator( + hf_weights_files: list[str], +@@ -801,6 +809,7 @@ + pg = SingleGroup() + device = torch.device(f"cuda:{pg.rank()}") + ++ hf_weights_files = sorted(hf_weights_files, key=_natural_sort_key) + weight_files_sub_lists = [ + hf_weights_files[i : i + pg.size()] + for i in range(0, len(hf_weights_files), pg.size())