diff --git a/Dockerfile b/Dockerfile
index 389372f..1a84650 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -80,6 +80,10 @@ RUN python3 use_existing_torch.py && \
     sed -i "/flashinfer/d" requirements/cuda.txt && \
     pip install -r requirements/build.txt
 
+# TEMPORARY PATCH for fastsafetensors loading in cluster setup - tracking https://github.com/foundation-model-stack/fastsafetensors/issues/36
+COPY fastsafetensors.patch .
+RUN patch -p1 < fastsafetensors.patch
+
 # Final Build
 # Uses --no-build-isolation to respect the pre-installed Torch/FlashInfer
 RUN pip install --no-build-isolation . -v
diff --git a/README.md b/README.md
index 552dda6..a0159a2 100644
--- a/README.md
+++ b/README.md
@@ -10,6 +10,14 @@ Some of the steps and parameters may be unnecessary, and some may be missing. Th
 
 The Dockerfile builds from the main branch of VLLM, so depending on when you run the build process, it may not be in fully functioning state.
 
+## CHANGELOG
+
+### 2025-11-26
+
+Initial release.
+Updated the RoCE configuration example to include both interfaces in the list.
+Applied a patch to enable fastsafetensors in a cluster configuration (EXPERIMENTAL) and added documentation on fastsafetensors use.
+
 ## 1\. Building the Docker Image
 
 The Dockerfile includes specific **Build Arguments** to allow you to selectively rebuild layers (e.g., update the vLLM source code without re-downloading PyTorch).
@@ -198,7 +206,21 @@ docker exec -it vllm_node
 
 And execute vllm command inside.
 
-## 6\. Benchmarking
+## 6\. Fastsafetensors
+
+This build includes support for fastsafetensors loading, which significantly improves model loading speed, especially on DGX Spark, where mmap performance is currently very poor.
+[Fastsafetensors](https://github.com/foundation-model-stack/fastsafetensors/) solves this issue by using more efficient multi-threaded loading while avoiding mmap.
+
+This build also includes an EXPERIMENTAL patch to allow use of fastsafetensors in a cluster configuration (it won't work without the patch!).
+Please refer to [this issue](https://github.com/foundation-model-stack/fastsafetensors/issues/36) for details.
+
+To use this method, include `--load-format fastsafetensors` when running vLLM, for example:
+
+```bash
+HF_HUB_OFFLINE=1 vllm serve openai/gpt-oss-120b --port 8888 --host 0.0.0.0 --trust_remote_code --swap-space 16 --gpu-memory-utilization 0.7 -tp 2 --distributed-executor-backend ray --load-format fastsafetensors
+```
+
+## 7\. Benchmarking
 
 Follow the guidance in [VLLM Benchmark Suites](https://docs.vllm.ai/en/latest/contributing/benchmarks/) to download benchmarking dataset, and then run a benchmark with a command like this (assuming you are running on head node, otherwise specify `--host` parameter):
 
diff --git a/fastsafetensors.patch b/fastsafetensors.patch
new file mode 100644
index 0000000..1200e0d
--- /dev/null
+++ b/fastsafetensors.patch
@@ -0,0 +1,28 @@
+diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
+index 0809bdfa9..a7878f44f 100644
+--- a/vllm/model_executor/model_loader/weight_utils.py
++++ b/vllm/model_executor/model_loader/weight_utils.py
+@@ -28,6 +28,7 @@ from vllm import envs
+ from vllm.config import ModelConfig
+ from vllm.config.load import LoadConfig
+ from vllm.distributed import get_tensor_model_parallel_rank
++from vllm.distributed.parallel_state import get_world_group
+ from vllm.logger import init_logger
+ from vllm.model_executor.layers.quantization import (
+     QuantizationConfig,
+@@ -770,11 +771,13 @@ def fastsafetensors_weights_iterator(
+     """Iterate over the weights in the model safetensor files
+     using fastsafetensor library."""
+     if torch.distributed.is_initialized():
+-        pg = torch.distributed.group.WORLD
++        world = get_world_group()
++        pg = world.device_group
++        device = world.device
+     else:
+         pg = SingleGroup()
++        device = torch.device(f"cuda:{pg.rank()}")
+ 
+-    device = torch.device(f"cuda:{pg.rank()}")
+     weight_files_sub_lists = [
+         hf_weights_files[i : i + pg.size()]
+         for i in range(0, len(hf_weights_files), pg.size())
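
Note on the patch above: upstream derives the device from the *global* rank of `torch.distributed.group.WORLD`, which only works on a single node; on a cluster, the global rank can exceed the number of GPUs on a node, so the patch takes both the process group and the device from vLLM's world group (`get_world_group()`), whose `device` field already holds the correct local device. The following is a minimal illustrative sketch of the failure mode, not part of the patch; the 2-node x 2-GPU layout is a hypothetical assumption.

```python
# Illustrative sketch only - not part of the patch.
# Assumes a hypothetical cluster of 2 nodes with 2 GPUs each (4 ranks total).
GPUS_PER_NODE = 2
WORLD_SIZE = 4

for global_rank in range(WORLD_SIZE):
    local_rank = global_rank % GPUS_PER_NODE
    # Old behaviour: device derived from the global rank. Ranks 2 and 3
    # would request cuda:2/cuda:3, which do not exist on a 2-GPU node,
    # so fastsafetensors loading fails in a cluster setup.
    naive_device = f"cuda:{global_rank}"
    # Patched behaviour: the device comes from vLLM's world group,
    # which tracks the local rank on each node.
    patched_device = f"cuda:{local_rank}"
    print(f"rank {global_rank}: naive={naive_device}, patched={patched_device}")
```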