Updated Dockerfile to include 2 levels of cache busters, added the cluster script and README.
This commit is contained in:
16
Dockerfile
16
Dockerfile
@@ -12,6 +12,7 @@ ENV VLLM_BASE_DIR=/workspace/vllm
|
||||
# 1. Install System Dependencies
|
||||
# Added 'git', 'wget', and 'python3-pip' as they are required for the script steps
|
||||
RUN apt-get update && apt-get install -y \
|
||||
curl \
|
||||
cmake \
|
||||
build-essential \
|
||||
ninja-build \
|
||||
@@ -36,8 +37,11 @@ ENV TORCH_CUDA_ARCH_LIST=12.1a
|
||||
ENV TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas
|
||||
ENV TIKTOKEN_ENCODINGS_BASE=$VLLM_BASE_DIR/tiktoken_encodings
|
||||
|
||||
# --- CACHE BUSTER ---
|
||||
# Change this argument to force a re-download of PyTorch/FlashInfer
|
||||
ARG CACHEBUST_DEPS=1
|
||||
|
||||
# 4. Install Python Dependencies (Using pip instead of uv)
|
||||
#RUN python3 -m pip install --upgrade pip
|
||||
|
||||
# Install PyTorch for CUDA 13.0
|
||||
RUN pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu130
|
||||
@@ -54,6 +58,11 @@ RUN pip install flashinfer-python --no-deps --index-url https://flashinfer.ai/wh
|
||||
# Install fast safetensors to improve loading speeds
|
||||
# The requirement must be quoted: in a shell-form RUN an unquoted '>' is an
# output redirection, which would install an unpinned fastsafetensors and
# write pip's output to a file named '=0.1.10'.
RUN pip install "fastsafetensors>=0.1.10"
|
||||
|
||||
# --- VLLM SOURCE CACHE BUSTER ---
|
||||
# Change THIS argument to force a fresh git clone and rebuild of vLLM
|
||||
# without re-installing the dependencies above.
|
||||
ARG CACHEBUST_VLLM=1
|
||||
|
||||
# 5. Clone and Build vLLM
|
||||
RUN git clone --recursive https://github.com/vllm-project/vllm.git
|
||||
WORKDIR $VLLM_BASE_DIR/vllm
|
||||
@@ -63,6 +72,9 @@ RUN python3 use_existing_torch.py && \
|
||||
sed -i "/flashinfer/d" requirements/cuda.txt && \
|
||||
pip install -r requirements/build.txt
|
||||
|
||||
# TEMPORARY - apply NVFP4 patch
|
||||
RUN curl -L https://patch-diff.githubusercontent.com/raw/vllm-project/vllm/pull/29242.diff | git apply
|
||||
|
||||
# Final Build
|
||||
# Uses --no-build-isolation to respect the pre-installed Torch/FlashInfer
|
||||
# Changed -e (editable) to . (standard install) for better Docker portability
|
||||
@@ -73,4 +85,4 @@ WORKDIR $VLLM_BASE_DIR
|
||||
|
||||
# Copy clustering script
|
||||
COPY run-cluster-node.sh $VLLM_BASE_DIR/
|
||||
|
||||
RUN chmod +x $VLLM_BASE_DIR/run-cluster-node.sh
|
||||
|
||||
118
README.md
Normal file
118
README.md
Normal file
@@ -0,0 +1,118 @@
|
||||
# vLLM Ray Cluster Node Docker for DGX Spark
|
||||
|
||||
This repository contains the Docker configuration and startup scripts to run a multi-node vLLM inference cluster using Ray. It supports InfiniBand/RDMA (NCCL) and custom environment configuration for high-performance setups.
|
||||
|
||||
## 1\. Building the Docker Image
|
||||
|
||||
The Dockerfile includes specific **Build Arguments** to allow you to selectively rebuild layers (e.g., update the vLLM source code without re-downloading PyTorch).
|
||||
|
||||
### Option A: Standard Build (First Time)
|
||||
|
||||
```bash
|
||||
docker build -t vllm-node .
|
||||
```
|
||||
|
||||
### Option B: Fast Rebuild (Update vLLM Source Only)
|
||||
|
||||
Use this if you want to pull the latest code from GitHub but keep the heavy dependencies (Torch, FlashInfer, system deps) cached.
|
||||
|
||||
```bash
|
||||
docker build \
|
||||
--build-arg CACHEBUST_VLLM=$(date +%s) \
|
||||
-t vllm-node .
|
||||
```
|
||||
|
||||
### Option C: Full Rebuild (Update All Dependencies)
|
||||
|
||||
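Use this to force a re-download of PyTorch and FlashInfer; every later layer, including the vLLM clone and build, is rebuilt as well. System packages are installed before the `CACHEBUST_DEPS` argument in the Dockerfile, so they stay cached. Add `--no-cache` to the build command if you also need to refresh them.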
|
||||
|
||||
```bash
|
||||
docker build \
|
||||
--build-arg CACHEBUST_DEPS=$(date +%s) \
|
||||
-t vllm-node .
|
||||
```
|
||||
|
||||
-----
|
||||
|
||||
## 2\. Running the Container
|
||||
|
||||
Ray and NCCL require specific Docker flags to function correctly across multiple nodes (Shared memory, Network namespace, and Hardware access).
|
||||
|
||||
```bash
|
||||
docker run -it --rm \
|
||||
--gpus all \
|
||||
--net=host \
|
||||
--ipc=host \
|
||||
--privileged \
|
||||
--name vllm_node \
|
||||
vllm-node bash
|
||||
```
|
||||
|
||||
**Flags Explained:**
|
||||
|
||||
* `--net=host`: **Required.** Ray needs full control over network ports; port mapping is insufficient for multi-node clusters.
|
||||
* `--ipc=host`: **Required.** Allows shared memory access for PyTorch/NCCL.
|
||||
* `--privileged`: **Required for InfiniBand.** Grants the container access to RDMA devices (`/dev/infiniband`).
|
||||
|
||||
-----
|
||||
|
||||
## 3\. Using `run-cluster-node.sh`
|
||||
|
||||
Once inside the container, use the included script to configure the environment and launch Ray.
|
||||
|
||||
### Syntax
|
||||
|
||||
```bash
|
||||
./run-cluster-node.sh [OPTIONS]
|
||||
```
|
||||
|
||||
| Flag | Long Flag | Description | Required? |
|
||||
| :--- | :--- | :--- | :--- |
|
||||
| `-r` | `--role` | Role of the machine: `head` or `node`. | **Yes** |
|
||||
| `-h` | `--host-ip` | The IP address of **this** specific machine (IB or Eth IP). | **Yes** |
|
||||
| `-e` | `--eth-if` | Ethernet interface name (e.g., `eth0`, `enp3s0`). | **Yes** |
|
||||
| `-i` | `--ib-if` | InfiniBand interface name (e.g., `ib0`, `rocep1s0f1`). | **Yes** |
|
||||
| `-m` | `--head-ip` | The IP address of the **Head Node**. | Only if role is `node` |
|
||||
|
||||
### Example: Starting the Head Node
|
||||
|
||||
```bash
|
||||
./run-cluster-node.sh \
|
||||
--role head \
|
||||
--host-ip 192.168.177.11 \
|
||||
--eth-if enp1s0f1np1 \
|
||||
--ib-if rocep1s0f1
|
||||
```
|
||||
|
||||
### Example: Starting a Worker Node
|
||||
|
||||
```bash
|
||||
./run-cluster-node.sh \
|
||||
--role node \
|
||||
--host-ip 192.168.177.12 \
|
||||
--eth-if enp1s0f1np1 \
|
||||
--ib-if rocep1s0f1 \
|
||||
--head-ip 192.168.177.11
|
||||
```
|
||||
|
||||
-----
|
||||
|
||||
## 4\. Configuration Details
|
||||
|
||||
### Environment Persistence
|
||||
|
||||
The script automatically appends exported variables to `~/.bashrc`. If you need to open a second terminal into the running container for debugging, simply run:
|
||||
|
||||
```bash
|
||||
docker exec -it vllm_node bash
|
||||
```
|
||||
|
||||
All environment variables (NCCL, Ray, vLLM config) set by the startup script will be loaded automatically in this new session.
|
||||
|
||||
### Hardware Architecture
|
||||
|
||||
**Note:** The Dockerfile defaults to `TORCH_CUDA_ARCH_LIST=12.1a` (NVIDIA GB10). If you are using different hardware, update the `ENV` variable in the Dockerfile before building:
|
||||
|
||||
* **H100:** `9.0`
|
||||
* **A100:** `8.0`
|
||||
* **L40S:** `8.9`
|
||||
118
run-cluster-node.sh
Executable file
118
run-cluster-node.sh
Executable file
@@ -0,0 +1,118 @@
|
||||
#!/bin/bash
|
||||
set -e
|
||||
|
||||
# Export a variable for the current process AND persist it to ~/.bashrc so
# that shells started later (e.g. a second terminal into the container)
# pick it up as well.
#
# Accepted call forms:
#   export_persist NAME VALUE
#   export_persist NAME=VALUE   (split on the first '='; VALUE may contain '=')
#
# Outputs: an error message on stderr if NAME is not a valid identifier.
# Returns: 0 on success, 1 if NAME is not a valid shell identifier.
export_persist() {
  local first_arg="${1-}"
  local var_name var_value

  if [[ "$#" -ge 2 ]]; then
    var_name="$first_arg"
    var_value="$2"
  else
    # Single-argument form. Without this split, "NAME=VALUE" was taken as the
    # variable name with an empty value, which exported NAME as "VALUE=".
    var_name="${first_arg%%=*}"
    var_value=""
    if [[ "$first_arg" == *=* ]]; then
      var_value="${first_arg#*=}"
    fi
  fi

  if [[ ! "$var_name" =~ ^[A-Za-z_][A-Za-z0-9_]*$ ]]; then
    echo "export_persist: invalid variable name '$var_name'" >&2
    return 1
  fi

  # 1. Export for the current running process
  export "${var_name}=${var_value}"

  # 2. Persist to ~/.bashrc. %q shell-quotes the value so spaces, quotes or
  #    '$' survive being sourced later.
  local rc_file
  rc_file=~/.bashrc
  local rc_line
  printf -v rc_line 'export %s=%q' "$var_name" "$var_value"

  # Replace an existing "export NAME=" line in place so repeated runs do not
  # accumulate duplicates. This is done in pure bash so the value never has to
  # be escaped for a sed expression.
  local -a kept_lines=()
  local existing_line
  local found=0
  if [[ -f "$rc_file" ]]; then
    while IFS= read -r existing_line || [[ -n "$existing_line" ]]; do
      if [[ "$existing_line" == "export ${var_name}="* ]]; then
        kept_lines+=("$rc_line")
        found=1
      else
        kept_lines+=("$existing_line")
      fi
    done < "$rc_file"
  fi

  if (( found )); then
    printf '%s\n' "${kept_lines[@]}" > "$rc_file"
  else
    printf '%s\n' "$rc_line" >> "$rc_file"
  fi
}
|
||||
|
||||
# --- Help Function ---
|
||||
# Print the command-line help text to stdout and terminate the script with
# exit status 1.
usage() {
  local -a help_lines=(
    "Usage: $0 [OPTIONS]"
    ""
    "Required Arguments:"
    " -r, --role <head|node> : Set the node type"
    " -h, --host-ip <ip> : IP address of this interface (Host IP)"
    " -e, --eth-if <name> : Ethernet interface name (e.g., eth0)"
    " -i, --ib-if <name> : InfiniBand/RDMA interface name"
    ""
    "Conditional Arguments:"
    " -m, --head-ip <ip> : IP of the head node (REQUIRED if role is 'node')"
    ""
    "Example:"
    " $0 --role head --host-ip 192.168.1.10 --eth-if eth0 --ib-if ib0"
    " $0 --role node --host-ip 192.168.1.20 --eth-if eth0 --ib-if ib0 --head-ip 192.168.1.10"
  )
  printf '%s\n' "${help_lines[@]}"
  exit 1
}
|
||||
|
||||
# --- Argument Parsing ---

# Initialize variables to empty
NODE_TYPE=""
HOST_IP=""
ETH_IF_NAME=""
IB_IF_NAME=""
HEAD_IP=""

# Parse the command-line options into the globals NODE_TYPE, HOST_IP,
# ETH_IF_NAME, IB_IF_NAME and HEAD_IP.
#
# Arguments: the script's positional parameters ("$@").
# Every recognised option takes exactly one value. An option given without a
# value, or an unknown option, prints an error and calls usage (which exits).
parse_args() {
  local opt
  while [[ "$#" -gt 0 ]]; do
    opt="$1"

    case "$opt" in
      -r|--role|-h|--host-ip|-e|--eth-if|-i|--ib-if|-m|--head-ip)
        # Without this check a trailing option with no value made the second
        # 'shift' fail, and 'set -e' killed the script with no message.
        if [[ "$#" -lt 2 ]]; then
          echo "Error: option '$opt' requires a value." >&2
          usage
        fi
        ;;
      *)
        echo "Unknown parameter passed: $opt"
        usage
        ;;
    esac

    case "$opt" in
      -r|--role)    NODE_TYPE="$2" ;;
      -h|--host-ip) HOST_IP="$2" ;;
      -e|--eth-if)  ETH_IF_NAME="$2" ;;
      -i|--ib-if)   IB_IF_NAME="$2" ;;
      -m|--head-ip) HEAD_IP="$2" ;;
    esac

    # Consume the option and its value.
    shift 2
  done
}

parse_args "$@"
|
||||
|
||||
# --- Validation ---

# All four common options must be non-empty; otherwise show the help text
# (usage terminates the script).
if ! [[ -n "$NODE_TYPE" && -n "$HOST_IP" && -n "$ETH_IF_NAME" && -n "$IB_IF_NAME" ]]; then
  echo "Error: Missing required arguments."
  usage
fi

# The role decides what else is needed: a worker ('node') must know where the
# head node is; any role other than 'head' or 'node' is rejected.
case "$NODE_TYPE" in
  head)
    ;;
  node)
    if [[ -z "$HEAD_IP" ]]; then
      echo "Error: When --role is 'node', you must provide --head-ip."
      exit 1
    fi
    ;;
  *)
    echo "Error: --role must be 'head' or 'node'."
    exit 1
    ;;
esac
|
||||
|
||||
# --- Environment Configuration ---

echo "Configuring environment for [$NODE_TYPE] at $HOST_IP..."

# export_persist takes the variable name as $1 and the value as $2, so they
# are passed as two separate, quoted arguments. Passing a single NAME=VALUE
# word made the function treat the whole word as the name and exported the
# variable with a stray trailing '='.
export_persist VLLM_HOST_IP "$HOST_IP"
export_persist RAY_NODE_IP_ADDRESS "$VLLM_HOST_IP"
export_persist RAY_OVERRIDE_NODE_IP_ADDRESS "$VLLM_HOST_IP"

# Network Interface
export_persist MN_IF_NAME "$ETH_IF_NAME"
export_persist UCX_NET_DEVICES "$MN_IF_NAME"
export_persist NCCL_SOCKET_IFNAME "$MN_IF_NAME"

# InfiniBand
export_persist NCCL_IB_HCA "$IB_IF_NAME"
export_persist NCCL_IB_DISABLE "0"

# Sockets/Transport
export_persist OMPI_MCA_btl_tcp_if_include "$MN_IF_NAME"
export_persist GLOO_SOCKET_IFNAME "$MN_IF_NAME"
export_persist TP_SOCKET_IFNAME "$MN_IF_NAME"
export_persist RAY_memory_monitor_refresh_ms "0"
|
||||
|
||||
# --- Execution ---

# Replace this shell with a blocking 'ray start'. The head role starts a new
# cluster on port 6379 with the dashboard on 0.0.0.0:8265; any other role
# joins the cluster at HEAD_IP:6379.
case "${NODE_TYPE}" in
  head)
    echo "Starting Ray HEAD node..."
    ray_cmd=(
      ray start --block --head --port 6379
      --node-ip-address "$VLLM_HOST_IP"
      --dashboard-host "0.0.0.0"
      --dashboard-port 8265
    )
    ;;
  *)
    echo "Starting Ray WORKER node connecting to $HEAD_IP..."
    ray_cmd=(
      ray start --block
      --address="$HEAD_IP:6379"
      --node-ip-address "$VLLM_HOST_IP"
    )
    ;;
esac

exec "${ray_cmd[@]}"
|
||||
|
||||
Reference in New Issue
Block a user