diff --git a/Dockerfile b/Dockerfile index 21cbe84..5555ebb 100644 --- a/Dockerfile +++ b/Dockerfile @@ -12,6 +12,7 @@ ENV VLLM_BASE_DIR=/workspace/vllm # 1. Install System Dependencies # Added 'git', 'wget', and 'python3-pip' as they are required for the script steps RUN apt-get update && apt-get install -y \ + curl \ cmake \ build-essential \ ninja-build \ @@ -36,8 +37,11 @@ ENV TORCH_CUDA_ARCH_LIST=12.1a ENV TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas ENV TIKTOKEN_ENCODINGS_BASE=$VLLM_BASE_DIR/tiktoken_encodings +# --- CACHE BUSTER --- +# Change this argument to force a re-download of PyTorch/FlashInfer +ARG CACHEBUST_DEPS=1 + # 4. Install Python Dependencies (Using pip instead of uv) -#RUN python3 -m pip install --upgrade pip # Install PyTorch for CUDA 13.0 RUN pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu130 @@ -54,6 +58,11 @@ RUN pip install flashinfer-python --no-deps --index-url https://flashinfer.ai/wh # Install fast safetensors to improve loading speeds -RUN pip install fastsafetensors>=0.1.10 +RUN pip install "fastsafetensors>=0.1.10" +# --- VLLM SOURCE CACHE BUSTER --- +# Change THIS argument to force a fresh git clone and rebuild of vLLM +# without re-installing the dependencies above. +ARG CACHEBUST_VLLM=1 + # 5. Clone and Build vLLM RUN git clone --recursive https://github.com/vllm-project/vllm.git WORKDIR $VLLM_BASE_DIR/vllm @@ -63,6 +72,9 @@ RUN python3 use_existing_torch.py && \ sed -i "/flashinfer/d" requirements/cuda.txt && \ pip install -r requirements/build.txt +# TEMPORARY - apply NVFP4 patch +RUN curl -fsSL https://patch-diff.githubusercontent.com/raw/vllm-project/vllm/pull/29242.diff -o /tmp/29242.diff && git apply /tmp/29242.diff + # Final Build # Uses --no-build-isolation to respect the pre-installed Torch/FlashInfer # Changed -e (editable) to . 
(standard install) for better Docker portability @@ -73,4 +85,4 @@ WORKDIR $VLLM_BASE_DIR # Copy clustering script COPY run-cluster-node.sh $VLLM_BASE_DIR/ - +RUN chmod +x $VLLM_BASE_DIR/run-cluster-node.sh diff --git a/README.md b/README.md new file mode 100644 index 0000000..e3da041 --- /dev/null +++ b/README.md @@ -0,0 +1,118 @@ +# vLLM Ray Cluster Node Docker for DGX Spark + +This repository contains the Docker configuration and startup scripts to run a multi-node vLLM inference cluster using Ray. It supports InfiniBand/RDMA (NCCL) and custom environment configuration for high-performance setups. + +## 1\. Building the Docker Image + +The Dockerfile includes specific **Build Arguments** to allow you to selectively rebuild layers (e.g., update the vLLM source code without re-downloading PyTorch). + +### Option A: Standard Build (First Time) + +```bash +docker build -t vllm-node . +``` + +### Option B: Fast Rebuild (Update vLLM Source Only) + +Use this if you want to pull the latest code from GitHub but keep the heavy dependencies (Torch, FlashInfer, system deps) cached. + +```bash +docker build \ + --build-arg CACHEBUST_VLLM=$(date +%s) \ + -t vllm-node . +``` + +### Option C: Full Rebuild (Update All Dependencies) + +Use this to force a re-download of PyTorch, FlashInfer, and system packages. + +```bash +docker build \ + --build-arg CACHEBUST_DEPS=$(date +%s) \ + -t vllm-node . +``` + +----- + +## 2\. Running the Container + +Ray and NCCL require specific Docker flags to function correctly across multiple nodes (Shared memory, Network namespace, and Hardware access). + +```bash +docker run -it --rm \ + --gpus all \ + --net=host \ + --ipc=host \ + --privileged \ + --name vllm_node \ + vllm-node bash +``` + +**Flags Explained:** + + * `--net=host`: **Required.** Ray needs full control over network ports; port mapping is insufficient for multi-node clusters. + * `--ipc=host`: **Required.** Allows shared memory access for PyTorch/NCCL. 
+ * `--privileged`: **Required for InfiniBand.** Grants the container access to RDMA devices (`/dev/infiniband`). + +----- + +## 3\. Using `run-cluster-node.sh` + +Once inside the container, use the included script to configure the environment and launch Ray. + +### Syntax + +```bash +./run-cluster-node.sh [OPTIONS] +``` + +| Flag | Long Flag | Description | Required? | +| :--- | :--- | :--- | :--- | +| `-r` | `--role` | Role of the machine: `head` or `node`. | **Yes** | +| `-h` | `--host-ip` | The IP address of **this** specific machine (IB or Eth IP). | **Yes** | +| `-e` | `--eth-if` | Ethernet interface name (e.g., `eth0`, `enp3s0`). | **Yes** | +| `-i` | `--ib-if` | InfiniBand interface name (e.g., `ib0`, `rocep1s0f1`). | **Yes** | +| `-m` | `--head-ip` | The IP address of the **Head Node**. | Only if role is `node` | + +### Example: Starting the Head Node + +```bash +./run-cluster-node.sh \ + --role head \ + --host-ip 192.168.177.11 \ + --eth-if enp1s0f1np1 \ + --ib-if rocep1s0f1 +``` + +### Example: Starting a Worker Node + +```bash +./run-cluster-node.sh \ + --role node \ + --host-ip 192.168.177.12 \ + --eth-if enp1s0f1np1 \ + --ib-if rocep1s0f1 \ + --head-ip 192.168.177.11 +``` + +----- + +## 4\. Configuration Details + +### Environment Persistence + +The script automatically appends exported variables to `~/.bashrc`. If you need to open a second terminal into the running container for debugging, simply run: + +```bash +docker exec -it vllm_node bash +``` + +All environment variables (NCCL, Ray, vLLM config) set by the startup script will be loaded automatically in this new session. + +### Hardware Architecture + +**Note:** The Dockerfile defaults to `TORCH_CUDA_ARCH_LIST=12.1a` (NVIDIA GB10). 
If you are using different hardware, update the `ENV` variable in the Dockerfile before building: + + * **H100:** `9.0` + * **A100:** `8.0` + * **L40S:** `8.9` diff --git a/run-cluster-node.sh b/run-cluster-node.sh new file mode 100755 index 0000000..345659d --- /dev/null +++ b/run-cluster-node.sh @@ -0,0 +1,118 @@ +#!/bin/bash +set -e + +# Define a function to export immediately AND save to .bashrc for future sessions +export_persist() { + local var_name="$1" + local var_value="$2" + + # 1. Export for the current running process + export "$var_name"="$var_value" + + # 2. Append to .bashrc (idempotent check to avoid duplicate lines) + if ! grep -q "export $var_name=" ~/.bashrc; then + echo "export $var_name=\"$var_value\"" >> ~/.bashrc + else + # Optional: Update the existing line if it exists + sed -i "s|export $var_name=.*|export $var_name=\"$var_value\"|" ~/.bashrc + fi +} + +# --- Help Function --- +usage() { + echo "Usage: $0 [OPTIONS]" + echo "" + echo "Required Arguments:" + echo " -r, --role : Set the node type" + echo " -h, --host-ip : IP address of this interface (Host IP)" + echo " -e, --eth-if : Ethernet interface name (e.g., eth0)" + echo " -i, --ib-if : InfiniBand/RDMA interface name" + echo "" + echo "Conditional Arguments:" + echo " -m, --head-ip : IP of the head node (REQUIRED if role is 'node')" + echo "" + echo "Example:" + echo " $0 --role head --host-ip 192.168.1.10 --eth-if eth0 --ib-if ib0" + echo " $0 --role node --host-ip 192.168.1.20 --eth-if eth0 --ib-if ib0 --head-ip 192.168.1.10" + exit 1 +} + +# --- Argument Parsing --- + +# Initialize variables to empty +NODE_TYPE="" +HOST_IP="" +ETH_IF_NAME="" +IB_IF_NAME="" +HEAD_IP="" + +while [[ "$#" -gt 0 ]]; do + case $1 in + -r|--role) NODE_TYPE="$2"; shift ;; + -h|--host-ip) HOST_IP="$2"; shift ;; + -e|--eth-if) ETH_IF_NAME="$2"; shift ;; + -i|--ib-if) IB_IF_NAME="$2"; shift ;; + -m|--head-ip) HEAD_IP="$2"; shift ;; + *) echo "Unknown parameter passed: $1"; usage ;; + esac + shift +done + +# 
--- Validation --- + +# 1. Check if all common required arguments are present +if [[ -z "$NODE_TYPE" || -z "$HOST_IP" || -z "$ETH_IF_NAME" || -z "$IB_IF_NAME" ]]; then + echo "Error: Missing required arguments." + usage +fi + +# 2. Validate Role +if [[ "$NODE_TYPE" != "head" && "$NODE_TYPE" != "node" ]]; then + echo "Error: --role must be 'head' or 'node'." + exit 1 +fi + +# 3. Conditional Check for Head IP +if [[ "$NODE_TYPE" == "node" && -z "$HEAD_IP" ]]; then + echo "Error: When --role is 'node', you must provide --head-ip." + exit 1 +fi + +# --- Environment Configuration --- + +echo "Configuring environment for [$NODE_TYPE] at $HOST_IP..." + +# NOTE: export_persist takes TWO arguments (name, value); passing a single +# NAME=value word would export a corrupted value with a trailing '='. +export_persist VLLM_HOST_IP "$HOST_IP" +export_persist RAY_NODE_IP_ADDRESS "$VLLM_HOST_IP" +export_persist RAY_OVERRIDE_NODE_IP_ADDRESS "$VLLM_HOST_IP" + +# Network Interface +export_persist MN_IF_NAME "$ETH_IF_NAME" +export_persist UCX_NET_DEVICES "$MN_IF_NAME" +export_persist NCCL_SOCKET_IFNAME "$MN_IF_NAME" + +# InfiniBand +export_persist NCCL_IB_HCA "$IB_IF_NAME" +export_persist NCCL_IB_DISABLE 0 + +# Sockets/Transport +export_persist OMPI_MCA_btl_tcp_if_include "$MN_IF_NAME" +export_persist GLOO_SOCKET_IFNAME "$MN_IF_NAME" +export_persist TP_SOCKET_IFNAME "$MN_IF_NAME" +export_persist RAY_memory_monitor_refresh_ms 0 + +# --- Execution --- + +if [ "${NODE_TYPE}" == "head" ]; then + echo "Starting Ray HEAD node..." + exec ray start --block --head --port 6379 \ + --node-ip-address "$VLLM_HOST_IP" \ + --dashboard-host "0.0.0.0" \ + --dashboard-port 8265 +else + echo "Starting Ray WORKER node connecting to $HEAD_IP..." + exec ray start --block \ + --address="$HEAD_IP:6379" \ + --node-ip-address "$VLLM_HOST_IP" +fi