From a13a9f6806520bdc458a6ac19b39deae860bf5f4 Mon Sep 17 00:00:00 2001 From: Christopher Owen Date: Thu, 18 Dec 2025 13:31:54 +0100 Subject: [PATCH 01/18] Limit build parallelism to reduce OOM situations --- Dockerfile | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/Dockerfile b/Dockerfile index 78bdd9d..2285d93 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,10 +1,20 @@ # syntax=docker/dockerfile:1.6 +# Limit build parallelism to reduce OOM situations +ARG BUILD_JOBS=16 + # ========================================================= # STAGE 1: Base Image (Installs Dependencies) # ========================================================= FROM nvidia/cuda:13.1.0-devel-ubuntu24.04 AS base +# Build parallemism +ARG BUILD_JOBS +ENV MAX_JOBS=${BUILD_JOBS} +ENV CMAKE_BUILD_PARALLEL_LEVEL=${BUILD_JOBS} +ENV NINJAFLAGS="-j${BUILD_JOBS}" +ENV MAKEFLAGS="-j${BUILD_JOBS}" + # Set non-interactive frontend to prevent apt prompts ENV DEBIAN_FRONTEND=noninteractive From 1025243316436e0bfca06c867e531b4f39f334b6 Mon Sep 17 00:00:00 2001 From: Eugene Rakhmatulin Date: Thu, 18 Dec 2025 13:10:57 -0800 Subject: [PATCH 02/18] Added launch_cluster script to simplify launching cluster on nodes. --- run-cluster-node.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/run-cluster-node.sh b/run-cluster-node.sh index a18991e..2ec9049 100755 --- a/run-cluster-node.sh +++ b/run-cluster-node.sh @@ -109,7 +109,8 @@ if [ "${NODE_TYPE}" == "head" ]; then --node-ip-address "$VLLM_HOST_IP" \ --include-dashboard=True \ --dashboard-host "0.0.0.0" \ - --dashboard-port 8265 + --dashboard-port 8265 \ + --disable-usage-stats else echo "Starting Ray WORKER node connecting to $HEAD_IP..." 
exec ray start --block \ From 20a6699bf71affd5b5d214d0e874fe1e00a26f1b Mon Sep 17 00:00:00 2001 From: Eugene Rakhmatulin Date: Thu, 18 Dec 2025 13:11:13 -0800 Subject: [PATCH 03/18] Add launch_cluster script for managing cluster nodes and actions --- launch_cluster.sh | 224 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 224 insertions(+) create mode 100755 launch_cluster.sh diff --git a/launch_cluster.sh b/launch_cluster.sh new file mode 100755 index 0000000..a64da11 --- /dev/null +++ b/launch_cluster.sh @@ -0,0 +1,224 @@ +#!/bin/bash + +# Default Configuration +IMAGE_NAME="vllm-node" +DEFAULT_CONTAINER_NAME="vllm_node" +ETH_IF="enp1s0f1np1" +IB_IF="rocep1s0f1,roceP2p1s0f1" + +# Initialize variables +NODES_ARG="" +CONTAINER_NAME="$DEFAULT_CONTAINER_NAME" +COMMAND_TO_RUN="" +DAEMON_MODE="false" +ACTION="start" + +# Function to print usage +usage() { + echo "Usage: $0 -n [-t ] [--name ] [--eth-if ] [--ib-if ] [-d] [action] [command]" + echo " -n, --nodes Comma-separated list of node IPs (Mandatory)" + echo " -t Docker image name (Optional, default: $IMAGE_NAME)" + echo " --name Container name (Optional, default: $DEFAULT_CONTAINER_NAME)" + echo " --eth-if Ethernet interface (Optional, default: $ETH_IF)" + echo " --ib-if InfiniBand interface (Optional, default: $IB_IF)" + echo " -d Daemon mode (only for 'start' action)" + echo " action start | stop | status | exec (Default: start)" + echo " command Command to run (only for 'exec' action)" + exit 1 +} + +# Parse arguments +while [[ "$#" -gt 0 ]]; do + case $1 in + -n|--nodes) NODES_ARG="$2"; shift ;; + -t) IMAGE_NAME="$2"; shift ;; + --name) CONTAINER_NAME="$2"; shift ;; + --eth-if) ETH_IF="$2"; shift ;; + --ib-if) IB_IF="$2"; shift ;; + -d) DAEMON_MODE="true" ;; + -h|--help) usage ;; + start|stop|status) + ACTION="$1" + ;; + exec) + ACTION="exec" + shift + COMMAND_TO_RUN="$@" + break + ;; + *) + # If it's not a flag and not a known action, treat as exec command for backward compatibility + # unless 
it's the default 'start' implied. + # However, to support "omitted" = start, we need to be careful. + # If the arg looks like a command, it's exec. + ACTION="exec" + COMMAND_TO_RUN="$@" + break + ;; + esac + shift +done + +if [[ -z "$NODES_ARG" ]]; then + echo "Error: Nodes argument (-n) is mandatory." + usage +fi + +# Split nodes into array +IFS=',' read -r -a ALL_NODES <<< "$NODES_ARG" + +# Detect Head IP (Local IP) +HEAD_IP="" +LOCAL_IPS=$(hostname -I) +for ip in "${ALL_NODES[@]}"; do + # Trim whitespace + ip=$(echo "$ip" | xargs) + if [[ " $LOCAL_IPS " =~ " $ip " ]]; then + HEAD_IP="$ip" + break + fi +done + +if [[ -z "$HEAD_IP" ]]; then + echo "Error: Could not determine Head IP. This script must be run on one of the nodes specified in -n." + exit 1 +fi + +# Identify Worker Nodes +WORKER_NODES=() +for ip in "${ALL_NODES[@]}"; do + ip=$(echo "$ip" | xargs) + if [[ "$ip" != "$HEAD_IP" ]]; then + WORKER_NODES+=("$ip") + fi +done + +echo "Head Node: $HEAD_IP" +echo "Worker Nodes: ${WORKER_NODES[*]}" +echo "Container Name: $CONTAINER_NAME" +echo "Action: $ACTION" + +# Cleanup Function +cleanup() { + echo "" + echo "Stopping cluster..." + + # Stop Head + echo "Stopping head node ($HEAD_IP)..." + docker stop "$CONTAINER_NAME" >/dev/null 2>&1 || true + + # Stop Workers + for worker in "${WORKER_NODES[@]}"; do + echo "Stopping worker node ($worker)..." + ssh "$worker" "docker stop $CONTAINER_NAME" >/dev/null 2>&1 || true + done + + echo "Cluster stopped." +} + +# Handle 'stop' action +if [[ "$ACTION" == "stop" ]]; then + cleanup + exit 0 +fi + +# Handle 'status' action +if [[ "$ACTION" == "status" ]]; then + echo "Checking status..." + + # Check Head + if docker ps | grep -q "$CONTAINER_NAME"; then + echo "[HEAD] $HEAD_IP: Container '$CONTAINER_NAME' is RUNNING." + echo "--- Ray Status ---" + docker exec "$CONTAINER_NAME" ray status || echo "Failed to get ray status." 
+ echo "------------------" + else + echo "[HEAD] $HEAD_IP: Container '$CONTAINER_NAME' is NOT running." + fi + + # Check Workers + for worker in "${WORKER_NODES[@]}"; do + if ssh "$worker" "docker ps | grep -q '$CONTAINER_NAME'"; then + echo "[WORKER] $worker: Container '$CONTAINER_NAME' is RUNNING." + else + echo "[WORKER] $worker: Container '$CONTAINER_NAME' is NOT running." + fi + done + exit 0 +fi + +# Trap signals +# Only trap if we are NOT in daemon mode, OR if we are in exec mode (always cleanup after exec) +if [[ "$DAEMON_MODE" == "false" ]] || [[ "$ACTION" == "exec" ]]; then + trap cleanup EXIT INT TERM HUP +fi + +# Start Head Node +echo "Starting Head Node on $HEAD_IP..." +docker run -d --privileged --gpus all --rm \ + --ipc=host --network host \ + --name "$CONTAINER_NAME" \ + -e NCCL_DEBUG=INFO -e NCCL_IGNORE_CPU_AFFINITY=1 \ + -v ~/.cache/huggingface:/root/.cache/huggingface \ + "$IMAGE_NAME" \ + ./run-cluster-node.sh \ + --role head \ + --host-ip "$HEAD_IP" \ + --eth-if "$ETH_IF" \ + --ib-if "$IB_IF" + +# Start Worker Nodes +for worker in "${WORKER_NODES[@]}"; do + echo "Starting Worker Node on $worker..." + ssh "$worker" "docker run -d --privileged --gpus all --rm \ + --ipc=host --network host \ + --name $CONTAINER_NAME \ + -e NCCL_DEBUG=INFO -e NCCL_IGNORE_CPU_AFFINITY=1 \ + -v ~/.cache/huggingface:/root/.cache/huggingface \ + $IMAGE_NAME \ + ./run-cluster-node.sh \ + --role node \ + --host-ip $worker \ + --eth-if $ETH_IF \ + --ib-if $IB_IF \ + --head-ip $HEAD_IP" +done + +# Wait for Cluster Readiness +wait_for_cluster() { + echo "Waiting for cluster to be ready..." + local retries=30 + local count=0 + + while [[ $count -lt $retries ]]; do + # Check if ray is responsive + if docker exec "$CONTAINER_NAME" ray status >/dev/null 2>&1; then + echo "Cluster head is responsive." + # Give workers a moment to connect + sleep 5 + return 0 + fi + + sleep 2 + ((count++)) + done + + echo "Timeout waiting for cluster to start." 
+ exit 1 +} + +if [[ "$ACTION" == "exec" ]]; then + wait_for_cluster + echo "Executing command: $COMMAND_TO_RUN" + eval "$COMMAND_TO_RUN" +elif [[ "$ACTION" == "start" ]]; then + wait_for_cluster + if [[ "$DAEMON_MODE" == "true" ]]; then + echo "Cluster started in background (Daemon mode)." + else + echo "Cluster started. Tailing logs from head node..." + echo "Press Ctrl+C to stop the cluster." + docker logs -f "$CONTAINER_NAME" & + wait $! + fi +fi From a1ed352635f3a8002e1113a2457c2115939a3403 Mon Sep 17 00:00:00 2001 From: Eugene Rakhmatulin Date: Thu, 18 Dec 2025 13:11:48 -0800 Subject: [PATCH 04/18] renamed launch-cluster for consitency --- launch_cluster.sh => launch-cluster.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename launch_cluster.sh => launch-cluster.sh (100%) diff --git a/launch_cluster.sh b/launch-cluster.sh similarity index 100% rename from launch_cluster.sh rename to launch-cluster.sh From 25b1d8eb4f67f1f189961e73e78d8c48763d0906 Mon Sep 17 00:00:00 2001 From: Eugene Rakhmatulin Date: Thu, 18 Dec 2025 13:53:28 -0800 Subject: [PATCH 05/18] Enhance launch-cluster script with auto-detection for interfaces and nodes --- launch-cluster.sh | 156 +++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 149 insertions(+), 7 deletions(-) diff --git a/launch-cluster.sh b/launch-cluster.sh index a64da11..29de635 100755 --- a/launch-cluster.sh +++ b/launch-cluster.sh @@ -3,24 +3,27 @@ # Default Configuration IMAGE_NAME="vllm-node" DEFAULT_CONTAINER_NAME="vllm_node" -ETH_IF="enp1s0f1np1" -IB_IF="rocep1s0f1,roceP2p1s0f1" +# ETH_IF and IB_IF will be auto-detected if not provided +ETH_IF="" +IB_IF="" # Initialize variables NODES_ARG="" CONTAINER_NAME="$DEFAULT_CONTAINER_NAME" COMMAND_TO_RUN="" DAEMON_MODE="false" +CHECK_CONFIG="false" ACTION="start" # Function to print usage usage() { - echo "Usage: $0 -n [-t ] [--name ] [--eth-if ] [--ib-if ] [-d] [action] [command]" - echo " -n, --nodes Comma-separated list of node IPs (Mandatory)" + echo 
"Usage: $0 [-n ] [-t ] [--name ] [--eth-if ] [--ib-if ] [--check-config] [-d] [action] [command]" + echo " -n, --nodes Comma-separated list of node IPs (Optional, auto-detected if omitted)" echo " -t Docker image name (Optional, default: $IMAGE_NAME)" echo " --name Container name (Optional, default: $DEFAULT_CONTAINER_NAME)" - echo " --eth-if Ethernet interface (Optional, default: $ETH_IF)" - echo " --ib-if InfiniBand interface (Optional, default: $IB_IF)" + echo " --eth-if Ethernet interface (Optional, auto-detected)" + echo " --ib-if InfiniBand interface (Optional, auto-detected)" + echo " --check-config Check configuration and auto-detection without launching" echo " -d Daemon mode (only for 'start' action)" echo " action start | stop | status | exec (Default: start)" echo " command Command to run (only for 'exec' action)" @@ -35,6 +38,7 @@ while [[ "$#" -gt 0 ]]; do --name) CONTAINER_NAME="$2"; shift ;; --eth-if) ETH_IF="$2"; shift ;; --ib-if) IB_IF="$2"; shift ;; + --check-config) CHECK_CONFIG="true" ;; -d) DAEMON_MODE="true" ;; -h|--help) usage ;; start|stop|status) @@ -59,8 +63,135 @@ while [[ "$#" -gt 0 ]]; do shift done +# --- Auto-Detection Logic --- + +# Check for required tools if auto-detection is needed +if [[ -z "$ETH_IF" || -z "$IB_IF" || -z "$NODES_ARG" ]]; then + if ! command -v ibdev2netdev &> /dev/null; then + echo "Error: ibdev2netdev not found. Cannot auto-detect interfaces." + exit 1 + fi +fi + +# 1. Detect Interfaces (ETH_IF and IB_IF) +if [[ -z "$ETH_IF" || -z "$IB_IF" ]]; then + echo "Auto-detecting interfaces..." + + # Get all Up interfaces: "mlx5_0 port 1 ==> enp1s0f0np0 (Up)" + # We capture: IB_DEV, NET_DEV + mapfile -t IB_NET_PAIRS < <(ibdev2netdev | awk '/Up\)/ {print $1 " " $5}') + + if [ ${#IB_NET_PAIRS[@]} -eq 0 ]; then + echo "Error: No active IB interfaces found." 
+ exit 1 + fi + + DETECTED_IB_IFS=() + CANDIDATE_ETH_IFS=() + + for pair in "${IB_NET_PAIRS[@]}"; do + ib_dev=$(echo "$pair" | awk '{print $1}') + net_dev=$(echo "$pair" | awk '{print $2}') + + DETECTED_IB_IFS+=("$ib_dev") + + # Check if interface has an IP address + if ip addr show "$net_dev" | grep -q "inet "; then + CANDIDATE_ETH_IFS+=("$net_dev") + fi + done + + # Set IB_IF if not provided + if [[ -z "$IB_IF" ]]; then + IB_IF=$(IFS=,; echo "${DETECTED_IB_IFS[*]}") + echo " Detected IB_IF: $IB_IF" + fi + + # Set ETH_IF if not provided + if [[ -z "$ETH_IF" ]]; then + if [ ${#CANDIDATE_ETH_IFS[@]} -eq 0 ]; then + echo "Error: No active IB-associated interfaces have IP addresses." + exit 1 + fi + + # Selection logic: Prefer interface without capital 'P' + SELECTED_ETH="" + for iface in "${CANDIDATE_ETH_IFS[@]}"; do + if [[ "$iface" != *"P"* ]]; then + SELECTED_ETH="$iface" + break + fi + done + + # Fallback: Use the first one if all have 'P' or none found yet + if [[ -z "$SELECTED_ETH" ]]; then + SELECTED_ETH="${CANDIDATE_ETH_IFS[0]}" + fi + + ETH_IF="$SELECTED_ETH" + echo " Detected ETH_IF: $ETH_IF" + fi +fi + +# 2. Detect Nodes if not provided if [[ -z "$NODES_ARG" ]]; then - echo "Error: Nodes argument (-n) is mandatory." + echo "Auto-detecting nodes..." + + if ! command -v avahi-browse &> /dev/null; then + echo "Error: avahi-browse not found. Please install avahi-utils." + exit 1 + fi + + # Get local IP of the selected ETH_IF + LOCAL_IP=$(ip -4 addr show "$ETH_IF" | grep -oP '(?<=inet\s)\d+(\.\d+){3}' | head -n 1) + if [[ -z "$LOCAL_IP" ]]; then + echo "Error: Could not determine IP for interface $ETH_IF" + exit 1 + fi + + echo " Detected Local IP: $LOCAL_IP" + + DETECTED_IPS=("$LOCAL_IP") + + # Scan for other nodes + echo " Scanning for peers via avahi..." 
+ # Run avahi-browse, filter for _ssh._tcp, and look for our interface + # Note: avahi-browse output format varies, we use -p (parsable) + # Format: =;interface;IPv4;name;type;domain;hostname;ip;port;txt + + # We only care about services on our selected ETH_IF or related interfaces? + # The reference script scans ALL interfaces found by ibdev2netdev. + # Let's stick to the reference logic: scan on all IB-associated interfaces. + + TEMP_FILE=$(mktemp) + trap 'rm -f "$TEMP_FILE"' EXIT + + avahi_output=$(avahi-browse -p -r -f -t _ssh._tcp 2>/dev/null) + + # Filter by the selected management interface (ETH_IF) + echo "$avahi_output" | grep ";$ETH_IF;" > "$TEMP_FILE" + + # Extract IPs + while IFS=';' read -r prefix iface protocol name type domain hostname ip port txt; do + if [[ "$ip" =~ ^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$ ]]; then + # Avoid duplicates + if [[ ! " ${DETECTED_IPS[@]} " =~ " ${ip} " ]]; then + DETECTED_IPS+=("$ip") + echo " Found peer: $ip ($hostname)" + fi + fi + done < <(grep "^=" "$TEMP_FILE" | grep "IPv4") + + # Sort IPs + IFS=$'\n' SORTED_IPS=($(sort <<<"${DETECTED_IPS[*]}")) + unset IFS + + NODES_ARG=$(IFS=,; echo "${SORTED_IPS[*]}") + echo " Cluster Nodes: $NODES_ARG" +fi + +if [[ -z "$NODES_ARG" ]]; then + echo "Error: Nodes argument (-n) is mandatory or could not be auto-detected." usage fi @@ -98,8 +229,19 @@ echo "Worker Nodes: ${WORKER_NODES[*]}" echo "Container Name: $CONTAINER_NAME" echo "Action: $ACTION" +if [[ "$CHECK_CONFIG" == "true" ]]; then + echo "Configuration Check Complete." + echo " Image Name: $IMAGE_NAME" + echo " ETH Interface: $ETH_IF" + echo " IB Interface: $IB_IF" + exit 0 +fi + # Cleanup Function cleanup() { + # Remove traps to prevent nested cleanup + trap - EXIT INT TERM HUP + echo "" echo "Stopping cluster..." 
From f7a15bfaf5158069294a0eaa9ab5622521e94a61 Mon Sep 17 00:00:00 2001 From: Eugene Rakhmatulin Date: Thu, 18 Dec 2025 14:22:48 -0800 Subject: [PATCH 06/18] Enhance launch-cluster script with improved SSH connectivity checks for worker nodes --- launch-cluster.sh | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/launch-cluster.sh b/launch-cluster.sh index 29de635..55cb870 100755 --- a/launch-cluster.sh +++ b/launch-cluster.sh @@ -77,7 +77,7 @@ fi if [[ -z "$ETH_IF" || -z "$IB_IF" ]]; then echo "Auto-detecting interfaces..." - # Get all Up interfaces: "mlx5_0 port 1 ==> enp1s0f0np0 (Up)" + # Get all Up interfaces: "rocep1s0f1 port 1 ==> enp1s0f1np1 (Up)" # We capture: IB_DEV, NET_DEV mapfile -t IB_NET_PAIRS < <(ibdev2netdev | awk '/Up\)/ {print $1 " " $5}') @@ -229,6 +229,21 @@ echo "Worker Nodes: ${WORKER_NODES[*]}" echo "Container Name: $CONTAINER_NAME" echo "Action: $ACTION" +# Check SSH connectivity to worker nodes +if [[ "$ACTION" == "start" || "$ACTION" == "exec" || "$CHECK_CONFIG" == "true" ]]; then + if [ ${#WORKER_NODES[@]} -gt 0 ]; then + echo "Checking SSH connectivity to worker nodes..." + for worker in "${WORKER_NODES[@]}"; do + if ! ssh -o BatchMode=yes -o ConnectTimeout=5 -o StrictHostKeyChecking=no "$worker" true 2>/dev/null; then + echo "Error: Passwordless SSH to $worker failed." + echo " Please ensure SSH keys are configured and the host is reachable." + exit 1 + fi + echo " SSH to $worker: OK" + done + fi +fi + if [[ "$CHECK_CONFIG" == "true" ]]; then echo "Configuration Check Complete." 
echo " Image Name: $IMAGE_NAME" From 6c04ebfca16ee8c07108a95ca0b4e9f930c025a4 Mon Sep 17 00:00:00 2001 From: Eugene Rakhmatulin Date: Thu, 18 Dec 2025 14:50:26 -0800 Subject: [PATCH 07/18] Refactor launch-cluster script to include cluster running checks and streamline start process for head and worker nodes --- launch-cluster.sh | 89 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 60 insertions(+), 29 deletions(-) diff --git a/launch-cluster.sh b/launch-cluster.sh index 55cb870..107c369 100755 --- a/launch-cluster.sh +++ b/launch-cluster.sh @@ -310,36 +310,67 @@ if [[ "$DAEMON_MODE" == "false" ]] || [[ "$ACTION" == "exec" ]]; then trap cleanup EXIT INT TERM HUP fi -# Start Head Node -echo "Starting Head Node on $HEAD_IP..." -docker run -d --privileged --gpus all --rm \ - --ipc=host --network host \ - --name "$CONTAINER_NAME" \ - -e NCCL_DEBUG=INFO -e NCCL_IGNORE_CPU_AFFINITY=1 \ - -v ~/.cache/huggingface:/root/.cache/huggingface \ - "$IMAGE_NAME" \ - ./run-cluster-node.sh \ - --role head \ - --host-ip "$HEAD_IP" \ - --eth-if "$ETH_IF" \ - --ib-if "$IB_IF" +# Check if cluster is already running +check_cluster_running() { + local running=false + + # Check Head + if docker ps --format '{{.Names}}' | grep -q "^${CONTAINER_NAME}$"; then + echo "Warning: Container '$CONTAINER_NAME' is already running on head node ($HEAD_IP)." + running=true + fi + + # Check Workers + for worker in "${WORKER_NODES[@]}"; do + if ssh "$worker" "docker ps --format '{{.Names}}' | grep -q '^${CONTAINER_NAME}$'"; then + echo "Warning: Container '$CONTAINER_NAME' is already running on worker node ($worker)." + running=true + fi + done + + if [[ "$running" == "true" ]]; then + echo "Cluster containers are already running. Please stop them first or use a different name." + exit 1 + fi +} -# Start Worker Nodes -for worker in "${WORKER_NODES[@]}"; do - echo "Starting Worker Node on $worker..." 
- ssh "$worker" "docker run -d --privileged --gpus all --rm \ +# Start Cluster Function +start_cluster() { + check_cluster_running + + # Start Head Node + echo "Starting Head Node on $HEAD_IP..." + docker run -d --privileged --gpus all --rm \ --ipc=host --network host \ - --name $CONTAINER_NAME \ + --name "$CONTAINER_NAME" \ -e NCCL_DEBUG=INFO -e NCCL_IGNORE_CPU_AFFINITY=1 \ -v ~/.cache/huggingface:/root/.cache/huggingface \ - $IMAGE_NAME \ + "$IMAGE_NAME" \ ./run-cluster-node.sh \ - --role node \ - --host-ip $worker \ - --eth-if $ETH_IF \ - --ib-if $IB_IF \ - --head-ip $HEAD_IP" -done + --role head \ + --host-ip "$HEAD_IP" \ + --eth-if "$ETH_IF" \ + --ib-if "$IB_IF" + + # Start Worker Nodes + for worker in "${WORKER_NODES[@]}"; do + echo "Starting Worker Node on $worker..." + ssh "$worker" "docker run -d --privileged --gpus all --rm \ + --ipc=host --network host \ + --name $CONTAINER_NAME \ + -e NCCL_DEBUG=INFO -e NCCL_IGNORE_CPU_AFFINITY=1 \ + -v ~/.cache/huggingface:/root/.cache/huggingface \ + $IMAGE_NAME \ + ./run-cluster-node.sh \ + --role node \ + --host-ip $worker \ + --eth-if $ETH_IF \ + --ib-if $IB_IF \ + --head-ip $HEAD_IP" + done + + wait_for_cluster +} # Wait for Cluster Readiness wait_for_cluster() { @@ -365,11 +396,11 @@ wait_for_cluster() { } if [[ "$ACTION" == "exec" ]]; then - wait_for_cluster - echo "Executing command: $COMMAND_TO_RUN" - eval "$COMMAND_TO_RUN" + start_cluster + echo "Executing command on head node: $COMMAND_TO_RUN" + docker exec -it "$CONTAINER_NAME" bash -i -c "$COMMAND_TO_RUN" elif [[ "$ACTION" == "start" ]]; then - wait_for_cluster + start_cluster if [[ "$DAEMON_MODE" == "true" ]]; then echo "Cluster started in background (Daemon mode)." 
else From db5c4439058adb484969177928cbe02a836741b6 Mon Sep 17 00:00:00 2001 From: Eugene Rakhmatulin Date: Thu, 18 Dec 2025 14:52:23 -0800 Subject: [PATCH 08/18] Enhance launch-cluster script with improved node detection and SSH scanning using netcat and Python --- launch-cluster.sh | 78 ++++++++++++++++++++++++++--------------------- 1 file changed, 44 insertions(+), 34 deletions(-) diff --git a/launch-cluster.sh b/launch-cluster.sh index 107c369..fda39e5 100755 --- a/launch-cluster.sh +++ b/launch-cluster.sh @@ -137,50 +137,60 @@ fi if [[ -z "$NODES_ARG" ]]; then echo "Auto-detecting nodes..." - if ! command -v avahi-browse &> /dev/null; then - echo "Error: avahi-browse not found. Please install avahi-utils." - exit 1 - fi - - # Get local IP of the selected ETH_IF - LOCAL_IP=$(ip -4 addr show "$ETH_IF" | grep -oP '(?<=inet\s)\d+(\.\d+){3}' | head -n 1) - if [[ -z "$LOCAL_IP" ]]; then - echo "Error: Could not determine IP for interface $ETH_IF" + if ! command -v nc &> /dev/null; then + echo "Error: nc (netcat) not found. Please install netcat." exit 1 fi - echo " Detected Local IP: $LOCAL_IP" + if ! command -v python3 &> /dev/null; then + echo "Error: python3 not found. Please install python3." + exit 1 + fi + + # Get CIDR of the selected ETH_IF + CIDR=$(ip -o -f inet addr show "$ETH_IF" | awk '{print $4}' | head -n 1) + + if [[ -z "$CIDR" ]]; then + echo "Error: Could not determine IP/CIDR for interface $ETH_IF" + exit 1 + fi + + LOCAL_IP=${CIDR%/*} + echo " Detected Local IP: $LOCAL_IP ($CIDR)" DETECTED_IPS=("$LOCAL_IP") - # Scan for other nodes - echo " Scanning for peers via avahi..." - # Run avahi-browse, filter for _ssh._tcp, and look for our interface - # Note: avahi-browse output format varies, we use -p (parsable) - # Format: =;interface;IPv4;name;type;domain;hostname;ip;port;txt + echo " Scanning for SSH peers on $CIDR..." - # We only care about services on our selected ETH_IF or related interfaces? 
- # The reference script scans ALL interfaces found by ibdev2netdev. - # Let's stick to the reference logic: scan on all IB-associated interfaces. + # Generate list of IPs using python + ALL_IPS=$(python3 -c "import ipaddress, sys; [print(ip) for ip in ipaddress.ip_network(sys.argv[1], strict=False).hosts()]" "$CIDR") - TEMP_FILE=$(mktemp) - trap 'rm -f "$TEMP_FILE"' EXIT + TEMP_IPS_FILE=$(mktemp) - avahi_output=$(avahi-browse -p -r -f -t _ssh._tcp 2>/dev/null) + # Scan in parallel + for ip in $ALL_IPS; do + # Skip own IP + if [[ "$ip" == "$LOCAL_IP" ]]; then continue; fi + + ( + # Check port 22 with 1 second timeout + if nc -z -w 1 "$ip" 22 &>/dev/null; then + echo "$ip" >> "$TEMP_IPS_FILE" + fi + ) & + done - # Filter by the selected management interface (ETH_IF) - echo "$avahi_output" | grep ";$ETH_IF;" > "$TEMP_FILE" - - # Extract IPs - while IFS=';' read -r prefix iface protocol name type domain hostname ip port txt; do - if [[ "$ip" =~ ^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$ ]]; then - # Avoid duplicates - if [[ ! " ${DETECTED_IPS[@]} " =~ " ${ip} " ]]; then - DETECTED_IPS+=("$ip") - echo " Found peer: $ip ($hostname)" - fi - fi - done < <(grep "^=" "$TEMP_FILE" | grep "IPv4") + # Wait for all background scans to complete + wait + + # Read found IPs + if [[ -f "$TEMP_IPS_FILE" ]]; then + while read -r ip; do + DETECTED_IPS+=("$ip") + echo " Found peer: $ip" + done < "$TEMP_IPS_FILE" + rm -f "$TEMP_IPS_FILE" + fi # Sort IPs IFS=$'\n' SORTED_IPS=($(sort <<<"${DETECTED_IPS[*]}")) From 369283f6554e745e210c0fad8c3edb8e3e5f4fe1 Mon Sep 17 00:00:00 2001 From: Eugene Rakhmatulin Date: Thu, 18 Dec 2025 15:25:22 -0800 Subject: [PATCH 09/18] Updated README.md with launch-cluster details. 
--- README.md | 93 ++++++++++++++++++++++++++++++++++++++++++++--- launch-cluster.sh | 15 ++++++-- 2 files changed, 98 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index cfa201e..5d49454 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,10 @@ The Dockerfile builds from the main branch of VLLM, so depending on when you run ## CHANGELOG +### 2025-12-18 + +Added `launch-cluster.sh` convenience script for basic cluster management - see details below. + ### 2025-12-15 Updated `build-and-copy.sh` flags: @@ -151,7 +155,84 @@ docker save vllm-node | ssh your_username@another_spark_hostname_or_ip "docker l ----- -## 2\. Running the Container +## 2\. Launching the Cluster (Recommended) + +The `launch-cluster.sh` script simplifies the process of starting the cluster nodes. It handles Docker parameters, network interface detection, and node configuration automatically. + +### Basic Usage + +**Start the container (auto-detects everything):** + +```bash +./launch-cluster.sh +``` + +This will: +1. Auto-detect the active InfiniBand and Ethernet interfaces. +2. Auto-detect the node IP. +3. Launch the container in interactive mode. +4. Start the Ray cluster node (head or worker depending on the IP). + +Assumptions and limitations: + +- It assumes that you've already set up passwordless SSH access on all nodes. If not, follow NVidia's [Connect Two Sparks Playbook](https://build.nvidia.com/spark/connect-two-sparks/stacked-sparks). I recommend setting up static IPs in the configuration instead of automatically assigning them every time, but this script should work with automatically assigned addresses too. +- By default, it assumes that the container image name is `vllm-node`. If it differs, you need to specify it with `-t ` parameter. +- If both ConnectX **physical** ports are utilized, and both have IP addresses, it will use whatever interface it finds first. Use `--eth-if` to override. 
+- It will ignore IPs associated with the 2nd "clone" of the physical interface. For instance, the outermost port on Spark has two logical Ethernet interfaces: `enp1s0f1np1` and `enP2p1s0f1np1`. Only `enp1s0f1np1` will be used. To override, use `--eth-if` parameter. +- It assumes that the same physical interfaces are named the same on all nodes (IOW, enp1s0f1np1 refers to the same physical port on all nodes). If it's not the case, you will have to launch cluster nodes manually or modify the script. +- It will mount only `~/.cache/huggingface` to the container by default. If you want to mount other caches, you'll have to pass set `VLLMSPARK_EXTRA_DOCKER_ARGS` environment variable, e.g.: `VLLM_SPARK_EXTRA_DOCKER_ARGS="-v ~/.cache/vllm:/root/.cache/vllm" ./launch-cluster.sh ...` + + +**Start in daemon mode (background):** + +```bash +./launch-cluster.sh -d +``` + +**Stop the container:** + +```bash +./launch-cluster.sh stop +``` + +**Check status:** + +```bash +./launch-cluster.sh status +``` + +**Execute a command inside the running container:** + +```bash +./launch-cluster.sh exec vllm serve ... +``` + +### Auto-Detection + +The script attempts to automatically detect: +* **Ethernet Interface:** The interface associated with the active InfiniBand device that has an IP address. +* **InfiniBand Interface:** The active InfiniBand devices. By default both active RoCE interfaces that correspond to active IB port(s) will be utilized. +* **Node Role:** Based on the detected IP address and the list of nodes (defaults to `192.168.177.11` as head and `192.168.177.12` as worker). + +### Manual Overrides + +You can override the auto-detected values if needed: + +```bash +./launch-cluster.sh --nodes "10.0.0.1,10.0.0.2" --eth-if enp1s0f1np1 --ib-if rocep1s0f1 +``` + +| Flag | Description | +| :--- | :--- | +| `-n, --nodes` | Comma-separated list of node IPs (Head node first). | +| `-t` | Docker image name (default: `vllm-node`). 
| +| `--name` | Container name (default: `vllm_node`). | +| `--eth-if` | Ethernet interface name. | +| `--ib-if` | InfiniBand interface name. | +| `--check-config` | Check configuration and auto-detection without launching. | +| `-d` | Run in daemon mode (detached). | + +## 3\. Running the Container (Manual) Ray and NCCL require specific Docker flags to function correctly across multiple nodes (Shared memory, Network namespace, and Hardware access). @@ -210,7 +291,7 @@ docker run --privileged --gpus all -it --rm \ ----- -## 3\. Using `run-cluster-node.sh` +## 4\. Using `run-cluster-node.sh` (Internal) The script is used to configure the environment and launch Ray either in head or node mode. @@ -268,7 +349,7 @@ You need to make sure you allocate IP addresses to them (no need to allocate IP ----- -## 4\. Configuration Details +## 5\. Configuration Details ### Environment Persistence @@ -280,7 +361,7 @@ docker exec -it vllm_node bash All environment variables (NCCL, Ray, vLLM config) set by the startup script will be loaded automatically in this new session. -## 5\. Using cluster mode for inference +## 6\. Using cluster mode for inference First, start follow the instructions above to start the head container on your first Spark, and node container on the second Spark. Then, on the first Spark, run vllm like this: @@ -297,7 +378,7 @@ docker exec -it vllm_node And execute vllm command inside. -## 6\. Fastsafetensors +## 7\. Fastsafetensors This build includes support for fastsafetensors loading which significantly improves loading speeds, especially on DGX Spark where MMAP performance is very poor currently. [Fasttensors](https://github.com/foundation-model-stack/fastsafetensors/) solve this issue by using more efficient multi-threaded loading while avoiding mmap. 
@@ -311,7 +392,7 @@ To use this method, simply include `--load-format fastsafetensors` when running HF_HUB_OFFLINE=1 vllm serve openai/gpt-oss-120b --port 8888 --host 0.0.0.0 --trust_remote_code --swap-space 16 --gpu-memory-utilization 0.7 -tp 2 --distributed-executor-backend ray --load-format fastsafetensors ``` -## 7\. Benchmarking +## 8\. Benchmarking Follow the guidance in [VLLM Benchmark Suites](https://docs.vllm.ai/en/latest/contributing/benchmarks/) to download benchmarking dataset, and then run a benchmark with a command like this (assuming you are running on head node, otherwise specify `--host` parameter): diff --git a/launch-cluster.sh b/launch-cluster.sh index fda39e5..d255927 100755 --- a/launch-cluster.sh +++ b/launch-cluster.sh @@ -3,6 +3,14 @@ # Default Configuration IMAGE_NAME="vllm-node" DEFAULT_CONTAINER_NAME="vllm_node" +# Modify these if you want to pass additional docker args +DOCKER_ARGS="-e NCCL_DEBUG=INFO -e NCCL_IGNORE_CPU_AFFINITY=1 -v ~/.cache/huggingface:/root/.cache/huggingface" + +# Append additional arguments from environment variable +if [[ -n "$VLLMSPARK_EXTRA_DOCKER_ARGS" ]]; then + DOCKER_ARGS="$DOCKER_ARGS $VLLMSPARK_EXTRA_DOCKER_ARGS" +fi + # ETH_IF and IB_IF will be auto-detected if not provided ETH_IF="" IB_IF="" @@ -353,8 +361,7 @@ start_cluster() { docker run -d --privileged --gpus all --rm \ --ipc=host --network host \ --name "$CONTAINER_NAME" \ - -e NCCL_DEBUG=INFO -e NCCL_IGNORE_CPU_AFFINITY=1 \ - -v ~/.cache/huggingface:/root/.cache/huggingface \ + $DOCKER_ARGS \ "$IMAGE_NAME" \ ./run-cluster-node.sh \ --role head \ @@ -362,14 +369,14 @@ start_cluster() { --eth-if "$ETH_IF" \ --ib-if "$IB_IF" + # Start Worker Nodes # Start Worker Nodes for worker in "${WORKER_NODES[@]}"; do echo "Starting Worker Node on $worker..." 
ssh "$worker" "docker run -d --privileged --gpus all --rm \ --ipc=host --network host \ --name $CONTAINER_NAME \ - -e NCCL_DEBUG=INFO -e NCCL_IGNORE_CPU_AFFINITY=1 \ - -v ~/.cache/huggingface:/root/.cache/huggingface \ + $DOCKER_ARGS \ $IMAGE_NAME \ ./run-cluster-node.sh \ --role node \ From 8be691e8063f17d06280e6ca8b7550aa3c3f6478 Mon Sep 17 00:00:00 2001 From: Eugene Rakhmatulin Date: Thu, 18 Dec 2025 15:31:53 -0800 Subject: [PATCH 10/18] Fixed issue with argument passing --- README.md | 2 +- launch-cluster.sh | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 5d49454..3def732 100644 --- a/README.md +++ b/README.md @@ -180,7 +180,7 @@ Assumptions and limitations: - If both ConnectX **physical** ports are utilized, and both have IP addresses, it will use whatever interface it finds first. Use `--eth-if` to override. - It will ignore IPs associated with the 2nd "clone" of the physical interface. For instance, the outermost port on Spark has two logical Ethernet interfaces: `enp1s0f1np1` and `enP2p1s0f1np1`. Only `enp1s0f1np1` will be used. To override, use `--eth-if` parameter. - It assumes that the same physical interfaces are named the same on all nodes (IOW, enp1s0f1np1 refers to the same physical port on all nodes). If it's not the case, you will have to launch cluster nodes manually or modify the script. -- It will mount only `~/.cache/huggingface` to the container by default. If you want to mount other caches, you'll have to pass set `VLLMSPARK_EXTRA_DOCKER_ARGS` environment variable, e.g.: `VLLM_SPARK_EXTRA_DOCKER_ARGS="-v ~/.cache/vllm:/root/.cache/vllm" ./launch-cluster.sh ...` +- It will mount only `~/.cache/huggingface` to the container by default. If you want to mount other caches, you'll have to pass set `VLLMSPARK_EXTRA_DOCKER_ARGS` environment variable, e.g.: `VLLM_SPARK_EXTRA_DOCKER_ARGS="-v $HOME/.cache/vllm:/root/.cache/vllm" ./launch-cluster.sh ...`. 
Please note that you must use `$HOME` instead of `~` here as the latter won't be expanded if passed through the variable to docker arguments. **Start in daemon mode (background):** diff --git a/launch-cluster.sh b/launch-cluster.sh index d255927..ddac0bf 100755 --- a/launch-cluster.sh +++ b/launch-cluster.sh @@ -3,8 +3,8 @@ # Default Configuration IMAGE_NAME="vllm-node" DEFAULT_CONTAINER_NAME="vllm_node" -# Modify these if you want to pass additional docker args -DOCKER_ARGS="-e NCCL_DEBUG=INFO -e NCCL_IGNORE_CPU_AFFINITY=1 -v ~/.cache/huggingface:/root/.cache/huggingface" +# Modify these if you want to pass additional docker args or set VLLMSPARK_EXTRA_DOCKER_ARGS variable +DOCKER_ARGS="-e NCCL_DEBUG=INFO -e NCCL_IGNORE_CPU_AFFINITY=1 -v $HOME/.cache/huggingface:/root/.cache/huggingface" # Append additional arguments from environment variable if [[ -n "$VLLMSPARK_EXTRA_DOCKER_ARGS" ]]; then From e6efd668cd4c4cfa3b28ff19af8771ea21edbbdd Mon Sep 17 00:00:00 2001 From: Eugene Rakhmatulin Date: Thu, 18 Dec 2025 15:43:09 -0800 Subject: [PATCH 11/18] Added Table of Contents to README --- README.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/README.md b/README.md index 3def732..2bdbe5f 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,19 @@ This repository contains the Docker configuration and startup scripts to run a multi-node vLLM inference cluster using Ray. It supports InfiniBand/RDMA (NCCL) and custom environment configuration for high-performance setups. +## Table of Contents + +- [DISCLAIMER](#disclaimer) +- [CHANGELOG](#changelog) +- [1. Building the Docker Image](#1-building-the-docker-image) +- [2. Launching the Cluster (Recommended)](#2-launching-the-cluster-recommended) +- [3. Running the Container (Manual)](#3-running-the-container-manual) +- [4. Using `run-cluster-node.sh` (Internal)](#4-using-run-cluster-nodesh-internal) +- [5. Configuration Details](#5-configuration-details) +- [6. 
Using cluster mode for inference](#6-using-cluster-mode-for-inference) +- [7. Fastsafetensors](#7-fastsafetensors) +- [8. Benchmarking](#8-benchmarking) + ## DISCLAIMER This repository is not affiliated with NVIDIA or their subsidiaries. The content is provided as a reference material only, not intended for production use. From 442f7369addf23644da54413ba79a1854a453bc5 Mon Sep 17 00:00:00 2001 From: Eugene Rakhmatulin Date: Thu, 18 Dec 2025 22:02:04 -0800 Subject: [PATCH 12/18] Updated build script to handle BUILD_JOBS argument --- build-and-copy.sh | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/build-and-copy.sh b/build-and-copy.sh index 19a2c11..01bfd7e 100755 --- a/build-and-copy.sh +++ b/build-and-copy.sh @@ -13,6 +13,7 @@ SSH_USER="$USER" NO_BUILD=false TRITON_REF="v3.5.1" VLLM_REF="main" +BUILD_JOBS="16" # Help function usage() { @@ -22,6 +23,7 @@ usage() { echo " --rebuild-vllm : Set cache bust for vllm" echo " --triton-ref : Triton commit SHA, branch or tag (default: 'v3.5.1')" echo " --vllm-ref : vLLM commit SHA, branch or tag (default: 'main')" + echo " -j, --build-jobs : Number of concurrent build jobs (default: \${BUILD_JOBS})" echo " -h, --copy-to-host : Host address to copy the image to (if not set, don't copy)" echo " -u, --user : Username for ssh command (default: \$USER)" echo " --no-build : Skip building, only copy image (requires --copy-to-host)" @@ -37,6 +39,7 @@ while [[ "$#" -gt 0 ]]; do --rebuild-vllm) REBUILD_VLLM=true ;; --triton-ref) TRITON_REF="$2"; shift ;; --vllm-ref) VLLM_REF="$2"; shift ;; + -j|--build-jobs) BUILD_JOBS="$2"; shift ;; -h|--copy-to-host) COPY_HOST="$2"; shift ;; -u|--user) SSH_USER="$2"; shift ;; --no-build) NO_BUILD=true ;; @@ -74,6 +77,9 @@ if [ "$NO_BUILD" = false ]; then # Add VLLM_REF to build arguments CMD+=("--build-arg" "VLLM_REF=$VLLM_REF") + # Add BUILD_JOBS to build arguments + CMD+=("--build-arg" "BUILD_JOBS=$BUILD_JOBS") + # Add build context CMD+=(".") From cf9da89545ef7c26dcad284ed00425e70034153f 
Mon Sep 17 00:00:00 2001 From: Eugene Rakhmatulin Date: Thu, 18 Dec 2025 22:03:46 -0800 Subject: [PATCH 13/18] Updated README --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index 2bdbe5f..40b5bf1 100644 --- a/README.md +++ b/README.md @@ -29,6 +29,8 @@ The Dockerfile builds from the main branch of VLLM, so depending on when you run Added `launch-cluster.sh` convenience script for basic cluster management - see details below. +Added `-j` / `--build-jobs` argument to `build-and-copy.sh` to control build parallelism. + ### 2025-12-15 Updated `build-and-copy.sh` flags: @@ -79,6 +81,7 @@ Using a provided build script is recommended, but if you want to build using `do | `CACHEBUST_VLLM` | `1` | Change this to force a fresh git clone and rebuild of vLLM source code. | | `TRITON_REF` | `v3.5.1` | Triton commit SHA, branch, or tag to build. | | `VLLM_REF` | `main` | vLLM commit SHA, branch, or tag to build. | +| `BUILD_JOBS` | `16` | Number of parallel build jobs (default: 16). 
| ### Using the Build Script (Recommended) @@ -149,6 +152,7 @@ Using a different username: | `--rebuild-vllm` | Force rebuild vLLM source only (sets CACHEBUST_VLLM) | | `--triton-ref ` | Triton commit SHA, branch or tag (default: 'v3.5.1') | | `--vllm-ref ` | vLLM commit SHA, branch or tag (default: 'main') | +| `-j, --build-jobs ` | Number of parallel build jobs (default: Dockerfile default) | | `-h, --copy-to-host ` | Host address to copy the image to after building | | `-u, --user ` | Username for SSH connection (default: current user) | | `--no-build` | Skip building, only copy existing image (requires `--copy-to-host`) | From 8c53179cc219f97e42732fb6d1af03cb78a30d8d Mon Sep 17 00:00:00 2001 From: Eugene Rakhmatulin Date: Thu, 18 Dec 2025 22:27:27 -0800 Subject: [PATCH 14/18] changed extra docker args variable to VLLM_SPARK_EXTRA_DOCKER_ARGS for consistency --- README.md | 2 +- launch-cluster.sh | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 40b5bf1..3919e2b 100644 --- a/README.md +++ b/README.md @@ -197,7 +197,7 @@ Assumptions and limitations: - If both ConnectX **physical** ports are utilized, and both have IP addresses, it will use whatever interface it finds first. Use `--eth-if` to override. - It will ignore IPs associated with the 2nd "clone" of the physical interface. For instance, the outermost port on Spark has two logical Ethernet interfaces: `enp1s0f1np1` and `enP2p1s0f1np1`. Only `enp1s0f1np1` will be used. To override, use `--eth-if` parameter. - It assumes that the same physical interfaces are named the same on all nodes (IOW, enp1s0f1np1 refers to the same physical port on all nodes). If it's not the case, you will have to launch cluster nodes manually or modify the script. -- It will mount only `~/.cache/huggingface` to the container by default. 
If you want to mount other caches, you'll have to pass set `VLLMSPARK_EXTRA_DOCKER_ARGS` environment variable, e.g.: `VLLM_SPARK_EXTRA_DOCKER_ARGS="-v $HOME/.cache/vllm:/root/.cache/vllm" ./launch-cluster.sh ...`. Please note that you must use `$HOME` instead of `~` here as the latter won't be expanded if passed through the variable to docker arguments. +- It will mount only `~/.cache/huggingface` to the container by default. If you want to mount other caches, you'll have to set the `VLLM_SPARK_EXTRA_DOCKER_ARGS` environment variable, e.g.: `VLLM_SPARK_EXTRA_DOCKER_ARGS="-v $HOME/.cache/vllm:/root/.cache/vllm" ./launch-cluster.sh ...`. Please note that you must use `$HOME` instead of `~` here as the latter won't be expanded if passed through the variable to docker arguments. **Start in daemon mode (background):** diff --git a/launch-cluster.sh b/launch-cluster.sh index ddac0bf..ae1e857 100755 --- a/launch-cluster.sh +++ b/launch-cluster.sh @@ -3,12 +3,12 @@ # Default Configuration IMAGE_NAME="vllm-node" DEFAULT_CONTAINER_NAME="vllm_node" -# Modify these if you want to pass additional docker args or set VLLMSPARK_EXTRA_DOCKER_ARGS variable +# Modify these if you want to pass additional docker args or set VLLM_SPARK_EXTRA_DOCKER_ARGS variable DOCKER_ARGS="-e NCCL_DEBUG=INFO -e NCCL_IGNORE_CPU_AFFINITY=1 -v $HOME/.cache/huggingface:/root/.cache/huggingface" # Append additional arguments from environment variable -if [[ -n "$VLLMSPARK_EXTRA_DOCKER_ARGS" ]]; then - DOCKER_ARGS="$DOCKER_ARGS $VLLMSPARK_EXTRA_DOCKER_ARGS" +if [[ -n "$VLLM_SPARK_EXTRA_DOCKER_ARGS" ]]; then + DOCKER_ARGS="$DOCKER_ARGS $VLLM_SPARK_EXTRA_DOCKER_ARGS" fi # ETH_IF and IB_IF will be auto-detected if not provided From 2a2f8f24e218287f829a22faebc039b09b1ec33d Mon Sep 17 00:00:00 2001 From: Eugene Rakhmatulin Date: Thu, 18 Dec 2025 23:02:58 -0800 Subject: [PATCH 15/18] Allow launch-cluster.sh to be executed in non-TTY environment --- launch-cluster.sh | 10 +++++++++- 1 file changed, 9
insertions(+), 1 deletion(-) diff --git a/launch-cluster.sh b/launch-cluster.sh index ae1e857..09e8b85 100755 --- a/launch-cluster.sh +++ b/launch-cluster.sh @@ -415,7 +415,15 @@ wait_for_cluster() { if [[ "$ACTION" == "exec" ]]; then start_cluster echo "Executing command on head node: $COMMAND_TO_RUN" - docker exec -it "$CONTAINER_NAME" bash -i -c "$COMMAND_TO_RUN" + + # Check if running in a TTY to avoid "input device is not a TTY" error + if [ -t 0 ]; then + DOCKER_EXEC_FLAGS="-it" + else + DOCKER_EXEC_FLAGS="-i" + fi + + docker exec $DOCKER_EXEC_FLAGS "$CONTAINER_NAME" bash -i -c "$COMMAND_TO_RUN" elif [[ "$ACTION" == "start" ]]; then start_cluster if [[ "$DAEMON_MODE" == "true" ]]; then From 0377e9badf770c77814d2ce3565e784817ab7cb8 Mon Sep 17 00:00:00 2001 From: Eugene Rakhmatulin Date: Thu, 18 Dec 2025 23:12:39 -0800 Subject: [PATCH 16/18] Bugfix: don't shut down on exit if cluster is already running --- launch-cluster.sh | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/launch-cluster.sh b/launch-cluster.sh index 09e8b85..56fd05b 100755 --- a/launch-cluster.sh +++ b/launch-cluster.sh @@ -22,6 +22,7 @@ COMMAND_TO_RUN="" DAEMON_MODE="false" CHECK_CONFIG="false" ACTION="start" +CLUSTER_WAS_RUNNING="false" # Function to print usage usage() { @@ -275,6 +276,11 @@ cleanup() { # Remove traps to prevent nested cleanup trap - EXIT INT TERM HUP + if [[ "$CLUSTER_WAS_RUNNING" == "true" ]]; then + echo "Cluster was already running when script started. Skipping cleanup." + return + fi + echo "" echo "Stopping cluster..." @@ -347,8 +353,9 @@ check_cluster_running() { done if [[ "$running" == "true" ]]; then - echo "Cluster containers are already running. Please stop them first or use a different name." - exit 1 + echo "Cluster containers are already running. Skipping launch." 
+ CLUSTER_WAS_RUNNING="true" + return 0 fi } @@ -356,6 +363,10 @@ check_cluster_running() { start_cluster() { check_cluster_running + if [[ "$CLUSTER_WAS_RUNNING" == "true" ]]; then + return + fi + # Start Head Node echo "Starting Head Node on $HEAD_IP..." docker run -d --privileged --gpus all --rm \ From 294d155532ac88aeacba10d06a5d7b9bc1b80ee7 Mon Sep 17 00:00:00 2001 From: Eugene Rakhmatulin Date: Thu, 18 Dec 2025 23:28:12 -0800 Subject: [PATCH 17/18] Add NCCL debug level option to launch-cluster.sh --- README.md | 1 + launch-cluster.sh | 28 ++++++++++++++++++++++++++-- 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 3919e2b..4875ab2 100644 --- a/README.md +++ b/README.md @@ -246,6 +246,7 @@ You can override the auto-detected values if needed: | `--name` | Container name (default: `vllm_node`). | | `--eth-if` | Ethernet interface name. | | `--ib-if` | InfiniBand interface name. | +| `--nccl-debug` | NCCL debug level (e.g., INFO, WARN). Defaults to INFO if flag is present but value is omitted. | | `--check-config` | Check configuration and auto-detection without launching. | | `-d` | Run in daemon mode (detached). 
| diff --git a/launch-cluster.sh b/launch-cluster.sh index 56fd05b..38aba90 100755 --- a/launch-cluster.sh +++ b/launch-cluster.sh @@ -4,7 +4,7 @@ IMAGE_NAME="vllm-node" DEFAULT_CONTAINER_NAME="vllm_node" # Modify these if you want to pass additional docker args or set VLLM_SPARK_EXTRA_DOCKER_ARGS variable -DOCKER_ARGS="-e NCCL_DEBUG=INFO -e NCCL_IGNORE_CPU_AFFINITY=1 -v $HOME/.cache/huggingface:/root/.cache/huggingface" +DOCKER_ARGS="-e NCCL_IGNORE_CPU_AFFINITY=1 -v $HOME/.cache/huggingface:/root/.cache/huggingface" # Append additional arguments from environment variable if [[ -n "$VLLM_SPARK_EXTRA_DOCKER_ARGS" ]]; then @@ -14,6 +14,7 @@ fi # ETH_IF and IB_IF will be auto-detected if not provided ETH_IF="" IB_IF="" +NCCL_DEBUG_VAL="" # Initialize variables NODES_ARG="" @@ -26,12 +27,13 @@ CLUSTER_WAS_RUNNING="false" # Function to print usage usage() { - echo "Usage: $0 [-n ] [-t ] [--name ] [--eth-if ] [--ib-if ] [--check-config] [-d] [action] [command]" + echo "Usage: $0 [-n ] [-t ] [--name ] [--eth-if ] [--ib-if ] [--nccl-debug ] [--check-config] [-d] [action] [command]" echo " -n, --nodes Comma-separated list of node IPs (Optional, auto-detected if omitted)" echo " -t Docker image name (Optional, default: $IMAGE_NAME)" echo " --name Container name (Optional, default: $DEFAULT_CONTAINER_NAME)" echo " --eth-if Ethernet interface (Optional, auto-detected)" echo " --ib-if InfiniBand interface (Optional, auto-detected)" + echo " --nccl-debug NCCL debug level (Optional, one of: VERSION, WARN, INFO, TRACE). If no level is provided, defaults to INFO." 
echo " --check-config Check configuration and auto-detection without launching" echo " -d Daemon mode (only for 'start' action)" echo " action start | stop | status | exec (Default: start)" @@ -47,6 +49,14 @@ while [[ "$#" -gt 0 ]]; do --name) CONTAINER_NAME="$2"; shift ;; --eth-if) ETH_IF="$2"; shift ;; --ib-if) IB_IF="$2"; shift ;; + --nccl-debug) + if [[ -n "$2" && "$2" =~ ^(VERSION|WARN|INFO|TRACE)$ ]]; then + NCCL_DEBUG_VAL="$2" + shift + else + NCCL_DEBUG_VAL="INFO" + fi + ;; --check-config) CHECK_CONFIG="true" ;; -d) DAEMON_MODE="true" ;; -h|--help) usage ;; @@ -72,6 +82,20 @@ while [[ "$#" -gt 0 ]]; do shift done +# Append NCCL_DEBUG if set, with validation +if [[ -n "$NCCL_DEBUG_VAL" ]]; then + case "$NCCL_DEBUG_VAL" in + VERSION|WARN|INFO|TRACE) + DOCKER_ARGS="$DOCKER_ARGS -e NCCL_DEBUG=$NCCL_DEBUG_VAL" + ;; + *) + echo "Error: Invalid value for --nccl-debug: $NCCL_DEBUG_VAL" + echo "Allowed values: VERSION, WARN, INFO, TRACE" + exit 1 + ;; + esac +fi + # --- Auto-Detection Logic --- # Check for required tools if auto-detection is needed From de055928b8db9bec58a4428098781c9bd1f2d9aa Mon Sep 17 00:00:00 2001 From: Eugene Rakhmatulin Date: Thu, 18 Dec 2025 23:29:03 -0800 Subject: [PATCH 18/18] Update CHANGELOG: Document --nccl-debug option for NCCL debug level control --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 4875ab2..06cfe09 100644 --- a/README.md +++ b/README.md @@ -30,6 +30,7 @@ The Dockerfile builds from the main branch of VLLM, so depending on when you run Added `launch-cluster.sh` convenience script for basic cluster management - see details below. Added `-j` / `--build-jobs` argument to `build-and-copy.sh` to control build parallelism. +Added `--nccl-debug` option to specify NCCL debug level. Default is none to decrease verbosity. ### 2025-12-15