From 03b055d7f0aee33986f890ae07c2b85b09b1d569 Mon Sep 17 00:00:00 2001 From: Eugene Rakhmatulin Date: Fri, 13 Mar 2026 11:55:18 -0700 Subject: [PATCH] Major cluster orchestration refactoring to support running without Ray --- Dockerfile | 3 - launch-cluster.sh | 258 ++++++++++++++++++-------- mods/use-ngc-vllm/run-cluster-node.sh | 117 ------------ mods/use-ngc-vllm/run.sh | 5 +- run-cluster-node.sh | 124 ------------- run-recipe.py | 27 ++- 6 files changed, 204 insertions(+), 330 deletions(-) delete mode 100755 mods/use-ngc-vllm/run-cluster-node.sh delete mode 100755 run-cluster-node.sh diff --git a/Dockerfile b/Dockerfile index dbf5709..cce0e4a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -270,9 +270,6 @@ ENV TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas ENV TIKTOKEN_ENCODINGS_BASE=$VLLM_BASE_DIR/tiktoken_encodings ENV PATH=$VLLM_BASE_DIR:$PATH -# Copy scripts -COPY run-cluster-node.sh $VLLM_BASE_DIR/ -RUN chmod +x $VLLM_BASE_DIR/run-cluster-node.sh # Final extra deps RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \ diff --git a/launch-cluster.sh b/launch-cluster.sh index 0d00407..62e01dc 100755 --- a/launch-cluster.sh +++ b/launch-cluster.sh @@ -32,6 +32,8 @@ SCRIPT_DIR="$(dirname "$(realpath "$0")")" ACTIONS_ARG="" SOLO_MODE="false" +NO_RAY_MODE="false" +LAUNCH_SCRIPT_MODE="false" MOUNT_CACHE_DIRS="true" BUILD_JOBS="" NON_PRIVILEGED_MODE="false" @@ -55,6 +57,7 @@ usage() { echo " --launch-script Path to bash script to execute in the container (from examples/ directory or absolute path). If launch script is specified, action should be omitted." echo " --check-config Check configuration and auto-detection without launching" echo " --solo Solo mode: skip autodetection, launch only on current node, do not launch Ray cluster" + echo " --no-ray No-Ray mode: run multi-node vLLM without Ray (uses PyTorch distributed backend)" echo " --no-cache-dirs Do not mount default cache directories (~/.cache/vllm, ~/.cache/flashinfer, ~/.triton)" echo " -d Daemon mode (only for 'start' action)" echo " --non-privileged Run in non-privileged mode (removes --privileged and --ipc=host)" @@ -93,6 +96,7 @@ while [[ "$#" -gt 0 ]]; do ;; --check-config) CHECK_CONFIG="true" ;; --solo) SOLO_MODE="true" ;; + --no-ray) NO_RAY_MODE="true" ;; --no-cache-dirs) MOUNT_CACHE_DIRS="false" ;; --non-privileged) NON_PRIVILEGED_MODE="true" ;; --mem-limit-gb) MEM_LIMIT_GB="$2"; shift ;; @@ -204,7 +208,8 @@ if [[ -n "$LAUNCH_SCRIPT_PATH" ]]; then # Set command to run the copied script (use absolute path since docker exec may not be in /workspace) COMMAND_TO_RUN="/workspace/exec-script.sh" - + LAUNCH_SCRIPT_MODE="true" + # If launch script is specified, default action to exec unless explicitly set to stop/status if [[ "$ACTION" == "start" ]]; then ACTION="exec" @@ -303,6 +308,11 @@ if [[ "$SOLO_MODE" == "false" && ${#PEER_NODES[@]} -eq 0 ]]; then SOLO_MODE="true" fi +if [[ "$NO_RAY_MODE" == "true" && "$SOLO_MODE" == "true" ]]; then + echo "Warning: Only one node detected; --no-ray has no effect in solo mode. Proceeding normally." + NO_RAY_MODE="false" +fi + echo "Head Node: $HEAD_IP" echo "Worker Nodes: ${PEER_NODES[*]}" echo "Container Name: $CONTAINER_NAME" @@ -377,9 +387,11 @@ if [[ "$ACTION" == "status" ]]; then # Check Head if docker ps | grep -q "$CONTAINER_NAME"; then echo "[HEAD] $HEAD_IP: Container '$CONTAINER_NAME' is RUNNING." - echo "--- Ray Status ---" - docker exec "$CONTAINER_NAME" ray status || echo "Failed to get ray status." - echo "------------------" + if [[ "$NO_RAY_MODE" == "false" ]]; then + echo "--- Ray Status ---" + docker exec "$CONTAINER_NAME" ray status || echo "Failed to get ray status." + echo "------------------" + fi else echo "[HEAD] $HEAD_IP: Container '$CONTAINER_NAME' is NOT running." fi @@ -544,11 +556,8 @@ copy_launch_script_to_container() { echo "Copying launch script to head node..." - local target_script_path="$script_path" - # Copy script into container as /workspace/exec-script.sh - echo " Copying script into container..." - docker cp "$target_script_path" "$container:/workspace/exec-script.sh" + docker cp "$script_path" "$container:/workspace/exec-script.sh" # Make executable docker exec "$container" chmod +x /workspace/exec-script.sh @@ -556,6 +565,78 @@ copy_launch_script_to_container() { echo " Launch script copied to head node" } +# Copy Launch Script to Worker via SSH + docker cp +copy_launch_script_to_worker() { + local worker_ip="$1"; local container="$2"; local script_path="$3" + echo "Copying launch script to worker $worker_ip..." + local remote_tmp="/tmp/vllm_script_$(date +%s)_$RANDOM.sh" + scp -o BatchMode=yes -o StrictHostKeyChecking=no "$script_path" "$worker_ip:$remote_tmp" + ssh -o BatchMode=yes -o StrictHostKeyChecking=no "$worker_ip" \ + "docker cp $remote_tmp $container:/workspace/exec-script.sh && \ + docker exec $container chmod +x /workspace/exec-script.sh && \ + rm -f $remote_tmp" +} + +# Patch /workspace/exec-script.sh in container: inject --nnodes/--node-rank/--master-addr/--headless +patch_script_in_container() { + local is_local="$1"; local node_ip="$2"; local container="$3" + local nnodes="$4"; local node_rank="$5"; local master_addr="$6" + + local extra="--nnodes $nnodes --node-rank $node_rank --master-addr $master_addr" + [[ "$node_rank" -gt 0 ]] && extra="$extra --headless" + + local patch="sed -i '/--distributed-executor-backend/d' /workspace/exec-script.sh && \ + sed -i 's|vllm serve|vllm serve $extra|' /workspace/exec-script.sh" + + if [[ "$is_local" == "true" ]]; then + docker exec "$container" bash -c "$patch" + else + ssh -o BatchMode=yes -o StrictHostKeyChecking=no "$node_ip" \ + "docker exec $container bash -c \"$patch\"" + fi +} + +# Build -e KEY=VALUE flags for a given node IP (used in docker run and docker exec) +get_env_flags() { + local node_ip="$1" + printf -- '-e %s ' \ + "VLLM_HOST_IP=$node_ip" \ + "RAY_NODE_IP_ADDRESS=$node_ip" \ + "RAY_OVERRIDE_NODE_IP_ADDRESS=$node_ip" \ + "MN_IF_NAME=$ETH_IF" \ + "UCX_NET_DEVICES=$ETH_IF" \ + "NCCL_SOCKET_IFNAME=$ETH_IF" \ + "NCCL_IB_HCA=$IB_IF" \ + "NCCL_IB_DISABLE=0" \ + "OMPI_MCA_btl_tcp_if_include=$ETH_IF" \ + "GLOO_SOCKET_IFNAME=$ETH_IF" \ + "TP_SOCKET_IFNAME=$ETH_IF" \ + "RAY_memory_monitor_refresh_ms=0" \ + "RAY_num_prestart_python_workers=0" \ + "RAY_object_store_memory=1073741824" +} + +# Start Ray head node inside the container +start_ray_head() { + local container="$1" + echo "Starting Ray HEAD node on $HEAD_IP..." + docker exec -d $(get_env_flags "$HEAD_IP") "$container" bash -c \ + "ray start --block --head --port 6379 --object-store-memory 1073741824 --num-cpus 2 \ + --node-ip-address $HEAD_IP --include-dashboard=false --disable-usage-stats \ + >> /proc/1/fd/1 2>&1" +} + +# Start Ray worker node inside the container on a remote host +start_ray_worker() { + local worker_ip="$1"; local container="$2" + echo "Starting Ray WORKER node on $worker_ip..." + local env_flags; env_flags=$(get_env_flags "$worker_ip") + ssh -o BatchMode=yes -o StrictHostKeyChecking=no "$worker_ip" \ + "docker exec -d $env_flags $container bash -c \ + 'ray start --block --object-store-memory 1073741824 --num-cpus 2 --disable-usage-stats \ + --address=$HEAD_IP:6379 --node-ip-address $worker_ip >> /proc/1/fd/1 2>&1'" +} + # Start Cluster Function start_cluster() { check_cluster_running @@ -564,31 +645,6 @@ start_cluster() { return fi - # Start Head Node - echo "Starting Head Node on $HEAD_IP..." - - # Ensure cache dirs exist on head - if [[ "$MOUNT_CACHE_DIRS" == "true" ]]; then - for dir in "${CACHE_DIRS_TO_CREATE[@]}"; do - mkdir -p "$dir" - done - fi - - local head_cmd_args=() - if [[ "$SOLO_MODE" == "true" ]]; then - if [[ ${#MOD_PATHS[@]} -gt 0 ]]; then - head_cmd_args=(bash -c "echo Waiting for mod application...; while [ ! -f /tmp/mod_done ]; do sleep 1; done; echo Mod applied, starting container...; exec sleep infinity") - else - head_cmd_args=(sleep infinity) - fi - else - if [[ ${#MOD_PATHS[@]} -gt 0 ]]; then - head_cmd_args=(bash -c "echo Waiting for mod application...; while [ ! -f /tmp/mod_done ]; do sleep 1; done; echo Mod applied, starting node...; exec ./run-cluster-node.sh --role head --host-ip $HEAD_IP --eth-if $ETH_IF --ib-if $IB_IF") - else - head_cmd_args=(./run-cluster-node.sh --role head --host-ip "$HEAD_IP" --eth-if "$ETH_IF" --ib-if "$IB_IF") - fi - fi - # Build docker run arguments based on mode local docker_args_common="--gpus all -d --rm --network host --name $CONTAINER_NAME $DOCKER_ARGS $IMAGE_NAME" local docker_caps_args="" @@ -603,62 +659,62 @@ start_cluster() { docker_resource_args="--ipc=host" fi + # Start Head Node + echo "Starting Head Node on $HEAD_IP..." + if [[ "$MOUNT_CACHE_DIRS" == "true" ]]; then + for dir in "${CACHE_DIRS_TO_CREATE[@]}"; do + mkdir -p "$dir" + done + fi docker run $docker_caps_args $docker_resource_args \ - $docker_args_common \ - "${head_cmd_args[@]}" + $(get_env_flags "$HEAD_IP") $docker_args_common sleep infinity # Start Worker Nodes for worker in "${PEER_NODES[@]}"; do echo "Starting Worker Node on $worker..." - - # Ensure cache dirs exist on worker if [[ "$MOUNT_CACHE_DIRS" == "true" ]]; then - # Create string of dirs to create - dirs_str="${CACHE_DIRS_TO_CREATE[*]}" - ssh "$worker" "mkdir -p $dirs_str" - fi - - local docker_run_cmd="docker run $docker_caps_args $docker_resource_args $docker_args_common" - - if [[ ${#MOD_PATHS[@]} -gt 0 ]]; then - local inner_script="echo Waiting for mod application...; while [ ! -f /tmp/mod_done ]; do sleep 1; done; echo Mod applied, starting node...; exec ./run-cluster-node.sh --role node --host-ip $worker --eth-if $ETH_IF --ib-if $IB_IF --head-ip $HEAD_IP" - ssh "$worker" "$docker_run_cmd bash -c \"$inner_script\"" - else - ssh "$worker" "$docker_run_cmd ./run-cluster-node.sh --role node --host-ip $worker --eth-if $ETH_IF --ib-if $IB_IF --head-ip $HEAD_IP" + ssh "$worker" "mkdir -p ${CACHE_DIRS_TO_CREATE[*]}" fi + local docker_run_cmd="docker run $docker_caps_args $docker_resource_args $(get_env_flags "$worker") $docker_args_common" + ssh "$worker" "$docker_run_cmd sleep infinity" done - # Apply mods if requested + # Apply mods (containers are idle — no mod_done sync needed) if [[ ${#MOD_PATHS[@]} -gt 0 ]]; then echo "Applying modifications to cluster nodes..." - - # Apply to Head for i in "${!MOD_PATHS[@]}"; do apply_mod_to_container "$HEAD_IP" "$CONTAINER_NAME" "true" "${MOD_PATHS[$i]}" "${MOD_TYPES[$i]}" done - # Signal completion on Head - docker exec "$CONTAINER_NAME" touch /tmp/mod_done - - # Apply to Workers for worker in "${PEER_NODES[@]}"; do for i in "${!MOD_PATHS[@]}"; do apply_mod_to_container "$worker" "$CONTAINER_NAME" "false" "${MOD_PATHS[$i]}" "${MOD_TYPES[$i]}" done - # Signal completion on Worker - ssh -o BatchMode=yes -o StrictHostKeyChecking=no "$worker" "docker exec $CONTAINER_NAME touch /tmp/mod_done" done fi - # Copy launch script to head node only (workers don't need it - they just run Ray) + # Copy (and patch for no-ray) launch script if [[ -n "$LAUNCH_SCRIPT_PATH" ]]; then + local total_nodes=$(( 1 + ${#PEER_NODES[@]} )) copy_launch_script_to_container "$CONTAINER_NAME" "$LAUNCH_SCRIPT_PATH" + if [[ "$NO_RAY_MODE" == "true" ]]; then + patch_script_in_container "true" "$HEAD_IP" "$CONTAINER_NAME" "$total_nodes" "0" "$HEAD_IP" + local rank=1 + for worker in "${PEER_NODES[@]}"; do + copy_launch_script_to_worker "$worker" "$CONTAINER_NAME" "$LAUNCH_SCRIPT_PATH" + patch_script_in_container "false" "$worker" "$CONTAINER_NAME" "$total_nodes" "$rank" "$HEAD_IP" + (( rank++ )) + done + fi fi - if [[ "$SOLO_MODE" == "false" ]]; then + # Start Ray cluster (unless solo or no-ray) + if [[ "$SOLO_MODE" == "false" && "$NO_RAY_MODE" == "false" ]]; then + start_ray_head "$CONTAINER_NAME" + for worker in "${PEER_NODES[@]}"; do + start_ray_worker "$worker" "$CONTAINER_NAME" + done wait_for_cluster else - echo "Solo mode active: Skipping Ray cluster readiness check." - # Give container a moment to start up sleep 2 fi } @@ -686,25 +742,73 @@ wait_for_cluster() { exit 1 } -if [[ "$ACTION" == "exec" ]]; then - start_cluster - echo "Executing command on head node: $COMMAND_TO_RUN" - +# Execute command on head node (daemon or interactive) +_exec_on_head() { + local cmd="$1" if [[ "$DAEMON_MODE" == "true" ]]; then - # Daemon mode: run command detached inside the container and exit immediately - # Extract env vars starting from VLLM_HOST_IP to avoid interactive check in .bashrc - # Redirect output to PID 1 stdout/stderr so it shows up in docker logs - docker exec -d "$CONTAINER_NAME" bash -c "eval \"\$(sed -n '/export VLLM_HOST_IP/,\$p' /root/.bashrc)\" && { $COMMAND_TO_RUN; } >> /proc/1/fd/1 2>> /proc/1/fd/2" + docker exec -d $(get_env_flags "$HEAD_IP") "$CONTAINER_NAME" bash -c "$cmd >> /proc/1/fd/1 2>&1" echo "Command dispatched in background (Daemon mode). Container: $CONTAINER_NAME" else - # Check if running in a TTY to avoid "input device is not a TTY" error - if [ -t 0 ]; then - DOCKER_EXEC_FLAGS="-it" - else - DOCKER_EXEC_FLAGS="-i" - fi + if [ -t 0 ]; then DOCKER_EXEC_FLAGS="-it"; else DOCKER_EXEC_FLAGS="-i"; fi + docker exec $DOCKER_EXEC_FLAGS $(get_env_flags "$HEAD_IP") "$CONTAINER_NAME" bash -c "$cmd" + fi +} - docker exec $DOCKER_EXEC_FLAGS "$CONTAINER_NAME" bash -i -c "$COMMAND_TO_RUN" +# Execute a no-ray multi-node command: workers (background) then head +exec_no_ray_cluster() { + local base_cmd="$1" + local total_nodes=$(( 1 + ${#PEER_NODES[@]} )) + + # Launch workers first (always background) + local rank=1 + for worker in "${PEER_NODES[@]}"; do + local worker_cmd + if [[ "$LAUNCH_SCRIPT_MODE" == "true" ]]; then + worker_cmd="$base_cmd" # script already patched per-node in start_cluster() + else + local clean + clean=$(echo "$base_cmd" | sed 's/--distributed-executor-backend[[:space:]]*[^[:space:]]*//') + worker_cmd=$(echo "$clean" | sed "s|vllm serve|vllm serve --nnodes $total_nodes --node-rank $rank --master-addr $HEAD_IP --headless|") + fi + echo "Launching worker (rank $rank) on $worker..." + local env_flags; env_flags=$(get_env_flags "$worker") + ssh -o BatchMode=yes -o StrictHostKeyChecking=no "$worker" \ + "docker exec -d $env_flags $CONTAINER_NAME bash -c \"$worker_cmd >> /proc/1/fd/1 2>&1\"" + (( rank++ )) + done + + # Launch head (rank 0) last + local head_cmd + if [[ "$LAUNCH_SCRIPT_MODE" == "true" ]]; then + head_cmd="$base_cmd" + else + local clean + clean=$(echo "$base_cmd" | sed 's/--distributed-executor-backend[[:space:]]*[^[:space:]]*//') + head_cmd=$(echo "$clean" | sed "s|vllm serve|vllm serve --nnodes $total_nodes --node-rank 0 --master-addr $HEAD_IP|") + fi + + echo "Executing command on head node (rank 0): $head_cmd" + if [[ "$DAEMON_MODE" == "true" ]]; then + docker exec -d $(get_env_flags "$HEAD_IP") "$CONTAINER_NAME" bash -c "$head_cmd >> /proc/1/fd/1 2>&1" + echo "Command dispatched in background (Daemon mode). Container: $CONTAINER_NAME" + else + if [ -t 0 ]; then DOCKER_EXEC_FLAGS="-it"; else DOCKER_EXEC_FLAGS="-i"; fi + docker exec $DOCKER_EXEC_FLAGS $(get_env_flags "$HEAD_IP") "$CONTAINER_NAME" bash -c "$head_cmd" + fi +} + +if [[ "$ACTION" == "exec" ]]; then + start_cluster + echo "Executing command: $COMMAND_TO_RUN" + + if [[ "$NO_RAY_MODE" == "true" && ${#PEER_NODES[@]} -gt 0 ]]; then + if [[ "$LAUNCH_SCRIPT_MODE" == "true" ]] || echo "$COMMAND_TO_RUN" | grep -q "vllm serve"; then + exec_no_ray_cluster "$COMMAND_TO_RUN" + else + _exec_on_head "$COMMAND_TO_RUN" + fi + else + _exec_on_head "$COMMAND_TO_RUN" fi elif [[ "$ACTION" == "start" ]]; then start_cluster diff --git a/mods/use-ngc-vllm/run-cluster-node.sh b/mods/use-ngc-vllm/run-cluster-node.sh deleted file mode 100755 index 4143e4d..0000000 --- a/mods/use-ngc-vllm/run-cluster-node.sh +++ /dev/null @@ -1,117 +0,0 @@ -#!/bin/bash -set -e - -# Define a function to export immediately AND save to .bashrc for future sessions -export_persist() { - local var_name="$1" - local var_value="$2" - - # 1. Export for the current running process - export "$var_name"="$var_value" - - # 2. Append to .bashrc (idempotent check to avoid duplicate lines) - if ! grep -q "export $var_name=" ~/.bashrc; then - echo "export $var_name=\"$var_value\"" >> ~/.bashrc - else - # Optional: Update the existing line if it exists - sed -i "s|export $var_name=.*|export $var_name=\"$var_value\"|" ~/.bashrc - fi -} - -# --- Help Function --- -usage() { - echo "Usage: $0 [OPTIONS]" - echo "" - echo "Required Arguments:" - echo " -r, --role : Set the node type" - echo " -h, --host-ip : IP address of this interface (Host IP)" - echo " -e, --eth-if : Ethernet interface name (e.g., eth0)" - echo " -i, --ib-if : InfiniBand/RDMA interface name" - echo "" - echo "Conditional Arguments:" - echo " -m, --head-ip : IP of the head node (REQUIRED if role is 'node')" - echo "" - echo "Example:" - echo " $0 --role head --host-ip 192.168.1.10 --eth-if eth0 --ib-if ib0" - echo " $0 --role node --host-ip 192.168.1.20 --eth-if eth0 --ib-if ib0 --head-ip 192.168.1.10" - exit 1 -} - -# --- Argument Parsing --- - -# Initialize variables to empty -NODE_TYPE="" -HOST_IP="" -ETH_IF_NAME="" -IB_IF_NAME="" -HEAD_IP="" - -while [[ "$#" -gt 0 ]]; do - case $1 in - -r|--role) NODE_TYPE="$2"; shift ;; - -h|--host-ip) HOST_IP="$2"; shift ;; - -e|--eth-if) ETH_IF_NAME="$2"; shift ;; - -i|--ib-if) IB_IF_NAME="$2"; shift ;; - -m|--head-ip) HEAD_IP="$2"; shift ;; - *) echo "Unknown parameter passed: $1"; usage ;; - esac - shift -done - -# --- Validation --- - -# 1. Check if all common required arguments are present -if [[ -z "$NODE_TYPE" || -z "$HOST_IP" || -z "$ETH_IF_NAME" || -z "$IB_IF_NAME" ]]; then - echo "Error: Missing required arguments." - usage -fi - -# 2. Validate Role -if [[ "$NODE_TYPE" != "head" && "$NODE_TYPE" != "node" ]]; then - echo "Error: --role must be 'head' or 'node'." - exit 1 -fi - -# 3. Conditional Check for Head IP -if [[ "$NODE_TYPE" == "node" && -z "$HEAD_IP" ]]; then - echo "Error: When --role is 'node', you must provide --head-ip." - exit 1 -fi - -# --- Environment Configuration --- - -echo "Configuring environment for [$NODE_TYPE] at $HOST_IP..." - -export_persist VLLM_HOST_IP "$HOST_IP" -export_persist RAY_NODE_IP_ADDRESS "$HOST_IP" -export_persist RAY_OVERRIDE_NODE_IP_ADDRESS "$HOST_IP" - -# Network Interface -export_persist MN_IF_NAME "$ETH_IF_NAME" -export_persist UCX_NET_DEVICES "$ETH_IF_NAME" -export_persist NCCL_SOCKET_IFNAME "$ETH_IF_NAME" - -# InfiniBand -export_persist NCCL_IB_HCA "$IB_IF_NAME" -export_persist NCCL_IB_DISABLE "0" - -# Sockets/Transport -export_persist OMPI_MCA_btl_tcp_if_include "$ETH_IF_NAME" -export_persist GLOO_SOCKET_IFNAME "$ETH_IF_NAME" -export_persist TP_SOCKET_IFNAME "$ETH_IF_NAME" -export_persist RAY_memory_monitor_refresh_ms "0" - -# --- Execution --- - -if [ "${NODE_TYPE}" == "head" ]; then - echo "Starting Ray HEAD node..." - exec ray start --block --head --port 6379 \ - --node-ip-address "$VLLM_HOST_IP" \ - --disable-usage-stats -else - echo "Starting Ray WORKER node connecting to $HEAD_IP..." - exec ray start --block \ - --address="$HEAD_IP:6379" \ - --node-ip-address "$VLLM_HOST_IP" -fi - diff --git a/mods/use-ngc-vllm/run.sh b/mods/use-ngc-vllm/run.sh index 0f6e042..7d3f721 100644 --- a/mods/use-ngc-vllm/run.sh +++ b/mods/use-ngc-vllm/run.sh @@ -1,6 +1,5 @@ #!/bin/bash set -e -echo "Setting up cluster initialization script..." -cp run-cluster-node.sh $WORKSPACE_DIR/run-cluster-node.sh -chmod +x $WORKSPACE_DIR/run-cluster-node.sh +# NGC vLLM mod: container initialization is now handled by launch-cluster.sh +echo "NGC vLLM mod applied." diff --git a/run-cluster-node.sh b/run-cluster-node.sh deleted file mode 100755 index 796fe06..0000000 --- a/run-cluster-node.sh +++ /dev/null @@ -1,124 +0,0 @@ -#!/bin/bash -set -e - -# Define a function to export immediately AND save to .bashrc for future sessions -export_persist() { - local var_name="$1" - local var_value="$2" - - # 1. Export for the current running process - export "$var_name"="$var_value" - - # 2. Append to .bashrc (idempotent check to avoid duplicate lines) - if ! grep -q "export $var_name=" ~/.bashrc; then - echo "export $var_name=\"$var_value\"" >> ~/.bashrc - else - # Optional: Update the existing line if it exists - sed -i "s|export $var_name=.*|export $var_name=\"$var_value\"|" ~/.bashrc - fi -} - -# --- Help Function --- -usage() { - echo "Usage: $0 [OPTIONS]" - echo "" - echo "Required Arguments:" - echo " -r, --role : Set the node type" - echo " -h, --host-ip : IP address of this interface (Host IP)" - echo " -e, --eth-if : Ethernet interface name (e.g., eth0)" - echo " -i, --ib-if : InfiniBand/RDMA interface name" - echo "" - echo "Conditional Arguments:" - echo " -m, --head-ip : IP of the head node (REQUIRED if role is 'node')" - echo "" - echo "Example:" - echo " $0 --role head --host-ip 192.168.1.10 --eth-if eth0 --ib-if ib0" - echo " $0 --role node --host-ip 192.168.1.20 --eth-if eth0 --ib-if ib0 --head-ip 192.168.1.10" - exit 1 -} - -# --- Argument Parsing --- - -# Initialize variables to empty -NODE_TYPE="" -HOST_IP="" -ETH_IF_NAME="" -IB_IF_NAME="" -HEAD_IP="" - -while [[ "$#" -gt 0 ]]; do - case $1 in - -r|--role) NODE_TYPE="$2"; shift ;; - -h|--host-ip) HOST_IP="$2"; shift ;; - -e|--eth-if) ETH_IF_NAME="$2"; shift ;; - -i|--ib-if) IB_IF_NAME="$2"; shift ;; - -m|--head-ip) HEAD_IP="$2"; shift ;; - *) echo "Unknown parameter passed: $1"; usage ;; - esac - shift -done - -# --- Validation --- - -# 1. Check if all common required arguments are present -if [[ -z "$NODE_TYPE" || -z "$HOST_IP" || -z "$ETH_IF_NAME" || -z "$IB_IF_NAME" ]]; then - echo "Error: Missing required arguments." - usage -fi - -# 2. Validate Role -if [[ "$NODE_TYPE" != "head" && "$NODE_TYPE" != "node" ]]; then - echo "Error: --role must be 'head' or 'node'." - exit 1 -fi - -# 3. Conditional Check for Head IP -if [[ "$NODE_TYPE" == "node" && -z "$HEAD_IP" ]]; then - echo "Error: When --role is 'node', you must provide --head-ip." - exit 1 -fi - -# --- Environment Configuration --- - -echo "Configuring environment for [$NODE_TYPE] at $HOST_IP..." - -export_persist VLLM_HOST_IP "$HOST_IP" -export_persist RAY_NODE_IP_ADDRESS "$HOST_IP" -export_persist RAY_OVERRIDE_NODE_IP_ADDRESS "$HOST_IP" - -# Network Interface -export_persist MN_IF_NAME "$ETH_IF_NAME" -export_persist UCX_NET_DEVICES "$ETH_IF_NAME" -export_persist NCCL_SOCKET_IFNAME "$ETH_IF_NAME" - -# InfiniBand -export_persist NCCL_IB_HCA "$IB_IF_NAME" -export_persist NCCL_IB_DISABLE "0" - -# Sockets/Transport -export_persist OMPI_MCA_btl_tcp_if_include "$ETH_IF_NAME" -export_persist GLOO_SOCKET_IFNAME "$ETH_IF_NAME" -export_persist TP_SOCKET_IFNAME "$ETH_IF_NAME" -export_persist RAY_memory_monitor_refresh_ms "0" - -# UMA Memory Optimization (DGX Spark 128GB shared CPU/GPU memory) -# Disable pre-started idle workers (saves ~8 GiB on head node) -export_persist RAY_num_prestart_python_workers "0" -# Limit object store to 1 GiB (default 30% of RAM = 33 GiB, wastes UMA) -export_persist RAY_object_store_memory "1073741824" - -# --- Execution --- - -if [ "${NODE_TYPE}" == "head" ]; then - echo "Starting Ray HEAD node..." - exec ray start --block --head --port 6379 --object-store-memory 1073741824 --num-cpus 2 \ - --node-ip-address "$VLLM_HOST_IP" \ - --include-dashboard=false \ - --disable-usage-stats -else - echo "Starting Ray WORKER node connecting to $HEAD_IP..." - exec ray start --block --object-store-memory 1073741824 --num-cpus 2 --disable-usage-stats \ - --address="$HEAD_IP:6379" \ - --node-ip-address "$VLLM_HOST_IP" -fi - diff --git a/run-recipe.py b/run-recipe.py index 395f185..219c2d4 100755 --- a/run-recipe.py +++ b/run-recipe.py @@ -393,7 +393,7 @@ def check_model_exists(model: str) -> bool: return False -def generate_launch_script(recipe: dict[str, Any], overrides: dict[str, Any], is_solo: bool = False, extra_args: list[str] | None = None) -> str: +def generate_launch_script(recipe: dict[str, Any], overrides: dict[str, Any], is_solo: bool = False, extra_args: list[str] | None = None, no_ray: bool = False) -> str: """ Generate a bash launch script from the recipe. @@ -458,9 +458,9 @@ def generate_launch_script(recipe: dict[str, Any], overrides: dict[str, Any], is print(f"Available parameters: {list(params.keys())}") sys.exit(1) - # In solo mode, remove --distributed-executor-backend ray - # (it's not needed and can cause issues on single node) - if is_solo: + # In solo or no-ray mode, remove --distributed-executor-backend + # (not needed for solo; no-ray uses PyTorch distributed instead) + if is_solo or no_ray: import re # Remove the entire line containing --distributed-executor-backend # This handles multi-line commands with backslash continuations @@ -820,6 +820,12 @@ Examples: launch_group.add_argument("-t", "--container", dest="container_override", help="Override container image from recipe") launch_group.add_argument("--nccl-debug", choices=["VERSION", "WARN", "INFO", "TRACE"], help="NCCL debug level") launch_group.add_argument("-e", "--env", action="append", dest="env_vars", default=[], metavar="VAR=VALUE", help="Environment variable to pass to container (e.g. -e HF_TOKEN=xxx). Can be used multiple times.") + launch_group.add_argument( + "--no-ray", + action="store_true", + dest="no_ray", + help="No-Ray mode: run multi-node vLLM without Ray (uses PyTorch distributed backend)" + ) # Cluster discovery options discover_group = parser.add_argument_group("Cluster discovery") @@ -933,6 +939,10 @@ Examples: solo_only = recipe.get("solo_only", False) is_solo = args.solo or not is_cluster + if getattr(args, 'no_ray', False) and is_solo: + print("Error: --no-ray is incompatible with --solo. Solo mode already runs without Ray.") + return 1 + if cluster_only and is_solo: print(f"Error: Recipe '{recipe['name']}' requires cluster mode.") print(f"This model is too large to run on a single node.") @@ -1097,7 +1107,7 @@ Examples: print(f" vLLM uses last value; extra args appear after template substitution") # Generate launch script - script_content = generate_launch_script(recipe, overrides, is_solo=is_solo, extra_args=extra_args) + script_content = generate_launch_script(recipe, overrides, is_solo=is_solo, extra_args=extra_args, no_ray=getattr(args, 'no_ray', False)) if args.dry_run: print("=== Generated Launch Script ===") @@ -1116,6 +1126,8 @@ Examples: cmd_parts.append("--solo") if args.daemon: cmd_parts.append("-d") + if getattr(args, 'no_ray', False): + cmd_parts.append("--no-ray") if nodes: cmd_parts.extend(["-n", ",".join(nodes)]) if args.nccl_debug: @@ -1155,7 +1167,10 @@ Examples: if args.daemon: cmd.append("-d") - + + if getattr(args, 'no_ray', False): + cmd.append("--no-ray") + # Pass nodes to launch-cluster.sh (from command line, .env, or autodiscover) if nodes: cmd.extend(["-n", ",".join(nodes)])