#!/bin/bash
#
# launch_cluster.sh - start, stop, inspect, or run a command against a
# multi-node cluster made of one Docker container per node.
#
# The machine running this script must own one of the IPs passed to -n; that
# IP becomes the head node. Every other IP is treated as a worker and is
# driven over ssh.

# Default Configuration
IMAGE_NAME="vllm-node"
DEFAULT_CONTAINER_NAME="vllm_node"
ETH_IF="enp1s0f1np1"
IB_IF="rocep1s0f1,roceP2p1s0f1"

# Initialize variables
NODES_ARG=""
CONTAINER_NAME="$DEFAULT_CONTAINER_NAME"
COMMAND_TO_RUN=""
DAEMON_MODE="false"
ACTION="start"

#######################################
# Print the usage text and terminate the script.
# Globals:   IMAGE_NAME, DEFAULT_CONTAINER_NAME, ETH_IF, IB_IF (read, shown
#            as the defaults)
# Outputs:   usage text on stdout
# Returns:   never returns; exits with status 1
#######################################
usage() {
  cat <<EOF
Usage: $0 -n <ips> [-t <image>] [--name <name>] [--eth-if <if>] [--ib-if <if>] [-d] [action] [command]
  -n, --nodes   Comma-separated list of node IPs (Mandatory)
  -t            Docker image name (Optional, default: $IMAGE_NAME)
  --name        Container name (Optional, default: $DEFAULT_CONTAINER_NAME)
  --eth-if      Ethernet interface (Optional, default: $ETH_IF)
  --ib-if       InfiniBand interface (Optional, default: $IB_IF)
  -d            Daemon mode (only for 'start' action)
  action        start | stop | status | exec (Default: start)
  command       Command to run (only for 'exec' action)
EOF
  exit 1
}

#######################################
# Abort with a usage message unless an option is followed by a non-empty value.
# Arguments: the remaining command line ("$@"): $1 is the option, $2 its value
# Outputs:   error message on stderr when the value is missing
# Returns:   0 when a value is present; otherwise exits through usage
#######################################
require_value() {
  if [[ "$#" -lt 2 || -z "$2" ]]; then
    echo "Error: Option $1 requires a non-empty value." >&2
    usage
  fi
}

# Parse arguments
while [[ "$#" -gt 0 ]]; do
  case "$1" in
    -n|--nodes) require_value "$@"; NODES_ARG="$2"; shift ;;
    -t)         require_value "$@"; IMAGE_NAME="$2"; shift ;;
    --name)     require_value "$@"; CONTAINER_NAME="$2"; shift ;;
    --eth-if)   require_value "$@"; ETH_IF="$2"; shift ;;
    --ib-if)    require_value "$@"; IB_IF="$2"; shift ;;
    -d)         DAEMON_MODE="true" ;;
    -h|--help)  usage ;;
    start|stop|status)
      ACTION="$1"
      ;;
    exec)
      # Everything after 'exec' is the command; stop option parsing here.
      ACTION="exec"
      shift
      COMMAND_TO_RUN="$*"
      break
      ;;
    -*)
      # A mistyped flag must not be mistaken for a command to execute.
      echo "Error: Unknown option: $1" >&2
      usage
      ;;
    *)
      # Backward compatibility: a bare word that is not a known action is
      # taken as the start of an exec command.
      ACTION="exec"
      COMMAND_TO_RUN="$*"
      break
      ;;
  esac
  shift
done

if [[ -z "$NODES_ARG" ]]; then
  echo "Error: Nodes argument (-n) is mandatory." >&2
  usage
fi

# Refuse to bring a cluster up only to run an empty command and tear it down.
if [[ "$ACTION" == "exec" && -z "${COMMAND_TO_RUN//[[:space:]]/}" ]]; then
  echo "Error: 'exec' requires a command to run." >&2
  usage
fi

# Split nodes into an array, trimming surrounding whitespace once and
# dropping empty entries (e.g. from a trailing comma).
IFS=',' read -r -a RAW_NODES <<< "$NODES_ARG"
ALL_NODES=()
for ip in "${RAW_NODES[@]}"; do
  ip="${ip#"${ip%%[![:space:]]*}"}"
  ip="${ip%"${ip##*[![:space:]]}"}"
  if [[ -n "$ip" ]]; then
    ALL_NODES+=("$ip")
  fi
done

# Detect Head IP: the first listed node whose IP is reported by `hostname -I`.
HEAD_IP=""
LOCAL_IPS=$(hostname -I)
for ip in "${ALL_NODES[@]}"; do
  # Pad both sides with spaces so only whole addresses match.
  if [[ " $LOCAL_IPS " == *" $ip "* ]]; then
    HEAD_IP="$ip"
    break
  fi
done

if [[ -z "$HEAD_IP" ]]; then
  echo "Error: Could not determine Head IP. This script must be run on one of the nodes specified in -n." >&2
  exit 1
fi

# Identify Worker Nodes: every listed node except the head.
WORKER_NODES=()
for ip in "${ALL_NODES[@]}"; do
  if [[ "$ip" != "$HEAD_IP" ]]; then
    WORKER_NODES+=("$ip")
  fi
done

echo "Head Node: $HEAD_IP"
echo "Worker Nodes: ${WORKER_NODES[*]}"
echo "Container Name: $CONTAINER_NAME"
echo "Action: $ACTION"

#######################################
# Stop the container on the head node and on every worker (best effort).
# Globals:   HEAD_IP, CONTAINER_NAME, WORKER_NODES (read)
# Outputs:   progress messages on stdout
# Returns:   0; failures of individual `docker stop` calls are ignored on
#            purpose because a container may already be gone
#######################################
cleanup() {
  local worker

  # Disarm the traps so cleanup cannot run a second time (signal followed by
  # EXIT) and a repeated Ctrl+C falls back to the default action.
  trap - EXIT INT TERM HUP

  echo ""
  echo "Stopping cluster..."

  # Stop Head
  echo "Stopping head node ($HEAD_IP)..."
  docker stop "$CONTAINER_NAME" >/dev/null 2>&1 || true

  # Stop Workers
  for worker in "${WORKER_NODES[@]}"; do
    echo "Stopping worker node ($worker)..."
    ssh "$worker" "docker stop '$CONTAINER_NAME'" >/dev/null 2>&1 || true
  done

  echo "Cluster stopped."
}

# Handle 'stop' action
if [[ "$ACTION" == "stop" ]]; then
  cleanup
  exit 0
fi

# Handle 'status' action
if [[ "$ACTION" == "status" ]]; then
  echo "Checking status..."

  # Check Head. Compare against the exact container name: a plain grep over
  # the `docker ps` table would also match longer names or image names that
  # merely contain CONTAINER_NAME.
  if docker ps --format '{{.Names}}' | grep -Fxq -- "$CONTAINER_NAME"; then
    echo "[HEAD] $HEAD_IP: Container '$CONTAINER_NAME' is RUNNING."
    echo "--- Ray Status ---"
    docker exec "$CONTAINER_NAME" ray status || echo "Failed to get ray status."
    echo "------------------"
  else
    echo "[HEAD] $HEAD_IP: Container '$CONTAINER_NAME' is NOT running."
  fi

  # Check Workers
  for worker in "${WORKER_NODES[@]}"; do
    if ssh "$worker" "docker ps --format '{{.Names}}' | grep -Fxq -- '$CONTAINER_NAME'"; then
      echo "[WORKER] $worker: Container '$CONTAINER_NAME' is RUNNING."
    else
      echo "[WORKER] $worker: Container '$CONTAINER_NAME' is NOT running."
    fi
  done
  exit 0
fi

# Trap signals
# Only trap if we are NOT in daemon mode, OR if we are in exec mode (always
# cleanup after exec). cleanup is attached to EXIT only; the signal handlers
# just exit, which in turn fires the EXIT trap exactly once.
if [[ "$DAEMON_MODE" == "false" ]] || [[ "$ACTION" == "exec" ]]; then
  trap cleanup EXIT
  trap 'exit 130' INT
  trap 'exit 143' TERM
  trap 'exit 129' HUP
fi

# Start Head Node
echo "Starting Head Node on $HEAD_IP..."
if ! docker run -d --privileged --gpus all --rm \
  --ipc=host --network host \
  --name "$CONTAINER_NAME" \
  -e NCCL_DEBUG=INFO -e NCCL_IGNORE_CPU_AFFINITY=1 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  "$IMAGE_NAME" \
  ./run-cluster-node.sh \
  --role head \
  --host-ip "$HEAD_IP" \
  --eth-if "$ETH_IF" \
  --ib-if "$IB_IF"; then
  # Without a head container there is nothing for workers to join and the
  # readiness poll below could only time out.
  echo "Error: Failed to start head container on $HEAD_IP." >&2
  exit 1
fi

# Start Worker Nodes
# The command string is expanded locally and parsed again by the remote
# shell, so values are single-quoted for the remote side. The '~' is inside
# double quotes here and is therefore expanded on the worker, not locally.
for worker in "${WORKER_NODES[@]}"; do
  echo "Starting Worker Node on $worker..."
  if ! ssh "$worker" "docker run -d --privileged --gpus all --rm \
    --ipc=host --network host \
    --name '$CONTAINER_NAME' \
    -e NCCL_DEBUG=INFO -e NCCL_IGNORE_CPU_AFFINITY=1 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    '$IMAGE_NAME' \
    ./run-cluster-node.sh \
    --role node \
    --host-ip '$worker' \
    --eth-if '$ETH_IF' \
    --ib-if '$IB_IF' \
    --head-ip '$HEAD_IP'"; then
    # Keep going: remaining workers are still started, as before.
    echo "Warning: Failed to start worker container on $worker." >&2
  fi
done

#######################################
# Poll the head container until `ray status` succeeds.
# Makes up to 30 attempts, sleeping 2 seconds after each failed attempt,
# then pauses 5 more seconds after the first success to give workers a
# moment to connect.
# Globals:   CONTAINER_NAME (read)
# Outputs:   progress on stdout, timeout message on stderr
# Returns:   0 once the head responds; exits the script with status 1 on
#            timeout
#######################################
wait_for_cluster() {
  echo "Waiting for cluster to be ready..."
  local retries=30
  local count=0

  while (( count < retries )); do
    # Check if ray is responsive
    if docker exec "$CONTAINER_NAME" ray status >/dev/null 2>&1; then
      echo "Cluster head is responsive."
      # Give workers a moment to connect
      sleep 5
      return 0
    fi

    sleep 2
    count=$((count + 1))
  done

  echo "Timeout waiting for cluster to start." >&2
  exit 1
}

if [[ "$ACTION" == "exec" ]]; then
  wait_for_cluster
  echo "Executing command: $COMMAND_TO_RUN"
  # NOTE(review): eval is kept so that a command passed as one quoted string
  # (pipes, redirections, quoting) keeps working. It runs operator-supplied
  # text verbatim on the head host; never feed it untrusted input.
  eval "$COMMAND_TO_RUN"
elif [[ "$ACTION" == "start" ]]; then
  wait_for_cluster
  if [[ "$DAEMON_MODE" == "true" ]]; then
    echo "Cluster started in background (Daemon mode)."
  else
    echo "Cluster started. Tailing logs from head node..."
    echo "Press Ctrl+C to stop the cluster."
    # Tail in the background and wait on it: `wait` is interrupted by a
    # trapped signal, so the handlers above run promptly.
    docker logs -f "$CONTAINER_NAME" &
    wait "$!"
  fi
fi