Major cluster orchestration refactoring to support running without Ray
This commit is contained in:
@@ -270,9 +270,6 @@ ENV TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas
|
||||
ENV TIKTOKEN_ENCODINGS_BASE=$VLLM_BASE_DIR/tiktoken_encodings
|
||||
ENV PATH=$VLLM_BASE_DIR:$PATH
|
||||
|
||||
# Copy scripts
|
||||
COPY run-cluster-node.sh $VLLM_BASE_DIR/
|
||||
RUN chmod +x $VLLM_BASE_DIR/run-cluster-node.sh
|
||||
|
||||
# Final extra deps
|
||||
RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
|
||||
|
||||
@@ -32,6 +32,8 @@ SCRIPT_DIR="$(dirname "$(realpath "$0")")"
|
||||
|
||||
ACTIONS_ARG=""
|
||||
SOLO_MODE="false"
|
||||
NO_RAY_MODE="false"
|
||||
LAUNCH_SCRIPT_MODE="false"
|
||||
MOUNT_CACHE_DIRS="true"
|
||||
BUILD_JOBS=""
|
||||
NON_PRIVILEGED_MODE="false"
|
||||
@@ -55,6 +57,7 @@ usage() {
|
||||
echo " --launch-script Path to bash script to execute in the container (from examples/ directory or absolute path). If launch script is specified, action should be omitted."
|
||||
echo " --check-config Check configuration and auto-detection without launching"
|
||||
echo " --solo Solo mode: skip autodetection, launch only on current node, do not launch Ray cluster"
|
||||
echo " --no-ray No-Ray mode: run multi-node vLLM without Ray (uses PyTorch distributed backend)"
|
||||
echo " --no-cache-dirs Do not mount default cache directories (~/.cache/vllm, ~/.cache/flashinfer, ~/.triton)"
|
||||
echo " -d Daemon mode (only for 'start' action)"
|
||||
echo " --non-privileged Run in non-privileged mode (removes --privileged and --ipc=host)"
|
||||
@@ -93,6 +96,7 @@ while [[ "$#" -gt 0 ]]; do
|
||||
;;
|
||||
--check-config) CHECK_CONFIG="true" ;;
|
||||
--solo) SOLO_MODE="true" ;;
|
||||
--no-ray) NO_RAY_MODE="true" ;;
|
||||
--no-cache-dirs) MOUNT_CACHE_DIRS="false" ;;
|
||||
--non-privileged) NON_PRIVILEGED_MODE="true" ;;
|
||||
--mem-limit-gb) MEM_LIMIT_GB="$2"; shift ;;
|
||||
@@ -204,6 +208,7 @@ if [[ -n "$LAUNCH_SCRIPT_PATH" ]]; then
|
||||
|
||||
# Set command to run the copied script (use absolute path since docker exec may not be in /workspace)
|
||||
COMMAND_TO_RUN="/workspace/exec-script.sh"
|
||||
LAUNCH_SCRIPT_MODE="true"
|
||||
|
||||
# If launch script is specified, default action to exec unless explicitly set to stop/status
|
||||
if [[ "$ACTION" == "start" ]]; then
|
||||
@@ -303,6 +308,11 @@ if [[ "$SOLO_MODE" == "false" && ${#PEER_NODES[@]} -eq 0 ]]; then
|
||||
SOLO_MODE="true"
|
||||
fi
|
||||
|
||||
if [[ "$NO_RAY_MODE" == "true" && "$SOLO_MODE" == "true" ]]; then
|
||||
echo "Warning: Only one node detected; --no-ray has no effect in solo mode. Proceeding normally."
|
||||
NO_RAY_MODE="false"
|
||||
fi
|
||||
|
||||
echo "Head Node: $HEAD_IP"
|
||||
echo "Worker Nodes: ${PEER_NODES[*]}"
|
||||
echo "Container Name: $CONTAINER_NAME"
|
||||
@@ -377,9 +387,11 @@ if [[ "$ACTION" == "status" ]]; then
|
||||
# Check Head
|
||||
if docker ps | grep -q "$CONTAINER_NAME"; then
|
||||
echo "[HEAD] $HEAD_IP: Container '$CONTAINER_NAME' is RUNNING."
|
||||
echo "--- Ray Status ---"
|
||||
docker exec "$CONTAINER_NAME" ray status || echo "Failed to get ray status."
|
||||
echo "------------------"
|
||||
if [[ "$NO_RAY_MODE" == "false" ]]; then
|
||||
echo "--- Ray Status ---"
|
||||
docker exec "$CONTAINER_NAME" ray status || echo "Failed to get ray status."
|
||||
echo "------------------"
|
||||
fi
|
||||
else
|
||||
echo "[HEAD] $HEAD_IP: Container '$CONTAINER_NAME' is NOT running."
|
||||
fi
|
||||
@@ -544,11 +556,8 @@ copy_launch_script_to_container() {
|
||||
|
||||
echo "Copying launch script to head node..."
|
||||
|
||||
local target_script_path="$script_path"
|
||||
|
||||
# Copy script into container as /workspace/exec-script.sh
|
||||
echo " Copying script into container..."
|
||||
docker cp "$target_script_path" "$container:/workspace/exec-script.sh"
|
||||
docker cp "$script_path" "$container:/workspace/exec-script.sh"
|
||||
|
||||
# Make executable
|
||||
docker exec "$container" chmod +x /workspace/exec-script.sh
|
||||
@@ -556,6 +565,78 @@ copy_launch_script_to_container() {
|
||||
echo " Launch script copied to head node"
|
||||
}
|
||||
|
||||
# Copy the launch script to a worker node via SSH + docker cp.
# The script is staged in a temp file on the worker host, copied into the
# container as /workspace/exec-script.sh and made executable there.
# Arguments:
#   $1 - worker host (SSH destination)
#   $2 - container name on the worker
#   $3 - local path of the launch script
# Returns: 0 on success; non-zero if staging, scp or the remote docker steps fail.
copy_launch_script_to_worker() {
    local worker_ip="$1"; local container="$2"; local script_path="$3"
    local -a ssh_opts=(-o BatchMode=yes -o StrictHostKeyChecking=no)
    local remote_tmp q_tmp q_container

    echo "Copying launch script to worker $worker_ip..."

    # Let the worker create the staging file itself so the name is not predictable.
    remote_tmp=$(ssh "${ssh_opts[@]}" "$worker_ip" "mktemp /tmp/vllm_script_XXXXXXXX") || remote_tmp=""
    if [[ -z "$remote_tmp" ]]; then
        echo "Error: could not create a staging file on $worker_ip" >&2
        return 1
    fi

    # Shell-quote everything interpolated into the remote command line.
    printf -v q_tmp '%q' "$remote_tmp"
    printf -v q_container '%q' "$container"

    if ! scp "${ssh_opts[@]}" "$script_path" "$worker_ip:$remote_tmp"; then
        echo "Error: failed to copy $script_path to $worker_ip" >&2
        ssh "${ssh_opts[@]}" "$worker_ip" "rm -f -- $q_tmp"
        return 1
    fi

    # Remove the staging file even when docker cp / chmod fails, but still
    # report the docker status to the caller.
    ssh "${ssh_opts[@]}" "$worker_ip" \
        "docker cp $q_tmp $q_container:/workspace/exec-script.sh && docker exec $q_container chmod +x /workspace/exec-script.sh; rc=\$?; rm -f -- $q_tmp; exit \$rc"
}
|
||||
|
||||
# Patch /workspace/exec-script.sh inside a container for no-Ray multi-node runs:
# strip any --distributed-executor-backend flag and inject
# --nnodes/--node-rank/--master-addr (plus --headless for ranks > 0) after
# "vllm serve".
# Arguments:
#   $1 - "true" to patch the local container, anything else to patch via SSH
#   $2 - node address (SSH destination when $1 is not "true")
#   $3 - container name
#   $4 - total number of nodes
#   $5 - rank of this node (0 = head)
#   $6 - master (head) address
patch_script_in_container() {
    local is_local="$1"; local node_ip="$2"; local container="$3"
    local nnodes="$4"; local node_rank="$5"; local master_addr="$6"
    local target="/workspace/exec-script.sh"

    local extra="--nnodes $nnodes --node-rank $node_rank --master-addr $master_addr"
    if [[ "$node_rank" -gt 0 ]]; then
        extra="$extra --headless"
    fi

    # Remove only the flag and its value (same pattern exec_no_ray_cluster uses).
    # Deleting the whole line would drop the entire command when the flag sits
    # on the same line as "vllm serve".
    local patch="sed -i 's/--distributed-executor-backend[[:space:]]*[^[:space:]]*//' $target && sed -i 's|vllm serve|vllm serve $extra|' $target"

    if [[ "$is_local" == "true" ]]; then
        docker exec "$container" bash -c "$patch"
    else
        # Quote for the remote shell so the sed programs arrive intact.
        local q_container q_patch
        printf -v q_container '%q' "$container"
        printf -v q_patch '%q' "$patch"
        ssh -o BatchMode=yes -o StrictHostKeyChecking=no "$node_ip" \
            "docker exec $q_container bash -c $q_patch"
    fi
}
|
||||
|
||||
# Emit the "-e KEY=VALUE " flags (each followed by a space) that docker run /
# docker exec need for a node. Callers expand the output unquoted so it
# word-splits into separate arguments.
# Arguments: $1 - IP address of the node the flags are for
# Globals:   ETH_IF, IB_IF (read)
get_env_flags() {
    local node_ip="$1"
    local -a assignments=(
        "VLLM_HOST_IP=$node_ip"
        "RAY_NODE_IP_ADDRESS=$node_ip"
        "RAY_OVERRIDE_NODE_IP_ADDRESS=$node_ip"
        "MN_IF_NAME=$ETH_IF"
        "UCX_NET_DEVICES=$ETH_IF"
        "NCCL_SOCKET_IFNAME=$ETH_IF"
        "NCCL_IB_HCA=$IB_IF"
        "NCCL_IB_DISABLE=0"
        "OMPI_MCA_btl_tcp_if_include=$ETH_IF"
        "GLOO_SOCKET_IFNAME=$ETH_IF"
        "TP_SOCKET_IFNAME=$ETH_IF"
        "RAY_memory_monitor_refresh_ms=0"
        "RAY_num_prestart_python_workers=0"
        "RAY_object_store_memory=1073741824"
    )
    local pair
    for pair in "${assignments[@]}"; do
        printf -- '-e %s ' "$pair"
    done
}
|
||||
|
||||
# Launch the Ray head process (detached) inside the given container.
# Ray's output is appended to PID 1's stdout inside the container.
# Arguments: $1 - container name
# Globals:   HEAD_IP (read)
start_ray_head() {
    local container="$1"

    # Assemble the in-container command piece by piece.
    local ray_cmd="ray start --block --head --port 6379"
    ray_cmd+=" --object-store-memory 1073741824 --num-cpus 2"
    ray_cmd+=" --node-ip-address $HEAD_IP --include-dashboard=false --disable-usage-stats"
    ray_cmd+=" >> /proc/1/fd/1 2>&1"

    echo "Starting Ray HEAD node on $HEAD_IP..."
    # get_env_flags output is intentionally unquoted: it must split into -e KEY=VALUE words.
    docker exec -d $(get_env_flags "$HEAD_IP") "$container" bash -c "$ray_cmd"
}
|
||||
|
||||
# Launch a Ray worker process (detached) inside the container on a remote host,
# joining the head at $HEAD_IP:6379. Output goes to PID 1's stdout in the container.
# Arguments:
#   $1 - worker host (SSH destination, also used as the node IP)
#   $2 - container name on the worker
# Globals: HEAD_IP (read)
start_ray_worker() {
    local worker_ip="$1"; local container="$2"
    echo "Starting Ray WORKER node on $worker_ip..."

    local env_flags
    env_flags=$(get_env_flags "$worker_ip")

    # Command that runs inside the worker's container.
    local ray_cmd="ray start --block --object-store-memory 1073741824 --num-cpus 2 --disable-usage-stats"
    ray_cmd+=" --address=$HEAD_IP:6379 --node-ip-address $worker_ip >> /proc/1/fd/1 2>&1"

    # Command line handed to the remote shell; the single quotes keep the
    # redirection inside the container's bash -c.
    local remote_cmd="docker exec -d $env_flags $container bash -c '$ray_cmd'"

    ssh -o BatchMode=yes -o StrictHostKeyChecking=no "$worker_ip" "$remote_cmd"
}
|
||||
|
||||
# Start Cluster Function
|
||||
start_cluster() {
|
||||
check_cluster_running
|
||||
@@ -564,31 +645,6 @@ start_cluster() {
|
||||
return
|
||||
fi
|
||||
|
||||
# Start Head Node
|
||||
echo "Starting Head Node on $HEAD_IP..."
|
||||
|
||||
# Ensure cache dirs exist on head
|
||||
if [[ "$MOUNT_CACHE_DIRS" == "true" ]]; then
|
||||
for dir in "${CACHE_DIRS_TO_CREATE[@]}"; do
|
||||
mkdir -p "$dir"
|
||||
done
|
||||
fi
|
||||
|
||||
local head_cmd_args=()
|
||||
if [[ "$SOLO_MODE" == "true" ]]; then
|
||||
if [[ ${#MOD_PATHS[@]} -gt 0 ]]; then
|
||||
head_cmd_args=(bash -c "echo Waiting for mod application...; while [ ! -f /tmp/mod_done ]; do sleep 1; done; echo Mod applied, starting container...; exec sleep infinity")
|
||||
else
|
||||
head_cmd_args=(sleep infinity)
|
||||
fi
|
||||
else
|
||||
if [[ ${#MOD_PATHS[@]} -gt 0 ]]; then
|
||||
head_cmd_args=(bash -c "echo Waiting for mod application...; while [ ! -f /tmp/mod_done ]; do sleep 1; done; echo Mod applied, starting node...; exec ./run-cluster-node.sh --role head --host-ip $HEAD_IP --eth-if $ETH_IF --ib-if $IB_IF")
|
||||
else
|
||||
head_cmd_args=(./run-cluster-node.sh --role head --host-ip "$HEAD_IP" --eth-if "$ETH_IF" --ib-if "$IB_IF")
|
||||
fi
|
||||
fi
|
||||
|
||||
# Build docker run arguments based on mode
|
||||
local docker_args_common="--gpus all -d --rm --network host --name $CONTAINER_NAME $DOCKER_ARGS $IMAGE_NAME"
|
||||
local docker_caps_args=""
|
||||
@@ -603,62 +659,62 @@ start_cluster() {
|
||||
docker_resource_args="--ipc=host"
|
||||
fi
|
||||
|
||||
# Start Head Node
|
||||
echo "Starting Head Node on $HEAD_IP..."
|
||||
if [[ "$MOUNT_CACHE_DIRS" == "true" ]]; then
|
||||
for dir in "${CACHE_DIRS_TO_CREATE[@]}"; do
|
||||
mkdir -p "$dir"
|
||||
done
|
||||
fi
|
||||
docker run $docker_caps_args $docker_resource_args \
|
||||
$docker_args_common \
|
||||
"${head_cmd_args[@]}"
|
||||
$(get_env_flags "$HEAD_IP") $docker_args_common sleep infinity
|
||||
|
||||
# Start Worker Nodes
|
||||
for worker in "${PEER_NODES[@]}"; do
|
||||
echo "Starting Worker Node on $worker..."
|
||||
|
||||
# Ensure cache dirs exist on worker
|
||||
if [[ "$MOUNT_CACHE_DIRS" == "true" ]]; then
|
||||
# Create string of dirs to create
|
||||
dirs_str="${CACHE_DIRS_TO_CREATE[*]}"
|
||||
ssh "$worker" "mkdir -p $dirs_str"
|
||||
fi
|
||||
|
||||
local docker_run_cmd="docker run $docker_caps_args $docker_resource_args $docker_args_common"
|
||||
|
||||
if [[ ${#MOD_PATHS[@]} -gt 0 ]]; then
|
||||
local inner_script="echo Waiting for mod application...; while [ ! -f /tmp/mod_done ]; do sleep 1; done; echo Mod applied, starting node...; exec ./run-cluster-node.sh --role node --host-ip $worker --eth-if $ETH_IF --ib-if $IB_IF --head-ip $HEAD_IP"
|
||||
ssh "$worker" "$docker_run_cmd bash -c \"$inner_script\""
|
||||
else
|
||||
ssh "$worker" "$docker_run_cmd ./run-cluster-node.sh --role node --host-ip $worker --eth-if $ETH_IF --ib-if $IB_IF --head-ip $HEAD_IP"
|
||||
ssh "$worker" "mkdir -p ${CACHE_DIRS_TO_CREATE[*]}"
|
||||
fi
|
||||
local docker_run_cmd="docker run $docker_caps_args $docker_resource_args $(get_env_flags "$worker") $docker_args_common"
|
||||
ssh "$worker" "$docker_run_cmd sleep infinity"
|
||||
done
|
||||
|
||||
# Apply mods if requested
|
||||
# Apply mods (containers are idle — no mod_done sync needed)
|
||||
if [[ ${#MOD_PATHS[@]} -gt 0 ]]; then
|
||||
echo "Applying modifications to cluster nodes..."
|
||||
|
||||
# Apply to Head
|
||||
for i in "${!MOD_PATHS[@]}"; do
|
||||
apply_mod_to_container "$HEAD_IP" "$CONTAINER_NAME" "true" "${MOD_PATHS[$i]}" "${MOD_TYPES[$i]}"
|
||||
done
|
||||
# Signal completion on Head
|
||||
docker exec "$CONTAINER_NAME" touch /tmp/mod_done
|
||||
|
||||
# Apply to Workers
|
||||
for worker in "${PEER_NODES[@]}"; do
|
||||
for i in "${!MOD_PATHS[@]}"; do
|
||||
apply_mod_to_container "$worker" "$CONTAINER_NAME" "false" "${MOD_PATHS[$i]}" "${MOD_TYPES[$i]}"
|
||||
done
|
||||
# Signal completion on Worker
|
||||
ssh -o BatchMode=yes -o StrictHostKeyChecking=no "$worker" "docker exec $CONTAINER_NAME touch /tmp/mod_done"
|
||||
done
|
||||
fi
|
||||
|
||||
# Copy launch script to head node only (workers don't need it - they just run Ray)
|
||||
# Copy (and patch for no-ray) launch script
|
||||
if [[ -n "$LAUNCH_SCRIPT_PATH" ]]; then
|
||||
local total_nodes=$(( 1 + ${#PEER_NODES[@]} ))
|
||||
copy_launch_script_to_container "$CONTAINER_NAME" "$LAUNCH_SCRIPT_PATH"
|
||||
if [[ "$NO_RAY_MODE" == "true" ]]; then
|
||||
patch_script_in_container "true" "$HEAD_IP" "$CONTAINER_NAME" "$total_nodes" "0" "$HEAD_IP"
|
||||
local rank=1
|
||||
for worker in "${PEER_NODES[@]}"; do
|
||||
copy_launch_script_to_worker "$worker" "$CONTAINER_NAME" "$LAUNCH_SCRIPT_PATH"
|
||||
patch_script_in_container "false" "$worker" "$CONTAINER_NAME" "$total_nodes" "$rank" "$HEAD_IP"
|
||||
(( rank++ ))
|
||||
done
|
||||
fi
|
||||
fi
|
||||
|
||||
if [[ "$SOLO_MODE" == "false" ]]; then
|
||||
# Start Ray cluster (unless solo or no-ray)
|
||||
if [[ "$SOLO_MODE" == "false" && "$NO_RAY_MODE" == "false" ]]; then
|
||||
start_ray_head "$CONTAINER_NAME"
|
||||
for worker in "${PEER_NODES[@]}"; do
|
||||
start_ray_worker "$worker" "$CONTAINER_NAME"
|
||||
done
|
||||
wait_for_cluster
|
||||
else
|
||||
echo "Solo mode active: Skipping Ray cluster readiness check."
|
||||
# Give container a moment to start up
|
||||
sleep 2
|
||||
fi
|
||||
}
|
||||
@@ -686,25 +742,73 @@ wait_for_cluster() {
|
||||
exit 1
|
||||
}
|
||||
|
||||
if [[ "$ACTION" == "exec" ]]; then
|
||||
start_cluster
|
||||
echo "Executing command on head node: $COMMAND_TO_RUN"
|
||||
|
||||
# Execute a command inside the head-node container, detached (daemon mode)
# or attached to the current terminal.
# Arguments: $1 - command line to run with `bash -c` in the container
# Globals:   DAEMON_MODE, HEAD_IP, CONTAINER_NAME (read); DOCKER_EXEC_FLAGS (written)
_exec_on_head() {
    local cmd="$1"
    if [[ "$DAEMON_MODE" == "true" ]]; then
        # Daemon mode: dispatch the command exactly once, detached. The
        # environment comes from get_env_flags rather than /root/.bashrc.
        # Output goes to PID 1's stdout so it shows up in docker logs.
        docker exec -d $(get_env_flags "$HEAD_IP") "$CONTAINER_NAME" bash -c "$cmd >> /proc/1/fd/1 2>&1"
        echo "Command dispatched in background (Daemon mode). Container: $CONTAINER_NAME"
    else
        # Only request a TTY when stdin is one, to avoid docker's
        # "input device is not a TTY" error.
        if [ -t 0 ]; then DOCKER_EXEC_FLAGS="-it"; else DOCKER_EXEC_FLAGS="-i"; fi
        docker exec $DOCKER_EXEC_FLAGS $(get_env_flags "$HEAD_IP") "$CONTAINER_NAME" bash -c "$cmd"
    fi
}
|
||||
|
||||
docker exec $DOCKER_EXEC_FLAGS "$CONTAINER_NAME" bash -i -c "$COMMAND_TO_RUN"
|
||||
# Rewrite a "vllm serve ..." command line for one rank of a no-Ray run:
# drop --distributed-executor-backend <value> and inject the multi-node flags.
# Arguments: $1 - command, $2 - total nodes, $3 - rank (ranks > 0 get --headless)
# Globals:   HEAD_IP (read)
# Outputs:   rewritten command on stdout
_no_ray_rewrite_cmd() {
    local cmd="$1"; local total_nodes="$2"; local rank="$3"
    local inject="--nnodes $total_nodes --node-rank $rank --master-addr $HEAD_IP"
    if [[ "$rank" -gt 0 ]]; then
        inject="$inject --headless"
    fi
    sed -e 's/--distributed-executor-backend[[:space:]]*[^[:space:]]*//' \
        -e "s|vllm serve|vllm serve $inject|" <<<"$cmd"
}

# Execute a no-ray multi-node command: workers first (always detached, via
# SSH), then the head as rank 0 (detached in daemon mode, attached otherwise).
# In launch-script mode the command is used unchanged on every node.
# Arguments: $1 - base command to run in each container
# Globals:   PEER_NODES, HEAD_IP, CONTAINER_NAME, LAUNCH_SCRIPT_MODE,
#            DAEMON_MODE (read); DOCKER_EXEC_FLAGS (written)
exec_no_ray_cluster() {
    local base_cmd="$1"
    local total_nodes=$(( 1 + ${#PEER_NODES[@]} ))
    local -a ssh_opts=(-o BatchMode=yes -o StrictHostKeyChecking=no)
    local worker worker_cmd env_flags q_inner

    # Launch workers first (always background)
    local rank=1
    for worker in "${PEER_NODES[@]}"; do
        if [[ "$LAUNCH_SCRIPT_MODE" == "true" ]]; then
            worker_cmd="$base_cmd"
        else
            worker_cmd=$(_no_ray_rewrite_cmd "$base_cmd" "$total_nodes" "$rank")
        fi
        echo "Launching worker (rank $rank) on $worker..."
        env_flags=$(get_env_flags "$worker")
        # Shell-quote the inner command so quotes, '$' and redirections in it
        # reach the container's bash -c untouched by the remote login shell.
        printf -v q_inner '%q' "$worker_cmd >> /proc/1/fd/1 2>&1"
        ssh "${ssh_opts[@]}" "$worker" \
            "docker exec -d $env_flags $CONTAINER_NAME bash -c $q_inner"
        rank=$(( rank + 1 ))
    done

    # Launch head (rank 0) last
    local head_cmd
    if [[ "$LAUNCH_SCRIPT_MODE" == "true" ]]; then
        head_cmd="$base_cmd"
    else
        head_cmd=$(_no_ray_rewrite_cmd "$base_cmd" "$total_nodes" 0)
    fi

    echo "Executing command on head node (rank 0): $head_cmd"
    if [[ "$DAEMON_MODE" == "true" ]]; then
        docker exec -d $(get_env_flags "$HEAD_IP") "$CONTAINER_NAME" bash -c "$head_cmd >> /proc/1/fd/1 2>&1"
        echo "Command dispatched in background (Daemon mode). Container: $CONTAINER_NAME"
    else
        # Only request a TTY when stdin is one.
        if [ -t 0 ]; then DOCKER_EXEC_FLAGS="-it"; else DOCKER_EXEC_FLAGS="-i"; fi
        docker exec $DOCKER_EXEC_FLAGS $(get_env_flags "$HEAD_IP") "$CONTAINER_NAME" bash -c "$head_cmd"
    fi
}
|
||||
|
||||
if [[ "$ACTION" == "exec" ]]; then
|
||||
start_cluster
|
||||
echo "Executing command: $COMMAND_TO_RUN"
|
||||
|
||||
if [[ "$NO_RAY_MODE" == "true" && ${#PEER_NODES[@]} -gt 0 ]]; then
|
||||
if [[ "$LAUNCH_SCRIPT_MODE" == "true" ]] || echo "$COMMAND_TO_RUN" | grep -q "vllm serve"; then
|
||||
exec_no_ray_cluster "$COMMAND_TO_RUN"
|
||||
else
|
||||
_exec_on_head "$COMMAND_TO_RUN"
|
||||
fi
|
||||
else
|
||||
_exec_on_head "$COMMAND_TO_RUN"
|
||||
fi
|
||||
elif [[ "$ACTION" == "start" ]]; then
|
||||
start_cluster
|
||||
|
||||
@@ -1,117 +0,0 @@
|
||||
#!/bin/bash
|
||||
set -e
|
||||
|
||||
# Export a variable for the current process AND persist it in ~/.bashrc
# as `export NAME="VALUE"` for future sessions.
# Arguments:
#   $1 - variable name
#   $2 - variable value
# An existing `export NAME=` line at the start of a line is rewritten in
# place; otherwise a new line is appended.
export_persist() {
    local var_name="$1"
    local var_value="$2"

    # 1. Export for the current running process
    export "$var_name"="$var_value"

    # 2. Persist to .bashrc without creating duplicate lines
    if ! grep -q "^export $var_name=" ~/.bashrc 2>/dev/null; then
        printf '%s\n' "export $var_name=\"$var_value\"" >> ~/.bashrc
    else
        # Escape the characters that are special in a sed replacement
        # (backslash, '&' and the '|' delimiter) so any value is stored verbatim.
        local escaped_value
        escaped_value=$(printf '%s' "$var_value" | sed -e 's/[\\&|]/\\&/g')
        sed -i "s|^export $var_name=.*|export $var_name=\"$escaped_value\"|" ~/.bashrc
    fi
}
|
||||
|
||||
# --- Help Function ---
# Print the option summary with two example invocations, then exit with status 1.
usage() {
    cat <<EOF
Usage: $0 [OPTIONS]

Required Arguments:
 -r, --role <head|node> : Set the node type
 -h, --host-ip <ip> : IP address of this interface (Host IP)
 -e, --eth-if <name> : Ethernet interface name (e.g., eth0)
 -i, --ib-if <name> : InfiniBand/RDMA interface name

Conditional Arguments:
 -m, --head-ip <ip> : IP of the head node (REQUIRED if role is 'node')

Example:
 $0 --role head --host-ip 192.168.1.10 --eth-if eth0 --ib-if ib0
 $0 --role node --host-ip 192.168.1.20 --eth-if eth0 --ib-if ib0 --head-ip 192.168.1.10
EOF
    exit 1
}
|
||||
|
||||
# --- Argument Parsing ---
|
||||
|
||||
# Initialize variables to empty
NODE_TYPE=""
HOST_IP=""
ETH_IF_NAME=""
IB_IF_NAME=""
HEAD_IP=""

# Abort with a message and the usage text when an option has no value after it.
# Arguments: the remaining command-line words, starting at the option itself.
_require_value() {
    if [[ "$#" -lt 2 ]]; then
        echo "Error: option '$1' requires a value."
        usage
    fi
}

# Parse the command line into NODE_TYPE, HOST_IP, ETH_IF_NAME, IB_IF_NAME and
# HEAD_IP. Unknown options and options missing their value print a message
# and call usage.
parse_args() {
    while [[ "$#" -gt 0 ]]; do
        case "$1" in
            -r|--role) _require_value "$@"; NODE_TYPE="$2"; shift ;;
            -h|--host-ip) _require_value "$@"; HOST_IP="$2"; shift ;;
            -e|--eth-if) _require_value "$@"; ETH_IF_NAME="$2"; shift ;;
            -i|--ib-if) _require_value "$@"; IB_IF_NAME="$2"; shift ;;
            -m|--head-ip) _require_value "$@"; HEAD_IP="$2"; shift ;;
            *) echo "Unknown parameter passed: $1"; usage ;;
        esac
        shift
    done
}

parse_args "$@"
|
||||
|
||||
# --- Validation ---
|
||||
|
||||
# 1. Every common argument must be non-empty; otherwise show the usage text
if ! [[ -n "$NODE_TYPE" && -n "$HOST_IP" && -n "$ETH_IF_NAME" && -n "$IB_IF_NAME" ]]; then
    echo "Error: Missing required arguments."
    usage
fi

# 2. The role has to be one of the two known values
case "$NODE_TYPE" in
    head|node)
        ;;
    *)
        echo "Error: --role must be 'head' or 'node'."
        exit 1
        ;;
esac

# 3. A worker ('node') additionally needs the head node's address
if [[ "$NODE_TYPE" == "node" ]] && [[ -z "$HEAD_IP" ]]; then
    echo "Error: When --role is 'node', you must provide --head-ip."
    exit 1
fi
|
||||
|
||||
# --- Environment Configuration ---
# Export and persist every variable in the same order as before;
# VLLM_HOST_IP stays first.

echo "Configuring environment for [$NODE_TYPE] at $HOST_IP..."

# Node address
for _var in VLLM_HOST_IP RAY_NODE_IP_ADDRESS RAY_OVERRIDE_NODE_IP_ADDRESS; do
    export_persist "$_var" "$HOST_IP"
done

# Network Interface
for _var in MN_IF_NAME UCX_NET_DEVICES NCCL_SOCKET_IFNAME; do
    export_persist "$_var" "$ETH_IF_NAME"
done

# InfiniBand
export_persist NCCL_IB_HCA "$IB_IF_NAME"
export_persist NCCL_IB_DISABLE "0"

# Sockets/Transport
for _var in OMPI_MCA_btl_tcp_if_include GLOO_SOCKET_IFNAME TP_SOCKET_IFNAME; do
    export_persist "$_var" "$ETH_IF_NAME"
done
export_persist RAY_memory_monitor_refresh_ms "0"
unset _var

# --- Execution ---
# Replace this shell with the blocking Ray process for the selected role.

case "${NODE_TYPE}" in
    head)
        echo "Starting Ray HEAD node..."
        ray_args=(--block --head --port 6379
                  --node-ip-address "$VLLM_HOST_IP"
                  --disable-usage-stats)
        ;;
    *)
        echo "Starting Ray WORKER node connecting to $HEAD_IP..."
        ray_args=(--block
                  --address="$HEAD_IP:6379"
                  --node-ip-address "$VLLM_HOST_IP")
        ;;
esac
exec ray start "${ray_args[@]}"
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
#!/bin/bash
set -e

# Install the cluster node script into the workspace directory.
# NOTE(review): the comment further down says initialization is now handled by
# launch-cluster.sh; confirm the copy step below is still wanted.
echo "Setting up cluster initialization script..."
# Fail fast when WORKSPACE_DIR is unset instead of copying into "/".
cp run-cluster-node.sh "${WORKSPACE_DIR:?WORKSPACE_DIR must be set}/run-cluster-node.sh"
chmod +x "$WORKSPACE_DIR/run-cluster-node.sh"
# NGC vLLM mod: container initialization is now handled by launch-cluster.sh
echo "NGC vLLM mod applied."
|
||||
|
||||
@@ -1,124 +0,0 @@
|
||||
#!/bin/bash
|
||||
set -e
|
||||
|
||||
# Export a variable for the current process AND persist it in ~/.bashrc
# as `export NAME="VALUE"` for future sessions.
# Arguments:
#   $1 - variable name
#   $2 - variable value
# An existing `export NAME=` line at the start of a line is rewritten in
# place; otherwise a new line is appended.
export_persist() {
    local var_name="$1"
    local var_value="$2"

    # 1. Export for the current running process
    export "$var_name"="$var_value"

    # 2. Persist to .bashrc without creating duplicate lines
    if ! grep -q "^export $var_name=" ~/.bashrc 2>/dev/null; then
        printf '%s\n' "export $var_name=\"$var_value\"" >> ~/.bashrc
    else
        # Escape the characters that are special in a sed replacement
        # (backslash, '&' and the '|' delimiter) so any value is stored verbatim.
        local escaped_value
        escaped_value=$(printf '%s' "$var_value" | sed -e 's/[\\&|]/\\&/g')
        sed -i "s|^export $var_name=.*|export $var_name=\"$escaped_value\"|" ~/.bashrc
    fi
}
|
||||
|
||||
# --- Help Function ---
# Print the option summary with two example invocations, then exit with status 1.
usage() {
    cat <<EOF
Usage: $0 [OPTIONS]

Required Arguments:
 -r, --role <head|node> : Set the node type
 -h, --host-ip <ip> : IP address of this interface (Host IP)
 -e, --eth-if <name> : Ethernet interface name (e.g., eth0)
 -i, --ib-if <name> : InfiniBand/RDMA interface name

Conditional Arguments:
 -m, --head-ip <ip> : IP of the head node (REQUIRED if role is 'node')

Example:
 $0 --role head --host-ip 192.168.1.10 --eth-if eth0 --ib-if ib0
 $0 --role node --host-ip 192.168.1.20 --eth-if eth0 --ib-if ib0 --head-ip 192.168.1.10
EOF
    exit 1
}
|
||||
|
||||
# --- Argument Parsing ---
|
||||
|
||||
# Initialize variables to empty
NODE_TYPE=""
HOST_IP=""
ETH_IF_NAME=""
IB_IF_NAME=""
HEAD_IP=""

# Abort with a message and the usage text when an option has no value after it.
# Arguments: the remaining command-line words, starting at the option itself.
_require_value() {
    if [[ "$#" -lt 2 ]]; then
        echo "Error: option '$1' requires a value."
        usage
    fi
}

# Parse the command line into NODE_TYPE, HOST_IP, ETH_IF_NAME, IB_IF_NAME and
# HEAD_IP. Unknown options and options missing their value print a message
# and call usage.
parse_args() {
    while [[ "$#" -gt 0 ]]; do
        case "$1" in
            -r|--role) _require_value "$@"; NODE_TYPE="$2"; shift ;;
            -h|--host-ip) _require_value "$@"; HOST_IP="$2"; shift ;;
            -e|--eth-if) _require_value "$@"; ETH_IF_NAME="$2"; shift ;;
            -i|--ib-if) _require_value "$@"; IB_IF_NAME="$2"; shift ;;
            -m|--head-ip) _require_value "$@"; HEAD_IP="$2"; shift ;;
            *) echo "Unknown parameter passed: $1"; usage ;;
        esac
        shift
    done
}

parse_args "$@"
|
||||
|
||||
# --- Validation ---
|
||||
|
||||
# 1. Every common argument must be non-empty; otherwise show the usage text
if ! [[ -n "$NODE_TYPE" && -n "$HOST_IP" && -n "$ETH_IF_NAME" && -n "$IB_IF_NAME" ]]; then
    echo "Error: Missing required arguments."
    usage
fi

# 2. The role has to be one of the two known values
case "$NODE_TYPE" in
    head|node)
        ;;
    *)
        echo "Error: --role must be 'head' or 'node'."
        exit 1
        ;;
esac

# 3. A worker ('node') additionally needs the head node's address
if [[ "$NODE_TYPE" == "node" ]] && [[ -z "$HEAD_IP" ]]; then
    echo "Error: When --role is 'node', you must provide --head-ip."
    exit 1
fi
|
||||
|
||||
# --- Environment Configuration ---
# Export and persist every variable in the same order as before;
# VLLM_HOST_IP stays first.

echo "Configuring environment for [$NODE_TYPE] at $HOST_IP..."

# Node address
for _var in VLLM_HOST_IP RAY_NODE_IP_ADDRESS RAY_OVERRIDE_NODE_IP_ADDRESS; do
    export_persist "$_var" "$HOST_IP"
done

# Network Interface
for _var in MN_IF_NAME UCX_NET_DEVICES NCCL_SOCKET_IFNAME; do
    export_persist "$_var" "$ETH_IF_NAME"
done

# InfiniBand
export_persist NCCL_IB_HCA "$IB_IF_NAME"
export_persist NCCL_IB_DISABLE "0"

# Sockets/Transport
for _var in OMPI_MCA_btl_tcp_if_include GLOO_SOCKET_IFNAME TP_SOCKET_IFNAME; do
    export_persist "$_var" "$ETH_IF_NAME"
done
export_persist RAY_memory_monitor_refresh_ms "0"
unset _var

# UMA Memory Optimization (DGX Spark 128GB shared CPU/GPU memory)
# Disable pre-started idle workers (saves ~8 GiB on head node)
export_persist RAY_num_prestart_python_workers "0"
# Limit object store to 1 GiB (default 30% of RAM = 33 GiB, wastes UMA)
export_persist RAY_object_store_memory "1073741824"

# --- Execution ---
# Replace this shell with the blocking Ray process for the selected role.

case "${NODE_TYPE}" in
    head)
        echo "Starting Ray HEAD node..."
        ray_args=(--block --head --port 6379 --object-store-memory 1073741824 --num-cpus 2
                  --node-ip-address "$VLLM_HOST_IP"
                  --include-dashboard=false
                  --disable-usage-stats)
        ;;
    *)
        echo "Starting Ray WORKER node connecting to $HEAD_IP..."
        ray_args=(--block --object-store-memory 1073741824 --num-cpus 2 --disable-usage-stats
                  --address="$HEAD_IP:6379"
                  --node-ip-address "$VLLM_HOST_IP")
        ;;
esac
exec ray start "${ray_args[@]}"
|
||||
|
||||
@@ -393,7 +393,7 @@ def check_model_exists(model: str) -> bool:
|
||||
return False
|
||||
|
||||
|
||||
def generate_launch_script(recipe: dict[str, Any], overrides: dict[str, Any], is_solo: bool = False, extra_args: list[str] | None = None) -> str:
|
||||
def generate_launch_script(recipe: dict[str, Any], overrides: dict[str, Any], is_solo: bool = False, extra_args: list[str] | None = None, no_ray: bool = False) -> str:
|
||||
"""
|
||||
Generate a bash launch script from the recipe.
|
||||
|
||||
@@ -458,9 +458,9 @@ def generate_launch_script(recipe: dict[str, Any], overrides: dict[str, Any], is
|
||||
print(f"Available parameters: {list(params.keys())}")
|
||||
sys.exit(1)
|
||||
|
||||
# In solo mode, remove --distributed-executor-backend ray
|
||||
# (it's not needed and can cause issues on single node)
|
||||
if is_solo:
|
||||
# In solo or no-ray mode, remove --distributed-executor-backend
|
||||
# (not needed for solo; no-ray uses PyTorch distributed instead)
|
||||
if is_solo or no_ray:
|
||||
import re
|
||||
# Remove the entire line containing --distributed-executor-backend
|
||||
# This handles multi-line commands with backslash continuations
|
||||
@@ -820,6 +820,12 @@ Examples:
|
||||
launch_group.add_argument("-t", "--container", dest="container_override", help="Override container image from recipe")
|
||||
launch_group.add_argument("--nccl-debug", choices=["VERSION", "WARN", "INFO", "TRACE"], help="NCCL debug level")
|
||||
launch_group.add_argument("-e", "--env", action="append", dest="env_vars", default=[], metavar="VAR=VALUE", help="Environment variable to pass to container (e.g. -e HF_TOKEN=xxx). Can be used multiple times.")
|
||||
launch_group.add_argument(
|
||||
"--no-ray",
|
||||
action="store_true",
|
||||
dest="no_ray",
|
||||
help="No-Ray mode: run multi-node vLLM without Ray (uses PyTorch distributed backend)"
|
||||
)
|
||||
|
||||
# Cluster discovery options
|
||||
discover_group = parser.add_argument_group("Cluster discovery")
|
||||
@@ -933,6 +939,10 @@ Examples:
|
||||
solo_only = recipe.get("solo_only", False)
|
||||
is_solo = args.solo or not is_cluster
|
||||
|
||||
if getattr(args, 'no_ray', False) and is_solo:
|
||||
print("Error: --no-ray is incompatible with --solo. Solo mode already runs without Ray.")
|
||||
return 1
|
||||
|
||||
if cluster_only and is_solo:
|
||||
print(f"Error: Recipe '{recipe['name']}' requires cluster mode.")
|
||||
print(f"This model is too large to run on a single node.")
|
||||
@@ -1097,7 +1107,7 @@ Examples:
|
||||
print(f" vLLM uses last value; extra args appear after template substitution")
|
||||
|
||||
# Generate launch script
|
||||
script_content = generate_launch_script(recipe, overrides, is_solo=is_solo, extra_args=extra_args)
|
||||
script_content = generate_launch_script(recipe, overrides, is_solo=is_solo, extra_args=extra_args, no_ray=getattr(args, 'no_ray', False))
|
||||
|
||||
if args.dry_run:
|
||||
print("=== Generated Launch Script ===")
|
||||
@@ -1116,6 +1126,8 @@ Examples:
|
||||
cmd_parts.append("--solo")
|
||||
if args.daemon:
|
||||
cmd_parts.append("-d")
|
||||
if getattr(args, 'no_ray', False):
|
||||
cmd_parts.append("--no-ray")
|
||||
if nodes:
|
||||
cmd_parts.extend(["-n", ",".join(nodes)])
|
||||
if args.nccl_debug:
|
||||
@@ -1156,6 +1168,9 @@ Examples:
|
||||
if args.daemon:
|
||||
cmd.append("-d")
|
||||
|
||||
if getattr(args, 'no_ray', False):
|
||||
cmd.append("--no-ray")
|
||||
|
||||
# Pass nodes to launch-cluster.sh (from command line, .env, or autodiscover)
|
||||
if nodes:
|
||||
cmd.extend(["-n", ",".join(nodes)])
|
||||
|
||||
Reference in New Issue
Block a user