Cleaning up launch-cluster changes

This commit is contained in:
Eugene Rakhmatulin
2026-02-04 11:36:55 -08:00
parent b1516f688a
commit f7830636af

View File

@@ -43,12 +43,12 @@ usage() {
 echo " -e, --env Environment variable to pass to container (e.g. -e VAR=val)"
 echo " --nccl-debug NCCL debug level (Optional, one of: VERSION, WARN, INFO, TRACE). If no level is provided, defaults to INFO."
 echo " --apply-mod Path to directory or zip file containing run.sh to apply before launch (Can be specified multiple times)"
-echo " --launch-script Path to bash script to execute in the container (from examples/ directory or absolute path)"
+echo " --launch-script Path to bash script to execute in the container (from examples/ directory or absolute path). If launch script is specified, action should be omitted."
 echo " --check-config Check configuration and auto-detection without launching"
 echo " --solo Solo mode: skip autodetection, launch only on current node, do not launch Ray cluster"
 echo " -d Daemon mode (only for 'start' action)"
-echo " action start | stop | status | exec (Default: start)"
-echo " command Command to run (only for 'exec' action)"
+echo " action start | stop | status | exec (Default: start). Not compatible with --launch-script."
+echo " command Command to run (only for 'exec' action). Not compatible with --launch-script."
 echo ""
 echo "Launch Script Usage:"
 echo " $0 --launch-script examples/my-script.sh # Script copied to container and executed"
@@ -80,9 +80,17 @@ while [[ "$#" -gt 0 ]]; do
 -d) DAEMON_MODE="true" ;;
 -h|--help) usage ;;
 start|stop|status)
+if [[ -n "$LAUNCH_SCRIPT_PATH" ]]; then
+echo "Error: Action '$1' is not compatible with --launch-script. Please omit the action or not use --launch-script."
+exit 1
+fi
 ACTION="$1"
 ;;
 exec)
+if [[ -n "$LAUNCH_SCRIPT_PATH" ]]; then
+echo "Error: Action 'exec' is not compatible with --launch-script. Please omit the action or not use --launch-script."
+exit 1
+fi
 ACTION="exec"
 shift
 COMMAND_TO_RUN="$@"
@@ -93,6 +101,10 @@ while [[ "$#" -gt 0 ]]; do
 # unless it's the default 'start' implied.
 # However, to support "omitted" = start, we need to be careful.
 # If the arg looks like a command, it's exec.
+if [[ -n "$LAUNCH_SCRIPT_PATH" ]]; then
+echo "Error: Command is not compatible with --launch-script. Please omit the command or not use --launch-script."
+exit 1
+fi
 ACTION="exec"
 COMMAND_TO_RUN="$@"
 break
@@ -467,47 +479,21 @@ apply_mod_to_container() {
 # Copy Launch Script to Container Function
 copy_launch_script_to_container() {
-local node_ip="$1"
-local container="$2"
-local is_local="$3" # true/false
-local script_path="$4"
+local container="$1"
+local script_path="$2"
-echo "Copying launch script to $node_ip..."
+echo "Copying launch script to head node..."
-# Command prefix for remote vs local
-local cmd_prefix=""
-if [[ "$is_local" == "false" ]]; then
-cmd_prefix="ssh -o BatchMode=yes -o StrictHostKeyChecking=no $node_ip"
-fi
 local target_script_path="$script_path"
-local remote_cleanup_path=""
-# Copy script to remote node first if needed
-if [[ "$is_local" == "false" ]]; then
-local remote_tmp="/tmp/exec_script_$(date +%s)_$RANDOM.sh"
-echo " Copying script to $node_ip:$remote_tmp..."
-if ! scp -o BatchMode=yes -o StrictHostKeyChecking=no "$script_path" "$node_ip:$remote_tmp"; then
-echo "Error: Failed to copy launch script to $node_ip"
-exit 1
-fi
-target_script_path="$remote_tmp"
-remote_cleanup_path="$remote_tmp"
-fi
 # Copy script into container as /workspace/exec-script.sh
 echo " Copying script into container..."
-$cmd_prefix docker cp "$target_script_path" "$container:/workspace/exec-script.sh"
+docker cp "$target_script_path" "$container:/workspace/exec-script.sh"
 # Make executable
-$cmd_prefix docker exec "$container" chmod +x /workspace/exec-script.sh
+docker exec "$container" chmod +x /workspace/exec-script.sh
-# Cleanup remote temp
-if [[ -n "$remote_cleanup_path" ]]; then
-ssh -o BatchMode=yes -o StrictHostKeyChecking=no "$node_ip" "rm -f $remote_cleanup_path"
-fi
-echo " Launch script copied to $node_ip"
+echo " Launch script copied to head node"
 }
 # Start Cluster Function
@@ -580,8 +566,7 @@ start_cluster() {
 # Copy launch script to head node only (workers don't need it - they just run Ray)
 if [[ -n "$LAUNCH_SCRIPT_PATH" ]]; then
-echo "Copying launch script to head node..."
-copy_launch_script_to_container "$HEAD_IP" "$CONTAINER_NAME" "true" "$LAUNCH_SCRIPT_PATH"
+copy_launch_script_to_container "$CONTAINER_NAME" "$LAUNCH_SCRIPT_PATH"
 fi
 if [[ "$SOLO_MODE" == "false" ]]; then