Fixed launch_cluster bug introduced by refactoring: replaced stale WORKER_NODES references with PEER_NODES

This commit is contained in:
Eugene Rakhmatulin
2025-12-19 10:51:50 -08:00
parent 0cac77c286
commit f075801c59

View File

@@ -139,9 +139,9 @@ echo "Action: $ACTION"
# Check SSH connectivity to worker nodes # Check SSH connectivity to worker nodes
if [[ "$ACTION" == "start" || "$ACTION" == "exec" || "$CHECK_CONFIG" == "true" ]]; then if [[ "$ACTION" == "start" || "$ACTION" == "exec" || "$CHECK_CONFIG" == "true" ]]; then
if [ ${#WORKER_NODES[@]} -gt 0 ]; then if [ ${#PEER_NODES[@]} -gt 0 ]; then
echo "Checking SSH connectivity to worker nodes..." echo "Checking SSH connectivity to worker nodes..."
for worker in "${WORKER_NODES[@]}"; do for worker in "${PEER_NODES[@]}"; do
if ! ssh -o BatchMode=yes -o ConnectTimeout=5 -o StrictHostKeyChecking=no "$worker" true 2>/dev/null; then if ! ssh -o BatchMode=yes -o ConnectTimeout=5 -o StrictHostKeyChecking=no "$worker" true 2>/dev/null; then
echo "Error: Passwordless SSH to $worker failed." echo "Error: Passwordless SSH to $worker failed."
echo " Please ensure SSH keys are configured and the host is reachable." echo " Please ensure SSH keys are configured and the host is reachable."
@@ -178,7 +178,7 @@ cleanup() {
docker stop "$CONTAINER_NAME" >/dev/null 2>&1 || true docker stop "$CONTAINER_NAME" >/dev/null 2>&1 || true
# Stop Workers # Stop Workers
for worker in "${WORKER_NODES[@]}"; do for worker in "${PEER_NODES[@]}"; do
echo "Stopping worker node ($worker)..." echo "Stopping worker node ($worker)..."
ssh "$worker" "docker stop $CONTAINER_NAME" >/dev/null 2>&1 || true ssh "$worker" "docker stop $CONTAINER_NAME" >/dev/null 2>&1 || true
done done
@@ -207,7 +207,7 @@ if [[ "$ACTION" == "status" ]]; then
fi fi
# Check Workers # Check Workers
for worker in "${WORKER_NODES[@]}"; do for worker in "${PEER_NODES[@]}"; do
if ssh "$worker" "docker ps | grep -q '$CONTAINER_NAME'"; then if ssh "$worker" "docker ps | grep -q '$CONTAINER_NAME'"; then
echo "[WORKER] $worker: Container '$CONTAINER_NAME' is RUNNING." echo "[WORKER] $worker: Container '$CONTAINER_NAME' is RUNNING."
else else
@@ -234,7 +234,7 @@ check_cluster_running() {
fi fi
# Check Workers # Check Workers
for worker in "${WORKER_NODES[@]}"; do for worker in "${PEER_NODES[@]}"; do
if ssh "$worker" "docker ps --format '{{.Names}}' | grep -q '^${CONTAINER_NAME}$'"; then if ssh "$worker" "docker ps --format '{{.Names}}' | grep -q '^${CONTAINER_NAME}$'"; then
echo "Warning: Container '$CONTAINER_NAME' is already running on worker node ($worker)." echo "Warning: Container '$CONTAINER_NAME' is already running on worker node ($worker)."
running=true running=true
@@ -271,7 +271,7 @@ start_cluster() {
# Start Worker Nodes # Start Worker Nodes
# Start Worker Nodes # Start Worker Nodes
for worker in "${WORKER_NODES[@]}"; do for worker in "${PEER_NODES[@]}"; do
echo "Starting Worker Node on $worker..." echo "Starting Worker Node on $worker..."
ssh "$worker" "docker run -d --privileged --gpus all --rm \ ssh "$worker" "docker run -d --privileged --gpus all --rm \
--ipc=host --network host \ --ipc=host --network host \