Fixed launch_cluster bug introduced by refactoring: loops still iterated over the old WORKER_NODES array instead of PEER_NODES
This commit is contained in:
@@ -139,9 +139,9 @@ echo "Action: $ACTION"
|
|||||||
|
|
||||||
# Check SSH connectivity to worker nodes
|
# Check SSH connectivity to worker nodes
|
||||||
if [[ "$ACTION" == "start" || "$ACTION" == "exec" || "$CHECK_CONFIG" == "true" ]]; then
|
if [[ "$ACTION" == "start" || "$ACTION" == "exec" || "$CHECK_CONFIG" == "true" ]]; then
|
||||||
if [ ${#WORKER_NODES[@]} -gt 0 ]; then
|
if [ ${#PEER_NODES[@]} -gt 0 ]; then
|
||||||
echo "Checking SSH connectivity to worker nodes..."
|
echo "Checking SSH connectivity to worker nodes..."
|
||||||
for worker in "${WORKER_NODES[@]}"; do
|
for worker in "${PEER_NODES[@]}"; do
|
||||||
if ! ssh -o BatchMode=yes -o ConnectTimeout=5 -o StrictHostKeyChecking=no "$worker" true 2>/dev/null; then
|
if ! ssh -o BatchMode=yes -o ConnectTimeout=5 -o StrictHostKeyChecking=no "$worker" true 2>/dev/null; then
|
||||||
echo "Error: Passwordless SSH to $worker failed."
|
echo "Error: Passwordless SSH to $worker failed."
|
||||||
echo " Please ensure SSH keys are configured and the host is reachable."
|
echo " Please ensure SSH keys are configured and the host is reachable."
|
||||||
@@ -178,7 +178,7 @@ cleanup() {
|
|||||||
docker stop "$CONTAINER_NAME" >/dev/null 2>&1 || true
|
docker stop "$CONTAINER_NAME" >/dev/null 2>&1 || true
|
||||||
|
|
||||||
# Stop Workers
|
# Stop Workers
|
||||||
for worker in "${WORKER_NODES[@]}"; do
|
for worker in "${PEER_NODES[@]}"; do
|
||||||
echo "Stopping worker node ($worker)..."
|
echo "Stopping worker node ($worker)..."
|
||||||
ssh "$worker" "docker stop $CONTAINER_NAME" >/dev/null 2>&1 || true
|
ssh "$worker" "docker stop $CONTAINER_NAME" >/dev/null 2>&1 || true
|
||||||
done
|
done
|
||||||
@@ -207,7 +207,7 @@ if [[ "$ACTION" == "status" ]]; then
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
# Check Workers
|
# Check Workers
|
||||||
for worker in "${WORKER_NODES[@]}"; do
|
for worker in "${PEER_NODES[@]}"; do
|
||||||
if ssh "$worker" "docker ps | grep -q '$CONTAINER_NAME'"; then
|
if ssh "$worker" "docker ps | grep -q '$CONTAINER_NAME'"; then
|
||||||
echo "[WORKER] $worker: Container '$CONTAINER_NAME' is RUNNING."
|
echo "[WORKER] $worker: Container '$CONTAINER_NAME' is RUNNING."
|
||||||
else
|
else
|
||||||
@@ -234,7 +234,7 @@ check_cluster_running() {
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
# Check Workers
|
# Check Workers
|
||||||
for worker in "${WORKER_NODES[@]}"; do
|
for worker in "${PEER_NODES[@]}"; do
|
||||||
if ssh "$worker" "docker ps --format '{{.Names}}' | grep -q '^${CONTAINER_NAME}$'"; then
|
if ssh "$worker" "docker ps --format '{{.Names}}' | grep -q '^${CONTAINER_NAME}$'"; then
|
||||||
echo "Warning: Container '$CONTAINER_NAME' is already running on worker node ($worker)."
|
echo "Warning: Container '$CONTAINER_NAME' is already running on worker node ($worker)."
|
||||||
running=true
|
running=true
|
||||||
@@ -271,7 +271,7 @@ start_cluster() {
|
|||||||
|
|
||||||
# Start Worker Nodes
|
# Start Worker Nodes
|
||||||
# Start Worker Nodes
|
# Start Worker Nodes
|
||||||
for worker in "${WORKER_NODES[@]}"; do
|
for worker in "${PEER_NODES[@]}"; do
|
||||||
echo "Starting Worker Node on $worker..."
|
echo "Starting Worker Node on $worker..."
|
||||||
ssh "$worker" "docker run -d --privileged --gpus all --rm \
|
ssh "$worker" "docker run -d --privileged --gpus all --rm \
|
||||||
--ipc=host --network host \
|
--ipc=host --network host \
|
||||||
|
|||||||
Reference in New Issue
Block a user