Don't start the cluster if only --setup/--discover is specified

This commit is contained in:
Eugene Rakhmatulin
2026-03-31 13:29:56 -07:00
parent bb177383ff
commit 9370b2bb34
2 changed files with 18 additions and 11 deletions

View File

@@ -106,8 +106,9 @@ network:
enP2p1s0f1np1: enP2p1s0f1np1:
dhcp4: no dhcp4: no
dhcp6: no dhcp6: no
link-local: [ ipv4 ] link-local: []
mtu: 9000 mtu: 9000
addresses: [192.168.178.11/24]
``` ```
Create `/etc/netplan/40-cx7.yaml` on `spark2`: Create `/etc/netplan/40-cx7.yaml` on `spark2`:
@@ -124,16 +125,12 @@ network:
enP2p1s0f1np1: enP2p1s0f1np1:
dhcp4: no dhcp4: no
dhcp6: no dhcp6: no
link-local: [ ipv4 ] link-local: []
mtu: 9000 mtu: 9000
addresses: [192.168.178.12/24]
``` ```
Please note that only one interface of the "twin" pair needs an IP address, but the MTU needs to be set on both. **DO NOT use the same subnet on both "twins"** - it will confuse autodiscovery and mess up routing.
You can also assign a separate address to the other "twin" if you want to utilize the second interface independently, but make sure you assign an IP address from a different subnet.
For instance, for the example above, if you want to assign an IP to `enP2p1s0f1np1`, you need to use `192.168.177.12` on `spark`. **DO NOT use the same subnet on both "twins"** - it will confuse autodiscovery and mess up routing.
This will not affect vLLM performance as it will use RDMA over RoCE using both "twins", even if the IP is only set on one.
Then run on each node: Then run on each node:

View File

@@ -24,7 +24,7 @@ CONTAINER_NAME="$DEFAULT_CONTAINER_NAME"
COMMAND_TO_RUN="" COMMAND_TO_RUN=""
DAEMON_MODE="false" DAEMON_MODE="false"
CHECK_CONFIG="false" CHECK_CONFIG="false"
ACTION="start" ACTION=""
CLUSTER_WAS_RUNNING="false" CLUSTER_WAS_RUNNING="false"
MOD_PATHS=() MOD_PATHS=()
MOD_TYPES=() MOD_TYPES=()
@@ -69,7 +69,7 @@ usage() {
echo " --pids-limit Process limit (default: 4096, only with --non-privileged)" echo " --pids-limit Process limit (default: 4096, only with --non-privileged)"
echo " --shm-size-gb Shared memory size in GB (default: 64, only with --non-privileged)" echo " --shm-size-gb Shared memory size in GB (default: 64, only with --non-privileged)"
echo " --config Path to .env configuration file (default: .env in script directory) echo " --config Path to .env configuration file (default: .env in script directory)
--setup Force autodiscovery and save configuration (even if .env exists)" --setup/--discover Force autodiscovery and save configuration (even if .env exists)"
echo " action start | stop | status | exec (Default: start). Not compatible with --launch-script." echo " action start | stop | status | exec (Default: start). Not compatible with --launch-script."
echo " command Command to run (only for 'exec' action). Not compatible with --launch-script." echo " command Command to run (only for 'exec' action). Not compatible with --launch-script."
echo "" echo ""
@@ -132,7 +132,7 @@ while [[ "$#" -gt 0 ]]; do
-d) DAEMON_MODE="true" ;; -d) DAEMON_MODE="true" ;;
-h|--help) usage ;; -h|--help) usage ;;
--config) CONFIG_FILE="$2"; shift ;; --config) CONFIG_FILE="$2"; shift ;;
--setup) FORCE_DISCOVER=true; export FORCE_DISCOVER ;; --setup|--discover) FORCE_DISCOVER=true; export FORCE_DISCOVER ;;
start|stop|status) start|stop|status)
if [[ -n "$LAUNCH_SCRIPT_PATH" ]]; then if [[ -n "$LAUNCH_SCRIPT_PATH" ]]; then
echo "Error: Action '$1' is not compatible with --launch-script. Please omit the action or not use --launch-script." echo "Error: Action '$1' is not compatible with --launch-script. Please omit the action or not use --launch-script."
@@ -426,6 +426,10 @@ if [[ "${FORCE_DISCOVER:-false}" == "true" ]]; then
[[ -z "$NODES_ARG" && -n "$DOTENV_CLUSTER_NODES" ]] && NODES_ARG="$DOTENV_CLUSTER_NODES" [[ -z "$NODES_ARG" && -n "$DOTENV_CLUSTER_NODES" ]] && NODES_ARG="$DOTENV_CLUSTER_NODES"
[[ -z "$ETH_IF" && -n "$DOTENV_ETH_IF" ]] && ETH_IF="$DOTENV_ETH_IF" [[ -z "$ETH_IF" && -n "$DOTENV_ETH_IF" ]] && ETH_IF="$DOTENV_ETH_IF"
[[ -z "$IB_IF" && -n "$DOTENV_IB_IF" ]] && IB_IF="$DOTENV_IB_IF" [[ -z "$IB_IF" && -n "$DOTENV_IB_IF" ]] && IB_IF="$DOTENV_IB_IF"
# If no action was specified, setup was the only intent — exit cleanly
if [[ -z "$ACTION" && "$LAUNCH_SCRIPT_MODE" != "true" ]]; then
exit 0
fi
fi fi
if [[ "$SOLO_MODE" == "true" ]]; then if [[ "$SOLO_MODE" == "true" ]]; then
@@ -505,6 +509,12 @@ if [[ "$ACTION" == "start" || "$ACTION" == "exec" || "$CHECK_CONFIG" == "true" ]
fi fi
fi fi
if [[ -z "$ACTION" && "$LAUNCH_SCRIPT_MODE" != "true" ]]; then
echo "Error: No action specified. Use: start | stop | status | exec"
usage
exit 1
fi
if [[ "$CHECK_CONFIG" == "true" ]]; then if [[ "$CHECK_CONFIG" == "true" ]]; then
echo "Configuration Check Complete." echo "Configuration Check Complete."
echo " Image Name: $IMAGE_NAME" echo " Image Name: $IMAGE_NAME"