Don't start the cluster if only --setup/--discover is specified
This commit is contained in:
@@ -106,8 +106,9 @@ network:
|
||||
enP2p1s0f1np1:
|
||||
dhcp4: no
|
||||
dhcp6: no
|
||||
link-local: [ ipv4 ]
|
||||
link-local: []
|
||||
mtu: 9000
|
||||
addresses: [192.168.178.11/24]
|
||||
```
|
||||
|
||||
Create `/etc/netplan/40-cx7.yaml` on `spark2`:
|
||||
@@ -124,16 +125,12 @@ network:
|
||||
enP2p1s0f1np1:
|
||||
dhcp4: no
|
||||
dhcp6: no
|
||||
link-local: [ ipv4 ]
|
||||
link-local: []
|
||||
mtu: 9000
|
||||
addresses: [192.168.178.12/24]
|
||||
```
|
||||
|
||||
Please note that only one interface of the "twin" pair needs an IP address, but the MTU needs to be set on both.
|
||||
You can also assign a separate address to the other "twin" if you want to utilize the second interface independently, but make sure you assign an IP address from a different subnet.
|
||||
|
||||
For instance, for the example above, if you want to assign an IP to `enP2p1s0f1np1`, you need to use `192.168.177.12` on `spark2`. **DO NOT use the same subnet on both "twins"** - it will confuse autodiscovery and mess up routing.
|
||||
|
||||
This will not affect vLLM performance as it will use RDMA over RoCE using both "twins", even if the IP is only set on one.
|
||||
**DO NOT use the same subnet on both "twins"** - it will confuse autodiscovery and mess up routing.
|
||||
|
||||
Then run on each node:
|
||||
|
||||
|
||||
@@ -24,7 +24,7 @@ CONTAINER_NAME="$DEFAULT_CONTAINER_NAME"
|
||||
COMMAND_TO_RUN=""
|
||||
DAEMON_MODE="false"
|
||||
CHECK_CONFIG="false"
|
||||
ACTION="start"
|
||||
ACTION=""
|
||||
CLUSTER_WAS_RUNNING="false"
|
||||
MOD_PATHS=()
|
||||
MOD_TYPES=()
|
||||
@@ -69,7 +69,7 @@ usage() {
|
||||
echo " --pids-limit Process limit (default: 4096, only with --non-privileged)"
|
||||
echo " --shm-size-gb Shared memory size in GB (default: 64, only with --non-privileged)"
|
||||
echo " --config Path to .env configuration file (default: .env in script directory)
|
||||
--setup Force autodiscovery and save configuration (even if .env exists)"
|
||||
--setup/--discover Force autodiscovery and save configuration (even if .env exists)"
|
||||
echo " action start | stop | status | exec (Default: start). Not compatible with --launch-script."
|
||||
echo " command Command to run (only for 'exec' action). Not compatible with --launch-script."
|
||||
echo ""
|
||||
@@ -132,7 +132,7 @@ while [[ "$#" -gt 0 ]]; do
|
||||
-d) DAEMON_MODE="true" ;;
|
||||
-h|--help) usage ;;
|
||||
--config) CONFIG_FILE="$2"; shift ;;
|
||||
--setup) FORCE_DISCOVER=true; export FORCE_DISCOVER ;;
|
||||
--setup|--discover) FORCE_DISCOVER=true; export FORCE_DISCOVER ;;
|
||||
start|stop|status)
|
||||
if [[ -n "$LAUNCH_SCRIPT_PATH" ]]; then
|
||||
echo "Error: Action '$1' is not compatible with --launch-script. Please omit the action or not use --launch-script."
|
||||
@@ -426,6 +426,10 @@ if [[ "${FORCE_DISCOVER:-false}" == "true" ]]; then
|
||||
[[ -z "$NODES_ARG" && -n "$DOTENV_CLUSTER_NODES" ]] && NODES_ARG="$DOTENV_CLUSTER_NODES"
|
||||
[[ -z "$ETH_IF" && -n "$DOTENV_ETH_IF" ]] && ETH_IF="$DOTENV_ETH_IF"
|
||||
[[ -z "$IB_IF" && -n "$DOTENV_IB_IF" ]] && IB_IF="$DOTENV_IB_IF"
|
||||
# If no action was specified, setup was the only intent — exit cleanly
|
||||
if [[ -z "$ACTION" && "$LAUNCH_SCRIPT_MODE" != "true" ]]; then
|
||||
exit 0
|
||||
fi
|
||||
fi
|
||||
|
||||
if [[ "$SOLO_MODE" == "true" ]]; then
|
||||
@@ -505,6 +509,12 @@ if [[ "$ACTION" == "start" || "$ACTION" == "exec" || "$CHECK_CONFIG" == "true" ]
|
||||
fi
|
||||
fi
|
||||
|
||||
if [[ -z "$ACTION" && "$LAUNCH_SCRIPT_MODE" != "true" ]]; then
|
||||
echo "Error: No action specified. Use: start | stop | status | exec"
|
||||
usage
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [[ "$CHECK_CONFIG" == "true" ]]; then
|
||||
echo "Configuration Check Complete."
|
||||
echo " Image Name: $IMAGE_NAME"
|
||||
|
||||
Reference in New Issue
Block a user