Don't start the cluster if only --setup/--discover is specified
This commit is contained in:
@@ -106,8 +106,9 @@ network:
|
|||||||
enP2p1s0f1np1:
|
enP2p1s0f1np1:
|
||||||
dhcp4: no
|
dhcp4: no
|
||||||
dhcp6: no
|
dhcp6: no
|
||||||
link-local: [ ipv4 ]
|
link-local: []
|
||||||
mtu: 9000
|
mtu: 9000
|
||||||
|
addresses: [192.168.178.11/24]
|
||||||
```
|
```
|
||||||
|
|
||||||
Create `/etc/netplan/40-cx7.yaml` on `spark2`:
|
Create `/etc/netplan/40-cx7.yaml` on `spark2`:
|
||||||
@@ -124,16 +125,12 @@ network:
|
|||||||
enP2p1s0f1np1:
|
enP2p1s0f1np1:
|
||||||
dhcp4: no
|
dhcp4: no
|
||||||
dhcp6: no
|
dhcp6: no
|
||||||
link-local: [ ipv4 ]
|
link-local: []
|
||||||
mtu: 9000
|
mtu: 9000
|
||||||
|
addresses: [192.168.178.12/24]
|
||||||
```
|
```
|
||||||
|
|
||||||
Please note that only one interface of the "twin" pair needs an IP address, but the MTU needs to be set on both.
|
**DO NOT use the same subnet on both "twins"** - it will confuse autodiscovery and mess up routing.
|
||||||
You can also assign a separate address to the other "twin" if you want to utilize the second interface independently, but make sure you assign an IP address from a different subnet.
|
|
||||||
|
|
||||||
For instance, for the example above, if you want to assign an IP to `enP2p1s0f1np1`, you need to use `192.168.177.12` on `spark2`. **DO NOT use the same subnet on both "twins"** - it will confuse autodiscovery and mess up routing.
|
|
||||||
|
|
||||||
This will not affect vLLM performance as it will use RDMA over RoCE using both "twins", even if the IP is only set on one.
|
|
||||||
|
|
||||||
Then run on each node:
|
Then run on each node:
|
||||||
|
|
||||||
|
|||||||
@@ -24,7 +24,7 @@ CONTAINER_NAME="$DEFAULT_CONTAINER_NAME"
|
|||||||
COMMAND_TO_RUN=""
|
COMMAND_TO_RUN=""
|
||||||
DAEMON_MODE="false"
|
DAEMON_MODE="false"
|
||||||
CHECK_CONFIG="false"
|
CHECK_CONFIG="false"
|
||||||
ACTION="start"
|
ACTION=""
|
||||||
CLUSTER_WAS_RUNNING="false"
|
CLUSTER_WAS_RUNNING="false"
|
||||||
MOD_PATHS=()
|
MOD_PATHS=()
|
||||||
MOD_TYPES=()
|
MOD_TYPES=()
|
||||||
@@ -69,7 +69,7 @@ usage() {
|
|||||||
echo " --pids-limit Process limit (default: 4096, only with --non-privileged)"
|
echo " --pids-limit Process limit (default: 4096, only with --non-privileged)"
|
||||||
echo " --shm-size-gb Shared memory size in GB (default: 64, only with --non-privileged)"
|
echo " --shm-size-gb Shared memory size in GB (default: 64, only with --non-privileged)"
|
||||||
echo " --config Path to .env configuration file (default: .env in script directory)
|
echo " --config Path to .env configuration file (default: .env in script directory)
|
||||||
--setup Force autodiscovery and save configuration (even if .env exists)"
|
--setup/--discover Force autodiscovery and save configuration (even if .env exists)"
|
||||||
echo " action start | stop | status | exec (Default: start). Not compatible with --launch-script."
|
echo " action start | stop | status | exec (Default: start). Not compatible with --launch-script."
|
||||||
echo " command Command to run (only for 'exec' action). Not compatible with --launch-script."
|
echo " command Command to run (only for 'exec' action). Not compatible with --launch-script."
|
||||||
echo ""
|
echo ""
|
||||||
@@ -132,7 +132,7 @@ while [[ "$#" -gt 0 ]]; do
|
|||||||
-d) DAEMON_MODE="true" ;;
|
-d) DAEMON_MODE="true" ;;
|
||||||
-h|--help) usage ;;
|
-h|--help) usage ;;
|
||||||
--config) CONFIG_FILE="$2"; shift ;;
|
--config) CONFIG_FILE="$2"; shift ;;
|
||||||
--setup) FORCE_DISCOVER=true; export FORCE_DISCOVER ;;
|
--setup|--discover) FORCE_DISCOVER=true; export FORCE_DISCOVER ;;
|
||||||
start|stop|status)
|
start|stop|status)
|
||||||
if [[ -n "$LAUNCH_SCRIPT_PATH" ]]; then
|
if [[ -n "$LAUNCH_SCRIPT_PATH" ]]; then
|
||||||
echo "Error: Action '$1' is not compatible with --launch-script. Please omit the action or not use --launch-script."
|
echo "Error: Action '$1' is not compatible with --launch-script. Please omit the action or not use --launch-script."
|
||||||
@@ -426,6 +426,10 @@ if [[ "${FORCE_DISCOVER:-false}" == "true" ]]; then
|
|||||||
[[ -z "$NODES_ARG" && -n "$DOTENV_CLUSTER_NODES" ]] && NODES_ARG="$DOTENV_CLUSTER_NODES"
|
[[ -z "$NODES_ARG" && -n "$DOTENV_CLUSTER_NODES" ]] && NODES_ARG="$DOTENV_CLUSTER_NODES"
|
||||||
[[ -z "$ETH_IF" && -n "$DOTENV_ETH_IF" ]] && ETH_IF="$DOTENV_ETH_IF"
|
[[ -z "$ETH_IF" && -n "$DOTENV_ETH_IF" ]] && ETH_IF="$DOTENV_ETH_IF"
|
||||||
[[ -z "$IB_IF" && -n "$DOTENV_IB_IF" ]] && IB_IF="$DOTENV_IB_IF"
|
[[ -z "$IB_IF" && -n "$DOTENV_IB_IF" ]] && IB_IF="$DOTENV_IB_IF"
|
||||||
|
# If no action was specified, setup was the only intent — exit cleanly
|
||||||
|
if [[ -z "$ACTION" && "$LAUNCH_SCRIPT_MODE" != "true" ]]; then
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [[ "$SOLO_MODE" == "true" ]]; then
|
if [[ "$SOLO_MODE" == "true" ]]; then
|
||||||
@@ -505,6 +509,12 @@ if [[ "$ACTION" == "start" || "$ACTION" == "exec" || "$CHECK_CONFIG" == "true" ]
|
|||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
if [[ -z "$ACTION" && "$LAUNCH_SCRIPT_MODE" != "true" ]]; then
|
||||||
|
echo "Error: No action specified. Use: start | stop | status | exec"
|
||||||
|
usage
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
if [[ "$CHECK_CONFIG" == "true" ]]; then
|
if [[ "$CHECK_CONFIG" == "true" ]]; then
|
||||||
echo "Configuration Check Complete."
|
echo "Configuration Check Complete."
|
||||||
echo " Image Name: $IMAGE_NAME"
|
echo " Image Name: $IMAGE_NAME"
|
||||||
|
|||||||
Reference in New Issue
Block a user