diff --git a/docs/NETWORKING.md b/docs/NETWORKING.md index 734592d..8d786a2 100644 --- a/docs/NETWORKING.md +++ b/docs/NETWORKING.md @@ -106,8 +106,9 @@ network: enP2p1s0f1np1: dhcp4: no dhcp6: no - link-local: [ ipv4 ] + link-local: [] mtu: 9000 + addresses: [192.168.178.11/24] ``` Create `/etc/netplan/40-cx7.yaml` on `spark2`: @@ -124,16 +125,12 @@ network: enP2p1s0f1np1: dhcp4: no dhcp6: no - link-local: [ ipv4 ] + link-local: [] mtu: 9000 + addresses: [192.168.178.12/24] ``` -Please note, that only one interface of the "twin" pair needs an IP address, but MTU needs to be set on both. -You can also assign a separate address to another "twin" if you want to utilize the second interface independently, but make sure you assign an IP address from a different subnet. - -For instance, for the example above, if you want to assign an IP to `enP2p1s0f1np1`, you need to use `192.168.177.12` on `spark`. **DO NOT use the same subnet on both "twins"** - it will confuse autodiscovery and mess up routing. - -This will not affect vLLM performance as it will use RDMA over RoCE using both "twins", even if the IP is only set on one. +**DO NOT use the same subnet on both "twins"** - it will confuse autodiscovery and mess up routing. Then run on each node: diff --git a/launch-cluster.sh b/launch-cluster.sh index b20603e..af3d8f7 100755 --- a/launch-cluster.sh +++ b/launch-cluster.sh @@ -24,7 +24,7 @@ CONTAINER_NAME="$DEFAULT_CONTAINER_NAME" COMMAND_TO_RUN="" DAEMON_MODE="false" CHECK_CONFIG="false" -ACTION="start" +ACTION="" CLUSTER_WAS_RUNNING="false" MOD_PATHS=() MOD_TYPES=() @@ -69,7 +69,7 @@ usage() { echo " --pids-limit Process limit (default: 4096, only with --non-privileged)" echo " --shm-size-gb Shared memory size in GB (default: 64, only with --non-privileged)" echo " --config Path to .env configuration file (default: .env in script directory) - --setup Force autodiscovery and save configuration (even if .env exists)" + --setup/--discover Force autodiscovery and save configuration (even if .env exists)" echo " action start | stop | status | exec (Default: start). Not compatible with --launch-script." echo " command Command to run (only for 'exec' action). Not compatible with --launch-script." echo "" @@ -132,7 +132,7 @@ while [[ "$#" -gt 0 ]]; do -d) DAEMON_MODE="true" ;; -h|--help) usage ;; --config) CONFIG_FILE="$2"; shift ;; - --setup) FORCE_DISCOVER=true; export FORCE_DISCOVER ;; + --setup|--discover) FORCE_DISCOVER=true; export FORCE_DISCOVER ;; start|stop|status) if [[ -n "$LAUNCH_SCRIPT_PATH" ]]; then echo "Error: Action '$1' is not compatible with --launch-script. Please omit the action or not use --launch-script." @@ -426,6 +426,10 @@ if [[ "${FORCE_DISCOVER:-false}" == "true" ]]; then [[ -z "$NODES_ARG" && -n "$DOTENV_CLUSTER_NODES" ]] && NODES_ARG="$DOTENV_CLUSTER_NODES" [[ -z "$ETH_IF" && -n "$DOTENV_ETH_IF" ]] && ETH_IF="$DOTENV_ETH_IF" [[ -z "$IB_IF" && -n "$DOTENV_IB_IF" ]] && IB_IF="$DOTENV_IB_IF" + # If no action was specified, setup was the only intent — exit cleanly + if [[ -z "$ACTION" && "$LAUNCH_SCRIPT_MODE" != "true" ]]; then + exit 0 + fi fi if [[ "$SOLO_MODE" == "true" ]]; then @@ -505,6 +509,12 @@ if [[ "$ACTION" == "start" || "$ACTION" == "exec" || "$CHECK_CONFIG" == "true" ] fi fi +if [[ -z "$ACTION" && "$LAUNCH_SCRIPT_MODE" != "true" ]]; then + echo "Error: No action specified. Use: start | stop | status | exec" + usage + exit 1 +fi + if [[ "$CHECK_CONFIG" == "true" ]]; then echo "Configuration Check Complete." echo " Image Name: $IMAGE_NAME"