Merge branch '3-node' of gitlab.home.eugr.net:ai/spark-vllm into 3-node

This commit is contained in:
Eugene Rakhmatulin
2026-03-25 15:16:47 -07:00
2 changed files with 17 additions and 4 deletions

View File

@@ -3,13 +3,17 @@
# Cluster configuration # Cluster configuration
# CLUSTER_NODES: Comma-separated list of node IPs (first node is the head node) # CLUSTER_NODES: Comma-separated list of node IPs (first node is the head node)
CLUSTER_NODES="192.168.1.1,192.168.1.2,192.168.1.3" CLUSTER_NODES="192.168.177.11,192.168.177.12"
# ETH_IF: Ethernet interface name (optional, auto-detected if not specified) # ETH_IF: Ethernet interface name (optional, auto-detected if not specified)
ETH_IF="eth0" ETH_IF="enp1s0f1np1"
# IB_IF: InfiniBand interface name (optional, auto-detected if not specified) # IB_IF: InfiniBand interface name (optional, auto-detected if not specified)
IB_IF="ib0" IB_IF="rocep1s0f1,roceP2p1s0f1"
# LOCAL_IP: Local IP address (optional; auto-detected if not specified, or 127.0.0.1 in solo mode)
# Useful for pinning the address in solo mode or overriding auto-detection
LOCAL_IP="192.168.177.11"
# MASTER_PORT: Port for cluster coordination (default: 29501) # MASTER_PORT: Port for cluster coordination (default: 29501)
MASTER_PORT="29501" MASTER_PORT="29501"

View File

@@ -78,6 +78,7 @@ usage() {
echo " IB_IF InfiniBand interface name" echo " IB_IF InfiniBand interface name"
echo " MASTER_PORT Port for cluster coordination (default: 29501)" echo " MASTER_PORT Port for cluster coordination (default: 29501)"
echo " CONTAINER_NAME Container name (default: vllm_node)" echo " CONTAINER_NAME Container name (default: vllm_node)"
echo " LOCAL_IP Local IP address (for solo mode or override auto-detection)"
echo " CONTAINER_* Any variable starting with CONTAINER_ (except CONTAINER_NAME)" echo " CONTAINER_* Any variable starting with CONTAINER_ (except CONTAINER_NAME)"
echo " becomes -e flag. Example: CONTAINER_NCCL_DEBUG=INFO -> -e NCCL_DEBUG=INFO" echo " becomes -e flag. Example: CONTAINER_NCCL_DEBUG=INFO -> -e NCCL_DEBUG=INFO"
echo "" echo ""
@@ -87,6 +88,7 @@ usage() {
echo " IB_IF=ib0" echo " IB_IF=ib0"
echo " MASTER_PORT=29501" echo " MASTER_PORT=29501"
echo " CONTAINER_NAME=vllm_node" echo " CONTAINER_NAME=vllm_node"
echo " LOCAL_IP=192.168.1.1"
echo " CONTAINER_NCCL_DEBUG=INFO" echo " CONTAINER_NCCL_DEBUG=INFO"
echo " CONTAINER_HF_TOKEN=abc123" echo " CONTAINER_HF_TOKEN=abc123"
echo "" echo ""
@@ -256,6 +258,10 @@ if [[ -z "$CONTAINER_NAME" || "$CONTAINER_NAME" == "vllm_node" ]] && [[ -n "$DOT
CONTAINER_NAME="$DOTENV_CONTAINER_NAME" CONTAINER_NAME="$DOTENV_CONTAINER_NAME"
fi fi
# Propagate a non-empty DOTENV_LOCAL_IP (presumably the LOCAL_IP value read
# from the .env file) into LOCAL_IP, exported so child processes inherit it.
# NOTE(review): unlike the CONTAINER_NAME handling just above, this overwrites
# any LOCAL_IP that is already set in the environment - confirm that the .env
# value is meant to take precedence.
if [[ -n "$DOTENV_LOCAL_IP" ]]; then
export LOCAL_IP="$DOTENV_LOCAL_IP"
fi
# Validate non-privileged mode flags # Validate non-privileged mode flags
if [[ "$NON_PRIVILEGED_MODE" == "true" ]]; then if [[ "$NON_PRIVILEGED_MODE" == "true" ]]; then
# Set default swap limit if not specified # Set default swap limit if not specified
@@ -408,7 +414,10 @@ if [[ "$SOLO_MODE" == "true" ]]; then
exit 1 exit 1
fi fi
# Solo mode: skip node detection, just get local IP # Solo mode: skip node detection, just get local IP
# Keep a LOCAL_IP that is already set (e.g. one supplied through .env); only
# when it is empty or unset fall back to the loopback address 127.0.0.1.
if [[ -z "$LOCAL_IP" ]]; then
LOCAL_IP="127.0.0.1" LOCAL_IP="127.0.0.1"
fi
NODES_ARG="$LOCAL_IP" NODES_ARG="$LOCAL_IP"
PEER_NODES=() PEER_NODES=()
echo "Solo mode enabled. Skipping node detection." echo "Solo mode enabled. Skipping node detection."