Merge branch '3-node' of gitlab.home.eugr.net:ai/spark-vllm into 3-node

This commit is contained in:
Eugene Rakhmatulin
2026-03-25 14:18:32 -07:00
3 changed files with 548 additions and 231 deletions

29
.env.example Normal file
View File

@@ -0,0 +1,29 @@
# Example .env configuration file for spark-vllm-docker
# Copy this file to .env and customize for your environment
# (.env in the script directory is loaded by default; use --config to point
# at a different file).
#
# File format, as checked by the launcher before loading:
#   - One KEY=value pair per line; blank lines and lines starting with '#'
#     are ignored.
#   - Keys must match [A-Za-z_][A-Za-z0-9_]* and must not be repeated.
#   - One pair of surrounding single or double quotes is stripped from values.
# Command-line arguments are intended to take precedence over this file.
#
# Cluster configuration
# CLUSTER_NODES: Comma-separated list of node IPs (first node is the head node)
CLUSTER_NODES="192.168.1.1,192.168.1.2,192.168.1.3"
# ETH_IF: Ethernet interface name (optional, auto-detected if not specified)
ETH_IF="eth0"
# IB_IF: InfiniBand interface name (optional, auto-detected if not specified)
IB_IF="ib0"
# MASTER_PORT: Port for cluster coordination (default: 29501)
MASTER_PORT="29501"
# CONTAINER_NAME: Container name (default: vllm_node)
CONTAINER_NAME="vllm_node"
# Container environment variables
# Any variable starting with CONTAINER_ will be converted to -e flags
# Example: CONTAINER_NCCL_DEBUG=INFO becomes -e NCCL_DEBUG=INFO
CONTAINER_NCCL_DEBUG="INFO"
# Replace the placeholder below with a real token or delete the line;
# whatever is set here is passed to the container as HF_TOKEN.
CONTAINER_HF_TOKEN="your_huggingface_token_here"
CONTAINER_NCCL_IGNORE_CPU_AFFINITY="1"
# Additional container environment variables
# (uncomment to enable; these are examples only)
# CONTAINER_MAX_JOBS="16"
# CONTAINER_CUDA_VISIBLE_DEVICES="0,1"

View File

@@ -30,6 +30,7 @@ MOD_PATHS=()
MOD_TYPES=() MOD_TYPES=()
LAUNCH_SCRIPT_PATH="" LAUNCH_SCRIPT_PATH=""
SCRIPT_DIR="$(dirname "$(realpath "$0")")" SCRIPT_DIR="$(dirname "$(realpath "$0")")"
CONFIG_FILE="" # Will be set to default after argument parsing
ACTIONS_ARG="" ACTIONS_ARG=""
SOLO_MODE="false" SOLO_MODE="false"
@@ -67,9 +68,27 @@ usage() {
echo " --mem-swap-limit-gb Memory+swap limit in GB (default: mem-limit + 10, only with --non-privileged)" echo " --mem-swap-limit-gb Memory+swap limit in GB (default: mem-limit + 10, only with --non-privileged)"
echo " --pids-limit Process limit (default: 4096, only with --non-privileged)" echo " --pids-limit Process limit (default: 4096, only with --non-privileged)"
echo " --shm-size-gb Shared memory size in GB (default: 64, only with --non-privileged)" echo " --shm-size-gb Shared memory size in GB (default: 64, only with --non-privileged)"
echo " --config Path to .env configuration file (default: .env in script directory)"
echo " action start | stop | status | exec (Default: start). Not compatible with --launch-script." echo " action start | stop | status | exec (Default: start). Not compatible with --launch-script."
echo " command Command to run (only for 'exec' action). Not compatible with --launch-script." echo " command Command to run (only for 'exec' action). Not compatible with --launch-script."
echo "" echo ""
echo "Supported .env file variables:"
echo " CLUSTER_NODES Comma-separated list of node IPs"
echo " ETH_IF Ethernet interface name"
echo " IB_IF InfiniBand interface name"
echo " MASTER_PORT Port for cluster coordination (default: 29501)"
echo " CONTAINER_NAME Container name (default: vllm_node)"
echo " CONTAINER_* Any variable starting with CONTAINER_ becomes -e flag"
echo " Example: CONTAINER_NCCL_DEBUG=INFO -> -e NCCL_DEBUG=INFO"
echo ""
echo "Example .env file:"
echo " CLUSTER_NODES=192.168.1.1,192.168.1.2"
echo " ETH_IF=eth0"
echo " IB_IF=ib0"
echo " MASTER_PORT=29501"
echo " CONTAINER_NCCL_DEBUG=INFO"
echo " CONTAINER_HF_TOKEN=abc123"
echo ""
echo "Launch Script Usage:" echo "Launch Script Usage:"
echo " $0 --launch-script examples/my-script.sh # Script copied to container and executed" echo " $0 --launch-script examples/my-script.sh # Script copied to container and executed"
echo " $0 --launch-script /path/to/script.sh # Uses absolute path to script" echo " $0 --launch-script /path/to/script.sh # Uses absolute path to script"
@@ -108,6 +127,7 @@ while [[ "$#" -gt 0 ]]; do
--shm-size-gb) SHM_SIZE_GB="$2"; shift ;; --shm-size-gb) SHM_SIZE_GB="$2"; shift ;;
-d) DAEMON_MODE="true" ;; -d) DAEMON_MODE="true" ;;
-h|--help) usage ;; -h|--help) usage ;;
--config) CONFIG_FILE="$2"; shift ;;
start|stop|status) start|stop|status)
if [[ -n "$LAUNCH_SCRIPT_PATH" ]]; then if [[ -n "$LAUNCH_SCRIPT_PATH" ]]; then
echo "Error: Action '$1' is not compatible with --launch-script. Please omit the action or not use --launch-script." echo "Error: Action '$1' is not compatible with --launch-script. Please omit the action or not use --launch-script."
@@ -133,6 +153,108 @@ while [[ "$#" -gt 0 ]]; do
shift shift
done done
# ---------------------------------------------------------------------------
# .env configuration loading
#
# The file is validated and parsed entirely in bash. File contents are never
# interpolated into another interpreter's source code, so quotes, backslashes
# and '=' characters inside values are preserved exactly as written.
# ---------------------------------------------------------------------------

#######################################
# Strip leading and trailing whitespace (including a trailing CR) from $1.
# Arguments: $1 - string to trim
# Outputs:   the trimmed string on stdout (no trailing newline)
#######################################
_dotenv_trim() {
  local text=$1
  text="${text#"${text%%[![:space:]]*}"}"
  text="${text%"${text##*[![:space:]]}"}"
  printf '%s' "$text"
}

#######################################
# Validate and parse a .env file without exporting anything.
# Rules: blank lines and lines starting with '#' are ignored; every other
# line must be KEY=value; KEY must match [A-Za-z_][A-Za-z0-9_]* and must be
# unique; one pair of matching surrounding quotes is stripped from the value.
# Globals:   _DOTENV_KEYS, _DOTENV_VALUES (written; parallel arrays)
# Arguments: $1 - path to the .env file
# Outputs:   an "Error: ..." line on stdout describing the first problem
# Returns:   0 if the whole file is valid, 1 otherwise
#######################################
_dotenv_parse() {
  # Byte-wise matching keeps the [A-Za-z] ranges ASCII-only in every locale.
  local LC_ALL=C
  local file=$1
  local line key value first last
  local line_num=0
  local seen=" "
  local key_re='^[A-Za-z_][A-Za-z0-9_]*$'

  _DOTENV_KEYS=()
  _DOTENV_VALUES=()

  if [[ ! -r "$file" ]]; then
    echo "Error: Cannot read config file: $file"
    return 1
  fi

  # '|| [[ -n "$line" ]]' keeps a final line that has no trailing newline.
  while IFS= read -r line || [[ -n "$line" ]]; do
    line_num=$((line_num + 1))
    line=$(_dotenv_trim "$line")

    # Skip empty lines and comments
    if [[ -z "$line" || "$line" == '#'* ]]; then
      continue
    fi

    # Check for key=value format
    if [[ "$line" != *=* ]]; then
      echo "Error: Invalid syntax at line $line_num: missing \"=\""
      return 1
    fi

    # Split on the FIRST '=' only, so '=' characters inside the value
    # (including a trailing one, e.g. base64 padding) are kept.
    key=$(_dotenv_trim "${line%%=*}")
    value=$(_dotenv_trim "${line#*=}")

    # Validate key format (alphanumeric + underscore)
    if [[ ! "$key" =~ $key_re ]]; then
      echo "Error: Invalid key format at line $line_num: $key"
      return 1
    fi

    # Check for duplicates
    if [[ "$seen" == *" $key "* ]]; then
      echo "Error: Duplicate key at line $line_num: $key"
      return 1
    fi
    seen+="$key "

    # Remove one pair of matching surrounding quotes, if present
    if (( ${#value} >= 2 )); then
      first=${value:0:1}
      last=${value: -1}
      if [[ "$first" == "$last" && ( "$first" == '"' || "$first" == "'" ) ]]; then
        value=${value:1:${#value}-2}
      fi
    fi

    _DOTENV_KEYS+=("$key")
    _DOTENV_VALUES+=("$value")
  done < "$file"

  return 0
}

#######################################
# Parse a .env file and export each entry as DOTENV_<KEY>.
# Nothing is exported unless the whole file validates.
# Arguments: $1 - path to the .env file
# Returns:   0 on success, 1 if the file is invalid or unreadable
#######################################
_dotenv_load() {
  local file=$1
  local idx

  _dotenv_parse "$file" || return 1

  for idx in "${!_DOTENV_KEYS[@]}"; do
    export "DOTENV_${_DOTENV_KEYS[idx]}=${_DOTENV_VALUES[idx]}"
  done
  return 0
}

#######################################
# Copy DOTENV_* settings into the script's own variables where the command
# line did not already provide a value.
# Globals: NODES_ARG, ETH_IF, IB_IF, MASTER_PORT, CONTAINER_NAME (written)
#######################################
_dotenv_apply_defaults() {
  if [[ -z "${NODES_ARG:-}" && -n "${DOTENV_CLUSTER_NODES:-}" ]]; then
    NODES_ARG="$DOTENV_CLUSTER_NODES"
  fi

  if [[ -z "${ETH_IF:-}" && -n "${DOTENV_ETH_IF:-}" ]]; then
    ETH_IF="$DOTENV_ETH_IF"
  fi

  if [[ -z "${IB_IF:-}" && -n "${DOTENV_IB_IF:-}" ]]; then
    IB_IF="$DOTENV_IB_IF"
  fi

  # NOTE(review): a value equal to the built-in default is treated as
  # "not given on the command line", so an explicit CLI value of 29501 /
  # vllm_node is still overridden by the .env file.
  if [[ -z "${MASTER_PORT:-}" || "${MASTER_PORT:-}" == "29501" ]] \
    && [[ -n "${DOTENV_MASTER_PORT:-}" ]]; then
    MASTER_PORT="$DOTENV_MASTER_PORT"
  fi

  if [[ -z "${CONTAINER_NAME:-}" || "${CONTAINER_NAME:-}" == "vllm_node" ]] \
    && [[ -n "${DOTENV_CONTAINER_NAME:-}" ]]; then
    CONTAINER_NAME="$DOTENV_CONTAINER_NAME"
  fi
}

# Set .env file path (use default if not specified)
if [[ -z "${CONFIG_FILE:-}" ]]; then
  CONFIG_FILE="${SCRIPT_DIR:-}/.env"
elif [[ ! -f "$CONFIG_FILE" ]]; then
  # An explicitly requested file that does not exist is most likely a typo.
  echo "Warning: config file '$CONFIG_FILE' not found; continuing without it." >&2
fi

# Load .env file if exists
if [[ -f "$CONFIG_FILE" ]]; then
  echo "Loading configuration from .env file..."

  if ! _dotenv_load "$CONFIG_FILE"; then
    echo "Error: Invalid .env file syntax. Aborting."
    exit 1
  fi

  echo "Loaded .env variables: $(compgen -v DOTENV_ | tr '\n' ' ')"
fi

# Apply .env configuration (CLI args take precedence)
_dotenv_apply_defaults
# Validate non-privileged mode flags # Validate non-privileged mode flags
if [[ "$NON_PRIVILEGED_MODE" == "true" ]]; then if [[ "$NON_PRIVILEGED_MODE" == "true" ]]; then
# Set default swap limit if not specified # Set default swap limit if not specified
@@ -163,6 +285,22 @@ if [[ -n "$NCCL_DEBUG_VAL" ]]; then
esac esac
fi fi
#######################################
# Quote a string so a shell re-parses it as a single literal word.
# Mirrors Python's shlex.quote(): strings made only of [A-Za-z0-9_@%+=:,./-]
# are returned unchanged, the empty string becomes '', and anything else is
# wrapped in single quotes with embedded single quotes written as '"'"'.
# Arguments: $1 - raw value
# Outputs:   the quoted value on stdout (no trailing newline)
#######################################
_shell_quote() {
  # Byte-wise matching keeps the bracket ranges ASCII-only in every locale.
  local LC_ALL=C
  local value=$1
  local sq="'"
  local esc="'\"'\"'"

  case "$value" in
    '')
      printf '%s' "''"
      ;;
    *[!A-Za-z0-9_@%+=:,./-]*)
      printf '%s' "'${value//$sq/$esc}'"
      ;;
    *)
      printf '%s' "$value"
      ;;
  esac
}

#######################################
# Add container environment variables from .env (CONTAINER_* pattern).
# Each DOTENV_CONTAINER_<X> variable is appended to DOCKER_ARGS as
# "-e <X>=<quoted value>".
# Globals: DOCKER_ARGS (appended to), DOTENV_CONTAINER_* (read)
# Outputs: one "Adding container env: <X>" line per forwarded variable
#######################################
_add_dotenv_container_env() {
  local env_var actual_var

  for env_var in "${!DOTENV_CONTAINER_@}"; do
    # Extract the actual env var name (remove DOTENV_CONTAINER_ prefix)
    actual_var="${env_var#DOTENV_CONTAINER_}"

    # CONTAINER_NAME is the launcher's own container-name setting; it only
    # matches the prefix by accident and must not be forwarded as NAME=...
    if [[ -z "$actual_var" || "$actual_var" == "NAME" ]]; then
      continue
    fi

    # The value is quoted by a bash helper and never spliced into another
    # interpreter's source, so quotes in the value cannot break or inject.
    # NOTE(review): the quoting is only meaningful if DOCKER_ARGS is later
    # re-parsed by a shell (eval / bash -c) - confirm against the call site.
    DOCKER_ARGS="${DOCKER_ARGS:-} -e $actual_var=$(_shell_quote "${!env_var}")"
    echo "Adding container env: $actual_var"
  done
}

_add_dotenv_container_env
# Add build job parallelization environment variables if BUILD_JOBS is set # Add build job parallelization environment variables if BUILD_JOBS is set
if [[ -n "$BUILD_JOBS" ]]; then if [[ -n "$BUILD_JOBS" ]]; then
DOCKER_ARGS="$DOCKER_ARGS -e MAX_JOBS=$BUILD_JOBS" DOCKER_ARGS="$DOCKER_ARGS -e MAX_JOBS=$BUILD_JOBS"

File diff suppressed because it is too large Load Diff