Merge branch '3-node' of gitlab.home.eugr.net:ai/spark-vllm into 3-node
This commit is contained in:
@@ -29,6 +29,10 @@ CONTAINER_NCCL_DEBUG="INFO"
|
|||||||
CONTAINER_HF_TOKEN="your_huggingface_token_here"
|
CONTAINER_HF_TOKEN="your_huggingface_token_here"
|
||||||
CONTAINER_NCCL_IGNORE_CPU_AFFINITY="1"
|
CONTAINER_NCCL_IGNORE_CPU_AFFINITY="1"
|
||||||
|
|
||||||
|
# COPY_HOSTS: Comma-separated list of hosts for build-and-copy.sh (optional)
|
||||||
|
# Used by build-and-copy.sh to distribute images across cluster
|
||||||
|
COPY_HOSTS="192.168.177.12"
|
||||||
|
|
||||||
# Additional container environment variables
|
# Additional container environment variables
|
||||||
# CONTAINER_MAX_JOBS="16"
|
# CONTAINER_MAX_JOBS="16"
|
||||||
# CONTAINER_CUDA_VISIBLE_DEVICES="0,1"
|
# CONTAINER_CUDA_VISIBLE_DEVICES="0,1"
|
||||||
|
|||||||
@@ -1,5 +1,52 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
|
|
||||||
|
# Load .env file if exists (for shared configuration)
|
||||||
|
# This is called early so that DOTENV_* variables are available to all functions
|
||||||
|
#######################################
# Load KEY=VALUE pairs from a .env file and export each one as DOTENV_<KEY>.
#
# Parsing rules:
#   - Blank lines and lines whose first non-blank character is '#' are skipped.
#   - Each line is split at the FIRST '=' only, so values may contain '='
#     (including a trailing one, e.g. base64 padding).
#   - Surrounding whitespace (and a trailing CR from CRLF files) is removed
#     from both key and value.
#   - One pair of matching surrounding quotes ("..." or '...') is removed
#     from the value; the quoted content is kept verbatim.
#   - Keys that would not form a valid variable name are skipped with a
#     warning on stderr.
#
# Globals:
#   CONFIG_FILE      (read) path to the .env file; when empty, ".env" in the
#                    directory of this script is used and treated as optional.
#   CONFIG_FILE_SET  (read) "true" when the path was given explicitly.
#   DOTENV_<KEY>     (written, exported) one per accepted assignment.
# Outputs:
#   Diagnostics on stderr.
# Returns:
#   0 on success or when an optional file is absent. Calls `exit 1` (ending
#   the shell that runs this function) when an explicitly requested config
#   file does not exist.
#######################################
load_env_if_exists() {
  local env_file="${CONFIG_FILE:-}"
  local config_explicit="${CONFIG_FILE_SET:-false}"
  local source_path script_dir line key value
  local skip_re='^[[:space:]]*(#|$)'
  local key_re='^[A-Za-z0-9_]+$'

  # If CONFIG_FILE is not set, fall back to the default location.
  if [[ -z "$env_file" ]]; then
    # Declared separately from the assignment so a realpath failure is not
    # masked by `local`; fall back to the unresolved path in that case.
    source_path=$(realpath "${BASH_SOURCE[0]}") || source_path="${BASH_SOURCE[0]}"
    script_dir=$(dirname "$source_path")
    env_file="$script_dir/.env"
    config_explicit="false"
  fi

  if [[ ! -f "$env_file" ]]; then
    # A missing file is only an error when the caller asked for it by name.
    if [[ "$config_explicit" == "true" ]]; then
      echo "Error: Config file not found: $env_file" >&2
      exit 1
    fi
    return 0
  fi

  # Read whole lines (IFS=) and split manually: `IFS='=' read key value`
  # silently drops a single trailing '=' from the value.
  # The `|| [[ -n "$line" ]]` keeps a final line that lacks a newline.
  while IFS= read -r line || [[ -n "$line" ]]; do
    line="${line%$'\r'}"

    # Skip comments and empty lines.
    [[ "$line" =~ $skip_re ]] && continue

    if [[ "$line" == *=* ]]; then
      key="${line%%=*}"
      value="${line#*=}"
    else
      key="$line"
      value=""
    fi

    # Trim leading/trailing whitespace with parameter expansion (no fork,
    # and no quote/backslash interpretation as `xargs` would do).
    key="${key#"${key%%[![:space:]]*}"}"
    key="${key%"${key##*[![:space:]]}"}"
    value="${value#"${value%%[![:space:]]*}"}"
    value="${value%"${value##*[![:space:]]}"}"

    [[ -z "$key" ]] && continue

    # The DOTENV_ prefix supplies the leading character, so the key itself
    # only needs to consist of identifier characters.
    if [[ ! "$key" =~ $key_re ]]; then
      echo "Warning: skipping invalid key in $env_file: $key" >&2
      continue
    fi

    # Remove exactly one pair of matching surrounding quotes.
    case "$value" in
      \"*\" | \'*\') value="${value:1:${#value}-2}" ;;
    esac

    export "DOTENV_${key}=${value}"
  done < "$env_file"

  return 0
}
|
||||||
|
|
||||||
|
# Load .env file
|
||||||
|
load_env_if_exists
|
||||||
|
|
||||||
# Function to detect IB and Ethernet interfaces
|
# Function to detect IB and Ethernet interfaces
|
||||||
detect_interfaces() {
|
detect_interfaces() {
|
||||||
# If both interfaces are already set, nothing to do
|
# If both interfaces are already set, nothing to do
|
||||||
@@ -110,6 +157,19 @@ detect_nodes() {
|
|||||||
done
|
done
|
||||||
return 0
|
return 0
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# Try to use COPY_HOSTS from .env
|
||||||
|
if [[ -n "$DOTENV_COPY_HOSTS" ]]; then
|
||||||
|
echo " Using COPY_HOSTS from .env: $DOTENV_COPY_HOSTS"
|
||||||
|
PEER_NODES=()
|
||||||
|
IFS=',' read -ra ALL_NODES <<< "$DOTENV_COPY_HOSTS"
|
||||||
|
for node in "${ALL_NODES[@]}"; do
|
||||||
|
node=$(echo "$node" | xargs)
|
||||||
|
PEER_NODES+=("$node")
|
||||||
|
done
|
||||||
|
NODES_ARG="$DOTENV_COPY_HOSTS"
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
|
||||||
echo "Auto-detecting nodes..."
|
echo "Auto-detecting nodes..."
|
||||||
|
|
||||||
|
|||||||
@@ -28,6 +28,7 @@ VLLM_RELEASE_TAG="prebuilt-vllm-current"
|
|||||||
# Space-separated list of GPU architectures for which prebuilt wheels are available
|
# Space-separated list of GPU architectures for which prebuilt wheels are available
|
||||||
PREBUILT_WHEELS_SUPPORTED_ARCHS="12.1a"
|
PREBUILT_WHEELS_SUPPORTED_ARCHS="12.1a"
|
||||||
CLEANUP_MODE="false"
|
CLEANUP_MODE="false"
|
||||||
|
CONFIG_FILE=""
|
||||||
|
|
||||||
cleanup() {
|
cleanup() {
|
||||||
if [ -n "$TMP_IMAGE" ] && [ -f "$TMP_IMAGE" ]; then
|
if [ -n "$TMP_IMAGE" ] && [ -f "$TMP_IMAGE" ]; then
|
||||||
@@ -280,11 +281,13 @@ usage() {
|
|||||||
echo " --no-build : Skip building, only copy image (requires --copy-to)"
|
echo " --no-build : Skip building, only copy image (requires --copy-to)"
|
||||||
echo " --network <network> : Docker network to use during build"
|
echo " --network <network> : Docker network to use during build"
|
||||||
echo " --cleanup : Remove all *.whl and *.-commit files in wheels directory"
|
echo " --cleanup : Remove all *.whl and *.-commit files in wheels directory"
|
||||||
|
echo " --config : Path to .env configuration file (default: .env in script directory)"
|
||||||
echo " -h, --help : Show this help message"
|
echo " -h, --help : Show this help message"
|
||||||
exit 1
|
exit 1
|
||||||
}
|
}
|
||||||
|
|
||||||
# Argument parsing
|
# Parse all arguments
|
||||||
|
CONFIG_FILE_SET=false
|
||||||
while [[ "$#" -gt 0 ]]; do
|
while [[ "$#" -gt 0 ]]; do
|
||||||
case $1 in
|
case $1 in
|
||||||
-t|--tag) IMAGE_TAG="$2"; shift ;;
|
-t|--tag) IMAGE_TAG="$2"; shift ;;
|
||||||
@@ -298,27 +301,6 @@ while [[ "$#" -gt 0 ]]; do
|
|||||||
add_copy_hosts "$1"
|
add_copy_hosts "$1"
|
||||||
shift
|
shift
|
||||||
done
|
done
|
||||||
|
|
||||||
if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
|
|
||||||
echo "No hosts specified. Using autodiscovery..."
|
|
||||||
source "$(dirname "$0")/autodiscover.sh"
|
|
||||||
|
|
||||||
detect_nodes
|
|
||||||
if [ $? -ne 0 ]; then
|
|
||||||
echo "Error: Autodiscovery failed."
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [ ${#PEER_NODES[@]} -gt 0 ]; then
|
|
||||||
COPY_HOSTS=("${PEER_NODES[@]}")
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
|
|
||||||
echo "Error: Autodiscovery found no other nodes."
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
echo "Autodiscovered hosts: ${COPY_HOSTS[*]}"
|
|
||||||
fi
|
|
||||||
continue
|
continue
|
||||||
;;
|
;;
|
||||||
-j|--build-jobs) BUILD_JOBS="$2"; shift ;;
|
-j|--build-jobs) BUILD_JOBS="$2"; shift ;;
|
||||||
@@ -351,12 +333,42 @@ while [[ "$#" -gt 0 ]]; do
|
|||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
;;
|
;;
|
||||||
|
--config) CONFIG_FILE="$2"; CONFIG_FILE_SET=true; shift ;;
|
||||||
-h|--help) usage ;;
|
-h|--help) usage ;;
|
||||||
*) echo "Unknown parameter passed: $1"; usage ;;
|
*) echo "Unknown parameter passed: $1"; usage ;;
|
||||||
esac
|
esac
|
||||||
shift
|
shift
|
||||||
done
|
done
|
||||||
|
|
||||||
|
# Source autodiscover.sh to load .env file
|
||||||
|
source "$(dirname "$0")/autodiscover.sh"
|
||||||
|
|
||||||
|
# Handle COPY_HOSTS from .env or autodiscovery if not specified via arguments.
# Precedence: hosts already in COPY_HOSTS > DOTENV_COPY_HOSTS > detect_nodes.
if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
  if [[ -n "$DOTENV_COPY_HOSTS" ]]; then
    echo "Using COPY_HOSTS from .env: $DOTENV_COPY_HOSTS"
    IFS=',' read -ra HOSTS_FROM_ENV <<< "$DOTENV_COPY_HOSTS"
    COPY_HOSTS=()
    for env_host in "${HOSTS_FROM_ENV[@]}"; do
      # Trim surrounding whitespace so "a, b" yields "b" rather than " b",
      # and drop empty entries produced by stray commas.
      env_host="${env_host#"${env_host%%[![:space:]]*}"}"
      env_host="${env_host%"${env_host##*[![:space:]]}"}"
      if [[ -n "$env_host" ]]; then
        COPY_HOSTS+=("$env_host")
      fi
    done
    unset env_host
  else
    echo "No hosts specified. Using autodiscovery..."
    # detect_nodes is defined outside this block; presumably it comes from
    # the sourced autodiscover.sh and fills PEER_NODES - verify there.
    if ! detect_nodes; then
      echo "Error: Autodiscovery failed." >&2
      exit 1
    fi

    if [ "${#PEER_NODES[@]}" -gt 0 ]; then
      COPY_HOSTS=("${PEER_NODES[@]}")
    fi

    if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
      echo "Error: Autodiscovery found no other nodes." >&2
      exit 1
    fi
    echo "Autodiscovered hosts: ${COPY_HOSTS[*]}"
  fi
fi
|
||||||
|
|
||||||
# Validate flag combinations
|
# Validate flag combinations
|
||||||
if [ -n "$VLLM_PRS" ]; then
|
if [ -n "$VLLM_PRS" ]; then
|
||||||
if [ "$EXP_MXFP4" = true ]; then echo "Error: --apply-vllm-pr is incompatible with --exp-mxfp4"; exit 1; fi
|
if [ "$EXP_MXFP4" = true ]; then echo "Error: --apply-vllm-pr is incompatible with --exp-mxfp4"; exit 1; fi
|
||||||
|
|||||||
@@ -159,9 +159,12 @@ done
|
|||||||
# Set .env file path (use default if not specified)
|
# Set .env file path (use default if not specified)
|
||||||
if [[ -z "$CONFIG_FILE" ]]; then
|
if [[ -z "$CONFIG_FILE" ]]; then
|
||||||
CONFIG_FILE="$SCRIPT_DIR/.env"
|
CONFIG_FILE="$SCRIPT_DIR/.env"
|
||||||
|
CONFIG_FILE_SET=false
|
||||||
|
else
|
||||||
|
CONFIG_FILE_SET=true
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Load .env file if exists
|
# Load .env file
|
||||||
if [[ -f "$CONFIG_FILE" ]]; then
|
if [[ -f "$CONFIG_FILE" ]]; then
|
||||||
echo "Loading configuration from .env file..."
|
echo "Loading configuration from .env file..."
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user