.env configuration support for launch-cluster.sh
This commit is contained in:
29
.env.example
Normal file
29
.env.example
Normal file
@@ -0,0 +1,29 @@
|
|||||||
|
# Example .env configuration file for spark-vllm-docker
|
||||||
|
# Copy this file to .env and customize for your environment
|
||||||
|
|
||||||
|
# Cluster configuration
|
||||||
|
# CLUSTER_NODES: Comma-separated list of node IPs (first node is the head node)
|
||||||
|
CLUSTER_NODES="192.168.1.1,192.168.1.2,192.168.1.3"
|
||||||
|
|
||||||
|
# ETH_IF: Ethernet interface name (optional, auto-detected if not specified)
|
||||||
|
ETH_IF="eth0"
|
||||||
|
|
||||||
|
# IB_IF: InfiniBand interface name (optional, auto-detected if not specified)
|
||||||
|
IB_IF="ib0"
|
||||||
|
|
||||||
|
# MASTER_PORT: Port for cluster coordination (default: 29501)
|
||||||
|
MASTER_PORT="29501"
|
||||||
|
|
||||||
|
# CONTAINER_NAME: Container name (default: vllm_node)
|
||||||
|
CONTAINER_NAME="vllm_node"
|
||||||
|
|
||||||
|
# Container environment variables
|
||||||
|
# Any variable starting with CONTAINER_ will be converted to -e flags
|
||||||
|
# Example: CONTAINER_NCCL_DEBUG=INFO becomes -e NCCL_DEBUG=INFO
|
||||||
|
CONTAINER_NCCL_DEBUG="INFO"
|
||||||
|
CONTAINER_HF_TOKEN="your_huggingface_token_here"
|
||||||
|
CONTAINER_NCCL_IGNORE_CPU_AFFINITY="1"
|
||||||
|
|
||||||
|
# Additional container environment variables
|
||||||
|
# CONTAINER_MAX_JOBS="16"
|
||||||
|
# CONTAINER_CUDA_VISIBLE_DEVICES="0,1"
|
||||||
@@ -30,6 +30,7 @@ MOD_PATHS=()
|
|||||||
MOD_TYPES=()
|
MOD_TYPES=()
|
||||||
LAUNCH_SCRIPT_PATH=""
|
LAUNCH_SCRIPT_PATH=""
|
||||||
SCRIPT_DIR="$(dirname "$(realpath "$0")")"
|
SCRIPT_DIR="$(dirname "$(realpath "$0")")"
|
||||||
|
CONFIG_FILE="" # Will be set to default after argument parsing
|
||||||
|
|
||||||
ACTIONS_ARG=""
|
ACTIONS_ARG=""
|
||||||
SOLO_MODE="false"
|
SOLO_MODE="false"
|
||||||
@@ -67,9 +68,27 @@ usage() {
|
|||||||
echo " --mem-swap-limit-gb Memory+swap limit in GB (default: mem-limit + 10, only with --non-privileged)"
|
echo " --mem-swap-limit-gb Memory+swap limit in GB (default: mem-limit + 10, only with --non-privileged)"
|
||||||
echo " --pids-limit Process limit (default: 4096, only with --non-privileged)"
|
echo " --pids-limit Process limit (default: 4096, only with --non-privileged)"
|
||||||
echo " --shm-size-gb Shared memory size in GB (default: 64, only with --non-privileged)"
|
echo " --shm-size-gb Shared memory size in GB (default: 64, only with --non-privileged)"
|
||||||
|
echo " --config Path to .env configuration file (default: .env in script directory)"
|
||||||
echo " action start | stop | status | exec (Default: start). Not compatible with --launch-script."
|
echo " action start | stop | status | exec (Default: start). Not compatible with --launch-script."
|
||||||
echo " command Command to run (only for 'exec' action). Not compatible with --launch-script."
|
echo " command Command to run (only for 'exec' action). Not compatible with --launch-script."
|
||||||
echo ""
|
echo ""
|
||||||
|
echo "Supported .env file variables:"
|
||||||
|
echo " CLUSTER_NODES Comma-separated list of node IPs"
|
||||||
|
echo " ETH_IF Ethernet interface name"
|
||||||
|
echo " IB_IF InfiniBand interface name"
|
||||||
|
echo " MASTER_PORT Port for cluster coordination (default: 29501)"
|
||||||
|
echo " CONTAINER_NAME Container name (default: vllm_node)"
|
||||||
|
echo " CONTAINER_* Any variable starting with CONTAINER_ becomes -e flag"
|
||||||
|
echo " Example: CONTAINER_NCCL_DEBUG=INFO -> -e NCCL_DEBUG=INFO"
|
||||||
|
echo ""
|
||||||
|
echo "Example .env file:"
|
||||||
|
echo " CLUSTER_NODES=192.168.1.1,192.168.1.2"
|
||||||
|
echo " ETH_IF=eth0"
|
||||||
|
echo " IB_IF=ib0"
|
||||||
|
echo " MASTER_PORT=29501"
|
||||||
|
echo " CONTAINER_NCCL_DEBUG=INFO"
|
||||||
|
echo " CONTAINER_HF_TOKEN=abc123"
|
||||||
|
echo ""
|
||||||
echo "Launch Script Usage:"
|
echo "Launch Script Usage:"
|
||||||
echo " $0 --launch-script examples/my-script.sh # Script copied to container and executed"
|
echo " $0 --launch-script examples/my-script.sh # Script copied to container and executed"
|
||||||
echo " $0 --launch-script /path/to/script.sh # Uses absolute path to script"
|
echo " $0 --launch-script /path/to/script.sh # Uses absolute path to script"
|
||||||
@@ -108,6 +127,7 @@ while [[ "$#" -gt 0 ]]; do
|
|||||||
--shm-size-gb) SHM_SIZE_GB="$2"; shift ;;
|
--shm-size-gb) SHM_SIZE_GB="$2"; shift ;;
|
||||||
-d) DAEMON_MODE="true" ;;
|
-d) DAEMON_MODE="true" ;;
|
||||||
-h|--help) usage ;;
|
-h|--help) usage ;;
|
||||||
|
--config) CONFIG_FILE="$2"; shift ;;
|
||||||
start|stop|status)
|
start|stop|status)
|
||||||
if [[ -n "$LAUNCH_SCRIPT_PATH" ]]; then
|
if [[ -n "$LAUNCH_SCRIPT_PATH" ]]; then
|
||||||
echo "Error: Action '$1' is not compatible with --launch-script. Please omit the action or not use --launch-script."
|
echo "Error: Action '$1' is not compatible with --launch-script. Please omit the action or not use --launch-script."
|
||||||
@@ -133,6 +153,108 @@ while [[ "$#" -gt 0 ]]; do
|
|||||||
shift
|
shift
|
||||||
done
|
done
|
||||||
|
|
||||||
|
# Set .env file path (use default if not specified)
if [[ -z "${CONFIG_FILE:-}" ]]; then
    CONFIG_FILE="${SCRIPT_DIR:-}/.env"
fi

#######################################
# Print $1 with leading and trailing whitespace removed.
# Arguments: $1 - text to trim
# Outputs:   trimmed text on stdout (no trailing newline)
#######################################
dotenv_trim() {
    local text="$1"
    text="${text#"${text%%[![:space:]]*}"}"
    text="${text%"${text##*[![:space:]]}"}"
    printf '%s' "$text"
}

#######################################
# Validate the syntax of a .env file.
# Every non-blank, non-comment line must be KEY=VALUE, where KEY matches
# [A-Za-z_][A-Za-z0-9_]* and appears only once in the file.
# Arguments: $1 - path to the .env file
# Outputs:   a diagnostic on stderr for the first offending line
# Returns:   0 if the file is valid, 1 otherwise
#######################################
dotenv_validate_file() {
    local file="$1"
    local raw line key
    local line_num=0
    # Space-delimited list of keys seen so far; keys cannot contain spaces
    # because they are checked against key_re before being recorded.
    local seen_keys=" "
    local key_re='^[A-Za-z_][A-Za-z0-9_]*$'

    if [[ ! -r "$file" ]]; then
        echo "Error: Cannot read .env file: $file" >&2
        return 1
    fi

    # '|| [[ -n "$raw" ]]' processes a final line that lacks a trailing newline.
    while IFS= read -r raw || [[ -n "$raw" ]]; do
        line_num=$((line_num + 1))
        line="$(dotenv_trim "$raw")"

        # Skip empty lines and comments
        if [[ -z "$line" || "$line" == \#* ]]; then
            continue
        fi

        # Check for key=value format
        if [[ "$line" != *=* ]]; then
            echo "Error: Invalid syntax at line $line_num: missing \"=\"" >&2
            return 1
        fi

        key="$(dotenv_trim "${line%%=*}")"

        # Validate key format (alphanumeric + underscore)
        if [[ ! "$key" =~ $key_re ]]; then
            echo "Error: Invalid key format at line $line_num: $key" >&2
            return 1
        fi

        # Check for duplicates
        if [[ "$seen_keys" == *" $key "* ]]; then
            echo "Error: Duplicate key at line $line_num: $key" >&2
            return 1
        fi
        seen_keys+="$key "
    done < "$file"

    return 0
}

#######################################
# Load KEY=VALUE pairs from a .env file into exported DOTENV_<KEY> variables.
# Values are taken literally: everything after the first "=" is kept
# (including any further "=" characters), surrounding whitespace is trimmed,
# and one pair of matching surrounding quotes (single or double) is removed.
# Nothing in the value is evaluated or expanded.
# Arguments: $1 - path to an already validated .env file
# Globals:   DOTENV_* (written, exported)
# Returns:   0
#######################################
dotenv_load_file() {
    local file="$1"
    local raw line key value

    while IFS= read -r raw || [[ -n "$raw" ]]; do
        line="$(dotenv_trim "$raw")"

        # Skip comments, empty lines and anything without a key
        if [[ -z "$line" || "$line" == \#* || "$line" != *=* ]]; then
            continue
        fi

        key="$(dotenv_trim "${line%%=*}")"
        if [[ -z "$key" ]]; then
            continue
        fi

        # Split on the FIRST "=" only, so values such as base64 tokens that
        # end in "=" are preserved intact.
        value="$(dotenv_trim "${line#*=}")"

        # Remove surrounding quotes if present
        if (( ${#value} >= 2 )); then
            if [[ "$value" == \"*\" || "$value" == \'*\' ]]; then
                value="${value:1:${#value}-2}"
            fi
        fi

        # Export with DOTENV_ prefix
        export "DOTENV_$key=$value"
    done < "$file"

    return 0
}

#######################################
# Apply loaded .env values to the script's settings.
# A setting is only filled in from .env when it is currently empty, so values
# already present (e.g. from CLI arguments) take precedence.
# NOTE(review): MASTER_PORT and CONTAINER_NAME are also overridden when they
# still equal their documented defaults ("29501" / "vllm_node"). If those
# values can also be passed explicitly on the command line, such an explicit
# value cannot win over a different value in .env -- confirm against the
# argument parser.
# Globals:   NODES_ARG, ETH_IF, IB_IF, MASTER_PORT, CONTAINER_NAME (written)
#            DOTENV_CLUSTER_NODES, DOTENV_ETH_IF, DOTENV_IB_IF,
#            DOTENV_MASTER_PORT, DOTENV_CONTAINER_NAME (read)
#######################################
dotenv_apply_config() {
    if [[ -z "${NODES_ARG:-}" && -n "${DOTENV_CLUSTER_NODES:-}" ]]; then
        NODES_ARG="$DOTENV_CLUSTER_NODES"
    fi

    if [[ -z "${ETH_IF:-}" && -n "${DOTENV_ETH_IF:-}" ]]; then
        ETH_IF="$DOTENV_ETH_IF"
    fi

    if [[ -z "${IB_IF:-}" && -n "${DOTENV_IB_IF:-}" ]]; then
        IB_IF="$DOTENV_IB_IF"
    fi

    if [[ -z "${MASTER_PORT:-}" || "${MASTER_PORT:-}" == "29501" ]] && [[ -n "${DOTENV_MASTER_PORT:-}" ]]; then
        MASTER_PORT="$DOTENV_MASTER_PORT"
    fi

    if [[ -z "${CONTAINER_NAME:-}" || "${CONTAINER_NAME:-}" == "vllm_node" ]] && [[ -n "${DOTENV_CONTAINER_NAME:-}" ]]; then
        CONTAINER_NAME="$DOTENV_CONTAINER_NAME"
    fi
}

# Load .env file if exists
if [[ -f "$CONFIG_FILE" ]]; then
    echo "Loading configuration from .env file..."

    # Validate .env file syntax
    if ! dotenv_validate_file "$CONFIG_FILE"; then
        echo "Error: Invalid .env file syntax. Aborting." >&2
        exit 1
    fi

    # Load .env variables with DOTENV_ prefix
    dotenv_load_file "$CONFIG_FILE"

    # Only variable names are printed, never values (they may hold secrets).
    echo "Loaded .env variables: $(compgen -v DOTENV_ | tr '\n' ' ')"
fi

# Apply .env configuration (CLI args take precedence)
dotenv_apply_config
|
||||||
|
|
||||||
# Validate non-privileged mode flags
|
# Validate non-privileged mode flags
|
||||||
if [[ "$NON_PRIVILEGED_MODE" == "true" ]]; then
|
if [[ "$NON_PRIVILEGED_MODE" == "true" ]]; then
|
||||||
# Set default swap limit if not specified
|
# Set default swap limit if not specified
|
||||||
@@ -163,6 +285,22 @@ if [[ -n "$NCCL_DEBUG_VAL" ]]; then
|
|||||||
esac
|
esac
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
#######################################
# Quote $1 so it is a single, literal word when re-parsed by a shell.
# The output format mirrors Python's shlex.quote: an empty string becomes '',
# a string made only of [A-Za-z0-9_@%+=:,./-] is returned unchanged, and
# anything else is wrapped in single quotes with embedded single quotes
# written as '"'"'.
# Arguments: $1 - raw value
# Outputs:   quoted value on stdout (no trailing newline)
#######################################
dotenv_shell_quote() {
    # Byte-wise matching so the character ranges below are ASCII-only.
    local LC_ALL=C
    local raw="$1"
    local safe_re='^[A-Za-z0-9_@%+=:,./-]+$'
    local squote="'"
    local squote_escaped="'\"'\"'"

    if [[ -z "$raw" ]]; then
        printf "''"
    elif [[ "$raw" =~ $safe_re ]]; then
        printf '%s' "$raw"
    else
        printf "'%s'" "${raw//$squote/$squote_escaped}"
    fi
}

#######################################
# Append "-e NAME=VALUE" to DOCKER_ARGS for every DOTENV_CONTAINER_<NAME>
# variable loaded from the .env file.
# DOTENV_CONTAINER_NAME is skipped: the CONTAINER_NAME key configures the
# container's name and must not be forwarded as an env var called NAME.
# Globals:   DOCKER_ARGS (appended to), DOTENV_CONTAINER_* (read)
# Outputs:   one "Adding container env: NAME" line per variable (values are
#            not printed, they may hold secrets)
#######################################
add_dotenv_container_env() {
    local env_var value actual_var escaped_value

    while IFS= read -r env_var; do
        # Extract the actual env var name (remove DOTENV_CONTAINER_ prefix)
        actual_var="${env_var#DOTENV_CONTAINER_}"

        # Skip the reserved CONTAINER_NAME setting and a bare "CONTAINER_" key
        if [[ -z "$actual_var" || "$actual_var" == "NAME" ]]; then
            continue
        fi

        # Get the value
        value="${!env_var}"

        # Escape the value for the shell without interpolating it into code
        escaped_value="$(dotenv_shell_quote "$value")"

        # Add to docker args
        DOCKER_ARGS="${DOCKER_ARGS:-} -e $actual_var=$escaped_value"
        echo "Adding container env: $actual_var"
    done < <(compgen -v DOTENV_CONTAINER_)

    return 0
}

# Add container environment variables from .env (CONTAINER_* pattern)
add_dotenv_container_env
|
||||||
|
|
||||||
# Add build job parallelization environment variables if BUILD_JOBS is set
|
# Add build job parallelization environment variables if BUILD_JOBS is set
|
||||||
if [[ -n "$BUILD_JOBS" ]]; then
|
if [[ -n "$BUILD_JOBS" ]]; then
|
||||||
DOCKER_ARGS="$DOCKER_ARGS -e MAX_JOBS=$BUILD_JOBS"
|
DOCKER_ARGS="$DOCKER_ARGS -e MAX_JOBS=$BUILD_JOBS"
|
||||||
|
|||||||
328
run-recipe.py
328
run-recipe.py
@@ -105,7 +105,7 @@ LAUNCH_SCRIPT = SCRIPT_DIR / "launch-cluster.sh"
|
|||||||
BUILD_SCRIPT = SCRIPT_DIR / "build-and-copy.sh"
|
BUILD_SCRIPT = SCRIPT_DIR / "build-and-copy.sh"
|
||||||
DOWNLOAD_SCRIPT = SCRIPT_DIR / "hf-download.sh"
|
DOWNLOAD_SCRIPT = SCRIPT_DIR / "hf-download.sh"
|
||||||
AUTODISCOVER_SCRIPT = SCRIPT_DIR / "autodiscover.sh"
|
AUTODISCOVER_SCRIPT = SCRIPT_DIR / "autodiscover.sh"
|
||||||
ENV_FILE = SCRIPT_DIR / ".env"
|
ENV_FILE = None # Will be set from CLI argument or default
|
||||||
|
|
||||||
|
|
||||||
def load_recipe(recipe_path: Path) -> dict[str, Any]:
|
def load_recipe(recipe_path: Path) -> dict[str, Any]:
|
||||||
@@ -187,7 +187,9 @@ def load_recipe(recipe_path: Path) -> dict[str, Any]:
|
|||||||
SUPPORTED_VERSIONS = ["1"]
|
SUPPORTED_VERSIONS = ["1"]
|
||||||
recipe_ver = str(recipe["recipe_version"])
|
recipe_ver = str(recipe["recipe_version"])
|
||||||
if recipe_ver not in SUPPORTED_VERSIONS:
|
if recipe_ver not in SUPPORTED_VERSIONS:
|
||||||
print(f"Warning: Recipe uses schema version '{recipe_ver}', but this run-recipe.py supports: {SUPPORTED_VERSIONS}")
|
print(
|
||||||
|
f"Warning: Recipe uses schema version '{recipe_ver}', but this run-recipe.py supports: {SUPPORTED_VERSIONS}"
|
||||||
|
)
|
||||||
print("Some features may not work correctly. Consider updating run-recipe.py.")
|
print("Some features may not work correctly. Consider updating run-recipe.py.")
|
||||||
|
|
||||||
return recipe
|
return recipe
|
||||||
@@ -269,19 +271,27 @@ def check_image_exists(image: str, host: str | None = None) -> bool:
|
|||||||
"""
|
"""
|
||||||
if host:
|
if host:
|
||||||
result = subprocess.run(
|
result = subprocess.run(
|
||||||
["ssh", "-o", "BatchMode=yes", "-o", "StrictHostKeyChecking=no",
|
[
|
||||||
host, f"docker image inspect '{image}'"],
|
"ssh",
|
||||||
capture_output=True
|
"-o",
|
||||||
|
"BatchMode=yes",
|
||||||
|
"-o",
|
||||||
|
"StrictHostKeyChecking=no",
|
||||||
|
host,
|
||||||
|
f"docker image inspect '{image}'",
|
||||||
|
],
|
||||||
|
capture_output=True,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
result = subprocess.run(
|
result = subprocess.run(
|
||||||
["docker", "image", "inspect", image],
|
["docker", "image", "inspect", image], capture_output=True
|
||||||
capture_output=True
|
|
||||||
)
|
)
|
||||||
return result.returncode == 0
|
return result.returncode == 0
|
||||||
|
|
||||||
|
|
||||||
def build_image(image: str, copy_to: list[str] | None = None, build_args: list[str] | None = None) -> bool:
|
def build_image(
|
||||||
|
image: str, copy_to: list[str] | None = None, build_args: list[str] | None = None
|
||||||
|
) -> bool:
|
||||||
"""
|
"""
|
||||||
Build the container image using build-and-copy.sh.
|
Build the container image using build-and-copy.sh.
|
||||||
|
|
||||||
@@ -393,7 +403,13 @@ def check_model_exists(model: str) -> bool:
|
|||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
def generate_launch_script(recipe: dict[str, Any], overrides: dict[str, Any], is_solo: bool = False, extra_args: list[str] | None = None, no_ray: bool = False) -> str:
|
def generate_launch_script(
|
||||||
|
recipe: dict[str, Any],
|
||||||
|
overrides: dict[str, Any],
|
||||||
|
is_solo: bool = False,
|
||||||
|
extra_args: list[str] | None = None,
|
||||||
|
no_ray: bool = False,
|
||||||
|
) -> str:
|
||||||
"""
|
"""
|
||||||
Generate a bash launch script from the recipe.
|
Generate a bash launch script from the recipe.
|
||||||
|
|
||||||
@@ -446,7 +462,7 @@ def generate_launch_script(recipe: dict[str, Any], overrides: dict[str, Any], is
|
|||||||
if env_vars:
|
if env_vars:
|
||||||
lines.append("# Environment variables")
|
lines.append("# Environment variables")
|
||||||
for key, value in env_vars.items():
|
for key, value in env_vars.items():
|
||||||
lines.append(f"export {key}=\"{value}\"")
|
lines.append(f'export {key}="{value}"')
|
||||||
lines.append("")
|
lines.append("")
|
||||||
|
|
||||||
# Format the command with parameters
|
# Format the command with parameters
|
||||||
@@ -462,26 +478,24 @@ def generate_launch_script(recipe: dict[str, Any], overrides: dict[str, Any], is
|
|||||||
# (not needed for solo; no-ray uses PyTorch distributed instead)
|
# (not needed for solo; no-ray uses PyTorch distributed instead)
|
||||||
if is_solo or no_ray:
|
if is_solo or no_ray:
|
||||||
import re
|
import re
|
||||||
|
|
||||||
# Remove just the flag and its value, not the whole line
|
# Remove just the flag and its value, not the whole line
|
||||||
command = re.sub(r'--distributed-executor-backend\s+\S+', '', command)
|
command = re.sub(r"--distributed-executor-backend\s+\S+", "", command)
|
||||||
# Remove lines that are now empty or just a backslash continuation
|
# Remove lines that are now empty or just a backslash continuation
|
||||||
lines_list = command.split('\n')
|
lines_list = command.split("\n")
|
||||||
filtered_lines = [
|
filtered_lines = [line for line in lines_list if line.strip() not in ("", "\\")]
|
||||||
line for line in lines_list
|
command = "\n".join(filtered_lines)
|
||||||
if line.strip() not in ('', '\\')
|
|
||||||
]
|
|
||||||
command = '\n'.join(filtered_lines)
|
|
||||||
|
|
||||||
# Remove trailing backslash if present
|
# Remove trailing backslash if present
|
||||||
command = command.rstrip()
|
command = command.rstrip()
|
||||||
if command.endswith('\\'):
|
if command.endswith("\\"):
|
||||||
command = command.rstrip('\\\n').rstrip()
|
command = command.rstrip("\\\n").rstrip()
|
||||||
|
|
||||||
# Append extra args if provided (after --)
|
# Append extra args if provided (after --)
|
||||||
if extra_args:
|
if extra_args:
|
||||||
# Join extra args and append to command
|
# Join extra args and append to command
|
||||||
extra_args_str = ' '.join(shlex.quote(a) for a in extra_args)
|
extra_args_str = " ".join(shlex.quote(a) for a in extra_args)
|
||||||
command = command + ' ' + extra_args_str
|
command = command + " " + extra_args_str
|
||||||
|
|
||||||
lines.append("# Run the model")
|
lines.append("# Run the model")
|
||||||
lines.append(command.strip())
|
lines.append(command.strip())
|
||||||
@@ -631,11 +645,7 @@ def run_autodiscover() -> dict[str, str] | None:
|
|||||||
echo "IB_IF=$IB_IF"
|
echo "IB_IF=$IB_IF"
|
||||||
"""
|
"""
|
||||||
|
|
||||||
result = subprocess.run(
|
result = subprocess.run(["bash", "-c", script], capture_output=True, text=True)
|
||||||
["bash", "-c", script],
|
|
||||||
capture_output=True,
|
|
||||||
text=True
|
|
||||||
)
|
|
||||||
|
|
||||||
if result.returncode != 0:
|
if result.returncode != 0:
|
||||||
print("Autodiscover output:")
|
print("Autodiscover output:")
|
||||||
@@ -649,7 +659,10 @@ def run_autodiscover() -> dict[str, str] | None:
|
|||||||
output_lines = result.stdout.strip().split("\n")
|
output_lines = result.stdout.strip().split("\n")
|
||||||
env = {}
|
env = {}
|
||||||
for line in output_lines:
|
for line in output_lines:
|
||||||
if "=" in line and any(line.startswith(k) for k in ["CLUSTER_NODES=", "LOCAL_IP=", "ETH_IF=", "IB_IF="]):
|
if "=" in line and any(
|
||||||
|
line.startswith(k)
|
||||||
|
for k in ["CLUSTER_NODES=", "LOCAL_IP=", "ETH_IF=", "IB_IF="]
|
||||||
|
):
|
||||||
key, _, value = line.partition("=")
|
key, _, value = line.partition("=")
|
||||||
env[key] = value
|
env[key] = value
|
||||||
else:
|
else:
|
||||||
@@ -692,7 +705,9 @@ def run_autodiscover() -> dict[str, str] | None:
|
|||||||
print(f"Only one node selected: {selected_nodes[0]}")
|
print(f"Only one node selected: {selected_nodes[0]}")
|
||||||
print("This will run in solo mode (single node).")
|
print("This will run in solo mode (single node).")
|
||||||
else:
|
else:
|
||||||
print(f"Selected {len(selected_nodes)} nodes: {', '.join(selected_nodes)}")
|
print(
|
||||||
|
f"Selected {len(selected_nodes)} nodes: {', '.join(selected_nodes)}"
|
||||||
|
)
|
||||||
|
|
||||||
env["CLUSTER_NODES"] = ",".join(selected_nodes)
|
env["CLUSTER_NODES"] = ",".join(selected_nodes)
|
||||||
print()
|
print()
|
||||||
@@ -757,18 +772,16 @@ Examples:
|
|||||||
|
|
||||||
# Show current .env configuration
|
# Show current .env configuration
|
||||||
%(prog)s --show-env
|
%(prog)s --show-env
|
||||||
"""
|
""",
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"recipe",
|
"recipe",
|
||||||
nargs="?",
|
nargs="?",
|
||||||
help="Path to recipe YAML file (or just the name without .yaml)"
|
help="Path to recipe YAML file (or just the name without .yaml)",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--list", "-l",
|
"--list", "-l", action="store_true", help="List available recipes"
|
||||||
action="store_true",
|
|
||||||
help="List available recipes"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# Setup options
|
# Setup options
|
||||||
@@ -776,87 +789,194 @@ Examples:
|
|||||||
setup_group.add_argument(
|
setup_group.add_argument(
|
||||||
"--setup",
|
"--setup",
|
||||||
action="store_true",
|
action="store_true",
|
||||||
help="Full setup: build container (if missing) + download model (if missing) + run"
|
help="Full setup: build container (if missing) + download model (if missing) + run",
|
||||||
)
|
)
|
||||||
setup_group.add_argument(
|
setup_group.add_argument(
|
||||||
"--build-only",
|
"--build-only",
|
||||||
action="store_true",
|
action="store_true",
|
||||||
help="Only build/copy the container image, don't run"
|
help="Only build/copy the container image, don't run",
|
||||||
)
|
)
|
||||||
setup_group.add_argument(
|
setup_group.add_argument(
|
||||||
"--download-only",
|
"--download-only",
|
||||||
action="store_true",
|
action="store_true",
|
||||||
help="Only download/copy the model, don't run"
|
help="Only download/copy the model, don't run",
|
||||||
)
|
)
|
||||||
setup_group.add_argument(
|
setup_group.add_argument(
|
||||||
"--force-build",
|
"--force-build", action="store_true", help="Force rebuild even if image exists"
|
||||||
action="store_true",
|
|
||||||
help="Force rebuild even if image exists"
|
|
||||||
)
|
)
|
||||||
setup_group.add_argument(
|
setup_group.add_argument(
|
||||||
"--force-download",
|
"--force-download",
|
||||||
action="store_true",
|
action="store_true",
|
||||||
help="Force re-download even if model exists"
|
help="Force re-download even if model exists",
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--dry-run",
|
"--dry-run",
|
||||||
action="store_true",
|
action="store_true",
|
||||||
help="Show what would be executed without running"
|
help="Show what would be executed without running",
|
||||||
)
|
)
|
||||||
|
|
||||||
# Override options
|
# Override options
|
||||||
override_group = parser.add_argument_group("Recipe overrides")
|
override_group = parser.add_argument_group("Recipe overrides")
|
||||||
override_group.add_argument("--port", type=int, help="Override port")
|
override_group.add_argument("--port", type=int, help="Override port")
|
||||||
override_group.add_argument("--host", help="Override host")
|
override_group.add_argument("--host", help="Override host")
|
||||||
override_group.add_argument("--tensor-parallel", "--tp", type=int, dest="tensor_parallel", help="Override tensor parallelism")
|
override_group.add_argument(
|
||||||
override_group.add_argument("--gpu-memory-utilization", "--gpu-mem", type=float, dest="gpu_memory_utilization", help="Override GPU memory utilization")
|
"--tensor-parallel",
|
||||||
override_group.add_argument("--max-model-len", type=int, dest="max_model_len", help="Override max model length")
|
"--tp",
|
||||||
|
type=int,
|
||||||
|
dest="tensor_parallel",
|
||||||
|
help="Override tensor parallelism",
|
||||||
|
)
|
||||||
|
override_group.add_argument(
|
||||||
|
"--gpu-memory-utilization",
|
||||||
|
"--gpu-mem",
|
||||||
|
type=float,
|
||||||
|
dest="gpu_memory_utilization",
|
||||||
|
help="Override GPU memory utilization",
|
||||||
|
)
|
||||||
|
override_group.add_argument(
|
||||||
|
"--max-model-len",
|
||||||
|
type=int,
|
||||||
|
dest="max_model_len",
|
||||||
|
help="Override max model length",
|
||||||
|
)
|
||||||
|
|
||||||
# Launch options (passed to launch-cluster.sh)
|
# Launch options (passed to launch-cluster.sh)
|
||||||
launch_group = parser.add_argument_group("Launch options (passed to launch-cluster.sh)")
|
launch_group = parser.add_argument_group(
|
||||||
launch_group.add_argument("--solo", action="store_true", help="Run in solo mode (single node, no Ray)")
|
"Launch options (passed to launch-cluster.sh)"
|
||||||
launch_group.add_argument("-n", "--nodes", help="Comma-separated list of node IPs (first is head node)")
|
)
|
||||||
launch_group.add_argument("-d", "--daemon", action="store_true", help="Run in daemon mode")
|
launch_group.add_argument(
|
||||||
launch_group.add_argument("-t", "--container", dest="container_override", help="Override container image from recipe")
|
"--solo", action="store_true", help="Run in solo mode (single node, no Ray)"
|
||||||
launch_group.add_argument("--nccl-debug", choices=["VERSION", "WARN", "INFO", "TRACE"], help="NCCL debug level")
|
)
|
||||||
launch_group.add_argument("-e", "--env", action="append", dest="env_vars", default=[], metavar="VAR=VALUE", help="Environment variable to pass to container (e.g. -e HF_TOKEN=xxx). Can be used multiple times.")
|
launch_group.add_argument(
|
||||||
|
"-n", "--nodes", help="Comma-separated list of node IPs (first is head node)"
|
||||||
|
)
|
||||||
|
launch_group.add_argument(
|
||||||
|
"-d", "--daemon", action="store_true", help="Run in daemon mode"
|
||||||
|
)
|
||||||
|
launch_group.add_argument(
|
||||||
|
"-t",
|
||||||
|
"--container",
|
||||||
|
dest="container_override",
|
||||||
|
help="Override container image from recipe",
|
||||||
|
)
|
||||||
|
launch_group.add_argument(
|
||||||
|
"--nccl-debug",
|
||||||
|
choices=["VERSION", "WARN", "INFO", "TRACE"],
|
||||||
|
help="NCCL debug level",
|
||||||
|
)
|
||||||
|
launch_group.add_argument(
|
||||||
|
"-e",
|
||||||
|
"--env",
|
||||||
|
action="append",
|
||||||
|
dest="env_vars",
|
||||||
|
default=[],
|
||||||
|
metavar="VAR=VALUE",
|
||||||
|
help="Environment variable to pass to container (e.g. -e HF_TOKEN=xxx). Can be used multiple times.",
|
||||||
|
)
|
||||||
launch_group.add_argument(
|
launch_group.add_argument(
|
||||||
"--no-ray",
|
"--no-ray",
|
||||||
action="store_true",
|
action="store_true",
|
||||||
dest="no_ray",
|
dest="no_ray",
|
||||||
help="No-Ray mode: run multi-node vLLM without Ray (uses PyTorch distributed backend)"
|
help="No-Ray mode: run multi-node vLLM without Ray (uses PyTorch distributed backend)",
|
||||||
|
)
|
||||||
|
launch_group.add_argument(
|
||||||
|
"--master-port",
|
||||||
|
"--head-port",
|
||||||
|
type=int,
|
||||||
|
dest="master_port",
|
||||||
|
help="Port for cluster coordination (Ray head port or PyTorch distributed master port, default: 29501)",
|
||||||
|
)
|
||||||
|
launch_group.add_argument(
|
||||||
|
"--name",
|
||||||
|
dest="container_name",
|
||||||
|
help="Override container name (default: vllm_node)",
|
||||||
|
)
|
||||||
|
launch_group.add_argument(
|
||||||
|
"--eth-if",
|
||||||
|
dest="eth_if",
|
||||||
|
help="Ethernet interface (overrides .env and auto-detection)",
|
||||||
|
)
|
||||||
|
launch_group.add_argument(
|
||||||
|
"--ib-if",
|
||||||
|
dest="ib_if",
|
||||||
|
help="InfiniBand interface (overrides .env and auto-detection)",
|
||||||
|
)
|
||||||
|
launch_group.add_argument(
|
||||||
|
"-j",
|
||||||
|
dest="build_jobs",
|
||||||
|
type=int,
|
||||||
|
metavar="N",
|
||||||
|
help="Number of parallel build jobs inside container",
|
||||||
|
)
|
||||||
|
launch_group.add_argument(
|
||||||
|
"--no-cache-dirs",
|
||||||
|
action="store_true",
|
||||||
|
dest="no_cache_dirs",
|
||||||
|
help="Do not mount ~/.cache/vllm, ~/.cache/flashinfer, ~/.triton",
|
||||||
|
)
|
||||||
|
launch_group.add_argument(
|
||||||
|
"--non-privileged",
|
||||||
|
action="store_true",
|
||||||
|
dest="non_privileged",
|
||||||
|
help="Run in non-privileged mode (removes --privileged and --ipc=host)",
|
||||||
|
)
|
||||||
|
launch_group.add_argument(
|
||||||
|
"--mem-limit-gb",
|
||||||
|
type=int,
|
||||||
|
dest="mem_limit_gb",
|
||||||
|
help="Memory limit in GB (only with --non-privileged)",
|
||||||
|
)
|
||||||
|
launch_group.add_argument(
|
||||||
|
"--mem-swap-limit-gb",
|
||||||
|
type=int,
|
||||||
|
dest="mem_swap_limit_gb",
|
||||||
|
help="Memory+swap limit in GB (only with --non-privileged)",
|
||||||
|
)
|
||||||
|
launch_group.add_argument(
|
||||||
|
"--pids-limit",
|
||||||
|
type=int,
|
||||||
|
dest="pids_limit",
|
||||||
|
help="Process limit (only with --non-privileged, default: 4096)",
|
||||||
|
)
|
||||||
|
launch_group.add_argument(
|
||||||
|
"--shm-size-gb",
|
||||||
|
type=int,
|
||||||
|
dest="shm_size_gb",
|
||||||
|
help="Shared memory size in GB (only with --non-privileged, default: 64)",
|
||||||
|
)
|
||||||
|
|
||||||
|
# Config file option
|
||||||
|
parser.add_argument(
|
||||||
|
"--config",
|
||||||
|
dest="config_file",
|
||||||
|
metavar="FILE",
|
||||||
|
help="Path to .env configuration file (default: .env in script directory)",
|
||||||
)
|
)
|
||||||
launch_group.add_argument("--master-port", "--head-port", type=int, dest="master_port", help="Port for cluster coordination (Ray head port or PyTorch distributed master port, default: 29501)")
|
|
||||||
launch_group.add_argument("--name", dest="container_name", help="Override container name (default: vllm_node)")
|
|
||||||
launch_group.add_argument("--eth-if", dest="eth_if", help="Ethernet interface (overrides .env and auto-detection)")
|
|
||||||
launch_group.add_argument("--ib-if", dest="ib_if", help="InfiniBand interface (overrides .env and auto-detection)")
|
|
||||||
launch_group.add_argument("-j", dest="build_jobs", type=int, metavar="N", help="Number of parallel build jobs inside container")
|
|
||||||
launch_group.add_argument("--no-cache-dirs", action="store_true", dest="no_cache_dirs", help="Do not mount ~/.cache/vllm, ~/.cache/flashinfer, ~/.triton")
|
|
||||||
launch_group.add_argument("--non-privileged", action="store_true", dest="non_privileged", help="Run in non-privileged mode (removes --privileged and --ipc=host)")
|
|
||||||
launch_group.add_argument("--mem-limit-gb", type=int, dest="mem_limit_gb", help="Memory limit in GB (only with --non-privileged)")
|
|
||||||
launch_group.add_argument("--mem-swap-limit-gb", type=int, dest="mem_swap_limit_gb", help="Memory+swap limit in GB (only with --non-privileged)")
|
|
||||||
launch_group.add_argument("--pids-limit", type=int, dest="pids_limit", help="Process limit (only with --non-privileged, default: 4096)")
|
|
||||||
launch_group.add_argument("--shm-size-gb", type=int, dest="shm_size_gb", help="Shared memory size in GB (only with --non-privileged, default: 64)")
|
|
||||||
|
|
||||||
# Cluster discovery options
|
# Cluster discovery options
|
||||||
discover_group = parser.add_argument_group("Cluster discovery")
|
discover_group = parser.add_argument_group("Cluster discovery")
|
||||||
discover_group.add_argument(
|
discover_group.add_argument(
|
||||||
"--discover",
|
"--discover",
|
||||||
action="store_true",
|
action="store_true",
|
||||||
help="Auto-detect cluster nodes and save to .env file"
|
help="Auto-detect cluster nodes and save to .env file",
|
||||||
)
|
)
|
||||||
discover_group.add_argument(
|
discover_group.add_argument(
|
||||||
"--show-env",
|
"--show-env", action="store_true", help="Show current .env configuration"
|
||||||
action="store_true",
|
|
||||||
help="Show current .env configuration"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# Use parse_known_args to allow extra vLLM arguments after --
|
# Use parse_known_args to allow extra vLLM arguments after --
|
||||||
args, extra_args = parser.parse_known_args()
|
args, extra_args = parser.parse_known_args()
|
||||||
|
|
||||||
|
# Set .env file path (use default if not specified)
|
||||||
|
global ENV_FILE
|
||||||
|
if args.config_file:
|
||||||
|
ENV_FILE = Path(args.config_file).resolve()
|
||||||
|
else:
|
||||||
|
ENV_FILE = SCRIPT_DIR / ".env"
|
||||||
|
|
||||||
# Filter out the -- separator if present
|
# Filter out the -- separator if present
|
||||||
if extra_args and extra_args[0] == '--':
|
if extra_args and extra_args[0] == "--":
|
||||||
extra_args = extra_args[1:]
|
extra_args = extra_args[1:]
|
||||||
|
|
||||||
# Handle --discover (can be run with or without a recipe)
|
# Handle --discover (can be run with or without a recipe)
|
||||||
@@ -941,7 +1061,13 @@ Examples:
|
|||||||
if nodes:
|
if nodes:
|
||||||
# Ask if user wants to save to .env
|
# Ask if user wants to save to .env
|
||||||
print()
|
print()
|
||||||
response = input("Save this configuration to .env for future use? [Y/n]: ").strip().lower()
|
response = (
|
||||||
|
input(
|
||||||
|
"Save this configuration to .env for future use? [Y/n]: "
|
||||||
|
)
|
||||||
|
.strip()
|
||||||
|
.lower()
|
||||||
|
)
|
||||||
if response in ("", "y", "yes"):
|
if response in ("", "y", "yes"):
|
||||||
save_env_file(discovered_env)
|
save_env_file(discovered_env)
|
||||||
print()
|
print()
|
||||||
@@ -963,8 +1089,10 @@ Examples:
|
|||||||
solo_only = recipe.get("solo_only", False)
|
solo_only = recipe.get("solo_only", False)
|
||||||
is_solo = args.solo or not is_cluster
|
is_solo = args.solo or not is_cluster
|
||||||
|
|
||||||
if getattr(args, 'no_ray', False) and is_solo:
|
if getattr(args, "no_ray", False) and is_solo:
|
||||||
print("Error: --no-ray is incompatible with --solo. Solo mode already runs without Ray.")
|
print(
|
||||||
|
"Error: --no-ray is incompatible with --solo. Solo mode already runs without Ray."
|
||||||
|
)
|
||||||
return 1
|
return 1
|
||||||
|
|
||||||
if cluster_only and is_solo:
|
if cluster_only and is_solo:
|
||||||
@@ -972,7 +1100,9 @@ Examples:
|
|||||||
print(f"This model is too large to run on a single node.")
|
print(f"This model is too large to run on a single node.")
|
||||||
print()
|
print()
|
||||||
print("Options:")
|
print("Options:")
|
||||||
print(f" 1. Specify nodes directly: {sys.argv[0]} {args.recipe} -n node1,node2")
|
print(
|
||||||
|
f" 1. Specify nodes directly: {sys.argv[0]} {args.recipe} -n node1,node2"
|
||||||
|
)
|
||||||
print(f" 2. Auto-discover and save: {sys.argv[0]} --discover")
|
print(f" 2. Auto-discover and save: {sys.argv[0]} --discover")
|
||||||
print(f" Then run: {sys.argv[0]} {args.recipe}")
|
print(f" Then run: {sys.argv[0]} {args.recipe}")
|
||||||
return 1
|
return 1
|
||||||
@@ -1007,9 +1137,13 @@ Examples:
|
|||||||
print(f" Workers: {', '.join(worker_nodes)}")
|
print(f" Workers: {', '.join(worker_nodes)}")
|
||||||
print(f"Solo mode: {is_solo}")
|
print(f"Solo mode: {is_solo}")
|
||||||
if eth_if:
|
if eth_if:
|
||||||
print(f"Ethernet interface: {eth_if}{' (from .env)' if not args.eth_if else ''}")
|
print(
|
||||||
|
f"Ethernet interface: {eth_if}{' (from .env)' if not args.eth_if else ''}"
|
||||||
|
)
|
||||||
if ib_if:
|
if ib_if:
|
||||||
print(f"InfiniBand interface: {ib_if}{' (from .env)' if not args.ib_if else ''}")
|
print(
|
||||||
|
f"InfiniBand interface: {ib_if}{' (from .env)' if not args.ib_if else ''}"
|
||||||
|
)
|
||||||
if args.container_name:
|
if args.container_name:
|
||||||
print(f"Container name: {args.container_name}")
|
print(f"Container name: {args.container_name}")
|
||||||
if args.non_privileged:
|
if args.non_privileged:
|
||||||
@@ -1099,7 +1233,7 @@ Examples:
|
|||||||
print(f" 2. Build manually: ./build-and-copy.sh -t {container}")
|
print(f" 2. Build manually: ./build-and-copy.sh -t {container}")
|
||||||
print()
|
print()
|
||||||
response = input("Build now? [y/N] ").strip().lower()
|
response = input("Build now? [y/N] ").strip().lower()
|
||||||
if response == 'y':
|
if response == "y":
|
||||||
if not build_image(container, copy_targets, build_args):
|
if not build_image(container, copy_targets, build_args):
|
||||||
print("Error: Failed to build image")
|
print("Error: Failed to build image")
|
||||||
return 1
|
return 1
|
||||||
@@ -1109,7 +1243,13 @@ Examples:
|
|||||||
|
|
||||||
# Build overrides from CLI args
|
# Build overrides from CLI args
|
||||||
overrides = {}
|
overrides = {}
|
||||||
for key in ["port", "host", "tensor_parallel", "gpu_memory_utilization", "max_model_len"]:
|
for key in [
|
||||||
|
"port",
|
||||||
|
"host",
|
||||||
|
"tensor_parallel",
|
||||||
|
"gpu_memory_utilization",
|
||||||
|
"max_model_len",
|
||||||
|
]:
|
||||||
value = getattr(args, key, None)
|
value = getattr(args, key, None)
|
||||||
if value is not None:
|
if value is not None:
|
||||||
overrides[key] = value
|
overrides[key] = value
|
||||||
@@ -1122,24 +1262,34 @@ Examples:
|
|||||||
if extra_args:
|
if extra_args:
|
||||||
# Map vLLM flags to our override keys
|
# Map vLLM flags to our override keys
|
||||||
flag_to_override = {
|
flag_to_override = {
|
||||||
'--port': 'port',
|
"--port": "port",
|
||||||
'--host': 'host',
|
"--host": "host",
|
||||||
'--tensor-parallel-size': 'tensor_parallel',
|
"--tensor-parallel-size": "tensor_parallel",
|
||||||
'-tp': 'tensor_parallel',
|
"-tp": "tensor_parallel",
|
||||||
'--gpu-memory-utilization': 'gpu_memory_utilization',
|
"--gpu-memory-utilization": "gpu_memory_utilization",
|
||||||
'--max-model-len': 'max_model_len',
|
"--max-model-len": "max_model_len",
|
||||||
}
|
}
|
||||||
for i, arg in enumerate(extra_args):
|
for i, arg in enumerate(extra_args):
|
||||||
# Check both exact flag and =value syntax
|
# Check both exact flag and =value syntax
|
||||||
flag = arg.split('=')[0] if '=' in arg else arg
|
flag = arg.split("=")[0] if "=" in arg else arg
|
||||||
if flag in flag_to_override:
|
if flag in flag_to_override:
|
||||||
override_key = flag_to_override[flag]
|
override_key = flag_to_override[flag]
|
||||||
if override_key in overrides:
|
if override_key in overrides:
|
||||||
print(f"Warning: '{arg}' in extra args duplicates --{override_key.replace('_', '-')} override")
|
print(
|
||||||
print(f" vLLM uses last value; extra args appear after template substitution")
|
f"Warning: '{arg}' in extra args duplicates --{override_key.replace('_', '-')} override"
|
||||||
|
)
|
||||||
|
print(
|
||||||
|
f" vLLM uses last value; extra args appear after template substitution"
|
||||||
|
)
|
||||||
|
|
||||||
# Generate launch script
|
# Generate launch script
|
||||||
script_content = generate_launch_script(recipe, overrides, is_solo=is_solo, extra_args=extra_args, no_ray=getattr(args, 'no_ray', False))
|
script_content = generate_launch_script(
|
||||||
|
recipe,
|
||||||
|
overrides,
|
||||||
|
is_solo=is_solo,
|
||||||
|
extra_args=extra_args,
|
||||||
|
no_ray=getattr(args, "no_ray", False),
|
||||||
|
)
|
||||||
|
|
||||||
if args.dry_run:
|
if args.dry_run:
|
||||||
print("=== Generated Launch Script ===")
|
print("=== Generated Launch Script ===")
|
||||||
@@ -1158,7 +1308,7 @@ Examples:
|
|||||||
cmd_parts.append("--solo")
|
cmd_parts.append("--solo")
|
||||||
if args.daemon:
|
if args.daemon:
|
||||||
cmd_parts.append("-d")
|
cmd_parts.append("-d")
|
||||||
if getattr(args, 'no_ray', False):
|
if getattr(args, "no_ray", False):
|
||||||
cmd_parts.append("--no-ray")
|
cmd_parts.append("--no-ray")
|
||||||
if nodes:
|
if nodes:
|
||||||
cmd_parts.extend(["-n", ",".join(nodes)])
|
cmd_parts.extend(["-n", ",".join(nodes)])
|
||||||
@@ -1195,7 +1345,7 @@ Examples:
|
|||||||
return 0
|
return 0
|
||||||
|
|
||||||
# Write temporary launch script
|
# Write temporary launch script
|
||||||
with tempfile.NamedTemporaryFile(mode='w', suffix='.sh', delete=False) as f:
|
with tempfile.NamedTemporaryFile(mode="w", suffix=".sh", delete=False) as f:
|
||||||
f.write(script_content)
|
f.write(script_content)
|
||||||
temp_script = f.name
|
temp_script = f.name
|
||||||
|
|
||||||
@@ -1222,7 +1372,7 @@ Examples:
|
|||||||
if args.daemon:
|
if args.daemon:
|
||||||
cmd.append("-d")
|
cmd.append("-d")
|
||||||
|
|
||||||
if getattr(args, 'no_ray', False):
|
if getattr(args, "no_ray", False):
|
||||||
cmd.append("--no-ray")
|
cmd.append("--no-ray")
|
||||||
|
|
||||||
# Pass nodes to launch-cluster.sh (from command line, .env, or autodiscover)
|
# Pass nodes to launch-cluster.sh (from command line, .env, or autodiscover)
|
||||||
|
|||||||
Reference in New Issue
Block a user