Add NCCL debug level option to launch-cluster.sh
This commit is contained in:
@@ -246,6 +246,7 @@ You can override the auto-detected values if needed:
|
|||||||
| `--name` | Container name (default: `vllm_node`). |
|
| `--name` | Container name (default: `vllm_node`). |
|
||||||
| `--eth-if` | Ethernet interface name. |
|
| `--eth-if` | Ethernet interface name. |
|
||||||
| `--ib-if` | InfiniBand interface name. |
|
| `--ib-if` | InfiniBand interface name. |
|
||||||
|
| `--nccl-debug` | NCCL debug level (e.g., INFO, WARN). Defaults to INFO if flag is present but value is omitted. |
|
||||||
| `--check-config` | Check configuration and auto-detection without launching. |
|
| `--check-config` | Check configuration and auto-detection without launching. |
|
||||||
| `-d` | Run in daemon mode (detached). |
|
| `-d` | Run in daemon mode (detached). |
|
||||||
|
|
||||||
|
|||||||
@@ -4,7 +4,7 @@
|
|||||||
IMAGE_NAME="vllm-node"
|
IMAGE_NAME="vllm-node"
|
||||||
DEFAULT_CONTAINER_NAME="vllm_node"
|
DEFAULT_CONTAINER_NAME="vllm_node"
|
||||||
# Modify these if you want to pass additional docker args or set VLLM_SPARK_EXTRA_DOCKER_ARGS variable
|
# Modify these if you want to pass additional docker args or set VLLM_SPARK_EXTRA_DOCKER_ARGS variable
|
||||||
DOCKER_ARGS="-e NCCL_DEBUG=INFO -e NCCL_IGNORE_CPU_AFFINITY=1 -v $HOME/.cache/huggingface:/root/.cache/huggingface"
|
DOCKER_ARGS="-e NCCL_IGNORE_CPU_AFFINITY=1 -v $HOME/.cache/huggingface:/root/.cache/huggingface"
|
||||||
|
|
||||||
# Append additional arguments from environment variable
|
# Append additional arguments from environment variable
|
||||||
if [[ -n "$VLLM_SPARK_EXTRA_DOCKER_ARGS" ]]; then
|
if [[ -n "$VLLM_SPARK_EXTRA_DOCKER_ARGS" ]]; then
|
||||||
@@ -14,6 +14,7 @@ fi
|
|||||||
# ETH_IF and IB_IF will be auto-detected if not provided
|
# ETH_IF and IB_IF will be auto-detected if not provided
|
||||||
ETH_IF=""
|
ETH_IF=""
|
||||||
IB_IF=""
|
IB_IF=""
|
||||||
|
NCCL_DEBUG_VAL=""
|
||||||
|
|
||||||
# Initialize variables
|
# Initialize variables
|
||||||
NODES_ARG=""
|
NODES_ARG=""
|
||||||
@@ -26,12 +27,13 @@ CLUSTER_WAS_RUNNING="false"
|
|||||||
|
|
||||||
# Function to print usage
|
# Function to print usage
|
||||||
usage() {
|
usage() {
|
||||||
echo "Usage: $0 [-n <node_ips>] [-t <image_name>] [--name <container_name>] [--eth-if <if_name>] [--ib-if <if_name>] [--check-config] [-d] [action] [command]"
|
echo "Usage: $0 [-n <node_ips>] [-t <image_name>] [--name <container_name>] [--eth-if <if_name>] [--ib-if <if_name>] [--nccl-debug <level>] [--check-config] [-d] [action] [command]"
|
||||||
echo " -n, --nodes Comma-separated list of node IPs (Optional, auto-detected if omitted)"
|
echo " -n, --nodes Comma-separated list of node IPs (Optional, auto-detected if omitted)"
|
||||||
echo " -t Docker image name (Optional, default: $IMAGE_NAME)"
|
echo " -t Docker image name (Optional, default: $IMAGE_NAME)"
|
||||||
echo " --name Container name (Optional, default: $DEFAULT_CONTAINER_NAME)"
|
echo " --name Container name (Optional, default: $DEFAULT_CONTAINER_NAME)"
|
||||||
echo " --eth-if Ethernet interface (Optional, auto-detected)"
|
echo " --eth-if Ethernet interface (Optional, auto-detected)"
|
||||||
echo " --ib-if InfiniBand interface (Optional, auto-detected)"
|
echo " --ib-if InfiniBand interface (Optional, auto-detected)"
|
||||||
|
echo " --nccl-debug NCCL debug level (Optional, one of: VERSION, WARN, INFO, TRACE). If no level is provided, defaults to INFO."
|
||||||
echo " --check-config Check configuration and auto-detection without launching"
|
echo " --check-config Check configuration and auto-detection without launching"
|
||||||
echo " -d Daemon mode (only for 'start' action)"
|
echo " -d Daemon mode (only for 'start' action)"
|
||||||
echo " action start | stop | status | exec (Default: start)"
|
echo " action start | stop | status | exec (Default: start)"
|
||||||
@@ -47,6 +49,14 @@ while [[ "$#" -gt 0 ]]; do
|
|||||||
--name) CONTAINER_NAME="$2"; shift ;;
|
--name) CONTAINER_NAME="$2"; shift ;;
|
||||||
--eth-if) ETH_IF="$2"; shift ;;
|
--eth-if) ETH_IF="$2"; shift ;;
|
||||||
--ib-if) IB_IF="$2"; shift ;;
|
--ib-if) IB_IF="$2"; shift ;;
|
||||||
|
--nccl-debug)
|
||||||
|
if [[ -n "$2" && "$2" =~ ^(VERSION|WARN|INFO|TRACE)$ ]]; then
|
||||||
|
NCCL_DEBUG_VAL="$2"
|
||||||
|
shift
|
||||||
|
else
|
||||||
|
NCCL_DEBUG_VAL="INFO"
|
||||||
|
fi
|
||||||
|
;;
|
||||||
--check-config) CHECK_CONFIG="true" ;;
|
--check-config) CHECK_CONFIG="true" ;;
|
||||||
-d) DAEMON_MODE="true" ;;
|
-d) DAEMON_MODE="true" ;;
|
||||||
-h|--help) usage ;;
|
-h|--help) usage ;;
|
||||||
@@ -72,6 +82,20 @@ while [[ "$#" -gt 0 ]]; do
|
|||||||
shift
|
shift
|
||||||
done
|
done
|
||||||
|
|
||||||
|
# Append NCCL_DEBUG if set, with validation
|
||||||
|
if [[ -n "$NCCL_DEBUG_VAL" ]]; then
|
||||||
|
case "$NCCL_DEBUG_VAL" in
|
||||||
|
VERSION|WARN|INFO|TRACE)
|
||||||
|
DOCKER_ARGS="$DOCKER_ARGS -e NCCL_DEBUG=$NCCL_DEBUG_VAL"
|
||||||
|
;;
|
||||||
|
*)
|
||||||
|
echo "Error: Invalid value for --nccl-debug: $NCCL_DEBUG_VAL"
|
||||||
|
echo "Allowed values: VERSION, WARN, INFO, TRACE"
|
||||||
|
exit 1
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
fi
|
||||||
|
|
||||||
# --- Auto-Detection Logic ---
|
# --- Auto-Detection Logic ---
|
||||||
|
|
||||||
# Check for required tools if auto-detection is needed
|
# Check for required tools if auto-detection is needed
|
||||||
|
|||||||
Reference in New Issue
Block a user