Added ability to launch NGC container in the cluster

Eugene Rakhmatulin
2026-02-02 16:57:04 -08:00
parent 4634ee92a2
commit 4b9ab0de7c
5 changed files with 162 additions and 4 deletions

View File

@@ -146,6 +146,8 @@ For periodic maintenance, I recommend using a filter: `docker builder prune --fi
### 2026-02-02
#### Nemotron Nano mod
Added a mod for nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B support. It supports all Nemotron Nano models/quants using the same reasoning parser.
To use, add `--apply-mod mods/nemotron-nano` to `./launch-cluster.sh` arguments.
@@ -172,6 +174,38 @@ For example, to run nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-NVFP4 on a single node
Please note that NVFP4 models on Spark are not yet fully supported by vLLM (any build), so performance will not be optimal. You will likely see FlashInfer errors during load. This model is also known to crash occasionally.
#### Ability to use launch-cluster.sh with NVIDIA NGC containers
Added a new mod that enables using the cluster launch script with the NVIDIA NGC vLLM container, or with any other vLLM container that includes InfiniBand libraries and Ray support.
To use, add `--apply-mod mods/use-ngc-vllm` to `./launch-cluster.sh` arguments. It can be combined with other mods.
For example, to launch Nemotron Nano in the cluster using the NGC container, you can use the following command:
```bash
./launch-cluster.sh \
    -t nvcr.io/nvidia/vllm:26.01-py3 \
    --apply-mod mods/use-ngc-vllm \
    --apply-mod mods/nemotron-nano \
    -e VLLM_USE_FLASHINFER_MOE_FP4=1 \
    -e VLLM_FLASHINFER_MOE_BACKEND=throughput \
    exec vllm serve nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-NVFP4 \
        --max-model-len 262144 \
        --port 8888 --host 0.0.0.0 \
        --trust-remote-code \
        --enable-auto-tool-choice \
        --tool-call-parser qwen3_coder \
        --reasoning-parser-plugin nano_v3_reasoning_parser.py \
        --reasoning-parser nano_v3 \
        --kv-cache-dtype fp8 \
        --gpu-memory-utilization 0.7 \
        --tensor-parallel-size 2 \
        --distributed-executor-backend ray
```
Make sure you have the container pulled on both nodes!
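For example, assuming the second node is reachable over SSH (the hostname below is a placeholder):
```bash
# On the head node:
docker pull nvcr.io/nvidia/vllm:26.01-py3
# On the worker node (run directly there, or via SSH):
ssh spark-node2 docker pull nvcr.io/nvidia/vllm:26.01-py3
```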
At this point, the NGC container doesn't seem to perform any better for this model than a custom build.
### 2026-01-29
#### New Parameters for launch-cluster.sh

View File

@@ -403,14 +403,15 @@ apply_mod_to_container() {
     # 3. Run run.sh
     echo " Running patch script on $node_ip..."
-    local exec_cmd="cd $container_dest && chmod +x run.sh && ./run.sh"
+    local local_exec_cmd="export WORKSPACE_DIR=\$PWD && cd $container_dest && chmod +x run.sh && ./run.sh"
+    local remote_exec_cmd="export WORKSPACE_DIR=\\\$PWD && cd $container_dest && chmod +x run.sh && ./run.sh"
     local ret_code=0
     if [[ "$is_local" == "true" ]]; then
-        docker exec "$container" bash -c "$exec_cmd"
+        docker exec "$container" bash -c "$local_exec_cmd"
         ret_code=$?
     else
-        $cmd_prefix docker exec "$container" bash -c "\"$exec_cmd\""
+        $cmd_prefix docker exec "$container" bash -c "\"$remote_exec_cmd\""
         ret_code=$?
     fi
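The two variants exist because the remote path goes through one extra shell before the command reaches the container. A minimal sketch of the difference (the container and host names below are placeholders, not from the repo):
```bash
# Local: your shell turns \$PWD into the literal string $PWD, which the
# bash running inside the container then expands.
docker exec my-container bash -c "export WORKSPACE_DIR=\$PWD && pwd"

# Remote: ssh hands the command to the remote shell first, so it is parsed
# twice before the container's bash runs it. \\\$PWD collapses to \$PWD on
# the remote host, and then to $PWD inside the container.
ssh node2 docker exec my-container bash -c "\"export WORKSPACE_DIR=\\\$PWD && pwd\""
```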

View File

@@ -1,4 +1,4 @@
 #!/bin/bash
 set -e
-cd $VLLM_BASE_DIR
+cd $WORKSPACE_DIR
 wget https://huggingface.co/nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-NVFP4/resolve/main/nano_v3_reasoning_parser.py

View File

@@ -0,0 +1,117 @@
#!/bin/bash
set -e
# Define a function to export immediately AND save to .bashrc for future sessions
export_persist() {
    local var_name="$1"
    local var_value="$2"
    # 1. Export for the current running process
    export "$var_name"="$var_value"
    # 2. Append to .bashrc (idempotent check to avoid duplicate lines)
    if ! grep -q "export $var_name=" ~/.bashrc; then
        echo "export $var_name=\"$var_value\"" >> ~/.bashrc
    else
        # Otherwise update the existing line in place
        sed -i "s|export $var_name=.*|export $var_name=\"$var_value\"|" ~/.bashrc
    fi
}
# --- Help Function ---
usage() {
    echo "Usage: $0 [OPTIONS]"
    echo ""
    echo "Required Arguments:"
    echo "  -r, --role <head|node> : Set the node type"
    echo "  -h, --host-ip <ip>     : IP address of this node (Host IP)"
    echo "  -e, --eth-if <name>    : Ethernet interface name (e.g., eth0)"
    echo "  -i, --ib-if <name>     : InfiniBand/RDMA interface name"
    echo ""
    echo "Conditional Arguments:"
    echo "  -m, --head-ip <ip>     : IP of the head node (REQUIRED if role is 'node')"
    echo ""
    echo "Example:"
    echo "  $0 --role head --host-ip 192.168.1.10 --eth-if eth0 --ib-if ib0"
    echo "  $0 --role node --host-ip 192.168.1.20 --eth-if eth0 --ib-if ib0 --head-ip 192.168.1.10"
    exit 1
}
# --- Argument Parsing ---
# Initialize variables to empty
NODE_TYPE=""
HOST_IP=""
ETH_IF_NAME=""
IB_IF_NAME=""
HEAD_IP=""
while [[ "$#" -gt 0 ]]; do
    case $1 in
        -r|--role) NODE_TYPE="$2"; shift ;;
        -h|--host-ip) HOST_IP="$2"; shift ;;
        -e|--eth-if) ETH_IF_NAME="$2"; shift ;;
        -i|--ib-if) IB_IF_NAME="$2"; shift ;;
        -m|--head-ip) HEAD_IP="$2"; shift ;;
        *) echo "Unknown parameter passed: $1"; usage ;;
    esac
    shift
done
# --- Validation ---
# 1. Check if all common required arguments are present
if [[ -z "$NODE_TYPE" || -z "$HOST_IP" || -z "$ETH_IF_NAME" || -z "$IB_IF_NAME" ]]; then
    echo "Error: Missing required arguments."
    usage
fi
# 2. Validate Role
if [[ "$NODE_TYPE" != "head" && "$NODE_TYPE" != "node" ]]; then
    echo "Error: --role must be 'head' or 'node'."
    exit 1
fi
# 3. Conditional Check for Head IP
if [[ "$NODE_TYPE" == "node" && -z "$HEAD_IP" ]]; then
    echo "Error: When --role is 'node', you must provide --head-ip."
    exit 1
fi
# --- Environment Configuration ---
echo "Configuring environment for [$NODE_TYPE] at $HOST_IP..."
export_persist VLLM_HOST_IP "$HOST_IP"
export_persist RAY_NODE_IP_ADDRESS "$HOST_IP"
export_persist RAY_OVERRIDE_NODE_IP_ADDRESS "$HOST_IP"
# Network Interface
export_persist MN_IF_NAME "$ETH_IF_NAME"
export_persist UCX_NET_DEVICES "$ETH_IF_NAME"
export_persist NCCL_SOCKET_IFNAME "$ETH_IF_NAME"
# InfiniBand
export_persist NCCL_IB_HCA "$IB_IF_NAME"
export_persist NCCL_IB_DISABLE "0"
# Sockets/Transport
export_persist OMPI_MCA_btl_tcp_if_include "$ETH_IF_NAME"
export_persist GLOO_SOCKET_IFNAME "$ETH_IF_NAME"
export_persist TP_SOCKET_IFNAME "$ETH_IF_NAME"
export_persist RAY_memory_monitor_refresh_ms "0"
# --- Execution ---
if [ "${NODE_TYPE}" == "head" ]; then
echo "Starting Ray HEAD node..."
exec ray start --block --head --port 6379 \
--node-ip-address "$VLLM_HOST_IP" \
--disable-usage-stats
else
echo "Starting Ray WORKER node connecting to $HEAD_IP..."
exec ray start --block \
--address="$HEAD_IP:6379" \
--node-ip-address "$VLLM_HOST_IP"
fi
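Once the script is running on both nodes, a quick sanity check from the head node confirms the cluster formed (a sketch; the container name is a placeholder):
```bash
# Expect the output to list two healthy nodes.
docker exec vllm-head ray status
```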

mods/use-ngc-vllm/run.sh Normal file
View File

@@ -0,0 +1,6 @@
#!/bin/bash
set -e
echo "Setting up cluster initialization script..."
cp run-cluster-node.sh "$WORKSPACE_DIR/run-cluster-node.sh"
chmod +x "$WORKSPACE_DIR/run-cluster-node.sh"