From f5141974aea5c3d9afb6d4d1f86d6212fdfbaf28 Mon Sep 17 00:00:00 2001 From: eugr Date: Mon, 24 Nov 2025 15:45:04 -0800 Subject: [PATCH] Fixed cluster script and small fix for Dockerfile --- Dockerfile | 2 +- run-cluster-node.sh | 24 ++++++++++++------------ 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/Dockerfile b/Dockerfile index 5555ebb..841c93c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -56,7 +56,7 @@ RUN pip install flashinfer-python --no-deps --index-url https://flashinfer.ai/wh pip install flashinfer-jit-cache --index-url https://flashinfer.ai/whl/cu130 # Install fast safetensors to improve loading speeds -RUN pip install fastsafetensors>=0.1.10 +RUN pip install fastsafetensors # --- VLLM SOURCE CACHE BUSTER --- # Change THIS argument to force a fresh git clone and rebuild of vLLM diff --git a/run-cluster-node.sh b/run-cluster-node.sh index 345659d..8e24b60 100755 --- a/run-cluster-node.sh +++ b/run-cluster-node.sh @@ -82,24 +82,24 @@ fi echo "Configuring environment for [$NODE_TYPE] at $HOST_IP..." 
-export_persist VLLM_HOST_IP=$HOST_IP -export_persist RAY_NODE_IP_ADDRESS=$VLLM_HOST_IP -export_persist RAY_OVERRIDE_NODE_IP_ADDRESS=$VLLM_HOST_IP +export_persist VLLM_HOST_IP "$HOST_IP" +export_persist RAY_NODE_IP_ADDRESS "$HOST_IP" +export_persist RAY_OVERRIDE_NODE_IP_ADDRESS "$HOST_IP" # Network Interface -export_persist MN_IF_NAME=$ETH_IF_NAME -export_persist UCX_NET_DEVICES=$MN_IF_NAME -export_persist NCCL_SOCKET_IFNAME=$MN_IF_NAME +export_persist MN_IF_NAME "$ETH_IF_NAME" +export_persist UCX_NET_DEVICES "$ETH_IF_NAME" +export_persist NCCL_SOCKET_IFNAME "$ETH_IF_NAME" # InfiniBand -export_persist NCCL_IB_HCA=$IB_IF_NAME -export_persist NCCL_IB_DISABLE=0 +export_persist NCCL_IB_HCA "$IB_IF_NAME" +export_persist NCCL_IB_DISABLE "0" # Sockets/Transport -export_persist OMPI_MCA_btl_tcp_if_include=$MN_IF_NAME -export_persist GLOO_SOCKET_IFNAME=$MN_IF_NAME -export_persist TP_SOCKET_IFNAME=$MN_IF_NAME -export_persist RAY_memory_monitor_refresh_ms=0 +export_persist OMPI_MCA_btl_tcp_if_include "$ETH_IF_NAME" +export_persist GLOO_SOCKET_IFNAME "$ETH_IF_NAME" +export_persist TP_SOCKET_IFNAME "$ETH_IF_NAME" +export_persist RAY_memory_monitor_refresh_ms "0" # --- Execution ---