Fixed cluster script and small fix for Dockerfile

This commit is contained in:
eugr
2025-11-24 15:45:04 -08:00
parent 5c8feb086c
commit f5141974ae
2 changed files with 13 additions and 13 deletions

View File

@@ -56,7 +56,7 @@ RUN pip install flashinfer-python --no-deps --index-url https://flashinfer.ai/wh
pip install flashinfer-jit-cache --index-url https://flashinfer.ai/whl/cu130
# Install fast safetensors to improve loading speeds
RUN pip install fastsafetensors>=0.1.10
RUN pip install fastsafetensors
# --- VLLM SOURCE CACHE BUSTER ---
# Change THIS argument to force a fresh git clone and rebuild of vLLM

View File

@@ -82,24 +82,24 @@ fi
echo "Configuring environment for [$NODE_TYPE] at $HOST_IP..."
export_persist VLLM_HOST_IP=$HOST_IP
export_persist RAY_NODE_IP_ADDRESS=$VLLM_HOST_IP
export_persist RAY_OVERRIDE_NODE_IP_ADDRESS=$VLLM_HOST_IP
export_persist VLLM_HOST_IP "$HOST_IP"
export_persist RAY_NODE_IP_ADDRESS "$HOST_IP"
export_persist RAY_OVERRIDE_NODE_IP_ADDRESS "$HOST_IP"
# Network Interface
export_persist MN_IF_NAME=$ETH_IF_NAME
export_persist UCX_NET_DEVICES=$MN_IF_NAME
export_persist NCCL_SOCKET_IFNAME=$MN_IF_NAME
export_persist MN_IF_NAME "$ETH_IF_NAME"
export_persist UCX_NET_DEVICES "$ETH_IF_NAME"
export_persist NCCL_SOCKET_IFNAME "$ETH_IF_NAME"
# InfiniBand
export_persist NCCL_IB_HCA=$IB_IF_NAME
export_persist NCCL_IB_DISABLE=0
export_persist NCCL_IB_HCA "$IB_IF_NAME"
export_persist NCCL_IB_DISABLE "0"
# Sockets/Transport
export_persist OMPI_MCA_btl_tcp_if_include=$MN_IF_NAME
export_persist GLOO_SOCKET_IFNAME=$MN_IF_NAME
export_persist TP_SOCKET_IFNAME=$MN_IF_NAME
export_persist RAY_memory_monitor_refresh_ms=0
export_persist OMPI_MCA_btl_tcp_if_include "$ETH_IF_NAME"
export_persist GLOO_SOCKET_IFNAME "$ETH_IF_NAME"
export_persist TP_SOCKET_IFNAME "$ETH_IF_NAME"
export_persist RAY_memory_monitor_refresh_ms "0"
# --- Execution ---