diff --git a/Dockerfile.mxfp4 b/Dockerfile.mxfp4
index e00184f..b1ed0ff 100644
--- a/Dockerfile.mxfp4
+++ b/Dockerfile.mxfp4
@@ -256,12 +256,9 @@ RUN mkdir -p tiktoken_encodings && \
     wget -O tiktoken_encodings/cl100k_base.tiktoken "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken"
 
 # Copy artifacts from Builder Stage
-# We copy the python packages and executables
-# No need to copy source code, as it's already in the site-packages
-COPY --from=builder /workspace/wheels /workspace/wheels
-RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
-    uv pip install /workspace/wheels/*.whl && \
-    rm -rf /workspace/wheels
+RUN --mount=type=bind,from=builder,source=/workspace/wheels,target=/mount/wheels \
+    --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
+    uv pip install /mount/wheels/*.whl
 
 # Setup Env for Runtime
 ENV TORCH_CUDA_ARCH_LIST="12.0;12.1"
diff --git a/launch-cluster.sh b/launch-cluster.sh
index 283d9df..f3e645d 100755
--- a/launch-cluster.sh
+++ b/launch-cluster.sh
@@ -144,24 +144,22 @@ for i in "${!MOD_PATHS[@]}"; do
 done
 
 # --- Auto-Detection Logic ---
-
 # Source autodiscover module
 source "$(dirname "$0")/autodiscover.sh"
 
-# Perform auto-detection
-detect_interfaces || exit 1
-
 if [[ "$SOLO_MODE" == "true" ]]; then
     if [[ -n "$NODES_ARG" ]]; then
         echo "Error: --solo is incompatible with -n/--nodes."
         exit 1
     fi
     # Solo mode: skip node detection, just get local IP
-    detect_local_ip || exit 1
+    LOCAL_IP="127.0.0.1"
     NODES_ARG="$LOCAL_IP"
     PEER_NODES=()
     echo "Solo mode enabled. Skipping node detection."
 else
+    # Perform auto-detection
+    detect_interfaces || exit 1
     detect_nodes || exit 1
 fi
 
@@ -173,8 +171,11 @@ fi
 # Split nodes into array
 IFS=',' read -r -a ALL_NODES <<< "$NODES_ARG"
 
-# Detect Head IP (Local IP)
-detect_local_ip || exit 1
+if [[ "$SOLO_MODE" != "true" ]]; then
+    # Detect Head IP (Local IP)
+    detect_local_ip || exit 1
+fi
+
 HEAD_IP="$LOCAL_IP"
 
 # Verify HEAD_IP is in ALL_NODES