Merge branch 'main' into pytorch-base
This commit is contained in:
@@ -256,12 +256,9 @@ RUN mkdir -p tiktoken_encodings && \
|
|||||||
wget -O tiktoken_encodings/cl100k_base.tiktoken "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken"
|
wget -O tiktoken_encodings/cl100k_base.tiktoken "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken"
|
||||||
|
|
||||||
# Copy artifacts from Builder Stage
|
# Copy artifacts from Builder Stage
|
||||||
# We copy the python packages and executables
|
RUN --mount=type=bind,from=builder,source=/workspace/wheels,target=/mount/wheels \
|
||||||
# No need to copy source code, as it's already in the site-packages
|
--mount=type=cache,id=uv-cache,target=/root/.cache/uv \
|
||||||
COPY --from=builder /workspace/wheels /workspace/wheels
|
uv pip install /mount/wheels/*.whl
|
||||||
RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
|
|
||||||
uv pip install /workspace/wheels/*.whl && \
|
|
||||||
rm -rf /workspace/wheels
|
|
||||||
|
|
||||||
# Setup Env for Runtime
|
# Setup Env for Runtime
|
||||||
ENV TORCH_CUDA_ARCH_LIST="12.0;12.1"
|
ENV TORCH_CUDA_ARCH_LIST="12.0;12.1"
|
||||||
|
|||||||
@@ -144,24 +144,22 @@ for i in "${!MOD_PATHS[@]}"; do
|
|||||||
done
|
done
|
||||||
|
|
||||||
# --- Auto-Detection Logic ---
|
# --- Auto-Detection Logic ---
|
||||||
|
|
||||||
# Source autodiscover module
|
# Source autodiscover module
|
||||||
source "$(dirname "$0")/autodiscover.sh"
|
source "$(dirname "$0")/autodiscover.sh"
|
||||||
|
|
||||||
# Perform auto-detection
|
|
||||||
detect_interfaces || exit 1
|
|
||||||
|
|
||||||
if [[ "$SOLO_MODE" == "true" ]]; then
|
if [[ "$SOLO_MODE" == "true" ]]; then
|
||||||
if [[ -n "$NODES_ARG" ]]; then
|
if [[ -n "$NODES_ARG" ]]; then
|
||||||
echo "Error: --solo is incompatible with -n/--nodes."
|
echo "Error: --solo is incompatible with -n/--nodes."
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
# Solo mode: skip node detection, just get local IP
|
# Solo mode: skip node detection, just get local IP
|
||||||
detect_local_ip || exit 1
|
LOCAL_IP="127.0.0.1"
|
||||||
NODES_ARG="$LOCAL_IP"
|
NODES_ARG="$LOCAL_IP"
|
||||||
PEER_NODES=()
|
PEER_NODES=()
|
||||||
echo "Solo mode enabled. Skipping node detection."
|
echo "Solo mode enabled. Skipping node detection."
|
||||||
else
|
else
|
||||||
|
# Perform auto-detection
|
||||||
|
detect_interfaces || exit 1
|
||||||
detect_nodes || exit 1
|
detect_nodes || exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
@@ -173,8 +171,11 @@ fi
|
|||||||
# Split nodes into array
|
# Split nodes into array
|
||||||
IFS=',' read -r -a ALL_NODES <<< "$NODES_ARG"
|
IFS=',' read -r -a ALL_NODES <<< "$NODES_ARG"
|
||||||
|
|
||||||
|
if [[ "$SOLO_MODE" != "true" ]]; then
|
||||||
# Detect Head IP (Local IP)
|
# Detect Head IP (Local IP)
|
||||||
detect_local_ip || exit 1
|
detect_local_ip || exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
HEAD_IP="$LOCAL_IP"
|
HEAD_IP="$LOCAL_IP"
|
||||||
|
|
||||||
# Verify HEAD_IP is in ALL_NODES
|
# Verify HEAD_IP is in ALL_NODES
|
||||||
|
|||||||
Reference in New Issue
Block a user