Use mesh-optimized NCCL
This commit is contained in:
18
Dockerfile
18
Dockerfile
@@ -39,7 +39,7 @@ RUN apt update && \
|
|||||||
libcudnn9-cuda-13 libcudnn9-dev-cuda-13 \
|
libcudnn9-cuda-13 libcudnn9-dev-cuda-13 \
|
||||||
python3-dev python3-pip git wget \
|
python3-dev python3-pip git wget \
|
||||||
libnccl-dev libnccl2 libibverbs1 libibverbs-dev rdma-core \
|
libnccl-dev libnccl2 libibverbs1 libibverbs-dev rdma-core \
|
||||||
ccache \
|
ccache devscripts debhelper fakeroot \
|
||||||
&& rm -rf /var/lib/apt/lists/* \
|
&& rm -rf /var/lib/apt/lists/* \
|
||||||
&& pip install uv
|
&& pip install uv
|
||||||
|
|
||||||
@@ -59,14 +59,19 @@ ENV CCACHE_COMPRESS=1
|
|||||||
ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache
|
ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache
|
||||||
ENV CMAKE_CUDA_COMPILER_LAUNCHER=ccache
|
ENV CMAKE_CUDA_COMPILER_LAUNCHER=ccache
|
||||||
|
|
||||||
# Setup Workspace
|
|
||||||
WORKDIR $VLLM_BASE_DIR
|
|
||||||
|
|
||||||
# 2. Set Environment Variables
|
# 2. Set Environment Variables
|
||||||
ARG TORCH_CUDA_ARCH_LIST="12.1a"
|
ARG TORCH_CUDA_ARCH_LIST="12.1a"
|
||||||
ENV TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}
|
ENV TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}
|
||||||
ENV TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas
|
ENV TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas
|
||||||
|
|
||||||
|
# Setup Workspace
|
||||||
|
WORKDIR $VLLM_BASE_DIR
|
||||||
|
|
||||||
|
# Build NCCL with mesh support (TODO: only do it if arch is 12.1) - cloned under the current WORKDIR, so artifacts will be in $VLLM_BASE_DIR/nccl/build/pkg/deb
|
||||||
|
RUN git clone -b dgxspark-3node-ring https://github.com/zyang-dev/nccl.git && \
|
||||||
|
cd nccl && make -j ${BUILD_JOBS} src.build NVCC_GENCODE="-gencode=arch=compute_121,code=sm_121" && \
|
||||||
|
make pkg.debian.build && apt install -y --no-install-recommends ./build/pkg/deb/*.deb
|
||||||
|
|
||||||
# =========================================================
|
# =========================================================
|
||||||
# STAGE 2: FlashInfer Builder
|
# STAGE 2: FlashInfer Builder
|
||||||
# =========================================================
|
# =========================================================
|
||||||
@@ -234,13 +239,16 @@ ENV UV_SYSTEM_PYTHON=1
|
|||||||
ENV UV_BREAK_SYSTEM_PACKAGES=1
|
ENV UV_BREAK_SYSTEM_PACKAGES=1
|
||||||
ENV UV_LINK_MODE=copy
|
ENV UV_LINK_MODE=copy
|
||||||
|
|
||||||
|
# Bind-mount the NCCL .deb packages built in the base stage at /workspace/nccl-pkg so they can be installed below
|
||||||
# Install runtime dependencies
|
# Install runtime dependencies
|
||||||
RUN apt update && \
|
RUN --mount=type=bind,from=base,source=/workspace/vllm/nccl/build/pkg/deb,target=/workspace/nccl-pkg \
|
||||||
|
apt update && \
|
||||||
apt install -y --no-install-recommends \
|
apt install -y --no-install-recommends \
|
||||||
python3 python3-pip python3-dev vim curl git wget \
|
python3 python3-pip python3-dev vim curl git wget \
|
||||||
libcudnn9-cuda-13 \
|
libcudnn9-cuda-13 \
|
||||||
libnccl-dev libnccl2 libibverbs1 libibverbs-dev rdma-core \
|
libnccl-dev libnccl2 libibverbs1 libibverbs-dev rdma-core \
|
||||||
libxcb1 \
|
libxcb1 \
|
||||||
|
&& cd /workspace/nccl-pkg && apt install -y --no-install-recommends --allow-downgrades ./*.deb \
|
||||||
&& rm -rf /var/lib/apt/lists/* \
|
&& rm -rf /var/lib/apt/lists/* \
|
||||||
&& pip install uv
|
&& pip install uv
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user