Build NCCL with mesh support from source and install it in the images.

* Base stage: add the Debian packaging tools (devscripts, debhelper,
  fakeroot), move the WORKDIR below the arch ENV block, then clone the
  dgxspark-3node-ring NCCL branch, build it for sm_121 and install the
  resulting .deb packages.
* Stage that installs the runtime dependencies: bind-mount the .deb
  packages from the base stage and install them over the distribution
  libnccl packages.

Notes on the added lines:
* "make -j" falls back to $(nproc) when BUILD_JOBS is empty, because a bare
  "make -j" runs an unlimited number of jobs.
* Both local .deb installs pass --allow-downgrades, since the locally built
  NCCL version may be older than the libnccl2/libnccl-dev already installed
  from the repository.
* The clone is shallow (--depth 1); only the branch tip is built.

diff --git a/Dockerfile b/Dockerfile
index ef0d766..75c16e6 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -39,7 +39,7 @@ RUN apt update && \
     libcudnn9-cuda-13 libcudnn9-dev-cuda-13 \
     python3-dev python3-pip git wget \
     libnccl-dev libnccl2 libibverbs1 libibverbs-dev rdma-core \
-    ccache \
+    ccache devscripts debhelper fakeroot \
     && rm -rf /var/lib/apt/lists/* \
     && pip install uv
 
@@ -59,14 +59,19 @@ ENV CCACHE_COMPRESS=1
 ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache
 ENV CMAKE_CUDA_COMPILER_LAUNCHER=ccache
 
-# Setup Workspace
-WORKDIR $VLLM_BASE_DIR
-
 # 2. Set Environment Variables
 ARG TORCH_CUDA_ARCH_LIST="12.1a"
 ENV TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}
 ENV TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas
 
+# Setup Workspace
+WORKDIR $VLLM_BASE_DIR
+
+# Build NCCL with mesh support (TODO: only do it if arch is 12.1) - artifacts will be in $VLLM_BASE_DIR/nccl/build/pkg/deb
+RUN git clone --depth 1 -b dgxspark-3node-ring https://github.com/zyang-dev/nccl.git && \
+    cd nccl && make -j "${BUILD_JOBS:-$(nproc)}" src.build NVCC_GENCODE="-gencode=arch=compute_121,code=sm_121" && \
+    make pkg.debian.build && apt install -y --no-install-recommends --allow-downgrades ./build/pkg/deb/*.deb
+
 # =========================================================
 # STAGE 2: FlashInfer Builder
 # =========================================================
@@ -234,13 +239,16 @@ ENV UV_SYSTEM_PYTHON=1
 ENV UV_BREAK_SYSTEM_PACKAGES=1
 ENV UV_LINK_MODE=copy
 
+# Bind-mount the NCCL .deb packages built in the base stage (source path assumes VLLM_BASE_DIR=/workspace/vllm - confirm)
 # Install runtime dependencies
-RUN apt update && \
+RUN --mount=type=bind,from=base,source=/workspace/vllm/nccl/build/pkg/deb,target=/workspace/nccl-pkg \
+    apt update && \
     apt install -y --no-install-recommends \
     python3 python3-pip python3-dev vim curl git wget \
     libcudnn9-cuda-13 \
     libnccl-dev libnccl2 libibverbs1 libibverbs-dev rdma-core \
     libxcb1 \
+    && apt install -y --no-install-recommends --allow-downgrades /workspace/nccl-pkg/*.deb \
     && rm -rf /var/lib/apt/lists/* \
     && pip install uv
 