diff --git a/Dockerfile.mxfp4 b/Dockerfile.mxfp4
new file mode 100644
index 0000000..3e3e5a7
--- /dev/null
+++ b/Dockerfile.mxfp4
@@ -0,0 +1,297 @@
+# syntax=docker/dockerfile:1.6
+
+# Limit build parallelism to reduce OOM situations
+ARG BUILD_JOBS=16
+
+# =========================================================
+# STAGE 1: Base Image (Installs Dependencies)
+# =========================================================
+FROM nvidia/cuda:13.1.0-devel-ubuntu24.04 AS base
+
+# Build parallelism: expose BUILD_JOBS to the build tools through their environment variables
+ARG BUILD_JOBS
+ENV MAX_JOBS=${BUILD_JOBS}
+ENV CMAKE_BUILD_PARALLEL_LEVEL=${BUILD_JOBS}
+ENV NINJAFLAGS="-j${BUILD_JOBS}"
+ENV MAKEFLAGS="-j${BUILD_JOBS}"
+
+# =============================================================================
+# Pinned versions from https://github.com/christopherowen/spark-vllm-mxfp4-docker/blob/main/Dockerfile
+# =============================================================================
+
+ARG VLLM_SHA=045293d82b832229560ac4a13152a095af603b6e
+#ARG FLASHINFER_SHA=1660ee8d740b0385f235519f9e2750db944d1838
+#ARG CUTLASS_SHA=11af7f02ab52c9130e422eeb4b44042fbd60c083
+
+ARG VLLM_REPO=https://github.com/christopherowen/vllm.git
+ARG FLASHINFER_REPO=https://github.com/christopherowen/flashinfer.git
+ARG CUTLASS_REPO=https://github.com/christopherowen/cutlass.git
+
+# Set non-interactive frontend to prevent apt prompts
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Allow pip to install globally on Ubuntu 24.04 without a venv
+ENV PIP_BREAK_SYSTEM_PACKAGES=1
+
+# Set pip/uv cache directories and make uv install into the system interpreter
+ENV PIP_CACHE_DIR=/root/.cache/pip
+ENV UV_CACHE_DIR=/root/.cache/uv
+ENV UV_SYSTEM_PYTHON=1
+ENV UV_BREAK_SYSTEM_PACKAGES=1
+ENV UV_LINK_MODE=copy
+
+# Set the base directory environment variable
+ENV VLLM_BASE_DIR=/workspace/vllm
+
+# 1. Install Build Dependencies & Ccache
+# Added ccache to enable incremental compilation caching
+# apt-get is used rather than apt: apt's command-line interface is not meant for scripts
+RUN apt-get update && apt-get upgrade -y \
+    && apt-get install -y --allow-change-held-packages --no-install-recommends \
+    curl vim cmake build-essential ninja-build \
+    libcudnn9-cuda-13 libcudnn9-dev-cuda-13 \
+    python3-dev python3-pip git wget \
+    libnccl-dev libnccl2 libibverbs1 libibverbs-dev rdma-core \
+    ccache \
+    && rm -rf /var/lib/apt/lists/* \
+    && pip install --no-cache-dir uv
+
+# Configure Ccache for CUDA/C++
+ENV PATH=/usr/lib/ccache:$PATH
+ENV CCACHE_DIR=/root/.ccache
+# Limit ccache size to prevent unbounded growth (e.g. 50G)
+ENV CCACHE_MAXSIZE=50G
+# Enable compression to save space
+ENV CCACHE_COMPRESS=1
+# Tell CMake to use ccache for compilation
+ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache
+ENV CMAKE_CUDA_COMPILER_LAUNCHER=ccache
+
+# Setup Workspace
+WORKDIR $VLLM_BASE_DIR
+
+# 2. Set Environment Variables
+ENV TORCH_CUDA_ARCH_LIST="12.0;12.1"
+ENV TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas
+
+# --- CACHE BUSTER ---
+# Change this argument to force a re-download of PyTorch/FlashInfer
+ARG CACHEBUST_DEPS=1
+
+# 3. Install Python Dependencies with Cache Mounts
+# Using --mount=type=cache ensures that even if this layer invalidates,
+# uv reuses previously downloaded wheels.
+
+RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
+    uv pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu130
+
+# Install additional dependencies
+RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
+    uv pip install xgrammar fastsafetensors triton
+
+# Optionally upgrade to a pre-release transformers (build with PRE_TRANSFORMERS=1)
+ARG PRE_TRANSFORMERS=0
+RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
+    if [ "$PRE_TRANSFORMERS" = "1" ]; then \
+        uv pip install -U transformers --pre; \
+    fi
+
+# =========================================================
+# STAGE 2: Flashinfer Builder
+# =========================================================
+FROM base AS flashinfer-builder
+
+WORKDIR $VLLM_BASE_DIR
+
+ARG FLASHINFER_SHA=1660ee8d740b0385f235519f9e2750db944d1838
+ARG CUTLASS_SHA=11af7f02ab52c9130e422eeb4b44042fbd60c083
+
+# Clone FlashInfer (cached for faster rebuilds).
+# The cache mount point always exists as a directory, so it is seeded with the
+# repo's *contents* ("/." form). Copying the directory itself would nest the repo
+# one level down and the ".git" probe below would never succeed.
+RUN --mount=type=cache,id=git-flashinfer,target=/git-cache/flashinfer \
+    if [ -d /git-cache/flashinfer/.git ]; then \
+        echo "=== Using cached FlashInfer repo ===" && \
+        cp -a /git-cache/flashinfer /workspace/flashinfer && \
+        cd /workspace/flashinfer && \
+        git fetch origin; \
+    else \
+        echo "=== Cloning FlashInfer (first build) ===" && \
+        find /git-cache/flashinfer -mindepth 1 -delete && \
+        git clone ${FLASHINFER_REPO} /workspace/flashinfer && \
+        cp -a /workspace/flashinfer/. /git-cache/flashinfer/; \
+    fi && \
+    cd /workspace/flashinfer && git checkout ${FLASHINFER_SHA}
+
+# Clone spdlog submodule (small, no caching needed)
+RUN cd /workspace/flashinfer && \
+    git submodule update --init 3rdparty/spdlog
+
+# Clone CUTLASS directly (skip submodule, use our fork)
+RUN --mount=type=cache,id=git-cutlass,target=/git-cache/cutlass \
+    cd /workspace/flashinfer && \
+    rm -rf 3rdparty/cutlass && \
+    if [ -d /git-cache/cutlass/.git ] && [ -d /git-cache/cutlass/.git/objects ]; then \
+        echo "=== Using cached CUTLASS repo ===" && \
+        cp -a /git-cache/cutlass 3rdparty/cutlass && \
+        cd 3rdparty/cutlass && \
+        git fetch origin; \
+    else \
+        echo "=== Cloning CUTLASS (first build) ===" && \
+        rm -rf /git-cache/cutlass/* /git-cache/cutlass/.* 2>/dev/null || true && \
+        git clone ${CUTLASS_REPO} 3rdparty/cutlass && \
+        cp -a /workspace/flashinfer/3rdparty/cutlass/. /git-cache/cutlass/; \
+    fi && \
+    cd /workspace/flashinfer/3rdparty/cutlass && git checkout ${CUTLASS_SHA}
+
+# Build FlashInfer wheels
+
+WORKDIR /workspace/flashinfer
+
+# flashinfer-python
+RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
+    --mount=type=cache,id=ccache,target=/root/.ccache \
+    uv build --no-build-isolation --out-dir=./wheels .
+
+# flashinfer-cubin
+# Built with "uv build" like flashinfer-python: nothing in this file installs a
+# "python" executable (only python3 packages) or the PyPA "build" module.
+RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
+    --mount=type=cache,id=ccache,target=/root/.ccache \
+    cd flashinfer-cubin && uv build --no-build-isolation --wheel --out-dir=../wheels .
+
+# flashinfer-jit-cache
+RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
+    --mount=type=cache,id=ccache,target=/root/.ccache \
+    cd flashinfer-jit-cache && \
+    uv pip install -r requirements.txt && \
+    uv build --no-build-isolation --wheel --out-dir=../wheels .
+
+# =========================================================
+# STAGE 3: vLLM Builder (Builds vLLM from Source)
+# =========================================================
+FROM base AS builder
+
+# --- VLLM SOURCE CACHE BUSTER ---
+# Change THIS argument to force a fresh git clone and rebuild of vLLM
+# without re-installing the dependencies above.
+ARG CACHEBUST_VLLM=1
+
+# Git reference (branch, tag, or SHA) to checkout
+# NOTE: no instruction in this file reads VLLM_REF; the checkout below uses VLLM_SHA.
+ARG VLLM_REF=main
+
+# 4. Smart Git Clone (Fetch changes instead of full re-clone)
+# We mount a cache at /repo-cache. This directory persists on your host machine.
+#   1. Clone into the cache if it holds no repo yet, otherwise fetch updates.
+#   2. Check out VLLM_SHA on BOTH paths, sync submodules, let git compact the repo.
+#   3. Copy the tree into the workspace with 'cp -a' (preserves permissions). The "/."
+#      form also copies dotfiles such as .git (NOTE(review): presumably needed by the vLLM build - confirm).
+RUN --mount=type=cache,id=repo-cache,target=/repo-cache \
+    cd /repo-cache && \
+    if [ ! -d "vllm-mxfp4/.git" ]; then \
+        echo "Cache miss: Cloning vLLM from scratch..." && \
+        rm -rf vllm-mxfp4 && \
+        git clone --recursive ${VLLM_REPO} vllm-mxfp4; \
+    else \
+        echo "Cache hit: Fetching updates..." && \
+        git -C vllm-mxfp4 fetch --all; \
+    fi && \
+    cd vllm-mxfp4 && \
+    git checkout ${VLLM_SHA} && \
+    if [ "${VLLM_SHA}" = "main" ]; then \
+        git reset --hard origin/main; \
+    fi && \
+    git submodule update --init --recursive && \
+    git gc --auto && \
+    mkdir -p $VLLM_BASE_DIR/vllm && \
+    cp -a /repo-cache/vllm-mxfp4/. $VLLM_BASE_DIR/vllm/
+
+WORKDIR $VLLM_BASE_DIR/vllm
+
+ARG PRE_TRANSFORMERS=0
+
+# Prepare build requirements
+RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
+    python3 use_existing_torch.py && \
+    sed -i "/flashinfer/d" requirements/cuda.txt && \
+    sed -i '/^triton\b/d' requirements/test.txt && \
+    sed -i '/^fastsafetensors\b/d' requirements/test.txt && \
+    if [ "$PRE_TRANSFORMERS" = "1" ]; then \
+        sed -i '/^transformers\b/d' requirements/common.txt; \
+        sed -i '/^transformers\b/d' requirements/test.txt; \
+    fi && \
+    uv pip install -r requirements/build.txt
+
+# Apply Patches
+# TEMPORARY PATCH for fastsafetensors loading in cluster setup - tracking https://github.com/foundation-model-stack/fastsafetensors/issues/36
+COPY fastsafetensors.patch .
+RUN patch -p1 < fastsafetensors.patch
+
+# Final Compilation
+# We mount the ccache directory here. Ideally, map this to a host volume for persistence
+# across totally separate `docker build` invocations.
+RUN --mount=type=cache,id=ccache,target=/root/.ccache \
+    --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
+    uv pip install --no-build-isolation . -v
+
+# Install custom Flashinfer from flashinfer-builder
+COPY --from=flashinfer-builder /workspace/flashinfer/wheels /workspace/wheels
+RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
+    uv pip install --no-deps /workspace/wheels/*.whl
+
+# =========================================================
+# STAGE 4: Runner (Transfers only necessary artifacts)
+# =========================================================
+FROM nvidia/cuda:13.1.0-devel-ubuntu24.04 AS runner
+
+ENV DEBIAN_FRONTEND=noninteractive
+ENV PIP_BREAK_SYSTEM_PACKAGES=1
+ENV VLLM_BASE_DIR=/workspace/vllm
+
+# Set pip cache directory
+ENV PIP_CACHE_DIR=/root/.cache/pip
+ENV UV_CACHE_DIR=/root/.cache/uv
+ENV UV_SYSTEM_PYTHON=1
+ENV UV_BREAK_SYSTEM_PACKAGES=1
+ENV UV_LINK_MODE=copy
+
+# Install minimal runtime dependencies (NCCL, Python)
+# Note: "devel" tools like cmake/gcc are NOT installed here to save space
+RUN apt-get update && apt-get upgrade -y \
+    && apt-get install -y --allow-change-held-packages --no-install-recommends \
+    python3 python3-pip python3-dev vim curl git wget \
+    libcudnn9-cuda-13 \
+    libnccl-dev libnccl2 libibverbs1 libibverbs-dev rdma-core \
+    libxcb1 \
+    && rm -rf /var/lib/apt/lists/*
+
+# Set final working directory
+WORKDIR $VLLM_BASE_DIR
+
+# Download Tiktoken files
+RUN mkdir -p tiktoken_encodings && \
+    wget -O tiktoken_encodings/o200k_base.tiktoken "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken" && \
+    wget -O tiktoken_encodings/cl100k_base.tiktoken "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken"
+
+# Copy artifacts from Builder Stage
+# We copy the python packages and executables
+# No need to copy source code, as it's already in the site-packages
+COPY --from=builder /usr/local/lib/python3.12/dist-packages /usr/local/lib/python3.12/dist-packages
+COPY --from=builder /usr/local/bin /usr/local/bin
+
+# Setup Env for Runtime
+ENV TORCH_CUDA_ARCH_LIST="12.0;12.1"
+ENV TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas
+ENV TIKTOKEN_ENCODINGS_BASE=$VLLM_BASE_DIR/tiktoken_encodings
+ENV PATH=$VLLM_BASE_DIR:$PATH
+
+# Copy scripts
+COPY run-cluster-node.sh $VLLM_BASE_DIR/
+RUN chmod +x $VLLM_BASE_DIR/run-cluster-node.sh
+
+# Final extra deps
+# NOTE(review): uv is not installed in this stage; presumably it arrives with the /usr/local/bin copy above - confirm.
+RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
+    uv pip install "ray[default]"