Initial import of MXFP4 branch

2026-01-24 22:40:36 -08:00
parent 25a16ef6c2
commit aece2fad78
1 changed files with 286 additions and 0 deletions
--- a/Dockerfile.mxfp4
+++ b/Dockerfile.mxfp4
@@ -0,0 +1,286 @@
+# syntax=docker/dockerfile:1.6
+
+# Default number of parallel build jobs; kept bounded to reduce OOM situations
+ARG BUILD_JOBS=16
+
+# =========================================================
+# STAGE 1: Base Image (Installs Dependencies)
+# =========================================================
+FROM nvidia/cuda:13.1.0-devel-ubuntu24.04 AS base
+
+# Build parallelism: re-declare the global ARG in this stage and export it to the build tools
+ARG BUILD_JOBS
+ENV MAX_JOBS=${BUILD_JOBS}
+ENV CMAKE_BUILD_PARALLEL_LEVEL=${BUILD_JOBS}
+ENV NINJAFLAGS="-j${BUILD_JOBS}"
+ENV MAKEFLAGS="-j${BUILD_JOBS}"
+
+# =============================================================================
+# Pinned versions from https://github.com/christopherowen/spark-vllm-mxfp4-docker/blob/main/Dockerfile
+# =============================================================================
+# vLLM commit to build (the FlashInfer/CUTLASS pins are declared in the flashinfer-builder stage)
+ARG VLLM_SHA=045293d82b832229560ac4a13152a095af603b6e
+#ARG FLASHINFER_SHA=1660ee8d740b0385f235519f9e2750db944d1838
+#ARG CUTLASS_SHA=11af7f02ab52c9130e422eeb4b44042fbd60c083
+# Fork repositories that the pinned commits are fetched from
+ARG VLLM_REPO=https://github.com/christopherowen/vllm.git
+ARG FLASHINFER_REPO=https://github.com/christopherowen/flashinfer.git
+ARG CUTLASS_REPO=https://github.com/christopherowen/cutlass.git
+
+# Set non-interactive frontend to prevent apt prompts
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Allow pip to install globally on Ubuntu 24.04 without a venv
+ENV PIP_BREAK_SYSTEM_PACKAGES=1
+
+# Cache directories for pip and uv, plus the uv settings used for system-wide installs
+ENV PIP_CACHE_DIR=/root/.cache/pip
+ENV UV_CACHE_DIR=/root/.cache/uv
+ENV UV_SYSTEM_PYTHON=1
+ENV UV_BREAK_SYSTEM_PACKAGES=1
+ENV UV_LINK_MODE=copy
+
+# Base directory for the workspace; used by WORKDIR and later copy steps
+ENV VLLM_BASE_DIR=/workspace/vllm
+
+# 1. Install Build Dependencies & Ccache (ccache enables incremental compilation caching)
+# apt-get is used because apt has no stable CLI for scripts; --with-new-pkgs keeps apt's upgrade semantics
+RUN apt-get update && apt-get upgrade -y --with-new-pkgs \
+    && apt-get install -y --allow-change-held-packages --no-install-recommends \
+    curl vim cmake build-essential ninja-build \
+    libcudnn9-cuda-13 libcudnn9-dev-cuda-13 \
+    python3-dev python3-pip git wget \
+    libnccl-dev libnccl2 libibverbs1 libibverbs-dev rdma-core \
+    ccache \
+    && rm -rf /var/lib/apt/lists/* \
+    && pip install uv
+
+# Configure Ccache for CUDA/C++: prepend /usr/lib/ccache to PATH and keep the cache in /root/.ccache
+ENV PATH=/usr/lib/ccache:$PATH
+ENV CCACHE_DIR=/root/.ccache
+# Cap the ccache size at 50G to prevent unbounded growth
+ENV CCACHE_MAXSIZE=50G
+# Enable compression to save space
+ENV CCACHE_COMPRESS=1
+# Tell CMake to launch the C++ and CUDA compilers through ccache
+ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache
+ENV CMAKE_CUDA_COMPILER_LAUNCHER=ccache
+
+# Setup Workspace
+WORKDIR $VLLM_BASE_DIR
+
+# 2. Set Environment Variables (CUDA arch list for builds, ptxas path for Triton)
+ENV TORCH_CUDA_ARCH_LIST="12.0;12.1"
+ENV TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas
+
+# --- CACHE BUSTER ---
+# Change this argument to force the Python dependency layers below to be re-run
+ARG CACHEBUST_DEPS=1
+
+# 3. Install Python Dependencies with Cache Mounts
+# Using --mount=type=cache ensures that even if this layer invalidates,
+# uv reuses previously downloaded wheels from /root/.cache/uv.
+
+RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
+    uv pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu130
+
+# Install additional dependencies
+RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
+    uv pip install xgrammar fastsafetensors triton
+# Optional: set PRE_TRANSFORMERS=1 to upgrade to a pre-release transformers build
+ARG PRE_TRANSFORMERS=0
+RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
+    if [ "$PRE_TRANSFORMERS" = "1" ]; then \
+        uv pip install -U transformers --pre; \
+    fi
+# =========================================================
+# STAGE 2: Flashinfer Builder
+# =========================================================
+FROM base AS flashinfer-builder
+WORKDIR $VLLM_BASE_DIR
+
+ARG FLASHINFER_SHA=1660ee8d740b0385f235519f9e2750db944d1838
+ARG CUTLASS_SHA=11af7f02ab52c9130e422eeb4b44042fbd60c083
+# Clone FlashInfer, reusing the git cache mount when it already holds a repo. The mount target always
+# exists as a directory, so a fresh clone is copied INTO it ("/." form) after clearing stale contents.
+RUN --mount=type=cache,id=git-flashinfer,target=/git-cache/flashinfer \
+    if [ -d /git-cache/flashinfer/.git ]; then \
+        echo "=== Using cached FlashInfer repo ===" && \
+        cp -a /git-cache/flashinfer /workspace/flashinfer && \
+        cd /workspace/flashinfer && \
+        git fetch origin; \
+    else \
+        echo "=== Cloning FlashInfer (first build) ===" && \
+        rm -rf /git-cache/flashinfer/* /git-cache/flashinfer/.* 2>/dev/null || true && \
+        git clone ${FLASHINFER_REPO} /workspace/flashinfer && \
+        cp -a /workspace/flashinfer/. /git-cache/flashinfer/; \
+    fi && \
+    cd /workspace/flashinfer && git checkout ${FLASHINFER_SHA}
+
+# Fetch only the spdlog submodule (small, so no git cache is used for it)
+RUN git -C /workspace/flashinfer \
+    submodule update --init 3rdparty/spdlog
+
+# Replace the CUTLASS submodule dir with a clone of our fork (from the git cache mount when populated), pinned to CUTLASS_SHA
+RUN --mount=type=cache,id=git-cutlass,target=/git-cache/cutlass \
+    cd /workspace/flashinfer && \
+    rm -rf 3rdparty/cutlass && \
+    if [ -d /git-cache/cutlass/.git ] && [ -d /git-cache/cutlass/.git/objects ]; then \
+        echo "=== Using cached CUTLASS repo ===" && \
+        cp -a /git-cache/cutlass 3rdparty/cutlass && \
+        cd 3rdparty/cutlass && \
+        git fetch origin; \
+    else \
+        echo "=== Cloning CUTLASS (first build) ===" && \
+        rm -rf /git-cache/cutlass/* /git-cache/cutlass/.* 2>/dev/null || true && \
+        git clone ${CUTLASS_REPO} 3rdparty/cutlass && \
+        cp -a /workspace/flashinfer/3rdparty/cutlass/. /git-cache/cutlass/; \
+    fi && \
+    cd /workspace/flashinfer/3rdparty/cutlass && git checkout ${CUTLASS_SHA}
+
+# Build FlashInfer wheels into /workspace/flashinfer/wheels, all three packages via `uv build`
+
+WORKDIR /workspace/flashinfer
+
+# flashinfer-python
+RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
+    --mount=type=cache,id=ccache,target=/root/.ccache \
+    uv build --no-build-isolation --out-dir=./wheels .
+
+# flashinfer-cubin (uv build: the base stage installs python3 only, and no `build` frontend explicitly)
+RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
+    --mount=type=cache,id=ccache,target=/root/.ccache \
+    cd flashinfer-cubin && uv build --no-build-isolation --wheel --out-dir=../wheels .
+
+# flashinfer-jit-cache (installs its own requirements first, then builds the wheel the same way)
+RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
+    --mount=type=cache,id=ccache,target=/root/.ccache \
+    cd flashinfer-jit-cache && \
+    uv pip install -r requirements.txt && \
+    uv build --no-build-isolation --wheel --out-dir=../wheels .
+
+# =========================================================
+# STAGE 3: vLLM Builder (Builds vLLM from Source)
+# =========================================================
+FROM base AS builder
+
+# --- VLLM SOURCE CACHE BUSTER ---
+# Change THIS argument to force a fresh git clone and rebuild of vLLM
+# without re-installing the dependencies above.
+ARG CACHEBUST_VLLM=1
+
+# Kept for backward compatibility only; the checkout below is driven by VLLM_SHA
+ARG VLLM_REF=main
+
+# 4. Smart Git Clone (Fetch changes instead of full re-clone)
+# We mount a cache at /repo-cache so the clone is reused across builds.
+RUN --mount=type=cache,id=repo-cache,target=/repo-cache \
+    # 1. Go into the persistent cache directory
+    cd /repo-cache && \
+    # 2. Clone on a cache miss, otherwise fetch updates into the cached clone (vllm-mxfp4)
+    if [ ! -d "vllm-mxfp4" ]; then \
+        echo "Cache miss: Cloning vLLM from scratch..." && \
+        git clone --recursive ${VLLM_REPO} vllm-mxfp4; \
+    else \
+        echo "Cache hit: Fetching updates..." && \
+        git -C vllm-mxfp4 fetch --all; \
+    fi && \
+    # 3. Check out the pinned revision on BOTH paths (a fresh clone sits on the default branch)
+    cd vllm-mxfp4 && \
+    git checkout ${VLLM_SHA} && \
+    if [ "${VLLM_SHA}" = "main" ]; then \
+        git reset --hard origin/main; \
+    fi && \
+    git submodule update --init --recursive && \
+    # Optimize git repo size
+    git gc --auto && \
+    # 4. Copy the updated code from the cache to the container workspace ('cp -a' preserves permissions)
+    mkdir $VLLM_BASE_DIR/vllm && \
+    cp -a /repo-cache/vllm-mxfp4/* $VLLM_BASE_DIR/vllm/
+
+WORKDIR $VLLM_BASE_DIR/vllm
+
+ARG PRE_TRANSFORMERS=0
+
+# Prepare build requirements: run use_existing_torch.py, drop requirement lines for packages installed separately, install build deps
+RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
+    python3 use_existing_torch.py && \
+    sed -i "/flashinfer/d" requirements/cuda.txt && \
+    sed -i '/^triton\b/d' requirements/test.txt && \
+    sed -i '/^fastsafetensors\b/d' requirements/test.txt && \
+    if [ "$PRE_TRANSFORMERS" = "1" ]; then \
+        sed -i '/^transformers\b/d' requirements/common.txt; \
+        sed -i '/^transformers\b/d' requirements/test.txt; \
+    fi && \
+    uv pip install -r requirements/build.txt
+
+# Apply Patches
+# TEMPORARY PATCH for fastsafetensors loading in cluster setup - tracking https://github.com/foundation-model-stack/fastsafetensors/issues/36
+COPY fastsafetensors.patch .
+RUN patch -p1 < fastsafetensors.patch
+
+# Final Compilation
+# The ccache and uv cache directories are mounted here so compiled objects and downloads
+# are reused across separate `docker build` invocations.
+RUN --mount=type=cache,id=ccache,target=/root/.ccache \
+    --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
+    uv pip install --no-build-isolation . -v
+
+# Install the custom FlashInfer wheels produced by the flashinfer-builder stage (no dependency resolution)
+COPY --from=flashinfer-builder /workspace/flashinfer/wheels /workspace/wheels
+RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
+    uv pip install --no-deps /workspace/wheels/*.whl
+
+# =========================================================
+# STAGE 4: Runner (Transfers only necessary artifacts)
+# =========================================================
+FROM nvidia/cuda:13.1.0-devel-ubuntu24.04 AS runner
+
+ENV DEBIAN_FRONTEND=noninteractive
+ENV PIP_BREAK_SYSTEM_PACKAGES=1
+ENV VLLM_BASE_DIR=/workspace/vllm
+
+# Cache directories for pip and uv, plus the uv settings used for system-wide installs
+ENV PIP_CACHE_DIR=/root/.cache/pip
+ENV UV_CACHE_DIR=/root/.cache/uv
+ENV UV_SYSTEM_PYTHON=1
+ENV UV_BREAK_SYSTEM_PACKAGES=1
+ENV UV_LINK_MODE=copy
+
+# Install runtime dependencies (NCCL, Python); build tools like cmake/gcc are NOT installed here to save space
+# apt-get is used because apt has no stable CLI for scripts; --with-new-pkgs keeps apt's upgrade semantics
+RUN apt-get update && apt-get upgrade -y --with-new-pkgs \
+    && apt-get install -y --allow-change-held-packages --no-install-recommends \
+    python3 python3-pip python3-dev vim curl git wget \
+    libcudnn9-cuda-13 \
+    libnccl-dev libnccl2 libibverbs1 libibverbs-dev rdma-core \
+    libxcb1 \
+    && rm -rf /var/lib/apt/lists/*
+
+# Set final working directory
+WORKDIR $VLLM_BASE_DIR
+
+# Download the Tiktoken encoding files into ./tiktoken_encodings (referenced by TIKTOKEN_ENCODINGS_BASE below)
+RUN mkdir -p tiktoken_encodings && \
+    wget -O tiktoken_encodings/o200k_base.tiktoken "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken" && \
+    wget -O tiktoken_encodings/cl100k_base.tiktoken "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken"
+
+# Copy artifacts from Builder Stage
+# We copy the installed Python packages and everything in /usr/local/bin
+# The vLLM source tree itself is not copied; only what was installed into dist-packages
+COPY --from=builder /usr/local/lib/python3.12/dist-packages /usr/local/lib/python3.12/dist-packages
+COPY --from=builder /usr/local/bin /usr/local/bin
+
+# Setup Env for Runtime
+ENV TORCH_CUDA_ARCH_LIST="12.0;12.1"
+ENV TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas
+ENV TIKTOKEN_ENCODINGS_BASE=$VLLM_BASE_DIR/tiktoken_encodings
+ENV PATH=$VLLM_BASE_DIR:$PATH
+
+# Copy the cluster node launcher script and make it executable
+COPY run-cluster-node.sh $VLLM_BASE_DIR/
+RUN chmod +x $VLLM_BASE_DIR/run-cluster-node.sh
+
+# Final extra deps (uses the uv binary copied from the builder's /usr/local/bin)
+RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
+    uv pip install ray[default]