Working MXFP4 fork, updated build script

Eugene Rakhmatulin
2026-01-26 22:31:46 -08:00
parent 90c8b30276
commit 564afc1f6b
2 changed files with 48 additions and 20 deletions


@@ -10,7 +10,7 @@ FROM nvidia/cuda:13.1.0-devel-ubuntu24.04 AS base
# Build parallelism
ARG BUILD_JOBS
ENV MAX_JOBS=${BUILD_JOBS}
ENV CMAKE_BUILD_PARALLEL_LEVEL=${BUILD_JOBS}
ENV NINJAFLAGS="-j${BUILD_JOBS}"
ENV MAKEFLAGS="-j${BUILD_JOBS}"
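
These four variables fan a single BUILD_JOBS value out to pip/setuptools (MAX_JOBS), CMake, Ninja, and Make, so every build system in the image runs at the same parallelism. A minimal sketch of feeding it in at build time; the tag and job count are illustrative:

    docker build --build-arg BUILD_JOBS=$(nproc) -t vllm-mxfp4:dev .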
@@ -19,13 +19,13 @@ ENV MAKEFLAGS="-j${BUILD_JOBS}"
# Pinned versions from https://github.com/christopherowen/spark-vllm-mxfp4-docker/blob/main/Dockerfile
# =============================================================================
ARG VLLM_SHA=045293d82b832229560ac4a13152a095af603b6e
#ARG FLASHINFER_SHA=1660ee8d740b0385f235519f9e2750db944d1838
#ARG CUTLASS_SHA=11af7f02ab52c9130e422eeb4b44042fbd60c083
# ARG VLLM_SHA=045293d82b832229560ac4a13152a095af603b6e
# ARG FLASHINFER_SHA=1660ee8d740b0385f235519f9e2750db944d1838
# ARG CUTLASS_SHA=11af7f02ab52c9130e422eeb4b44042fbd60c083
ARG VLLM_REPO=https://github.com/christopherowen/vllm.git
ARG FLASHINFER_REPO=https://github.com/christopherowen/flashinfer.git
ARG CUTLASS_REPO=https://github.com/christopherowen/cutlass.git
# ARG VLLM_REPO=https://github.com/christopherowen/vllm.git
# ARG FLASHINFER_REPO=https://github.com/christopherowen/flashinfer.git
# ARG CUTLASS_REPO=https://github.com/christopherowen/cutlass.git
# Set non-interactive frontend to prevent apt prompts
ENV DEBIAN_FRONTEND=noninteractive
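
Because the pins are plain build args, they can be overridden per build without editing the Dockerfile; a sketch, with the upstream vLLM URL used as an illustration:

    docker build \
      --build-arg VLLM_REPO=https://github.com/vllm-project/vllm.git \
      --build-arg VLLM_SHA=main \
      -t vllm-mxfp4:upstream .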
@@ -98,11 +98,19 @@ RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
# =========================================================
FROM base AS flashinfer-builder
ENV FLASHINFER_CUDA_ARCH_LIST="12.1f"
WORKDIR $VLLM_BASE_DIR
ARG FLASHINFER_REPO=https://github.com/christopherowen/flashinfer.git
ARG CUTLASS_REPO=https://github.com/christopherowen/cutlass.git
ARG FLASHINFER_SHA=1660ee8d740b0385f235519f9e2750db944d1838
ARG CUTLASS_SHA=11af7f02ab52c9130e422eeb4b44042fbd60c083
RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
uv pip install "apache-tvm-ffi<0.2" nvidia-ml-py requests
# Clone FlashInfer (cached for faster rebuilds)
RUN --mount=type=cache,id=git-flashinfer,target=/git-cache/flashinfer \
if [ -d /git-cache/flashinfer/.git ]; then \
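
The repo and SHA args are redeclared inside this stage because an ARG defined before the first FROM is visible only to FROM lines; each stage that needs the value must restate it. A minimal sketch of the rule, with a hypothetical name:

    ARG FOO=default          # global: only FROM lines below can see this
    FROM ubuntu:24.04 AS stage1
    ARG FOO                  # restate to pull the global default into the stage
    RUN echo "$FOO"          # now expands as expected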
@@ -145,19 +153,19 @@ WORKDIR /workspace/flashinfer
# flashinfer-python
RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
--mount=type=cache,id=ccache,target=/root/.ccache \
uv build --no-build-isolation --out-dir=./wheels .
sed -i -e 's/license = "Apache-2.0"/license = { text = "Apache-2.0" }/' -e '/license-files/d' pyproject.toml && \
uv build --no-build-isolation --wheel --out-dir=./wheels .
# flashinfer-cubin
RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
--mount=type=cache,id=ccache,target=/root/.ccache \
cd flashinfer-cubin && python -m build --no-isolation --wheel --outdir=../wheels .
cd flashinfer-cubin && uv build --no-build-isolation --wheel --out-dir=../wheels .
# flashinfer-jit-cache
RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
--mount=type=cache,id=ccache,target=/root/.ccache \
cd flashinfer-jit-cache && \
uv pip install -r requirements.txt && \
python -m build --no-isolation --wheel --outdir=../wheels .
uv build --no-build-isolation --wheel --out-dir=../wheels .
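
The sed line works around a packaging mismatch: the pinned FlashInfer pyproject.toml appears to use the newer PEP 639 SPDX-string form of the license field, which the build backend in this image rejects, so it is rewritten to the older table form before building. Roughly, the transformation is:

    license = "Apache-2.0"              # before: SPDX string, plus a license-files key
    license = { text = "Apache-2.0" }   # after: table form, license-files line deleted

Moving the two sub-packages from python -m build to uv build is likely motivated by keeping all three wheels on the same builder and the shared uv cache mount.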
# =========================================================
# STAGE 3: vLLM Builder (Builds vLLM from Source)
@@ -169,8 +177,10 @@ FROM base AS builder
# without re-installing the dependencies above.
ARG CACHEBUST_VLLM=1
ARG VLLM_REPO=https://github.com/christopherowen/vllm.git
# Git reference (branch, tag, or SHA) to checkout
ARG VLLM_REF=main
ARG VLLM_SHA=045293d82b832229560ac4a13152a095af603b6e
# 4. Smart Git Clone (Fetch changes instead of full re-clone)
# We mount a cache at /repo-cache. This directory persists on your host machine.
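
CACHEBUST_VLLM acts as a manual cache-breaker: changing its value invalidates Docker's layer cache from this instruction onward while everything above stays cached. A sketch of the usual invocation:

    docker build --build-arg CACHEBUST_VLLM=$(date +%s) -t vllm-mxfp4:dev .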
@@ -183,7 +193,7 @@ RUN --mount=type=cache,id=repo-cache,target=/repo-cache \
git clone --recursive ${VLLM_REPO} vllm-mxfp4; \
else \
echo "Cache hit: Fetching updates..." && \
cd vllm && \
cd vllm-mxfp4 && \
git fetch --all && \
git checkout ${VLLM_SHA} && \
if [ "${VLLM_SHA}" = "main" ]; then \
@@ -196,7 +206,7 @@ RUN --mount=type=cache,id=repo-cache,target=/repo-cache \
# 3. Copy the updated code from the cache to the actual container workspace
# We use 'cp -a' to preserve permissions
mkdir $VLLM_BASE_DIR/vllm && \
cp -a /repo-cache/vllm-mxfp4/* $VLLM_BASE_DIR/vllm/
cp -a -r /repo-cache/vllm-mxfp4/. $VLLM_BASE_DIR/vllm/
WORKDIR $VLLM_BASE_DIR/vllm
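
The cp fix matters because the shell glob '*' does not match dotfiles, so '.git' (and any other dot-entries) never reached the workspace; copying '/.' takes the directory's full contents. A quick illustration with hypothetical paths:

    mkdir -p /tmp/src/.git /tmp/a /tmp/b && touch /tmp/src/file
    cp -a /tmp/src/* /tmp/a/   # copies 'file' only; .git is skipped by the glob
    cp -a /tmp/src/. /tmp/b/   # copies 'file' and .git

Since -a already implies recursive copying, the added -r is redundant but harmless.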
@@ -216,8 +226,8 @@ RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
# Apply Patches
# TEMPORARY PATCH for fastsafetensors loading in cluster setup - tracking https://github.com/foundation-model-stack/fastsafetensors/issues/36
COPY fastsafetensors.patch .
RUN patch -p1 < fastsafetensors.patch
#COPY fastsafetensors.patch .
#RUN patch -p1 < fastsafetensors.patch
# Final Compilation
# We mount the ccache directory here. Ideally, map this to a host volume for persistence
@@ -229,7 +239,8 @@ RUN --mount=type=cache,id=ccache,target=/root/.ccache \
# Install custom Flashinfer from flashinfer-builder
COPY --from=flashinfer-builder /workspace/flashinfer/wheels /workspace/wheels
RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
uv pip install --no-deps /workspace/wheels/*.whl
uv pip install --no-deps /workspace/wheels/*.whl && \
uv pip install apache-tvm-ffi nvidia-cudnn-frontend nvidia-cutlass-dsl nvidia-ml-py tabulate
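
Installing the wheels with --no-deps presumably keeps FlashInfer's own dependency pins from fighting the vLLM environment, which is why the handful of runtime requirements is then listed explicitly. A quick post-install smoke test (the __version__ attribute is an assumption):

    python -c "import flashinfer; print(getattr(flashinfer, '__version__', 'unknown'))"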
# =========================================================
# STAGE 4: Runner (Transfers only necessary artifacts)
@@ -273,6 +284,7 @@ COPY --from=builder /usr/local/bin /usr/local/bin
# Setup Env for Runtime
ENV TORCH_CUDA_ARCH_LIST="12.0;12.1"
ENV FLASHINFER_CUDA_ARCH_LIST="12.1f"
ENV TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas
ENV TIKTOKEN_ENCODINGS_BASE=$VLLM_BASE_DIR/tiktoken_encodings
ENV PATH=$VLLM_BASE_DIR:$PATH
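
TRITON_PTXAS_PATH points Triton's JIT at the toolkit's own ptxas, so runtime-compiled kernels are assembled by the same CUDA release the image was built against. A hedged sketch of running the finished image, assuming the default entrypoint passes the command through; the tag, port, and model are illustrative:

    docker run --gpus all -p 8000:8000 vllm-mxfp4:dev \
      vllm serve openai/gpt-oss-120b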