From 564afc1f6ba167d4bbb6145c8094ff2faf38f4d1 Mon Sep 17 00:00:00 2001
From: Eugene Rakhmatulin <eugr@eugr.com>
Date: Mon, 26 Jan 2026 22:31:46 -0800
Subject: [PATCH] Working MXFP4 fork, updated build script

---
 Dockerfile.mxfp4  | 46 +++++++++++++++++++++++++++++-----------------
 build-and-copy.sh | 22 +++++++++++++++++++---
 2 files changed, 48 insertions(+), 20 deletions(-)

diff --git a/Dockerfile.mxfp4 b/Dockerfile.mxfp4
index 3e3e5a7..2c85762 100644
--- a/Dockerfile.mxfp4
+++ b/Dockerfile.mxfp4
@@ -10,7 +10,7 @@ FROM nvidia/cuda:13.1.0-devel-ubuntu24.04 AS base
 
 # Build parallemism
 ARG BUILD_JOBS
-ENV MAX_JOBS=${BUILD_JOBS}
+ENV MAX_JOBS=${BUILD_JOBS}
 ENV CMAKE_BUILD_PARALLEL_LEVEL=${BUILD_JOBS}
 ENV NINJAFLAGS="-j${BUILD_JOBS}"
 ENV MAKEFLAGS="-j${BUILD_JOBS}"
@@ -19,13 +19,13 @@ ENV MAKEFLAGS="-j${BUILD_JOBS}"
 # Pinned versions from https://github.com/christopherowen/spark-vllm-mxfp4-docker/blob/main/Dockerfile
 # =============================================================================
 
-ARG VLLM_SHA=045293d82b832229560ac4a13152a095af603b6e
-#ARG FLASHINFER_SHA=1660ee8d740b0385f235519f9e2750db944d1838
-#ARG CUTLASS_SHA=11af7f02ab52c9130e422eeb4b44042fbd60c083
+# ARG VLLM_SHA=045293d82b832229560ac4a13152a095af603b6e
+# ARG FLASHINFER_SHA=1660ee8d740b0385f235519f9e2750db944d1838
+# ARG CUTLASS_SHA=11af7f02ab52c9130e422eeb4b44042fbd60c083
 
-ARG VLLM_REPO=https://github.com/christopherowen/vllm.git
-ARG FLASHINFER_REPO=https://github.com/christopherowen/flashinfer.git
-ARG CUTLASS_REPO=https://github.com/christopherowen/cutlass.git
+# ARG VLLM_REPO=https://github.com/christopherowen/vllm.git
+# ARG FLASHINFER_REPO=https://github.com/christopherowen/flashinfer.git
+# ARG CUTLASS_REPO=https://github.com/christopherowen/cutlass.git
 
 # Set non-interactive frontend to prevent apt prompts
 ENV DEBIAN_FRONTEND=noninteractive
@@ -98,11 +98,19 @@ RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
 # =========================================================
 FROM base AS flashinfer-builder
 
+ENV FLASHINFER_CUDA_ARCH_LIST="12.1f"
+
 WORKDIR $VLLM_BASE_DIR
 
+ARG FLASHINFER_REPO=https://github.com/christopherowen/flashinfer.git
+ARG CUTLASS_REPO=https://github.com/christopherowen/cutlass.git
+
 ARG FLASHINFER_SHA=1660ee8d740b0385f235519f9e2750db944d1838
 ARG CUTLASS_SHA=11af7f02ab52c9130e422eeb4b44042fbd60c083
 
+RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
+    uv pip install "apache-tvm-ffi<0.2" nvidia-ml-py requests
+
 # Clone FlashInfer (cached for faster rebuilds)
 RUN --mount=type=cache,id=git-flashinfer,target=/git-cache/flashinfer \
     if [ -d /git-cache/flashinfer/.git ]; then \
@@ -145,19 +153,19 @@ WORKDIR /workspace/flashinfer
 # flashinfer-python
 RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
     --mount=type=cache,id=ccache,target=/root/.ccache \
-    uv build --no-build-isolation --out-dir=./wheels .
+    sed -i -e 's/license = "Apache-2.0"/license = { text = "Apache-2.0" }/' -e '/license-files/d' pyproject.toml && \
+    uv build --no-build-isolation --wheel --out-dir=./wheels .
 
 # flashinfer-cubin
 RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
     --mount=type=cache,id=ccache,target=/root/.ccache \
-    cd flashinfer-cubin && python -m build --no-isolation --wheel --outdir=../wheels .
+    cd flashinfer-cubin && uv build --no-build-isolation --wheel --out-dir=../wheels .
 
 # flashinfer-jit-cache
 RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
     --mount=type=cache,id=ccache,target=/root/.ccache \
     cd flashinfer-jit-cache && \
-    uv pip install -r requirements.txt && \
-    python -m build --no-isolation --wheel --outdir=../wheels .
+    uv build --no-build-isolation --wheel --out-dir=../wheels .
 
 # =========================================================
 # STAGE 3: vLLM Builder (Builds vLLM from Source)
@@ -169,8 +177,10 @@ FROM base AS builder
 # without re-installing the dependencies above.
 ARG CACHEBUST_VLLM=1
 
+ARG VLLM_REPO=https://github.com/christopherowen/vllm.git
+
 # Git reference (branch, tag, or SHA) to checkout
-ARG VLLM_REF=main
+ARG VLLM_SHA=045293d82b832229560ac4a13152a095af603b6e
 
 # 4. Smart Git Clone (Fetch changes instead of full re-clone)
 # We mount a cache at /repo-cache. This directory persists on your host machine.
@@ -183,7 +193,7 @@ RUN --mount=type=cache,id=repo-cache,target=/repo-cache \
         git clone --recursive ${VLLM_REPO} vllm-mxfp4; \
     else \
         echo "Cache hit: Fetching updates..." && \
-        cd vllm && \
+        cd vllm-mxfp4 && \
         git fetch --all && \
         git checkout ${VLLM_SHA} && \
         if [ "${VLLM_SHA}" = "main" ]; then \
@@ -196,7 +206,7 @@ RUN --mount=type=cache,id=repo-cache,target=/repo-cache \
     # 3. Copy the updated code from the cache to the actual container workspace
     # We use 'cp -a' to preserve permissions
     mkdir $VLLM_BASE_DIR/vllm && \
-    cp -a /repo-cache/vllm-mxfp4/* $VLLM_BASE_DIR/vllm/
+    cp -a -r /repo-cache/vllm-mxfp4/. $VLLM_BASE_DIR/vllm/
 
 WORKDIR $VLLM_BASE_DIR/vllm
 
@@ -216,8 +226,8 @@ RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
 
 # Apply Patches
 # TEMPORARY PATCH for fastsafetensors loading in cluster setup - tracking https://github.com/foundation-model-stack/fastsafetensors/issues/36
-COPY fastsafetensors.patch .
-RUN patch -p1 < fastsafetensors.patch
+#COPY fastsafetensors.patch .
+#RUN patch -p1 < fastsafetensors.patch
 
 # Final Compilation
 # We mount the ccache directory here. Ideally, map this to a host volume for persistence 
@@ -229,7 +239,8 @@ RUN --mount=type=cache,id=ccache,target=/root/.ccache \
 # Install custom Flashinfer from flashinfer-builder
 COPY --from=flashinfer-builder /workspace/flashinfer/wheels /workspace/wheels
 RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
-    uv pip install --no-deps /workspace/wheels/*.whl
+    uv pip install --no-deps /workspace/wheels/*.whl && \
+    uv pip install "apache-tvm-ffi<0.2" nvidia-cudnn-frontend nvidia-cutlass-dsl nvidia-ml-py tabulate
 
 # =========================================================
 # STAGE 4: Runner (Transfers only necessary artifacts)
@@ -273,6 +284,7 @@ COPY --from=builder /usr/local/bin /usr/local/bin
 
 # Setup Env for Runtime
 ENV TORCH_CUDA_ARCH_LIST="12.0;12.1"
+ENV FLASHINFER_CUDA_ARCH_LIST="12.1f"
 ENV TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas
 ENV TIKTOKEN_ENCODINGS_BASE=$VLLM_BASE_DIR/tiktoken_encodings
 ENV PATH=$VLLM_BASE_DIR:$PATH
diff --git a/build-and-copy.sh b/build-and-copy.sh
index ad01c78..2030909 100755
--- a/build-and-copy.sh
+++ b/build-and-copy.sh
@@ -18,6 +18,9 @@ PARALLEL_COPY=false
 USE_WHEELS_MODE=""
 PRE_FLASHINFER=false
 PRE_TRANSFORMERS=false
+EXP_MXFP4=false
+TRITON_REF_SET=false
+VLLM_REF_SET=false
 
 cleanup() {
     if [ -n "$TMP_IMAGE" ] && [ -f "$TMP_IMAGE" ]; then
@@ -73,6 +76,7 @@ usage() {
     echo "  --use-wheels [mode]       : Use prebuilt vLLM wheels. Mode can be 'nightly' (default) or 'release'."
     echo "  --pre-flashinfer          : Use pre-release versions of FlashInfer"
     echo "  --pre-tf, --pre-transformers : Install transformers 5.0.0rc0 or higher"
+    echo "  --exp-mxfp4, --experimental-mxfp4 : Build with experimental native MXFP4 support"
     echo "  --no-build                : Skip building, only copy image (requires --copy-to)"
     echo "  -h, --help                : Show this help message"
     exit 1
@@ -84,8 +88,8 @@ while [[ "$#" -gt 0 ]]; do
         -t|--tag) IMAGE_TAG="$2"; shift ;;
         --rebuild-deps) REBUILD_DEPS=true ;;
         --rebuild-vllm) REBUILD_VLLM=true ;;
-        --triton-ref) TRITON_REF="$2"; shift ;;
-        --vllm-ref) VLLM_REF="$2"; shift ;;
+        --triton-ref) TRITON_REF="$2"; TRITON_REF_SET=true; shift ;;
+        --vllm-ref) VLLM_REF="$2"; VLLM_REF_SET=true; shift ;;
         -c|--copy-to|--copy-to-host|--copy-to-hosts)
             shift
             # Consume arguments until the next flag or end of args
@@ -135,6 +139,7 @@ while [[ "$#" -gt 0 ]]; do
             ;;
         --pre-flashinfer) PRE_FLASHINFER=true ;;
         --pre-tf|--pre-transformers) PRE_TRANSFORMERS=true ;;
+        --exp-mxfp4|--experimental-mxfp4) EXP_MXFP4=true ;;
         --no-build) NO_BUILD=true ;;
         -h|--help) usage ;;
         *) echo "Unknown parameter passed: $1"; usage ;;
@@ -142,6 +147,14 @@ while [[ "$#" -gt 0 ]]; do
     shift
 done
 
+if [ "$EXP_MXFP4" = true ]; then
+    if [ "$TRITON_REF_SET" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --triton-ref" >&2; exit 1; fi
+    if [ "$VLLM_REF_SET" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --vllm-ref" >&2; exit 1; fi
+    if [ -n "$USE_WHEELS_MODE" ]; then echo "Error: --exp-mxfp4 is incompatible with --use-wheels" >&2; exit 1; fi
+    if [ "$PRE_FLASHINFER" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --pre-flashinfer" >&2; exit 1; fi
+    if [ "$PRE_TRANSFORMERS" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --pre-transformers" >&2; exit 1; fi
+fi
+
 # Validate --no-build usage
 if [ "$NO_BUILD" = true ] && [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
     echo "Error: --no-build requires --copy-to to be specified"
@@ -154,7 +167,10 @@ if [ "$NO_BUILD" = false ]; then
     # Construct build command
     CMD=("docker" "build" "-t" "$IMAGE_TAG")
 
-    if [ -n "$USE_WHEELS_MODE" ]; then
+    if [ "$EXP_MXFP4" = true ]; then
+        echo "Building with experimental MXFP4 support..."
+        CMD+=("-f" "Dockerfile.mxfp4")
+    elif [ -n "$USE_WHEELS_MODE" ]; then
         echo "Using pre-built vLLM wheels (mode: $USE_WHEELS_MODE)"
         CMD+=("-f" "Dockerfile.wheels")
         if [ "$USE_WHEELS_MODE" = "release" ]; then