From 564afc1f6ba167d4bbb6145c8094ff2faf38f4d1 Mon Sep 17 00:00:00 2001 From: Eugene Rakhmatulin Date: Mon, 26 Jan 2026 22:31:46 -0800 Subject: [PATCH] Working MXFP4 fork, updated build script --- Dockerfile.mxfp4 | 46 +++++++++++++++++++++++++++++----------------- build-and-copy.sh | 22 +++++++++++++++++++--- 2 files changed, 48 insertions(+), 20 deletions(-) diff --git a/Dockerfile.mxfp4 b/Dockerfile.mxfp4 index 3e3e5a7..2c85762 100644 --- a/Dockerfile.mxfp4 +++ b/Dockerfile.mxfp4 @@ -10,7 +10,7 @@ FROM nvidia/cuda:13.1.0-devel-ubuntu24.04 AS base # Build parallemism ARG BUILD_JOBS -ENV MAX_JOBS=${BUILD_JOBS} +ENV MAX_JOBS=${BUILD_JOBS} ENV CMAKE_BUILD_PARALLEL_LEVEL=${BUILD_JOBS} ENV NINJAFLAGS="-j${BUILD_JOBS}" ENV MAKEFLAGS="-j${BUILD_JOBS}" @@ -19,13 +19,13 @@ ENV MAKEFLAGS="-j${BUILD_JOBS}" # Pinned versions from https://github.com/christopherowen/spark-vllm-mxfp4-docker/blob/main/Dockerfile # ============================================================================= -ARG VLLM_SHA=045293d82b832229560ac4a13152a095af603b6e -#ARG FLASHINFER_SHA=1660ee8d740b0385f235519f9e2750db944d1838 -#ARG CUTLASS_SHA=11af7f02ab52c9130e422eeb4b44042fbd60c083 +# ARG VLLM_SHA=045293d82b832229560ac4a13152a095af603b6e +# ARG FLASHINFER_SHA=1660ee8d740b0385f235519f9e2750db944d1838 +# ARG CUTLASS_SHA=11af7f02ab52c9130e422eeb4b44042fbd60c083 -ARG VLLM_REPO=https://github.com/christopherowen/vllm.git -ARG FLASHINFER_REPO=https://github.com/christopherowen/flashinfer.git -ARG CUTLASS_REPO=https://github.com/christopherowen/cutlass.git +# ARG VLLM_REPO=https://github.com/christopherowen/vllm.git +# ARG FLASHINFER_REPO=https://github.com/christopherowen/flashinfer.git +# ARG CUTLASS_REPO=https://github.com/christopherowen/cutlass.git # Set non-interactive frontend to prevent apt prompts ENV DEBIAN_FRONTEND=noninteractive @@ -98,11 +98,19 @@ RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \ # ========================================================= FROM base AS flashinfer-builder +ENV FLASHINFER_CUDA_ARCH_LIST="12.1f" + WORKDIR $VLLM_BASE_DIR +ARG FLASHINFER_REPO=https://github.com/christopherowen/flashinfer.git +ARG CUTLASS_REPO=https://github.com/christopherowen/cutlass.git + ARG FLASHINFER_SHA=1660ee8d740b0385f235519f9e2750db944d1838 ARG CUTLASS_SHA=11af7f02ab52c9130e422eeb4b44042fbd60c083 +RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \ + uv pip install "apache-tvm-ffi<0.2" nvidia-ml-py requests + # Clone FlashInfer (cached for faster rebuilds) RUN --mount=type=cache,id=git-flashinfer,target=/git-cache/flashinfer \ if [ -d /git-cache/flashinfer/.git ]; then \ @@ -145,19 +153,19 @@ WORKDIR /workspace/flashinfer # flashinfer-python RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \ --mount=type=cache,id=ccache,target=/root/.ccache \ - uv build --no-build-isolation --out-dir=./wheels . + sed -i -e 's/license = "Apache-2.0"/license = { text = "Apache-2.0" }/' -e '/license-files/d' pyproject.toml && \ + uv build --no-build-isolation --wheel --out-dir=./wheels . # flashinfer-cubin RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \ --mount=type=cache,id=ccache,target=/root/.ccache \ - cd flashinfer-cubin && python -m build --no-isolation --wheel --outdir=../wheels . + cd flashinfer-cubin && uv build --no-build-isolation --wheel --out-dir=../wheels . # flashinfer-jit-cache RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \ --mount=type=cache,id=ccache,target=/root/.ccache \ cd flashinfer-jit-cache && \ - uv pip install -r requirements.txt && \ - python -m build --no-isolation --wheel --outdir=../wheels . + uv build --no-build-isolation --wheel --out-dir=../wheels . # ========================================================= # STAGE 3: vLLM Builder (Builds vLLM from Source) @@ -169,8 +177,10 @@ FROM base AS builder # without re-installing the dependencies above. ARG CACHEBUST_VLLM=1 +ARG VLLM_REPO=https://github.com/christopherowen/vllm.git + # Git reference (branch, tag, or SHA) to checkout -ARG VLLM_REF=main +ARG VLLM_SHA=045293d82b832229560ac4a13152a095af603b6e # 4. Smart Git Clone (Fetch changes instead of full re-clone) # We mount a cache at /repo-cache. This directory persists on your host machine. @@ -183,7 +193,7 @@ RUN --mount=type=cache,id=repo-cache,target=/repo-cache \ git clone --recursive ${VLLM_REPO} vllm-mxfp4; \ else \ echo "Cache hit: Fetching updates..." && \ - cd vllm && \ + cd vllm-mxfp4 && \ git fetch --all && \ git checkout ${VLLM_SHA} && \ if [ "${VLLM_SHA}" = "main" ]; then \ @@ -196,7 +206,7 @@ RUN --mount=type=cache,id=repo-cache,target=/repo-cache \ # 3. Copy the updated code from the cache to the actual container workspace # We use 'cp -a' to preserve permissions mkdir $VLLM_BASE_DIR/vllm && \ - cp -a /repo-cache/vllm-mxfp4/* $VLLM_BASE_DIR/vllm/ + cp -a -r /repo-cache/vllm-mxfp4/. $VLLM_BASE_DIR/vllm/ WORKDIR $VLLM_BASE_DIR/vllm @@ -216,8 +226,8 @@ RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \ # Apply Patches # TEMPORARY PATCH for fastsafetensors loading in cluster setup - tracking https://github.com/foundation-model-stack/fastsafetensors/issues/36 -COPY fastsafetensors.patch . -RUN patch -p1 < fastsafetensors.patch +#COPY fastsafetensors.patch . +#RUN patch -p1 < fastsafetensors.patch # Final Compilation # We mount the ccache directory here. Ideally, map this to a host volume for persistence @@ -229,7 +239,8 @@ RUN --mount=type=cache,id=ccache,target=/root/.ccache \ # Install custom Flashinfer from flashinfer-builder COPY --from=flashinfer-builder /workspace/flashinfer/wheels /workspace/wheels RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \ - uv pip install --no-deps /workspace/wheels/*.whl + uv pip install --no-deps /workspace/wheels/*.whl && \ + uv pip install apache-tvm-ffi nvidia-cudnn-frontend nvidia-cutlass-dsl nvidia-ml-py tabulate # ========================================================= # STAGE 4: Runner (Transfers only necessary artifacts) @@ -273,6 +284,7 @@ COPY --from=builder /usr/local/bin /usr/local/bin # Setup Env for Runtime ENV TORCH_CUDA_ARCH_LIST="12.0;12.1" +ENV FLASHINFER_CUDA_ARCH_LIST="12.1f" ENV TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas ENV TIKTOKEN_ENCODINGS_BASE=$VLLM_BASE_DIR/tiktoken_encodings ENV PATH=$VLLM_BASE_DIR:$PATH diff --git a/build-and-copy.sh b/build-and-copy.sh index ad01c78..2030909 100755 --- a/build-and-copy.sh +++ b/build-and-copy.sh @@ -18,6 +18,9 @@ PARALLEL_COPY=false USE_WHEELS_MODE="" PRE_FLASHINFER=false PRE_TRANSFORMERS=false +EXP_MXFP4=false +TRITON_REF_SET=false +VLLM_REF_SET=false cleanup() { if [ -n "$TMP_IMAGE" ] && [ -f "$TMP_IMAGE" ]; then @@ -73,6 +76,7 @@ usage() { echo " --use-wheels [mode] : Use prebuilt vLLM wheels. Mode can be 'nightly' (default) or 'release'." echo " --pre-flashinfer : Use pre-release versions of FlashInfer" echo " --pre-tf, --pre-transformers : Install transformers 5.0.0rc0 or higher" + echo " --exp-mxfp4, --experimental-mxfp4 : Build with experimental native MXFP4 support" echo " --no-build : Skip building, only copy image (requires --copy-to)" echo " -h, --help : Show this help message" exit 1 @@ -84,8 +88,8 @@ while [[ "$#" -gt 0 ]]; do -t|--tag) IMAGE_TAG="$2"; shift ;; --rebuild-deps) REBUILD_DEPS=true ;; --rebuild-vllm) REBUILD_VLLM=true ;; - --triton-ref) TRITON_REF="$2"; shift ;; - --vllm-ref) VLLM_REF="$2"; shift ;; + --triton-ref) TRITON_REF="$2"; TRITON_REF_SET=true; shift ;; + --vllm-ref) VLLM_REF="$2"; VLLM_REF_SET=true; shift ;; -c|--copy-to|--copy-to-host|--copy-to-hosts) shift # Consume arguments until the next flag or end of args @@ -135,6 +139,7 @@ while [[ "$#" -gt 0 ]]; do ;; --pre-flashinfer) PRE_FLASHINFER=true ;; --pre-tf|--pre-transformers) PRE_TRANSFORMERS=true ;; + --exp-mxfp4|--experimental-mxfp4) EXP_MXFP4=true ;; --no-build) NO_BUILD=true ;; -h|--help) usage ;; *) echo "Unknown parameter passed: $1"; usage ;; @@ -142,6 +147,14 @@ while [[ "$#" -gt 0 ]]; do shift done +if [ "$EXP_MXFP4" = true ]; then + if [ "$TRITON_REF_SET" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --triton-ref"; exit 1; fi + if [ "$VLLM_REF_SET" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --vllm-ref"; exit 1; fi + if [ -n "$USE_WHEELS_MODE" ]; then echo "Error: --exp-mxfp4 is incompatible with --use-wheels"; exit 1; fi + if [ "$PRE_FLASHINFER" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --pre-flashinfer"; exit 1; fi + if [ "$PRE_TRANSFORMERS" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --pre-transformers"; exit 1; fi +fi + # Validate --no-build usage if [ "$NO_BUILD" = true ] && [ "${#COPY_HOSTS[@]}" -eq 0 ]; then echo "Error: --no-build requires --copy-to to be specified" @@ -154,7 +167,10 @@ if [ "$NO_BUILD" = false ]; then # Construct build command CMD=("docker" "build" "-t" "$IMAGE_TAG") - if [ -n "$USE_WHEELS_MODE" ]; then + if [ "$EXP_MXFP4" = true ]; then + echo "Building with experimental MXFP4 support..." + CMD+=("-f" "Dockerfile.mxfp4") + elif [ -n "$USE_WHEELS_MODE" ]; then echo "Using pre-built vLLM wheels (mode: $USE_WHEELS_MODE)" CMD+=("-f" "Dockerfile.wheels") if [ "$USE_WHEELS_MODE" = "release" ]; then