Working MXFP4 fork, updated build script
This commit is contained in:
@@ -10,7 +10,7 @@ FROM nvidia/cuda:13.1.0-devel-ubuntu24.04 AS base
|
|||||||
|
|
||||||
# Build parallelism
|
# Build parallelism
|
||||||
ARG BUILD_JOBS
|
ARG BUILD_JOBS
|
||||||
ENV MAX_JOBS=${BUILD_JOBS}
|
ENV MAX_JOBS=${BUILD_JOBS}
|
||||||
ENV CMAKE_BUILD_PARALLEL_LEVEL=${BUILD_JOBS}
|
ENV CMAKE_BUILD_PARALLEL_LEVEL=${BUILD_JOBS}
|
||||||
ENV NINJAFLAGS="-j${BUILD_JOBS}"
|
ENV NINJAFLAGS="-j${BUILD_JOBS}"
|
||||||
ENV MAKEFLAGS="-j${BUILD_JOBS}"
|
ENV MAKEFLAGS="-j${BUILD_JOBS}"
|
||||||
@@ -19,13 +19,13 @@ ENV MAKEFLAGS="-j${BUILD_JOBS}"
|
|||||||
# Pinned versions from https://github.com/christopherowen/spark-vllm-mxfp4-docker/blob/main/Dockerfile
|
# Pinned versions from https://github.com/christopherowen/spark-vllm-mxfp4-docker/blob/main/Dockerfile
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
|
|
||||||
ARG VLLM_SHA=045293d82b832229560ac4a13152a095af603b6e
|
# ARG VLLM_SHA=045293d82b832229560ac4a13152a095af603b6e
|
||||||
#ARG FLASHINFER_SHA=1660ee8d740b0385f235519f9e2750db944d1838
|
# ARG FLASHINFER_SHA=1660ee8d740b0385f235519f9e2750db944d1838
|
||||||
#ARG CUTLASS_SHA=11af7f02ab52c9130e422eeb4b44042fbd60c083
|
# ARG CUTLASS_SHA=11af7f02ab52c9130e422eeb4b44042fbd60c083
|
||||||
|
|
||||||
ARG VLLM_REPO=https://github.com/christopherowen/vllm.git
|
# ARG VLLM_REPO=https://github.com/christopherowen/vllm.git
|
||||||
ARG FLASHINFER_REPO=https://github.com/christopherowen/flashinfer.git
|
# ARG FLASHINFER_REPO=https://github.com/christopherowen/flashinfer.git
|
||||||
ARG CUTLASS_REPO=https://github.com/christopherowen/cutlass.git
|
# ARG CUTLASS_REPO=https://github.com/christopherowen/cutlass.git
|
||||||
|
|
||||||
# Set non-interactive frontend to prevent apt prompts
|
# Set non-interactive frontend to prevent apt prompts
|
||||||
ENV DEBIAN_FRONTEND=noninteractive
|
ENV DEBIAN_FRONTEND=noninteractive
|
||||||
@@ -98,11 +98,19 @@ RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
|
|||||||
# =========================================================
|
# =========================================================
|
||||||
FROM base AS flashinfer-builder
|
FROM base AS flashinfer-builder
|
||||||
|
|
||||||
|
ENV FLASHINFER_CUDA_ARCH_LIST="12.1f"
|
||||||
|
|
||||||
WORKDIR $VLLM_BASE_DIR
|
WORKDIR $VLLM_BASE_DIR
|
||||||
|
|
||||||
|
ARG FLASHINFER_REPO=https://github.com/christopherowen/flashinfer.git
|
||||||
|
ARG CUTLASS_REPO=https://github.com/christopherowen/cutlass.git
|
||||||
|
|
||||||
ARG FLASHINFER_SHA=1660ee8d740b0385f235519f9e2750db944d1838
|
ARG FLASHINFER_SHA=1660ee8d740b0385f235519f9e2750db944d1838
|
||||||
ARG CUTLASS_SHA=11af7f02ab52c9130e422eeb4b44042fbd60c083
|
ARG CUTLASS_SHA=11af7f02ab52c9130e422eeb4b44042fbd60c083
|
||||||
|
|
||||||
|
RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
|
||||||
|
uv pip install "apache-tvm-ffi<0.2" nvidia-ml-py requests
|
||||||
|
|
||||||
# Clone FlashInfer (cached for faster rebuilds)
|
# Clone FlashInfer (cached for faster rebuilds)
|
||||||
RUN --mount=type=cache,id=git-flashinfer,target=/git-cache/flashinfer \
|
RUN --mount=type=cache,id=git-flashinfer,target=/git-cache/flashinfer \
|
||||||
if [ -d /git-cache/flashinfer/.git ]; then \
|
if [ -d /git-cache/flashinfer/.git ]; then \
|
||||||
@@ -145,19 +153,19 @@ WORKDIR /workspace/flashinfer
|
|||||||
# flashinfer-python
|
# flashinfer-python
|
||||||
RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
|
RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
|
||||||
--mount=type=cache,id=ccache,target=/root/.ccache \
|
--mount=type=cache,id=ccache,target=/root/.ccache \
|
||||||
uv build --no-build-isolation --out-dir=./wheels .
|
sed -i -e 's/license = "Apache-2.0"/license = { text = "Apache-2.0" }/' -e '/license-files/d' pyproject.toml && \
|
||||||
|
uv build --no-build-isolation --wheel --out-dir=./wheels .
|
||||||
|
|
||||||
# flashinfer-cubin
|
# flashinfer-cubin
|
||||||
RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
|
RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
|
||||||
--mount=type=cache,id=ccache,target=/root/.ccache \
|
--mount=type=cache,id=ccache,target=/root/.ccache \
|
||||||
cd flashinfer-cubin && python -m build --no-isolation --wheel --outdir=../wheels .
|
cd flashinfer-cubin && uv build --no-build-isolation --wheel --out-dir=../wheels .
|
||||||
|
|
||||||
# flashinfer-jit-cache
|
# flashinfer-jit-cache
|
||||||
RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
|
RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
|
||||||
--mount=type=cache,id=ccache,target=/root/.ccache \
|
--mount=type=cache,id=ccache,target=/root/.ccache \
|
||||||
cd flashinfer-jit-cache && \
|
cd flashinfer-jit-cache && \
|
||||||
uv pip install -r requirements.txt && \
|
uv build --no-build-isolation --wheel --out-dir=../wheels .
|
||||||
python -m build --no-isolation --wheel --outdir=../wheels .
|
|
||||||
|
|
||||||
# =========================================================
|
# =========================================================
|
||||||
# STAGE 3: vLLM Builder (Builds vLLM from Source)
|
# STAGE 3: vLLM Builder (Builds vLLM from Source)
|
||||||
@@ -169,8 +177,10 @@ FROM base AS builder
|
|||||||
# without re-installing the dependencies above.
|
# without re-installing the dependencies above.
|
||||||
ARG CACHEBUST_VLLM=1
|
ARG CACHEBUST_VLLM=1
|
||||||
|
|
||||||
|
ARG VLLM_REPO=https://github.com/christopherowen/vllm.git
|
||||||
|
|
||||||
# Git reference (branch, tag, or SHA) to checkout
|
# Git reference (branch, tag, or SHA) to checkout
|
||||||
ARG VLLM_REF=main
|
ARG VLLM_SHA=045293d82b832229560ac4a13152a095af603b6e
|
||||||
|
|
||||||
# 4. Smart Git Clone (Fetch changes instead of full re-clone)
|
# 4. Smart Git Clone (Fetch changes instead of full re-clone)
|
||||||
# We mount a cache at /repo-cache. This directory persists on your host machine.
|
# We mount a cache at /repo-cache. This directory persists on your host machine.
|
||||||
@@ -183,7 +193,7 @@ RUN --mount=type=cache,id=repo-cache,target=/repo-cache \
|
|||||||
git clone --recursive ${VLLM_REPO} vllm-mxfp4; \
|
git clone --recursive ${VLLM_REPO} vllm-mxfp4; \
|
||||||
else \
|
else \
|
||||||
echo "Cache hit: Fetching updates..." && \
|
echo "Cache hit: Fetching updates..." && \
|
||||||
cd vllm && \
|
cd vllm-mxfp4 && \
|
||||||
git fetch --all && \
|
git fetch --all && \
|
||||||
git checkout ${VLLM_SHA} && \
|
git checkout ${VLLM_SHA} && \
|
||||||
if [ "${VLLM_SHA}" = "main" ]; then \
|
if [ "${VLLM_SHA}" = "main" ]; then \
|
||||||
@@ -196,7 +206,7 @@ RUN --mount=type=cache,id=repo-cache,target=/repo-cache \
|
|||||||
# 3. Copy the updated code from the cache to the actual container workspace
|
# 3. Copy the updated code from the cache to the actual container workspace
|
||||||
# We use 'cp -a' to preserve permissions
|
# We use 'cp -a' to preserve permissions
|
||||||
mkdir $VLLM_BASE_DIR/vllm && \
|
mkdir $VLLM_BASE_DIR/vllm && \
|
||||||
cp -a /repo-cache/vllm-mxfp4/* $VLLM_BASE_DIR/vllm/
|
cp -a -r /repo-cache/vllm-mxfp4/. $VLLM_BASE_DIR/vllm/
|
||||||
|
|
||||||
WORKDIR $VLLM_BASE_DIR/vllm
|
WORKDIR $VLLM_BASE_DIR/vllm
|
||||||
|
|
||||||
@@ -216,8 +226,8 @@ RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
|
|||||||
|
|
||||||
# Apply Patches
|
# Apply Patches
|
||||||
# TEMPORARY PATCH for fastsafetensors loading in cluster setup - tracking https://github.com/foundation-model-stack/fastsafetensors/issues/36
|
# TEMPORARY PATCH for fastsafetensors loading in cluster setup - tracking https://github.com/foundation-model-stack/fastsafetensors/issues/36
|
||||||
COPY fastsafetensors.patch .
|
#COPY fastsafetensors.patch .
|
||||||
RUN patch -p1 < fastsafetensors.patch
|
#RUN patch -p1 < fastsafetensors.patch
|
||||||
|
|
||||||
# Final Compilation
|
# Final Compilation
|
||||||
# We mount the ccache directory here. Ideally, map this to a host volume for persistence
|
# We mount the ccache directory here. Ideally, map this to a host volume for persistence
|
||||||
@@ -229,7 +239,8 @@ RUN --mount=type=cache,id=ccache,target=/root/.ccache \
|
|||||||
# Install custom FlashInfer from flashinfer-builder
|
# Install custom FlashInfer from flashinfer-builder
|
||||||
COPY --from=flashinfer-builder /workspace/flashinfer/wheels /workspace/wheels
|
COPY --from=flashinfer-builder /workspace/flashinfer/wheels /workspace/wheels
|
||||||
RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
|
RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
|
||||||
uv pip install --no-deps /workspace/wheels/*.whl
|
uv pip install --no-deps /workspace/wheels/*.whl && \
|
||||||
|
uv pip install apache-tvm-ffi nvidia-cudnn-frontend nvidia-cutlass-dsl nvidia-ml-py tabulate
|
||||||
|
|
||||||
# =========================================================
|
# =========================================================
|
||||||
# STAGE 4: Runner (Transfers only necessary artifacts)
|
# STAGE 4: Runner (Transfers only necessary artifacts)
|
||||||
@@ -273,6 +284,7 @@ COPY --from=builder /usr/local/bin /usr/local/bin
|
|||||||
|
|
||||||
# Setup Env for Runtime
|
# Setup Env for Runtime
|
||||||
ENV TORCH_CUDA_ARCH_LIST="12.0;12.1"
|
ENV TORCH_CUDA_ARCH_LIST="12.0;12.1"
|
||||||
|
ENV FLASHINFER_CUDA_ARCH_LIST="12.1f"
|
||||||
ENV TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas
|
ENV TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas
|
||||||
ENV TIKTOKEN_ENCODINGS_BASE=$VLLM_BASE_DIR/tiktoken_encodings
|
ENV TIKTOKEN_ENCODINGS_BASE=$VLLM_BASE_DIR/tiktoken_encodings
|
||||||
ENV PATH=$VLLM_BASE_DIR:$PATH
|
ENV PATH=$VLLM_BASE_DIR:$PATH
|
||||||
|
|||||||
@@ -18,6 +18,9 @@ PARALLEL_COPY=false
|
|||||||
USE_WHEELS_MODE=""
|
USE_WHEELS_MODE=""
|
||||||
PRE_FLASHINFER=false
|
PRE_FLASHINFER=false
|
||||||
PRE_TRANSFORMERS=false
|
PRE_TRANSFORMERS=false
|
||||||
|
EXP_MXFP4=false
|
||||||
|
TRITON_REF_SET=false
|
||||||
|
VLLM_REF_SET=false
|
||||||
|
|
||||||
cleanup() {
|
cleanup() {
|
||||||
if [ -n "$TMP_IMAGE" ] && [ -f "$TMP_IMAGE" ]; then
|
if [ -n "$TMP_IMAGE" ] && [ -f "$TMP_IMAGE" ]; then
|
||||||
@@ -73,6 +76,7 @@ usage() {
|
|||||||
echo " --use-wheels [mode] : Use prebuilt vLLM wheels. Mode can be 'nightly' (default) or 'release'."
|
echo " --use-wheels [mode] : Use prebuilt vLLM wheels. Mode can be 'nightly' (default) or 'release'."
|
||||||
echo " --pre-flashinfer : Use pre-release versions of FlashInfer"
|
echo " --pre-flashinfer : Use pre-release versions of FlashInfer"
|
||||||
echo " --pre-tf, --pre-transformers : Install transformers 5.0.0rc0 or higher"
|
echo " --pre-tf, --pre-transformers : Install transformers 5.0.0rc0 or higher"
|
||||||
|
echo " --exp-mxfp4, --experimental-mxfp4 : Build with experimental native MXFP4 support"
|
||||||
echo " --no-build : Skip building, only copy image (requires --copy-to)"
|
echo " --no-build : Skip building, only copy image (requires --copy-to)"
|
||||||
echo " -h, --help : Show this help message"
|
echo " -h, --help : Show this help message"
|
||||||
exit 1
|
exit 1
|
||||||
@@ -84,8 +88,8 @@ while [[ "$#" -gt 0 ]]; do
|
|||||||
-t|--tag) IMAGE_TAG="$2"; shift ;;
|
-t|--tag) IMAGE_TAG="$2"; shift ;;
|
||||||
--rebuild-deps) REBUILD_DEPS=true ;;
|
--rebuild-deps) REBUILD_DEPS=true ;;
|
||||||
--rebuild-vllm) REBUILD_VLLM=true ;;
|
--rebuild-vllm) REBUILD_VLLM=true ;;
|
||||||
--triton-ref) TRITON_REF="$2"; shift ;;
|
--triton-ref) TRITON_REF="$2"; TRITON_REF_SET=true; shift ;;
|
||||||
--vllm-ref) VLLM_REF="$2"; shift ;;
|
--vllm-ref) VLLM_REF="$2"; VLLM_REF_SET=true; shift ;;
|
||||||
-c|--copy-to|--copy-to-host|--copy-to-hosts)
|
-c|--copy-to|--copy-to-host|--copy-to-hosts)
|
||||||
shift
|
shift
|
||||||
# Consume arguments until the next flag or end of args
|
# Consume arguments until the next flag or end of args
|
||||||
@@ -135,6 +139,7 @@ while [[ "$#" -gt 0 ]]; do
|
|||||||
;;
|
;;
|
||||||
--pre-flashinfer) PRE_FLASHINFER=true ;;
|
--pre-flashinfer) PRE_FLASHINFER=true ;;
|
||||||
--pre-tf|--pre-transformers) PRE_TRANSFORMERS=true ;;
|
--pre-tf|--pre-transformers) PRE_TRANSFORMERS=true ;;
|
||||||
|
--exp-mxfp4|--experimental-mxfp4) EXP_MXFP4=true ;;
|
||||||
--no-build) NO_BUILD=true ;;
|
--no-build) NO_BUILD=true ;;
|
||||||
-h|--help) usage ;;
|
-h|--help) usage ;;
|
||||||
*) echo "Unknown parameter passed: $1"; usage ;;
|
*) echo "Unknown parameter passed: $1"; usage ;;
|
||||||
@@ -142,6 +147,14 @@ while [[ "$#" -gt 0 ]]; do
|
|||||||
shift
|
shift
|
||||||
done
|
done
|
||||||
|
|
||||||
|
if [ "$EXP_MXFP4" = true ]; then
|
||||||
|
if [ "$TRITON_REF_SET" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --triton-ref"; exit 1; fi
|
||||||
|
if [ "$VLLM_REF_SET" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --vllm-ref"; exit 1; fi
|
||||||
|
if [ -n "$USE_WHEELS_MODE" ]; then echo "Error: --exp-mxfp4 is incompatible with --use-wheels"; exit 1; fi
|
||||||
|
if [ "$PRE_FLASHINFER" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --pre-flashinfer"; exit 1; fi
|
||||||
|
if [ "$PRE_TRANSFORMERS" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --pre-transformers"; exit 1; fi
|
||||||
|
fi
|
||||||
|
|
||||||
# Validate --no-build usage
|
# Validate --no-build usage
|
||||||
if [ "$NO_BUILD" = true ] && [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
|
if [ "$NO_BUILD" = true ] && [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
|
||||||
echo "Error: --no-build requires --copy-to to be specified"
|
echo "Error: --no-build requires --copy-to to be specified"
|
||||||
@@ -154,7 +167,10 @@ if [ "$NO_BUILD" = false ]; then
|
|||||||
# Construct build command
|
# Construct build command
|
||||||
CMD=("docker" "build" "-t" "$IMAGE_TAG")
|
CMD=("docker" "build" "-t" "$IMAGE_TAG")
|
||||||
|
|
||||||
if [ -n "$USE_WHEELS_MODE" ]; then
|
if [ "$EXP_MXFP4" = true ]; then
|
||||||
|
echo "Building with experimental MXFP4 support..."
|
||||||
|
CMD+=("-f" "Dockerfile.mxfp4")
|
||||||
|
elif [ -n "$USE_WHEELS_MODE" ]; then
|
||||||
echo "Using pre-built vLLM wheels (mode: $USE_WHEELS_MODE)"
|
echo "Using pre-built vLLM wheels (mode: $USE_WHEELS_MODE)"
|
||||||
CMD+=("-f" "Dockerfile.wheels")
|
CMD+=("-f" "Dockerfile.wheels")
|
||||||
if [ "$USE_WHEELS_MODE" = "release" ]; then
|
if [ "$USE_WHEELS_MODE" = "release" ]; then
|
||||||
|
|||||||
Reference in New Issue
Block a user