From 4214d4fefe1d47ee2c2c013529505d5b32f22503 Mon Sep 17 00:00:00 2001 From: Eugene Rakhmatulin Date: Fri, 13 Feb 2026 19:30:28 -0800 Subject: [PATCH] Caching cubins during build for reuse --- Dockerfile | 9 ++++++++- Dockerfile.mxfp4 | 2 +- build-and-copy.sh | 7 +++++++ flashinfer_cache.patch | 18 ++++++++++++++++++ 4 files changed, 34 insertions(+), 2 deletions(-) create mode 100644 flashinfer_cache.patch diff --git a/Dockerfile b/Dockerfile index 939b93f..8b52576 100644 --- a/Dockerfile +++ b/Dockerfile @@ -135,20 +135,27 @@ RUN --mount=type=cache,id=repo-cache,target=/repo-cache \ WORKDIR /workspace/flashinfer +# Apply patch to avoid re-downloading existing cubins +COPY flashinfer_cache.patch . +RUN patch -p1 < flashinfer_cache.patch + # flashinfer-python RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \ --mount=type=cache,id=ccache,target=/root/.ccache \ + --mount=type=cache,id=cubins-cache,target=/workspace/flashinfer/flashinfer-cubin/flashinfer_cubin/cubins \ sed -i -e 's/license = "Apache-2.0"/license = { text = "Apache-2.0" }/' -e '/license-files/d' pyproject.toml && \ uv build --no-build-isolation --wheel . --out-dir=/workspace/wheels -v # flashinfer-cubin RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \ --mount=type=cache,id=ccache,target=/root/.ccache \ + --mount=type=cache,id=cubins-cache,target=/workspace/flashinfer/flashinfer-cubin/flashinfer_cubin/cubins \ cd flashinfer-cubin && uv build --no-build-isolation --wheel . --out-dir=/workspace/wheels -v # flashinfer-jit-cache RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \ --mount=type=cache,id=ccache,target=/root/.ccache \ + --mount=type=cache,id=cubins-cache,target=/workspace/flashinfer/flashinfer-cubin/flashinfer_cubin/cubins \ cd flashinfer-jit-cache && \ uv build --no-build-isolation --wheel . --out-dir=/workspace/wheels -v @@ -301,7 +308,7 @@ RUN chmod +x $VLLM_BASE_DIR/run-cluster-node.sh # Final extra deps RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \ - uv pip install ray[default] fastsafetensors + uv pip install ray[default] fastsafetensors nvidia-nvshmem-cu13 # Cleanup diff --git a/Dockerfile.mxfp4 b/Dockerfile.mxfp4 index 8730952..28e10fb 100644 --- a/Dockerfile.mxfp4 +++ b/Dockerfile.mxfp4 @@ -276,7 +276,7 @@ RUN chmod +x $VLLM_BASE_DIR/run-cluster-node.sh # Final extra deps RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \ - uv pip install ray[default] fastsafetensors + uv pip install ray[default] fastsafetensors nvidia-nvshmem-cu13 # If not compiling Triton # remove triton-kernels as they are not compatible with this vLLM version yet diff --git a/build-and-copy.sh b/build-and-copy.sh index aee2b2f..b302b34 100755 --- a/build-and-copy.sh +++ b/build-and-copy.sh @@ -22,6 +22,7 @@ EXP_MXFP4=false TRITON_REF_SET=false VLLM_REF_SET=false VLLM_PRS="" +FULL_LOG=false cleanup() { if [ -n "$TMP_IMAGE" ] && [ -f "$TMP_IMAGE" ]; then @@ -81,6 +82,7 @@ usage() { echo " --pre-tf, --pre-transformers : Install transformers 5.0.0rc0 or higher" echo " --exp-mxfp4, --experimental-mxfp4 : Build with experimental native MXFP4 support" echo " --apply-vllm-pr : Apply a specific PR patch to vLLM source code. Can be specified multiple times." + echo " --full-log : Enable full build logging (--progress=plain)" echo " --no-build : Skip building, only copy image (requires --copy-to)" echo " -h, --help : Show this help message" exit 1 @@ -158,6 +160,7 @@ while [[ "$#" -gt 0 ]]; do exit 1 fi ;; + --full-log) FULL_LOG=true ;; --no-build) NO_BUILD=true ;; -h|--help) usage ;; *) echo "Unknown parameter passed: $1"; usage ;; @@ -198,6 +201,10 @@ if [ "$NO_BUILD" = false ]; then # Construct build command CMD=("docker" "build" "-t" "$IMAGE_TAG") + if [ "$FULL_LOG" = true ]; then + CMD+=("--progress=plain") + fi + if [ "$EXP_MXFP4" = true ]; then echo "Building with experimental MXFP4 support..." CMD+=("-f" "Dockerfile.mxfp4") diff --git a/flashinfer_cache.patch b/flashinfer_cache.patch new file mode 100644 index 0000000..081b2d8 --- /dev/null +++ b/flashinfer_cache.patch @@ -0,0 +1,18 @@ +--- a/flashinfer/artifacts.py ++++ b/flashinfer/artifacts.py +@@ -203,9 +203,13 @@ + with ThreadPoolExecutor(num_threads) as pool: + futures = [] +- for name, _ in cubin_files: +- source = safe_urljoin(FLASHINFER_CUBINS_REPOSITORY, name) +- local_path = FLASHINFER_CUBIN_DIR / name ++ for name, checksum in cubin_files: ++ local_path = FLASHINFER_CUBIN_DIR / name ++ if local_path.exists() and verify_cubin(str(local_path), checksum): ++ pbar.update(1) ++ continue ++ ++ source = safe_urljoin(FLASHINFER_CUBINS_REPOSITORY, name) + # Ensure parent directory exists + local_path.parent.mkdir(parents=True, exist_ok=True) + fut = pool.submit(