diff --git a/Dockerfile b/Dockerfile index b15b439..c9465db 100644 --- a/Dockerfile +++ b/Dockerfile @@ -71,11 +71,13 @@ RUN --mount=type=cache,id=pip-cache,target=/root/.cache/pip \ RUN --mount=type=cache,id=pip-cache,target=/root/.cache/pip \ pip install xgrammar fastsafetensors +ARG FLASHINFER_PRE="" + # Install FlashInfer packages RUN --mount=type=cache,id=pip-cache,target=/root/.cache/pip \ - pip install flashinfer-python --no-deps --index-url https://flashinfer.ai/whl --pre && \ - pip install flashinfer-cubin --index-url https://flashinfer.ai/whl --pre && \ - pip install flashinfer-jit-cache --index-url https://flashinfer.ai/whl/cu130 --pre && \ + pip install ${FLASHINFER_PRE} flashinfer-python --no-deps --index-url https://flashinfer.ai/whl && \ + pip install ${FLASHINFER_PRE} flashinfer-cubin --index-url https://flashinfer.ai/whl && \ + pip install ${FLASHINFER_PRE} flashinfer-jit-cache --index-url https://flashinfer.ai/whl/cu130 && \ pip install apache-tvm-ffi nvidia-cudnn-frontend nvidia-cutlass-dsl nvidia-ml-py tabulate # ========================================================= diff --git a/Dockerfile.wheels b/Dockerfile.wheels index da6ed1f..4dbc20d 100644 --- a/Dockerfile.wheels +++ b/Dockerfile.wheels @@ -61,11 +61,13 @@ RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \ # Apply in site-packages RUN patch -p1 -d /usr/local/lib/python3.12/dist-packages < ${VLLM_BASE_DIR}/fastsafetensors.patch +ARG FLASHINFER_PRE="" + # Install flashinfer helper packages RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \ - uv pip install --system --break-system-packages flashinfer-python -U --no-deps --index-url https://flashinfer.ai/whl && \ - uv pip install --system --break-system-packages flashinfer-cubin --index-url https://flashinfer.ai/whl && \ - uv pip install --system --break-system-packages flashinfer-jit-cache --index-url https://flashinfer.ai/whl/cu130 + uv pip install --system --break-system-packages ${FLASHINFER_PRE} flashinfer-python -U 
--no-deps --index-url https://flashinfer.ai/whl && \ + uv pip install --system --break-system-packages ${FLASHINFER_PRE} flashinfer-cubin --index-url https://flashinfer.ai/whl && \ + uv pip install --system --break-system-packages ${FLASHINFER_PRE} flashinfer-jit-cache --index-url https://flashinfer.ai/whl/cu130 # Setup Env for Runtime ENV TORCH_CUDA_ARCH_LIST=12.1a diff --git a/README.md b/README.md index dbac2b1..fdbbc44 100644 --- a/README.md +++ b/README.md @@ -26,6 +26,8 @@ The Dockerfile builds from the main branch of VLLM, so depending on when you run ### 2025-12-20 +- Limited ccache to 50G when building from source to reduce build cache size. +- Added `--pre-flashinfer` flag to `build-and-copy.sh` to use pre-release versions of FlashInfer. - Added `--use-wheels [mode]` flag to `build-and-copy.sh`. - Allows building the container using pre-built vLLM wheels instead of compiling from source. - The resulting Docker container size is reduced considerably (14GB vs 24GB) @@ -187,6 +189,7 @@ Using a different username: | `--triton-ref <ref>` | Triton commit SHA, branch or tag (default: 'v3.5.1') | | `--vllm-ref <ref>` | vLLM commit SHA, branch or tag (default: 'main') | | `--use-wheels [mode]` | Use pre-built vLLM wheels. Mode: `nightly` (default) or `release`. | +| `--pre-flashinfer` | Use pre-release versions of FlashInfer. | | `-c, --copy-to <hosts>` | Host(s) to copy the image to after building (space- or comma-separated list after the flag). | | `--copy-to-host` | Alias for `--copy-to` (backwards compatibility). | | `--copy-parallel` | Copy to all specified hosts concurrently. 
| diff --git a/build-and-copy.sh b/build-and-copy.sh index eb26cc5..01b6d16 100755 --- a/build-and-copy.sh +++ b/build-and-copy.sh @@ -16,6 +16,7 @@ VLLM_REF="main" TMP_IMAGE="" PARALLEL_COPY=false USE_WHEELS_MODE="" +PRE_FLASHINFER=false cleanup() { if [ -n "$TMP_IMAGE" ] && [ -f "$TMP_IMAGE" ]; then @@ -69,6 +70,7 @@ usage() { echo " -j, --build-jobs : Number of concurrent build jobs (default: \${BUILD_JOBS})" echo " -u, --user : Username for ssh command (default: \$USER)" echo " --use-wheels [mode] : Use prebuilt vLLM wheels. Mode can be 'nightly' (default) or 'release'." + echo " --pre-flashinfer : Use pre-release versions of FlashInfer" echo " --no-build : Skip building, only copy image (requires --copy-to)" echo " -h, --help : Show this help message" exit 1 @@ -129,6 +131,7 @@ while [[ "$#" -gt 0 ]]; do USE_WHEELS_MODE="nightly" fi ;; + --pre-flashinfer) PRE_FLASHINFER=true ;; --no-build) NO_BUILD=true ;; -h|--help) usage ;; *) echo "Unknown parameter passed: $1"; usage ;; @@ -181,6 +184,11 @@ if [ "$NO_BUILD" = false ]; then # Add BUILD_JOBS to build arguments CMD+=("--build-arg" "BUILD_JOBS=$BUILD_JOBS") + if [ "$PRE_FLASHINFER" = true ]; then + echo "Using pre-release FlashInfer..." + CMD+=("--build-arg" "FLASHINFER_PRE=--pre") + fi + # Add build context CMD+=(".")