Add support for pre-release FlashInfer packages in Docker builds
This commit is contained in:
@@ -71,11 +71,13 @@ RUN --mount=type=cache,id=pip-cache,target=/root/.cache/pip \
|
|||||||
RUN --mount=type=cache,id=pip-cache,target=/root/.cache/pip \
|
RUN --mount=type=cache,id=pip-cache,target=/root/.cache/pip \
|
||||||
pip install xgrammar fastsafetensors
|
pip install xgrammar fastsafetensors
|
||||||
|
|
||||||
|
ARG FLASHINFER_PRE=""
|
||||||
|
|
||||||
# Install FlashInfer packages
|
# Install FlashInfer packages
|
||||||
RUN --mount=type=cache,id=pip-cache,target=/root/.cache/pip \
|
RUN --mount=type=cache,id=pip-cache,target=/root/.cache/pip \
|
||||||
pip install flashinfer-python --no-deps --index-url https://flashinfer.ai/whl --pre && \
|
pip install ${FLASHINFER_PRE} flashinfer-python --no-deps --index-url https://flashinfer.ai/whl && \
|
||||||
pip install flashinfer-cubin --index-url https://flashinfer.ai/whl --pre && \
|
pip install ${FLASHINFER_PRE} flashinfer-cubin --index-url https://flashinfer.ai/whl && \
|
||||||
pip install flashinfer-jit-cache --index-url https://flashinfer.ai/whl/cu130 --pre && \
|
pip install ${FLASHINFER_PRE} flashinfer-jit-cache --index-url https://flashinfer.ai/whl/cu130 && \
|
||||||
pip install apache-tvm-ffi nvidia-cudnn-frontend nvidia-cutlass-dsl nvidia-ml-py tabulate
|
pip install apache-tvm-ffi nvidia-cudnn-frontend nvidia-cutlass-dsl nvidia-ml-py tabulate
|
||||||
|
|
||||||
# =========================================================
|
# =========================================================
|
||||||
|
|||||||
@@ -61,11 +61,13 @@ RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
|
|||||||
# Apply in site-packages
|
# Apply in site-packages
|
||||||
RUN patch -p1 -d /usr/local/lib/python3.12/dist-packages < ${VLLM_BASE_DIR}/fastsafetensors.patch
|
RUN patch -p1 -d /usr/local/lib/python3.12/dist-packages < ${VLLM_BASE_DIR}/fastsafetensors.patch
|
||||||
|
|
||||||
|
ARG FLASHINFER_PRE=""
|
||||||
|
|
||||||
# Install flashinfer helper packages
|
# Install flashinfer helper packages
|
||||||
RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
|
RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
|
||||||
uv pip install --system --break-system-packages flashinfer-python -U --no-deps --index-url https://flashinfer.ai/whl && \
|
uv pip install --system --break-system-packages ${FLASHINFER_PRE} flashinfer-python -U --no-deps --index-url https://flashinfer.ai/whl && \
|
||||||
uv pip install --system --break-system-packages flashinfer-cubin --index-url https://flashinfer.ai/whl && \
|
uv pip install --system --break-system-packages ${FLASHINFER_PRE} flashinfer-cubin --index-url https://flashinfer.ai/whl && \
|
||||||
uv pip install --system --break-system-packages flashinfer-jit-cache --index-url https://flashinfer.ai/whl/cu130
|
uv pip install --system --break-system-packages ${FLASHINFER_PRE} flashinfer-jit-cache --index-url https://flashinfer.ai/whl/cu130
|
||||||
|
|
||||||
# Setup Env for Runtime
|
# Setup Env for Runtime
|
||||||
ENV TORCH_CUDA_ARCH_LIST=12.1a
|
ENV TORCH_CUDA_ARCH_LIST=12.1a
|
||||||
|
|||||||
@@ -26,6 +26,8 @@ The Dockerfile builds from the main branch of VLLM, so depending on when you run
|
|||||||
|
|
||||||
### 2025-12-20
|
### 2025-12-20
|
||||||
|
|
||||||
|
- Limited ccache to 50G when building from source to reduce build cache size.
|
||||||
|
- Added `--pre-flashinfer` flag to `build-and-copy.sh` to use pre-release versions of FlashInfer.
|
||||||
- Added `--use-wheels [mode]` flag to `build-and-copy.sh`.
|
- Added `--use-wheels [mode]` flag to `build-and-copy.sh`.
|
||||||
- Allows building the container using pre-built vLLM wheels instead of compiling from source.
|
- Allows building the container using pre-built vLLM wheels instead of compiling from source.
|
||||||
- The resulting Docker container size is reduced considerably (14GB vs 24GB)
|
- The resulting Docker container size is reduced considerably (14GB vs 24GB)
|
||||||
@@ -187,6 +189,7 @@ Using a different username:
|
|||||||
| `--triton-ref <ref>` | Triton commit SHA, branch or tag (default: 'v3.5.1') |
|
| `--triton-ref <ref>` | Triton commit SHA, branch or tag (default: 'v3.5.1') |
|
||||||
| `--vllm-ref <ref>` | vLLM commit SHA, branch or tag (default: 'main') |
|
| `--vllm-ref <ref>` | vLLM commit SHA, branch or tag (default: 'main') |
|
||||||
| `--use-wheels [mode]` | Use pre-built vLLM wheels. Mode: `nightly` (default) or `release`. |
|
| `--use-wheels [mode]` | Use pre-built vLLM wheels. Mode: `nightly` (default) or `release`. |
|
||||||
|
| `--pre-flashinfer` | Use pre-release versions of FlashInfer. |
|
||||||
| `-c, --copy-to <host[,host...] or host host...>` | Host(s) to copy the image to after building (space- or comma-separated list after the flag). |
|
| `-c, --copy-to <host[,host...] or host host...>` | Host(s) to copy the image to after building (space- or comma-separated list after the flag). |
|
||||||
| `--copy-to-host` | Alias for `--copy-to` (backwards compatibility). |
|
| `--copy-to-host` | Alias for `--copy-to` (backwards compatibility). |
|
||||||
| `--copy-parallel` | Copy to all specified hosts concurrently. |
|
| `--copy-parallel` | Copy to all specified hosts concurrently. |
|
||||||
|
|||||||
@@ -16,6 +16,7 @@ VLLM_REF="main"
|
|||||||
TMP_IMAGE=""
|
TMP_IMAGE=""
|
||||||
PARALLEL_COPY=false
|
PARALLEL_COPY=false
|
||||||
USE_WHEELS_MODE=""
|
USE_WHEELS_MODE=""
|
||||||
|
PRE_FLASHINFER=false
|
||||||
|
|
||||||
cleanup() {
|
cleanup() {
|
||||||
if [ -n "$TMP_IMAGE" ] && [ -f "$TMP_IMAGE" ]; then
|
if [ -n "$TMP_IMAGE" ] && [ -f "$TMP_IMAGE" ]; then
|
||||||
@@ -69,6 +70,7 @@ usage() {
|
|||||||
echo " -j, --build-jobs <jobs> : Number of concurrent build jobs (default: \${BUILD_JOBS})"
|
echo " -j, --build-jobs <jobs> : Number of concurrent build jobs (default: \${BUILD_JOBS})"
|
||||||
echo " -u, --user <user> : Username for ssh command (default: \$USER)"
|
echo " -u, --user <user> : Username for ssh command (default: \$USER)"
|
||||||
echo " --use-wheels [mode] : Use prebuilt vLLM wheels. Mode can be 'nightly' (default) or 'release'."
|
echo " --use-wheels [mode] : Use prebuilt vLLM wheels. Mode can be 'nightly' (default) or 'release'."
|
||||||
|
echo " --pre-flashinfer : Use pre-release versions of FlashInfer"
|
||||||
echo " --no-build : Skip building, only copy image (requires --copy-to)"
|
echo " --no-build : Skip building, only copy image (requires --copy-to)"
|
||||||
echo " -h, --help : Show this help message"
|
echo " -h, --help : Show this help message"
|
||||||
exit 1
|
exit 1
|
||||||
@@ -129,6 +131,7 @@ while [[ "$#" -gt 0 ]]; do
|
|||||||
USE_WHEELS_MODE="nightly"
|
USE_WHEELS_MODE="nightly"
|
||||||
fi
|
fi
|
||||||
;;
|
;;
|
||||||
|
--pre-flashinfer) PRE_FLASHINFER=true ;;
|
||||||
--no-build) NO_BUILD=true ;;
|
--no-build) NO_BUILD=true ;;
|
||||||
-h|--help) usage ;;
|
-h|--help) usage ;;
|
||||||
*) echo "Unknown parameter passed: $1"; usage ;;
|
*) echo "Unknown parameter passed: $1"; usage ;;
|
||||||
@@ -181,6 +184,11 @@ if [ "$NO_BUILD" = false ]; then
|
|||||||
# Add BUILD_JOBS to build arguments
|
# Add BUILD_JOBS to build arguments
|
||||||
CMD+=("--build-arg" "BUILD_JOBS=$BUILD_JOBS")
|
CMD+=("--build-arg" "BUILD_JOBS=$BUILD_JOBS")
|
||||||
|
|
||||||
|
if [ "$PRE_FLASHINFER" = true ]; then
|
||||||
|
echo "Using pre-release FlashInfer..."
|
||||||
|
CMD+=("--build-arg" "FLASHINFER_PRE=--pre")
|
||||||
|
fi
|
||||||
|
|
||||||
# Add build context
|
# Add build context
|
||||||
CMD+=(".")
|
CMD+=(".")
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user