83 lines
3.3 KiB
Docker
83 lines
3.3 KiB
Docker
# syntax=docker/dockerfile:1.6
|
|
|
|
# Base image: CUDA 13.1.0 "devel" flavour on Ubuntu 24.04.
FROM nvidia/cuda:13.1.0-devel-ubuntu24.04

# Build-time only: keeps apt/debconf from prompting during image builds.
# Declared as ARG instead of ENV so the setting is visible to the RUN steps
# of this build but is not baked into the environment of running containers.
ARG DEBIAN_FRONTEND=noninteractive

# PIP_BREAK_SYSTEM_PACKAGES=1 lets pip install into the distro-managed Python.
# VLLM_BASE_DIR is the directory later instructions use as WORKDIR and as the
# destination for copied files.
ENV PIP_BREAK_SYSTEM_PACKAGES=1 \
    VLLM_BASE_DIR=/workspace/vllm
|
|
|
|
# Cap build parallelism to reduce the chance of out-of-memory failures.
# The limits are exported as ENV (not just ARG) so they also apply if some
# JIT compilation happens at container runtime.
# Override at build time with: --build-arg BUILD_JOBS=<n>
ARG BUILD_JOBS=16
ENV MAX_JOBS=${BUILD_JOBS} \
    CMAKE_BUILD_PARALLEL_LEVEL=${BUILD_JOBS} \
    NINJAFLAGS="-j${BUILD_JOBS}" \
    MAKEFLAGS="-j${BUILD_JOBS}"
|
|
|
|
# Package-manager configuration:
#   PIP_CACHE_DIR / UV_CACHE_DIR  - where pip and uv keep their caches
#   UV_SYSTEM_PYTHON=1            - uv installs into the system interpreter
#   UV_LINK_MODE=copy             - uv copies files out of its cache rather
#                                   than linking them
ENV PIP_CACHE_DIR=/root/.cache/pip \
    UV_CACHE_DIR=/root/.cache/uv \
    UV_SYSTEM_PYTHON=1 \
    UV_LINK_MODE=copy
|
|
|
|
# Install OS-level dependencies: Python 3 (interpreter, pip, headers), basic
# CLI tools, cuDNN 9 for CUDA 13, NCCL and the RDMA / InfiniBand verbs
# libraries. No standalone compiler toolchain (cmake/gcc) is installed by
# this step.
#
# Notes:
# - apt-get is used instead of apt: apt's command-line interface is meant for
#   interactive use and is not guaranteed stable for scripts.
#   `apt-get upgrade --with-new-pkgs` matches what `apt upgrade` does.
# - --allow-change-held-packages lets the install change packages that are
#   marked as held (NOTE(review): presumably the CUDA/NCCL packages pinned by
#   the base image - confirm).
# - The apt lists are removed in the same layer so they do not persist.
# - uv is installed with --no-cache-dir: PIP_CACHE_DIR points inside the image
#   and this step has no cache mount, so pip's download cache would otherwise
#   be stored in the layer.
RUN apt-get update \
 && apt-get upgrade -y --with-new-pkgs \
 && apt-get install -y --allow-change-held-packages --no-install-recommends \
        curl \
        git \
        libcudnn9-cuda-13 \
        libibverbs-dev \
        libibverbs1 \
        libnccl-dev \
        libnccl2 \
        python3 \
        python3-dev \
        python3-pip \
        rdma-core \
        vim \
        wget \
 && rm -rf /var/lib/apt/lists/* \
 && pip install --no-cache-dir uv
|
|
|
|
# Every relative path in the instructions below resolves against the vLLM
# base directory.
WORKDIR ${VLLM_BASE_DIR}

# Fetch the o200k_base and cl100k_base tiktoken encoding files into
# ./tiktoken_encodings so they are available inside the image.
RUN mkdir -p tiktoken_encodings \
 && wget --output-document=tiktoken_encodings/o200k_base.tiktoken \
        "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken" \
 && wget --output-document=tiktoken_encodings/cl100k_base.tiktoken \
        "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken"
|
|
|
|
# Stage the TEMPORARY fastsafetensors patch (cluster-setup loading fix) in the
# working directory; a later step applies it to the installed packages.
# Tracking: https://github.com/foundation-model-stack/fastsafetensors/issues/36
COPY fastsafetensors.patch ./

# Install fastsafetensors, upgrading it if a version is already present.
# The uv cache is a BuildKit cache mount, so it is reused across builds and
# never written into an image layer.
RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
    uv pip install \
        --system \
        --break-system-packages \
        --upgrade \
        fastsafetensors
|
|
|
|
# --- VLLM CACHE BUSTER ---
# Change THIS argument (e.g. --build-arg CACHEBUST_VLLM=<new value>) to force
# the vLLM wheel install below to run again without re-installing the
# dependencies above. Build args are passed implicitly to every following RUN
# step as environment variables, so a new value invalidates the cache from the
# next RUN onward.
ARG CACHEBUST_VLLM=1

# Extra package index that serves the prebuilt nightly vLLM wheels.
ARG VLLM_WHEELS_URL=https://wheels.vllm.ai/nightly/cu130

# Install the nightly vLLM build from prebuilt wheels.
# The index URL is quoted so an overridden value containing shell
# metacharacters (spaces, ?, *) is passed to uv as a single argument.
RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
    uv pip install --system --break-system-packages -U vllm \
        --torch-backend=auto \
        --extra-index-url "${VLLM_WHEELS_URL}"
|
|
|
|
# Apply the TEMPORARY fastsafetensors patch (cluster-setup loading fix) to the
# installed packages in site-packages.
# Tracking: https://github.com/foundation-model-stack/fastsafetensors/issues/36
# NOTE(review): the target path hard-codes python3.12 - confirm it matches the
# system Python whenever the base image changes.
RUN patch \
        --strip=1 \
        --directory=/usr/local/lib/python3.12/dist-packages \
        --input=${VLLM_BASE_DIR}/fastsafetensors.patch
|
|
|
|
# Install the flashinfer packages (pre-releases allowed) from the flashinfer
# wheel index:
#   flashinfer-python    - upgraded, without pulling in its dependencies
#   flashinfer-cubin     - from the generic index
#   flashinfer-jit-cache - from the cu130-specific index
# `set -e` stops at the first failing command, so a failed install fails the
# build step.
RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv <<EOF
set -e
uv pip install --system --break-system-packages flashinfer-python -U --no-deps --pre --index-url https://flashinfer.ai/whl
uv pip install --system --break-system-packages flashinfer-cubin --pre --index-url https://flashinfer.ai/whl
uv pip install --system --break-system-packages flashinfer-jit-cache --pre --index-url https://flashinfer.ai/whl/cu130
EOF
|
|
|
|
# Runtime environment:
#   TORCH_CUDA_ARCH_LIST    - CUDA architecture list used for any runtime
#                             compilation (12.1a)
#   TRITON_PTXAS_PATH       - points Triton at the ptxas binary under
#                             /usr/local/cuda
#   TIKTOKEN_ENCODINGS_BASE - directory holding the tiktoken encoding files
#                             downloaded earlier in this build
ENV TORCH_CUDA_ARCH_LIST=12.1a \
    TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas \
    TIKTOKEN_ENCODINGS_BASE=${VLLM_BASE_DIR}/tiktoken_encodings
|
|
|
|
# Copy the cluster node launcher into the vLLM base directory and make it
# executable in the same step. COPY --chmod avoids the extra layer that a
# follow-up `RUN chmod +x` would create (that layer stores a second copy of
# the file). Note: the mode is set to exactly 0755 rather than adding execute
# bits to whatever mode the file has in the build context.
COPY --chmod=0755 run-cluster-node.sh ${VLLM_BASE_DIR}/
|
|
|
|
# Final extra deps: Ray with its "default" extra.
# The requirement is quoted because square brackets are a glob pattern in the
# shell; unquoted, `ray[default]` would be replaced by a matching file name
# (e.g. `rayd`) if one existed in the working directory.
RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
    uv pip install --system --break-system-packages "ray[default]"
|
|
|