# syntax=docker/dockerfile:1.6
FROM nvidia/cuda:13.1.1-devel-ubuntu24.04

ENV DEBIAN_FRONTEND=noninteractive
ENV PIP_BREAK_SYSTEM_PACKAGES=1
ENV VLLM_BASE_DIR=/workspace/vllm

# Limit build parallelism to reduce OOM situations,
# in case any JIT compilation happens at runtime
ARG BUILD_JOBS=16
ENV MAX_JOBS=${BUILD_JOBS}
ENV CMAKE_BUILD_PARALLEL_LEVEL=${BUILD_JOBS}
ENV NINJAFLAGS="-j${BUILD_JOBS}"
ENV MAKEFLAGS="-j${BUILD_JOBS}"

# Set pip/uv cache directories
ENV PIP_CACHE_DIR=/root/.cache/pip
ENV UV_CACHE_DIR=/root/.cache/uv
ENV UV_SYSTEM_PYTHON=1
ENV UV_LINK_MODE=copy
ENV UV_BREAK_SYSTEM_PACKAGES=1

# Install minimal runtime dependencies (NCCL, Python)
# Note: "devel" tools like cmake/gcc are NOT installed here to save space
RUN apt update && apt upgrade -y \
    && apt install -y --allow-change-held-packages --no-install-recommends \
        python3 python3-pip python3-dev vim curl git wget jq \
        libcudnn9-cuda-13 \
        libnccl-dev libnccl2 libibverbs1 libibverbs-dev rdma-core \
        libxcb1 \
    && rm -rf /var/lib/apt/lists/* \
    && pip install uv

# Set final working directory
WORKDIR $VLLM_BASE_DIR

# Download tiktoken encoding files
RUN mkdir -p tiktoken_encodings && \
    wget -O tiktoken_encodings/o200k_base.tiktoken "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken" && \
    wget -O tiktoken_encodings/cl100k_base.tiktoken "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken"

# TEMPORARY PATCH for fastsafetensors loading in cluster setup - tracking https://github.com/vllm-project/vllm/issues/34180
# COPY fastsafetensors.patch .

# Install fastsafetensors
RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
    uv pip install -U fastsafetensors

# --- VLLM SOURCE CACHE BUSTER ---
# Change THIS argument to force a fresh download and install of vLLM
# without re-installing the dependencies above.
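# Example (illustrative; the image tag is an assumption). Any change to the
# ARG value invalidates the layer cache from the declaration below onward:
#   docker build --build-arg CACHEBUST_VLLM=$(date +%s) -t vllm-cluster-node .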
ARG CACHEBUST_VLLM=1
ARG WHEELS_FROM_GITHUB_RELEASE=0

# Install vLLM
# If WHEELS_FROM_GITHUB_RELEASE is 1, install from GitHub releases (specific for aarch64/cu130 as requested)
# Otherwise, install from nightly wheels
RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
    if [ "$WHEELS_FROM_GITHUB_RELEASE" = "1" ]; then \
        export VLLM_VERSION=$(curl -s https://api.github.com/repos/vllm-project/vllm/releases/latest | jq -r .tag_name | sed 's/^v//') && \
        uv pip install -U "https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu130-cp38-abi3-manylinux_2_35_aarch64.whl" --torch-backend=auto; \
    else \
        uv pip install -U vllm \
            --torch-backend=auto \
            --extra-index-url https://wheels.vllm.ai/nightly/cu130; \
    fi

# Apply TEMPORARY PATCH for fastsafetensors loading in cluster setup - tracking https://github.com/vllm-project/vllm/issues/34180
# Apply in site-packages
# RUN if patch -p1 --dry-run --reverse -d /usr/local/lib/python3.12/dist-packages < ${VLLM_BASE_DIR}/fastsafetensors.patch &>/dev/null; then \
#         echo "PR #34180 is already applied"; \
#     else \
#         patch -p1 -d /usr/local/lib/python3.12/dist-packages < ${VLLM_BASE_DIR}/fastsafetensors.patch; \
#     fi

# Extra flags passed through to the FlashInfer installs below
# (e.g. "--pre" to allow pre-release wheels)
ARG FLASHINFER_PRE=""

# Install FlashInfer helper packages
RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
    uv pip install ${FLASHINFER_PRE} flashinfer-python -U --no-deps --index-url https://flashinfer.ai/whl && \
    uv pip install ${FLASHINFER_PRE} flashinfer-cubin --index-url https://flashinfer.ai/whl && \
    uv pip install ${FLASHINFER_PRE} flashinfer-jit-cache --index-url https://flashinfer.ai/whl/cu130

# Optionally install pre-release transformers (with a numpy pin)
ARG PRE_TRANSFORMERS=0
RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
    if [ "$PRE_TRANSFORMERS" = "1" ]; then \
        uv pip install -U transformers --pre; \
        uv pip install numpy==2.2.6; \
    fi

# Set up environment for runtime
ENV TORCH_CUDA_ARCH_LIST=12.1a
ENV TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas
ENV TIKTOKEN_ENCODINGS_BASE=$VLLM_BASE_DIR/tiktoken_encodings

# Copy scripts
COPY run-cluster-node.sh $VLLM_BASE_DIR/
RUN chmod +x $VLLM_BASE_DIR/run-cluster-node.sh

# Final extra deps
RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
    uv pip install "ray[default]"
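# Build/run sketch (illustrative; the image tag, run flags, and the script's
# argument interface are assumptions - check run-cluster-node.sh for the
# actual invocation):
#   docker build \
#     --build-arg BUILD_JOBS=8 \
#     --build-arg WHEELS_FROM_GITHUB_RELEASE=1 \
#     --build-arg PRE_TRANSFORMERS=1 \
#     -t vllm-cluster-node .
#   docker run --gpus all --network host --ipc host \
#     vllm-cluster-node /workspace/vllm/run-cluster-node.sh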