# syntax=docker/dockerfile:1.6

# =========================================================
# STAGE 1: Base Image (Installs Dependencies)
# =========================================================
# Everything is installed on top of the NVIDIA PyTorch image in this stage.
FROM nvcr.io/nvidia/pytorch:26.01-py3 AS base

# Non-interactive apt, permission for pip to touch the system Python, and the
# directory that the rest of this file uses as the vLLM workspace.
ENV DEBIAN_FRONTEND=noninteractive \
    PIP_BREAK_SYSTEM_PACKAGES=1 \
    VLLM_BASE_DIR=/workspace/vllm

# Limit build parallelism to reduce OOM situations.
# Exported as ENV (not only ARG) in case some JIT compilation happens at runtime.
ARG BUILD_JOBS=16
ENV MAX_JOBS=${BUILD_JOBS} \
    CMAKE_BUILD_PARALLEL_LEVEL=${BUILD_JOBS} \
    NINJAFLAGS="-j${BUILD_JOBS}" \
    MAKEFLAGS="-j${BUILD_JOBS}"

# pip / uv cache directories and uv behaviour switches.
ENV PIP_CACHE_DIR=/root/.cache/pip \
    UV_CACHE_DIR=/root/.cache/uv \
    UV_SYSTEM_PYTHON=1 \
    UV_LINK_MODE=copy \
    UV_BREAK_SYSTEM_PACKAGES=1

# Install runtime dependencies.
# - apt-get is used instead of apt: the apt front-end is meant for interactive
#   use and warns that its CLI is not stable for scripts.
# - jq and wget are listed explicitly because later steps in this file call
#   them (release-tag lookup and tiktoken download) and nothing else here
#   installs them.
# - uv is installed for the `uv pip` steps that follow; flash-attn is removed
#   if present (reason not recorded in this file - TODO document).
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        curl \
        git \
        jq \
        libxcb1 \
        vim \
        wget \
    && rm -rf /var/lib/apt/lists/* \
    && pip install uv \
    && pip uninstall -y flash-attn

# Working directory for the remaining steps of this stage.
WORKDIR $VLLM_BASE_DIR

# Fetch the tiktoken encoding files into ./tiktoken_encodings.
# `set -e` aborts the step as soon as any command fails.
RUN set -e; \
    mkdir -p tiktoken_encodings; \
    wget -O tiktoken_encodings/o200k_base.tiktoken "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken"; \
    wget -O tiktoken_encodings/cl100k_base.tiktoken "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken"

# Stage the TEMPORARY PATCH for fastsafetensors loading in cluster setup early
# so it is available when applied later in this stage.
# Tracking: https://github.com/foundation-model-stack/fastsafetensors/issues/36
COPY fastsafetensors.patch .

# Install fastsafetensors, ray and a pinned apache-tvm-ffi.
# "ray[default]" is quoted so the shell cannot treat the brackets as a glob.
RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
    uv pip install -U fastsafetensors "ray[default]" "apache-tvm-ffi<0.2"

# --- VLLM SOURCE CACHE BUSTER ---
# Change THIS argument to force the vLLM install below to run again
# without re-installing the dependencies above.
ARG CACHEBUST_VLLM=1
ARG WHEELS_FROM_GITHUB_RELEASE=0

# Install vLLM.
# If WHEELS_FROM_GITHUB_RELEASE is 1, install the wheel attached to the latest
# GitHub release (aarch64 / cu130 wheel name is hard-coded below).
# Otherwise, install from the nightly wheel index.
#
# The release tag is looked up with curl + jq. `export VAR=$(...)` always
# succeeds even when the pipeline inside fails, so the result is checked
# explicitly: an empty value or the literal "null" (jq's output when the
# field is missing) aborts the build instead of producing a malformed URL.
RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
    if [ "$WHEELS_FROM_GITHUB_RELEASE" = "1" ]; then \
        export VLLM_VERSION=$(curl -fsSL https://api.github.com/repos/vllm-project/vllm/releases/latest | jq -r .tag_name | sed 's/^v//') && \
        case "$VLLM_VERSION" in \
            ""|null) echo "ERROR: could not determine the latest vLLM release tag" >&2; exit 1 ;; \
        esac && \
        uv pip install -U "https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu130-cp38-abi3-manylinux_2_35_aarch64.whl" --torch-backend=auto; \
    else \
        uv pip install -U vllm \
            --torch-backend=auto \
            --extra-index-url https://wheels.vllm.ai/nightly/cu130; \
    fi

# TEMPORARY PATCH for fastsafetensors loading in cluster setup.
# Tracking: https://github.com/foundation-model-stack/fastsafetensors/issues/36
# The patch is applied inside the Python 3.12 dist-packages directory, with one
# leading path component stripped from the file names in the patch.
RUN patch --strip=1 \
        --directory=/usr/local/lib/python3.12/dist-packages \
        < "${VLLM_BASE_DIR}/fastsafetensors.patch"

# Extra argument(s) inserted into every flashinfer install command below;
# empty by default (presumably meant for something like --pre - confirm).
ARG FLASHINFER_PRE=""

# Install the flashinfer packages from the flashinfer wheel index:
#   flashinfer-python     upgraded, without pulling its dependencies
#   flashinfer-cubin      from the generic index
#   flashinfer-jit-cache  from the cu130 index
RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
    set -e; \
    uv pip install ${FLASHINFER_PRE} -U --no-deps --index-url https://flashinfer.ai/whl flashinfer-python; \
    uv pip install ${FLASHINFER_PRE} --index-url https://flashinfer.ai/whl flashinfer-cubin; \
    uv pip install ${FLASHINFER_PRE} --index-url https://flashinfer.ai/whl/cu130 flashinfer-jit-cache

# Set PRE_TRANSFORMERS=1 to upgrade transformers to a pre-release and then pin
# numpy to 2.2.6. With the default (0) this step does nothing.
ARG PRE_TRANSFORMERS=0

# The two installs are chained with && so that a failed transformers install
# fails the build instead of being masked by the numpy install that follows.
RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
    if [ "$PRE_TRANSFORMERS" = "1" ]; then \
        uv pip install -U transformers --pre && \
        uv pip install numpy==2.2.6; \
    fi

# Environment for runtime: CUDA arch lists for torch and flashinfer, the ptxas
# binary Triton should use, and the location of the tiktoken files downloaded
# earlier in this stage.
ENV TORCH_CUDA_ARCH_LIST=12.1a \
    FLASHINFER_CUDA_ARCH_LIST="12.1f" \
    TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas \
    TIKTOKEN_ENCODINGS_BASE=$VLLM_BASE_DIR/tiktoken_encodings

# Add the cluster node launcher script and make it executable.
COPY run-cluster-node.sh $VLLM_BASE_DIR/
RUN chmod +x $VLLM_BASE_DIR/run-cluster-node.sh

# Cleanup

# Remove triton-kernels as they are not compatible with this vLLM version yet
RUN uv pip uninstall triton-kernels

# Remove unneeded packages to reduce image size.
# Each package is listed once (the previous list repeated
# jupyterlab_code_formatter and matplotlib-inline).
RUN uv pip uninstall absl-py apex argon2-cffi \
    argon2-cffi-bindings arrow asttokens astunparse async-lru audioread babel beautifulsoup4 \
    black bleach comm contourpy cycler datasets debugpy decorator defusedxml dllist dm-tree \
    execnet executing expecttest fastjsonschema fonttools fqdn gast hypothesis \
    ipykernel ipython ipython_pygments_lexers isoduration isort jedi joblib jupyter-events \
    jupyter-lsp jupyter_client jupyter_core jupyter_server jupyter_server_terminals jupyterlab \
    jupyterlab_code_formatter jupyterlab_pygments jupyterlab_server \
    jupyterlab_tensorboard_pro jupytext kiwisolver matplotlib matplotlib-inline \
    mistune ml_dtypes mock nbclient nbconvert nbformat nest-asyncio notebook notebook_shim \
    opt_einsum optree outlines_core overrides pandas pandocfilters parso pexpect polygraphy pooch \
    pyarrow pycocotools pytest-flakefinder pytest-rerunfailures pytest-shard pytest-xdist \
    scikit-learn scipy Send2Trash soundfile soupsieve soxr spin stack-data \
    wcwidth webcolors xdoctest Werkzeug

# Final build
# Starts from an empty image and copies the whole filesystem of the base stage
# into a single layer. Image configuration (ENV, WORKDIR, CMD) is not carried
# over by COPY, so it is re-declared explicitly below.
FROM scratch

# 1. Copy everything from build stage except for the deleted files
COPY --from=base / /

# 2. Restore NVIDIA container environment variables
ENV PATH=/usr/local/lib/python3.12/dist-packages/torch_tensorrt/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/mpi/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/ucx/bin:/opt/amazon/efa/bin:/opt/tensorrt/bin
ENV GDRCOPY_VERSION=2.5.1
ENV HPCX_VERSION=2.25.1-RC2
ENV MOFED_VERSION=5.4-rdmacore56.0
ENV OPENUCX_VERSION=1.20.0
ENV OPENMPI_VERSION=4.1.7
ENV RDMACORE_VERSION=56.0
ENV EFA_VERSION=1.43.1
ENV AWS_OFI_NCCL_VERSION=1.17.0
ENV OPAL_PREFIX=/opt/hpcx/ompi
ENV OMPI_MCA_coll_hcoll_enable=0
ENV CUDA_VERSION=13.1.1.006
ENV CUDA_DRIVER_VERSION=590.48.01
ENV NVVM_VERSION=13.1.115
ENV DOCA_VERSION=3.1.0
ENV _CUDA_COMPAT_PATH=/usr/local/cuda/compat
ENV ENV=/etc/shinit_v2
ENV BASH_ENV=/etc/bash.bashrc
ENV SHELL=/bin/bash
ENV NVIDIA_REQUIRE_CUDA=cuda>=9.0
ENV NCCL_VERSION=2.29.stable.20260109
ENV CUBLAS_VERSION=13.2.1.1
ENV CUFFT_VERSION=12.1.0.78
ENV CURAND_VERSION=10.4.1.81
ENV CUSPARSE_VERSION=12.7.3.1
ENV CUSPARSELT_VERSION=0.8.1.1
ENV CUSOLVER_VERSION=12.0.9.81
ENV NPP_VERSION=13.0.3.3
ENV NVJPEG_VERSION=13.0.3.75
ENV CUFILE_VERSION=1.16.1.26
ENV NVJITLINK_VERSION=13.1.115
ENV NVFATBIN_VERSION=13.1.115
ENV CUBLASMP_VERSION=0.7.0.125
ENV NVSHMEM_VERSION=3.4.5
ENV CUDLA_VERSION=13.1.1.006
ENV NVPTXCOMPILER_VERSION=13.1.115
ENV CUDNN_VERSION=9.17.1.4
ENV CUDNN_FRONTEND_VERSION=1.17.0
ENV TRT_VERSION=10.14.1.48+cuda13.0
ENV TRTOSS_VERSION=
ENV NSIGHT_SYSTEMS_VERSION=2025.6.1.190
ENV NSIGHT_COMPUTE_VERSION=2025.4.1.2
ENV DALI_VERSION=1.53.0
ENV DALI_BUILD=
ENV DALI_URL_SUFFIX=130
ENV POLYGRAPHY_VERSION=0.49.26
ENV TRANSFORMER_ENGINE_VERSION=2.11
ENV MODEL_OPT_VERSION=0.40.0
ENV CUDA_ARCH_LIST="12.0 12.1"
ENV MAXSMVER=121
ENV NVRX_VERSION=0.5.0
ENV LD_LIBRARY_PATH=/usr/local/lib/python3.12/dist-packages/torch/lib:/usr/local/lib/python3.12/dist-packages/torch_tensorrt/lib:/usr/local/cuda/compat/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
ENV NVIDIA_VISIBLE_DEVICES=all
ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility,video
ENV NVIDIA_PRODUCT_NAME=PyTorch
ENV CUDA_COMPONENT_LIST="cccl crt nvrtc driver-dev culibos-dev cudart cudart-dev nvcc tileiras"
ENV LIBRARY_PATH=/usr/local/cuda/lib64/stubs:/usr/local/cuda/lib64/stubs:
ENV PYTORCH_BUILD_VERSION=2.10.0a0+a36e1d3
ENV PYTORCH_VERSION=2.10.0a0+a36e1d3
ENV PYTORCH_BUILD_NUMBER=0
ENV NVIDIA_PYTORCH_VERSION=26.01
ENV NVFUSER_BUILD_VERSION=5d8efce
ENV NVFUSER_VERSION=5d8efce
ENV TORCHAO_BUILD_VERSION=+git1272f3cf
ENV TORCHTITAN_BUILD_VERSION=0.2.0+gite98ae995
ENV PIP_BREAK_SYSTEM_PACKAGES=1
ENV PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
ENV PIP_CONSTRAINT=/etc/pip/constraint.txt
ENV NVPL_LAPACK_MATH_MODE=PEDANTIC
ENV PYTHONIOENCODING=utf-8
ENV LC_ALL=C.UTF-8
ENV PIP_DEFAULT_TIMEOUT=100
ENV JUPYTER_PORT=8888
ENV TENSORBOARD_PORT=6006
ENV UCC_CL_BASIC_TLS=^sharp
ENV UCC_EC_CUDA_EXEC_NUM_THREADS=256
ENV TORCH_CUDA_ARCH_LIST=12.1a
ENV PYTORCH_HOME=/opt/pytorch/pytorch
ENV CUDA_HOME=/usr/local/cuda
ENV TORCH_ALLOW_TF32_CUBLAS_OVERRIDE=1
ENV TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas
ENV TRITON_CUOBJDUMP_PATH=/usr/local/cuda/bin/cuobjdump
ENV TRITON_NVDISASM_PATH=/usr/local/cuda/bin/nvdisasm
ENV TRITON_CUDACRT_PATH=/usr/local/cuda/include
ENV TRITON_CUDART_PATH=/usr/local/cuda/include
ENV TRITON_CUPTI_LIB_PATH=/usr/local/cuda/lib64
ENV TRITON_CUPTI_INCLUDE_PATH=/usr/local/cuda/include
ENV COCOAPI_VERSION=2.0+nv0.8.1
ENV CUDA_BINARY_LOADER_THREAD_COUNT=8
ENV CUDA_MODULE_LOADING=LAZY
ENV TORCH_NCCL_USE_COMM_NONBLOCKING=0
ENV TORCHINDUCTOR_LOOP_ORDERING_AFTER_FUSION=0
ENV NVIDIA_BUILD_ID=256811084

# 3. Restore the variables set by the base stage of this file
ENV DEBIAN_FRONTEND=noninteractive
ENV VLLM_BASE_DIR=/workspace/vllm

# ARGs are scoped per stage, so BUILD_JOBS is re-declared here with the same
# default as in the base stage. A --build-arg BUILD_JOBS=<n> now reaches the
# final image as well instead of always being reset to 16.
ARG BUILD_JOBS=16
ENV MAX_JOBS=${BUILD_JOBS}
ENV CMAKE_BUILD_PARALLEL_LEVEL=${BUILD_JOBS}
ENV NINJAFLAGS=-j${BUILD_JOBS}
ENV MAKEFLAGS=-j${BUILD_JOBS}

ENV PIP_CACHE_DIR=/root/.cache/pip
ENV UV_CACHE_DIR=/root/.cache/uv
ENV UV_SYSTEM_PYTHON=1
ENV UV_LINK_MODE=copy
ENV UV_BREAK_SYSTEM_PACKAGES=1
ENV FLASHINFER_CUDA_ARCH_LIST=12.1f
ENV TIKTOKEN_ENCODINGS_BASE=/workspace/vllm/tiktoken_encodings

# The WORKDIR chosen in the base stage is image metadata and does not survive
# FROM scratch; set it again so containers start in the vLLM workspace.
WORKDIR ${VLLM_BASE_DIR}

CMD ["/bin/bash"]