Merge remote-tracking branch 'upstream/main'
# Conflicts:
#	Dockerfile
Dockerfile: 56 changed lines
@@ -4,9 +4,9 @@
 ARG BUILD_JOBS=16
 
 # =========================================================
-# STAGE 1: Base Image (Installs Dependencies)
+# STAGE 1: Base Build Image
 # =========================================================
-FROM nvcr.io/nvidia/pytorch:26.01-py3 AS base
+FROM nvidia/cuda:13.2.0-devel-ubuntu24.04 AS base
 
 # Build parallemism
 ARG BUILD_JOBS
@@ -35,10 +35,18 @@ ENV VLLM_BASE_DIR=/workspace/vllm
 # Added ccache to enable incremental compilation caching
 RUN apt update && \
     apt install -y --no-install-recommends \
-    curl vim ninja-build git \
+    curl vim cmake build-essential ninja-build \
+    libcudnn9-cuda-13 libcudnn9-dev-cuda-13 \
+    python3-dev python3-pip git wget \
+    libnccl-dev libnccl2 libibverbs1 libibverbs-dev rdma-core \
     ccache \
     && rm -rf /var/lib/apt/lists/* \
-    && pip install uv && pip uninstall -y flash-attn
+    && pip install uv
 
+# Additional deps
+RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
+    uv pip install torch torchvision torchaudio triton --index-url https://download.pytorch.org/whl/nightly/cu130 && \
+    uv pip install nvidia-nvshmem-cu13 "apache-tvm-ffi<0.2" filelock pynvml requests tqdm
 
 # Configure Ccache for CUDA/C++
 ENV PATH=/usr/lib/ccache:$PATH
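Note: the hunk above routes compiler calls through /usr/lib/ccache and caches uv downloads in a build mount. As a rough sketch of how such a ccache setup is usually completed for a CUDA/C++ build (the concrete values live outside this hunk, so everything below is an assumption, not taken from the Dockerfile):

    # Illustrative ccache wiring; directory, size, and stats call are assumptions.
    export CCACHE_DIR=/root/.cache/ccache          # where cached object files live
    export CCACHE_MAXSIZE=20G                      # cap the cache size
    export CMAKE_C_COMPILER_LAUNCHER=ccache        # CMake-driven builds call gcc via ccache
    export CMAKE_CXX_COMPILER_LAUNCHER=ccache
    export CMAKE_CUDA_COMPILER_LAUNCHER=ccache     # nvcc invocations go through ccache too
    ccache --show-stats                            # inspect hit/miss counts after a rebuild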
@@ -73,9 +81,6 @@ ARG FLASHINFER_REF=main
 # Change this argument to force a re-download of FlashInfer
 ARG CACHEBUST_FLASHINFER=1
 
-RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
-    uv pip install nvidia-nvshmem-cu13 "apache-tvm-ffi<0.2"
-
 # Smart Git Clone (Fetch changes instead of full re-clone)
 RUN --mount=type=cache,id=repo-cache,target=/repo-cache \
     cd /repo-cache && \
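Note: the "Smart Git Clone" comment refers to reusing the /repo-cache mount instead of re-cloning on every build. The actual clone logic sits outside this hunk; a plausible fetch-or-clone pattern looks roughly like this (repository URL and cache path are assumptions):

    REPO_URL=https://github.com/vllm-project/vllm.git   # assumed upstream
    CACHE=/repo-cache/vllm.git
    if [ -d "$CACHE" ]; then
        git -C "$CACHE" fetch --all --prune              # update the cached mirror
    else
        git clone --mirror "$REPO_URL" "$CACHE"          # first build pays for a full clone
    fi
    git clone "$CACHE" "$VLLM_BASE_DIR"                  # cheap local clone for the build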
@@ -132,9 +137,6 @@ ARG TORCH_CUDA_ARCH_LIST="12.1a"
 ENV TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}
 WORKDIR $VLLM_BASE_DIR
 
-RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
-    uv pip install nvidia-nvshmem-cu13 "apache-tvm-ffi<0.2"
-
 # --- VLLM SOURCE CACHE BUSTER ---
 ARG CACHEBUST_VLLM=1
 
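Note: CACHEBUST_VLLM, like CACHEBUST_FLASHINFER above, only forces a rebuild when its value changes. A typical invocation is sketched below purely as an illustration; the real build is driven by build-and-copy.sh, which is not part of this diff, and the image tag is assumed:

    # A new arg value invalidates the layers after the ARG, so the vLLM sources are
    # re-fetched while the earlier dependency layers stay cached.
    docker build \
        --build-arg CACHEBUST_VLLM=$(date +%s) \
        --build-arg BUILD_JOBS=16 \
        -t vllm-node .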
@@ -211,7 +213,7 @@ COPY --from=vllm-builder /workspace/wheels /
 # =========================================================
 # STAGE 6: Runner (Installs wheels from host ./wheels/)
 # =========================================================
-FROM nvcr.io/nvidia/pytorch:26.01-py3 AS runner
+FROM nvidia/cuda:13.2.0-devel-ubuntu24.04 AS runner
 
 # Transferring build settings from build image because of ptxas/jit compilation during vLLM startup
 # Build parallemism
@@ -235,10 +237,12 @@ ENV UV_LINK_MODE=copy
 # Install runtime dependencies
 RUN apt update && \
     apt install -y --no-install-recommends \
-    curl vim git \
+    python3 python3-pip python3-dev vim curl git wget \
+    libcudnn9-cuda-13 \
+    libnccl-dev libnccl2 libibverbs1 libibverbs-dev rdma-core \
     libxcb1 \
     && rm -rf /var/lib/apt/lists/* \
-    && pip install uv && pip uninstall -y flash-attn # triton-kernels pytorch-triton
+    && pip install uv
 
 # Set final working directory
 WORKDIR $VLLM_BASE_DIR
@@ -250,6 +254,11 @@ RUN mkdir -p tiktoken_encodings && \
 
 ARG PRE_TRANSFORMERS=0
 
+# Install deps
+RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
+    uv pip install torch torchvision torchaudio triton --index-url https://download.pytorch.org/whl/nightly/cu130 && \
+    uv pip install nvidia-nvshmem-cu13 "apache-tvm-ffi<0.2"
+
 # Install wheels from host ./wheels/ (bind-mounted from build context — no layer bloat)
 # With --tf5: override vLLM's transformers<5 constraint to get transformers>=5
 RUN --mount=type=bind,source=wheels,target=/workspace/wheels \
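Note: in the runner stage the wheels directory is bind-mounted rather than copied, so the .whl files never end up in an image layer. The body of that RUN is outside this hunk; the pattern presumably amounts to something like the following sketch:

    # Sketch only; the actual install command and the --tf5 handling are not shown in this diff.
    uv pip install /workspace/wheels/*.whl
    # With --tf5, the transformers<5 pin coming from vLLM is overridden so that a
    # transformers>=5 release can be installed on top.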
@@ -273,24 +282,7 @@ ENV PATH=$VLLM_BASE_DIR:$PATH
 
 # Final extra deps
 RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
-    uv pip install ray[default] fastsafetensors nvidia-nvshmem-cu13
+    uv pip install ray[default] fastsafetensors
 
 # Build metadata (generated by build-and-copy.sh)
 COPY build-metadata.yaml /workspace/build-metadata.yaml
-
-# Cleanup
-
-# Keeping it here for reference - this won't work as is without squashing layers
-# RUN uv pip uninstall absl-py apex argon2-cffi \
-# argon2-cffi-bindings arrow asttokens astunparse async-lru audioread babel beautifulsoup4 \
-# black bleach comm contourpy cycler datasets debugpy decorator defusedxml dllist dm-tree \
-# execnet executing expecttest fastjsonschema fonttools fqdn gast hypothesis \
-# ipykernel ipython ipython_pygments_lexers isoduration isort jedi joblib jupyter-events \
-# jupyter-lsp jupyter_client jupyter_core jupyter_server jupyter_server_terminals jupyterlab \
-# jupyterlab_code_formatter jupyterlab_code_formatter jupyterlab_pygments jupyterlab_server \
-# jupyterlab_tensorboard_pro jupytext kiwisolver matplotlib matplotlib-inline matplotlib-inline \
-# mistune ml_dtypes mock nbclient nbconvert nbformat nest-asyncio notebook notebook_shim \
-# opt_einsum optree outlines_core overrides pandas pandocfilters parso pexpect polygraphy pooch \
-# pyarrow pycocotools pytest-flakefinder pytest-rerunfailures pytest-shard pytest-xdist \
-# scikit-learn scipy Send2Trash soundfile soupsieve soxr spin stack-data \
-# wcwidth webcolors xdoctest Werkzeug
@@ -27,15 +27,10 @@ defaults:
   gpu_memory_utilization: 0.7
   max_model_len: 262144
 
-# Environment variables
-env:
-  VLLM_NVFP4_GEMM_BACKEND: "marlin"
-  VLLM_TEST_FORCE_FP8_MARLIN: "1"
-  VLLM_MARLIN_USE_ATOMIC_ADD: "1"
-
 # The vLLM serve command template
 command: |
   vllm serve nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-NVFP4 \
+    --moe-backend cutlass \
     --max-model-len {max_model_len} \
     --port {port} --host {host} \
     --trust-remote-code \
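Note: both recipe hunks swap the Marlin environment-variable tuning for an explicit --moe-backend cutlass flag on the serve command. Roughly, the launch changes as sketched below; this is built only from the lines in the hunk above, with other flags elided:

    # Before: NVFP4 GEMM backend selected through environment variables.
    VLLM_NVFP4_GEMM_BACKEND=marlin VLLM_TEST_FORCE_FP8_MARLIN=1 VLLM_MARLIN_USE_ATOMIC_ADD=1 \
        vllm serve nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-NVFP4 --max-model-len 262144 ...

    # After: the CUTLASS MoE backend is requested directly on the command line.
    vllm serve nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-NVFP4 \
        --moe-backend cutlass \
        --max-model-len 262144 \
        --trust-remote-code ...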
@@ -1,8 +1,8 @@
 # Recipe: Nemotron-3-Super-NVFP4
-# Optimized for Marlin backend throughput
+# Uses VLLM_CUTLASS for NVFP4
 recipe_version: "1"
-name: Nemotron-3-Super-NVFP4-Marlin-Optimized
-description: vLLM serving Nemotron-3-Super-120B using Marlin kernels
+name: Nemotron-3-Super-NVFP4-CUTLASS-Optimized
+description: vLLM serving Nemotron-3-Super-120B using CUTLASS kernels
 
 model: nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4
 container: vllm-node
@@ -20,15 +20,11 @@ defaults:
   gpu_memory_utilization: 0.7
   max_model_len: 262144
   max_num_seqs: 10
-env:
-  VLLM_NVFP4_GEMM_BACKEND: "marlin"
-  VLLM_TEST_FORCE_FP8_MARLIN: "1"
-  VLLM_MARLIN_USE_ATOMIC_ADD: "1"
 
 command: |
   vllm serve nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4 \
     --kv-cache-dtype fp8 \
-    -tp {tensor_parallel} \
+    --moe-backend cutlass \
     --trust-remote-code \
     --gpu-memory-utilization {gpu_memory_utilization} \
     --max-model-len {max_model_len} \
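Note: for reference, here is roughly how the Super recipe's command template could render once the defaults shown in this hunk are substituted. This is an illustration, not output from the recipe runner, and flags outside the hunk are left out:

    vllm serve nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4 \
        --kv-cache-dtype fp8 \
        --moe-backend cutlass \
        --trust-remote-code \
        --gpu-memory-utilization 0.7 \
        --max-model-len 262144 \
        ...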