diff --git a/Dockerfile b/Dockerfile
index 4b22f61..66921b0 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -14,6 +14,8 @@ ENV MAX_JOBS=${BUILD_JOBS}
 ENV CMAKE_BUILD_PARALLEL_LEVEL=${BUILD_JOBS}
 ENV NINJAFLAGS="-j${BUILD_JOBS}"
 ENV MAKEFLAGS="-j${BUILD_JOBS}"
+ENV DG_JIT_USE_NVRTC=1
+ENV USE_CUDNN=1
 
 # Set non-interactive frontend to prevent apt prompts
 ENV DEBIAN_FRONTEND=noninteractive
@@ -120,6 +122,16 @@ RUN if [ -n "$FLASHINFER_PRS" ]; then \
     done; \
   fi
 
+# TEMPORARY patch for NVFP4 crash (PR 2913)
+RUN curl -fsL https://github.com/flashinfer-ai/flashinfer/pull/2913.diff -o pr2913.diff \
+  && if git apply --reverse --check pr2913.diff 2>/dev/null; then \
+    echo "PR #2913 already applied, skipping."; \
+  else \
+    echo "Applying FI PR #2913..."; \
+    git apply -v pr2913.diff; \
+  fi \
+  && rm pr2913.diff
+
 # Apply patch to avoid re-downloading existing cubins
 COPY flashinfer_cache.patch .
 RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
@@ -247,6 +259,8 @@ ENV MAX_JOBS=${BUILD_JOBS}
 ENV CMAKE_BUILD_PARALLEL_LEVEL=${BUILD_JOBS}
 ENV NINJAFLAGS="-j${BUILD_JOBS}"
 ENV MAKEFLAGS="-j${BUILD_JOBS}"
+ENV DG_JIT_USE_NVRTC=1
+ENV USE_CUDNN=1
 
 ENV DEBIAN_FRONTEND=noninteractive
 ENV PIP_BREAK_SYSTEM_PACKAGES=1
diff --git a/README.md b/README.md
index 125f0b6..9d45eb1 100644
--- a/README.md
+++ b/README.md
@@ -69,7 +69,7 @@ An initial build speed depends on your Internet connection speed and whether the
 
 **On a single node**:
 
-**NEW** - `launch-cluster.sh` now supports solo mode, which is now a recommended way to run the container on a single Spark:
+`launch-cluster.sh` supports solo mode, which is now a recommended way to run the container on a single Spark:
 
 ```bash
 ./launch-cluster.sh --solo exec \
@@ -80,23 +80,6 @@ An initial build speed depends on your Internet connection speed and whether the
   --gpu-memory-utilization 0.7 \
   --load-format fastsafetensors
 ```
-
-**To launch using regular `docker run`**
-
-```bash
- docker run \
-   --privileged \
-   --gpus all \
-   -it --rm \
-   --network host --ipc=host \
-   -v ~/.cache/huggingface:/root/.cache/huggingface \
-   vllm-node \
-   bash -c -i "vllm serve \
-     QuantTrio/Qwen3-VL-30B-A3B-Instruct-AWQ \
-     --port 8000 --host 0.0.0.0 \
-     --gpu-memory-utilization 0.7 \
-     --load-format fastsafetensors"
-```
 
 **On a cluster**
 
 It's recommended to download the model on one node and distribute across the cluster using ConnectX interconnect prior to launching. This is to avoid re-downloading the model from the Internet on every node in the cluster.
@@ -151,7 +134,7 @@ For periodic maintenance, I recommend using a filter: `docker builder prune --fi
 
 ## CHANGELOG
 
-### 2026-03-29
+### 2026-03-30
 
 #### Flags to specify Flashinfer ref and apply PRs
 
@@ -162,8 +145,6 @@ For periodic maintenance, I recommend using a filter: `docker builder prune --fi
 
 Both flags are incompatible with `--exp-mxfp4`.
 
-### 2026-03-27
-
 #### Default image tag in `build-and-copy.sh`
 
 `build-and-copy.sh` now automatically sets a sensible default image tag when `-t` is not specified: