Re-enable flashinfer_cutlass

This commit is contained in:
Eugene Rakhmatulin
2026-04-16 16:40:56 -07:00
parent 6b7f8dace6
commit d49fac1b8b
2 changed files with 8 additions and 17 deletions

View File

@@ -221,25 +221,15 @@ RUN curl -fsL https://patch-diff.githubusercontent.com/raw/vllm-project/vllm/pul
     fi \
     && rm pr35568.diff
-# TEMPORARY PATCH for broken compilation - https://github.com/vllm-project/vllm/pull/38919
-RUN curl -fsL https://patch-diff.githubusercontent.com/raw/vllm-project/vllm/pull/38919.diff -o pr38919.diff \
-    && if git apply --reverse --check pr38919.diff 2>/dev/null; then \
-        echo "PR 38919 already applied, skipping."; \
-    else \
-        echo "Applying PR 38919..."; \
-        git apply -v pr38919.diff; \
-    fi \
-    && rm pr38919.diff
+# TEMPORARY PATCH to re-enable Flashinfer 0.6.8 - https://github.com/vllm-project/vllm/pull/39959
+RUN curl -fsL https://patch-diff.githubusercontent.com/raw/vllm-project/vllm/pull/39959.diff -o pr39959.diff \
+    && if git apply --reverse --check pr39959.diff 2>/dev/null; then \
+        echo "PR 39959 already applied, skipping."; \
+    else \
+        echo "Applying PR 39959..."; \
+        git apply -v pr39959.diff; \
+    fi \
+    && rm pr39959.diff
-# TEMPORARY PATCH for broken MiniMax M2.5 parser - https://github.com/vllm-project/vllm/pull/39861
-RUN curl -fsL https://patch-diff.githubusercontent.com/raw/vllm-project/vllm/pull/39861.diff -o pr39861.diff \
-    && if git apply --reverse --check pr39861.diff 2>/dev/null; then \
-        echo "PR 39861 already applied, skipping."; \
-    else \
-        echo "Applying PR 39861..."; \
-        git apply -v pr39861.diff; \
-    fi \
-    && rm pr39861.diff
 # Prepare build requirements
 RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \

View File

@@ -55,6 +55,7 @@ command: |
   --max-num-batched-tokens {max_num_batched_tokens} \
   --trust-remote-code \
   --chat-template unsloth.jinja \
+  --load-format instanttensor \
   -tp {tensor_parallel} \
   --distributed-executor-backend ray