diff --git a/Dockerfile b/Dockerfile index ef2b5ce..22f1ed8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -221,25 +221,15 @@ RUN curl -fsL https://patch-diff.githubusercontent.com/raw/vllm-project/vllm/pul fi \ && rm pr35568.diff -# TEMPORARY PATCH for broken compilation - https://github.com/vllm-project/vllm/pull/38919 -RUN curl -fsL https://patch-diff.githubusercontent.com/raw/vllm-project/vllm/pull/38919.diff -o pr38919.diff \ - && if git apply --reverse --check pr38919.diff 2>/dev/null; then \ - echo "PR 38919 already applied, skipping."; \ +# TEMPORARY PATCH to re-enable Flashinfer 0.6.8 - https://github.com/vllm-project/vllm/pull/39959 +RUN curl -fsL https://patch-diff.githubusercontent.com/raw/vllm-project/vllm/pull/39959.diff -o pr39959.diff \ + && if git apply --reverse --check pr39959.diff 2>/dev/null; then \ + echo "PR 39959 already applied, skipping."; \ else \ - echo "Applying PR 38919..."; \ - git apply -v pr38919.diff; \ + echo "Applying PR 39959..."; \ + git apply -v pr39959.diff; \ fi \ - && rm pr38919.diff - -# TEMPORARY PATCH for broken MiniMax M2.5 parser - https://github.com/vllm-project/vllm/pull/39861 -RUN curl -fsL https://patch-diff.githubusercontent.com/raw/vllm-project/vllm/pull/39861.diff -o pr39861.diff \ - && if git apply --reverse --check pr39861.diff 2>/dev/null; then \ - echo "PR 39861 already applied, skipping."; \ - else \ - echo "Applying PR 39861..."; \ - git apply -v pr39861.diff; \ - fi \ - && rm pr39861.diff + && rm pr39959.diff # Prepare build requirements RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \ diff --git a/recipes/qwen3.5-397b-int4-autoround.yaml b/recipes/qwen3.5-397b-int4-autoround.yaml index 750accd..884c453 100644 --- a/recipes/qwen3.5-397b-int4-autoround.yaml +++ b/recipes/qwen3.5-397b-int4-autoround.yaml @@ -55,6 +55,7 @@ command: | --max-num-batched-tokens {max_num_batched_tokens} \ --trust-remote-code \ --chat-template unsloth.jinja \ + --load-format instanttensor \ -tp {tensor_parallel} \ --distributed-executor-backend ray