Unsloth chat template for qwen3.5

Renamed recipe for qwen3.5-35b-a3b-fp8 to match others
Merge pull request #76 from mmonad/fix-exec-arg-quoting
2026-03-06 23:35:18 -08:00 · 2026-03-06 13:56:06 -08:00 · 2026-03-06 13:45:53 -08:00 · 2026-03-06 11:47:47 -08:00 · 2026-03-06 11:46:37 -08:00 · 2026-03-05 17:06:57 -08:00
12 changed files with 402 additions and 14 deletions
--- a/8
+++ b/8
@@ -113,7 +113,9 @@ RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
    cd flashinfer-cubin && uv build --no-build-isolation --wheel . --out-dir=/workspace/wheels -v && \
    # flashinfer-jit-cache
    cd ../flashinfer-jit-cache && \
-    uv build --no-build-isolation --wheel . --out-dir=/workspace/wheels -v
+    uv build --no-build-isolation --wheel . --out-dir=/workspace/wheels -v && \
+    # dump git ref in the wheels dir
+    cd .. && git rev-parse HEAD > /workspace/wheels/.flashinfer-commit

 # =========================================================
 # STAGE 3: FlashInfer Wheel Export
@@ -196,7 +198,9 @@ RUN curl -L https://patch-diff.githubusercontent.com/raw/vllm-project/vllm/pull/
 # Final Compilation
 RUN --mount=type=cache,id=ccache,target=/root/.ccache \
    --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
-    uv build --no-build-isolation --wheel . --out-dir=/workspace/wheels -v
+    uv build --no-build-isolation --wheel . --out-dir=/workspace/wheels -v && \
+    # dump git ref in the wheels dir
+    git rev-parse HEAD > /workspace/wheels/.vllm-commit

 # =========================================================
 # STAGE 5: vLLM Wheel Export
--- a/build-and-copy.sh
+++ b/build-and-copy.sh
@@ -66,7 +66,12 @@ copy_to_host() {

 # try_download_wheels TAG PREFIX
 # Downloads wheels matching PREFIX*.whl from a GitHub release.
-# Skips files that are already present and up to date (by remote updated_at vs local mtime).
+# Skip conditions (either is sufficient):
+#   1. Commit hash in release name matches .wheels/.{PREFIX}-commit (primary check).
+#   2. All local wheels are at least as new as the newest GitHub asset (freshly built).
+# Downloads a file only when it is missing locally or the remote asset is newer
+# than the local copy, AND neither of the above skip conditions is met.
+# On success, persists the release commit hash to .wheels/.{PREFIX}-commit.
 # Returns 0 if all matching wheels are now available, 1 on any error.
 try_download_wheels() {
    local TAG="$1"
@@ -92,7 +97,7 @@ try_download_wheels() {

    local DOWNLOAD_LIST
    DOWNLOAD_LIST=$(echo "$RELEASE_JSON" | python3 -c '
-import json, sys, os
+import json, sys, os, re
 from datetime import datetime, timezone

 wheels_dir, prefix = sys.argv[1], sys.argv[2]
@@ -104,6 +109,31 @@ if not assets:
    print("No assets found matching prefix: " + prefix, file=sys.stderr)
    sys.exit(1)

+# Extract commit hash from the release name:
+#   FlashInfer: "Prebuilt FlashInfer Wheels (0.6.5-124a2d32-d20260305) - DGX Spark Only"
+#   vLLM:       "Prebuilt vLLM Wheels (0.16.1rc1.dev296+ga73af584f.d20260305.cu131) - DGX Spark only"
+release_name = data.get("name", "")
+commit_hash = None
+if prefix.startswith("flashinfer"):
+    m = re.search(r"\([\d.]+\w*-([0-9a-f]{6,})-d\d{8}\)", release_name, re.IGNORECASE)
+    if m:
+        commit_hash = m.group(1)
+else:
+    m = re.search(r"\+g([0-9a-f]{6,})\.", release_name, re.IGNORECASE)
+    if m:
+        commit_hash = m.group(1)
+
+# Compare against the locally stored commit hash
+commit_file = os.path.join(wheels_dir, "." + prefix + "-commit")
+local_commit = None
+if os.path.exists(commit_file):
+    with open(commit_file) as f:
+        local_commit = f.read().strip()
+
+if commit_hash and local_commit and local_commit[:len(commit_hash)] == commit_hash:
+    print("Commit hash matches (" + commit_hash + ") — wheels are up to date.", file=sys.stderr)
+    sys.exit(0)
+
 newest_remote_ts = max(
    datetime.strptime(a["updated_at"], "%Y-%m-%dT%H:%M:%SZ")
            .replace(tzinfo=timezone.utc).timestamp()
@@ -119,12 +149,19 @@ local_wheels = [
 if local_wheels and all(os.path.getmtime(p) >= newest_remote_ts for p in local_wheels):
    sys.exit(0)

+downloads = []
 for a in assets:
    local_path = os.path.join(wheels_dir, a["name"])
    remote_ts = datetime.strptime(a["updated_at"], "%Y-%m-%dT%H:%M:%SZ") \
                    .replace(tzinfo=timezone.utc).timestamp()
    if not os.path.exists(local_path) or remote_ts > os.path.getmtime(local_path):
-        print(a["browser_download_url"] + " " + a["name"])
+        downloads.append(a["browser_download_url"] + " " + a["name"])
+
+if downloads:
+    if commit_hash:
+        print("#commit:" + commit_hash)
+    for d in downloads:
+        print(d)
 ' "$WHEELS_DIR" "$PREFIX") || return 1

    if [ -z "$DOWNLOAD_LIST" ]; then
@@ -132,12 +169,31 @@ for a in assets:
        return 0
    fi

+    # Parse the optional '#commit:HASH' sentinel emitted by the Python script
+    local REMOTE_COMMIT=""
+    local DOWNLOAD_ENTRIES=""
+    while IFS= read -r LINE; do
+        if [[ "$LINE" == "#commit:"* ]]; then
+            REMOTE_COMMIT="${LINE#"#commit:"}"
+        elif [[ -n "$LINE" ]]; then
+            DOWNLOAD_ENTRIES+="$LINE"$'\n'
+        fi
+    done <<< "$DOWNLOAD_LIST"
+
+    if [ -z "$DOWNLOAD_ENTRIES" ]; then
+        echo "All $PREFIX wheels are up to date — skipping download."
+        return 0
+    fi
+
    # Back up existing wheels so we never leave a mix of old and new on failure
    local DL_BACKUP="$WHEELS_DIR/.backup-download-${PREFIX}"
    rm -rf "$DL_BACKUP" && mkdir -p "$DL_BACKUP"
    for f in "$WHEELS_DIR/${PREFIX}"*.whl; do
        [ -f "$f" ] && mv "$f" "$DL_BACKUP/"
    done
+    for f in "$WHEELS_DIR/.${PREFIX}"*; do
+        [ -f "$f" ] && mv "$f" "$DL_BACKUP/"
+    done

    local URL NAME TMP_WHL
    local DOWNLOADED=()
@@ -154,13 +210,18 @@ for a in assets:
            if compgen -G "$DL_BACKUP/${PREFIX}*.whl" > /dev/null 2>&1; then
                echo "Restoring previous $PREFIX wheels..."
                mv "$DL_BACKUP/${PREFIX}"*.whl "$WHEELS_DIR/"
+                mv "$DL_BACKUP/.${PREFIX}"* "$WHEELS_DIR/"
            fi
            rm -rf "$DL_BACKUP"
            return 1
        fi
-    done <<< "$DOWNLOAD_LIST"
+    done <<< "$DOWNLOAD_ENTRIES"

    rm -rf "$DL_BACKUP"
+    if [ -n "$REMOTE_COMMIT" ]; then
+        echo "$REMOTE_COMMIT" > "$WHEELS_DIR/.${PREFIX}-commit"
+        echo "Recorded $PREFIX commit hash: $REMOTE_COMMIT"
+    fi
    return 0
 }

--- a/docs/NETWORKING.md
+++ b/docs/NETWORKING.md
@@ -58,8 +58,8 @@ network:
  ethernets:
    enp1s0f1np1:
      dhcp4: no
-      dhcp6: no              # Explicitly disable DHCPv6
-      link-local: [ ipv4 ]   # Restrict link-local addresses to IPv4 only
+      dhcp6: no        # Explicitly disable DHCPv6
+      link-local: []   # Disable all link-local addresses (no 169.254.x.x fallback); only the static IPv4 below is used
      mtu: 9000
      addresses: [192.168.177.11/24]
    enP2p1s0f1np1:
@@ -76,8 +76,8 @@ network:
  ethernets:
    enp1s0f1np1:
      dhcp4: no
-      dhcp6: no              # Explicitly disable DHCPv6
-      link-local: [ ipv4 ]   # Restrict link-local addresses to IPv4 only
+      dhcp6: no        # Explicitly disable DHCPv6
+      link-local: []   # Disable all link-local addresses (no 169.254.x.x fallback); only the static IPv4 below is used
      mtu: 9000
      addresses: [192.168.177.12/24]
    enP2p1s0f1np1:
@@ -239,4 +239,4 @@ mpirun -np 2 -H 192.168.177.11:1,192.168.177.12:1 \
  -x LD_LIBRARY_PATH=$LD_LIBRARY_PATH \
  $HOME/nccl-tests/build/all_gather_perf -b 16G -e 16G -f 2

-```
+```
--- a/launch-cluster.sh
+++ b/launch-cluster.sh
@@ -115,7 +115,7 @@ while [[ "$#" -gt 0 ]]; do
            fi
            ACTION="exec"
            shift
-            COMMAND_TO_RUN="$@"
+            COMMAND_TO_RUN=$(printf "%q " "$@")
            break
            ;;
        *) 
--- a/mods/fix-qwen3.5-chat-template/chat_template.jinja
+++ b/mods/fix-qwen3.5-chat-template/chat_template.jinja
@@ -0,0 +1,155 @@
+{%- set image_count = namespace(value=0) %}
+{%- set video_count = namespace(value=0) %}
+{%- macro render_content(content, do_vision_count, is_system_content=false) %}
+    {%- if content is string %}
+        {{- content }}
+    {%- elif content is iterable and content is not mapping %}
+        {%- for item in content %}
+            {%- if 'image' in item or 'image_url' in item or item.type == 'image' %}
+                {%- if is_system_content %}
+                    {{- raise_exception('System message cannot contain images.') }}
+                {%- endif %}
+                {%- if do_vision_count %}
+                    {%- set image_count.value = image_count.value + 1 %}
+                {%- endif %}
+                {%- if add_vision_id %}
+                    {{- 'Picture ' ~ image_count.value ~ ': ' }}
+                {%- endif %}
+                {{- '<|vision_start|><|image_pad|><|vision_end|>' }}
+            {%- elif 'video' in item or item.type == 'video' %}
+                {%- if is_system_content %}
+                    {{- raise_exception('System message cannot contain videos.') }}
+                {%- endif %}
+                {%- if do_vision_count %}
+                    {%- set video_count.value = video_count.value + 1 %}
+                {%- endif %}
+                {%- if add_vision_id %}
+                    {{- 'Video ' ~ video_count.value ~ ': ' }}
+                {%- endif %}
+                {{- '<|vision_start|><|video_pad|><|vision_end|>' }}
+            {%- elif 'text' in item %}
+                {{- item.text }}
+            {%- else %}
+                {{- raise_exception('Unexpected item type in content.') }}
+            {%- endif %}
+        {%- endfor %}
+    {%- elif content is none or content is undefined %}
+        {{- '' }}
+    {%- else %}
+        {{- raise_exception('Unexpected content type.') }}
+    {%- endif %}
+{%- endmacro %}
+{%- if not messages %}
+    {{- raise_exception('No messages provided.') }}
+{%- endif %}
+{%- if tools and tools is iterable and tools is not mapping %}
+    {{- '<|im_start|>system\n' }}
+    {{- "# Tools\n\nYou have access to the following functions:\n\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>" }}
+    {{- '\n\nIf you choose to call a function ONLY reply in the following format with NO suffix:\n\n<tool_call>\n<function=example_function_name>\n<parameter=example_parameter_1>\nvalue_1\n</parameter>\n<parameter=example_parameter_2>\nThis is the value for the second parameter\nthat can span\nmultiple lines\n</parameter>\n</function>\n</tool_call>\n\n<IMPORTANT>\nReminder:\n- Function calls MUST follow the specified format: an inner <function=...></function> block must be nested within <tool_call></tool_call> XML tags\n- Required parameters MUST be specified\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls\n</IMPORTANT>' }}
+    {%- if messages[0].role == 'system' %}
+        {%- set content = render_content(messages[0].content, false, true)|trim %}
+        {%- if content %}
+            {{- '\n\n' + content }}
+        {%- endif %}
+    {%- endif %}
+    {{- '<|im_end|>\n' }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {%- set content = render_content(messages[0].content, false, true)|trim %}
+        {{- '<|im_start|>system\n' + content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" %}
+        {%- set content = render_content(message.content, false)|trim %}
+        {%- if not(content.startswith('<tool_response>') and content.endswith('</tool_response>')) %}
+            {%- set ns.multi_step_tool = false %}
+            {%- set ns.last_query_index = index %}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if ns.multi_step_tool %}
+    {{- raise_exception('No user query found in messages.') }}
+{%- endif %}
+{%- for message in messages %}
+    {%- set content = render_content(message.content, true)|trim %}
+    {%- if message.role == "system" %}
+        {%- if not loop.first %}
+            {{- raise_exception('System message must be at the beginning.') }}
+        {%- endif %}
+    {%- elif message.role == "user" %}
+        {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is string %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in content %}
+                {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+                {%- set content = content.split('</think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- set reasoning_content = reasoning_content|trim %}
+        {%- if loop.index0 > ns.last_query_index %}
+            {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content + '\n</think>\n\n' + content }}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls and message.tool_calls is iterable and message.tool_calls is not mapping %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if tool_call.function is defined %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {%- if loop.first %}
+                    {%- if content|trim %}
+                        {{- '\n\n<tool_call>\n<function=' + tool_call.name + '>\n' }}
+                    {%- else %}
+                        {{- '<tool_call>\n<function=' + tool_call.name + '>\n' }}
+                    {%- endif %}
+                {%- else %}
+                    {{- '\n<tool_call>\n<function=' + tool_call.name + '>\n' }}
+                {%- endif %}
+                {%- if tool_call.arguments is mapping %}
+                    {%- for args_name in tool_call.arguments %}
+                        {%- set args_value = tool_call.arguments[args_name] %}
+                        {{- '<parameter=' + args_name + '>\n' }}
+                        {%- set args_value = args_value | tojson | safe if args_value is mapping or (args_value is sequence and args_value is not string) else args_value | string %}
+                        {{- args_value }}
+                        {{- '\n</parameter>\n' }}
+                    {%- endfor %}
+                {%- endif %}
+                {{- '</function>\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if loop.previtem and loop.previtem.role != "tool" %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- content }}
+        {{- '\n</tool_response>' }}
+        {%- if not loop.last and loop.nextitem.role != "tool" %}
+            {{- '<|im_end|>\n' }}
+        {%- elif loop.last %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- else %}
+        {{- raise_exception('Unexpected message role.') }}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- else %}
+        {{- '<think>\n' }}
+    {%- endif %}
+{%- endif %}
--- a/mods/fix-qwen3.5-chat-template/run.sh
+++ b/mods/fix-qwen3.5-chat-template/run.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+set -e  # abort before printing the usage hint if the copy below fails
+cp chat_template.jinja "$WORKSPACE_DIR/unsloth.jinja"  # quoted so a workspace path containing spaces stays one argument
+echo "=======> to apply chat template, use --chat-template unsloth.jinja"
--- a/recipes/4x-spark-cluster/minimax-m2.5.yaml
+++ b/recipes/4x-spark-cluster/minimax-m2.5.yaml
@@ -0,0 +1,45 @@
+# Recipe: MiniMax-M2.5
+# MiniMaxAI/MiniMax-M2.5
+
+recipe_version: "1"
+name: MiniMax-M2.5
+description: vLLM serving MiniMax-M2.5 with Ray distributed backend
+
+# HuggingFace model to download (optional, for --download-model)
+model: MiniMaxAI/MiniMax-M2.5
+
+# Container image to use
+container: vllm-node
+
+# Can only be run in a cluster
+cluster_only: true
+
+# No mods required
+mods: []
+
+# Default settings (can be overridden via CLI)
+defaults:
+  port: 8000
+  host: 0.0.0.0
+  tensor_parallel: 4
+  gpu_memory_utilization: 0.90
+  max_model_len: 128000
+
+# Environment variables
+env:
+  VLLM_DISTRIBUTED_EXECUTOR_CONFIG: '{"placement_group_options":{"strategy":"SPREAD"}}'
+
+# The vLLM serve command template
+command: |
+  vllm serve MiniMaxAI/MiniMax-M2.5 \
+      --trust-remote-code \
+      --port {port} \
+      --host {host} \
+      --gpu-memory-utilization {gpu_memory_utilization} \
+      -tp {tensor_parallel} \
+      --distributed-executor-backend ray \
+      --max-model-len {max_model_len} \
+      --load-format fastsafetensors \
+      --enable-auto-tool-choice \
+      --tool-call-parser minimax_m2 \
+      --reasoning-parser minimax_m2_append_think
--- a/recipes/4x-spark-cluster/qwen3.5-397b-a17B-fp8.yaml
+++ b/recipes/4x-spark-cluster/qwen3.5-397b-a17B-fp8.yaml
@@ -0,0 +1,63 @@
+# Recipe: Qwen3.5-397B-A17B-FP8
+# Qwen3.5-397B-A17B model in FP8 precision
+# Multi-modal input
+
+recipe_version: "1"
+name: Qwen3.5-397B-A17B-FP8
+description: vLLM serving Qwen3.5-397B-A17B-FP8
+
+# HuggingFace model to download (optional, for --download-model)
+model: Qwen/Qwen3.5-397B-A17B-FP8
+
+#solo_only: true
+
+# Container image to use
+container: vllm-node-tf5
+
+build_args:
+  - --tf5
+  - --rebuild-flashinfer
+  - --rebuild-vllm
+
+# Mod required to fix ROPE syntax error
+mods:
+  - mods/fix-qwen3.5-autoround
+
+# Default settings (can be overridden via CLI)
+defaults:
+  port: 8000
+  host: 0.0.0.0
+  tensor_parallel: 4
+  gpu_memory_utilization: 0.85
+  max_model_len: 262144
+  max_num_batched_tokens: 8192
+
+# Environment variables
+env: 
+  VLLM_USE_DEEP_GEMM: 0
+  VLLM_USE_FLASHINFER_MOE_FP16: 1
+  VLLM_USE_FLASHINFER_SAMPLER: 0
+  OMP_NUM_THREADS: 4
+
+# The vLLM serve command template
+command: |
+  vllm serve Qwen/Qwen3.5-397B-A17B-FP8 \
+    --max-model-len {max_model_len} \
+    --gpu-memory-utilization {gpu_memory_utilization} \
+    --port {port} \
+    --host {host} \
+    --load-format fastsafetensors \
+    --enable-prefix-caching \
+    --enable-auto-tool-choice \
+    --tool-call-parser qwen3_coder \
+    --reasoning-parser qwen3 \
+    --max-num-batched-tokens {max_num_batched_tokens} \
+    --trust-remote-code \
+    -tp {tensor_parallel} \
+    --distributed-executor-backend ray \
+    --mm-encoder-tp-mode data \
+    --kv-cache-dtype fp8 \
+    --compilation-config.cudagraph_mode none \
+    --max-num-seqs 32 \
+    --attention-backend flashinfer
+
--- a/recipes/qwen3.5-122b-fp8.yaml
+++ b/recipes/qwen3.5-122b-fp8.yaml
@@ -15,7 +15,8 @@ cluster_only: true
 container: vllm-node

 # No mods required
-mods: []
+mods:
+  - mods/fix-qwen3.5-chat-template

 # Default settings (can be overridden via CLI)
 defaults:
@@ -41,5 +42,6 @@ command: |
    --enable-auto-tool-choice \
    --tool-call-parser qwen3_coder \
    --reasoning-parser qwen3 \
+    --chat-template unsloth.jinja \
    -tp {tensor_parallel} --distributed-executor-backend ray \
    --max-num-batched-tokens {max_num_batched_tokens}
--- a/recipes/qwen3.5-122b-int4-autoround.yaml
+++ b/recipes/qwen3.5-122b-int4-autoround.yaml
@@ -19,6 +19,7 @@ build_args:
 # Mod required to fix ROPE syntax error
 mods:
  - mods/fix-qwen3.5-autoround
+  - mods/fix-qwen3.5-chat-template

 # Default settings (can be overridden via CLI)
 defaults:
@@ -47,6 +48,7 @@ command: |
    --reasoning-parser qwen3 \
    --max-num-batched-tokens {max_num_batched_tokens} \
    --trust-remote-code \
+    --chat-template unsloth.jinja \
    -tp {tensor_parallel} \
    --distributed-executor-backend ray

--- a/recipes/qwen3.5-35b-a3b-fp8.yaml
+++ b/recipes/qwen3.5-35b-a3b-fp8.yaml
@@ -0,0 +1,51 @@
+# Recipe: Qwen/Qwen3.5-35B-A3B-FP8
+# Qwen/Qwen3.5-35B-A3B model in native FP8 format
+
+
+recipe_version: "1"
+name: Qwen35-35B-A3B
+description: vLLM serving Qwen3.5-35B-A3B-FP8
+
+# HuggingFace model to download (optional, for --download-model)
+model: Qwen/Qwen3.5-35B-A3B-FP8
+
+#solo_only: true
+
+# Container image to use
+container: vllm-node
+
+# Mod required to fix slowness and crash in the cluster (tracking https://github.com/vllm-project/vllm/issues/33857)
+mods:
+  - mods/fix-qwen3-coder-next
+  - mods/fix-qwen3.5-chat-template
+
+# Default settings (can be overridden via CLI)
+defaults:
+  port: 8000
+  host: 0.0.0.0
+  tensor_parallel: 2
+  gpu_memory_utilization: 0.7
+  max_model_len: 262144
+  max_num_batched_tokens: 16384
+
+# Environment variables
+env: 
+  VLLM_MARLIN_USE_ATOMIC_ADD: 1
+
+# The vLLM serve command template
+command: |
+  vllm serve Qwen/Qwen3.5-35B-A3B-FP8 \
+    --host {host} \
+    --port {port} \
+    --max-model-len {max_model_len} \
+    --max-num-batched-tokens {max_num_batched_tokens} \
+    --gpu-memory-utilization {gpu_memory_utilization} \
+    --enable-auto-tool-choice \
+    --tool-call-parser qwen3_coder \
+    --kv-cache-dtype fp8 \
+    --load-format fastsafetensors \
+    --attention-backend flashinfer \
+    --enable-prefix-caching \
+    --chat-template unsloth.jinja \
+    -tp {tensor_parallel} \
+    --distributed-executor-backend ray
--- a/run-recipe.py
+++ b/run-recipe.py
@@ -86,6 +86,7 @@ RELATED FILES:
 import argparse
 import os
 import subprocess
+import shlex
 import sys
 import tempfile
 from pathlib import Path
@@ -473,7 +474,7 @@ def generate_launch_script(recipe: dict[str, Any], overrides: dict[str, Any], is
    # Append extra args if provided (after --)
    if extra_args:
        # Join extra args and append to command
-        extra_args_str = ' '.join(extra_args)
+        extra_args_str = ' '.join(shlex.quote(a) for a in extra_args)
        command = command.rstrip()
        # Handle multi-line commands with backslash continuations
        if command.endswith('\\'):
Author	SHA1	Message	Date
Eugene Rakhmatulin	d42c4199fa	Unsloth chat template for qwen3.5	2026-03-06 23:35:18 -08:00
Eugene Rakhmatulin	9dc09bd04b	Renamed recipe for qwen3.5-35b-a3b-fp8 to match others	2026-03-06 13:56:06 -08:00
eugr	e88426646b	Merge pull request #76 from mmonad/fix-exec-arg-quoting Fix shell quoting for exec command arguments	2026-03-06 13:45:53 -08:00
Olivier Paroz	eb8abcca7f	Prevent 169.254.x.x fallback when setting fix IP address (#84 ) * Prevent 169.254.x.x fallback when setting fix IP address To force the use of the IP we've chosen to be assigned to the interface, it's safer to disable the fallback to avoid problems down the line * Prevent 169.254.x.x fallback when setting fix IP address To force the use of the static IP address we've chosen to be assigned to the interface, it's safer to disable the fallback to avoid problems down the line	2026-03-06 11:47:47 -08:00
eugr	d148d95a19	Merge pull request #80 from oliverjohnwilson/recipe-add_minimax-m2.5_qwen3.5-397b-a17B-fp8 added minimax-m2.5 and qwen3.5-397b-a17B-fp8 recipes to a recipes/4x-spark-cluster/ subdirectory	2026-03-06 11:46:37 -08:00
Eugene Rakhmatulin	5346372f14	More robust wheels check before download	2026-03-05 17:06:57 -08:00
Eugene Rakhmatulin	5f8f988d91	Merge branch 'main' of github.com:eugr/spark-vllm-docker	2026-03-05 16:29:00 -08:00
eugr	3fabd3fb1c	Merge pull request #72 from erikvullings/main Add Qwen35-35B-A3B recipe in FP8 format	2026-03-05 16:27:50 -08:00
Eugene Rakhmatulin	2d03bc138d	saving flashinfer and vllm commits in wheels directories	2026-03-05 14:41:25 -08:00
oliverjohnwilson	4303f8b6d0	added minimax-m2.5 and qwen3.5-397b-a17B-fp8 recipes to a recipes/4x-spark-cluster/ subdirectory	2026-03-04 16:01:37 -06:00
L.B.R.	50b3ca60f3	Fix shell quoting for exec command arguments Arguments with special characters (e.g. JSON strings) were passed unquoted, causing breakage for commands like: --speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":2}' Use printf %q in launch-cluster.sh and shlex.quote() in run-recipe.py to properly escape arguments.	2026-03-04 15:22:42 +00:00
Erik Vullings	163f23d85b	Update qwen35-35b-a3b-fp8.yaml --max_num_batched_tokens is a default variable now, which can be overriden via the CLI	2026-03-03 12:46:12 +01:00
Erik Vullings	e8f94d6b8b	Add Qwen35-35B-A3B recipe in FP8 format	2026-02-27 17:46:06 +01:00