Renamed recipe for qwen3.5-35b-a3b-fp8 to match others

2026-03-06 13:56:06 -08:00
parent e88426646b
commit 9dc09bd04b
1 changed files with 1 additions and 1 deletions
--- a/recipes/qwen3.5-35b-a3b-fp8.yaml
+++ b/recipes/qwen3.5-35b-a3b-fp8.yaml
@@ -0,0 +1,49 @@
+# Recipe: Qwen/Qwen3.5-35B-A3B-FP8
+# Qwen/Qwen3.5-35B-A3B model in native FP8 format
+
+
+recipe_version: "1"
+name: Qwen35-35B-A3B  # NOTE(review): omits the -FP8 suffix used in description/model; confirm intentional
+description: vLLM serving Qwen3.5-35B-A3B-FP8
+
+# Hugging Face model to download (optional, used by --download-model)
+model: Qwen/Qwen3.5-35B-A3B-FP8
+
+#solo_only: true
+
+# Container image to use
+container: vllm-node
+
+# Mod required to fix slowness and a crash when running in the cluster (tracked in https://github.com/vllm-project/vllm/issues/33857)
+mods:
+  - mods/fix-qwen3-coder-next
+
+# Default settings (can be overridden via CLI); each key has a same-named {placeholder} in `command`
+defaults:
+  port: 8000
+  host: 0.0.0.0  # all interfaces
+  tensor_parallel: 2  # passed to vLLM as -tp
+  gpu_memory_utilization: 0.7
+  max_model_len: 262144  # 256 * 1024
+  max_num_batched_tokens: 16384  # 16 * 1024
+
+# Environment variables (values are quoted so YAML keeps them as strings, not ints/bools)
+env:
+  VLLM_MARLIN_USE_ATOMIC_ADD: "1"
+
+# The vLLM serve command template; each {placeholder} has a matching key under `defaults`, and the model ID repeats `model`
+command: |
+  vllm serve Qwen/Qwen3.5-35B-A3B-FP8 \
+    --host {host} \
+    --port {port} \
+    --max-model-len {max_model_len} \
+    --max-num-batched-tokens {max_num_batched_tokens} \
+    --gpu-memory-utilization {gpu_memory_utilization} \
+    --enable-auto-tool-choice \
+    --tool-call-parser qwen3_coder \
+    --kv-cache-dtype fp8 \
+    --load-format fastsafetensors \
+    --attention-backend flashinfer \
+    --enable-prefix-caching \
+    -tp {tensor_parallel} \
+    --distributed-executor-backend ray