Added a recipe for qwen3.5-122B-FP8
45
recipes/qwen3.5-122b-fp8.yaml
Normal file
@@ -0,0 +1,45 @@
# Recipe: Qwen3.5-122B-A10B-FP8
# Qwen3.5-122B model in native FP8 quantization

recipe_version: "1"
name: Qwen3.5-122B-FP8
description: vLLM serving Qwen3.5-122B-FP8

# HuggingFace model to download (optional, for --download-model)
model: Qwen/Qwen3.5-122B-A10B-FP8

# Only cluster is supported
cluster_only: true

# Container image to use
container: vllm-node

# No mods required
mods: []

# Default settings (can be overridden via CLI)
defaults:
  port: 8000
  host: 0.0.0.0
  tensor_parallel: 2
  gpu_memory_utilization: 0.7
  max_model_len: 262144
  max_num_batched_tokens: 8192

# Environment variables
env: {}

# The vLLM serve command template
command: |
  vllm serve Qwen/Qwen3.5-122B-A10B-FP8 \
    --max-model-len {max_model_len} \
    --gpu-memory-utilization {gpu_memory_utilization} \
    --port {port} \
    --host {host} \
    --load-format fastsafetensors \
    --enable-prefix-caching \
    --enable-auto-tool-choice \
    --tool-call-parser qwen3_coder \
    --reasoning-parser qwen3 \
    -tp {tensor_parallel} --distributed-executor-backend ray \
    --max-num-batched-tokens {max_num_batched_tokens}
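
For reference, here is what the template expands to when the defaults above are applied verbatim, assuming the recipe runner simply substitutes each {placeholder} from the defaults map (no CLI overrides):

    vllm serve Qwen/Qwen3.5-122B-A10B-FP8 \
      --max-model-len 262144 \
      --gpu-memory-utilization 0.7 \
      --port 8000 \
      --host 0.0.0.0 \
      --load-format fastsafetensors \
      --enable-prefix-caching \
      --enable-auto-tool-choice \
      --tool-call-parser qwen3_coder \
      --reasoning-parser qwen3 \
      -tp 2 --distributed-executor-backend ray \
      --max-num-batched-tokens 8192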
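
Because the recipe is cluster_only and the command uses --distributed-executor-backend ray, vLLM expects a running Ray cluster before the serve command starts. A minimal sketch of bringing one up by hand, assuming Ray's default port and that the vllm-node container or surrounding cluster tooling does not already handle this:

    # On the head node (6379 is Ray's default GCS port):
    ray start --head --port=6379

    # On each worker node; HEAD_IP is a placeholder for the head node's address:
    ray start --address=$HEAD_IP:6379

    # Then launch the rendered vllm serve command on the head node.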
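
Once serving, vLLM exposes its OpenAI-compatible API on the configured host and port, so a quick smoke test could look like this (localhost assumed; substitute the cluster address when calling remotely):

    # List the served model:
    curl http://localhost:8000/v1/models

    # Send a minimal chat completion request:
    curl http://localhost:8000/v1/chat/completions \
      -H 'Content-Type: application/json' \
      -d '{"model": "Qwen/Qwen3.5-122B-A10B-FP8", "messages": [{"role": "user", "content": "Say hello."}]}'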