From 4303f8b6d0eb931c0acbdd0db415994b3c68de95 Mon Sep 17 00:00:00 2001
From: oliverjohnwilson
Date: Wed, 4 Mar 2026 16:01:37 -0600
Subject: [PATCH] Add MiniMax-M2.5 and Qwen3.5-397B-A17B-FP8 recipes under
 recipes/4x-spark-cluster/

---
 recipes/4x-spark-cluster/minimax-m2.5.yaml | 45 +++++++++++++
 .../qwen3.5-397b-a17B-fp8.yaml             | 63 +++++++++++++++++++
 2 files changed, 108 insertions(+)
 create mode 100644 recipes/4x-spark-cluster/minimax-m2.5.yaml
 create mode 100644 recipes/4x-spark-cluster/qwen3.5-397b-a17B-fp8.yaml

diff --git a/recipes/4x-spark-cluster/minimax-m2.5.yaml b/recipes/4x-spark-cluster/minimax-m2.5.yaml
new file mode 100644
index 0000000..df913b0
--- /dev/null
+++ b/recipes/4x-spark-cluster/minimax-m2.5.yaml
@@ -0,0 +1,45 @@
+# Recipe: MiniMax-M2.5
+# MiniMaxAI/MiniMax-M2.5
+
+recipe_version: "1"
+name: MiniMax-M2.5
+description: vLLM serving MiniMax-M2.5 with Ray distributed backend
+
+# HuggingFace model to download (optional, for --download-model)
+model: MiniMaxAI/MiniMax-M2.5
+
+# Container image to use
+container: vllm-node
+
+# Can only be run in a cluster
+cluster_only: true
+
+# No mods required
+mods: []
+
+# Default settings (can be overridden via CLI)
+defaults:
+  port: 8000
+  host: 0.0.0.0
+  tensor_parallel: 4
+  gpu_memory_utilization: 0.90
+  max_model_len: 128000
+
+# Environment variables
+env:
+  VLLM_DISTRIBUTED_EXECUTOR_CONFIG: '{"placement_group_options":{"strategy":"SPREAD"}}'
+
+# The vLLM serve command template
+command: |
+  vllm serve MiniMaxAI/MiniMax-M2.5 \
+    --trust-remote-code \
+    --port {port} \
+    --host {host} \
+    --gpu-memory-utilization {gpu_memory_utilization} \
+    -tp {tensor_parallel} \
+    --distributed-executor-backend ray \
+    --max-model-len {max_model_len} \
+    --load-format fastsafetensors \
+    --enable-auto-tool-choice \
+    --tool-call-parser minimax_m2 \
+    --reasoning-parser minimax_m2_append_think
diff --git a/recipes/4x-spark-cluster/qwen3.5-397b-a17B-fp8.yaml b/recipes/4x-spark-cluster/qwen3.5-397b-a17B-fp8.yaml
new file mode 100644
index 0000000..e3e7e48
--- /dev/null
+++ b/recipes/4x-spark-cluster/qwen3.5-397b-a17B-fp8.yaml
@@ -0,0 +1,63 @@
+# Recipe: Qwen3.5-397B-A17B-FP8
+# Qwen3.5-397B-A17B model in FP8 precision
+# Supports multi-modal input
+
+recipe_version: "1"
+name: Qwen3.5-397B-A17B-FP8
+description: vLLM serving Qwen3.5-397B-A17B-FP8
+
+# HuggingFace model to download (optional, for --download-model)
+model: Qwen/Qwen3.5-397B-A17B-FP8
+
+#solo_only: true
+
+# Container image to use
+container: vllm-node-tf5
+
+build_args:
+  - --tf5
+  - --rebuild-flashinfer
+  - --rebuild-vllm
+
+# Mod required to fix a RoPE syntax error
+mods:
+  - mods/fix-qwen3.5-autoround
+
+# Default settings (can be overridden via CLI)
+defaults:
+  port: 8000
+  host: 0.0.0.0
+  tensor_parallel: 4
+  gpu_memory_utilization: 0.85
+  max_model_len: 262144
+  max_num_batched_tokens: 8192
+
+# Environment variables
+env:
+  VLLM_USE_DEEP_GEMM: 0
+  VLLM_USE_FLASHINFER_MOE_FP16: 1
+  VLLM_USE_FLASHINFER_SAMPLER: 0
+  OMP_NUM_THREADS: 4
+
+# The vLLM serve command template
+command: |
+  vllm serve Qwen/Qwen3.5-397B-A17B-FP8 \
+    --max-model-len {max_model_len} \
+    --gpu-memory-utilization {gpu_memory_utilization} \
+    --port {port} \
+    --host {host} \
+    --load-format fastsafetensors \
+    --enable-prefix-caching \
+    --enable-auto-tool-choice \
+    --tool-call-parser qwen3_coder \
+    --reasoning-parser qwen3 \
+    --max-num-batched-tokens {max_num_batched_tokens} \
+    --trust-remote-code \
+    -tp {tensor_parallel} \
+    --distributed-executor-backend ray \
+    --mm-encoder-tp-mode data \
+    --kv-cache-dtype fp8 \
+    --compilation-config.cudagraph_mode NONE \
+    --max-num-seqs 32 \
+    --attention-backend flashinfer
+
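
For reference, with the stock defaults filled in, the MiniMax-M2.5 command
template renders to roughly the invocation below. This is a sketch, not the
runner's literal output: it assumes the recipe runner does plain {placeholder}
substitution from the defaults block and exports each env: entry into the
environment before launch.

  # Assumed expansion of recipes/4x-spark-cluster/minimax-m2.5.yaml
  VLLM_DISTRIBUTED_EXECUTOR_CONFIG='{"placement_group_options":{"strategy":"SPREAD"}}' \
  vllm serve MiniMaxAI/MiniMax-M2.5 \
    --trust-remote-code \
    --port 8000 \
    --host 0.0.0.0 \
    --gpu-memory-utilization 0.90 \
    -tp 4 \
    --distributed-executor-backend ray \
    --max-model-len 128000 \
    --load-format fastsafetensors \
    --enable-auto-tool-choice \
    --tool-call-parser minimax_m2 \
    --reasoning-parser minimax_m2_append_think

The SPREAD placement strategy asks Ray to spread the placement-group bundles
across nodes, which on a 4x Spark cluster should put one tensor-parallel rank
on each node rather than packing them onto one machine.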
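Likewise, the Qwen3.5-397B-A17B-FP8 template with its defaults expands to
roughly the following, under the same substitution assumptions:

  # Assumed expansion of recipes/4x-spark-cluster/qwen3.5-397b-a17B-fp8.yaml
  VLLM_USE_DEEP_GEMM=0 VLLM_USE_FLASHINFER_MOE_FP16=1 \
  VLLM_USE_FLASHINFER_SAMPLER=0 OMP_NUM_THREADS=4 \
  vllm serve Qwen/Qwen3.5-397B-A17B-FP8 \
    --max-model-len 262144 \
    --gpu-memory-utilization 0.85 \
    --port 8000 \
    --host 0.0.0.0 \
    --load-format fastsafetensors \
    --enable-prefix-caching \
    --enable-auto-tool-choice \
    --tool-call-parser qwen3_coder \
    --reasoning-parser qwen3 \
    --max-num-batched-tokens 8192 \
    --trust-remote-code \
    -tp 4 \
    --distributed-executor-backend ray \
    --mm-encoder-tp-mode data \
    --kv-cache-dtype fp8 \
    --compilation-config.cudagraph_mode NONE \
    --max-num-seqs 32 \
    --attention-backend flashinfer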