Adding suggestions from Eugr and unit tests

2026-02-03 17:32:59 -05:00
parent 30f16f1d4e
commit 28ba6090fc
9 changed files with 1024 additions and 21 deletions
--- a/tests/expected_commands.sh
+++ b/tests/expected_commands.sh
@@ -0,0 +1,89 @@
+# Expected vLLM serve arguments for each recipe
+# This file is used by test_recipes.sh to verify recipes match README documentation
+#
+# Format: Each recipe has a section with expected arguments
+# Tests will verify these arguments appear in the dry-run output
+#
+# IMPORTANT: Keep this in sync with README.md documentation
+# When updating recipes, update both README.md and this file
+
+# ==============================================================================
+# glm-4.7-flash-awq
+# README Reference: Lines 186-198 (solo) and 203-218 (cluster)
+# ==============================================================================
+GLM_FLASH_AWQ_MODEL="cyankiwi/GLM-4.7-Flash-AWQ-4bit"
+GLM_FLASH_AWQ_CONTAINER="vllm-node-tf5"
+GLM_FLASH_AWQ_MOD="mods/fix-glm-4.7-flash-AWQ"
+GLM_FLASH_AWQ_ARGS=(
+    "--tool-call-parser glm47"
+    "--reasoning-parser glm45"
+    "--enable-auto-tool-choice"
+    "--served-model-name glm-4.7-flash"
+    "--max-model-len 202752"
+    "--max-num-batched-tokens 4096"
+    "--max-num-seqs 64"
+    "--gpu-memory-utilization 0.7"
+    "--port 8888"
+    "--host 0.0.0.0"
+)
+
+# ==============================================================================
+# openai-gpt-oss-120b
+# README Reference: Lines 244-257 (solo) and 264-280 (cluster)
+# ==============================================================================
+GPT_OSS_MODEL="openai/gpt-oss-120b"
+GPT_OSS_CONTAINER="vllm-node-mxfp4"
+GPT_OSS_ARGS=(
+    "--port 8888"
+    "--host 0.0.0.0"
+    "--enable-auto-tool-choice"
+    "--tool-call-parser openai"
+    "--reasoning-parser openai_gptoss"
+    "--gpu-memory-utilization 0.7"
+    "--enable-prefix-caching"
+    "--load-format fastsafetensors"
+    "--quantization mxfp4"
+    "--mxfp4-backend CUTLASS"
+    "--mxfp4-layers moe,qkv,o,lm_head"
+    "--attention-backend FLASHINFER"
+    "--kv-cache-dtype fp8"
+    "--max-num-batched-tokens 8192"
+)
+
+# ==============================================================================
+# minimax-m2-awq
+# README Reference: Not explicitly documented, but based on model requirements
+# ==============================================================================
+MINIMAX_MODEL="QuantTrio/MiniMax-M2-AWQ"
+MINIMAX_CONTAINER="vllm-node"
+MINIMAX_ARGS=(
+    "--port 8000"
+    "--host 0.0.0.0"
+    "--gpu-memory-utilization 0.7"
+    "--max-model-len 128000"
+    "--load-format fastsafetensors"
+    "--enable-auto-tool-choice"
+    "--tool-call-parser minimax_m2"
+    "--reasoning-parser minimax_m2_append_think"
+)
+
+# ==============================================================================
+# Cluster Mode Expected Arguments
+# These are arguments that should appear ONLY in cluster mode
+# Note: Tests use 2 nodes, so tensor_parallel = 2 (1 GPU per node)
+# ==============================================================================
+
+# glm-4.7-flash-awq cluster mode (no distributed backend - single GPU model)
+GLM_FLASH_AWQ_CLUSTER_TP="1"
+
+# openai-gpt-oss-120b cluster mode (2 nodes = tp 2)
+GPT_OSS_CLUSTER_TP="2"
+GPT_OSS_CLUSTER_ARGS=(
+    "--distributed-executor-backend ray"
+)
+
+# minimax-m2-awq cluster mode (2 nodes = tp 2)
+MINIMAX_CLUSTER_TP="2"
+MINIMAX_CLUSTER_ARGS=(
+    "--distributed-executor-backend ray"
+)