# shellcheck shell=bash
# Source: commit "Adding suggestions from Eugr and unit tests"
# New file: tests/expected_commands.sh (89 lines)
# Expected vLLM serve arguments for each recipe
# This file is used by test_recipes.sh to verify recipes match README documentation
#
# Format: Each recipe has a section with expected arguments
# Tests will verify these arguments appear in the dry-run output
#
# IMPORTANT: Keep this in sync with README.md documentation
# When updating recipes, update both README.md and this file
# ==============================================================================
# glm-4.7-flash-awq
# README Reference: Lines 186-198 (solo) and 203-218 (cluster)
# ==============================================================================
# NOTE(review): values must stay byte-identical to the README serve command;
# test_recipes.sh greps the dry-run output for each entry verbatim.
# shellcheck disable=SC2034  # variables are consumed by the sourcing test script
GLM_FLASH_AWQ_MODEL="cyankiwi/GLM-4.7-Flash-AWQ-4bit"
GLM_FLASH_AWQ_CONTAINER="vllm-node-tf5"
GLM_FLASH_AWQ_MOD="mods/fix-glm-4.7-flash-AWQ"
# Each element is one "--flag value" pair, kept as a single string on purpose
# so the test can match the flag together with its value.
GLM_FLASH_AWQ_ARGS=(
  "--tool-call-parser glm47"
  "--reasoning-parser glm45"
  "--enable-auto-tool-choice"
  "--served-model-name glm-4.7-flash"
  "--max-model-len 202752"
  "--max-num-batched-tokens 4096"
  "--max-num-seqs 64"
  "--gpu-memory-utilization 0.7"
  "--port 8888"
  "--host 0.0.0.0"
)
# ==============================================================================
# openai-gpt-oss-120b
# README Reference: Lines 244-257 (solo) and 264-280 (cluster)
# ==============================================================================
# shellcheck disable=SC2034  # variables are consumed by the sourcing test script
GPT_OSS_MODEL="openai/gpt-oss-120b"
GPT_OSS_CONTAINER="vllm-node-mxfp4"
# Each element is one "--flag value" pair, kept as a single string on purpose
# so the test can match the flag together with its value.
GPT_OSS_ARGS=(
  "--port 8888"
  "--host 0.0.0.0"
  "--enable-auto-tool-choice"
  "--tool-call-parser openai"
  "--reasoning-parser openai_gptoss"
  "--gpu-memory-utilization 0.7"
  "--enable-prefix-caching"
  "--load-format fastsafetensors"
  "--quantization mxfp4"
  "--mxfp4-backend CUTLASS"
  "--mxfp4-layers moe,qkv,o,lm_head"
  "--attention-backend FLASHINFER"
  "--kv-cache-dtype fp8"
  "--max-num-batched-tokens 8192"
)
# ==============================================================================
# minimax-m2-awq
# README Reference: Not explicitly documented, but based on model requirements
# ==============================================================================
# shellcheck disable=SC2034  # variables are consumed by the sourcing test script
MINIMAX_MODEL="QuantTrio/MiniMax-M2-AWQ"
MINIMAX_CONTAINER="vllm-node"
# Each element is one "--flag value" pair, kept as a single string on purpose
# so the test can match the flag together with its value.
# NOTE(review): this recipe uses port 8000 while the other two use 8888 —
# confirm against the recipe script that this difference is intentional.
MINIMAX_ARGS=(
  "--port 8000"
  "--host 0.0.0.0"
  "--gpu-memory-utilization 0.7"
  "--max-model-len 128000"
  "--load-format fastsafetensors"
  "--enable-auto-tool-choice"
  "--tool-call-parser minimax_m2"
  "--reasoning-parser minimax_m2_append_think"
)
# ==============================================================================
# Cluster Mode Expected Arguments
# These are arguments that should appear ONLY in cluster mode
# Note: Tests use 2 nodes, so tensor_parallel = 2 (1 GPU per node)
# ==============================================================================
# shellcheck disable=SC2034  # variables are consumed by the sourcing test script

# glm-4.7-flash-awq cluster mode (no distributed backend - single GPU model)
GLM_FLASH_AWQ_CLUSTER_TP="1"

# openai-gpt-oss-120b cluster mode (2 nodes = tp 2)
GPT_OSS_CLUSTER_TP="2"
GPT_OSS_CLUSTER_ARGS=(
  "--distributed-executor-backend ray"
)

# minimax-m2-awq cluster mode (2 nodes = tp 2)
MINIMAX_CLUSTER_TP="2"
MINIMAX_CLUSTER_ARGS=(
  "--distributed-executor-backend ray"
)
# End of expected_commands.sh