# Expected vLLM serve arguments for each recipe
# This file is used by test_recipes.sh to verify recipes match README documentation
#
# Format: Each recipe has a section with expected arguments
# Tests will verify these arguments appear in the dry-run output
#
# IMPORTANT: Keep this in sync with README.md documentation
# When updating recipes, update both README.md and this file
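
# Hedged sketch (not part of the documented test_recipes.sh interface): one way
# the harness could check that every expected argument appears verbatim in a
# recipe's dry-run output. The function name and calling convention are
# assumptions for illustration; sourcing this file merely defines the function.
assert_args_present() {
    # $1 = captured dry-run output, remaining args = expected argument strings
    local output="$1"; shift
    local arg status=0
    for arg in "$@"; do
        # -F matches literally; -- stops option parsing since args begin with "--"
        if ! grep -qF -- "$arg" <<<"$output"; then
            echo "MISSING: $arg" >&2
            status=1
        fi
    done
    return "$status"
}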

# ==============================================================================
# glm-4.7-flash-awq
# README Reference: Lines 186-198 (solo) and 203-218 (cluster)
# ==============================================================================
GLM_FLASH_AWQ_MODEL="cyankiwi/GLM-4.7-Flash-AWQ-4bit"
GLM_FLASH_AWQ_CONTAINER="vllm-node-tf5"
GLM_FLASH_AWQ_MOD="mods/fix-glm-4.7-flash-AWQ"
GLM_FLASH_AWQ_ARGS=(
    "--tool-call-parser glm47"
    "--reasoning-parser glm45"
    "--enable-auto-tool-choice"
    "--served-model-name glm-4.7-flash"
    "--max-model-len 202752"
    "--max-num-batched-tokens 4096"
    "--max-num-seqs 64"
    "--gpu-memory-utilization 0.7"
    "--port 8000"
    "--host 0.0.0.0"
)
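
# Illustrative only: how these pieces are assumed to combine into a serve
# command (whether the real recipe execs into the container like this is an
# assumption, not documented behavior):
#
#   docker exec "$GLM_FLASH_AWQ_CONTAINER" \
#       vllm serve "$GLM_FLASH_AWQ_MODEL" ${GLM_FLASH_AWQ_ARGS[@]}
#
# The array elements hold "flag value" pairs as single strings, so they must be
# expanded unquoted for word splitting to separate each flag from its value.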

# ==============================================================================
# openai-gpt-oss-120b
# README Reference: Lines 244-257 (solo) and 264-280 (cluster)
# ==============================================================================
GPT_OSS_MODEL="openai/gpt-oss-120b"
GPT_OSS_CONTAINER="vllm-node-mxfp4"
GPT_OSS_ARGS=(
    "--port 8000"
    "--host 0.0.0.0"
    "--enable-auto-tool-choice"
    "--tool-call-parser openai"
    "--reasoning-parser openai_gptoss"
    "--gpu-memory-utilization 0.7"
    "--enable-prefix-caching"
    "--load-format fastsafetensors"
    "--quantization mxfp4"
    "--mxfp4-backend CUTLASS"
    "--mxfp4-layers moe,qkv,o,lm_head"
    "--attention-backend FLASHINFER"
    "--kv-cache-dtype fp8"
    "--max-num-batched-tokens 8192"
)
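
# Illustrative usage of assert_args_present above; the ./recipe.sh path and
# --dry-run flag are placeholders for whatever test_recipes.sh actually runs:
#
#   dry_run_output="$(./recipe.sh openai-gpt-oss-120b --dry-run 2>&1)"
#   assert_args_present "$dry_run_output" "${GPT_OSS_ARGS[@]}"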

# ==============================================================================
# minimax-m2-awq
# README Reference: Not explicitly documented, but based on model requirements
# ==============================================================================
MINIMAX_MODEL="QuantTrio/MiniMax-M2-AWQ"
MINIMAX_CONTAINER="vllm-node"
MINIMAX_ARGS=(
    "--port 8000"
    "--host 0.0.0.0"
    "--gpu-memory-utilization 0.7"
    "--max-model-len 128000"
    "--load-format fastsafetensors"
    "--enable-auto-tool-choice"
    "--tool-call-parser minimax_m2"
    "--reasoning-parser minimax_m2_append_think"
)

# ==============================================================================
# Cluster Mode Expected Arguments
# These are arguments that should appear ONLY in cluster mode
# Note: Tests use 2 nodes, so tensor_parallel = 2 (1 GPU per node)
# ==============================================================================

# glm-4.7-flash-awq cluster mode (no distributed backend - single-GPU model)
GLM_FLASH_AWQ_CLUSTER_TP="1"

# openai-gpt-oss-120b cluster mode (2 nodes = tp 2)
GPT_OSS_CLUSTER_TP="2"
GPT_OSS_CLUSTER_ARGS=(
    "--distributed-executor-backend ray"
)

# minimax-m2-awq cluster mode (2 nodes = tp 2)
MINIMAX_CLUSTER_TP="2"
MINIMAX_CLUSTER_ARGS=(
    "--distributed-executor-backend ray"
)
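
# Illustrative only: cluster-mode checks are assumed to also cover the
# tensor-parallel size (--tensor-parallel-size is the standard vLLM flag; how
# the recipe passes the TP value is an assumption) plus the cluster-only args:
#
#   assert_args_present "$dry_run_output" \
#       "--tensor-parallel-size $GPT_OSS_CLUSTER_TP" \
#       "${GPT_OSS_CLUSTER_ARGS[@]}"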