# Expected vLLM serve arguments for each recipe
# This file is used by test_recipes.sh to verify recipes match README documentation
#
# Format: Each recipe has a section with expected arguments
# Tests will verify these arguments appear in the dry-run output
#
# IMPORTANT: Keep this in sync with README.md documentation
# When updating recipes, update both README.md and this file
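
# Hedged sketch (not part of the documented test_recipes.sh interface): one way
# the harness could check that every expected argument appears verbatim in a
# recipe's dry-run output. The function name and calling convention are
# assumptions for illustration; sourcing this file merely defines the function.
assert_args_present() {
    # $1 = captured dry-run output, remaining args = expected argument strings
    local output="$1"; shift
    local arg status=0
    for arg in "$@"; do
        # -F matches literally; -- stops option parsing since args begin with "--"
        if ! grep -qF -- "$arg" <<<"$output"; then
            echo "MISSING: $arg" >&2
            status=1
        fi
    done
    return "$status"
}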

# ==============================================================================
# glm-4.7-flash-awq
# README Reference: Lines 186-198 (solo) and 203-218 (cluster)
# ==============================================================================
GLM_FLASH_AWQ_MODEL="cyankiwi/GLM-4.7-Flash-AWQ-4bit"
GLM_FLASH_AWQ_CONTAINER="vllm-node-tf5"
GLM_FLASH_AWQ_MOD="mods/fix-glm-4.7-flash-AWQ"
GLM_FLASH_AWQ_ARGS=(
    "--tool-call-parser glm47"
    "--reasoning-parser glm45"
    "--enable-auto-tool-choice"
    "--served-model-name glm-4.7-flash"
    "--max-model-len 202752"
    "--max-num-batched-tokens 4096"
    "--max-num-seqs 64"
    "--gpu-memory-utilization 0.7"
    "--port 8000"
    "--host 0.0.0.0"
)
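
# Illustrative only: how these pieces are assumed to combine into a serve
# command (whether the real recipe execs into the container like this is an
# assumption, not documented behavior):
#
#   docker exec "$GLM_FLASH_AWQ_CONTAINER" \
#       vllm serve "$GLM_FLASH_AWQ_MODEL" ${GLM_FLASH_AWQ_ARGS[@]}
#
# The array elements hold "flag value" pairs as single strings, so they must be
# expanded unquoted for word splitting to separate each flag from its value.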

# ==============================================================================
# openai-gpt-oss-120b
# README Reference: Lines 244-257 (solo) and 264-280 (cluster)
# ==============================================================================
GPT_OSS_MODEL="openai/gpt-oss-120b"
GPT_OSS_CONTAINER="vllm-node-mxfp4"
GPT_OSS_ARGS=(
    "--port 8000"
    "--host 0.0.0.0"
    "--enable-auto-tool-choice"
    "--tool-call-parser openai"
    "--reasoning-parser openai_gptoss"
    "--gpu-memory-utilization 0.7"
    "--enable-prefix-caching"
    "--load-format fastsafetensors"
    "--quantization mxfp4"
    "--mxfp4-backend CUTLASS"
    "--mxfp4-layers moe,qkv,o,lm_head"
    "--attention-backend FLASHINFER"
    "--kv-cache-dtype fp8"
    "--max-num-batched-tokens 8192"
)
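
# Illustrative usage of assert_args_present above; the ./recipe.sh path and
# --dry-run flag are placeholders for whatever test_recipes.sh actually runs:
#
#   dry_run_output="$(./recipe.sh openai-gpt-oss-120b --dry-run 2>&1)"
#   assert_args_present "$dry_run_output" "${GPT_OSS_ARGS[@]}"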

# ==============================================================================
# minimax-m2-awq
# README Reference: Not explicitly documented, but based on model requirements
# ==============================================================================
MINIMAX_MODEL="QuantTrio/MiniMax-M2-AWQ"
MINIMAX_CONTAINER="vllm-node"
MINIMAX_ARGS=(
    "--port 8000"
    "--host 0.0.0.0"
    "--gpu-memory-utilization 0.7"
    "--max-model-len 128000"
    "--load-format fastsafetensors"
    "--enable-auto-tool-choice"
    "--tool-call-parser minimax_m2"
    "--reasoning-parser minimax_m2_append_think"
)

# ==============================================================================
# Cluster Mode Expected Arguments
# These are arguments that should appear ONLY in cluster mode
# Note: Tests use 2 nodes, so tensor_parallel = 2 (1 GPU per node)
# ==============================================================================

# glm-4.7-flash-awq cluster mode (no distributed backend - single-GPU model)
GLM_FLASH_AWQ_CLUSTER_TP="1"

# openai-gpt-oss-120b cluster mode (2 nodes = tp 2)
GPT_OSS_CLUSTER_TP="2"
GPT_OSS_CLUSTER_ARGS=(
    "--distributed-executor-backend ray"
)

# minimax-m2-awq cluster mode (2 nodes = tp 2)
MINIMAX_CLUSTER_TP="2"
MINIMAX_CLUSTER_ARGS=(
    "--distributed-executor-backend ray"
)
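
# Illustrative only: cluster-mode checks are assumed to also cover the
# tensor-parallel size (--tensor-parallel-size is the standard vLLM flag; how
# the recipe passes the TP value is an assumption) plus the cluster-only args:
#
#   assert_args_present "$dry_run_output" \
#       "--tensor-parallel-size $GPT_OSS_CLUSTER_TP" \
#       "${GPT_OSS_CLUSTER_ARGS[@]}"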