#!/bin/bash
# PROFILE: Salyut1/GLM-4.7-NVFP4
# DESCRIPTION: vLLM serving GLM-4.7-NVFP4
# NOTE: This profile requires --apply-mod mods/fix-Salyut1-GLM-4.7-NVFP4 to fix k/v scales incompatibility
# See: https://huggingface.co/Salyut1/GLM-4.7-NVFP4/discussions/3#694ab9b6e2efa04b7ecb0c4b

# Launch the vLLM server for Salyut1/GLM-4.7-NVFP4 with:
#   - FlashInfer attention backend
#   - glm47 tool-call parser, glm45 reasoning parser, automatic tool choice on
#   - -tp 2 (tensor-parallel size per the vLLM CLI; presumably needs 2 GPUs --
#     confirm against the target hardware)
#   - GPU memory utilization 0.88, maximum model length 32000
#   - Ray as the distributed executor backend
#   - HTTP endpoint on 0.0.0.0 (all interfaces), port 8000
#
# The whole invocation is ONE command: every option line must end with a
# trailing backslash, and nothing (no blank line, comment, or other text) may
# sit between the continued lines, or the shell will split or reject it.
vllm serve Salyut1/GLM-4.7-NVFP4 \
    --attention-config.backend flashinfer \
    --tool-call-parser glm47 \
    --reasoning-parser glm45 \
    --enable-auto-tool-choice \
    -tp 2 \
    --gpu-memory-utilization 0.88 \
    --max-model-len 32000 \
    --distributed-executor-backend ray \
    --host 0.0.0.0 \
    --port 8000