Updated Nemotron recipes to use the vLLM CUTLASS MoE backend
This commit is contained in:
@@ -27,15 +27,10 @@ defaults:
|
||||
gpu_memory_utilization: 0.7
|
||||
max_model_len: 262144
|
||||
|
||||
# Environment variables
|
||||
env:
|
||||
VLLM_NVFP4_GEMM_BACKEND: "marlin"
|
||||
VLLM_TEST_FORCE_FP8_MARLIN: "1"
|
||||
VLLM_MARLIN_USE_ATOMIC_ADD: "1"
|
||||
|
||||
# The vLLM serve command template
|
||||
command: |
|
||||
vllm serve nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-NVFP4 \
|
||||
--moe-backend cutlass \
|
||||
--max-model-len {max_model_len} \
|
||||
--port {port} --host {host} \
|
||||
--trust-remote-code \
|
||||
|
||||
Reference in New Issue
Block a user