diff --git a/clusters/k3s-dgx/nim-service/qwen.yaml b/clusters/k3s-dgx/nim-service/qwen.yaml
index 61fae86..71c84bb 100644
--- a/clusters/k3s-dgx/nim-service/qwen.yaml
+++ b/clusters/k3s-dgx/nim-service/qwen.yaml
@@ -63,8 +63,8 @@ spec:
             - "131072"  # ↑ 32K→128K — larger decode batches = more tokens/sec
             - --max-num-seqs
             - "254"  # ↑ Allow more concurrent sequences
-              --enable-prefix-caching
-              --dtype
+            - --enable-prefix-caching
+            - --dtype
             - "float8"  # Explicit FP8 encoding
-              --quantization
+            - --quantization
             - "fbgemm-fp8"  # Explicit quantization backend