From 689f7665af544107bbed2df52f38427668d8b243 Mon Sep 17 00:00:00 2001 From: HaimKortovich Date: Mon, 11 May 2026 11:59:32 -0500 Subject: [PATCH] set flashinfer --- clusters/k3s-dgx/nim-service/qwen.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clusters/k3s-dgx/nim-service/qwen.yaml b/clusters/k3s-dgx/nim-service/qwen.yaml index c42626f..932f138 100644 --- a/clusters/k3s-dgx/nim-service/qwen.yaml +++ b/clusters/k3s-dgx/nim-service/qwen.yaml @@ -33,8 +33,6 @@ spec: env: - name: VLLM_CACHE_ROOT value: /model-store/vllm-cache - - name: VLLM_ATTENTION_BACKEND - value: FLASHINFER # Faster attention on CUDA >= 12.1 - name: VLLM_DISABLE_LOGGING value: "1" # Remove logging overhead command: @@ -64,6 +62,8 @@ spec: - --max-num-seqs - "254" # ↑ Allow more concurrent sequences - --enable-prefix-caching + - --attention-backend + - FLASHINFER authSecret: hf-api-secret storage: sharedMemorySizeLimit: 64Gi