set flashinfer

This commit is contained in:
2026-05-11 11:59:32 -05:00
parent 93d23bc46a
commit 689f7665af

View File

@@ -33,8 +33,6 @@ spec:
env: env:
- name: VLLM_CACHE_ROOT - name: VLLM_CACHE_ROOT
value: /model-store/vllm-cache value: /model-store/vllm-cache
- name: VLLM_ATTENTION_BACKEND
value: FLASHINFER # Faster attention on CUDA >= 12.1
- name: VLLM_DISABLE_LOGGING - name: VLLM_DISABLE_LOGGING
value: "1" # Remove logging overhead value: "1" # Remove logging overhead
command: command:
@@ -64,6 +62,8 @@ spec:
- --max-num-seqs - --max-num-seqs
- "254" # ↑ Allow more concurrent sequences - "254" # ↑ Allow more concurrent sequences
- --enable-prefix-caching - --enable-prefix-caching
- --attention-backend
- FLASHINFER
authSecret: hf-api-secret authSecret: hf-api-secret
storage: storage:
sharedMemorySizeLimit: 64Gi sharedMemorySizeLimit: 64Gi