set flashinfer
This commit is contained in:
@@ -33,8 +33,6 @@ spec:
|
|||||||
env:
|
env:
|
||||||
- name: VLLM_CACHE_ROOT
|
- name: VLLM_CACHE_ROOT
|
||||||
value: /model-store/vllm-cache
|
value: /model-store/vllm-cache
|
||||||
- name: VLLM_ATTENTION_BACKEND
|
|
||||||
value: FLASHINFER # Faster attention on CUDA >= 12.1
|
|
||||||
- name: VLLM_DISABLE_LOGGING
|
- name: VLLM_DISABLE_LOGGING
|
||||||
value: "1" # Remove logging overhead
|
value: "1" # Remove logging overhead
|
||||||
command:
|
command:
|
||||||
@@ -64,6 +62,8 @@ spec:
|
|||||||
- --max-num-seqs
|
- --max-num-seqs
|
||||||
- "254" # ↑ Allow more concurrent sequences
|
- "254" # ↑ Allow more concurrent sequences
|
||||||
- --enable-prefix-caching
|
- --enable-prefix-caching
|
||||||
|
- --attention-backend
|
||||||
|
- FLASHINFER
|
||||||
authSecret: hf-api-secret
|
authSecret: hf-api-secret
|
||||||
storage:
|
storage:
|
||||||
sharedMemorySizeLimit: 64Gi
|
sharedMemorySizeLimit: 64Gi
|
||||||
|
|||||||
Reference in New Issue
Block a user