Merge branch 'main' into privileged
This commit is contained in:
23
mods/fix-glm-4.7-flash-AWQ/glm47_vllm_bug.patch
Normal file
23
mods/fix-glm-4.7-flash-AWQ/glm47_vllm_bug.patch
Normal file
@@ -0,0 +1,23 @@
|
|||||||
|
--- a/vllm/model_executor/layers/attention/mla_attention.py
|
||||||
|
+++ b/vllm/model_executor/layers/attention/mla_attention.py
|
||||||
|
@@ -403,7 +403,7 @@
|
||||||
|
# If kv_b_proj_weight is unquantized, quantize it to mxfp4 if supported
|
||||||
|
self.is_aiter_triton_fp4_bmm_enabled = (
|
||||||
|
rocm_aiter_ops.is_fp4bmm_enabled()
|
||||||
|
- and self.kv_b_proj.weight.dtype == torch.bfloat16
|
||||||
|
+ and (self.kv_b_proj.weight.dtype if hasattr(self.kv_b_proj, "weight") else torch.bfloat16) == torch.bfloat16
|
||||||
|
)
|
||||||
|
|
||||||
|
# Attributes for forward_impl method
|
||||||
|
@@ -2358,9 +2358,9 @@
|
||||||
|
# model dtype input and will quantize internally.
|
||||||
|
if (
|
||||||
|
use_fp8_prefill
|
||||||
|
- or self.kv_b_proj.weight.dtype != current_platform.fp8_dtype()
|
||||||
|
+ or (self.kv_b_proj.weight.dtype if hasattr(self.kv_b_proj, "weight") else torch.bfloat16) != current_platform.fp8_dtype()
|
||||||
|
):
|
||||||
|
- kv_c_normed = kv_c_normed.to(self.kv_b_proj.weight.dtype)
|
||||||
|
+ kv_c_normed = kv_c_normed.to((self.kv_b_proj.weight.dtype if hasattr(self.kv_b_proj, "weight") else torch.bfloat16))
|
||||||
|
|
||||||
|
k_pe = workspace[:toks][..., self.kv_lora_rank :].unsqueeze(1)
|
||||||
|
kv_nope = self.kv_b_proj(kv_c_normed)[0].view(
|
||||||
@@ -1,3 +1,8 @@
|
|||||||
#!/bin/bash
# Applies the patches shipped with this mod:
#   1. glm47_flash.patch    - applied relative to the filesystem root; any
#                             failure aborts the script (set -e).
#   2. glm47_vllm_bug.patch - applied inside the Python dist-packages tree;
#                             best-effort, skipped when it does not apply.
set -e

# Directory that the vLLM crash patch is applied against.
VLLM_SITE_DIR="/usr/local/lib/python3.12/dist-packages"

echo "--- Applying GLM 4.7 AWQ speed patch..."
patch -p1 -d / < glm47_flash.patch
echo "=== OK"

echo "--- Applying vLLM crash patch..."
# Probe with --dry-run first: if only some hunks match, a plain `patch` run
# would leave the installed package half-modified. Applying only after a clean
# dry run keeps the step all-or-nothing, and reporting inside the branches
# avoids printing "OK" for a patch that was actually skipped.
if patch --dry-run -p1 -d "$VLLM_SITE_DIR" < glm47_vllm_bug.patch > /dev/null 2>&1; then
    patch -p1 -d "$VLLM_SITE_DIR" < glm47_vllm_bug.patch
    echo "=== OK"
else
    echo "=== Patch is not applicable, skipping"
fi
|
||||||
|
|||||||
Reference in New Issue
Block a user