diff --git a/mods/fix-glm-4.7-flash-AWQ/glm47_vllm_bug.patch b/mods/fix-glm-4.7-flash-AWQ/glm47_vllm_bug.patch
new file mode 100644
index 0000000..317366b
--- /dev/null
+++ b/mods/fix-glm-4.7-flash-AWQ/glm47_vllm_bug.patch
@@ -0,0 +1,23 @@
+--- a/vllm/model_executor/layers/attention/mla_attention.py
++++ b/vllm/model_executor/layers/attention/mla_attention.py
+@@ -403,7 +403,7 @@
+         # If kv_b_proj_weight is unquantized, quantize it to mxfp4 if supported
+         self.is_aiter_triton_fp4_bmm_enabled = (
+             rocm_aiter_ops.is_fp4bmm_enabled()
+-            and self.kv_b_proj.weight.dtype == torch.bfloat16
++            and (self.kv_b_proj.weight.dtype if hasattr(self.kv_b_proj, "weight") else torch.bfloat16) == torch.bfloat16
+         )
+ 
+         # Attributes for forward_impl method
+@@ -2358,9 +2358,9 @@
+         # model dtype input and will quantize internally.
+         if (
+             use_fp8_prefill
+-            or self.kv_b_proj.weight.dtype != current_platform.fp8_dtype()
++            or (self.kv_b_proj.weight.dtype if hasattr(self.kv_b_proj, "weight") else torch.bfloat16) != current_platform.fp8_dtype()
+         ):
+-            kv_c_normed = kv_c_normed.to(self.kv_b_proj.weight.dtype)
++            kv_c_normed = kv_c_normed.to((self.kv_b_proj.weight.dtype if hasattr(self.kv_b_proj, "weight") else torch.bfloat16))
+ 
+         k_pe = workspace[:toks][..., self.kv_lora_rank :].unsqueeze(1)
+         kv_nope = self.kv_b_proj(kv_c_normed)[0].view(
diff --git a/mods/fix-glm-4.7-flash-AWQ/run.sh b/mods/fix-glm-4.7-flash-AWQ/run.sh
index c61bd6a..e696e8e 100644
--- a/mods/fix-glm-4.7-flash-AWQ/run.sh
+++ b/mods/fix-glm-4.7-flash-AWQ/run.sh
@@ -1,3 +1,8 @@
 #!/bin/bash
 set -e
+echo "--- Applying GLM 4.7 AWQ speed patch..."
 patch -p1 -d / < glm47_flash.patch
+echo "=== OK"
+echo "--- Applying vLLM crash patch..."
+patch -p1 -d /usr/local/lib/python3.12/dist-packages < glm47_vllm_bug.patch || echo "=== Patch is not applicable, skipping"
+echo "=== OK"
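
Background on the hasattr guard in the patch above, as a minimal standalone sketch: AWQ-quantized linear layers typically store packed integer weights (e.g. qweight/scales) instead of a plain .weight tensor, so the unguarded self.kv_b_proj.weight.dtype access raises AttributeError and crashes the server; the guard falls back to torch.bfloat16 in that case. QuantizedLinear below is a hypothetical stand-in for illustration, not vLLM's real AWQ layer class.

    import torch

    class QuantizedLinear:
        """Stand-in for an AWQ-style layer: packed weights, no `.weight`."""
        def __init__(self) -> None:
            self.qweight = torch.zeros(16, 16, dtype=torch.int32)

    kv_b_proj = QuantizedLinear()

    # Unguarded access -> AttributeError (the crash the patch works around):
    try:
        _ = kv_b_proj.weight.dtype
    except AttributeError as exc:
        print(f"unguarded access fails: {exc}")

    # Guarded access, as in the patch: fall back to bfloat16 when there is
    # no plain weight tensor whose dtype can be inspected.
    dtype = kv_b_proj.weight.dtype if hasattr(kv_b_proj, "weight") else torch.bfloat16
    print(dtype)  # torch.bfloat16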