diff --git a/README.md b/README.md
index 80d463b..0fc55f9 100644
--- a/README.md
+++ b/README.md
@@ -164,6 +164,16 @@ Don't do it every time you rebuild, because it will slow down compilation times.
 
 For periodic maintenance, I recommend using a filter: `docker builder prune --filter until=72h`
 
+### 2026-02-12
+
+Added a mod for Qwen3-Coder-Next-FP8 that fixes:
+
+- A bug with Triton allocator (https://github.com/vllm-project/vllm/issues/33857) that prevented the model to run in a cluster.
+- A bug that introduced crash when `--enable-prefix-caching` is on (https://github.com/vllm-project/vllm/issues/34361).
+- A bug that significantly impacted the performance on Spark (https://github.com/vllm-project/vllm/issues/34413).
+
+This mod was included in `qwen3-coder-next-fp8` recipe.
+
 ### 2026-02-11
 
 #### Configurable GPU Architecture
diff --git a/mods/fix-qwen3-coder-next/_triton_alloc_setup.pth b/mods/fix-qwen3-coder-next/_triton_alloc_setup.pth
new file mode 100644
index 0000000..e658dd1
--- /dev/null
+++ b/mods/fix-qwen3-coder-next/_triton_alloc_setup.pth
@@ -0,0 +1 @@
+import _triton_alloc_setup
diff --git a/mods/fix-qwen3-coder-next/_triton_alloc_setup.py b/mods/fix-qwen3-coder-next/_triton_alloc_setup.py
new file mode 100644
index 0000000..8ade2bf
--- /dev/null
+++ b/mods/fix-qwen3-coder-next/_triton_alloc_setup.py
@@ -0,0 +1,9 @@
+try:
+    import triton.runtime._allocation as _alloc
+    import torch
+
+    _alloc.NullAllocator.__call__ = staticmethod(
+        lambda size, alignment, stream:
+            torch.cuda.caching_allocator_alloc(size, stream=stream))
+except Exception:
+    pass
diff --git a/mods/fix-qwen3-coder-next/fix_crash.diff b/mods/fix-qwen3-coder-next/fix_crash.diff
new file mode 100644
index 0000000..5b5dcc0
--- /dev/null
+++ b/mods/fix-qwen3-coder-next/fix_crash.diff
@@ -0,0 +1,14 @@
+diff --git a/vllm/v1/core/single_type_kv_cache_manager.py b/vllm/v1/core/single_type_kv_cache_manager.py
+index 0b6b7ed42ac1..b6e0305a312d 100644
+--- a/vllm/v1/core/single_type_kv_cache_manager.py
++++ b/vllm/v1/core/single_type_kv_cache_manager.py
+@@ -1000,7 +1000,8 @@ def cache_blocks(self, request: Request, num_tokens: int) -> None:
+             for block in self.req_to_blocks[request.request_id][
+                 num_cached_blocks_before:num_cached_blocks_after
+             ]:
+-                assert block.block_hash is not None
++                if block.is_null:
++                    continue
+                 self.cached_blocks_this_step.add(block.block_hash)
+ 
+     def new_step_starts(self) -> None:
diff --git a/mods/fix-qwen3-coder-next/fix_slowness.diff b/mods/fix-qwen3-coder-next/fix_slowness.diff
new file mode 100644
index 0000000..d3111bf
--- /dev/null
+++ b/mods/fix-qwen3-coder-next/fix_slowness.diff
@@ -0,0 +1,72 @@
+diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
+index 63aae43c3ddf..6ca3213fbd8d 100644
+--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
++++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
+@@ -95,19 +95,19 @@ def fused_moe_kernel_gptq_awq(
+     # moving by 1 element in a particular dimension. E.g. `stride_am` is
+     # how much to increase `a_ptr` by to get the element one row down
+     # (A has M rows).
+-    stride_am,
+-    stride_ak,
+-    stride_be,
+-    stride_bk,
+-    stride_bn,
+-    stride_cm,
+-    stride_cn,
+-    stride_bse,
+-    stride_bsk,
+-    stride_bsn,
+-    stride_bze,
+-    stride_bzk,
+-    stride_bzn,
++    stride_am: tl.int64,
++    stride_ak: tl.int64,
++    stride_be: tl.int64,
++    stride_bk: tl.int64,
++    stride_bn: tl.int64,
++    stride_cm: tl.int64,
++    stride_cn: tl.int64,
++    stride_bse: tl.int64,
++    stride_bsk: tl.int64,
++    stride_bsn: tl.int64,
++    stride_bze: tl.int64,
++    stride_bzk: tl.int64,
++    stride_bzn: tl.int64,
+     block_k_diviable: tl.constexpr,
+     group_size: tl.constexpr,
+     # Meta-parameters
+@@ -329,20 +329,20 @@ def fused_moe_kernel(
+     # moving by 1 element in a particular dimension. E.g. `stride_am` is
+     # how much to increase `a_ptr` by to get the element one row down
+     # (A has M rows).
+-    stride_am,
+-    stride_ak,
+-    stride_be,
+-    stride_bk,
+-    stride_bn,
+-    stride_cm,
+-    stride_cn,
+-    stride_asm,
+-    stride_ask,
+-    stride_bse,
+-    stride_bsk,
+-    stride_bsn,
+-    stride_bbe,  # bias expert stride
+-    stride_bbn,  # bias N stride
++    stride_am: tl.int64,
++    stride_ak: tl.int64,
++    stride_be: tl.int64,
++    stride_bk: tl.int64,
++    stride_bn: tl.int64,
++    stride_cm: tl.int64,
++    stride_cn: tl.int64,
++    stride_asm: tl.int64,
++    stride_ask: tl.int64,
++    stride_bse: tl.int64,
++    stride_bsk: tl.int64,
++    stride_bsn: tl.int64,
++    stride_bbe: tl.int64,  # bias expert stride
++    stride_bbn: tl.int64,  # bias N stride
+     # Block size for block-wise quantization
+     group_n: tl.constexpr,
+     group_k: tl.constexpr,
diff --git a/mods/fix-qwen3-coder-next/run.sh b/mods/fix-qwen3-coder-next/run.sh
new file mode 100644
index 0000000..b0f3499
--- /dev/null
+++ b/mods/fix-qwen3-coder-next/run.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+set -e
+
+echo "Patching Qwen3-Coder-Next crashing on start"
+patch -p1 -d /usr/local/lib/python3.12/dist-packages < fix_crash.diff || echo "Patch is not applicable, skipping"
+
+echo "Reverting PR #34279 that causes slowness"
+patch -p1 -R -d /usr/local/lib/python3.12/dist-packages < fix_slowness.diff || echo "Reversing PR #34279 failed, skipping"
+
+echo "Fixing Triton allocator bug"
+cp _triton* /usr/local/lib/python3.12/dist-packages/
diff --git a/recipes/qwen3-coder-next-fp8.yaml b/recipes/qwen3-coder-next-fp8.yaml
index 0200c0c..192db84 100644
--- a/recipes/qwen3-coder-next-fp8.yaml
+++ b/recipes/qwen3-coder-next-fp8.yaml
@@ -1,30 +1,30 @@
 # Recipe: Qwen3-Coder-Next-FP8
 # Qwen3-Coder-Next model in native FP8 format
-# Currently can only be run in solo mode, cluster mode fails with error - tracking https://github.com/vllm-project/vllm/issues/33857
+
 
 recipe_version: "1"
 name: Qwen3-Coder-Next-FP8
-description: vLLM serving Qwen3-Coder-Next-FP8 on a SINGLE NODE ONLY!
+description: vLLM serving Qwen3-Coder-Next-FP8
 
 # HuggingFace model to download (optional, for --download-model)
 model: Qwen/Qwen3-Coder-Next-FP8
 
-# This model can only run on single node (solo)
-solo_only: true
+#solo_only: true
 
 # Container image to use
 container: vllm-node
 
-# No mods required
-mods: []
+# Mod required to fix slowness and crash in the cluster (tracking https://github.com/vllm-project/vllm/issues/33857)
+mods:
+  - mods/fix-qwen3-coder-next
 
 # Default settings (can be overridden via CLI)
 defaults:
   port: 8000
   host: 0.0.0.0
-  tensor_parallel: 1
+  tensor_parallel: 2
   gpu_memory_utilization: 0.7
-  max_model_len: 131072
+  max_model_len: 262144
 
 # Environment variables
 env: {}
@@ -40,4 +40,7 @@ command: |
     --load-format fastsafetensors \
     --attention-backend flashinfer \
     --enable-prefix-caching \
-    --max-model-len {max_model_len} 
+    --max-model-len {max_model_len} \
+    -tp {tensor_parallel} \
+    --distributed-executor-backend ray
+