A recipe for Gemma4-26B

2026-04-02 23:53:55 -07:00
parent 44808f7018
commit ed32612cdd
2 changed files with 65 additions and 0 deletions
--- a/recipes/gemma4-26b-a4b.yaml
+++ b/recipes/gemma4-26b-a4b.yaml
@@ -0,0 +1,49 @@
+# Recipe: Gemma4-26B-A4B
+# Serves the Gemma4-26B-A4B model with online FP8 quantization (--quantization fp8)
+
+recipe_version: "1"
+name: Gemma4-26B-A4B
+description: vLLM serving Gemma4-26B-A4B
+
+# HuggingFace model to download (optional, for --download-model); same id is hard-coded in `command`
+model: google/gemma-4-26B-A4B-it
+
+# Neither mode is forced (both flags false) - NOTE(review): confirm solo runs work, else set cluster_only: true
+cluster_only: false
+solo_only: false
+
+# Container image to use
+container: vllm-node-tf5
+
+# No mods required
+mods: []
+
+# Default settings (can be overridden via CLI); keys match the {placeholders} in `command`
+defaults:
+  port: 8000
+  host: 0.0.0.0
+  tensor_parallel: 2  # forwarded to -tp
+  gpu_memory_utilization: 0.7
+  max_model_len: 262144  # 256 * 1024
+  max_num_batched_tokens: 8192
+
+# Environment variables
+env: {}
+
+# The vLLM serve command template; {name} placeholders correspond to keys under `defaults`
+command: |
+  vllm serve google/gemma-4-26B-A4B-it  \
+    --max-model-len {max_model_len} \
+    --gpu-memory-utilization {gpu_memory_utilization} \
+    --port {port} \
+    --host {host} \
+    --load-format fastsafetensors \
+    --enable-prefix-caching \
+    --enable-auto-tool-choice \
+    --tool-call-parser gemma4 \
+    --reasoning-parser gemma4 \
+    --quantization fp8 \
+    --kv-cache-dtype fp8 \
+    --max-num-batched-tokens {max_num_batched_tokens} \
+    -tp {tensor_parallel} --distributed-executor-backend ray
+