A recipe for Gemma4-26B

2026-04-02 23:53:55 -07:00
parent 44808f7018
commit ed32612cdd
2 changed files with 65 additions and 0 deletions
--- a/README.md
+++ b/README.md
@@ -135,6 +135,22 @@ For periodic maintenance, I recommend using a filter: `docker builder prune --fi
 ## CHANGELOG
 ### 2026-04-02
 A new recipe for Gemma4-26B-A4B using on-the-fly (online) FP8 quantization:
 Single Spark:
 ```bash
 ./run-recipe.sh gemma4-26b-a4b --solo
 ```
 Dual Sparks:
 ```bash
 ./run-recipe.sh gemma4-26b-a4b --no-ray
 ```
 ### 2026-03-31
 #### Flags to specify Flashinfer ref and apply PRs
--- a/recipes/gemma4-26b-a4b.yaml
+++ b/recipes/gemma4-26b-a4b.yaml
@@ -0,0 +1,49 @@
 # Recipe: Gemma4-26B-A4B
 # Gemma4-26B-A4B model in online FP8 quantization
 # (the serve command below passes `--quantization fp8` and also requests an
 # FP8 KV cache via `--kv-cache-dtype fp8`).
 recipe_version: "1"
 name: Gemma4-26B-A4B
 description: vLLM serving Gemma4-26B-A4B
 # HuggingFace model to download (optional, for --download-model)
 model: google/gemma-4-26B-A4B-it
 # Mode restrictions: both flags are false, so the recipe is not limited to
 # cluster-only or solo-only operation.
 cluster_only: false
 solo_only: false
 # Container image to use
 container: vllm-node-tf5
 # No mods required
 mods: []
 # Default settings (can be overridden via CLI).
 # Each key name matches a {placeholder} in the command template below.
 defaults:
  port: 8000
  host: 0.0.0.0
  tensor_parallel: 2
  gpu_memory_utilization: 0.7
  max_model_len: 262144
  max_num_batched_tokens: 8192
 # Environment variables (none set by this recipe)
 env: {}
 # The vLLM serve command template.
 # NOTE(review): the last line hard-codes `-tp {tensor_parallel}` together with
 # `--distributed-executor-backend ray`; presumably the launcher rewrites or
 # drops these flags when run with --solo / --no-ray -- confirm against
 # run-recipe.sh before changing them here.
 # Do not add comments inside the block scalar: they would become part of the
 # command string.
 command: |
  vllm serve google/gemma-4-26B-A4B-it  \
    --max-model-len {max_model_len} \
    --gpu-memory-utilization {gpu_memory_utilization} \
    --port {port} \
    --host {host} \
    --load-format fastsafetensors \
    --enable-prefix-caching \
    --enable-auto-tool-choice \
    --tool-call-parser gemma4 \
    --reasoning-parser gemma4 \
    --quantization fp8 \
    --kv-cache-dtype fp8 \
    --max-num-batched-tokens {max_num_batched_tokens} \
    -tp {tensor_parallel} --distributed-executor-backend ray