diff --git a/README.md b/README.md
index 0cdc99e..bfa3dd5 100644
--- a/README.md
+++ b/README.md
@@ -135,6 +135,22 @@ For periodic maintenance, I recommend using a filter: `docker builder prune --fi
 
 ## CHANGELOG
 
+### 2026-04-02
+
+A new recipe for Gemma4-26B-A4B with "on-the-fly" (online) FP8 quantization:
+
+Single Spark:
+
+```bash
+./run-recipe.sh gemma4-26b-a4b --solo
+```
+
+Dual Sparks:
+
+```bash
+./run-recipe.sh gemma4-26b-a4b --no-ray
+```
+
 ### 2026-03-31
 
 #### Flags to specify Flashinfer ref and apply PRs
diff --git a/recipes/gemma4-26b-a4b.yaml b/recipes/gemma4-26b-a4b.yaml
new file mode 100644
index 0000000..26e5094
--- /dev/null
+++ b/recipes/gemma4-26b-a4b.yaml
@@ -0,0 +1,49 @@
+# Recipe: Gemma4-26B-A4B
+# Gemma4-26B-A4B model in online FP8 quantization
+
+recipe_version: "1"
+name: Gemma4-26B-A4B
+description: vLLM serving Gemma4-26B-A4B
+
+# HuggingFace model to download (optional, for --download-model)
+model: google/gemma-4-26B-A4B-it
+
+# Both solo and cluster modes are supported
+cluster_only: false
+solo_only: false
+
+# Container image to use
+container: vllm-node-tf5
+
+# No mods required
+mods: []
+
+# Default settings (can be overridden via CLI)
+defaults:
+  port: 8000
+  host: 0.0.0.0
+  tensor_parallel: 2
+  gpu_memory_utilization: 0.7
+  max_model_len: 262144
+  max_num_batched_tokens: 8192
+
+# Environment variables
+env: {}
+
+# The vLLM serve command template
+command: |
+  vllm serve google/gemma-4-26B-A4B-it \
+    --max-model-len {max_model_len} \
+    --gpu-memory-utilization {gpu_memory_utilization} \
+    --port {port} \
+    --host {host} \
+    --load-format fastsafetensors \
+    --enable-prefix-caching \
+    --enable-auto-tool-choice \
+    --tool-call-parser gemma4 \
+    --reasoning-parser gemma4 \
+    --quantization fp8 \
+    --kv-cache-dtype fp8 \
+    --max-num-batched-tokens {max_num_batched_tokens} \
+    -tp {tensor_parallel} --distributed-executor-backend ray
+
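Once either variant is up, a quick smoke test can be run against vLLM's OpenAI-compatible chat endpoint. This is a minimal sketch assuming the recipe's defaults (port 8000, queried from the serving machine); the hostname and prompt are placeholders to adjust for your setup:

```bash
# Minimal smoke test against the OpenAI-compatible API served by vLLM.
# Assumes the recipe's default port (8000) and a local query; the model
# name must match the one in the recipe's serve command.
curl -s http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "google/gemma-4-26B-A4B-it",
        "messages": [{"role": "user", "content": "Reply with one short sentence."}],
        "max_tokens": 64
      }'
```

A JSON response containing a `choices` array confirms the server is accepting requests; a refused connection usually means the model is still loading or the port was overridden.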