diff --git a/README.md b/README.md
index b2de752..bfa5875 100644
--- a/README.md
+++ b/README.md
@@ -146,6 +146,39 @@
 Don't do it every time you rebuild, because it will slow down compilation times.
 For periodic maintenance, I recommend using a filter: `docker builder prune --filter until=72h`
 
+### 2026-03-02
+
+#### Qwen3.5-122B-INT4-Autoround Support
+
+Added support for the Intel/Qwen3.5-122B-A10B-int4-AutoRound model with a new mod, `mods/fix-qwen3.5-autoround`, which fixes a RoPE syntax error.
+
+The recipe is available at `recipes/qwen3.5-122b-int4-autoround.yaml`.
+
+### 2026-02-26
+
+#### Daemon Mode Improvements
+
+- You can now use daemon mode (both solo and in the cluster) when an exec action is specified.
+- The exec command's output is now piped to `docker logs` when running in daemon mode.
+
+### 2026-02-25
+
+#### HF_HOME Support
+
+Added support for using the `$HF_HOME` environment variable as the Hugging Face cache directory.
+
+#### Intel/Qwen3-Coder-Next-INT4-Autoround Mod
+
+Added a new mod for Intel/Qwen3-Coder-Next-INT4-Autoround model support: `mods/fix-qwen3-next-autoround`.
+
+
+### 2026-02-21
+
+#### Minimax Reasoning Parser Update
+
+Changed the reasoning parser in Minimax for better compatibility with modern clients (such as coding tools).
+
+
 ### 2026-02-18
 
 #### Completely Redesigned Build Process
diff --git a/recipes/qwen3.5-122b-int4-autoround.yaml b/recipes/qwen3.5-122b-int4-autoround.yaml
new file mode 100644
index 0000000..7cc10fb
--- /dev/null
+++ b/recipes/qwen3.5-122b-int4-autoround.yaml
@@ -0,0 +1,52 @@
+# Recipe: Qwen3.5-122B-A10B-INT4-Autoround
+# Qwen3.5-122B model in Intel INT4-AutoRound quantization
+
+recipe_version: "1"
+name: Qwen3.5-122B-INT4-Autoround
+description: vLLM serving Qwen3.5-122B-INT4-Autoround
+
+# HuggingFace model to download (optional, for --download-model)
+model: Intel/Qwen3.5-122B-A10B-int4-AutoRound
+
+#solo_only: true
+
+# Container image to use
+container: vllm-node-tf5
+
+build_args:
+  - --tf5
+
+# Mod required to fix the RoPE syntax error
+mods:
+  - mods/fix-qwen3.5-autoround
+
+# Default settings (can be overridden via CLI)
+defaults:
+  port: 8000
+  host: 0.0.0.0
+  tensor_parallel: 2
+  gpu_memory_utilization: 0.7
+  max_model_len: 262144
+  max_num_batched_tokens: 8192
+
+# Environment variables
+env:
+  VLLM_MARLIN_USE_ATOMIC_ADD: 1
+
+# The vLLM serve command template
+command: |
+  vllm serve Intel/Qwen3.5-122B-A10B-int4-AutoRound \
+    --max-model-len {max_model_len} \
+    --gpu-memory-utilization {gpu_memory_utilization} \
+    --port {port} \
+    --host {host} \
+    --load-format fastsafetensors \
+    --enable-prefix-caching \
+    --enable-auto-tool-choice \
+    --tool-call-parser qwen3_coder \
+    --reasoning-parser qwen3 \
+    --max-num-batched-tokens {max_num_batched_tokens} \
+    --trust-remote-code \
+    -tp {tensor_parallel} \
+    --distributed-executor-backend ray
+
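
For reference, here is the recipe's `command` template rendered with the values from its `defaults` block — a sketch of the effective invocation, assuming plain `{placeholder}` substitution:

```bash
# The recipe's command template with the recipe defaults substituted in:
vllm serve Intel/Qwen3.5-122B-A10B-int4-AutoRound \
    --max-model-len 262144 \
    --gpu-memory-utilization 0.7 \
    --port 8000 \
    --host 0.0.0.0 \
    --load-format fastsafetensors \
    --enable-prefix-caching \
    --enable-auto-tool-choice \
    --tool-call-parser qwen3_coder \
    --reasoning-parser qwen3 \
    --max-num-batched-tokens 8192 \
    --trust-remote-code \
    -tp 2 \
    --distributed-executor-backend ray
```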
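
A minimal sketch of how the new `$HF_HOME` support can be used, assuming the standard Hugging Face cache layout (`huggingface_hub` keeps downloads under `$HF_HOME/hub`); the path below is only an example:

```bash
# Point the Hugging Face cache at a larger disk before launching.
# huggingface_hub places model downloads under $HF_HOME/hub;
# /mnt/storage/hf-cache is a placeholder path.
export HF_HOME=/mnt/storage/hf-cache
```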
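
Because the exec command's output is now piped into the container logs in daemon mode, it can be followed with the stock Docker CLI; the container name below is hypothetical:

```bash
# Follow the exec command's piped output while the node runs detached.
# "vllm-node" is a placeholder for the actual container name (see `docker ps`).
docker logs -f vllm-node
```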