diff --git a/mods/fix-qwen3.6-chat-template/chat_template.jinja b/mods/fix-qwen3.6-chat-template/chat_template.jinja new file mode 100644 index 0000000..7e82350 --- /dev/null +++ b/mods/fix-qwen3.6-chat-template/chat_template.jinja @@ -0,0 +1,223 @@ +{%- set image_count = namespace(value=0) %} +{%- set video_count = namespace(value=0) %} +{%- macro render_content(content, do_vision_count, is_system_content=false) %} + {%- if content is string %} + {{- content }} + {%- elif content is iterable and content is not mapping %} + {%- for item in content %} + {%- if 'image' in item or 'image_url' in item or item.type == 'image' %} + {%- if is_system_content %} + {{- raise_exception('System message cannot contain images.') }} + {%- endif %} + {%- if do_vision_count %} + {%- set image_count.value = image_count.value + 1 %} + {%- endif %} + {%- if add_vision_id is defined and add_vision_id %} + {{- 'Picture ' ~ image_count.value ~ ': ' }} + {%- endif %} + {{- '<|vision_start|><|image_pad|><|vision_end|>' }} + {%- elif 'video' in item or item.type == 'video' %} + {%- if is_system_content %} + {{- raise_exception('System message cannot contain videos.') }} + {%- endif %} + {%- if do_vision_count %} + {%- set video_count.value = video_count.value + 1 %} + {%- endif %} + {%- if add_vision_id is defined and add_vision_id %} + {{- 'Video ' ~ video_count.value ~ ': ' }} + {%- endif %} + {{- '<|vision_start|><|video_pad|><|vision_end|>' }} + {%- elif 'text' in item %} + {{- item.text }} + {%- else %} + {{- raise_exception('Unexpected item type in content.') }} + {%- endif %} + {%- endfor %} + {%- elif content is none or content is undefined %} + {{- '' }} + {%- else %} + {{- raise_exception('Unexpected content type.') }} + {%- endif %} +{%- endmacro %} +{%- set ns_flags = namespace(enable_thinking=true) %} +{%- if enable_thinking is defined %} + {%- set ns_flags.enable_thinking = enable_thinking %} +{%- endif %} +{%- if not messages %} + {{- raise_exception('No messages provided.') 
}} +{%- endif %} +{%- if tools and tools is iterable and tools is not mapping %} + {{- '<|im_start|>system\n' }} + {{- "# Tools\n\nYou have access to the following functions:\n\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n" }} + {{- '\n\nIf you choose to call a function ONLY reply in the following format with NO suffix:\n\n\n\n\nvalue_1\n\n\nThis is the value for the second parameter\nthat can span\nmultiple lines\n\n\n\n\n\nReminder:\n- Function calls MUST follow the specified format: an inner block must be nested within XML tags\n- Required parameters MUST be specified\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls\n' }} + {%- if messages[0].role == 'system' or messages[0].role == 'developer' %} + {%- set content = render_content(messages[0].content, false, true)|trim %} + {%- if '<|think_off|>' in content %} + {%- set ns_flags.enable_thinking = false %} + {%- set content = content.replace('<|think_off|>', '') %} + {%- endif %} + {%- if '<|think_on|>' in content %} + {%- set ns_flags.enable_thinking = true %} + {%- set content = content.replace('<|think_on|>', '') %} + {%- endif %} + {%- set content = content.strip() %} + {%- if content %} + {{- '\n\n' + content }} + {%- endif %} + {%- endif %} + {{- '<|im_end|>\n' }} +{%- else %} + {%- if messages[0].role == 'system' or messages[0].role == 'developer' %} + {%- set content = render_content(messages[0].content, false, true)|trim %} + {%- if '<|think_off|>' in content %} + {%- set ns_flags.enable_thinking = false %} + {%- set content = content.replace('<|think_off|>', '') %} + {%- endif %} + {%- if '<|think_on|>' in content %} + {%- set ns_flags.enable_thinking = true %} + {%- set content = content.replace('<|think_on|>', '') %} + {%- endif %} + 
{%- set content = content.strip() %} + {{- '<|im_start|>system\n' + content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" %} + {%- set content = render_content(message.content, false)|trim %} + {%- if not(content.startswith('') and content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if ns.multi_step_tool %} + {%- set ns.last_query_index = messages|length - 1 %} +{%- endif %} +{%- for message in messages %} + {%- set content = render_content(message.content, true)|trim %} + {%- if '<|think_off|>' in content %} + {%- set ns_flags.enable_thinking = false %} + {%- set content = content.replace('<|think_off|>', '') %} + {%- endif %} + {%- if '<|think_on|>' in content %} + {%- set ns_flags.enable_thinking = true %} + {%- set content = content.replace('<|think_on|>', '') %} + {%- endif %} + {%- set content = content.strip() %} + {%- if message.role == "system" or message.role == "developer" %} + {%- if not loop.first %} + {{- raise_exception('System message must be at the beginning.') }} + {%- endif %} + {%- elif message.role == "user" %} + {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set reasoning_content = '' %} + {#- Auto-close unclosed think before tool_call -#} + {%- if '' in content and '' in content %} + {%- set last_think = content.rfind('') %} + {%- set last_close = content.rfind('') %} + {%- set tool_pos = content.find('') %} + {%- if last_close < last_think or last_close == -1 %} + {%- if tool_pos > last_think %} + {%- set content = content[:tool_pos] + '' + content[tool_pos:] %} + {%- else %} + {%- set content = content + '' %} + {%- endif %} + 
{%- endif %} + {%- endif %} + {%- if message.reasoning_content is string %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- set has_think_tag = false %} + {%- set think_end_token = '' %} + {%- if '' in content %} + {%- set has_think_tag = true %} + {%- elif '' in content %} + {%- set has_think_tag = true %} + {%- set think_end_token = '' %} + {%- elif '' in content %} + {%- set reasoning_content = content.split('')[-1].lstrip('\n') %} + {%- set content = '' %} + {%- endif %} + {%- if has_think_tag %} + {%- set reasoning_content = content.split(think_end_token)[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- set content = content.split(think_end_token)[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- set reasoning_content = reasoning_content|trim %} + {%- set show_think = false %} + {%- if loop.index0 > ns.last_query_index %} + {%- set show_think = true %} + {%- elif ns_flags.enable_thinking and (preserve_thinking is undefined or preserve_thinking is true) and reasoning_content|length > 0 %} + {%- set show_think = true %} + {%- endif %} + {%- if show_think %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content + '\n\n\n' + content }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls and message.tool_calls is iterable and message.tool_calls is not mapping %} + {%- for tool_call in message.tool_calls %} + {%- if tool_call.function is defined %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {%- if loop.first %} + {%- if content|trim %} + {{- '\n\n\n\n' }} + {%- else %} + {{- '\n\n' }} + {%- endif %} + {%- else %} + {{- '\n\n\n' }} + {%- endif %} + {%- if tool_call.arguments is defined and tool_call.arguments is mapping %} + {%- if tool_call.arguments|length > 0 %} + {%- for args_name in tool_call.arguments %} + {%- set args_value = tool_call.arguments[args_name] %} + {{- '\n' }} + {%- set args_value = args_value | string if args_value is 
string else args_value | tojson %} + {{- args_value }} + {{- '\n\n' }} + {%- endfor %} + {%- endif %} + {%- elif tool_call.arguments is defined and tool_call.arguments is string %} + {%- if tool_call.arguments|trim|length > 0 %} + {{- tool_call.arguments }} + {{- '\n' }} + {%- endif %} + {%- endif %} + {{- '\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.previtem and loop.previtem.role != "tool" %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- content }} + {{- '\n' }} + {%- if not loop.last and loop.nextitem.role != "tool" %} + {{- '<|im_end|>\n' }} + {%- elif loop.last %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- else %} + {{- raise_exception('Unexpected message role.') }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if ns_flags.enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- else %} + {{- '\n' }} + {%- endif %} +{%- endif %} diff --git a/mods/fix-qwen3.6-chat-template/run.sh b/mods/fix-qwen3.6-chat-template/run.sh new file mode 100644 index 0000000..016716d --- /dev/null +++ b/mods/fix-qwen3.6-chat-template/run.sh @@ -0,0 +1,4 @@ +#!/bin/bash +set -e +cp chat_template.jinja $WORKSPACE_DIR/fixed_chat_template.jinja +echo "=======> to apply chat template, use --chat-template fixed_chat_template.jinja" \ No newline at end of file diff --git a/recipes/qwen3.6-35b-a3b-fp8-dflash.yaml b/recipes/qwen3.6-35b-a3b-fp8-dflash.yaml new file mode 100644 index 0000000..52d2c21 --- /dev/null +++ b/recipes/qwen3.6-35b-a3b-fp8-dflash.yaml @@ -0,0 +1,51 @@ +# Recipe: Qwen/Qwen3.5-35B-A3B-FP8 +# Qwen/Qwen3.5-35B-A3B model in native FP8 format + + +recipe_version: "1" +name: Qwen36-35B-A3B +description: vLLM serving Qwen3.6-35B-A3B-FP8 + +# HuggingFace model to download (optional, for --download-model) +model: Qwen/Qwen3.6-35B-A3B-FP8 + +#solo_only: true + +# Container image to use +container: vllm-node + +# Mod required to fix 
slowness and crash in the cluster (tracking https://github.com/vllm-project/vllm/issues/33857) +mods: + - mods/fix-qwen3.6-chat-template + +# Default settings (can be overridden via CLI) +defaults: + port: 8000 + host: 0.0.0.0 + tensor_parallel: 2 + gpu_memory_utilization: 0.7 + max_model_len: 262144 + max_num_batched_tokens: 16384 + +# Environment variables +env: + VLLM_MARLIN_USE_ATOMIC_ADD: 1 + +# The vLLM serve command template +command: | + vllm serve Qwen/Qwen3.6-35B-A3B-FP8 \ + --host {host} \ + --port {port} \ + --max-model-len {max_model_len} \ + --max-num-batched-tokens {max_num_batched_tokens} \ + --gpu-memory-utilization {gpu_memory_utilization} \ + --enable-auto-tool-choice \ + --tool-call-parser qwen3_xml \ + --reasoning-parser qwen3 \ + --load-format fastsafetensors \ + --attention-backend flash_attn \ + --enable-prefix-caching \ + --chat-template fixed_chat_template.jinja \ + --speculative-config '{{"method": "dflash", "model": "z-lab/Qwen3.5-35B-A3B-DFlash", "num_speculative_tokens": 15}}' \ + -tp {tensor_parallel} \ + --distributed-executor-backend ray diff --git a/recipes/qwen3.6-35b-a3b-fp8.yaml b/recipes/qwen3.6-35b-a3b-fp8.yaml new file mode 100644 index 0000000..da3d30a --- /dev/null +++ b/recipes/qwen3.6-35b-a3b-fp8.yaml @@ -0,0 +1,50 @@ +# Recipe: Qwen/Qwen3.6-35B-A3B-FP8 +# Qwen/Qwen3.6-35B-A3B model in native FP8 format + + +recipe_version: "1" +name: Qwen36-35B-A3B +description: vLLM serving Qwen3.6-35B-A3B-FP8 + +# HuggingFace model to download (optional, for --download-model) +model: Qwen/Qwen3.6-35B-A3B-FP8 + +#solo_only: true + +# Container image to use +container: vllm-node + +# Mod required to fix slowness and crash in the cluster (tracking https://github.com/vllm-project/vllm/issues/33857) +mods: + - mods/fix-qwen3.6-chat-template + +# Default settings (can be overridden via CLI) +defaults: + port: 8000 + host: 0.0.0.0 + tensor_parallel: 2 + gpu_memory_utilization: 0.7 + max_model_len: 262144 + max_num_batched_tokens: 16384 + +#
 Environment variables +env: + VLLM_MARLIN_USE_ATOMIC_ADD: 1 + +# The vLLM serve command template +command: | + vllm serve Qwen/Qwen3.6-35B-A3B-FP8 \ + --host {host} \ + --port {port} \ + --max-model-len {max_model_len} \ + --max-num-batched-tokens {max_num_batched_tokens} \ + --gpu-memory-utilization {gpu_memory_utilization} \ + --enable-auto-tool-choice \ + --tool-call-parser qwen3_xml \ + --kv-cache-dtype fp8 \ + --load-format fastsafetensors \ + --attention-backend flashinfer \ + --enable-prefix-caching \ + --chat-template fixed_chat_template.jinja \ + -tp {tensor_parallel} \ + --distributed-executor-backend ray