Compare commits
13 Commits
staging-cu
...
staging-cu
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
d42c4199fa | ||
|
|
9dc09bd04b | ||
|
|
e88426646b | ||
|
|
eb8abcca7f | ||
|
|
d148d95a19 | ||
|
|
5346372f14 | ||
|
|
5f8f988d91 | ||
|
|
3fabd3fb1c | ||
|
|
2d03bc138d | ||
|
|
4303f8b6d0 | ||
|
|
50b3ca60f3 | ||
|
|
163f23d85b | ||
|
|
e8f94d6b8b |
@@ -113,7 +113,9 @@ RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
|
||||
cd flashinfer-cubin && uv build --no-build-isolation --wheel . --out-dir=/workspace/wheels -v && \
|
||||
# flashinfer-jit-cache
|
||||
cd ../flashinfer-jit-cache && \
|
||||
uv build --no-build-isolation --wheel . --out-dir=/workspace/wheels -v
|
||||
uv build --no-build-isolation --wheel . --out-dir=/workspace/wheels -v && \
|
||||
# dump git ref in the wheels dir
|
||||
cd .. && git rev-parse HEAD > /workspace/wheels/.flashinfer-commit
|
||||
|
||||
# =========================================================
|
||||
# STAGE 3: FlashInfer Wheel Export
|
||||
@@ -196,7 +198,9 @@ RUN curl -L https://patch-diff.githubusercontent.com/raw/vllm-project/vllm/pull/
|
||||
# Final Compilation
|
||||
RUN --mount=type=cache,id=ccache,target=/root/.ccache \
|
||||
--mount=type=cache,id=uv-cache,target=/root/.cache/uv \
|
||||
uv build --no-build-isolation --wheel . --out-dir=/workspace/wheels -v
|
||||
uv build --no-build-isolation --wheel . --out-dir=/workspace/wheels -v && \
|
||||
# dump git ref in the wheels dir
|
||||
git rev-parse HEAD > /workspace/wheels/.vllm-commit
|
||||
|
||||
# =========================================================
|
||||
# STAGE 5: vLLM Wheel Export
|
||||
|
||||
@@ -66,7 +66,12 @@ copy_to_host() {
|
||||
|
||||
# try_download_wheels TAG PREFIX
|
||||
# Downloads wheels matching PREFIX*.whl from a GitHub release.
|
||||
# Skips files that are already present and up to date (by remote updated_at vs local mtime).
|
||||
# Skip conditions (either is sufficient):
|
||||
# 1. Commit hash in release name matches .wheels/.{PREFIX}-commit (primary check).
|
||||
# 2. All local wheels are newer than the latest GitHub asset (freshly built).
|
||||
# Only downloads a file when the remote asset is newer than the local copy AND
|
||||
# the above skip conditions are not met.
|
||||
# On success, persists the release commit hash to .wheels/.{PREFIX}-commit.
|
||||
# Returns 0 if all matching wheels are now available, 1 on any error.
|
||||
try_download_wheels() {
|
||||
local TAG="$1"
|
||||
@@ -92,7 +97,7 @@ try_download_wheels() {
|
||||
|
||||
local DOWNLOAD_LIST
|
||||
DOWNLOAD_LIST=$(echo "$RELEASE_JSON" | python3 -c '
|
||||
import json, sys, os
|
||||
import json, sys, os, re
|
||||
from datetime import datetime, timezone
|
||||
|
||||
wheels_dir, prefix = sys.argv[1], sys.argv[2]
|
||||
@@ -104,6 +109,31 @@ if not assets:
|
||||
print("No assets found matching prefix: " + prefix, file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
# Extract commit hash from the release name:
|
||||
# FlashInfer: "Prebuilt FlashInfer Wheels (0.6.5-124a2d32-d20260305) - DGX Spark Only"
|
||||
# vLLM: "Prebuilt vLLM Wheels (0.16.1rc1.dev296+ga73af584f.d20260305.cu131) - DGX Spark only"
|
||||
release_name = data.get("name", "")
|
||||
commit_hash = None
|
||||
if prefix.startswith("flashinfer"):
|
||||
m = re.search(r"\([\d.]+\w*-([0-9a-f]{6,})-d\d{8}\)", release_name, re.IGNORECASE)
|
||||
if m:
|
||||
commit_hash = m.group(1)
|
||||
else:
|
||||
m = re.search(r"\+g([0-9a-f]{6,})\.", release_name, re.IGNORECASE)
|
||||
if m:
|
||||
commit_hash = m.group(1)
|
||||
|
||||
# Compare against the locally stored commit hash
|
||||
commit_file = os.path.join(wheels_dir, "." + prefix + "-commit")
|
||||
local_commit = None
|
||||
if os.path.exists(commit_file):
|
||||
with open(commit_file) as f:
|
||||
local_commit = f.read().strip()
|
||||
|
||||
if commit_hash and local_commit and local_commit[:len(commit_hash)] == commit_hash:
|
||||
print("Commit hash matches (" + commit_hash + ") — wheels are up to date.", file=sys.stderr)
|
||||
sys.exit(0)
|
||||
|
||||
newest_remote_ts = max(
|
||||
datetime.strptime(a["updated_at"], "%Y-%m-%dT%H:%M:%SZ")
|
||||
.replace(tzinfo=timezone.utc).timestamp()
|
||||
@@ -119,12 +149,19 @@ local_wheels = [
|
||||
if local_wheels and all(os.path.getmtime(p) >= newest_remote_ts for p in local_wheels):
|
||||
sys.exit(0)
|
||||
|
||||
downloads = []
|
||||
for a in assets:
|
||||
local_path = os.path.join(wheels_dir, a["name"])
|
||||
remote_ts = datetime.strptime(a["updated_at"], "%Y-%m-%dT%H:%M:%SZ") \
|
||||
.replace(tzinfo=timezone.utc).timestamp()
|
||||
if not os.path.exists(local_path) or remote_ts > os.path.getmtime(local_path):
|
||||
print(a["browser_download_url"] + " " + a["name"])
|
||||
downloads.append(a["browser_download_url"] + " " + a["name"])
|
||||
|
||||
if downloads:
|
||||
if commit_hash:
|
||||
print("#commit:" + commit_hash)
|
||||
for d in downloads:
|
||||
print(d)
|
||||
' "$WHEELS_DIR" "$PREFIX") || return 1
|
||||
|
||||
if [ -z "$DOWNLOAD_LIST" ]; then
|
||||
@@ -132,12 +169,31 @@ for a in assets:
|
||||
return 0
|
||||
fi
|
||||
|
||||
# Parse the optional '#commit:HASH' sentinel emitted by the Python script
|
||||
local REMOTE_COMMIT=""
|
||||
local DOWNLOAD_ENTRIES=""
|
||||
while IFS= read -r LINE; do
|
||||
if [[ "$LINE" == "#commit:"* ]]; then
|
||||
REMOTE_COMMIT="${LINE#"#commit:"}"
|
||||
elif [[ -n "$LINE" ]]; then
|
||||
DOWNLOAD_ENTRIES+="$LINE"$'\n'
|
||||
fi
|
||||
done <<< "$DOWNLOAD_LIST"
|
||||
|
||||
if [ -z "$DOWNLOAD_ENTRIES" ]; then
|
||||
echo "All $PREFIX wheels are up to date — skipping download."
|
||||
return 0
|
||||
fi
|
||||
|
||||
# Back up existing wheels so we never leave a mix of old and new on failure
|
||||
local DL_BACKUP="$WHEELS_DIR/.backup-download-${PREFIX}"
|
||||
rm -rf "$DL_BACKUP" && mkdir -p "$DL_BACKUP"
|
||||
for f in "$WHEELS_DIR/${PREFIX}"*.whl; do
|
||||
[ -f "$f" ] && mv "$f" "$DL_BACKUP/"
|
||||
done
|
||||
for f in "$WHEELS_DIR/.${PREFIX}"*; do
|
||||
[ -f "$f" ] && mv "$f" "$DL_BACKUP/"
|
||||
done
|
||||
|
||||
local URL NAME TMP_WHL
|
||||
local DOWNLOADED=()
|
||||
@@ -154,13 +210,18 @@ for a in assets:
|
||||
if compgen -G "$DL_BACKUP/${PREFIX}*.whl" > /dev/null 2>&1; then
|
||||
echo "Restoring previous $PREFIX wheels..."
|
||||
mv "$DL_BACKUP/${PREFIX}"*.whl "$WHEELS_DIR/"
|
||||
mv "$DL_BACKUP/.${PREFIX}"* "$WHEELS_DIR/"
|
||||
fi
|
||||
rm -rf "$DL_BACKUP"
|
||||
return 1
|
||||
fi
|
||||
done <<< "$DOWNLOAD_LIST"
|
||||
done <<< "$DOWNLOAD_ENTRIES"
|
||||
|
||||
rm -rf "$DL_BACKUP"
|
||||
if [ -n "$REMOTE_COMMIT" ]; then
|
||||
echo "$REMOTE_COMMIT" > "$WHEELS_DIR/.${PREFIX}-commit"
|
||||
echo "Recorded $PREFIX commit hash: $REMOTE_COMMIT"
|
||||
fi
|
||||
return 0
|
||||
}
|
||||
|
||||
|
||||
@@ -58,8 +58,8 @@ network:
|
||||
ethernets:
|
||||
enp1s0f1np1:
|
||||
dhcp4: no
|
||||
dhcp6: no # Explicitly disable DHCPv6
|
||||
link-local: [ ipv4 ] # Restrict link-local addresses to IPv4 only
|
||||
dhcp6: no # Explicitly disable DHCPv6
|
||||
link-local: [] # Disable all link-local addressing; only the static IPv4 address below is used
|
||||
mtu: 9000
|
||||
addresses: [192.168.177.11/24]
|
||||
enP2p1s0f1np1:
|
||||
@@ -76,8 +76,8 @@ network:
|
||||
ethernets:
|
||||
enp1s0f1np1:
|
||||
dhcp4: no
|
||||
dhcp6: no # Explicitly disable DHCPv6
|
||||
link-local: [ ipv4 ] # Restrict link-local addresses to IPv4 only
|
||||
dhcp6: no # Explicitly disable DHCPv6
|
||||
link-local: [] # Disable all link-local addressing; only the static IPv4 address below is used
|
||||
mtu: 9000
|
||||
addresses: [192.168.177.12/24]
|
||||
enP2p1s0f1np1:
|
||||
@@ -239,4 +239,4 @@ mpirun -np 2 -H 192.168.177.11:1,192.168.177.12:1 \
|
||||
-x LD_LIBRARY_PATH=$LD_LIBRARY_PATH \
|
||||
$HOME/nccl-tests/build/all_gather_perf -b 16G -e 16G -f 2
|
||||
|
||||
```
|
||||
```
|
||||
|
||||
@@ -115,7 +115,7 @@ while [[ "$#" -gt 0 ]]; do
|
||||
fi
|
||||
ACTION="exec"
|
||||
shift
|
||||
COMMAND_TO_RUN="$@"
|
||||
COMMAND_TO_RUN=$(printf "%q " "$@")
|
||||
break
|
||||
;;
|
||||
*)
|
||||
|
||||
155
mods/fix-qwen3.5-chat-template/chat_template.jinja
Normal file
155
mods/fix-qwen3.5-chat-template/chat_template.jinja
Normal file
@@ -0,0 +1,155 @@
|
||||
{%- set image_count = namespace(value=0) %}
|
||||
{%- set video_count = namespace(value=0) %}
|
||||
{%- macro render_content(content, do_vision_count, is_system_content=false) %}
|
||||
{%- if content is string %}
|
||||
{{- content }}
|
||||
{%- elif content is iterable and content is not mapping %}
|
||||
{%- for item in content %}
|
||||
{%- if 'image' in item or 'image_url' in item or item.type == 'image' %}
|
||||
{%- if is_system_content %}
|
||||
{{- raise_exception('System message cannot contain images.') }}
|
||||
{%- endif %}
|
||||
{%- if do_vision_count %}
|
||||
{%- set image_count.value = image_count.value + 1 %}
|
||||
{%- endif %}
|
||||
{%- if add_vision_id %}
|
||||
{{- 'Picture ' ~ image_count.value ~ ': ' }}
|
||||
{%- endif %}
|
||||
{{- '<|vision_start|><|image_pad|><|vision_end|>' }}
|
||||
{%- elif 'video' in item or item.type == 'video' %}
|
||||
{%- if is_system_content %}
|
||||
{{- raise_exception('System message cannot contain videos.') }}
|
||||
{%- endif %}
|
||||
{%- if do_vision_count %}
|
||||
{%- set video_count.value = video_count.value + 1 %}
|
||||
{%- endif %}
|
||||
{%- if add_vision_id %}
|
||||
{{- 'Video ' ~ video_count.value ~ ': ' }}
|
||||
{%- endif %}
|
||||
{{- '<|vision_start|><|video_pad|><|vision_end|>' }}
|
||||
{%- elif 'text' in item %}
|
||||
{{- item.text }}
|
||||
{%- else %}
|
||||
{{- raise_exception('Unexpected item type in content.') }}
|
||||
{%- endif %}
|
||||
{%- endfor %}
|
||||
{%- elif content is none or content is undefined %}
|
||||
{{- '' }}
|
||||
{%- else %}
|
||||
{{- raise_exception('Unexpected content type.') }}
|
||||
{%- endif %}
|
||||
{%- endmacro %}
|
||||
{%- if not messages %}
|
||||
{{- raise_exception('No messages provided.') }}
|
||||
{%- endif %}
|
||||
{%- if tools and tools is iterable and tools is not mapping %}
|
||||
{{- '<|im_start|>system\n' }}
|
||||
{{- "# Tools\n\nYou have access to the following functions:\n\n<tools>" }}
|
||||
{%- for tool in tools %}
|
||||
{{- "\n" }}
|
||||
{{- tool | tojson }}
|
||||
{%- endfor %}
|
||||
{{- "\n</tools>" }}
|
||||
{{- '\n\nIf you choose to call a function ONLY reply in the following format with NO suffix:\n\n<tool_call>\n<function=example_function_name>\n<parameter=example_parameter_1>\nvalue_1\n</parameter>\n<parameter=example_parameter_2>\nThis is the value for the second parameter\nthat can span\nmultiple lines\n</parameter>\n</function>\n</tool_call>\n\n<IMPORTANT>\nReminder:\n- Function calls MUST follow the specified format: an inner <function=...></function> block must be nested within <tool_call></tool_call> XML tags\n- Required parameters MUST be specified\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls\n</IMPORTANT>' }}
|
||||
{%- if messages[0].role == 'system' %}
|
||||
{%- set content = render_content(messages[0].content, false, true)|trim %}
|
||||
{%- if content %}
|
||||
{{- '\n\n' + content }}
|
||||
{%- endif %}
|
||||
{%- endif %}
|
||||
{{- '<|im_end|>\n' }}
|
||||
{%- else %}
|
||||
{%- if messages[0].role == 'system' %}
|
||||
{%- set content = render_content(messages[0].content, false, true)|trim %}
|
||||
{{- '<|im_start|>system\n' + content + '<|im_end|>\n' }}
|
||||
{%- endif %}
|
||||
{%- endif %}
|
||||
{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
|
||||
{%- for message in messages[::-1] %}
|
||||
{%- set index = (messages|length - 1) - loop.index0 %}
|
||||
{%- if ns.multi_step_tool and message.role == "user" %}
|
||||
{%- set content = render_content(message.content, false)|trim %}
|
||||
{%- if not(content.startswith('<tool_response>') and content.endswith('</tool_response>')) %}
|
||||
{%- set ns.multi_step_tool = false %}
|
||||
{%- set ns.last_query_index = index %}
|
||||
{%- endif %}
|
||||
{%- endif %}
|
||||
{%- endfor %}
|
||||
{%- if ns.multi_step_tool %}
|
||||
{{- raise_exception('No user query found in messages.') }}
|
||||
{%- endif %}
|
||||
{%- for message in messages %}
|
||||
{%- set content = render_content(message.content, true)|trim %}
|
||||
{%- if message.role == "system" %}
|
||||
{%- if not loop.first %}
|
||||
{{- raise_exception('System message must be at the beginning.') }}
|
||||
{%- endif %}
|
||||
{%- elif message.role == "user" %}
|
||||
{{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
|
||||
{%- elif message.role == "assistant" %}
|
||||
{%- set reasoning_content = '' %}
|
||||
{%- if message.reasoning_content is string %}
|
||||
{%- set reasoning_content = message.reasoning_content %}
|
||||
{%- else %}
|
||||
{%- if '</think>' in content %}
|
||||
{%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
|
||||
{%- set content = content.split('</think>')[-1].lstrip('\n') %}
|
||||
{%- endif %}
|
||||
{%- endif %}
|
||||
{%- set reasoning_content = reasoning_content|trim %}
|
||||
{%- if loop.index0 > ns.last_query_index %}
|
||||
{{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content + '\n</think>\n\n' + content }}
|
||||
{%- else %}
|
||||
{{- '<|im_start|>' + message.role + '\n' + content }}
|
||||
{%- endif %}
|
||||
{%- if message.tool_calls and message.tool_calls is iterable and message.tool_calls is not mapping %}
|
||||
{%- for tool_call in message.tool_calls %}
|
||||
{%- if tool_call.function is defined %}
|
||||
{%- set tool_call = tool_call.function %}
|
||||
{%- endif %}
|
||||
{%- if loop.first %}
|
||||
{%- if content|trim %}
|
||||
{{- '\n\n<tool_call>\n<function=' + tool_call.name + '>\n' }}
|
||||
{%- else %}
|
||||
{{- '<tool_call>\n<function=' + tool_call.name + '>\n' }}
|
||||
{%- endif %}
|
||||
{%- else %}
|
||||
{{- '\n<tool_call>\n<function=' + tool_call.name + '>\n' }}
|
||||
{%- endif %}
|
||||
{%- if tool_call.arguments is mapping %}
|
||||
{%- for args_name in tool_call.arguments %}
|
||||
{%- set args_value = tool_call.arguments[args_name] %}
|
||||
{{- '<parameter=' + args_name + '>\n' }}
|
||||
{%- set args_value = args_value | tojson | safe if args_value is mapping or (args_value is sequence and args_value is not string) else args_value | string %}
|
||||
{{- args_value }}
|
||||
{{- '\n</parameter>\n' }}
|
||||
{%- endfor %}
|
||||
{%- endif %}
|
||||
{{- '</function>\n</tool_call>' }}
|
||||
{%- endfor %}
|
||||
{%- endif %}
|
||||
{{- '<|im_end|>\n' }}
|
||||
{%- elif message.role == "tool" %}
|
||||
{%- if loop.previtem and loop.previtem.role != "tool" %}
|
||||
{{- '<|im_start|>user' }}
|
||||
{%- endif %}
|
||||
{{- '\n<tool_response>\n' }}
|
||||
{{- content }}
|
||||
{{- '\n</tool_response>' }}
|
||||
{%- if not loop.last and loop.nextitem.role != "tool" %}
|
||||
{{- '<|im_end|>\n' }}
|
||||
{%- elif loop.last %}
|
||||
{{- '<|im_end|>\n' }}
|
||||
{%- endif %}
|
||||
{%- else %}
|
||||
{{- raise_exception('Unexpected message role.') }}
|
||||
{%- endif %}
|
||||
{%- endfor %}
|
||||
{%- if add_generation_prompt %}
|
||||
{{- '<|im_start|>assistant\n' }}
|
||||
{%- if enable_thinking is defined and enable_thinking is false %}
|
||||
{{- '<think>\n\n</think>\n\n' }}
|
||||
{%- else %}
|
||||
{{- '<think>\n' }}
|
||||
{%- endif %}
|
||||
{%- endif %}
|
||||
4
mods/fix-qwen3.5-chat-template/run.sh
Normal file
4
mods/fix-qwen3.5-chat-template/run.sh
Normal file
@@ -0,0 +1,4 @@
|
||||
#!/bin/bash
|
||||
set -e
|
||||
cp chat_template.jinja $WORKSPACE_DIR/unsloth.jinja
|
||||
echo "=======> to apply chat template, use --chat-template unsloth.jinja"
|
||||
45
recipes/4x-spark-cluster/minimax-m2.5.yaml
Normal file
45
recipes/4x-spark-cluster/minimax-m2.5.yaml
Normal file
@@ -0,0 +1,45 @@
|
||||
# Recipe: MiniMax-M2.5
|
||||
# MiniMaxAI/MiniMax-M2.5
|
||||
|
||||
recipe_version: "1"
|
||||
name: MiniMax-M2.5
|
||||
description: vLLM serving MiniMax-M2.5 with Ray distributed backend
|
||||
|
||||
# HuggingFace model to download (optional, for --download-model)
|
||||
model: MiniMaxAI/MiniMax-M2.5
|
||||
|
||||
# Container image to use
|
||||
container: vllm-node
|
||||
|
||||
# Can only be run in a cluster
|
||||
cluster_only: true
|
||||
|
||||
# No mods required
|
||||
mods: []
|
||||
|
||||
# Default settings (can be overridden via CLI)
|
||||
defaults:
|
||||
port: 8000
|
||||
host: 0.0.0.0
|
||||
tensor_parallel: 4
|
||||
gpu_memory_utilization: 0.90
|
||||
max_model_len: 128000
|
||||
|
||||
# Environment variables
|
||||
env:
|
||||
VLLM_DISTRIBUTED_EXECUTOR_CONFIG: '{"placement_group_options":{"strategy":"SPREAD"}}'
|
||||
|
||||
# The vLLM serve command template
|
||||
command: |
|
||||
vllm serve MiniMaxAI/MiniMax-M2.5 \
|
||||
--trust-remote-code \
|
||||
--port {port} \
|
||||
--host {host} \
|
||||
--gpu-memory-utilization {gpu_memory_utilization} \
|
||||
-tp {tensor_parallel} \
|
||||
--distributed-executor-backend ray \
|
||||
--max-model-len {max_model_len} \
|
||||
--load-format fastsafetensors \
|
||||
--enable-auto-tool-choice \
|
||||
--tool-call-parser minimax_m2 \
|
||||
--reasoning-parser minimax_m2_append_think
|
||||
63
recipes/4x-spark-cluster/qwen3.5-397b-a17B-fp8.yaml
Normal file
63
recipes/4x-spark-cluster/qwen3.5-397b-a17B-fp8.yaml
Normal file
@@ -0,0 +1,63 @@
|
||||
# Recipe: Qwen3.5-397B-A17B-FP8
|
||||
# Qwen3.5-397B-A17B model in FP8 precision
|
||||
# Multi-modal input
|
||||
|
||||
recipe_version: "1"
|
||||
name: Qwen3.5-397B-A17B-FP8
|
||||
description: vLLM serving Qwen3.5-397B-A17B-FP8
|
||||
|
||||
# HuggingFace model to download (optional, for --download-model)
|
||||
model: Qwen/Qwen3.5-397B-A17B-FP8
|
||||
|
||||
#solo_only: true
|
||||
|
||||
# Container image to use
|
||||
container: vllm-node-tf5
|
||||
|
||||
build_args:
|
||||
- --tf5
|
||||
- --rebuild-flashinfer
|
||||
- --rebuild-vllm
|
||||
|
||||
# Mod required to fix ROPE syntax error
|
||||
mods:
|
||||
- mods/fix-qwen3.5-autoround
|
||||
|
||||
# Default settings (can be overridden via CLI)
|
||||
defaults:
|
||||
port: 8000
|
||||
host: 0.0.0.0
|
||||
tensor_parallel: 4
|
||||
gpu_memory_utilization: 0.85
|
||||
max_model_len: 262144
|
||||
max_num_batched_tokens: 8192
|
||||
|
||||
# Environment variables
|
||||
env:
|
||||
VLLM_USE_DEEP_GEMM: 0
|
||||
VLLM_USE_FLASHINFER_MOE_FP16: 1
|
||||
VLLM_USE_FLASHINFER_SAMPLER: 0
|
||||
OMP_NUM_THREADS: 4
|
||||
|
||||
# The vLLM serve command template
|
||||
command: |
|
||||
vllm serve Qwen/Qwen3.5-397B-A17B-FP8 \
|
||||
--max-model-len {max_model_len} \
|
||||
--gpu-memory-utilization {gpu_memory_utilization} \
|
||||
--port {port} \
|
||||
--host {host} \
|
||||
--load-format fastsafetensors \
|
||||
--enable-prefix-caching \
|
||||
--enable-auto-tool-choice \
|
||||
--tool-call-parser qwen3_coder \
|
||||
--reasoning-parser qwen3 \
|
||||
--max-num-batched-tokens {max_num_batched_tokens} \
|
||||
--trust-remote-code \
|
||||
-tp {tensor_parallel} \
|
||||
--distributed-executor-backend ray \
|
||||
--mm-encoder-tp-mode data \
|
||||
--kv-cache-dtype fp8 \
|
||||
--compilation-config.cudagraph_mode none \
|
||||
--max-num-seqs 32 \
|
||||
--attention-backend flashinfer
|
||||
|
||||
@@ -15,7 +15,8 @@ cluster_only: true
|
||||
container: vllm-node
|
||||
|
||||
# No mods required
|
||||
mods: []
|
||||
mods:
|
||||
- mods/fix-qwen3.5-chat-template
|
||||
|
||||
# Default settings (can be overridden via CLI)
|
||||
defaults:
|
||||
@@ -41,5 +42,6 @@ command: |
|
||||
--enable-auto-tool-choice \
|
||||
--tool-call-parser qwen3_coder \
|
||||
--reasoning-parser qwen3 \
|
||||
--chat-template unsloth.jinja \
|
||||
-tp {tensor_parallel} --distributed-executor-backend ray \
|
||||
--max-num-batched-tokens {max_num_batched_tokens}
|
||||
|
||||
@@ -19,6 +19,7 @@ build_args:
|
||||
# Mod required to fix ROPE syntax error
|
||||
mods:
|
||||
- mods/fix-qwen3.5-autoround
|
||||
- mods/fix-qwen3.5-chat-template
|
||||
|
||||
# Default settings (can be overridden via CLI)
|
||||
defaults:
|
||||
@@ -47,6 +48,7 @@ command: |
|
||||
--reasoning-parser qwen3 \
|
||||
--max-num-batched-tokens {max_num_batched_tokens} \
|
||||
--trust-remote-code \
|
||||
--chat-template unsloth.jinja \
|
||||
-tp {tensor_parallel} \
|
||||
--distributed-executor-backend ray
|
||||
|
||||
|
||||
51
recipes/qwen3.5-35b-a3b-fp8.yaml
Normal file
51
recipes/qwen3.5-35b-a3b-fp8.yaml
Normal file
@@ -0,0 +1,51 @@
|
||||
# Recipe: Qwen/Qwen3.5-35B-A3B-FP8
|
||||
# Qwen/Qwen3.5-35B-A3B model in native FP8 format
|
||||
|
||||
|
||||
recipe_version: "1"
|
||||
name: Qwen35-35B-A3B
|
||||
description: vLLM serving Qwen3.5-35B-A3B-FP8
|
||||
|
||||
# HuggingFace model to download (optional, for --download-model)
|
||||
model: Qwen/Qwen3.5-35B-A3B-FP8
|
||||
|
||||
#solo_only: true
|
||||
|
||||
# Container image to use
|
||||
container: vllm-node
|
||||
|
||||
# Mod required to fix slowness and crash in the cluster (tracking https://github.com/vllm-project/vllm/issues/33857)
|
||||
mods:
|
||||
- mods/fix-qwen3-coder-next
|
||||
- mods/fix-qwen3.5-chat-template
|
||||
|
||||
# Default settings (can be overridden via CLI)
|
||||
defaults:
|
||||
port: 8000
|
||||
host: 0.0.0.0
|
||||
tensor_parallel: 2
|
||||
gpu_memory_utilization: 0.7
|
||||
max_model_len: 262144
|
||||
max_num_batched_tokens: 16384
|
||||
|
||||
# Environment variables
|
||||
env:
|
||||
VLLM_MARLIN_USE_ATOMIC_ADD: 1
|
||||
|
||||
# The vLLM serve command template
|
||||
command: |
|
||||
vllm serve Qwen/Qwen3.5-35B-A3B-FP8 \
|
||||
--host {host} \
|
||||
--port {port} \
|
||||
--max-model-len {max_model_len} \
|
||||
--max-num-batched-tokens {max_num_batched_tokens} \
|
||||
--gpu-memory-utilization {gpu_memory_utilization} \
|
||||
--enable-auto-tool-choice \
|
||||
--tool-call-parser qwen3_coder \
|
||||
--kv-cache-dtype fp8 \
|
||||
--load-format fastsafetensors \
|
||||
--attention-backend flashinfer \
|
||||
--enable-prefix-caching \
|
||||
--chat-template unsloth.jinja \
|
||||
-tp {tensor_parallel} \
|
||||
--distributed-executor-backend ray
|
||||
@@ -86,6 +86,7 @@ RELATED FILES:
|
||||
import argparse
|
||||
import os
|
||||
import subprocess
|
||||
import shlex
|
||||
import sys
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
@@ -473,7 +474,7 @@ def generate_launch_script(recipe: dict[str, Any], overrides: dict[str, Any], is
|
||||
# Append extra args if provided (after --)
|
||||
if extra_args:
|
||||
# Join extra args and append to command
|
||||
extra_args_str = ' '.join(extra_args)
|
||||
extra_args_str = ' '.join(shlex.quote(a) for a in extra_args)
|
||||
command = command.rstrip()
|
||||
# Handle multi-line commands with backslash continuations
|
||||
if command.endswith('\\'):
|
||||
|
||||
Reference in New Issue
Block a user