Adding sample profile and profile loader

2026-01-25 21:22:45 -05:00
parent 133ed9cfb9
commit 751bc5a47a
6 changed files with 390 additions and 8 deletions
--- a/profiles/vllm-openai-gpt-oss-120b.sh
+++ b/profiles/vllm-openai-gpt-oss-120b.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+# PROFILE: OpenAI GPT-OSS 120B
+# DESCRIPTION: vLLM serving openai/gpt-oss-120b with FlashInfer MOE optimization
+
+# Enable FlashInfer MOE with MXFP4/MXFP8 quantization
+export VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8=1
+
+vllm serve openai/gpt-oss-120b \
+    --tool-call-parser openai \
+    --enable-auto-tool-choice \
+    --tensor-parallel-size 2 \
+    --distributed-executor-backend ray \
+    --kv-cache-dtype fp8 \
+    --gpu-memory-utilization 0.70 \
+    --max-model-len 128000 \
+    --max-num-batched-tokens 4096 \
+    --max-num-seqs 8 \
+    --enable-prefix-caching \
+    --host 0.0.0.0 \
+    --port 8000