From e8a12da072647e52413d09c3762f4f14440ea938 Mon Sep 17 00:00:00 2001
From: eugr
Date: Sun, 14 Dec 2025 00:30:50 -0800
Subject: [PATCH] Build triton from source; add TRITON_SHA argument to specify
 triton release, and add timing statistics

Dockerfile:
- Drop the pip install of Triton from git main.
- After vLLM is installed, clone triton-lang/triton, check out the commit
  given by the TRITON_SHA build arg (default is the commit the Dockerfile
  comment identifies as v3.5.1), build it with --no-build-isolation and
  install python/triton_kernels with --no-deps, so this build overrides
  whatever Triton version the dependencies pulled in.
- The Triton build step uses the same pip-cache/ccache cache mounts as the
  vLLM build step.
- Remove the triton and fastsafetensors entries from requirements/test.txt.

build-and-copy.sh:
- --triton-sha passes a commit to the build; when it is omitted the script
  resolves refs/tags/v3.5.1 with git ls-remote and aborts if nothing is
  returned, instead of passing an empty TRITON_SHA to docker build.
- --no-build skips the build and only copies the image (requires
  --copy-to-host).
- Print build, copy and total wall-clock time at the end.
---
 Dockerfile        | 27 ++++++++++++---
 build-and-copy.sh | 87 ++++++++++++++++++++++++++++++++++---------
 2 files changed, 93 insertions(+), 21 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 7a37c24..6221a4c 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -56,11 +56,6 @@ RUN --mount=type=cache,id=pip-cache,target=/root/.cache/pip \
 RUN --mount=type=cache,id=pip-cache,target=/root/.cache/pip \
     pip install xgrammar fastsafetensors
 
-# Install latest Triton from main
-RUN --mount=type=cache,id=pip-cache,target=/root/.cache/pip \
-    pip install git+https://github.com/triton-lang/triton.git \
-                git+https://github.com/triton-lang/triton.git#subdirectory=python/triton_kernels
-
 # Install FlashInfer packages
 RUN --mount=type=cache,id=pip-cache,target=/root/.cache/pip \
     pip install flashinfer-python --no-deps --index-url https://flashinfer.ai/whl && \
@@ -99,6 +94,8 @@ WORKDIR $VLLM_BASE_DIR/vllm
 RUN --mount=type=cache,id=pip-cache,target=/root/.cache/pip \
     python3 use_existing_torch.py && \
     sed -i "/flashinfer/d" requirements/cuda.txt && \
+    sed -i '/^triton\b/d' requirements/test.txt && \
+    sed -i '/^fastsafetensors\b/d' requirements/test.txt && \
     pip install -r requirements/build.txt
 
 # Apply Patches
@@ -113,6 +110,26 @@ RUN --mount=type=cache,id=ccache,target=/root/.ccache \
     --mount=type=cache,id=pip-cache,target=/root/.cache/pip \
     pip install --no-build-isolation . -v
 
+# Build Triton from source at TRITON_SHA - overrides the version pulled from dependencies
+
+# Initial clone (Cached forever)
+RUN git clone https://github.com/triton-lang/triton.git
+
+# We expect TRITON_SHA to be passed from the command line to break the cache
+# Set to v3.5.1 commit by default
+ARG TRITON_SHA=0add68262ab0a2e33b84524346cb27cbb2787356
+
+# This only runs if TRITON_SHA differs from the last build
+RUN --mount=type=cache,id=pip-cache,target=/root/.cache/pip \
+    --mount=type=cache,id=ccache,target=/root/.ccache \
+    cd triton && \
+    git fetch origin && \
+    git checkout "${TRITON_SHA}" && \
+    git submodule sync && \
+    git submodule update --init --recursive && \
+    pip install -r python/requirements.txt && \
+    pip install --no-build-isolation . -v && \
+    pip install python/triton_kernels --no-deps
 
 # =========================================================
 # STAGE 2: Runner (Transfers only necessary artifacts)
diff --git a/build-and-copy.sh b/build-and-copy.sh
index 7dbbf17..819dde7 100755
--- a/build-and-copy.sh
+++ b/build-and-copy.sh
@@ -1,12 +1,17 @@
 #!/bin/bash
 set -e
 
+# Start total time tracking
+START_TIME=$(date +%s)
+
 # Default values
 IMAGE_TAG="vllm-node"
 REBUILD_DEPS=false
 REBUILD_VLLM=false
 COPY_HOST=""
 SSH_USER="$USER"
+NO_BUILD=false
+TRITON_SHA=""
 
 # Help function
 usage() {
@@ -14,8 +19,10 @@ usage() {
     echo " -t, --tag : Image tag (default: 'vllm-node')"
     echo " --rebuild-deps : Set cache bust for dependencies"
     echo " --rebuild-vllm : Set cache bust for vllm"
+    echo " --triton-sha : Triton commit SHA (default: auto-detect v3.5.1 tag)"
     echo " -h, --copy-to-host : Host address to copy the image to (if not set, don't copy)"
     echo " -u, --user : Username for ssh command (default: \$USER)"
+    echo " --no-build : Skip building, only copy image (requires --copy-to-host)"
     echo " --help : Show this help message"
     exit 1
 }
@@ -26,42 +33,90 @@ while [[ "$#" -gt 0 ]]; do
         -t|--tag) IMAGE_TAG="$2"; shift ;;
         --rebuild-deps) REBUILD_DEPS=true ;;
         --rebuild-vllm) REBUILD_VLLM=true ;;
+        --triton-sha) TRITON_SHA="$2"; shift ;;
         -h|--copy-to-host) COPY_HOST="$2"; shift ;;
         -u|--user) SSH_USER="$2"; shift ;;
+        --no-build) NO_BUILD=true ;;
         --help) usage ;;
         *) echo "Unknown parameter passed: $1"; usage ;;
     esac
     shift
 done
 
-# Construct build command
-CMD=("docker" "build" "-t" "$IMAGE_TAG")
-
-if [ "$REBUILD_DEPS" = true ]; then
-    echo "Setting CACHEBUST_DEPS..."
-    CMD+=("--build-arg" "CACHEBUST_DEPS=$(date +%s)")
+# Validate --no-build usage
+if [ "$NO_BUILD" = true ] && [ -z "$COPY_HOST" ]; then
+    echo "Error: --no-build requires --copy-to-host to be specified"
+    exit 1
 fi
 
-if [ "$REBUILD_VLLM" = true ]; then
-    echo "Setting CACHEBUST_VLLM..."
-    CMD+=("--build-arg" "CACHEBUST_VLLM=$(date +%s)")
+# Build image (unless --no-build is set)
+BUILD_TIME=0
+if [ "$NO_BUILD" = false ]; then
+    # Auto-detect TRITON_SHA if not provided (abort if the tag cannot be resolved)
+    if [ -z "$TRITON_SHA" ]; then
+        echo "Auto-detecting Triton commit for v3.5.1..."
+        TRITON_SHA=$(git ls-remote https://github.com/triton-lang/triton.git refs/tags/v3.5.1 | cut -f1)
+        echo "Detected TRITON_SHA: ${TRITON_SHA:?could not resolve refs/tags/v3.5.1 via git ls-remote}"
+    fi
+
+    # Construct build command
+    CMD=("docker" "build" "-t" "$IMAGE_TAG")
+
+    if [ "$REBUILD_DEPS" = true ]; then
+        echo "Setting CACHEBUST_DEPS..."
+        CMD+=("--build-arg" "CACHEBUST_DEPS=$(date +%s)")
+    fi
+
+    if [ "$REBUILD_VLLM" = true ]; then
+        echo "Setting CACHEBUST_VLLM..."
+        CMD+=("--build-arg" "CACHEBUST_VLLM=$(date +%s)")
+    fi
+
+    # Add TRITON_SHA to build arguments
+    CMD+=("--build-arg" "TRITON_SHA=$TRITON_SHA")
+
+    # Add build context
+    CMD+=(".")
+
+    # Execute build
+    echo "Building image with command: ${CMD[*]}"
+    BUILD_START=$(date +%s)
+    "${CMD[@]}"
+    BUILD_END=$(date +%s)
+    BUILD_TIME=$((BUILD_END - BUILD_START))
+else
+    echo "Skipping build (--no-build specified)"
 fi
 
-# Add build context
-CMD+=(".")
-
-# Execute build
-echo "Building image with command: ${CMD[*]}"
-"${CMD[@]}"
-
 # Copy to host if requested
+COPY_TIME=0
 if [ -n "$COPY_HOST" ]; then
     echo "Copying image '$IMAGE_TAG' to ${SSH_USER}@${COPY_HOST}..."
+    COPY_START=$(date +%s)
     # Using the pipe method from README.md
     docker save "$IMAGE_TAG" | ssh "${SSH_USER}@${COPY_HOST}" "docker load"
+    COPY_END=$(date +%s)
+    COPY_TIME=$((COPY_END - COPY_START))
     echo "Copy complete."
 else
     echo "No host specified, skipping copy."
 fi
 
+# Calculate total time
+END_TIME=$(date +%s)
+TOTAL_TIME=$((END_TIME - START_TIME))
+
+# Display timing statistics
+echo ""
+echo "========================================="
+echo " TIMING STATISTICS"
+echo "========================================="
+if [ "$BUILD_TIME" -gt 0 ]; then
+    echo "Docker Build: $(printf '%02d:%02d:%02d' $((BUILD_TIME/3600)) $((BUILD_TIME%3600/60)) $((BUILD_TIME%60)))"
+fi
+if [ "$COPY_TIME" -gt 0 ]; then
+    echo "Image Copy: $(printf '%02d:%02d:%02d' $((COPY_TIME/3600)) $((COPY_TIME%3600/60)) $((COPY_TIME%60)))"
+fi
+echo "Total Time: $(printf '%02d:%02d:%02d' $((TOTAL_TIME/3600)) $((TOTAL_TIME%3600/60)) $((TOTAL_TIME%60)))"
+echo "========================================="
 echo "Done."