diff --git a/README.md b/README.md index 6af4b37..f6eb5f3 100644 --- a/README.md +++ b/README.md @@ -164,6 +164,17 @@ Don't do it every time you rebuild, because it will slow down compilation times. For periodic maintenance, I recommend using a filter: `docker builder prune --filter until=72h` +### 2026-02-10 + +#### Cache Directory Mounting + +`launch-cluster.sh` now automatically mounts default cache directories into the container to improve cold start times: +- `~/.cache/vllm` +- `~/.cache/flashinfer` +- `~/.triton` + +To disable this behavior (clean start), use the `--no-cache-dirs` flag. + ### 2026-02-09 - Migrated to a new base image with PyTorch 2.10 compiled with Spark support. With this change, wheels build is no longer a recommended way - please use a source build instead. @@ -732,6 +743,7 @@ You can override the auto-detected values if needed: | `--nccl-debug` | NCCL debug level (e.g., INFO, WARN). Defaults to INFO if flag is present but value is omitted. | | `--check-config` | Check configuration and auto-detection without launching. | | `--solo` | Solo mode: skip autodetection, launch only on current node, do not launch Ray cluster | +| `--no-cache-dirs` | Do not mount default cache directories (~/.cache/vllm, ~/.cache/flashinfer, ~/.triton). | | `--launch-script` | Path to bash script to execute in the container (from examples/ directory or absolute path). If launch script is specified, action should be omitted. | | `-d` | Run in daemon mode (detached). | diff --git a/launch-cluster.sh b/launch-cluster.sh index 17434c1..0d4d354 100755 --- a/launch-cluster.sh +++ b/launch-cluster.sh @@ -31,6 +31,7 @@ SCRIPT_DIR="$(dirname "$(realpath "$0")")" ACTIONS_ARG="" SOLO_MODE="false" +MOUNT_CACHE_DIRS="true" # Function to print usage usage() { @@ -46,6 +47,7 @@ usage() { echo " --launch-script Path to bash script to execute in the container (from examples/ directory or absolute path). If launch script is specified, action should be omitted."
echo " --check-config Check configuration and auto-detection without launching" echo " --solo Solo mode: skip autodetection, launch only on current node, do not launch Ray cluster" + echo " --no-cache-dirs Do not mount default cache directories (~/.cache/vllm, ~/.cache/flashinfer, ~/.triton)" echo " -d Daemon mode (only for 'start' action)" echo " action start | stop | status | exec (Default: start). Not compatible with --launch-script." echo " command Command to run (only for 'exec' action). Not compatible with --launch-script." @@ -77,6 +79,7 @@ while [[ "$#" -gt 0 ]]; do ;; --check-config) CHECK_CONFIG="true" ;; --solo) SOLO_MODE="true" ;; + --no-cache-dirs) MOUNT_CACHE_DIRS="false" ;; -d) DAEMON_MODE="true" ;; -h|--help) usage ;; start|stop|status) @@ -127,6 +130,22 @@ if [[ -n "$NCCL_DEBUG_VAL" ]]; then esac fi +# Add cache dirs if requested +CACHE_DIRS_TO_CREATE=() +if [[ "$MOUNT_CACHE_DIRS" == "true" ]]; then + # vLLM Cache + DOCKER_ARGS="$DOCKER_ARGS -v $HOME/.cache/vllm:/root/.cache/vllm" + CACHE_DIRS_TO_CREATE+=("$HOME/.cache/vllm") + + # FlashInfer Cache + DOCKER_ARGS="$DOCKER_ARGS -v $HOME/.cache/flashinfer:/root/.cache/flashinfer" + CACHE_DIRS_TO_CREATE+=("$HOME/.cache/flashinfer") + + # Triton Cache + DOCKER_ARGS="$DOCKER_ARGS -v $HOME/.triton:/root/.triton" + CACHE_DIRS_TO_CREATE+=("$HOME/.triton") +fi + # Resolve launch script path if specified if [[ -n "$LAUNCH_SCRIPT_PATH" ]]; then # Check if it's an absolute path or relative path that exists @@ -276,6 +295,12 @@ if [[ "$CHECK_CONFIG" == "true" ]]; then echo " Image Name: $IMAGE_NAME" echo " ETH Interface: $ETH_IF" echo " IB Interface: $IB_IF" + echo " Docker Args: $DOCKER_ARGS" + if [[ "$MOUNT_CACHE_DIRS" == "true" ]]; then + echo " Mounting Cache Dirs: ${CACHE_DIRS_TO_CREATE[*]}" + else + echo " Mounting Cache Dirs: (Disabled)" + fi exit 0 fi @@ -508,6 +533,13 @@ start_cluster() { # Start Head Node echo "Starting Head Node on $HEAD_IP..." 
+ # Ensure cache dirs exist on head + if [[ "$MOUNT_CACHE_DIRS" == "true" ]]; then + for dir in "${CACHE_DIRS_TO_CREATE[@]}"; do + mkdir -p "$dir" + done + fi + local head_cmd_args=() if [[ "$SOLO_MODE" == "true" ]]; then if [[ ${#MOD_PATHS[@]} -gt 0 ]]; then @@ -534,6 +566,13 @@ start_cluster() { for worker in "${PEER_NODES[@]}"; do echo "Starting Worker Node on $worker..." + # Ensure cache dirs exist on worker + if [[ "$MOUNT_CACHE_DIRS" == "true" ]]; then + # Shell-quote each dir: ssh hands the command string to the worker's shell, which would otherwise re-split paths containing spaces + local dirs_str; printf -v dirs_str '%q ' "${CACHE_DIRS_TO_CREATE[@]}" + ssh "$worker" "mkdir -p -- $dirs_str" + fi + local docker_run_cmd="docker run -d --privileged --gpus all --rm --ipc=host --network host --name $CONTAINER_NAME $DOCKER_ARGS $IMAGE_NAME" if [[ ${#MOD_PATHS[@]} -gt 0 ]]; then