diff --git a/README.md b/README.md index 7a7506e..3aefb51 100644 --- a/README.md +++ b/README.md @@ -166,7 +166,8 @@ Autodiscover function in both `launch-cluster.sh` and `run-recipe.sh` now can de You can try running a model on all 3 nodes in pipeline-parallel configuration using the following recipe: ```bash -./run-recipe.sh recipes/3x-spark-cluster/qwen3.5-397b-int4-autoround --setup # you can drop --setup and --force-build on subsequent calls +./run-recipe.sh --discover # force mesh discovery +./run-recipe.sh recipes/3x-spark-cluster/qwen3.5-397b-int4-autoround --setup --no-ray --force-build # you can drop --setup and --force-build on subsequent calls ``` Please note that `--tensor-parallel-size 3` or `-tp 3` is not supported by any commonly used model, so the only two viable options to utilize all three nodes for a single model are: diff --git a/run-recipe.py b/run-recipe.py index a141d82..6a2de79 100755 --- a/run-recipe.py +++ b/run-recipe.py @@ -146,8 +146,11 @@ def load_recipe(recipe_path: Path) -> dict[str, Any]: SystemExit: If recipe not found or validation fails """ if not recipe_path.exists(): - # Try recipes directory with various extensions + # Try candidates in order: append (not replace) the extension on the original path first, since recipe names may contain dots (with_suffix would turn "qwen3.5-397b-int4-autoround" into "qwen3.yaml"), + # then fall back to flat recipes/ directory (for bare recipe names) candidates = [ + recipe_path.with_name(f"{recipe_path.name}.yaml"), + recipe_path.with_name(f"{recipe_path.name}.yml"), RECIPES_DIR / recipe_path.name, RECIPES_DIR / f"{recipe_path.name}.yaml", RECIPES_DIR / f"{recipe_path.name}.yml", @@ -325,7 +328,7 @@ def build_image( if build_args: cmd.extend(build_args) if copy_to: - cmd.extend(["--copy-to", ",".join(copy_to)]) + cmd.extend(["--copy-to", ",".join(copy_to), "--copy-parallel"]) print(f"Building image '{image}'...") if build_args: @@ -363,7 +366,7 @@ def download_model(model: str, copy_to: list[str] | None = None) -> bool: cmd = [str(DOWNLOAD_SCRIPT), model] if copy_to: - cmd.extend(["--copy-to", ",".join(copy_to)]) + cmd.extend(["--copy-to", ",".join(copy_to), 
"--copy-parallel"]) print(f"Downloading model '{model}'...") if copy_to: