Updated README

This commit is contained in:
Eugene Rakhmatulin
2026-03-31 16:54:19 -07:00
parent e89104d91b
commit a889fed254
2 changed files with 8 additions and 4 deletions

View File

@@ -166,7 +166,8 @@ Autodiscover function in both `launch-cluster.sh` and `run-recipe.sh` now can de
You can try running a model on all 3 nodes in a pipeline-parallel configuration using the following recipe:
```bash
./run-recipe.sh recipes/3x-spark-cluster/qwen3.5-397b-int4-autoround --setup # you can drop --setup and --force-build on subsequent calls
./run-recipe.sh --discover # force mesh discovery
./run-recipe.sh recipes/3x-spark-cluster/qwen3.5-397b-int4-autoround --setup --no-ray --force-build # you can drop --setup and --force-build on subsequent calls
```
Please note that `--tensor-parallel-size 3` or `-tp 3` is not supported by any commonly used model, so the only two viable options to utilize all three nodes for a single model are:

View File

@@ -146,8 +146,11 @@ def load_recipe(recipe_path: Path) -> dict[str, Any]:
SystemExit: If recipe not found or validation fails
"""
if not recipe_path.exists():
# Try recipes directory with various extensions
# Try candidates in order: add extension to original path first,
# then fall back to flat recipes/ directory (for bare recipe names)
candidates = [
recipe_path.with_suffix(".yaml"),
recipe_path.with_suffix(".yml"),
RECIPES_DIR / recipe_path.name,
RECIPES_DIR / f"{recipe_path.name}.yaml",
RECIPES_DIR / f"{recipe_path.name}.yml",
@@ -325,7 +328,7 @@ def build_image(
if build_args:
cmd.extend(build_args)
if copy_to:
cmd.extend(["--copy-to", ",".join(copy_to)])
cmd.extend(["--copy-to", ",".join(copy_to), "--copy-parallel"])
print(f"Building image '{image}'...")
if build_args:
@@ -363,7 +366,7 @@ def download_model(model: str, copy_to: list[str] | None = None) -> bool:
cmd = [str(DOWNLOAD_SCRIPT), model]
if copy_to:
cmd.extend(["--copy-to", ",".join(copy_to)])
cmd.extend(["--copy-to", ",".join(copy_to), "--copy-parallel"])
print(f"Downloading model '{model}'...")
if copy_to: