Updated README
This commit is contained in:
@@ -166,7 +166,8 @@ Autodiscover function in both `launch-cluster.sh` and `run-recipe.sh` now can de
|
|||||||
You can try running a model on all 3 nodes in pipeline-parallel configuration using the following recipe:
|
You can try running a model on all 3 nodes in pipeline-parallel configuration using the following recipe:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
./run-recipe.sh recipes/3x-spark-cluster/qwen3.5-397b-int4-autoround --setup # you can drop --setup and --force-build on subsequent calls
|
./run-recipe.sh --discover # force mesh discovery
|
||||||
|
./run-recipe.sh recipes/3x-spark-cluster/qwen3.5-397b-int4-autoround --setup --no-ray --force-build # you can drop --setup and --force-build on subsequent calls
|
||||||
```
|
```
|
||||||
|
|
||||||
Please note that `--tensor-parallel-size 3` or `-tp 3` is not supported by any commonly used model, so the only two viable options to utilize all three nodes for a single model are:
|
Please note that `--tensor-parallel-size 3` or `-tp 3` is not supported by any commonly used model, so the only two viable options to utilize all three nodes for a single model are:
|
||||||
|
|||||||
@@ -146,8 +146,11 @@ def load_recipe(recipe_path: Path) -> dict[str, Any]:
|
|||||||
SystemExit: If recipe not found or validation fails
|
SystemExit: If recipe not found or validation fails
|
||||||
"""
|
"""
|
||||||
if not recipe_path.exists():
|
if not recipe_path.exists():
|
||||||
# Try recipes directory with various extensions
|
# Try candidates in order: add extension to original path first,
|
||||||
|
# then fall back to flat recipes/ directory (for bare recipe names)
|
||||||
candidates = [
|
candidates = [
|
||||||
|
recipe_path.with_suffix(".yaml"),
|
||||||
|
recipe_path.with_suffix(".yml"),
|
||||||
RECIPES_DIR / recipe_path.name,
|
RECIPES_DIR / recipe_path.name,
|
||||||
RECIPES_DIR / f"{recipe_path.name}.yaml",
|
RECIPES_DIR / f"{recipe_path.name}.yaml",
|
||||||
RECIPES_DIR / f"{recipe_path.name}.yml",
|
RECIPES_DIR / f"{recipe_path.name}.yml",
|
||||||
@@ -325,7 +328,7 @@ def build_image(
|
|||||||
if build_args:
|
if build_args:
|
||||||
cmd.extend(build_args)
|
cmd.extend(build_args)
|
||||||
if copy_to:
|
if copy_to:
|
||||||
cmd.extend(["--copy-to", ",".join(copy_to)])
|
cmd.extend(["--copy-to", ",".join(copy_to), "--copy-parallel"])
|
||||||
|
|
||||||
print(f"Building image '{image}'...")
|
print(f"Building image '{image}'...")
|
||||||
if build_args:
|
if build_args:
|
||||||
@@ -363,7 +366,7 @@ def download_model(model: str, copy_to: list[str] | None = None) -> bool:
|
|||||||
|
|
||||||
cmd = [str(DOWNLOAD_SCRIPT), model]
|
cmd = [str(DOWNLOAD_SCRIPT), model]
|
||||||
if copy_to:
|
if copy_to:
|
||||||
cmd.extend(["--copy-to", ",".join(copy_to)])
|
cmd.extend(["--copy-to", ",".join(copy_to), "--copy-parallel"])
|
||||||
|
|
||||||
print(f"Downloading model '{model}'...")
|
print(f"Downloading model '{model}'...")
|
||||||
if copy_to:
|
if copy_to:
|
||||||
|
|||||||
Reference in New Issue
Block a user