Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Introducing a pythonic CLI #5

Merged
merged 27 commits into from
Aug 26, 2024
Merged
Show file tree
Hide file tree
Changes from 21 commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
d6b4378
Introducing a pythonic CLI
marcromeyn Aug 21, 2024
2d5383a
Remove commented out code
marcromeyn Aug 21, 2024
840d22d
Small fix in readme
marcromeyn Aug 21, 2024
c593485
Adding --interactive
marcromeyn Aug 21, 2024
5106d3b
Starting to support plugins
marcromeyn Aug 21, 2024
a10cc9f
Using RunContext to simplify things
marcromeyn Aug 22, 2024
b0bfaa3
Fix some bugs + improved example
marcromeyn Aug 22, 2024
b1146c1
Fix typo
marcromeyn Aug 22, 2024
02f44bb
Make sure code in README is the same as task.py
marcromeyn Aug 22, 2024
31555da
Fixing failing tests
marcromeyn Aug 22, 2024
005af4e
Fix linting errors
marcromeyn Aug 22, 2024
bca6553
Run fmt
marcromeyn Aug 22, 2024
d64cdcf
Fix some more issues
marcromeyn Aug 22, 2024
3e4d537
Run formatting again
marcromeyn Aug 22, 2024
c284dcc
Fix spelling issue
marcromeyn Aug 22, 2024
2bdfa0a
Change API to create an experiment entrypoint
marcromeyn Aug 23, 2024
6c47eaa
Fix docstring
marcromeyn Aug 23, 2024
a50d81d
Adding default_factory arg to entrypoint
marcromeyn Aug 23, 2024
2fbaab6
Also expose default_factory in main
marcromeyn Aug 23, 2024
3a0adfe
Adding default_executor
marcromeyn Aug 23, 2024
7be0996
Adding default_plugins
marcromeyn Aug 23, 2024
e76d67f
Some fixes
marcromeyn Aug 26, 2024
a07ae58
Fix linting issues
marcromeyn Aug 26, 2024
677e59a
List -> list
marcromeyn Aug 26, 2024
c15e98e
Copy over __main__.py in slurm packaging
marcromeyn Aug 26, 2024
aadd121
Copy over __main__.py in slurm packaging
marcromeyn Aug 26, 2024
809a697
Add copyright header to fdl_runner
marcromeyn Aug 26, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
403 changes: 403 additions & 0 deletions examples/entrypoint/README.md

Large diffs are not rendered by default.

107 changes: 107 additions & 0 deletions examples/entrypoint/experiment.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
from dataclasses import dataclass
from typing import List

import nemo_run as run


@dataclass
class Model:
    """Dummy model config used by the example CLI entrypoints."""

    hidden_size: int  # width of the model's hidden representation
    num_layers: int  # number of layers in the model
    activation: str  # activation function name (my_model defaults to "relu")


@dataclass
class Optimizer:
    """Dummy optimizer config used by the example CLI entrypoints."""

    learning_rate: float  # base step size for parameter updates
    weight_decay: float  # decay coefficient — presumably L2 regularization; confirm
    betas: List[float]  # beta coefficients (my_optimizer defaults to [0.9, 0.999])


@run.cli.entrypoint
def train_model(model: Model, optimizer: Optimizer, epochs: int = 10, batch_size: int = 32):
    """
    Train a model using the specified configuration.

    Args:
        model (Model): Configuration for the model.
        optimizer (Optimizer): Configuration for the optimizer.
        epochs (int, optional): Number of training epochs. Defaults to 10.
        batch_size (int, optional): Batch size for training. Defaults to 32.
    """
    # Echo the resolved configuration before the simulated run.
    summary = (
        "Training model with the following configuration:",
        f"Model: {model}",
        f"Optimizer: {optimizer}",
        f"Epochs: {epochs}",
        f"Batch size: {batch_size}",
    )
    for line in summary:
        print(line)

    # Simulated training: one status line per epoch (1-based for display).
    for completed in range(1, epochs + 1):
        print(f"Epoch {completed}/{epochs}")

    print("Training completed!")


@run.cli.factory
@run.autoconvert
def my_model(hidden_size: int = 256, num_layers: int = 3, activation: str = "relu") -> Model:
    """
    Create a model configuration.

    Registered as a CLI factory; `run.autoconvert` wraps the direct Model
    construction so a config object is produced rather than a live instance.
    """
    return Model(hidden_size=hidden_size, num_layers=num_layers, activation=activation)


@run.cli.factory
@run.autoconvert
# NOTE(review): the mutable list default for `betas` is shared across calls.
# It is never mutated here, so this is safe today, but a tuple default would be
# more defensive — confirm the CLI factory machinery accepts tuples first.
def my_optimizer(
    learning_rate: float = 0.001, weight_decay: float = 1e-5, betas: List[float] = [0.9, 0.999]
) -> Optimizer:
    """
    Create an optimizer configuration.
    """
    return Optimizer(learning_rate=learning_rate, weight_decay=weight_decay, betas=betas)


@run.cli.factory
@run.autoconvert
def local_executor() -> run.LocalExecutor:
    """CLI factory producing a default-configured LocalExecutor."""
    return run.LocalExecutor()


@run.cli.entrypoint(type="experiment")
def train_models_experiment(
    ctx: run.cli.RunContext,
    models: List[Model] = [my_model(), my_model(hidden_size=512)],
    optimizers: List[Optimizer] = [my_optimizer(), my_optimizer(learning_rate=0.01)],
    epochs: int = 10,
    batch_size: int = 32,
    sequential: bool = False,
):
    """
    Run an experiment to train multiple models with different configurations.

    Args:
        ctx (run.cli.RunContext): The run context for the experiment.
        models (List[Model]): List of model configurations to train.
        optimizers (List[Optimizer]): List of optimizer configurations to use.
        epochs (int): Number of training epochs for each model.
        batch_size (int): Batch size for training.
        sequential (bool): Forwarded to ``ctx.launch`` — presumably runs the
            tasks one after another instead of in parallel; confirm against
            RunContext.launch.
    """

    with run.Experiment("train_models_experiment") as exp:
        # Pair models with optimizers positionally; if the lists differ in
        # length, zip silently drops the extras from the longer one.
        for i, (model, optimizer) in enumerate(zip(models, optimizers)):
            train = run.Partial(
                train_model, model=model, optimizer=optimizer, epochs=epochs, batch_size=batch_size
            )

            exp.add(train, name=f"train_model_{i}", executor=ctx.executor)

        ctx.launch(exp, sequential=sequential)


if __name__ == "__main__":
    # Expose the experiment entrypoint as a command-line interface.
    run.cli.main(train_models_experiment)
Binary file added examples/entrypoint/img/experiment-2.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added examples/entrypoint/img/experiment-3.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added examples/entrypoint/img/experiment-4.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added examples/entrypoint/img/experiment-5.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added examples/entrypoint/img/experiment-6.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added examples/entrypoint/img/experiment-help.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added examples/entrypoint/img/task-2.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added examples/entrypoint/img/task-3.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added examples/entrypoint/img/task-4.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added examples/entrypoint/img/task-5.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added examples/entrypoint/img/task-6.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added examples/entrypoint/img/task-7.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added examples/entrypoint/img/task-help.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added examples/entrypoint/img/task-repl.gif
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
73 changes: 73 additions & 0 deletions examples/entrypoint/task.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
from dataclasses import dataclass
from typing import List

import nemo_run as run


@dataclass
class Model:
    """Dummy model config used by the example CLI task."""

    hidden_size: int  # width of the model's hidden representation
    num_layers: int  # number of layers in the model
    activation: str  # activation function name (my_model defaults to "relu")


@dataclass
class Optimizer:
    """Dummy optimizer config used by the example CLI task."""

    learning_rate: float  # base step size for parameter updates
    weight_decay: float  # decay coefficient — presumably L2 regularization; confirm
    betas: List[float]  # beta coefficients (my_optimizer defaults to [0.9, 0.999])


@run.cli.factory
@run.autoconvert
def my_model(hidden_size: int = 256, num_layers: int = 3, activation: str = "relu") -> Model:
    """
    Create a model configuration.

    Registered as a CLI factory; `run.autoconvert` wraps the direct Model
    construction so a config object is produced (cf. my_optimizer below,
    which builds a run.Config explicitly).
    """
    return Model(hidden_size=hidden_size, num_layers=num_layers, activation=activation)


@run.cli.factory
# NOTE(review): the mutable list default for `betas` is shared across calls.
# It is never mutated here, so this is safe today, but a tuple default would be
# more defensive — confirm the CLI factory machinery accepts tuples first.
def my_optimizer(
    learning_rate: float = 0.001, weight_decay: float = 1e-5, betas: List[float] = [0.9, 0.999]
) -> run.Config[Optimizer]:
    """Create an optimizer configuration (explicit run.Config, no autoconvert)."""
    return run.Config(
        Optimizer, learning_rate=learning_rate, weight_decay=weight_decay, betas=betas
    )


def train_model(
    model: Model,
    optimizer: Optimizer,
    epochs: int = 10,
    batch_size: int = 32,
):
    """
    Train a model using the specified configuration.

    Args:
        model (Model): Configuration for the model.
        optimizer (Optimizer): Configuration for the optimizer.
        epochs (int, optional): Number of training epochs. Defaults to 10.
        batch_size (int, optional): Batch size for training. Defaults to 32.
    """
    # Echo the resolved configuration, one labeled line per setting.
    print("Training model with the following configuration:")
    for label, value in (
        ("Model", model),
        ("Optimizer", optimizer),
        ("Epochs", epochs),
        ("Batch size", batch_size),
    ):
        print(f"{label}: {value}")

    # Simulated training: report progress for each epoch (1-based display).
    epoch = 0
    while epoch < epochs:
        epoch += 1
        print(f"Epoch {epoch}/{epochs}")

    print("Training completed!")


if __name__ == "__main__":
    # Expose train_model as a command-line interface.
    run.cli.main(train_model)
98 changes: 98 additions & 0 deletions examples/entrypoint/task_with_defaults.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
from dataclasses import dataclass
from typing import List

import nemo_run as run


@dataclass
class Model:
    """Dummy model config used by the defaults example."""

    hidden_size: int  # width of the model's hidden representation
    num_layers: int  # number of layers in the model
    activation: str  # activation function name (my_model defaults to "relu")


@dataclass
class Optimizer:
    """Dummy optimizer config used by the defaults example."""

    learning_rate: float  # base step size for parameter updates
    weight_decay: float  # decay coefficient — presumably L2 regularization; confirm
    betas: List[float]  # beta coefficients (my_optimizer defaults to [0.9, 0.999])


@run.cli.factory
@run.autoconvert
def my_model(hidden_size: int = 256, num_layers: int = 3, activation: str = "relu") -> Model:
    """
    Create a model configuration.

    Registered as a CLI factory; `run.autoconvert` wraps the direct Model
    construction so a config object is produced rather than a live instance.
    """
    return Model(hidden_size=hidden_size, num_layers=num_layers, activation=activation)


@run.cli.factory
# NOTE(review): the mutable list default for `betas` is shared across calls.
# It is never mutated here, so this is safe today, but a tuple default would be
# more defensive — confirm the CLI factory machinery accepts tuples first.
def my_optimizer(
    learning_rate: float = 0.001, weight_decay: float = 1e-5, betas: List[float] = [0.9, 0.999]
) -> run.Config[Optimizer]:
    """Create an optimizer configuration (explicit run.Config, no autoconvert)."""
    return run.Config(
        Optimizer, learning_rate=learning_rate, weight_decay=weight_decay, betas=betas
    )


def train_model(
    model: Model,
    optimizer: Optimizer,
    epochs: int = 10,
    batch_size: int = 32,
):
    """
    Train a model using the specified configuration.

    Args:
        model (Model): Configuration for the model.
        optimizer (Optimizer): Configuration for the optimizer.
        epochs (int, optional): Number of training epochs. Defaults to 10.
        batch_size (int, optional): Batch size for training. Defaults to 32.
    """
    # Emit the resolved configuration in a single write; output is identical
    # to printing each line separately.
    config_lines = [
        "Training model with the following configuration:",
        f"Model: {model}",
        f"Optimizer: {optimizer}",
        f"Epochs: {epochs}",
        f"Batch size: {batch_size}",
    ]
    print("\n".join(config_lines))

    # Simulated training: one progress line per epoch (1-based display).
    for ordinal in range(1, epochs + 1):
        print(f"Epoch {ordinal}/{epochs}")

    print("Training completed!")


def custom_defaults() -> run.Partial[train_model]:
    """Build the default task config used when the CLI supplies no overrides."""
    return run.Partial(
        train_model,
        model=my_model(hidden_size=512),
        optimizer=my_optimizer(learning_rate=0.0005),
        epochs=50,
        batch_size=2048,
    )


@run.autoconvert
def local_executor() -> run.Executor:
    """Build the default executor config (a LocalExecutor)."""
    return run.LocalExecutor()


class DummyPlugin(run.Plugin):
    """Example plugin that doubles the task's configured epoch count."""

    def setup(self, task: run.Partial[train_model], executor: run.Executor):
        # Mutates the task config in place; presumably invoked by the runner
        # before launch — confirm against run.Plugin's contract.
        task.epochs *= 2


if __name__ == "__main__":
    # Expose train_model as a CLI with pre-wired defaults: custom task config,
    # a local executor, and DummyPlugin registered as a default plugin.
    run.cli.main(
        train_model,
        default_factory=custom_defaults,
        default_executor=local_executor(),
        default_plugins=run.Config(DummyPlugin),
    )
3 changes: 2 additions & 1 deletion requirements-dev.lock
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,12 @@
# all-features: false
# with-sources: false
# generate-hashes: false
# universal: false

-e file:.
absl-py==2.1.0
# via fiddle
appnope==0.1.4
# via ipykernel
asttokens==2.4.1
# via stack-data
attrs==24.2.0
Expand Down
1 change: 0 additions & 1 deletion requirements.lock
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
# all-features: false
# with-sources: false
# generate-hashes: false
# universal: false

-e file:.
absl-py==2.1.0
Expand Down
14 changes: 5 additions & 9 deletions src/nemo_run/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from nemo_run.api import autoconvert, list_tasks, task
from nemo_run import cli
from nemo_run.api import autoconvert, dryrun_fn
from nemo_run.config import Config, Partial, Script
from nemo_run.core.execution.base import (
Executor,
ExecutorMacros,
FaultTolerance,
Torchrun,
)
from nemo_run.core.execution.base import Executor, ExecutorMacros, FaultTolerance, Torchrun
from nemo_run.core.execution.local import LocalExecutor
from nemo_run.core.execution.skypilot import SkypilotExecutor
from nemo_run.core.execution.slurm import SlurmExecutor
Expand All @@ -35,6 +31,8 @@

__all__ = [
"autoconvert",
"cli",
"dryrun_fn",
"Config",
"DevSpace",
"Executor",
Expand All @@ -43,7 +41,6 @@
"FaultTolerance",
"GitArchivePackager",
"help",
"list_tasks",
"LocalExecutor",
"LocalTunnel",
"Packager",
Expand All @@ -54,7 +51,6 @@
"SkypilotExecutor",
"SlurmExecutor",
"SSHTunnel",
"task",
"Torchrun",
]

Expand Down
Loading
Loading