From 9aba299ea8dfde52c7412da006990f539d580235 Mon Sep 17 00:00:00 2001 From: Sean Yang Date: Mon, 12 Aug 2024 10:45:52 -0700 Subject: [PATCH] Add FedJobAPI documentation (#2718) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add JobAPI docs * 2.5 misc doc updates * address comments --------- Co-authored-by: Yuan-Ting Hsieh (謝沅廷) Co-authored-by: Chester Chen <512707+chesterxgchen@users.noreply.github.com> --- docs/getting_started.rst | 4 + docs/programming_guide.rst | 1 + docs/programming_guide/execution_api_type.rst | 3 +- .../execution_api_type/client_api.rst | 65 +++-- docs/programming_guide/fed_job_api.rst | 252 ++++++++++++++++++ docs/publications_and_talks.rst | 1 + docs/real_world_fl/cloud_deployment.rst | 5 + docs/resources/Dockerfile | 4 +- nvflare/job_config/fed_job.py | 42 ++- tests/unit_test/job_config/fed_job_test.py | 11 +- 10 files changed, 341 insertions(+), 47 deletions(-) create mode 100644 docs/programming_guide/fed_job_api.rst diff --git a/docs/getting_started.rst b/docs/getting_started.rst index 171a5e6c8c..6d6a150eff 100644 --- a/docs/getting_started.rst +++ b/docs/getting_started.rst @@ -175,6 +175,10 @@ Using any text editor to edit the Dockerfile and paste the following: .. literalinclude:: resources/Dockerfile :language: dockerfile +.. note:: + + For nvflare version 2.3 set PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:23.02-py3 + We can then build the new container by running docker build in the directory containing this Dockerfile, for example tagging it nvflare-pt: diff --git a/docs/programming_guide.rst b/docs/programming_guide.rst index 28e8b7992b..1053f84270 100644 --- a/docs/programming_guide.rst +++ b/docs/programming_guide.rst @@ -35,6 +35,7 @@ Please refer to :ref:`application` for more details. .. 
toctree:: :maxdepth: 1 + programming_guide/fed_job_api programming_guide/workflows_and_controllers programming_guide/execution_api_type programming_guide/fl_model diff --git a/docs/programming_guide/execution_api_type.rst b/docs/programming_guide/execution_api_type.rst index 77baf7806a..9f39978307 100644 --- a/docs/programming_guide/execution_api_type.rst +++ b/docs/programming_guide/execution_api_type.rst @@ -35,7 +35,8 @@ The :ref:`client_api` provides the most straightforward way to write FL code, and can easily be used to convert centralized code with minimal code changes. The Client API uses the :class:`FLModel` object for data transfer and supports common tasks such as train, validate, and submit_model. -Additionally, options for using decorators or PyTorch Lightning are also available. +Options for using decorators or PyTorch Lightning are also available. +For Client API executors, the in-process and external-process executors are provided for different use cases. We recommend users start with the Client API, and to consider the other types for more specific cases as required. diff --git a/docs/programming_guide/execution_api_type/client_api.rst b/docs/programming_guide/execution_api_type/client_api.rst index e5ed5cb7f0..dee674477c 100644 --- a/docs/programming_guide/execution_api_type/client_api.rst +++ b/docs/programming_guide/execution_api_type/client_api.rst @@ -167,20 +167,26 @@ Client API communication patterns We offer various implementations of Client APIs tailored to different scenarios, each linked with distinct communication patterns. -Broadly, we present in-process and sub-process executors. The in-process executor, slated for release in NVFlare 2.5.0, -entails both training scripts and client executor operating within the same process. The training scripts will be launched once -at the event of START_RUN. The training scripts keep on running till the END_RUN event. Communication between them occurs -through an in-memory databus. 
+In-process Client API +--------------------- -On the other hand, the LauncherExecutor employs a sub-process to execute training scripts, leading to the client executor -and training scripts residing in separate processes. The "launch_once" option is provided to the SubprocessLauncher to control +The in-process executor entails both the training script and client executor operating within the same process. +The training script will be launched once at the event of START_RUN and will keep on running till the END_RUN event. +Communication between them occurs through an efficient in-memory databus. + +When the training process involves either a single GPU or no GPUs, and the training script doesn't integrate third-party +training systems, the in-process executor is preferable (when available). + +Sub-process Client API +---------------------- + +On the other hand, the LauncherExecutor employs the SubprocessLauncher to use a sub-process to execute the training script. This results in the client executor +and training script residing in separate processes. The "launch_once" option is provided to the SubprocessLauncher to control whether to launch the external script everytime when getting the task from server, or just launch the script once at the event of START_RUN and keeps running till the END_RUN event. Communication between them is facilitated by either CellPipe (default) or FilePipe. -When the training process involves either a single GPU or no GPUs, and the training script doesn't integrate third-party -training systems, the in-process executor is preferable (when available). For scenarios involving multi-GPU training or -the utilization of external training infrastructure, opting for the Launcher executor might be more suitable. +For scenarios involving multi-GPU training or the utilization of external training infrastructure, opting for the Launcher executor might be more suitable. 
Choice of different Pipes @@ -203,34 +209,35 @@ Configuration Different configurations are available for each type of executor. -Definition lists: - in-process executor configuration - .. literalinclude:: ../../../job_templates/sag_pt_in_proc/config_fed_client.conf +--------------------------------- +This configuration specifically caters to PyTorch applications, providing serialization and deserialization +(aka Decomposers) for commonly used PyTorch objects. For non-PyTorch applications, the generic +:class:`InProcessClientAPIExecutor` can be employed. + +.. literalinclude:: ../../../job_templates/sag_pt_in_proc/config_fed_client.conf - This configuration specifically caters to PyTorch applications, providing serialization and deserialization - (aka Decomposers) for commonly used PyTorch objects. For non-PyTorch applications, the generic - ``InProcessClientAPIExecutor`` can be employed. subprocess launcher Executor configuration - In the config_fed_client in the FLARE app, in order to launch the training script we use the - :class:`SubprocessLauncher` component. - The defined ``script`` is invoked, and ``launch_once`` can be set to either - launch once for the whole job (launch_once = True), or launch a process for each task received from the server (launch_once = False) +------------------------------------------ +In the config_fed_client in the FLARE app, in order to launch the training script we use the +:class:`SubprocessLauncher` component. +The defined ``script`` is invoked, and ``launch_once`` can be set to either +launch once for the whole job (launch_once = True), or launch a process for each task received from the server (launch_once = False) - ``launch_once`` dictates how many times the training scripts are invoked during the overall training process. - When set to False, the executor essentially invokes ``python .py`` every round of training. - Typically, launch_once is set to True. 
+``launch_once`` dictates how many times the training scripts are invoked during the overall training process. +When set to False, the executor essentially invokes ``python .py`` every round of training. +Typically, launch_once is set to True. - A corresponding :class:`LauncherExecutor` - is used as the executor to handle the tasks and perform the data exchange using the pipe. - For the Pipe component we provide implementations of :class:`FilePipe` - and :class:`CellPipe`. +A corresponding :class:`ClientAPILauncherExecutor` +is used as the executor to handle the tasks and perform the data exchange using the pipe. +For the Pipe component we provide implementations of :class:`FilePipe` +and :class:`CellPipe`. - .. literalinclude:: ../../../job_templates/sag_pt/config_fed_client.conf +.. literalinclude:: ../../../job_templates/sag_pt/config_fed_client.conf - For example configurations, take a look at the :github_nvflare_link:`job_templates ` - directory for templates using the launcher and Client API. +For example configurations, take a look at the :github_nvflare_link:`job_templates ` +directory for templates using the launcher and Client API. .. note:: In that case that the user does not need to launch the process and instead diff --git a/docs/programming_guide/fed_job_api.rst b/docs/programming_guide/fed_job_api.rst new file mode 100644 index 0000000000..2754003d15 --- /dev/null +++ b/docs/programming_guide/fed_job_api.rst @@ -0,0 +1,252 @@ +.. _fed_job_api: + +########## +FedJob API +########## + +The FLARE :class:`FedJob` API allows users to Pythonically define and create job configurations. + +Core Concepts +============= + +* Use the :func:`to` routine to assign objects (e.g. controllers, executor, models, filters, components etc.) to the server or clients. +* Export the job to a configuration with :func:`export_job`. +* Run the job in the simulator with :func:`simulator_run`. + +Table overview of the :class:`FedJob` API: + +.. 
list-table:: FedJob + :widths: 25 35 50 + :header-rows: 1 + + * - API + - Description + - API Doc Link + * - to + - Assign object to target. + - :func:`to` + * - to_server + - Assign object to server. + - :func:`to_server` + * - to_clients + - Assign object to all clients. + - :func:`to_clients` + * - as_id + - Return generated uuid of object. Object will be added as component if referenced. + - :func:`as_id` + * - simulator_run + - Run the job with the simulator. + - :func:`simulator_run` + * - export_job + - Export the job configuration. + - :func:`export_job` + + +Here is an example of how to create a simple cifar10_fedavg job using the :class:`FedJob` API. +We assign a FedAvg controller and the initial PyTorch model to the server, and assign a ScriptExecutor for our training script to the clients. +Then we use the simulator to run the job: + +.. code-block:: python + + from src.net import Net + + from nvflare import FedAvg, FedJob, ScriptExecutor + + if __name__ == "__main__": + n_clients = 2 + num_rounds = 2 + train_script = "src/cifar10_fl.py" + + job = FedJob(name="cifar10_fedavg") + + # Define the controller workflow and send to server + controller = FedAvg( + num_clients=n_clients, + num_rounds=num_rounds, + ) + job.to_server(controller) + + # Define the initial global model and send to server + job.to_server(Net()) + + # Send executor to all clients + executor = ScriptExecutor( + task_script_path=train_script, task_script_args="" # f"--batch_size 32 --data_path /tmp/data/site-{i}" + ) + job.to_clients(executor) + + # job.export_job("/tmp/nvflare/jobs/job_config") + job.simulator_run("/tmp/nvflare/jobs/workdir", n_clients=n_clients) + + +Initializing the FedJob +======================= + +Initialize the :class:`FedJob` object with the following arguments: + +* ``name`` (str): for job name. +* ``min_clients`` (int): required for the job, will be set in the ``meta.json``. +* ``mandatory_clients`` (List[str]): to run the job, will be set in the ``meta.json``. 
+* ``key_metric`` (str): the metric used for global model selection, will be used by the preconfigured :class:`IntimeModelSelector`. + +Example: + +.. code-block:: python + + job = FedJob(name="cifar10_fedavg", min_clients=2, mandatory_clients=["site-1", "site-2"], key_metric="accuracy") + +Assigning objects with :func:`to` +======================================================================= + +Assign objects with :func:`to` for a specific ``target``, +:func:`to_server` for the server, and +:func:`to_clients` for all the clients. + +These functions have the following parameters which are used depending on the type of object: + +* ``obj`` (any): The object to be assigned. The obj will be given a default id if none is provided based on its type. +* ``target`` (str): (For :func:`to`) The target location of the object. Can be “server” or a client name, e.g. “site-1”. +* ``tasks`` (List[str]): If object is an Executor or Filter, optional list of tasks that should be handled. Defaults to None. If None, all tasks will be handled using [*]. +* ``gpu`` (int | List[int]): GPU index or list of GPU indices used for simulating the run on that target. +* ``filter_type`` (FilterType): The type of filter used. Either FilterType.TASK_RESULT or FilterType.TASK_DATA. +* ``id`` (int): Optional user-defined id for the object. Defaults to None and ID will automatically be assigned. + +.. note:: + + In order for the FedJob to use the values of arguments passed into the ``obj``, the arguments must be set as instance variables of the same name (or prefixed with ``_``) in the constructor. + +Below we cover in-depth how different types of objects are handled when using :func:`to`: + +Controller +---------- + +If the object is a :class:`Controller` sent to the server, the controller is added to the server app workflows. + +* If the ``key_metric`` is defined in the FedJob (see initialization), an :class:`IntimeModelSelector` widget will be added for best model selection. 
+* A :class:`ValidationJsonGenerator` is automatically added for creating json validation results. +* If PyTorch and TensorBoard are supported, then :class:`TBAnalyticsReceiver` is automatically added to receive analytics data to save to TensorBoard. Other types of receivers can be added as components with :func:`to`. + +Example: + +.. code-block:: python + + controller = FedAvg( + num_clients=n_clients, + num_rounds=num_rounds, + ) + job.to(controller, "server") + +If the object is a :class:`Controller` sent to a client, the controller is added to the client app components as a client-side controller. +The controller can then be used by the :class:`ClientControllerExecutor`. + + +Executor +-------- + +If the object is an :class:`Executor`, it must be sent to a client. The executor is added to the client app executors. + +* The ``tasks`` parameter specifies the tasks that the executor is defined to handle. +* The ``gpu`` parameter specifies which gpus to use for simulating the run on the target. +* If the object is a :class:`ScriptExecutor`, the task_script_path will be added to the external scripts to be included in the custom directory. +* If the object is a :class:`ScriptLauncherExecutor`, the launch_script will be launched in a subprocess. Corresponding :class:`SubprocessLauncher`, :class:`CellPipe`, :class:`MetricRelay`, and :class:`ExternalConfigurator` components will be automatically configured. +* The :class:`ConvertToFedEvent` widget is automatically added to convert local events to federated events. + +Example: + +.. code-block:: python + + executor = ScriptExecutor(task_script_path="src/cifar10_fl.py", task_script_args="") + job.to(executor, "site-1", tasks=["train"], gpu=0) + + +Script (str) +------------ + +If the object is a str, it is treated as an external script and will be included in the custom directory. + +Example: + +.. 
code-block:: python + + job.to("src/cifar10_fl.py", "site-1") + + +Filter +------ + +If the object is a :class:`Filter`, users must specify the ``filter_type`` +as either FilterType.TASK_RESULT (flow from executor to controller) or FilterType.TASK_DATA (flow from controller to executor). + +The filter will be added to task_data_filters or task_result_filters accordingly and be applied to the specified ``tasks``. + +Example: + +.. code-block:: python + + pp_filter = PercentilePrivacy(percentile=10, gamma=0.01) + job.to(pp_filter, "site-1", tasks=["train"], filter_type=FilterType.TASK_RESULT) + + +Model +----- +If the object is a common model type, a corresponding persistor will automatically be configured with the model. + +For PyTorch models (``torch.nn.Module``) we add a :class:`PTFileModelPersistor` and +:class:`PTFileModelLocator`, and for TensorFlow models (``tf.keras.Model``) we add a :class:`TFModelPersistor`. + +Example: + +.. code-block:: python + + job.to(Net(), "server") + +For unsupported models, the model and persistor can be added as components. + + +Components +---------- +For any object that does not fall under any of the previous types, it is added as a component with ``id``. +The ``id`` can be either specified as a parameter, or it will be automatically assigned. Components may reference other components by id. + +If an id generated by :func:`as_id` is referenced by another added object, the referenced object will also be added as a component. +In the example below, comp2 is assigned to the server. Since comp1 was referenced in comp2 with :func:`as_id`, comp1 will also be added as a component to the server. + +Example: + +.. code-block:: python + + comp1 = Component1() + comp2 = Component2(sub_component_id=job.as_id(comp1)) + job.to(comp2, "server") + + +Running the Job +=============== + +Simulator +--------- + +Run the FedJob with the simulator with :func:`simulator_run` in the ``workspace`` with ``n_clients`` and ``threads``. 
+(Note: only set ``n_clients`` if you have not specified clients using :func:`to`) + +Example: + +.. code-block:: python + + job.simulator_run(workspace="/tmp/nvflare/jobs/workdir", n_clients=2, threads=2) + + +Export Configuration +-------------------- +We can export the job configuration with :func:`export_job` to the ``job_root`` directory. + +Example: + +.. code-block:: python + + job.export_job(job_root="/tmp/nvflare/jobs/job_config") + +Examples +======== + +To see examples of how the FedJob API can be used for different applications, refer to the :github_nvflare_link:`Getting Started ` examples. diff --git a/docs/publications_and_talks.rst b/docs/publications_and_talks.rst index bd700723b2..422512da72 100644 --- a/docs/publications_and_talks.rst +++ b/docs/publications_and_talks.rst @@ -21,6 +21,7 @@ Publications: 2023 Publications: 2022 ------------------ +* **2022-11** `Federated Learning with Azure Machine Learning `__ (Video) * **2022-10** `Auto-FedRL: Federated Hyperparameter Optimization for Multi-institutional Medical Image Segmentation `__ (`ECCV 2022 `__) * **2022-10** `Joint Multi Organ and Tumor Segmentation from Partial Labels Using Federated Learning `__ (`DeCaF @ MICCAI 2022 `__) * **2022-10** `Split-U-Net: Preventing Data Leakage in Split Learning for Collaborative Multi-modal Brain Tumor Segmentation `__ (`DeCaF @ MICCAI 2022 `__) diff --git a/docs/real_world_fl/cloud_deployment.rst b/docs/real_world_fl/cloud_deployment.rst index 8ee2f6d4ec..3d438628e4 100644 --- a/docs/real_world_fl/cloud_deployment.rst +++ b/docs/real_world_fl/cloud_deployment.rst @@ -219,6 +219,11 @@ The configuration file provided is formatted as follows: EC2_TYPE=t2.small REGION=us-west-2 +.. 
note:: + + For the AWS AMIs, we recommend the following images for each version of Ubuntu: + 20.04:ami-04bad3c587fe60d89, 22.04:ami-03c983f9003cb9cd1, 24.04:ami-0406d1fdd021121cd + Deploy FL Client in the Cloud ============================= As an organization admin for an FL project, you are responsible for setting up your FL Client system. You will receive a Client startup kit either from email, sftp diff --git a/docs/resources/Dockerfile b/docs/resources/Dockerfile index c2c992a651..baefd870d8 100644 --- a/docs/resources/Dockerfile +++ b/docs/resources/Dockerfile @@ -1,7 +1,7 @@ -ARG PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:23.02-py3 +ARG PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:24.03-py3 FROM ${PYTORCH_IMAGE} -ARG NVF_VERSION=2.3 +ARG NVF_VERSION=2.4 ENV NVF_BRANCH=${NVF_VERSION} RUN python3 -m pip install -U pip diff --git a/nvflare/job_config/fed_job.py b/nvflare/job_config/fed_job.py index 4cf5c1d630..5fe038a788 100644 --- a/nvflare/job_config/fed_job.py +++ b/nvflare/job_config/fed_job.py @@ -14,7 +14,7 @@ import os.path import re import uuid -from typing import Any, List, Union +from typing import Any, List, Optional, Union from nvflare.apis.executor import Executor from nvflare.apis.filter import Filter @@ -164,7 +164,13 @@ def _create_server_app(self): class FedJob: - def __init__(self, name="fed_job", min_clients=1, mandatory_clients=None, key_metric="accuracy") -> None: + def __init__( + self, + name: str = "fed_job", + min_clients: int = 1, + mandatory_clients: Optional[List[str]] = None, + key_metric: str = "accuracy", + ) -> None: """FedJob allows users to generate job configurations in a Pythonic way. The `to()` routine allows users to send different components to either the server or clients. @@ -199,9 +205,9 @@ def to( """assign an object to a target (server or clients). Args: - obj: The object to be assigned. The obj will be given a default `id` if non is provided based on its type. - target: The target location of th object. 
Can be "server" or a client name, e.g. "site-1". - tasks: In case object is an `Executor`, optional list of tasks the executor should handle. + obj: The object to be assigned. The obj will be given a default `id` if none is provided based on its type. + target: The target location of the object. Can be "server" or a client name, e.g. "site-1". + tasks: In case object is an `Executor` or `Filter`, optional list of tasks that should be handled. Defaults to `None`. If `None`, all tasks will be handled using `[*]`. gpu: GPU index or list of GPU indices used for simulating the run on that target. filter_type: The type of filter used. Either `FilterType.TASK_RESULT` or `FilterType.TASK_DATA`. @@ -213,10 +219,21 @@ def to( self._validate_target(target) if isinstance(obj, Controller): - if target not in self._deploy_map: - self._deploy_map[target] = ControllerApp(key_metric=self.key_metric) - self._deploy_map[target].add_controller(obj, id) + if target != "server": # add client-side controllers as components + if target not in self._deploy_map: + raise ValueError( + f"{target} doesn't have an `Executor`. Deploy one first before adding client-side controllers!" + ) + self._deploy_map[target].add_component(obj, id) + else: + if target not in self._deploy_map: + self._deploy_map[target] = ControllerApp(key_metric=self.key_metric) + self._deploy_map[target].add_controller(obj, id) elif isinstance(obj, Executor): + if target == "server": + raise ValueError( + f"`Executor` must be assigned to a client, but tried to assign `Executor` {obj} to 'server'!" + ) if target not in self._deploy_map: self._deploy_map[target] = ExecutorApp() if isinstance(obj, ScriptExecutor): @@ -324,7 +341,8 @@ def to_clients( self.to(obj=obj, target=ALL_SITES, tasks=tasks, filter_type=filter_type, id=id) - def as_id(self, obj: Any): + def as_id(self, obj: Any) -> str: + """Generate and return uuid for `obj`. 
If this id is referenced by another added object, this `obj` will also be added as a component.""" id = str(uuid.uuid4()) self._components[id] = obj return id @@ -393,11 +411,13 @@ def _set_all_apps(self): self._deployed = True - def export_job(self, job_root): + def export_job(self, job_root: str): + """Export job config to `job_root` directory with name `self.job_name`.""" self._set_all_apps() self.job.generate_job_config(job_root) - def simulator_run(self, workspace, n_clients: int = None, threads: int = None): + def simulator_run(self, workspace: str, n_clients: int = None, threads: int = None): + """Run the job with the simulator with the `workspace` using `n_clients` and `threads`.""" self._set_all_apps() if ALL_SITES in self.clients and not n_clients: diff --git a/tests/unit_test/job_config/fed_job_test.py b/tests/unit_test/job_config/fed_job_test.py index ffcd739730..e1feb9558d 100644 --- a/tests/unit_test/job_config/fed_job_test.py +++ b/tests/unit_test/job_config/fed_job_test.py @@ -15,19 +15,22 @@ import pytest from nvflare import FedAvg +from nvflare.app_common.abstract.model_learner import ModelLearner +from nvflare.app_common.executors.model_learner_executor import ModelLearnerExecutor from nvflare.job_config.fed_job import FedJob class TestFedJob: def test_validate_targets(self): job = FedJob() - component = FedAvg() + controller = FedAvg() + executor = ModelLearnerExecutor(learner_id=job.as_id(ModelLearner())) - job.to(component, "server") - job.to(component, "site-1") + job.to(controller, "server") + job.to(executor, "site-1") with pytest.raises(Exception): - job.to(component, "site-/1", gpu=0) + job.to(executor, "site-/1", gpu=0) def test_non_empty_target(self): job = FedJob()