diff --git a/3rdParty/tdigest.LICENSE.txt b/3rdParty/fastdigest.LICENSE.txt similarity index 100% rename from 3rdParty/tdigest.LICENSE.txt rename to 3rdParty/fastdigest.LICENSE.txt diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 07e05e2264..dd31762704 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -44,9 +44,6 @@ To collaborate efficiently, please read through this section and follow them. * [Building documentation](#building-the-documentation) * [Signing your work](#signing-your-work) -> Note: - > some package dependencies requires python-dev in local development such as - > python3.12-dev. #### Checking the coding style We check code style using flake8 and isort. diff --git a/examples/advanced/federated-statistics/README.md b/examples/advanced/federated-statistics/README.md index 32b09124ce..0d16d705fe 100644 --- a/examples/advanced/federated-statistics/README.md +++ b/examples/advanced/federated-statistics/README.md @@ -2,7 +2,7 @@ ## Objective NVIDIA FLARE will provide built-in federated statistics operators (controllers and executors) that -can generate global statistics based on local client side statistics. +can generate global statistics based on local client-side statistics. At each client site, we could have one or more datasets (such as "train" and "test" datasets); each dataset may have many features. For each feature in the dataset, we will calculate the statistics and then combine them to produce @@ -19,14 +19,47 @@ The result should be visualized via the visualization utility in the notebook. 
## Assumptions -Assume that clients will provide the following: - * user needs to provide target statistics such as count, histogram only - * user needs to provide the local statistics for the target statistics (by implementing the statistic_spec) - * user needs to provide the data sets and dataset features (feature name, data type) - * * Note: count is always required as we use count to enforce data privacy policy -We only support **numerical features**, not categorical features. But user can return all types of features +Assume that clients will provide the following: +* Users need to provide target statistics such as count, histogram only +* Users need to provide the local statistics for the target statistics (by implementing the statistics_spec) +* Users need to provide the datasets and dataset features (feature name, data type) +* Note: count is always required as we use count to enforce data privacy policy + +We only support **numerical features**, not categorical features. However, users can return all types of features; the non-numerical features will be removed. + +## Statistics + + Federated statistics includes numeric statistics measures for + * count + * mean + * sum + * std_dev + * histogram + * quantile + + We did not include min and max values to avoid data privacy concerns. + +### Quantile + +Quantile statistics refer to statistical measures that divide a probability distribution or dataset into intervals with equal probabilities or proportions. Quantiles help summarize the distribution of data by providing key points that indicate how values are spread. + +#### Key Quantiles: +1. Median (50th percentile): The middle value of a dataset, dividing it into two equal halves. +2. Quartiles (25th, 50th, 75th percentiles): Divide the data into four equal parts: +* Q1 (25th percentile): Lower quartile, below which 25% of the data falls. +* Q2 (50th percentile): Median. +* Q3 (75th percentile): Upper quartile, below which 75% of the data falls. +3. 
Deciles (10th, 20th, ..., 90th percentiles): Divide the data into ten equal parts. +4. Percentiles (1st, 2nd, ..., 99th): Divide the data into 100 equal parts. + +#### Usage of Quantiles: +* Descriptive Statistics: Summarizes the spread of data. +* Outlier Detection: Helps identify extreme values. +* Machine Learning: Used in feature engineering, normalization, and decision tree algorithms. +* Risk Analysis: Used in finance (e.g., Value at Risk, VaR). + ## Examples We provide several examples to demonstrate how should the operators be used. @@ -57,20 +90,21 @@ The main steps are The detailed example instructions can be found [Data frame statistics](df_stats/README.md) + ### COVID 19 Radiology Image Examples -The second example provided is image histogram example. Different from **Tabular** data example, +The second example provided is an image histogram example. Unlike the **Tabular** data example: -The image examples show the followings +The image examples show the following: * The [image_statistics.py](image_stats/jobs/image_stats/app/custom/image_statistics.py) only needs -to calculate the count and histogram target statistics, then user only needs to provide the calculation count, failure_count and histogram functions. There is no need to implement other metrics functions - (sum, mean,std_dev etc.) ( get_failure_count by default return 0 ) -* For each site's dataset, there are several thousands of images, the local histogram is aggregate histogram of all the image histograms. -* The image files are large, we can't load everything in memory, then calculate the statistics. -We will need to iterate through files for each calculation. For single feature, such as example. This is ok. If there are multiple features, -such as multiple channels, reload image to memory for each channel to do histogram calculation is really wasteful. 
-* Unlike [Data frame statistics](df_stats/README.md), the histogram bin's global range is pre-defined by user [0, 256] -where in [Data frame statistics](df_stats/README.md), besides "Age", all other features histogram global bin range +to calculate the count and histogram target statistics. Users only need to provide the calculation count, failure_count and histogram functions. There is no need to implement other metrics functions +(sum, mean, std_dev etc.) (get_failure_count by default returns 0) +* For each site's dataset, there are several thousand images; the local histogram is an aggregate histogram of all the image histograms +* The image files are large, so we can't load everything into memory and then calculate the statistics. +We will need to iterate through files for each calculation. For a single feature, this is acceptable. If there are multiple features, +such as multiple channels, reloading images to memory for each channel to do histogram calculation is wasteful +* Unlike [Data frame statistics](df_stats/README.md), the histogram bin's global range is pre-defined by users [0, 256], +whereas in [Data frame statistics](df_stats/README.md), besides "Age", all other features' histogram global bin range is dynamically estimated based on local min/max values An example of image histogram (the underline image files have only 1 channel) @@ -155,6 +189,7 @@ The main steps are * provide client side configuration to specify data input location * provide hierarchy specification file providing details about all the clients and their hierarchy. 
+ ## Privacy Policy and Privacy Filters NVFLARE provide data privacy protection through privacy filters [privacy-management](https://nvflare.readthedocs.io/en/main/user_guide/security/site_policy_management.html#privacy-management) @@ -178,22 +213,21 @@ defined and job doesn't specify the privacy scope, the job deployment will fail, ### Privacy Policy Instrumentation -There are different ways to set privacy filter depending the use cases +There are different ways to set privacy filters depending on the use cases: #### Set Privacy Policy as researcher You can specify the "task_result_filters" in config_fed_client.json to specify -the privacy control. This is useful when you develop these filters +the privacy control. This is useful when you develop these filters. #### Setup site privacy policy as org admin -Once the company decides to instrument certain privacy policy independent of individual -job, one can copy the local directory privacy.json content to clients' local privacy.json ( merge not overwrite). -in this example, since we only has one app, we can simply copy the private.json from local directory to +Once the company decides to implement certain privacy policies independent of individual +jobs, one can copy the local directory privacy.json content to clients' local privacy.json (merge, not overwrite). +In this example, since we only have one app, we can simply copy the privacy.json from the local directory to: * site-1/local/privacy.json * site-2/local/privacy.json - We need to remove the same filters from the job definition in config_fed_client.json by simply set the "task_result_filters" to empty list to avoid **double filtering** ``` @@ -304,10 +338,7 @@ sequenceDiagram ``` - - ## Summary -We provided federated statistics operators that can easily aggregate and visualize the local statistics for -different data site and features. We hope this feature will make it easier to perform federated data analysis. 
- +We provided federated statistics operators that can easily aggregate and visualize the local statistics for +different data sites and features. We hope this feature will make it easier to perform federated data analysis. \ No newline at end of file diff --git a/examples/advanced/federated-statistics/df_stats/README.md b/examples/advanced/federated-statistics/df_stats/README.md index 65900aeb5f..6c50b8b8d8 100644 --- a/examples/advanced/federated-statistics/df_stats/README.md +++ b/examples/advanced/federated-statistics/df_stats/README.md @@ -17,6 +17,52 @@ cd NVFlare/examples/advanced/federated-statistics/df_stats pip install -r requirements.txt ``` + +## Install fastdigest + +If you intend to calculate quantiles, you need to install fastdigest. + +``` +pip install fastdigest==0.4.0 +``` + +On Ubuntu, you might get the following error: + + Cargo, the Rust package manager, is not installed or is not on PATH. + This package requires Rust and Cargo to compile extensions. Install it through + the system's package manager or via https://rustup.rs/ + + Checking for Rust toolchain.... + +This is because fastdigest (or its dependencies) requires Rust and Cargo to build. + +You need to install Rust and Cargo on your Ubuntu system. Follow these steps: +Install Rust and Cargo +Run the following command to install Rust using rustup: + +``` +cd NVFlare/examples/advanced/federated-statistics/df_stats +./install_cargo.sh +``` + +Then you can install fastdigest again: +``` +pip install fastdigest==0.4.0 +``` + +### Quantile Calculation + +To calculate federated quantiles, we needed to select a package that satisfies the following constraints: + +* Works in distributed systems +* Does not copy the original data (avoiding privacy leaks) +* Avoids transmitting large amounts of data +* Ideally, no system-level dependency + +We chose the fastdigest Python package, a Rust-based package. A t-digest only carries the cluster coordinates; initially, each data point is in its own cluster. 
By default, we will compress with max_bin = sqrt(datasize) to compress the coordinates, so the data won't leak. You can always override max_bins if you prefer more or less compression. + + + ## 1. Prepare data In this example, we are using UCI (University of California, Irvine) [adult dataset](https://archive.ics.uci.edu/dataset/2/adult) @@ -165,8 +211,12 @@ statistics computing, we will only need to provide the followings "stddev": {}, "histogram": { "*": {"bins": 10 }, "Age": {"bins": 5, "range":[0,120]} - } + }, + "quantile": { + "*": [25, 50, 75] + } }, + "writer_id": "stats_writer" } } @@ -195,7 +245,8 @@ in FLARE job store. ### 5.2 client side configuration -First, we specify the built-in client side executor: `StatisticsExecutor`, which takes a local stats generator Id +First, we specify the built-in client side executor: `StatisticsExecutor`, which takes a local stats generator ID + ``` "executor": { @@ -248,7 +299,7 @@ In this example, task_result_filters is defined as task privacy filter : `Statis `StatisticsPrivacyFilter` is using three separate the `StatisticsPrivacyCleanser`, you can find more details in [local privacy policy](../local/privacy.json) and in later discussion on privacy. -The privacy cleansers specify policy can be find in +The privacy cleansers specify policies can be found in ``` "components": [ { @@ -311,6 +362,8 @@ to calculate the local statistics, we will need to implements few methods def histogram(self, dataset_name: str, feature_name: str, num_of_bins: int, global_min_value: float, global_max_value: float) -> Histogram: + def quantiles(self, dataset_name: str, feature_name: str, percentiles: List) -> Dict: + ``` since some of features do not provide histogram bin range, we will need to calculate based on local min/max to estimate the global min/max, and then use the global bin/max as the range for all clients' histogram bin range. 
diff --git a/examples/advanced/federated-statistics/df_stats/demo/visualization.ipynb b/examples/advanced/federated-statistics/df_stats/demo/visualization.ipynb index 283f5279b2..f03648716a 100644 --- a/examples/advanced/federated-statistics/df_stats/demo/visualization.ipynb +++ b/examples/advanced/federated-statistics/df_stats/demo/visualization.ipynb @@ -37,7 +37,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "c44a0217", "metadata": { "tags": [] @@ -81,7 +81,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "93c62d5e", "metadata": { "tags": [] @@ -271,9 +271,9 @@ ], "metadata": { "kernelspec": { - "display_name": "nvflare_example", + "display_name": "nvflare-env", "language": "python", - "name": "nvflare_example" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -285,7 +285,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.2" + "version": "3.8.13" } }, "nbformat": 4, diff --git a/examples/advanced/federated-statistics/df_stats/install_cargo.sh b/examples/advanced/federated-statistics/df_stats/install_cargo.sh new file mode 100755 index 0000000000..6fcbc255c6 --- /dev/null +++ b/examples/advanced/federated-statistics/df_stats/install_cargo.sh @@ -0,0 +1,15 @@ + +# fastdigest (or its dependencies) requires Rust and Cargo to build. +# You need to install Rust and Cargo on your Ubuntu system. 
Follow these steps: +# Install Rust and Cargo +# Run the following command to install Rust using rustup: + + +curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh +# Then restart your terminal or run: + +source $HOME/.cargo/env +# Verify Installation +# Check if Rust and Cargo are installed correctly: +rustc --version +cargo --version \ No newline at end of file diff --git a/examples/advanced/federated-statistics/df_stats/job_api/df_statistics.py b/examples/advanced/federated-statistics/df_stats/job_api/df_statistics.py index 5078c2f0a9..942bbf692d 100644 --- a/examples/advanced/federated-statistics/df_stats/job_api/df_statistics.py +++ b/examples/advanced/federated-statistics/df_stats/job_api/df_statistics.py @@ -21,10 +21,10 @@ class DFStatistics(DFStatisticsCore): - def __init__(self, data_path): + def __init__(self, filename, data_root_dir="/tmp/nvflare/df_stats/data"): super().__init__() - self.data_root_dir = "/tmp/nvflare/df_stats/data" - self.data_path = data_path + self.data_root_dir = data_root_dir + self.filename = filename self.data: Optional[Dict[str, pd.DataFrame]] = None self.data_features = [ "Age", @@ -57,7 +57,7 @@ def load_data(self, fl_ctx: FLContext) -> Dict[str, pd.DataFrame]: self.log_info(fl_ctx, f"load data for client {client_name}") try: skip_rows = self.skip_rows[client_name] - data_path = f"{self.data_root_dir}/{fl_ctx.get_identity_name()}/{self.data_path}" + data_path = f"{self.data_root_dir}/{fl_ctx.get_identity_name()}/{self.filename}" # example of load data from CSV df: pd.DataFrame = pd.read_csv( data_path, names=self.data_features, sep=r"\s*,\s*", skiprows=skip_rows, engine="python", na_values="?" 
diff --git a/examples/advanced/federated-statistics/df_stats/job_api/df_stats_job.py b/examples/advanced/federated-statistics/df_stats/job_api/df_stats_job.py index 696d57170c..39aafe312b 100644 --- a/examples/advanced/federated-statistics/df_stats/job_api/df_stats_job.py +++ b/examples/advanced/federated-statistics/df_stats/job_api/df_stats_job.py @@ -20,9 +20,9 @@ def define_parser(): parser = argparse.ArgumentParser() - parser.add_argument("-n", "--n_clients", type=int, default=3) - parser.add_argument("-d", "--data_root_dir", type=str, nargs="?", default="/tmp/nvflare/dataset/output") - parser.add_argument("-o", "--stats_output_path", type=str, nargs="?", default="statistics/stats.json") + parser.add_argument("-n", "--n_clients", type=int, default=2) + parser.add_argument("-d", "--data_root_dir", type=str, nargs="?", default="/tmp/nvflare/df_stats/data") + parser.add_argument("-o", "--stats_output_path", type=str, nargs="?", default="statistics/adults_stats.json") parser.add_argument("-j", "--job_dir", type=str, nargs="?", default="/tmp/nvflare/jobs/stats_df") parser.add_argument("-w", "--work_dir", type=str, nargs="?", default="/tmp/nvflare/jobs/stats_df/work_dir") parser.add_argument("-co", "--export_config", action="store_true", help="config only mode, export config") @@ -45,12 +45,11 @@ def main(): "mean": {}, "sum": {}, "stddev": {}, - "histogram": {"*": {"bins": 20}}, - "Age": {"bins": 20, "range": [0, 10]}, - "percentile": {"*": [25, 50, 75], "Age": [50, 95]}, + "histogram": {"*": {"bins": 20}, "Age": {"bins": 20, "range": [0, 100]}}, + "quantile": {"*": [0.1, 0.5, 0.9], "Age": [0.1, 0.5, 0.9]}, } # define local stats generator - df_stats_generator = DFStatistics(data_root_dir=data_root_dir) + df_stats_generator = DFStatistics(filename="data.csv", data_root_dir=data_root_dir) job = StatsJob( job_name="stats_df", @@ -63,6 +62,7 @@ def main(): job.setup_clients(sites) if export_config: + print("Exporting job config...", job_dir) job.export_job(job_dir) 
else: job.simulator_run(work_dir) diff --git a/examples/advanced/federated-statistics/df_stats/jobs/df_stats/app/config/config_fed_server.json b/examples/advanced/federated-statistics/df_stats/jobs/df_stats/app/config/config_fed_server.json index 58e4b861fb..62c4d3c8c5 100644 --- a/examples/advanced/federated-statistics/df_stats/jobs/df_stats/app/config/config_fed_server.json +++ b/examples/advanced/federated-statistics/df_stats/jobs/df_stats/app/config/config_fed_server.json @@ -19,7 +19,7 @@ "range": [0,120] } }, - "percentile": { + "quantile": { "*": [25, 50, 75] } }, diff --git a/examples/advanced/federated-statistics/df_stats/requirements.txt b/examples/advanced/federated-statistics/df_stats/requirements.txt index c766bd7827..5ae34037be 100644 --- a/examples/advanced/federated-statistics/df_stats/requirements.txt +++ b/examples/advanced/federated-statistics/df_stats/requirements.txt @@ -2,4 +2,4 @@ numpy pandas matplotlib jupyterlab -tdigest + diff --git a/examples/advanced/streaming/src/simple_controller.py b/examples/advanced/streaming/src/simple_controller.py index 206346e8f2..3c2d5a56d9 100644 --- a/examples/advanced/streaming/src/simple_controller.py +++ b/examples/advanced/streaming/src/simple_controller.py @@ -25,7 +25,6 @@ class SimpleController(Controller): - def control_flow(self, abort_signal: Signal, fl_ctx: FLContext): logger.info(f"Entering control loop of {self.__class__.__name__}") engine = fl_ctx.get_engine() diff --git a/examples/advanced/streaming/src/standalone_file_streaming.py b/examples/advanced/streaming/src/standalone_file_streaming.py index 1c4c44b87b..e714cb237c 100644 --- a/examples/advanced/streaming/src/standalone_file_streaming.py +++ b/examples/advanced/streaming/src/standalone_file_streaming.py @@ -27,7 +27,6 @@ class FileSender(FLComponent): - def __init__(self): super().__init__() self.seq = 0 @@ -73,7 +72,6 @@ def _sending_file(self, fl_ctx): class FileReceiver(FLComponent): - def __init__(self): super().__init__() self.done 
= False diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.0_introduction/introduction.ipynb b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.0_introduction/introduction.ipynb index 260449d93b..d1454a14d4 100644 --- a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.0_introduction/introduction.ipynb +++ b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.0_introduction/introduction.ipynb @@ -4,43 +4,33 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Recap: Runing Federated Learning Applications\n", - "\n", - "\n", - "In this chapter, we will explore the process of running federated learning applications. We will start by setting up the environment and preparing the data, followed by training a classifier using PyTorch. We will then convert deep learning models to federated learning, customize server and client logic, and setup track experiments. Finally, we will delve into the job structure and configurations, including running a simulator, and conclude with a recap of the covered topics.\n", + "# Running Federated Learning Applications\n", "\n", + "In this chapter, we will explore the process of running federated learning applications. We will start by setting up the environment and preparing the data, followed by training a classifier using PyTorch. We will then convert deep learning models to federated learning, customize server and client logic, and set up experiment tracking. Finally, we will delve into the job structure and configurations, including running a simulator, and conclude with a recap of the covered topics.\n", "\n", "1. 
**Running federated learning job**\n", " * [Installation, prepare data](../01.1_running_federated_learning_job/setup.ipynb)\n", - " * [traing classifier with pytorch](../01.1_running_federated_learning_job/runing_pytorch_fl_job.ipynb)\n", - "\n", - "2. **From stand-alone-deep learning to Federated Learning**\n", - "\n", - " * [Convert deep learning with pytorch to federated leraning](../01.2_convert_deep_learning_to_federated_learning/convert_dl_to_fl.ipynb)\n", - "\n", - "\n", - "2. **How to Customize the Federated Algorithms**\n", + " * [Training classifier with PyTorch](../01.1_running_federated_learning_job/running_pytorch_fl_job.ipynb)\n", "\n", - " * [customize server logics](../01.3_customize_server_logics/customize_server_logics.ipynb)\n", + "2. **From stand-alone deep learning to Federated Learning**\n", + " * [Convert deep learning with PyTorch to federated learning](../01.2_convert_deep_learning_to_federated_learning/convert_dl_to_fl.ipynb)\n", "\n", - "4. **How to make adjustments to different traing parameters** \n", + "3. **Server Side Customization**\n", + " * [customize server logics](../01.3_server_side_customization/customize_server_logics.ipynb)\n", "\n", - " * [customize client logics](../01.4_customize_client_training/customize_client_training.ipynb)\n", + "4. **Client Side Customization** \n", + " * [customize client training](../01.4_client_side_customization/customize_client_training.ipynb)\n", "\n", "5. **Tracking the training metrics** \n", - "\n", - " * [experiment tracking](../01.5_experiment_tracking/experiment_tracking.ipynb )\n", + " * [Experiment tracking](../01.5_experiment_tracking/experiment_tracking.ipynb)\n", "\n", "6. **Job structure and configurations**\n", + " * [Job structure & configuration](../01.6_job_structure_and_configuration/understanding_fl_job.ipynb)\n", "\n", - " * [job structure & configuration ](../01.6_job_structure_and_configuration/01.1.6.1_understanding_fl_job.ipynb)\n", - "\n", - " \n", - "7. 
[Recap of the covered topics](../01.7_recap/recap.ipynb)\n", - "\n", + "7. **Logging configurations**\n", + " * [Logging](../01.7_logging/logging.ipynb)\n", "\n", - "\n", - "Let's get started with [Installation & data preparation](.././01.1_running_federated_learning_job/setup.ipynb)\n" + "Let's get started with [Installation & data preparation](../01.1_running_federated_learning_job/setup.ipynb)" ] }, { @@ -50,8 +40,14 @@ } ], "metadata": { + "kernelspec": { + "display_name": "nvflare-env", + "language": "python", + "name": "python3" + }, "language_info": { - "name": "python" + "name": "python", + "version": "3.8.13" } }, "nbformat": 4, diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.1_running_federated_learning_job/code/fl_job.py b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.1_running_federated_learning_job/code/fl_job.py index 5cd4c6de70..15ed34823e 100644 --- a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.1_running_federated_learning_job/code/fl_job.py +++ b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.1_running_federated_learning_job/code/fl_job.py @@ -32,4 +32,4 @@ ) job.to(executor, f"site-{i + 1}") - job.simulator_run(workspace="/tmp/nvflare/jobs/workdir", log_config="./log_config.json") + job.simulator_run(workspace="/tmp/nvflare/jobs/workdir") diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.1_running_federated_learning_job/code/log_config.json 
b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.1_running_federated_learning_job/code/log_config.json deleted file mode 100644 index 240e9616eb..0000000000 --- a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.1_running_federated_learning_job/code/log_config.json +++ /dev/null @@ -1,93 +0,0 @@ -{ - "version": 1, - "disable_existing_loggers": false, - "formatters": { - "baseFormatter": { - "()": "nvflare.fuel.utils.log_utils.BaseFormatter", - "fmt": "%(asctime)s - %(name)s - %(levelname)s - %(fl_ctx)s - %(message)s" - }, - "colorFormatter": { - "()": "nvflare.fuel.utils.log_utils.ColorFormatter", - "fmt": "%(asctime)s - %(levelname)s - %(message)s", - "datefmt": "%Y-%m-%d %H:%M:%S" - }, - "jsonFormatter": { - "()": "nvflare.fuel.utils.log_utils.JsonFormatter", - "fmt": "%(asctime)s - %(identity)s - %(name)s - %(fullName)s - %(levelname)s - %(fl_ctx)s - %(message)s" - } - }, - "filters": { - "FLFilter": { - "()": "nvflare.fuel.utils.log_utils.LoggerNameFilter", - "logger_names": ["custom", "nvflare.app_common", "nvflare.app_opt"] - } - }, - "handlers": { - "consoleHandler": { - "class": "logging.StreamHandler", - "level": "INFO", - "formatter": "colorFormatter", - "filters": ["FLFilter"], - "stream": "ext://sys.stdout" - }, - "logFileHandler": { - "class": "logging.handlers.RotatingFileHandler", - "level": "DEBUG", - "formatter": "baseFormatter", - "filename": "log.txt", - "mode": "a", - "maxBytes": 20971520, - "backupCount": 10 - }, - "errorFileHandler": { - "class": "logging.handlers.RotatingFileHandler", - "level": "ERROR", - "formatter": "baseFormatter", - "filename": "log_error.txt", - "mode": "a", - "maxBytes": 20971520, - "backupCount": 10 - }, - "jsonFileHandler": { - "class": "logging.handlers.RotatingFileHandler", - "level": "DEBUG", - "formatter": "jsonFormatter", - "filename": "log.json", - 
"mode": "a", - "maxBytes": 20971520, - "backupCount": 10 - }, - "FLFileHandler": { - "class": "logging.handlers.RotatingFileHandler", - "level": "DEBUG", - "formatter": "baseFormatter", - "filters": ["FLFilter"], - "filename": "log_fl.txt", - "mode": "a", - "maxBytes": 20971520, - "backupCount": 10, - "delay": true - } - }, - "loggers": { - "root": { - "level": "INFO", - "handlers": ["consoleHandler", "logFileHandler", "errorFileHandler", "jsonFileHandler", "FLFileHandler"] - }, - "nvflare.app_opt.pt.client_api_launcher_executor.PTClientAPILauncherExecutor": { - "level": "ERROR" - }, - "nvflare.app_opt.pt.in_process_client_api_executor.PTInProcessClientAPIExecutor": { - "level": "ERROR" - } - } -} - - - - - - - - - diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.1_running_federated_learning_job/runing_pytorch_fl_job.ipynb b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.1_running_federated_learning_job/running_pytorch_fl_job.ipynb similarity index 61% rename from examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.1_running_federated_learning_job/runing_pytorch_fl_job.ipynb rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.1_running_federated_learning_job/running_pytorch_fl_job.ipynb index 87e84f39c9..990a62f9db 100644 --- a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.1_running_federated_learning_job/runing_pytorch_fl_job.ipynb +++ b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.1_running_federated_learning_job/running_pytorch_fl_job.ipynb @@ -5,9 +5,9 
@@ "id": "7a5c3d67-a6ea-4f59-84d2-effc3ef016e1", "metadata": {}, "source": [ - "# Runing Federated Learning Job with PyTorch\n", + " # Running Federated Learning Job with PyTorch\n", "\n", - "We have installed the NVIDIA FLARE, dependencies, download the data, look at the data split in [previous step](01.1.1_setup.ipynb), now we are going to look at the training. " + "We have installed NVIDIA FLARE and its dependencies, downloaded the data, and looked at the data split in the [previous step](01.1.1_setup.ipynb). Now we are going to look at the training.\n" ] }, { @@ -15,18 +15,22 @@ "id": "eb3f04b0", "metadata": {}, "source": [ - "## Run Federated Learning Training code\n", + " ## Run Federated Learning Training Code\n", "\n", + "The training code essentially consists of three files:\n", + "- `fl_job.py`: the main job flow\n", + "- `client.py`: the client-side training code\n", + "- `network.py`: the network model definition\n", + "\n", + "We use the `FedAvg` algorithm and workflow.\n", "\n", - "The training code essentially consists of `fl_job.py` code, `client.py` the client-side training code, and `nn.py` network model. We use the `FedAvg` algorithm and workflow.\n", "\n", - "```markdown\n", "## Run Federated Learning Training\n", "\n", - "The training code consists of three main scripts: `fl_job.py` for the overall job flow, `client.py` for the client-side training, and `network.py` for the network model. We use the built-in `FedAvg` algorithm for the server side worklow.\n", - "```\n", + "The training code consists of three main scripts: `fl_job.py` for the overall job flow, `client.py` for the client-side training, and `network.py` for the network model. 
We use the built-in `FedAvg` algorithm for the server-side workflow.\n", + "\n", "\n", - "to run the training in simulator we can simply execute the fl_job.py" + "To run the training in the simulator, we can simply execute `fl_job.py`.\n" ] }, { @@ -62,18 +66,18 @@ "id": "e823b5d4", "metadata": {}, "source": [ - "## 3. Access the logs and results\n", + " ## 3. Access the Logs and Results\n", "\n", - "You can find the running logs and results inside the simulator's workspace:\n", + "You can find the running logs and results inside the simulator's workspace.\n", "\n", - "noticed the \"fl_job.py\", we used the code \n", + "Notice that in `fl_job.py`, we used the code:\n", "\n", - "```\n", + "```python\n", "job.simulator_run(\"/tmp/nvflare/jobs/workdir\")\n", - "\n", "```\n", "\n", - "The \"/tmp/nvflare/jobs/workdir\" is the workspace directory of simulator\n" + "The \"/tmp/nvflare/jobs/workdir\" is the workspace directory of the simulator.\n", + "\n" ] }, { @@ -92,15 +96,21 @@ "id": "d4b62108", "metadata": {}, "source": [ - "We have successfully train a federated image classification model with pytorch. Next we need to take closer look the training codes and job structure. Let's go to [converting deep learning to federated learning](../01.1.2_convert_deep_learning_to_federated_learning/convert_dl_to_fl.ipynb)\n" + "We have successfully trained a federated image classification model with PyTorch. Next, we need to take a closer look at the training code and job structure. Let's go to [converting deep learning to federated learning](../01.2_convert_deep_learning_to_federated_learning/convert_dl_to_fl.ipynb)." 
] + }, + { + "cell_type": "markdown", + "id": "4406a33e", + "metadata": {}, + "source": [] } ], "metadata": { "kernelspec": { "display_name": "nvflare_env", "language": "python", - "name": "python3" + "name": "nvflare_env" }, "language_info": { "codemirror_mode": { diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.1_running_federated_learning_job/setup.ipynb b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.1_running_federated_learning_job/setup.ipynb index bbf9a7e923..08da12882a 100644 --- a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.1_running_federated_learning_job/setup.ipynb +++ b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.1_running_federated_learning_job/setup.ipynb @@ -5,11 +5,12 @@ "id": "7a5c3d67-a6ea-4f59-84d2-effc3ef016e1", "metadata": {}, "source": [ - "# Setup and Prepare Data\n", + "# Setup and Preparation\n", "\n", - "This example of using [NVIDIA FLARE](https://nvflare.readthedocs.io/en/main/index.html) to train an image classifier using federated averaging ([FedAvg](https://arxiv.org/abs/1602.05629))\n", + "This is an example of using [NVIDIA FLARE](https://nvflare.readthedocs.io/en/main/index.html) to train an image classifier using federated averaging ([FedAvg](https://arxiv.org/abs/1602.05629))\n", "and [PyTorch](https://pytorch.org/) as the deep learning training framework.\n", "\n", + "\n", "We will use the train script [cifar10_fl.py](src/cifar10_fl.py) and network [net.py](src/net.py) from the src directory.\n", "\n", "The dataset will be [CIFAR-10](https://www.cs.toronto.edu/~kriz/cifar.html) dataset and will load its data within the client train code." 
@@ -72,7 +73,7 @@ "\n", "The CIFAR10 data will be downloaded to the common location. To make this process easiler, we wrote an simple download program like the followings\n", "\n", - "```\n", + "```python\n", "\n", "import argparse\n", "import torchvision.datasets as datasets\n", @@ -99,15 +100,26 @@ "id": "bcba6293", "metadata": {}, "source": [ - "The program just take a root dataset_path and download the training and test dataset to the given root directory from torchvision dataset. Let run the code. " + " The program just takes a root dataset_path and downloads the training and test datasets to the given root directory from the torchvision dataset. Let's run the code." ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "87a13909", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to /tmp/nvflare/data/cifar10/cifar-10-python.tar.gz\n", + "100%|████████████████████████████████████████| 170M/170M [00:05<00:00, 28.6MB/s]\n", + "Extracting /tmp/nvflare/data/cifar10/cifar-10-python.tar.gz to /tmp/nvflare/data/cifar10\n", + "Files already downloaded and verified\n" + ] + } + ], "source": [ "!python3 code/data/download.py" ] @@ -122,10 +134,28 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "08bbe572", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[01;34m/tmp/nvflare/data/cifar10/cifar-10-batches-py/\u001b[0m\n", + "├── \u001b[00mbatches.meta\u001b[0m\n", + "├── \u001b[00mdata_batch_1\u001b[0m\n", + "├── \u001b[00mdata_batch_2\u001b[0m\n", + "├── \u001b[00mdata_batch_3\u001b[0m\n", + "├── \u001b[00mdata_batch_4\u001b[0m\n", + "├── \u001b[00mdata_batch_5\u001b[0m\n", + "├── \u001b[00mreadme.html\u001b[0m\n", + "└── \u001b[00mtest_batch\u001b[0m\n", + "\n", + "0 directories, 8 files\n" + ] + } + ], 
"source": [ "!tree /tmp/nvflare/data/cifar10/cifar-10-batches-py/" ] @@ -137,8 +167,8 @@ "source": [ "### Split the data\n", "\n", - "In real-world scenarios, the data will be distributed among different clients/sides. Since we are simulating the real-world data, we need to split the data into different clients/sites. How to split the data, \n", - "depending on the type of problem or type of data. For simplicity, in this example we assume all clients will have the same data for horizontal federated learning cases.\n", + "In real-world scenarios, the data will be distributed among different clients/sites. Since we are simulating real-world data, we need to split the data into different clients/sites. How to split the data\n", + "depends on the type of problem or type of data. For simplicity, in this example we assume all clients will have the same data for horizontal federated learning cases.\n", "Thus we do not do a data split, but rather point all clients to the same data location.\n", "\n", "\n", @@ -148,6 +178,7 @@ "\n", "\n", "\n", + "\n", "\n" ] }, @@ -156,7 +187,7 @@ "id": "316bae55", "metadata": {}, "source": [ - "Next Step, we will start to run training using simulation: [run pytorch federated learning job](../01.1_running_federated_learning_job/runing_pytorch_fl_job.ipynb)\n" + "Next step, we will start to run training using simulation: [run pytorch federated learning job](../01.1_running_federated_learning_job/running_pytorch_fl_job.ipynb)\n" ] }, { @@ -168,9 +199,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "nvflare_env", "language": "python", - "name": "python3" + "name": "nvflare_env" }, "language_info": { "codemirror_mode": { diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.2_convert_deep_learning_to_federated_learning/convert_dl_to_fl.ipynb 
b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.2_convert_deep_learning_to_federated_learning/convert_dl_to_fl.ipynb index 208177e0be..8f980292c6 100644 --- a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.2_convert_deep_learning_to_federated_learning/convert_dl_to_fl.ipynb +++ b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.2_convert_deep_learning_to_federated_learning/convert_dl_to_fl.ipynb @@ -7,9 +7,10 @@ "source": [ "# PyTorch Deep Learning to Federated Learning Conversion\n", "\n", - "One common question frequently heard from data scientists is how do I wrote a federated learning ? If I already have training code already for deep learning? how do I write an federated learning training code for the same problem?\n", "\n", - "In this section, we will look at the classification training code we ran earlier and see how to convert the existing the pytorch training script to federated Learning client training code\n", + "One common question frequently heard from data scientists is \"how do I write federated learning code? 
If I already have training code for deep learning, how do I write federated learning training code for the same problem?\"\n", + "\n", + "In this section, we will look at the classification training code we ran earlier and see how to convert the existing PyTorch training script to federated learning client training code.\n", "\n", "\n", "## Orginal Deep learning Training Script" @@ -61,21 +62,24 @@ "\n", "we call \n", "\n", - "```\n", + "```python\n", "flare.init()\n", "```\n", "\n", "Once the flare is initialized, we will recieve some system metadata for example\n", - "```\n", + "\n", + "```python\n", + "\n", " sys_info = flare.system_info()\n", " client_name = sys_info[\"site_name\"]\n", "\n", "```\n", "We can get current client's \"identity\". \n", "\n", - "Next we need to extends the trainig beyond local iterations. Image the Federated Learning is like the following for-loop: \n", + "Next we need to extend the training beyond local iterations. Imagine the Federated Learning is like the following for-loop:\n", + "\n", + "```python\n", "\n", - "```\n", "rounds = 5\n", "for current_round in ranage (rounds):\n", " \n", @@ -94,9 +98,10 @@ "For each round: we need to receive and evaluate the global model. 
\n", "\n", "\n", - "**Step-4** Recive global model \n", + "**Step-4** Receive global model\n", + "\n", + "```python\n", "\n", - "```\n", " input_model = flare.receive()\n", " round=input_model.current_round\n", "\n", @@ -104,23 +109,23 @@ " model.load_state_dict(input_model.params)\n", "```\n", "\n", - "**Step-5** Eveluate Global Model\n", + "**Step-5** Evaluate Global Model\n", + "\n", + "Since the local model is being updated with global model, the training procedure calculates the loss which evaluates the model\n", "\n", - " Since the local model is being updated with global model, the training procedue caclate the loss which evaluate the model \n", "\n", "**Step-6** Send the local trained model back to aggregator\n", "\n", - " we take the newly trained local model parameters as well as metadata, sned it back to aggregator. \n", + "We take the newly trained local model parameters as well as metadata, send it back to aggregator.\n", "\n", - "```\n", + "```python\n", "\n", " output_model = flare.FLModel( params=model.cpu().state_dict(), meta={\"NUM_STEPS_CURRENT_ROUND\": steps},)\n", "\n", " flare.send(output_model)\n", "```\n", "\n", - "\n", - "With above steps, just a few lines of code changes, no code structural changes, we converted the pytorch deep learning code to federated learning with NVIDIA FLARE\n", + "With above steps, just a few lines of code changes, no code structural changes, we converted the PyTorch deep learning code to federated learning with NVIDIA FLARE.\n", "\n", "The complete code can be found at client.py" ] @@ -140,19 +145,19 @@ "id": "7f1824bf", "metadata": {}, "source": [ - "Now, we converted the client pytorch training script to federated learning code. Lets look further to handle multi-task client code\n", + "Now, we converted the client PyTorch training script to federated learning code. 
Let's look further to handle multi-task client code.\n", "\n", "\n", "## Multi-Task Client Scripts\n", "\n", - "So far, the client only handles traing, regardless what tasks the server issues to the clients. What if there are many tasks ? Client should take different actions based on the different tasks. Also, in previous version, we did not evaluate the global model. We are also to handle all these in this section. \n", + "So far, the client only handles training, regardless of what tasks the server issues to the clients. What if there are many tasks? Client should take different actions based on the different tasks. Also, in the previous version, we did not evaluate the global model. We are going to handle all these in this section.\n", "\n", "\n", - "In Flare's Client API, by detault, we will issue three different tasks: \"train\", \"evaluate\" and \"submit_model\"\n", + "In Flare's Client API, by default, we will issue three different tasks: \"train\", \"evaluate\" and \"submit_model\"\n", "\n", "These three tasks can be checked by \n", "\n", - "```\n", + "```python\n", "\n", "flare.is_train()\n", "\n", @@ -162,7 +167,7 @@ "\n", "```\n", "\n", - "So we need to motify our existing training code to have both training and evaluation logics\n", + "So we need to modify our existing training code to have both training and evaluation logics.\n", "\n", "### Training logics changes\n", "\n", @@ -171,23 +176,25 @@ "\n", "evaluate the local model: \n", "\n", - "```\n", + "```python\n", " # (5.2) evaluation on local trained model to save best model\n", " local_accuracy = evaluate(net.state_dict())\n", "\n", "\n", "```\n", "\n", - "evalute the global model received \n", + "evaluate the global model received:\n", + "\n", + "```python\n", "\n", - "```\n", " # (5.3) evaluate on received model for model selection\n", " accuracy = evaluate(input_model.params)\n", "```\n", "\n", "Then add the global model accuracy into the metrics parameter of the FLModel before send it back to 
server. \n", "\n", - "```\n", + "```python\n", + "\n", " output_model = flare.FLModel(\n", " params=net.cpu().state_dict(),\n", " metrics={\"accuracy\": accuracy},\n", @@ -201,7 +208,7 @@ ">Note: the evaluate() function will discussed next\n", "\n", "\n", - "```\n", + "```python\n", " \n", "\n", " # (5.2) evaluation on local trained model to save best model\n", @@ -235,7 +242,7 @@ "The return value is accuracy percentage. \n", "\n", "\n", - "```\n", + "```python\n", "\n", " # wraps evaluation logic into a method to re-use for\n", " # evaluation on both trained and received model\n", @@ -270,7 +277,8 @@ "\n", "The overall logics becomes\n", "\n", - "```\n", + "```python\n", + "\n", "if flare.is_training(): \n", " traing and evaluate metrics\n", " send model and merics back\n", @@ -313,7 +321,7 @@ "source": [ "Now, we know how to convert an existing Deep Learning code to Federated Learning training script. We can now explore how to customize the training logics. \n", "\n", - "Please checkout [customize server logics](../01.1.3_customize_server_logics/customize_server_logics.ipynb)\n", + "Please checkout [server side customization](../01.3_server_side_customization/customize_server_logics.ipynb)\n", "\n" ] }, diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.3_customize_server_logics/code/data/download.py b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.3_server_side_customization/code/data/download.py similarity index 100% rename from examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.3_customize_server_logics/code/data/download.py rename to 
examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.3_server_side_customization/code/data/download.py diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.3_customize_server_logics/code/fl_job.py b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.3_server_side_customization/code/fl_job.py similarity index 100% rename from examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.3_customize_server_logics/code/fl_job.py rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.3_server_side_customization/code/fl_job.py diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.3_customize_server_logics/code/requirements.txt b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.3_server_side_customization/code/requirements.txt similarity index 100% rename from examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.3_customize_server_logics/code/requirements.txt rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.3_server_side_customization/code/requirements.txt diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.3_customize_server_logics/code/src/client.py 
b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.3_server_side_customization/code/src/client.py similarity index 100% rename from examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.3_customize_server_logics/code/src/client.py rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.3_server_side_customization/code/src/client.py diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.3_customize_server_logics/code/src/fedavg_v0.py b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.3_server_side_customization/code/src/fedavg_v0.py similarity index 100% rename from examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.3_customize_server_logics/code/src/fedavg_v0.py rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.3_server_side_customization/code/src/fedavg_v0.py diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.3_customize_server_logics/code/src/fedavg_v1.py b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.3_server_side_customization/code/src/fedavg_v1.py similarity index 99% rename from examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.3_customize_server_logics/code/src/fedavg_v1.py rename to 
examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.3_server_side_customization/code/src/fedavg_v1.py index b0508d33eb..e8d555bea5 100644 --- a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.3_customize_server_logics/code/src/fedavg_v1.py +++ b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.3_server_side_customization/code/src/fedavg_v1.py @@ -23,7 +23,6 @@ class FedAvgV1(BaseFedAvg): - def __init__( self, *args, diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.3_customize_server_logics/code/src/fedavg_v2.py b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.3_server_side_customization/code/src/fedavg_v2.py similarity index 100% rename from examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.3_customize_server_logics/code/src/fedavg_v2.py rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.3_server_side_customization/code/src/fedavg_v2.py diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.3_customize_server_logics/code/src/network.py b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.3_server_side_customization/code/src/network.py similarity index 100% rename from 
examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.3_customize_server_logics/code/src/network.py rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.3_server_side_customization/code/src/network.py diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.3_customize_server_logics/customize_server_logics.ipynb b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.3_server_side_customization/customize_server_logics.ipynb similarity index 85% rename from examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.3_customize_server_logics/customize_server_logics.ipynb rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.3_server_side_customization/customize_server_logics.ipynb index ed22affd71..6d436f9d1c 100644 --- a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.3_customize_server_logics/customize_server_logics.ipynb +++ b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.3_server_side_customization/customize_server_logics.ipynb @@ -8,16 +8,14 @@ "\n", "# Customizing Federated Learning Server logics\n", "\n", + "In previous sections, we were able to run federated PyTorch image classification code with NVIDIA FLARE's built-in FedAvg algorithm.\n", + "What if we want to build our own algorithms or modify the existing algorithm?\n", "\n", - "In previous sections, we are able to run federated pytorch image classification 
code with NVIDIA FLARE builtin FedAvg algorithm. \n", - "What if we want to build my own algorithms or modify the existing algorithm ? \n", - "\n", - "In the following, using FedAvg as starting point, we like to make a few changes to FedAvg to fit our needs: \n", - "\n", - "* Add early stopping mechanism so that the training could stop instead of waiting to the total numbers of rounds if the criteria is statisfied\n", - "* Instead of rely on the internal best model selection approach, we want to provide our own best model selection\n", - "* Instead of using building persiste component PTFileModelPersistor, we like to have our own save and loading functions\n", + "In the following, using FedAvg as a starting point, we would like to make a few changes to FedAvg to fit our needs:\n", "\n", + "* Add early stopping mechanism so that the training could stop instead of waiting for the total number of rounds if the criteria is satisfied\n", + "* Instead of relying on the internal best model selection approach, we want to provide our own best model selection\n", + "* Instead of using built-in persist component PTFileModelPersistor, we would like to have our own save and loading functions\n", "\n", "In this section, we will go over these changes step-by-step. \n", "\n", @@ -39,9 +37,10 @@ "FedAvg can be written as very simple for-loop. There are several other factors to consider \n", "\n", "* How to send the model to clients?\n", - "* How to receive the response \n", - "* for the model and response, what's the format ? \n", - "* The model and responses and corresponding objects must be serialized, how to series them ? 
\n", + "* How to receive the responses?\n", + "* For the model and responses, what's the format?\n", + "* The model and responses and corresponding objects must be serialized, how to serialize them?\n", + "\n", "\n", "Let's dive into these questions.\n", "\n", @@ -50,7 +49,7 @@ "\n", "FLARE defined a high-level data structure \"FLModel\" that holds the model parameters, metrics and metadata\n", "\n", - "```\n", + "```python\n", "\n", "class ParamsType(str, Enum):\n", " FULL = \"FULL\"\n", @@ -76,12 +75,13 @@ "\n", "#### Serialization \n", "\n", - "Many of the deep learning machine frameworks using python pickle as default serrialization mechanism. There are enough security concerns that FLARE is not using Pickle. NVIDIA FLARE Object Serializer (FOBS) used a [messagePack](https://msgpack.org/index.html)-based serialization approach. \n", + "Many of the deep learning machine frameworks use python pickle as the default serialization mechanism. There are enough security concerns that FLARE is not using Pickle. NVIDIA FLARE Object Serializer (FOBS) used a [messagePack](https://msgpack.org/index.html)-based serialization approach. \n", "User needs to register a component ( \"Decomposer\") to serialize/de-serialize certain project to fobs. \n", "\n", "To PyTorch Tensor, we need to register [TensorDecompressor](https://github.com/NVIDIA/NVFlare/blob/main/nvflare/app_opt/pt/decomposers.py) component at FOBS. \n", "\n", - "```\n", + "```python\n", + "\n", " # Use FOBS for serializing/deserializing PyTorch tensors\n", " fobs.register(TensorDecomposer)\n", "```\n", @@ -90,10 +90,11 @@ "\n", "For high-level API, we can use the followings\n", "\n", - "```\n", + "```python\n", + "\n", " results = self.send_model_and_wait(targets=clients, data=model)\n", "```\n", - "the function send the FLModel to targeted clients and recieve result. This is synchornized methood like scatter and gather.
We broadcast the model to all targeted clients and receive results when required clients send back the results. \n", + "The function sends the FLModel to the targeted clients and receives the results. This is a synchronized method like scatter and gather. We broadcast the model to all targeted clients and receive results when required clients send back the results. \n", "\n", "The BasedFedAvg is derived from ModelController which has the communication component, which allows the component to send the model and wait for result. \n", "\n", @@ -129,11 +130,13 @@ "```stop_cond``` is a string to represent the stop condition, its string literal in the format of \" \" (e.g. \"accuracy >= 80\")\n", "\n", "we need to parse this condition so we can compare. To parse this, we leverage FLARE's math_utils\n", - "```\n", + "\n", + "```python\n", "\n", "math_utils.parse_compare_criteria(compare_expr: Optional[str] = None) -> Tuple[str, float, Callable]\n", "\n", "```\n", + "\n", "the return will be\n", "* key,\n", "* target_value,\n", @@ -163,16 +166,17 @@ "source": [ "#### Integrate the early stop condition\n", "\n", - "This should simple, if the condition is satified and simply break out the for-loop\n", + "This should be simple: if the condition is satisfied, simply break out of the for-loop\n", "\n", - "```\n", + "```python\n", " if self.should_stop(model.metrics, self.stop_condition):\n", " break\n", "```\n", "\n", "and the ```should_stop``` function is defined as followings\n", "\n", - "```\n", + "```python\n", + "\n", "def should_stop(self, metrics: Optional[Dict] = None, stop_condition: Optional[str] = None):\n", " key, target, op_fn = stop_condition\n", " value = metrics.get(key, None)\n", @@ -205,9 +209,10 @@ "source": [ "### Select best model \n", "\n", - "we simply write the following two functions and put into previus code\n", + "we simply write the following two functions and put into previous code\n", + "\n", + "```python\n", "\n", - "```\n", " def select_best_model(self,
curr_model: FLModel):\n", " if self.best_model is None:\n", " self.best_model = curr_model\n", @@ -246,7 +251,7 @@ "source": [ "### Customized save and load model functions\n", " \n", - "The ```BaseFedAvg``` class defined ```save_model()``` and ```load_model()``` functions for user to overwrite. \n", + "The ```BaseFedAvg``` class defined ```save_model()``` and ```load_model()``` functions for user to override. \n", "We use torch save and load functions, and save the FLModel metadata separately with the fobs.dumpf and fobs.loadf serialization utilities.\n", "\n", "\n", @@ -313,7 +318,8 @@ "\n", "### Create Fed Job\n", "\n", - "```\n", + "```python\n", + "\n", " n_clients = 5\n", " num_rounds = 2\n", "\n", @@ -399,11 +405,17 @@ ] }, { - "cell_type": "code", - "execution_count": null, - "id": "e096501e", + "cell_type": "markdown", + "id": "63108356", + "metadata": {}, + "source": [ + "Next step, we are going to see how to customize the client side logics: [customize client side logics](../01.4_client_side_customization/customize_client_training.ipynb)" + ] + }, + { + "cell_type": "markdown", + "id": "99daa93c", "metadata": {}, - "outputs": [], "source": [] } ], diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.4_customize_client_training/code/data/download.py b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.4_client_side_customization/code/data/download.py similarity index 100% rename from examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.4_customize_client_training/code/data/download.py rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.4_client_side_customization/code/data/download.py diff --git
a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.4_customize_client_training/code/fl_job.py b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.4_client_side_customization/code/fl_job.py similarity index 100% rename from examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.4_customize_client_training/code/fl_job.py rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.4_client_side_customization/code/fl_job.py diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.4_customize_client_training/code/requirements.txt b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.4_client_side_customization/code/requirements.txt similarity index 100% rename from examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.4_customize_client_training/code/requirements.txt rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.4_client_side_customization/code/requirements.txt diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.4_customize_client_training/code/src/client.py b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.4_client_side_customization/code/src/client.py similarity index 100% rename from 
examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.4_customize_client_training/code/src/client.py rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.4_client_side_customization/code/src/client.py diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.4_customize_client_training/code/src/fedavg.py b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.4_client_side_customization/code/src/fedavg.py similarity index 100% rename from examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.4_customize_client_training/code/src/fedavg.py rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.4_client_side_customization/code/src/fedavg.py diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.4_customize_client_training/code/src/fl_job.py b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.4_client_side_customization/code/src/fl_job.py similarity index 100% rename from examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.4_customize_client_training/code/src/fl_job.py rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.4_client_side_customization/code/src/fl_job.py diff --git 
a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.4_customize_client_training/code/src/network.py b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.4_client_side_customization/code/src/network.py similarity index 100% rename from examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.4_customize_client_training/code/src/network.py rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.4_client_side_customization/code/src/network.py diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.4_customize_client_training/customize_client_training.ipynb b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.4_client_side_customization/customize_client_training.ipynb similarity index 75% rename from examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.4_customize_client_training/customize_client_training.ipynb rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.4_client_side_customization/customize_client_training.ipynb index 56f1a33a89..9b37e5e50c 100644 --- a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.4_customize_client_training/customize_client_training.ipynb +++ 
b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.4_client_side_customization/customize_client_training.ipynb @@ -1,24 +1,19 @@ { "cells": [ - { - "cell_type": "markdown", - "id": "38561a0b-a072-41ea-b027-290402cf4582", - "metadata": {}, - "source": [ - "# Customize client training scripts for different sites\n" - ] - }, { "cell_type": "markdown", "id": "3f020f8b", "metadata": {}, "source": [ - "The client training script, so far, assume all sides have the same training parameters. In the real-world applications, each site's data will be different, therefore the training parameters such batch size and learning rate will be different.\n", + "# Customize client training scripts for different sites\n", "\n", + "The client training script, so far, assumes all sites have the same training parameters. In real-world applications, each site's data will be different, therefore the training parameters such as batch size and learning rate will be different.\n", "\n", - "In this section, we will show to set different parameters \n", + "In this section, we will show how to set different parameters.\n", + "\n", + "\n", + "```python\n", "\n", - "```\n", " # Add clients\n", "\n", " executor_1 = ScriptRunner(script=train_script, script_args=\"--learning_rate 0.01 --batch_size 12\")\n", @@ -31,10 +26,10 @@ " job.to(executor_3, \"site-3\")\n", "\n", " executor_4 = ScriptRunner(script=train_script, script_args=\"--learning_rate 0.001 --batch_size 6\")\n", - " job.to(executor_3, \"site-4\")\n", + " job.to(executor_4, \"site-4\")\n", " \n", " executor_5 = ScriptRunner(script=train_script, script_args=\"--learning_rate 0.0001 --batch_size 4\")\n", - " job.to(executor_3, \"site-5\")\n", + " job.to(executor_5, \"site-5\")\n", "\n", "```\n", "\n", @@ -93,11 +88,17 @@ ] }, { - "cell_type": "code", - "execution_count": null, - "id": "7966ab3a", + "cell_type": "markdown", + "id": "dbd5d996", + "metadata": {}, + 
"source": [ + "Next, we are going to see how to do federated experiment tracking with different experiment tracking systems: [experiment_tracking](../01.5_experiment_tracking/experiment_tracking.ipynb)" + ] + }, + { + "cell_type": "markdown", + "id": "6d7fb83b", "metadata": {}, - "outputs": [], "source": [] } ], diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.5_experiment_tracking/01.5.1_experiment_tracking_with_tensorboard/experiment_tracking_tensorboard.ipynb b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.5_experiment_tracking/01.5.1_experiment_tracking_with_tensorboard/experiment_tracking_tensorboard.ipynb index b22d5e90f2..2ea3c3d23b 100644 --- a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.5_experiment_tracking/01.5.1_experiment_tracking_with_tensorboard/experiment_tracking_tensorboard.ipynb +++ b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.5_experiment_tracking/01.5.1_experiment_tracking_with_tensorboard/experiment_tracking_tensorboard.ipynb @@ -7,11 +7,12 @@ "source": [ "# Experiment tracking with TensorBoard\n", "\n", - "NVFlare uses `TBAnalyticsReceiver` for experiment tracking on the FL server by default, allowing for .\n", + "NVFlare uses `TBAnalyticsReceiver` for experiment tracking on the FL server by default, enabling metrics to be streamed to TensorBoard.\n", "\n", "## Default in FedAvgJob\n", "\n", - "The FedJob API makes it easy to create job congifurations, and by default the `TBAnalyticsReceiver` for TensorBoard streaming is included. 
You can specify your own analytics_receiver of type `AnalyticsReceiver` as a parameter if you want, but if left unspecified, `TBAnalyticsReceiver` is configured to be set up in `BaseFedJob` (nvflare/app_opt/pt/job_config/base_fed_job.py). \n", + "The FedJob API makes it easy to create job configurations, and by default the `TBAnalyticsReceiver` for TensorBoard streaming is included. You can specify your own analytics_receiver of type `AnalyticsReceiver` as a parameter if you want, but if left unspecified, `TBAnalyticsReceiver` is configured to be set up in `BaseFedJob` (nvflare/app_opt/pt/job_config/base_fed_job.py).\n", + "\n", "\n", "The `TBAnalyticsReceiver` for TensorBoard streaming receives and records the logs during the experiment by saving them to Tensoboard event files on the FL server. See [this link](https://nvflare.readthedocs.io/en/main/programming_guide/experiment_tracking/experiment_tracking_log_writer.html#tools-sender-logwriter-and-receivers) for more details on the other available AnalyticsReceivers in NVFlare: MLflowReceiver and WandBReceiver." ] @@ -152,7 +153,9 @@ "source": [ "## View tensorboard results\n", "\n", - "In order to see the results, you can use the following command directed to the location of the tensorboard event files (by default the location for the server should be as follows using the default simulator path provided):\n", + "\n", + "In order to see the results, you can use the following command directed to the location of the TensorBoard event files (by default, the location for the server should be as follows using the default simulator path provided):\n", + "\n", "\n", "```commandline\n", "tensorboard --logdir=/tmp/nvflare/jobs/workdir/server/simulate_job/tb_events\n", @@ -164,7 +167,8 @@ "id": "bdd0eb76", "metadata": {}, "source": [ - "Now, we know how experiment tracking can be achieved through metric logging and can be configured to work in a job with an `AnalyticsReceiver`. 
With this mechanism, we can stream various types of metric data.\n", + "Now we know how experiment tracking can be achieved through metric logging and can be configured to work in a job with an `AnalyticsReceiver`. With this mechanism, we can stream various types of metric data.\n", + "\n", "\n", "For how to use `MLflowReceiver` to set up experiment tracking for MLflow, see [Experiment Tracking with MLflow](../01.5.2_experiment_tracking_with_mlflow/experiment_tracking_mlflow.ipynb)." ] diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.5_experiment_tracking/01.5.2_experiment_tracking_with_mlflow/experiment_tracking_mlflow.ipynb b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.5_experiment_tracking/01.5.2_experiment_tracking_with_mlflow/experiment_tracking_mlflow.ipynb index 9e26e8b38f..f106bcf70d 100644 --- a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.5_experiment_tracking/01.5.2_experiment_tracking_with_mlflow/experiment_tracking_mlflow.ipynb +++ b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.5_experiment_tracking/01.5.2_experiment_tracking_with_mlflow/experiment_tracking_mlflow.ipynb @@ -171,7 +171,7 @@ "id": "aec950ee", "metadata": {}, "source": [ - "The num_rounds for this job also 20 for more data for a better looking graph. Note that even though [this job](code/src/client_mlflow.py) uses `MLflowWriter`, if we used the [client code](code/src/client.py) with `SummaryWriter`, the resulting data logged to MLflow would be the same since behind the scenes, there is conversion that occurs to translate the event with the log with SummaryWriter to be the equivalent for MLflow." 
+ "The num_rounds for this job is also 20 for more data for a better looking graph. Note that even though [this job](code/src/client_mlflow.py) uses `MLflowWriter`, if we used the [client code](code/src/client.py) with `SummaryWriter`, the resulting data logged to MLflow would be the same since, behind the scenes, there is conversion that occurs to translate the event with the log with SummaryWriter to be the equivalent for MLflow." ] }, { @@ -195,10 +195,16 @@ "id": "bdd0eb76", "metadata": {}, "source": [ - "Now, we know how experiment tracking can be achieved through metric logging and can different types of `AnalyticsReceiver` can be configured to work in a job. With this mechanism, we can stream various types of metric data.\n", + "Now we know how experiment tracking can be achieved through metric logging and how different types of `AnalyticsReceiver` can be configured to work in a job. With this mechanism, we can stream various types of metric data.\n", "\n", - "To continue, please see [Understanding FLARE federated learning Job structure](../../01.6_job_structure_and_configuration/01.1.6.1_understanding_fl_job.ipynb)." 
+ "To continue, please see [Understanding FLARE federated learning Job structure](../../01.6_job_structure_and_configuration/understanding_fl_job.ipynb)" ] + }, + { + "cell_type": "markdown", + "id": "365e2f89", + "metadata": {}, + "source": [] } ], "metadata": { diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.5_experiment_tracking/experiment_tracking.ipynb b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.5_experiment_tracking/experiment_tracking.ipynb index e4a865a3b0..bf54d7a70d 100644 --- a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.5_experiment_tracking/experiment_tracking.ipynb +++ b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.5_experiment_tracking/experiment_tracking.ipynb @@ -15,14 +15,14 @@ "\n", "In a federated computing setting, data is distributed across multiple devices or systems, and training is run on each device independently while preserving each client’s data privacy.\n", "\n", - "Assuming a federated system consisting of one server and many clients and the server coordinating the ML training of clients, we can interact with ML experiment tracking tools in two different ways:\n", + "There are two ways to interact with ML experiment tracking tools:\n", "\n", - "![experiment_tracking](img/metrics-streaming-fl-server-clients.png)\n", + "\"experiment_tracking\"\n", "\n", - "- Decentralized tracking (client-side experiment tracking): Each client will directly send the log metrics/parameters to the ML experiment tracking server (like MLflow or Weights and Biases) or local file system (like tensorboard)\n", - "- Centralized tracking (aggregated experiment trackin): Clients will send the log 
metrics/parameters to the FL server, and the FL server will send the metrics to ML experiment tracking server or local file system\n", + "- Decentralized tracking (client-side experiment tracking): Each client directly sends the log metrics/parameters to the ML experiment tracking server (like MLflow or Weights and Biases) or local file system (like TensorBoard).\n", + "- Centralized tracking (aggregated experiment tracking): Clients send the log metrics/parameters to the FL server, and the FL server sends the metrics to the ML experiment tracking server or local file system.\n", "\n", - "The NVIDIA FLARE job configuration enables you to choose the tracking scenario or system that best fits your needs. When users need to migrate from one experiment tracking system to another, using NVIDIA FLARE, you can modify the job configuration without re-writing the experiment tracking code." + "The NVIDIA FLARE job configuration enables you to choose the tracking scenario or system that best fits your needs. When users need to migrate from one experiment tracking system to another using NVIDIA FLARE, you can modify the job configuration without rewriting the experiment tracking code.\n" ] }, { @@ -30,7 +30,7 @@ "id": "47399ea6", "metadata": {}, "source": [ - "The `nvflare.client.tracking` API enables you to flexibly redirect your logging metrics to any destination. The use of MLflow, Weights & Biases, or TensorBoard syntax does not matter here as you can stream the collected metrics to any supported experiment tracking system. Choosing to use MLflowWriter, WandBWriter, or TBWriter is based on your existing code and requirements.\n", + "The `nvflare.client.tracking` API enables you to flexibly redirect your logging metrics to any destination. The syntax you use (MLflow, Weights & Biases, or TensorBoard) doesn't matter, as you can stream the collected metrics to any supported experiment tracking system. 
The choice between MLflowWriter, WandBWriter, or TBWriter depends on your existing code and requirements.\n", "\n", "- MLflowWriter uses the MLflow API operation log_metric.\n", "- TBWriter uses the TensorBoard SummaryWriter operation add_scalar.\n", diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.6_job_structure_and_configuration/01.1.6.1_understanding_fl_job.ipynb b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.6_job_structure_and_configuration/understanding_fl_job.ipynb similarity index 88% rename from examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.6_job_structure_and_configuration/01.1.6.1_understanding_fl_job.ipynb rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.6_job_structure_and_configuration/understanding_fl_job.ipynb index ae93a347f5..1ce1f4b440 100644 --- a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.6_job_structure_and_configuration/01.1.6.1_understanding_fl_job.ipynb +++ b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.6_job_structure_and_configuration/understanding_fl_job.ipynb @@ -13,17 +13,16 @@ "id": "ab47f2c5", "metadata": {}, "source": [ - "## What is NVFlare Job ? \n", + " ## What is an NVFlare Job?\n", "\n", - "\n", - "NVFlare Job refers to a job configuration used within the NVIDIA FLARE framework. \n", + "NVFlare Job refers to a job configuration used within the NVIDIA FLARE framework.\n", "\n", "In NVFlare, a job is a unit of work that defines the specific tasks to be executed during a federated learning process. 
It encapsulates all necessary configurations, scripts, and resources needed to run an FL task, such as training, validation, or evaluation, across multiple participants in a federated system.\n", "\n", - "A job may have many apps. Each app consists code specific for the site (client site or server site) as well as configurations. \n", + "A job may have many apps. Each app consists of code specific for the site (client site or server site) as well as configurations.\n", "\n", + "In this section, we will take a look at the Job structure as well as the Job API (aka job construction API).\n", "\n", - "In this section, we will take a look at the Job structure as well as Job API ( akak job construction API). \n", "\n", "## Job creation API\n", "\n", @@ -183,7 +182,8 @@ "id": "f475fa49", "metadata": {}, "source": [ - "The job name \"FedAvg\" is folder structure, with each folder representing one app at one site. \n", + "The job name \"FedAvg\" is a folder structure, with each folder representing one app at one site.\n", + "\n", "\n", "* **\"app_server\"**: is the name for the server app\n", "\n", @@ -199,7 +199,7 @@ "\n", "* meta.json gives additional information related to the each app's deployment. 
\n", "\n", - "```\n", + "```json\n", "{\n", " \"name\": \"fedavg\",\n", " \"resource_spec\": {},\n", @@ -235,7 +235,8 @@ "source": [ "A simplifed format of job structure can also be used when the client code and configuration is the same for all sites\n", "\n", - "```\n", + "```shell\n", + "\n", "/tmp/nvflare/jobs/job_config/fedavg\n", "├── app_server\n", "│ ├── config\n", @@ -258,7 +259,7 @@ "meta.json needs to be \n", "\n", "\n", - "```\n", + "```json\n", "{\n", " \"name\": \"fedavg\",\n", " \"resource_spec\": {},\n", @@ -277,7 +278,7 @@ "\n", "If we don't mind deploy all code to all sites, we can change the job config into the followings\n", "\n", - "A simplifed format of job structure can also be used when the client code and configuration is the same for all sites\n", + " A simplified format of job structure can also be used when the client code and configuration are the same for all sites\n", "\n", "```\n", "/tmp/nvflare/jobs/job_config/fedavg\n", @@ -297,7 +298,7 @@ "meta.json needs to be \n", "\n", "\n", - "```\n", + "```json\n", "{\n", " \"name\": \"fedavg\",\n", " \"resource_spec\": {},\n", @@ -317,9 +318,10 @@ "source": [ "## Job Configuration\n", "\n", - "We have convered a lot of ground so far. You could stop here, and move to the next chapter of the training materials. \n", "\n", - "But if you like to futher understand how NVIDIA FLARE works, you might want to go through this section: Job Configuration. \n" + "We have covered a lot of ground so far. You could stop here and move to the next chapter of the training materials.\n", + "\n", + "But if you would like to further understand how NVIDIA FLARE works, you might want to go through this section: Job Configuration.\n" ] }, { @@ -379,7 +381,7 @@ "id": "a75c80c4", "metadata": {}, "source": [ - "The server configuration is a json file descripe the workflows. In our case, we defined one workflow, whci has a controller using our defined FedAvg class. 
\n", + "The server configuration is a JSON file describing the workflows. In our case, we defined one workflow, which has a controller using our defined FedAvg class.\n", "\n", "\n", ">Note: The configuration pattern is like the followings\n", @@ -417,8 +419,7 @@ "id": "d9753aeb", "metadata": {}, "source": [ - "the configuration is simular, it defines an array of \"executors\", a builtin ```PTInProcessClientAPIExecutor``` is used, \n", - "which takes the training script client.py and its corresponding arguments as input. \n", + "The configuration is similar; it defines an array of \"executors\". A built-in `PTInProcessClientAPIExecutor` is used, which takes the training script client.py and its corresponding arguments as input. \n", "\n", "\n", "```\n", @@ -472,8 +473,14 @@ "id": "e8914e76", "metadata": {}, "source": [ - "Hope you have a good standing of working with NVIDIA FLARE job so far. Let's move on to other chapters. " + "Hope you now have a good understanding of working with NVIDIA FLARE jobs. Before we move on to other chapters, let's look at logging configuration to make it easier to debug in case of errors. 
[Logging Configuration](../01.7_logging/logging.ipynb)" ] + }, + { + "cell_type": "markdown", + "id": "33630af4", + "metadata": {}, + "source": [] } ], "metadata": { diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.7_logging/logging.ipynb b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.7_logging/logging.ipynb new file mode 100644 index 0000000000..fa63f128bf --- /dev/null +++ b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.7_logging/logging.ipynb @@ -0,0 +1,148 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# NVIDIA FLARE Logging\n", + "\n", + "Before we finish chapter one, we would like to discuss logging. This will help us with debugging in the following chapters.\n", + "\n", + "The detailed logging configuration can be found in the [NVFlare Documentation](https://nvflare.readthedocs.io/en/main/user_guide/configurations/logging_configuration.html)\n", + "\n", + "as well as in the [NVFlare Logging Tutorial](https://github.com/NVIDIA/NVFlare/blob/main/examples/tutorials/logging.ipynb)\n", + "\n", + "Here are few key features of the logging in NVFlare:\n", + "\n", + "## Structured logging:\n", + "\n", + "When defining new loggers, we provide several functions to help adhere to the FLARE package logger hierarchy. For example, say we have the following module at my_package.my_module:\n", + "\n", + "get_obj_logger for classes. Ex:\n", + "\n", + "```python\n", + "\n", + " class MyClass:\n", + " def __init__(self):\n", + " self.logger = get_obj_logger(self) # my_package.my_module.MyClass\n", + "```\n", + "\n", + "get_script_logger for scripts (if not in a package, default to custom.). 
Ex:\n", + "\n", + "```python\n", + " if __name__ == \"__main__\":\n", + " logger = get_script_logger() # my_package.my_module\n", + "``` \n", + "get_module_logger for modules. Ex:\n", + "\n", + "```python\n", + " def my_function():\n", + " logger = get_module_logger(name=\"my_function\") # my_package.my_module.my_function\n", + "```\n", + "\n", + "If you use these functions to create your loggers, you will have a better hierarchy and the logging configuration can be configured in a structured way.\n", + "\n", + "For example, you can enable a module logging level, which can define all the sub-modules' logging levels if not defined.\n", + "\n", + "## Multiple logging formats:\n", + "\n", + "NVFlare supports and creates multiple logging formats, including JSON and txt formats. The JSON format is more useful for integrating with monitoring systems.\n", + "\n", + "log.txt:\n", + "The logFileHandler uses the baseFormatter to write all logs to log.txt. This is the default log that we see in the console.\n", + "\n", + "log.json:\n", + "The jsonFileHandler uses the jsonFormatter to write JSON formatted logs to log.json. This is useful for leveraging structured logs (i.e., with a 3rd party observability package).\n", + "\n", + "log_error.txt:\n", + "The errorFileHandler uses the baseFormatter and level \"ERROR\" to write error level logs to log_error.txt. This allows users to easily see when errors are logged.\n", + "\n", + "log_fl.txt:\n", + "The FLFileHandler uses the baseFormatter and FLFilter (uses LoggerNameFilter allowing certain logger names) to write FL training and custom logs to log_fl.txt. This removes the system and communication related logs and clearly shows logs related to FL training.\n", + "\n", + "## Logging mode for simulation\n", + "\n", + "We define a few special log configurations for simulation. 
This will reduce the amount of log information seen by data scientists, so they can focus on the training logs.\n", + "\n", + "The three logging modes are:\n", + "\n", + "* **concise**: filters out server and client process logs, only contains the application logs\n", + "* **full**: is the full log\n", + "* **verbose**: is the debug level of full log\n", + "\n", + "The simulator defaults to ```concise``` mode, which is the most useful for data scientists to see the training logs.\n", + "\n", + "\n", + "## Dynamic Logging Configuration Commands\n", + "\n", + "\n", + "When running the FLARE system (POC mode or production mode), in many cases, we need to change the logging level or configuration dynamically without stopping the system. Dynamic logging configuration provides these capabilities.\n", + "\n", + "There are two sets of logs: the site logs and job logs. The current site log configuration will be used for the site logs as well as the log configuration of any new job started on that site. \n", + " \n", + "We provide two admin commands to enable users to dynamically configure the site or job level logging when running the FLARE system. Note that the effects of these commands will last until reconfiguration or as long as the corresponding site or job is running. However, these commands do not overwrite the log configuration file in the workspace. The previous log configuration file can be reloaded using “reload”.\n", + "\n", + " \n", + "\n", + "Here are some examples: \n", + "\n", + "```python\n", + "\n", + "configure_site_log server debug\n", + "configure_site_log client site-1 debug\n", + "configure_site_log all info\n", + "\n", + "configure_job_log server debug\n", + "configure_job_log client site-1 debug\n", + "configure_job_log all info\n", + "configure_job_log all //custom_log_config.json\n", + "```\n", + "\n", + "The ```configure_site_log``` is the FLARE Console command used to configure the site log configuration. 
\n", + "The ```configure_job_log``` is the FLARE Console command used to configure job log configuration. \n", + "\n", + "\n", + "\n", + "## Customizing logging\n", + "You can always customize logging by adding or removing filters, formats and profile your own logging configuration for simulation, job and system. We are not going to cover this in this tutorial.\n", + "\n", + "## FLARE Job Simulator Run Logging\n", + "\n", + "Since the Flare Job API uses the simulator to run, it defaults to concise mode. If you want to use a different log configuration, you can use the following command:\n", + "\n", + "\n", + "```python\n", + "\n", + "job.simulator_run(job_config_dir, log_config=\"full\")\n", + "job.simulator_run(job_config_dir, log_config=\"verbose\")\n", + "job.simulator_run(job_config_dir, log_config=\"concise\")\n", + "job.simulator_run(job_config_dir, log_config=\"/path/to/log_config.json\")\n", + "\n", + "```\n", + "\n", + "Now that we have briefly introduced the logging configuration, let's wrap up this chapter: [wrap up](../01.8_recap/recap.ipynb) \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.7_recap/recap.ipynb b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.7_recap/recap.ipynb deleted file mode 100644 index 640b47c170..0000000000 --- a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.7_recap/recap.ipynb +++ /dev/null @@ -1,82 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "7b152728-3366-4432-adb1-29aa3051dc22", - 
"metadata": 
{}, - "source": [ - "# Summary of Chapter 1\n", - "\n", - "We cover a lot of materials in Chapter 1. We guide you through the process of running federated learning applications. Here is an overview of the key contents:\n", - "\n", - "1. **Running Federated Learning Job**\n", - " - **Installation and Data Preparation**: Instructions for setting up the environment and preparing the data.\n", - " - [setup.ipynb](../01.1_running_federated_learning_job/setup.ipynb)\n", - " - **Training Classifier with PyTorch**: Steps to train a classifier using PyTorch in a federated learning setup.\n", - " - [runing_pytorch_fl_job.ipynb](../01.1_running_federated_learning_job/runing_pytorch_fl_job.ipynb)\n", - "\n", - "2. **From Stand-Alone Deep Learning to Federated Learning**\n", - " - **Conversion to Federated Learning**: Guide on converting deep learning models with PyTorch to federated learning.\n", - " - [convert_dl_to_fl.ipynb](../01.2_convert_deep_learning_to_federated_learning/convert_dl_to_fl.ipynb)\n", - "\n", - "3. **Customizing the Federated Algorithms**\n", - " - **Server Logic Customization**: Techniques to customize server logic for specific federated learning needs, we built an our own fed avg algorithms with best model seleciton, model saving and loading, as well as early stopping. \n", - " - [customize_server_logics.ipynb](../01.3_customize_server_logics/customize_server_logics.ipynb)\n", - "\n", - "4. **Adjusting Training Parameters**\n", - " - **Client Logic Customization**: Methods to customize client logic to optimize training parameters. Here we show how to customize the training for each site. \n", - " - [customize_client_training.ipynb](../01.4_customize_client_training/customize_client_training.ipynb)\n", - "\n", - "5. 
**Tracking Training Metrics**\n", - " - **Experiment Tracking**: Tools and methods to track experiments and monitor training metrics effectively.\n", - " - [experiment_tracking.ipynb](../01.5_experiment_tracking/experiment_tracking.ipynb)\n", - "\n", - "6. **Job Structure and Configurations**\n", - " - **Understanding Job Structure and Configuration**: Detailed explanation of the job structure and configurations necessary for running federated learning jobs.\n", - " - [01.1.6.1_understanding_fl_job.ipynb](../01.6_job_structure_and_configuration/01.1.6.1_understanding_fl_job.ipynb)\n", - "\n", - "7. **Recap of Covered Topics**\n", - " - **Summary and Recap**: A recap of the topics covered in the previous sections.\n", - " - [recap.ipynb](../01.7_recap/recap.ipynb)\n", - "\n", - "Each section is designed to provide comprehensive guidance and practical examples to help you implement and customize federated learning in your applications. For detailed instructions and examples, refer to the respective notebooks linked in each section.\n", - "\n", - "\n", - "Now let's move on to the [Chapter 2](../../chapter-2_develop_federated_learning_applications/02.0_introduction/introduction.ipynb\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "4f2e3cb3-e61f-45e9-8dad-ad55ebb3641a", - "metadata": {}, - "source": [ - " \n", - "\n", - "\n", - "\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "nvflare_example", - "language": "python", - "name": "nvflare_example" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.2" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.8_recap/recap.ipynb 
b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.8_recap/recap.ipynb new file mode 100644 index 0000000000..0eb75a8a34 --- /dev/null +++ b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-1_running_federated_learning_applications/01.8_recap/recap.ipynb @@ -0,0 +1,70 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "7b152728-3366-4432-adb1-29aa3051dc22", + "metadata": {}, + "source": [ + "# Recap: Chapter 1 Summary\n", + "\n", + "Throughout this chapter, we've learned several crucial concepts and practical skills in federated learning. Here are the essential takeaways:\n", + "\n", + "1. **Federated Learning Fundamentals**\n", + " - FL enables collaborative model training while keeping data at source\n", + " - Basic FL workflow: client training → model aggregation → model broadcast → repeat\n", + " - NVIDIA FLARE provides a robust framework for implementing FL systems\n", + "\n", + "2. **Converting Traditional to Federated Learning**\n", + " - Most PyTorch models can be adapted for federated learning\n", + " - Key modifications needed:\n", + " - Separating training logic from model loading. The model can be received and sent back to the server, while keeping the training logic mostly the same.\n", + "\n", + "3. 
**Customization Capabilities**\n", + " - Server-side customization:\n", + " - Implementing custom aggregation strategies\n", + " - Model selection and persistence\n", + " - Early stopping mechanisms\n", + " - Client-side customization:\n", + " - Local training optimization\n", + " - Site-specific parameters\n", + " - Custom data handling\n", + "\n", + "These fundamentals prepare you for more advanced FL concepts and implementations in the following chapters.\n", + "\n", + "Now let's move on to [Chapter 2](../../chapter-2_develop_federated_learning_applications/02.0_introduction/introduction.ipynb)" + ] + }, + { + "cell_type": "markdown", + "id": "4f2e3cb3-e61f-45e9-8dad-ad55ebb3641a", + "metadata": {}, + "source": [ + " \n", + "\n", + "\n", + "\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "nvflare_example", + "language": "python", + "name": "nvflare_example" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.2" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.0_introduction/introduction.ipynb b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.0_introduction/introduction.ipynb index 85352f1462..bcd503a12b 100644 --- a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.0_introduction/introduction.ipynb +++ b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.0_introduction/introduction.ipynb @@ -5,36 +5,64 @@ "id": "ed32af6f", "metadata": {}, "source": [ - "# 
Introduction: Develop Federated Learning Applications\n", + "# Develop Federated Learning Applications\n", "\n", - "In this chapter, we will explore the process of developing federated learning applications. We will start by exploring federated statistics and getting different visualizations from the data. We will then convert PyTorch Lightning code for use with NVFlare. We will then examine machine learning algorithms and look at how to convert logistics regression, kmeans, and survival analysis for use with federated learning. Finally, we will look into the Client API and conclude with a recap of the covered topics.\n", + "In this chapter, we will explore the process of developing federated learning applications. \n", + "We will cover the following topics:\n", "\n", + "* Federated Analytics\n", + " * Discover the federated statistics of local and global data\n", + " * Visualize the federated statistics\n", + "\n", + "* Convert deep learning code to Federated Learning code\n", + "\n", + " We already covered converting PyTorch code to Federated Learning code in the previous chapter. Now we will cover: \n", + " * Converting PyTorch Lightning code to Federated Learning code\n", + " * Converting TensorFlow code to Federated Learning code\n", + " \n", + "\n", + "* Convert machine learning code to Federated Learning code\n", + " * Convert logistic regression to Federated Learning code\n", + " * Convert k-means to Federated Learning code\n", + " * Convert survival analysis to Federated Learning code\n", + "\n", + "* Client API\n", + " Converting ML/DL to federated learning leverages the client API to interact with the federated learning system. We will explore the client API in detail in this chapter.\n", + " \n", "\n", "2.1. 
**Federated Statistics**\n", - " * [Federated Statistics with image data](../02.1_federated_statistics/0federated_statistics_with_image_data/federated_statistics_with_image_data.ipynb)\n", + "\n", " * [Federated Statistics with tabular data](../02.1_federated_statistics/federated_statistics_with_tabular_data/federated_statistics_with_tabular_data.ipynb)\n", "\n", - "2.2. **Convert PyTorch Lightning to Federated Learning**\n", + " * [Federated Statistics with image data](../02.1_federated_statistics/federated_statistics_with_image_data/federated_statistics_with_image_data.ipynb)\n", "\n", - " * [Convert Torch Lightning to FL](../02.2_convert_torch_lightning_to_federated_learning/convert_torch_lightning_to_fl.ipynb)\n", + "2.2. **Introduction to Client API** \n", "\n", + " * [Client API](../02.2_client_api/client_api.ipynb)\n", "\n", - "2.3. **How to Convert Machine Learning Algorithms to Federated Algorithms**\n", + "2.3. **Convert PyTorch Lightning to Federated Learning**\n", "\n", - " * [Convert Logistics Regression to federated learning](../02.3_convert_machine_learning_to_federated_learning/02.3.1_convert_Logistics_regression_to_federated_learning/convert_lr_to_fl.ipynb)\n", - " * [Convert KMeans to federated learning](../02.3_convert_machine_learning_to_federated_learning/02.3.2_convert_kmeans_to_federated_learning/convert_kmeans_to_fl.ipynb)\n", - " * [Convert Survival Analysis to federated learning](../02.3_convert_machine_learning_to_federated_learning/02.3.3_convert_survival_analysis_to_federated_learning/convert_survival_analysis_to_fl.ipynb)\n", + " * [Convert PyTorch Lightning to Federated Learning](../02.3_convert_torch_lightning_to_federated_learning/convert_torch_lightning_to_fl.ipynb)\n", "\n", - "2.4. **Client API** \n", + "2.4. 
**Convert Machine Learning to Federated Learning**\n", "\n", - " * [NVFlare Client API](../02.4_client_api/Client_api.ipynb)\n", + " * [Convert Logistic Regression to Federated Learning](../02.4_convert_machine_learning_to_federated_learning/02.4.1_convert_logistic_regression_to_federated_learning/convert_logistic_regression_to_fl.ipynb)\n", "\n", - "2.5. [Recap of the covered topics](../02.5_recap/recap.ipynb)\n", + " * [Convert K-means to Federated Learning](../02.4_convert_machine_learning_to_federated_learning/02.4.2_convert_kmeans_to_federated_learning/convert_kmeans_to_fl.ipynb)\n", "\n", + " * [Convert Survival Analysis to Federated Learning](../02.4_convert_machine_learning_to_federated_learning/02.4.3_convert_survival_analysis_to_federated_learning/convert_survival_analysis_to_fl.ipynb)\n", + "\n", + "2.5. [Recap of the covered topics](../02.5_recap/recap.ipynb)\n", "\n", "\n", - "Let's get started with [Federated Statistics](../02.1_federated_statistics/0federated_statistics_with_image_data/federated_statistics_with_image_data.ipynb)\n" + "Let's get started with [Federated Statistics](../02.1_federated_statistics/federated_statistics_introduction.ipynb)" ] + }, + { + "cell_type": "markdown", + "id": "abde95b6", + "metadata": {}, + "source": [] } ], "metadata": { diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.1_federated_statistics/federated_statistics_introduction.ipynb b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.1_federated_statistics/federated_statistics_introduction.ipynb index 5fcc41dfed..ad2066eb2f 100644 --- a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.1_federated_statistics/federated_statistics_introduction.ipynb +++ 
b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.1_federated_statistics/federated_statistics_introduction.ipynb @@ -4,18 +4,227 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Federated Statistics Introduction\n", + "# Federated Statistics\n", "\n", - "In a federated learning setting, because data is private at each site and we need to ensure data privacy, there are many considerations to take into account when trying to gather statistics on the data. We provide two examples, one for image data and one for tabular data:\n", + "Before training the model, data scientists need to understand the data distribution and data quality of the data at different sites. As we can't access the raw data, we need to perform federated Analytics to get the statistics of the data at global level. That's where Federated Statistics comes in.\n", "\n", - " * [Federated Statistics with image data](./federated_statistics_with_image_data/federated_statistics_with_image_data.ipynb) shows how to compute local and global image statistics with the consideration that data is private at each of the client sites.\n", - " * [Federated Statistics with tabular data](./federated_statistics_with_tabular_data/federated_statistics_with_tabular_data.ipynb) demonstrates how to create federated statistics for data that can be represented as Pandas DataFrames." + "\n", + "## Objective\n", + "NVIDIA FLARE will provide built-in federated statistics operators (controllers and executors) that \n", + "can generate global statistics based on local client side statistics.\n", + "\n", + "At each client site, we could have one or more datasets (such as \"train\" and \"test\" datasets); each dataset may have many \n", + "features. For each feature in the dataset, we will calculate the statistics and then combine them to produce \n", + "global statistics for all the numeric features. 
The output would be complete statistics for all datasets, both per client and global. \n", + "\n", + "The statistics here are commonly used statistics: count, sum, mean, std_dev and histogram for the numerical features.\n", + "The max and min are not included, as they might violate the client's data privacy. The mean will be calculated with count and sum. \n", + "\n", + "The quantile can also be included, but requires an additional dependency to be installed. We are not going to include it in the default installation.\n", + "\n", + "\n", + "A client will only need to implement the selected methods of the \"Statistics\" class from statistics_spec.\n", + "\n", + "The result will be statistics for all features of all datasets at all sites as well as global aggregates. \n", + "The result could be visualized via the visualization utility in the notebook. \n", + "\n", + "\n", + "## Assumptions\n", + " \n", + "Assume that clients will provide the following: \n", + " * users need to provide target statistics such as count, histogram, etc. of targeted statistics features\n", + " * users need to provide the local statistics for the **target statistics** (by implementing the statistics_spec), not all statistics. For example, if the target statistics do not include histogram, then users will not need to provide the histogram calculation function.\n", + " * users need to provide the datasets and dataset features (feature name, data type)\n", + " * Note: count is always required as we use count to enforce data privacy policy\n", + " \n", + "We only support **numerical features**, not categorical features. 
But users can return all types of features;\n", + "the non-numerical features will be removed.\n", + "\n", + "\n", + "\n", + "## Statistics\n", + "\n", + " Federated statistics includes numeric statistical measures for \n", + " * count\n", + " * mean \n", + " * sum\n", + " * std_dev\n", + " * histogram \n", + " * quantile\n", + " \n", + " We do not include min and max values, to avoid data privacy concerns. \n", + "\n", + "\n", + "#### Quantile\n", + "\n", + "Quantile statistics refers to statistical measures that divide a probability distribution or dataset into intervals with equal probabilities or proportions. Quantiles help summarize the distribution of data by providing key points that indicate how values are spread.\n", + "\n", + "##### Key Quantiles:\n", + "1. Median (50th percentile): The middle value of a dataset, dividing it into two equal halves.\n", + "2. Quartiles (25th, 50th, 75th percentiles): Divide the data into four equal parts:\n", + "* Q1 (25th percentile): Lower quartile, below which 25% of the data falls.\n", + "* Q2 (50th percentile): Median.\n", + "* Q3 (75th percentile): Upper quartile, below which 75% of the data falls.\n", + "3. Deciles (10th, 20th, ..., 90th percentiles): Divide the data into ten equal parts.\n", + "4. Percentiles (1st, 2nd, ..., 99th): Divide the data into 100 equal parts.\n", + "\n", + "##### Usage of Quantiles:\n", + "* Descriptive Statistics: Summarizes the spread of data.\n", + "* Outlier Detection: Helps identify extreme values.\n", + "* Machine Learning: Used in feature engineering, normalization, and decision tree algorithms.\n", + "* Risk Analysis: Used in finance (e.g., Value at Risk, VaR).\n", + "\n", + " \n", + "## Privacy Policy and Privacy Filters\n", + "\n", + "NVFLARE provides data privacy protection through privacy filters [privacy-management](https://nvflare.readthedocs.io/en/main/user_guide/security/site_policy_management.html#privacy-management)\n", + "Each site can have its own privacy policy. 
\n", + "\n", + "### Local privacy policy\n", + "\n", + "privacy.json provides the local, site-specific privacy policy.\n", + "The policy is likely set up by the company and implemented by the organization admin\n", + "for the project. For different types of scopes or categories, there might be different types of policies. \n", + "\n", + "### Privacy configuration\n", + "\n", + "The NVFLARE privacy configuration consists of a set of task data filters and task result filters\n", + "* The task data filter is applied before the client executor executes;\n", + "* The task result filter is applied after the client executor executes and before the data is sent to the server;\n", + "* Both the data filter and result filter are grouped by scope.\n", + "\n", + "Each job will need to have a privacy scope. If not specified, the default scope will be used. If the default scope is not\n", + "defined and the job doesn't specify a privacy scope, the job deployment will fail, and the job will not be executed.\n", + "\n", + "### Privacy Policy Instrumentation \n", + "\n", + "There are different ways to set privacy filters depending on the use case.\n", + "\n", + "\n", + "#### Set Privacy Policy as a researcher\n", + "\n", + "You can specify the \"task_result_filters\" in config_fed_client.json to specify\n", + "the privacy control. 
This is useful when you develop these filters.\n", + "\n", + "#### Set up site privacy policy as org admin\n", + "\n", + "Once the company decides to instrument a certain privacy policy independent of any individual\n", + "job, one can copy the local directory privacy.json content to clients' local privacy.json (merge, do not overwrite).\n", + "In this example, since we only have one app, we can simply copy the privacy.json from the local directory to\n", + "\n", + "* site-1/local/privacy.json\n", + "* site-2/local/privacy.json\n", + "\n", + "We need to remove the same filters from the job definition in config_fed_client.json\n", + "by simply setting the \"task_result_filters\" to an empty list to avoid **double filtering**\n", + "```\n", + "\"task_result_filters\": []\n", + "```\n", + "#### Job filter vs filters in privacy.json\n", + "\n", + "Privacy filters are defined within a privacy scope.\n", + "If a job's privacy scope is defined or it has a default scope, then the scope's filters (if any) are applied\n", + "before the job-specified filters (if any). This rule is enforced during task execution time.\n", + "\n", + "With such rules, if we have both task result filters and privacy scoped filters, we need to understand\n", + "that the privacy filters will be applied first, then the job filters. The filters will be applied in their specified order.\n", + "\n", + "### Statistics Privacy Filters\n", + "Statistics privacy filters are task result filters. We already built one for Statistics. \n", + "\n", + "```\n", + "StatisticsPrivacyFilter\n", + "```\n", + "The StatisticsPrivacyFilter consists of several `StatisticsPrivacyCleanser`s focused on the statistics sent\n", + "from client to server.\n", + "\n", + "`StatisticsPrivacyCleanser` can be considered as an interceptor before the results are delivered to the server.\n", + "Currently, we use three `StatisticsPrivacyCleanser`s to guard the data privacy. 
The reason we built \n", + "`StatisticsPrivacyCleanser` instead of separate filters is to avoid repeated data de-serialization.\n", + "\n", + "#### MinCountCleanser:\n", + "Checks the count returned from the client for each dataset and each feature.\n", + "\n", + "If the min_count is not satisfied, there is a potential risk of revealing the client's real data, so that feature's statistics are removed\n", + "from the result for this client.\n", + "\n", + "#### HistogramBinsCleanser:\n", + "For histogram calculations, the number of bins can't be too large compared to the count. If bins = count, then\n", + "we also reveal the real data. This check makes sure that the number of bins is less than X percent of the count.\n", + "X = max_bins_percent, expressed as a percentage (e.g., 10 means 10%).\n", + "If the number of bins for the histogram does not satisfy this condition, the resulting histogram will be removed\n", + "from the statistics before sending to the server.\n", + "\n", + "#### AddNoiseToMinMax\n", + "For histogram calculations, if the feature's histogram bin range is not specified, we will need to use the local data's min\n", + "and max values to calculate the global min/max values, then use the global min and max values as the bin range for the histogram\n", + "calculation. But sending the local min and max values to the server would reveal the client's real data.\n", + "To protect data privacy, we add noise to the local min/max values.\n", + "\n", + "Random noise is generated at a level between min_noise_level and max_noise_level.\n", + "For example, the random noise may be within (0.1, 0.3), i.e., the 10% to 30% level. This noise\n", + "will make local min values smaller than the true local min values, and max values larger than\n", + "the true local max values. As a result, the estimated global max and min values (i.e., with noise)\n", + "still bound the true global min/max values, such that\n", + "```\n", + "est. 
global min value <\n", + " true global min value <=\n", + " client's min value <=\n", + " client's max value <=\n", + " true global max <\n", + " est. global max value\n", + "```" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "## Summary\n", + "\n", + "We provided federated statistics operators that can easily aggregate and visualize the local statistics for\n", + "different data sites and features. We hope this feature will make it easier to perform federated data analysis. \n", + "\n", + "We provide several examples to demonstrate how the operators should be used. \n", + "\n", + "The main steps are \n", + "* provide server-side configuration to specify target statistics and their configurations and output location\n", + "* implement the local statistics generator (statistics_spec)\n", + "* provide client-side configuration to specify data input location\n", + " \n", + "\n", + "Now let's dive into the examples\n", + "\n", + "\n", + "* [Federated Statistics with tabular data](./federated_statistics_with_tabular_data/federated_statistics_with_tabular_data.ipynb) demonstrates how to create federated statistics for data that can be represented as Pandas DataFrames.\n", + "* [Federated Statistics with image data](./federated_statistics_with_image_data/federated_statistics_with_image_data.ipynb) shows how to compute local and global image statistics with the consideration that data is private at each of the client sites.\n", + " " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] } ], "metadata": { + "kernelspec": { + "display_name": "nvflare-env", + "language": "python", + "name": "python3" + }, "language_info": { - "name": "python" + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.2" } }, "nbformat": 4, diff --git 
a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.1_federated_statistics/federated_statistics_with_image_data/code/data/download_and_unzip_data.sh b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.1_federated_statistics/federated_statistics_with_image_data/code/data/download_and_unzip_data.sh deleted file mode 100755 index a50bb1c504..0000000000 --- a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.1_federated_statistics/federated_statistics_with_image_data/code/data/download_and_unzip_data.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash - -SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) -DATASET_PATH="/tmp/nvflare/image_stats/data" - -if [ ! -d $DATASET_PATH ]; then - mkdir -p $DATASET_PATH -fi - -source_url="$1" -echo "download url = ${source_url}" -if [ -n "${source_url}" ]; then - if [ ! -f "${DATASET_PATH}/COVID-19_Radiography_Dataset.zip" ]; then - wget -O "${DATASET_PATH}/COVID-19_Radiography_Dataset.zip" "${source_url}" - else - echo "zip file exists." - fi - if [ ! -d "${DATASET_PATH}/COVID-19_Radiography_Dataset" ]; then - unzip -d $DATASET_PATH "${DATASET_PATH}/COVID-19_Radiography_Dataset.zip" - else - echo "image files exist." 
- fi -else - echo "empty URL, nothing downloaded, you need to provide real URL to download" -fi \ No newline at end of file diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.1_federated_statistics/federated_statistics_with_image_data/code/data/prepare_data.sh b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.1_federated_statistics/federated_statistics_with_image_data/code/data/prepare_data.sh new file mode 100755 index 0000000000..c7f677ad3e --- /dev/null +++ b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.1_federated_statistics/federated_statistics_with_image_data/code/data/prepare_data.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) + +INPUT_DATASET_PATH="/tmp/nvflare/image_stats/data" +OUTPUT_DATASET_PATH="/tmp/nvflare/image_stats/data" + +python3 "${SCRIPT_DIR}"/utils/prepare_data.py --input_dir "${INPUT_DATASET_PATH}" --output_dir "${OUTPUT_DATASET_PATH}" \ No newline at end of file diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.1_federated_statistics/federated_statistics_with_image_data/code/image_stats/utils/prepare_data.py b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.1_federated_statistics/federated_statistics_with_image_data/code/data/utils/prepare_data.py similarity index 100% rename from examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.1_federated_statistics/federated_statistics_with_image_data/code/image_stats/utils/prepare_data.py rename to 
examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.1_federated_statistics/federated_statistics_with_image_data/code/data/utils/prepare_data.py diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.1_federated_statistics/federated_statistics_with_image_data/code/demo/visualization.ipynb b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.1_federated_statistics/federated_statistics_with_image_data/code/demo/visualization.ipynb new file mode 100644 index 0000000000..965bda266e --- /dev/null +++ b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.1_federated_statistics/federated_statistics_with_image_data/code/demo/visualization.ipynb @@ -0,0 +1,258 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "3f851980", + "metadata": {}, + "source": [ + "# NVFLARE Federated Statistics Visualization" + ] + }, + { + "cell_type": "markdown", + "id": "0b71dd55", + "metadata": {}, + "source": [ + "## Image Statistics Visualization\n", + "In this example, we demonstate how to visualize the results from the statistics of image data. The visualization requires json, pandas, matplotlib modules as well as nvflare visualization utlities. " + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "85f23acf", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "\n", + "import json\n", + "import pandas as pd\n", + "from nvflare.app_opt.statistics.visualization.statistics_visualization import Visualization" + ] + }, + { + "cell_type": "markdown", + "id": "151e23a8", + "metadata": {}, + "source": [ + "First, copy the resulting json file to demo directory. In this example, resulting file is called image_statistics.json. 
Then load json file\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "44f6bed2", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "with open('image_stats.json', 'r') as f:\n", + " data = json.load(f)" + ] + }, + { + "cell_type": "markdown", + "id": "c4b83ddb", + "metadata": {}, + "source": [ + "Initialize the Visualization utilities\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "ab771712", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "vis = Visualization()\n" + ] + }, + { + "cell_type": "markdown", + "id": "49f976aa", + "metadata": {}, + "source": [ + "### Overall Statistics\n", + "vis.show_stats() will show the statistics for each features, at each site for each dataset\n", + "\n", + "vis.show_stats(data = data)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "20ea4dff", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "vis.show_stats(data = data)" + ] + }, + { + "cell_type": "markdown", + "id": "521cbf6f", + "metadata": {}, + "source": [ + "### select features statistics using white_list_features \n", + "user can optionally select only show specified features via white_list_features arguments. In these image files, we only have one feature" + ] + }, + { + "cell_type": "markdown", + "id": "9ab23bcc", + "metadata": {}, + "source": [ + "### Histogram Visualization\n", + "We can use vis.show_histograms() to visualize the histogram. Before we do that, we need set some iPython display setting to make sure the graph displayed in full cell. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4bada64b", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from IPython.display import display, HTML\n", + "display(HTML(\"\"))" + ] + }, + { + "cell_type": "markdown", + "id": "e415b49e", + "metadata": {}, + "source": [ + "The following command display histograms for numberic features. 
The result shows both main plot" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "53542cf9", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "vis.show_histograms(data = data, plot_type=\"main\")" + ] + }, + { + "cell_type": "markdown", + "id": "d8d537cc", + "metadata": {}, + "source": [ + "## Display Options\n", + "Similar to other statistics, we can use white_list_features to select only few features to display histograms. We can also use display_format=\"percent\" to allow all dataset and sites to be displayed in the same scale. User can set \n", + "\n", + "* display_format: \"percent\" or \"sample_count\"\n", + "* white_list_features: feature names\n", + "* plot_type : \"both\" or \"main\" or \"subplot\"\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "353db4d9", + "metadata": {}, + "source": [ + "#### show default display format with subplot\n", + "In the following, we display only feature \"Intensity\" in default display_format, with \"subplot\" plot_type" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8f619729", + "metadata": {}, + "outputs": [], + "source": [ + "vis.show_histograms(data = data, plot_type=\"subplot\")" + ] + }, + { + "cell_type": "markdown", + "id": "fbf7fc73", + "metadata": {}, + "source": [ + "\n", + "#### show percent display format with default plot_type (main)\n", + "In the following, we display only feature \"Intensity\" in \"percent\" display_format, with default plot_type" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8655ad63", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "vis.show_histograms(data = data, display_format=\"percent\")" + ] + }, + { + "cell_type": "markdown", + "id": "48501d37", + "metadata": {}, + "source": [ + "back to [federated_statistics_with_image_data](../../federated_statistics_with_image_data.ipynb)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"bd6f59f2", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "918341b9", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "d86bd3e8", + "metadata": {}, + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "nvflare_example", + "language": "python", + "name": "nvflare_example" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.2" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.1_federated_statistics/federated_statistics_with_image_data/code/image_stats/demo/image_statistics.json b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.1_federated_statistics/federated_statistics_with_image_data/code/image_stats/demo/image_statistics.json deleted file mode 100644 index 45ba336e50..0000000000 --- a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.1_federated_statistics/federated_statistics_with_image_data/code/image_stats/demo/image_statistics.json +++ /dev/null @@ -1 +0,0 @@ -{"intensity": {"count": {"site-4": {"train": 1345}, "site-1": {"train": 3616}, "site-2": {"train": 6012}, "site-3": {"train": 10192}, "Global": {"train": 21165}}, "histogram": {"site-4": {"train": [[0.0, 1.0039, 7529944], [1.0039, 2.0078, 425108], [2.0078, 3.0118, 380736], [3.0118, 4.0157, 334702], [4.0157, 5.0196, 327862], [5.0196, 6.0235, 310277], [6.0235, 7.0275, 313290], [7.0275, 8.0314, 313654], [8.0314, 9.0353, 307805], [9.0353, 10.0392, 305356], [10.0392, 
11.0431, 295868], [11.0431, 12.0471, 279067], [12.0471, 13.051, 270320], [13.051, 14.0549, 260414], [14.0549, 15.0588, 262861], [15.0588, 16.0627, 264740], [16.0627, 17.0667, 263759], [17.0667, 18.0706, 266840], [18.0706, 19.0745, 269593], [19.0745, 20.0784, 265030], [20.0784, 21.0824, 260416], [21.0824, 22.0863, 259147], [22.0863, 23.0902, 257797], [23.0902, 24.0941, 259564], [24.0941, 25.098, 259845], [25.098, 26.102, 256981], [26.102, 27.1059, 252589], [27.1059, 28.1098, 244041], [28.1098, 29.1137, 236778], [29.1137, 30.1176, 235530], [30.1176, 31.1216, 233887], [31.1216, 32.1255, 232690], [32.1255, 33.1294, 230496], [33.1294, 34.1333, 227944], [34.1333, 35.1373, 225518], [35.1373, 36.1412, 226460], [36.1412, 37.1451, 228242], [37.1451, 38.149, 230445], [38.149, 39.1529, 234045], [39.1529, 40.1569, 238784], [40.1569, 41.1608, 243710], [41.1608, 42.1647, 251529], [42.1647, 43.1686, 258775], [43.1686, 44.1726, 264193], [44.1726, 45.1765, 271677], [45.1765, 46.1804, 277858], [46.1804, 47.1843, 287293], [47.1843, 48.1882, 295818], [48.1882, 49.1922, 304070], [49.1922, 50.1961, 312974], [50.1961, 51.2, 323260], [51.2, 52.2039, 328831], [52.2039, 53.2078, 335326], [53.2078, 54.2118, 343748], [54.2118, 55.2157, 351160], [55.2157, 56.2196, 356396], [56.2196, 57.2235, 363184], [57.2235, 58.2275, 369794], [58.2275, 59.2314, 373595], [59.2314, 60.2353, 378877], [60.2353, 61.2392, 384616], [61.2392, 62.2431, 391297], [62.2431, 63.2471, 396375], [63.2471, 64.251, 398059], [64.251, 65.2549, 401636], [65.2549, 66.2588, 404846], [66.2588, 67.2627, 409274], [67.2627, 68.2667, 413504], [68.2667, 69.2706, 416709], [69.2706, 70.2745, 422904], [70.2745, 71.2784, 425087], [71.2784, 72.2824, 430026], [72.2824, 73.2863, 434464], [73.2863, 74.2902, 438169], [74.2902, 75.2941, 441835], [75.2941, 76.298, 447908], [76.298, 77.302, 454231], [77.302, 78.3059, 458792], [78.3059, 79.3098, 466910], [79.3098, 80.3137, 473704], [80.3137, 81.3176, 480384], [81.3176, 82.3216, 488587], [82.3216, 
83.3255, 493441], [83.3255, 84.3294, 499632], [84.3294, 85.3333, 506173], [85.3333, 86.3373, 508150], [86.3373, 87.3412, 514601], [87.3412, 88.3451, 520894], [88.3451, 89.349, 523951], [89.349, 90.3529, 532177], [90.3529, 91.3569, 539180], [91.3569, 92.3608, 544994], [92.3608, 93.3647, 550783], [93.3647, 94.3686, 556833], [94.3686, 95.3726, 563344], [95.3726, 96.3765, 570413], [96.3765, 97.3804, 578149], [97.3804, 98.3843, 585887], [98.3843, 99.3882, 592452], [99.3882, 100.3922, 599573], [100.3922, 101.3961, 606518], [101.3961, 102.4, 612282], [102.4, 103.4039, 620579], [103.4039, 104.4078, 627741], [104.4078, 105.4118, 635267], [105.4118, 106.4157, 643762], [106.4157, 107.4196, 651374], [107.4196, 108.4235, 658375], [108.4235, 109.4275, 664897], [109.4275, 110.4314, 672678], [110.4314, 111.4353, 680048], [111.4353, 112.4392, 688321], [112.4392, 113.4431, 694140], [113.4431, 114.4471, 702296], [114.4471, 115.451, 710582], [115.451, 116.4549, 715170], [116.4549, 117.4588, 724991], [117.4588, 118.4627, 733420], [118.4627, 119.4667, 741582], [119.4667, 120.4706, 747743], [120.4706, 121.4745, 752486], [121.4745, 122.4784, 759045], [122.4784, 123.4824, 767290], [123.4824, 124.4863, 773984], [124.4863, 125.4902, 779475], [125.4902, 126.4941, 786162], [126.4941, 127.498, 791886], [127.498, 128.502, 798187], [128.502, 129.5059, 807763], [129.5059, 130.5098, 812860], [130.5098, 131.5137, 820695], [131.5137, 132.5177, 830039], [132.5177, 133.5216, 837940], [133.5216, 134.5255, 844831], [134.5255, 135.5294, 853935], [135.5294, 136.5333, 862363], [136.5333, 137.5373, 872252], [137.5373, 138.5412, 875083], [138.5412, 139.5451, 879200], [139.5451, 140.549, 886720], [140.549, 141.5529, 894372], [141.5529, 142.5569, 898412], [142.5569, 143.5608, 907203], [143.5608, 144.5647, 914686], [144.5647, 145.5686, 921733], [145.5686, 146.5726, 930211], [146.5726, 147.5765, 942743], [147.5765, 148.5804, 950464], [148.5804, 149.5843, 964346], [149.5843, 150.5882, 975391], [150.5882, 151.5922, 
985204], [151.5922, 152.5961, 993241], [152.5961, 153.6, 1003272], [153.6, 154.6039, 1007841], [154.6039, 155.6078, 1014981], [155.6078, 156.6118, 1025516], [156.6118, 157.6157, 1038689], [157.6157, 158.6196, 1046387], [158.6196, 159.6235, 1055958], [159.6235, 160.6275, 1061917], [160.6275, 161.6314, 1069902], [161.6314, 162.6353, 1074383], [162.6353, 163.6392, 1077711], [163.6392, 164.6431, 1084480], [164.6431, 165.6471, 1092246], [165.6471, 166.651, 1098250], [166.651, 167.6549, 1105551], [167.6549, 168.6588, 1112735], [168.6588, 169.6628, 1118098], [169.6628, 170.6667, 1125523], [170.6667, 171.6706, 1133121], [171.6706, 172.6745, 1142512], [172.6745, 173.6784, 1148704], [173.6784, 174.6824, 1154061], [174.6824, 175.6863, 1163823], [175.6863, 176.6902, 1168052], [176.6902, 177.6941, 1171740], [177.6941, 178.698, 1176882], [178.698, 179.702, 1179433], [179.702, 180.7059, 1176290], [180.7059, 181.7098, 1170622], [181.7098, 182.7137, 1165328], [182.7137, 183.7177, 1156930], [183.7177, 184.7216, 1147879], [184.7216, 185.7255, 1137300], [185.7255, 186.7294, 1120497], [186.7294, 187.7333, 1106738], [187.7333, 188.7373, 1091219], [188.7373, 189.7412, 1070762], [189.7412, 190.7451, 1047710], [190.7451, 191.749, 1024671], [191.749, 192.7529, 999503], [192.7529, 193.7569, 969456], [193.7569, 194.7608, 933606], [194.7608, 195.7647, 898096], [195.7647, 196.7686, 857537], [196.7686, 197.7726, 821693], [197.7726, 198.7765, 775987], [198.7765, 199.7804, 729475], [199.7804, 200.7843, 682130], [200.7843, 201.7882, 640395], [201.7882, 202.7922, 601811], [202.7922, 203.7961, 568228], [203.7961, 204.8, 542387], [204.8, 205.8039, 506923], [205.8039, 206.8078, 477752], [206.8078, 207.8118, 453628], [207.8118, 208.8157, 431648], [208.8157, 209.8196, 412465], [209.8196, 210.8235, 392404], [210.8235, 211.8275, 373984], [211.8275, 212.8314, 352797], [212.8314, 213.8353, 331776], [213.8353, 214.8392, 314056], [214.8392, 215.8431, 299929], [215.8431, 216.8471, 282146], [216.8471, 217.851, 
263520], [217.851, 218.8549, 243529], [218.8549, 219.8588, 225895], [219.8588, 220.8627, 211073], [220.8627, 221.8667, 195363], [221.8667, 222.8706, 179534], [222.8706, 223.8745, 163521], [223.8745, 224.8784, 150222], [224.8784, 225.8824, 137280], [225.8824, 226.8863, 124883], [226.8863, 227.8902, 114227], [227.8902, 228.8941, 101407], [228.8941, 229.898, 89917], [229.898, 230.902, 79965], [230.902, 231.9059, 69839], [231.9059, 232.9098, 60326], [232.9098, 233.9137, 51425], [233.9137, 234.9176, 43914], [234.9176, 235.9216, 38433], [235.9216, 236.9255, 32395], [236.9255, 237.9294, 27010], [237.9294, 238.9333, 22764], [238.9333, 239.9373, 18860], [239.9373, 240.9412, 14844], [240.9412, 241.9451, 12068], [241.9451, 242.949, 9001], [242.949, 243.9529, 7004], [243.9529, 244.9569, 5834], [244.9569, 245.9608, 4248], [245.9608, 246.9647, 2930], [246.9647, 247.9686, 2306], [247.9686, 248.9725, 2106], [248.9725, 249.9765, 1872], [249.9765, 250.9804, 1876], [250.9804, 251.9843, 1856], [251.9843, 252.9882, 1814], [252.9882, 253.9922, 1858], [253.9922, 254.9961, 1955], [254.9961, 256.0, 9820]]}, "site-1": {"train": [[0.0, 1.0039, 10894028], [1.0039, 2.0078, 919649], [2.0078, 3.0118, 769280], [3.0118, 4.0157, 693827], [4.0157, 5.0196, 647480], [5.0196, 6.0235, 650549], [6.0235, 7.0275, 534844], [7.0275, 8.0314, 473089], [8.0314, 9.0353, 423891], [9.0353, 10.0392, 407681], [10.0392, 11.0431, 377949], [11.0431, 12.0471, 371994], [12.0471, 13.051, 337606], [13.051, 14.0549, 321416], [14.0549, 15.0588, 314833], [15.0588, 16.0627, 305874], [16.0627, 17.0667, 299324], [17.0667, 18.0706, 299362], [18.0706, 19.0745, 302235], [19.0745, 20.0784, 303270], [20.0784, 21.0824, 304850], [21.0824, 22.0863, 311243], [22.0863, 23.0902, 313018], [23.0902, 24.0941, 322185], [24.0941, 25.098, 323243], [25.098, 26.102, 333137], [26.102, 27.1059, 367039], [27.1059, 28.1098, 330758], [28.1098, 29.1137, 332182], [29.1137, 30.1176, 350388], [30.1176, 31.1216, 343288], [31.1216, 32.1255, 353006], 
[32.1255, 33.1294, 359859], [33.1294, 34.1333, 397470], [34.1333, 35.1373, 373270], [35.1373, 36.1412, 379933], [36.1412, 37.1451, 396555], [37.1451, 38.149, 398242], [38.149, 39.1529, 405559], [39.1529, 40.1569, 417184], [40.1569, 41.1608, 430439], [41.1608, 42.1647, 441752], [42.1647, 43.1686, 452952], [43.1686, 44.1726, 466667], [44.1726, 45.1765, 484097], [45.1765, 46.1804, 505844], [46.1804, 47.1843, 515014], [47.1843, 48.1882, 527903], [48.1882, 49.1922, 555451], [49.1922, 50.1961, 569528], [50.1961, 51.2, 586813], [51.2, 52.2039, 599347], [52.2039, 53.2078, 618684], [53.2078, 54.2118, 630950], [54.2118, 55.2157, 647808], [55.2157, 56.2196, 673090], [56.2196, 57.2235, 688245], [57.2235, 58.2275, 710112], [58.2275, 59.2314, 718399], [59.2314, 60.2353, 729100], [60.2353, 61.2392, 743367], [61.2392, 62.2431, 758390], [62.2431, 63.2471, 774620], [63.2471, 64.251, 790928], [64.251, 65.2549, 810389], [65.2549, 66.2588, 828657], [66.2588, 67.2627, 847662], [67.2627, 68.2667, 865040], [68.2667, 69.2706, 879144], [69.2706, 70.2745, 895969], [70.2745, 71.2784, 916437], [71.2784, 72.2824, 935822], [72.2824, 73.2863, 953892], [73.2863, 74.2902, 973171], [74.2902, 75.2941, 989698], [75.2941, 76.298, 1006427], [76.298, 77.302, 1024038], [77.302, 78.3059, 1039927], [78.3059, 79.3098, 1055270], [79.3098, 80.3137, 1069199], [80.3137, 81.3176, 1083336], [81.3176, 82.3216, 1097692], [82.3216, 83.3255, 1116214], [83.3255, 84.3294, 1129098], [84.3294, 85.3333, 1141204], [85.3333, 86.3373, 1155065], [86.3373, 87.3412, 1168550], [87.3412, 88.3451, 1185271], [88.3451, 89.349, 1199064], [89.349, 90.3529, 1214772], [90.3529, 91.3569, 1225242], [91.3569, 92.3608, 1239084], [92.3608, 93.3647, 1253750], [93.3647, 94.3686, 1269047], [94.3686, 95.3726, 1284168], [95.3726, 96.3765, 1299647], [96.3765, 97.3804, 1317955], [97.3804, 98.3843, 1335169], [98.3843, 99.3882, 1354612], [99.3882, 100.3922, 1379464], [100.3922, 101.3961, 1400813], [101.3961, 102.4, 1459381], [102.4, 103.4039, 
1437747], [103.4039, 104.4078, 1452027], [104.4078, 105.4118, 1472904], [105.4118, 106.4157, 1495774], [106.4157, 107.4196, 1517943], [107.4196, 108.4235, 1540170], [108.4235, 109.4275, 1567816], [109.4275, 110.4314, 1596016], [110.4314, 111.4353, 1623599], [111.4353, 112.4392, 1638923], [112.4392, 113.4431, 1648676], [113.4431, 114.4471, 1660864], [114.4471, 115.451, 1670206], [115.451, 116.4549, 1678346], [116.4549, 117.4588, 1689001], [117.4588, 118.4627, 1697572], [118.4627, 119.4667, 1709890], [119.4667, 120.4706, 1723120], [120.4706, 121.4745, 1733628], [121.4745, 122.4784, 1744632], [122.4784, 123.4824, 1758708], [123.4824, 124.4863, 1769412], [124.4863, 125.4902, 1780412], [125.4902, 126.4941, 1793815], [126.4941, 127.498, 1804310], [127.498, 128.502, 1816440], [128.502, 129.5059, 1826211], [129.5059, 130.5098, 1838735], [130.5098, 131.5137, 1851383], [131.5137, 132.5177, 1864726], [132.5177, 133.5216, 1876103], [133.5216, 134.5255, 1886905], [134.5255, 135.5294, 1901283], [135.5294, 136.5333, 1911711], [136.5333, 137.5373, 1918392], [137.5373, 138.5412, 1927938], [138.5412, 139.5451, 1936607], [139.5451, 140.549, 1944733], [140.549, 141.5529, 1949944], [141.5529, 142.5569, 1956728], [142.5569, 143.5608, 1964468], [143.5608, 144.5647, 1971692], [144.5647, 145.5686, 1978261], [145.5686, 146.5726, 1989150], [146.5726, 147.5765, 1994601], [147.5765, 148.5804, 1998874], [148.5804, 149.5843, 2002253], [149.5843, 150.5882, 2010981], [150.5882, 151.5922, 2019781], [151.5922, 152.5961, 2029319], [152.5961, 153.6, 2041442], [153.6, 154.6039, 2049317], [154.6039, 155.6078, 2052385], [155.6078, 156.6118, 2057924], [156.6118, 157.6157, 2063970], [157.6157, 158.6196, 2070129], [158.6196, 159.6235, 2073822], [159.6235, 160.6275, 2071202], [160.6275, 161.6314, 2074443], [161.6314, 162.6353, 2085518], [162.6353, 163.6392, 2084152], [163.6392, 164.6431, 2088635], [164.6431, 165.6471, 2087421], [165.6471, 166.651, 2089825], [166.651, 167.6549, 2091350], [167.6549, 168.6588, 
2098728], [168.6588, 169.6628, 2098083], [169.6628, 170.6667, 2103421], [170.6667, 171.6706, 2106742], [171.6706, 172.6745, 2108895], [172.6745, 173.6784, 2104367], [173.6784, 174.6824, 2097960], [174.6824, 175.6863, 2091572], [175.6863, 176.6902, 2083093], [176.6902, 177.6941, 2073152], [177.6941, 178.698, 2069031], [178.698, 179.702, 2049718], [179.702, 180.7059, 2040335], [180.7059, 181.7098, 2023287], [181.7098, 182.7137, 2022978], [182.7137, 183.7177, 2010025], [183.7177, 184.7216, 1994103], [184.7216, 185.7255, 1965720], [185.7255, 186.7294, 1961279], [186.7294, 187.7333, 1943250], [187.7333, 188.7373, 1936609], [188.7373, 189.7412, 1916980], [189.7412, 190.7451, 1913374], [190.7451, 191.749, 1893539], [191.749, 192.7529, 1882353], [192.7529, 193.7569, 1864142], [193.7569, 194.7608, 1855682], [194.7608, 195.7647, 1847667], [195.7647, 196.7686, 1853185], [196.7686, 197.7726, 1846513], [197.7726, 198.7765, 1826374], [198.7765, 199.7804, 1805915], [199.7804, 200.7843, 1777044], [200.7843, 201.7882, 1775611], [201.7882, 202.7922, 1738088], [202.7922, 203.7961, 1716504], [203.7961, 204.8, 1694891], [204.8, 205.8039, 1667268], [205.8039, 206.8078, 1643096], [206.8078, 207.8118, 1613994], [207.8118, 208.8157, 1578087], [208.8157, 209.8196, 1534606], [209.8196, 210.8235, 1495973], [210.8235, 211.8275, 1449118], [211.8275, 212.8314, 1399671], [212.8314, 213.8353, 1339223], [213.8353, 214.8392, 1287605], [214.8392, 215.8431, 1223187], [215.8431, 216.8471, 1174973], [216.8471, 217.851, 1131599], [217.851, 218.8549, 1090599], [218.8549, 219.8588, 1044891], [219.8588, 220.8627, 1008683], [220.8627, 221.8667, 974052], [221.8667, 222.8706, 944405], [222.8706, 223.8745, 918074], [223.8745, 224.8784, 899155], [224.8784, 225.8824, 878302], [225.8824, 226.8863, 844524], [226.8863, 227.8902, 813378], [227.8902, 228.8941, 787275], [228.8941, 229.898, 764613], [229.898, 230.902, 739253], [230.902, 231.9059, 712397], [231.9059, 232.9098, 696324], [232.9098, 233.9137, 676432], 
[233.9137, 234.9176, 668208], [234.9176, 235.9216, 651782], [235.9216, 236.9255, 633075], [236.9255, 237.9294, 624123], [237.9294, 238.9333, 618392], [238.9333, 239.9373, 612616], [239.9373, 240.9412, 602796], [240.9412, 241.9451, 603908], [241.9451, 242.949, 612502], [242.949, 243.9529, 616100], [243.9529, 244.9569, 616756], [244.9569, 245.9608, 608695], [245.9608, 246.9647, 612086], [246.9647, 247.9686, 615833], [247.9686, 248.9725, 612640], [248.9725, 249.9765, 611049], [249.9765, 250.9804, 643359], [250.9804, 251.9843, 705574], [251.9843, 252.9882, 716770], [252.9882, 253.9922, 679330], [253.9922, 254.9961, 601071], [254.9961, 256.0, 1052689]]}, "site-2": {"train": [[0.0, 1.0039, 15941521], [1.0039, 2.0078, 3409289], [2.0078, 3.0118, 3292634], [3.0118, 4.0157, 2963028], [4.0157, 5.0196, 2697815], [5.0196, 6.0235, 2472349], [6.0235, 7.0275, 2279512], [7.0275, 8.0314, 2005163], [8.0314, 9.0353, 1802259], [9.0353, 10.0392, 1628386], [10.0392, 11.0431, 1567570], [11.0431, 12.0471, 1431692], [12.0471, 13.051, 1392798], [13.051, 14.0549, 1269837], [14.0549, 15.0588, 1193088], [15.0588, 16.0627, 1130059], [16.0627, 17.0667, 1092008], [17.0667, 18.0706, 1015100], [18.0706, 19.0745, 983700], [19.0745, 20.0784, 960282], [20.0784, 21.0824, 909431], [21.0824, 22.0863, 866762], [22.0863, 23.0902, 836220], [23.0902, 24.0941, 823204], [24.0941, 25.098, 815079], [25.098, 26.102, 816740], [26.102, 27.1059, 816584], [27.1059, 28.1098, 810720], [28.1098, 29.1137, 815188], [29.1137, 30.1176, 817414], [30.1176, 31.1216, 825910], [31.1216, 32.1255, 834258], [32.1255, 33.1294, 842404], [33.1294, 34.1333, 847901], [34.1333, 35.1373, 855877], [35.1373, 36.1412, 865681], [36.1412, 37.1451, 877634], [37.1451, 38.149, 891457], [38.149, 39.1529, 906151], [39.1529, 40.1569, 920792], [40.1569, 41.1608, 937021], [41.1608, 42.1647, 951070], [42.1647, 43.1686, 969154], [43.1686, 44.1726, 986948], [44.1726, 45.1765, 1005515], [45.1765, 46.1804, 1027579], [46.1804, 47.1843, 1048011], [47.1843, 
48.1882, 1065838], [48.1882, 49.1922, 1089849], [49.1922, 50.1961, 1112060], [50.1961, 51.2, 1132509], [51.2, 52.2039, 1154043], [52.2039, 53.2078, 1177892], [53.2078, 54.2118, 1200277], [54.2118, 55.2157, 1224671], [55.2157, 56.2196, 1246311], [56.2196, 57.2235, 1273277], [57.2235, 58.2275, 1298370], [58.2275, 59.2314, 1320641], [59.2314, 60.2353, 1343924], [60.2353, 61.2392, 1368803], [61.2392, 62.2431, 1398404], [62.2431, 63.2471, 1418798], [63.2471, 64.251, 1447717], [64.251, 65.2549, 1478809], [65.2549, 66.2588, 1509546], [66.2588, 67.2627, 1544021], [67.2627, 68.2667, 1576975], [68.2667, 69.2706, 1614871], [69.2706, 70.2745, 1647199], [70.2745, 71.2784, 1684707], [71.2784, 72.2824, 1721295], [72.2824, 73.2863, 1754361], [73.2863, 74.2902, 1791169], [74.2902, 75.2941, 1827869], [75.2941, 76.298, 1867136], [76.298, 77.302, 1909117], [77.302, 78.3059, 1948501], [78.3059, 79.3098, 1990679], [79.3098, 80.3137, 2026898], [80.3137, 81.3176, 2064295], [81.3176, 82.3216, 2100536], [82.3216, 83.3255, 2134824], [83.3255, 84.3294, 2173686], [84.3294, 85.3333, 2215018], [85.3333, 86.3373, 2250716], [86.3373, 87.3412, 2289487], [87.3412, 88.3451, 2331344], [88.3451, 89.349, 2370562], [89.349, 90.3529, 2404724], [90.3529, 91.3569, 2446641], [91.3569, 92.3608, 2481625], [92.3608, 93.3647, 2516636], [93.3647, 94.3686, 2557334], [94.3686, 95.3726, 2598710], [95.3726, 96.3765, 2638704], [96.3765, 97.3804, 2670602], [97.3804, 98.3843, 2696413], [98.3843, 99.3882, 2721216], [99.3882, 100.3922, 2749841], [100.3922, 101.3961, 2779287], [101.3961, 102.4, 2806616], [102.4, 103.4039, 2835561], [103.4039, 104.4078, 2863948], [104.4078, 105.4118, 2883549], [105.4118, 106.4157, 2905702], [106.4157, 107.4196, 2932802], [107.4196, 108.4235, 2954641], [108.4235, 109.4275, 2982323], [109.4275, 110.4314, 3004721], [110.4314, 111.4353, 3028188], [111.4353, 112.4392, 3048302], [112.4392, 113.4431, 3060261], [113.4431, 114.4471, 3075460], [114.4471, 115.451, 3091074], [115.451, 116.4549, 
3101385], [116.4549, 117.4588, 3108729], [117.4588, 118.4627, 3123015], [118.4627, 119.4667, 3126990], [119.4667, 120.4706, 3134952], [120.4706, 121.4745, 3147843], [121.4745, 122.4784, 3156261], [122.4784, 123.4824, 3163852], [123.4824, 124.4863, 3167825], [124.4863, 125.4902, 3168960], [125.4902, 126.4941, 3161481], [126.4941, 127.498, 3164858], [127.498, 128.502, 3173046], [128.502, 129.5059, 3182439], [129.5059, 130.5098, 3198768], [130.5098, 131.5137, 3196157], [131.5137, 132.5177, 3196652], [132.5177, 133.5216, 3197763], [133.5216, 134.5255, 3199729], [134.5255, 135.5294, 3205524], [135.5294, 136.5333, 3211286], [136.5333, 137.5373, 3213368], [137.5373, 138.5412, 3211412], [138.5412, 139.5451, 3206645], [139.5451, 140.549, 3209193], [140.549, 141.5529, 3214692], [141.5529, 142.5569, 3211823], [142.5569, 143.5608, 3212346], [143.5608, 144.5647, 3208845], [144.5647, 145.5686, 3204488], [145.5686, 146.5726, 3199919], [146.5726, 147.5765, 3189501], [147.5765, 148.5804, 3179552], [148.5804, 149.5843, 3166138], [149.5843, 150.5882, 3153095], [150.5882, 151.5922, 3142765], [151.5922, 152.5961, 3139218], [152.5961, 153.6, 3130360], [153.6, 154.6039, 3134121], [154.6039, 155.6078, 3134447], [155.6078, 156.6118, 3139450], [156.6118, 157.6157, 3145100], [157.6157, 158.6196, 3147758], [158.6196, 159.6235, 3146090], [159.6235, 160.6275, 3138656], [160.6275, 161.6314, 3131494], [161.6314, 162.6353, 3119024], [162.6353, 163.6392, 3111654], [163.6392, 164.6431, 3106116], [164.6431, 165.6471, 3098675], [165.6471, 166.651, 3093899], [166.651, 167.6549, 3085426], [167.6549, 168.6588, 3070005], [168.6588, 169.6628, 3067126], [169.6628, 170.6667, 3070429], [170.6667, 171.6706, 3076652], [171.6706, 172.6745, 3076603], [172.6745, 173.6784, 3063677], [173.6784, 174.6824, 3054089], [174.6824, 175.6863, 3046927], [175.6863, 176.6902, 3030450], [176.6902, 177.6941, 3007018], [177.6941, 178.698, 2980311], [178.698, 179.702, 2962652], [179.702, 180.7059, 2946745], [180.7059, 181.7098, 
2924491], [181.7098, 182.7137, 2896582], [182.7137, 183.7177, 2869170], [183.7177, 184.7216, 2843862], [184.7216, 185.7255, 2819849], [185.7255, 186.7294, 2795058], [186.7294, 187.7333, 2777121], [187.7333, 188.7373, 2751346], [188.7373, 189.7412, 2737059], [189.7412, 190.7451, 2708849], [190.7451, 191.749, 2669175], [191.749, 192.7529, 2631722], [192.7529, 193.7569, 2609137], [193.7569, 194.7608, 2588888], [194.7608, 195.7647, 2556796], [195.7647, 196.7686, 2524864], [196.7686, 197.7726, 2489195], [197.7726, 198.7765, 2446870], [198.7765, 199.7804, 2408229], [199.7804, 200.7843, 2377414], [200.7843, 201.7882, 2343879], [201.7882, 202.7922, 2302332], [202.7922, 203.7961, 2265841], [203.7961, 204.8, 2239418], [204.8, 205.8039, 2220973], [205.8039, 206.8078, 2184959], [206.8078, 207.8118, 2150144], [207.8118, 208.8157, 2114186], [208.8157, 209.8196, 2078547], [209.8196, 210.8235, 2049835], [210.8235, 211.8275, 1997943], [211.8275, 212.8314, 1943236], [212.8314, 213.8353, 1899057], [213.8353, 214.8392, 1854857], [214.8392, 215.8431, 1814772], [215.8431, 216.8471, 1769082], [216.8471, 217.851, 1731006], [217.851, 218.8549, 1697932], [218.8549, 219.8588, 1659068], [219.8588, 220.8627, 1627294], [220.8627, 221.8667, 1598942], [221.8667, 222.8706, 1552328], [222.8706, 223.8745, 1493287], [223.8745, 224.8784, 1443311], [224.8784, 225.8824, 1398265], [225.8824, 226.8863, 1350080], [226.8863, 227.8902, 1291945], [227.8902, 228.8941, 1235278], [228.8941, 229.898, 1182931], [229.898, 230.902, 1135305], [230.902, 231.9059, 1083947], [231.9059, 232.9098, 1043830], [232.9098, 233.9137, 989887], [233.9137, 234.9176, 921426], [234.9176, 235.9216, 856490], [235.9216, 236.9255, 794486], [236.9255, 237.9294, 732566], [237.9294, 238.9333, 667414], [238.9333, 239.9373, 583868], [239.9373, 240.9412, 492933], [240.9412, 241.9451, 411886], [241.9451, 242.949, 340969], [242.949, 243.9529, 281630], [243.9529, 244.9569, 232361], [244.9569, 245.9608, 190872], [245.9608, 246.9647, 152602], 
[246.9647, 247.9686, 126933], [247.9686, 248.9725, 100975], [248.9725, 249.9765, 75090], [249.9765, 250.9804, 54275], [250.9804, 251.9843, 35979], [251.9843, 252.9882, 29189], [252.9882, 253.9922, 38216], [253.9922, 254.9961, 18398], [254.9961, 256.0, 24730]]}, "site-3": {"train": [[0.0, 1.0039, 35420536], [1.0039, 2.0078, 3434579], [2.0078, 3.0118, 3341431], [3.0118, 4.0157, 3092450], [4.0157, 5.0196, 3042051], [5.0196, 6.0235, 2757745], [6.0235, 7.0275, 2490254], [7.0275, 8.0314, 2411643], [8.0314, 9.0353, 2285068], [9.0353, 10.0392, 2156599], [10.0392, 11.0431, 2074827], [11.0431, 12.0471, 2118880], [12.0471, 13.051, 2024748], [13.051, 14.0549, 1964664], [14.0549, 15.0588, 1877950], [15.0588, 16.0627, 1895079], [16.0627, 17.0667, 1777361], [17.0667, 18.0706, 1680089], [18.0706, 19.0745, 1668241], [19.0745, 20.0784, 1609927], [20.0784, 21.0824, 1539886], [21.0824, 22.0863, 1526719], [22.0863, 23.0902, 1444669], [23.0902, 24.0941, 1446578], [24.0941, 25.098, 1415474], [25.098, 26.102, 1430897], [26.102, 27.1059, 1446241], [27.1059, 28.1098, 1467478], [28.1098, 29.1137, 1493183], [29.1137, 30.1176, 1537432], [30.1176, 31.1216, 1586366], [31.1216, 32.1255, 1630488], [32.1255, 33.1294, 1677620], [33.1294, 34.1333, 1716854], [34.1333, 35.1373, 1766773], [35.1373, 36.1412, 1817857], [36.1412, 37.1451, 1867103], [37.1451, 38.149, 1918587], [38.149, 39.1529, 1969687], [39.1529, 40.1569, 2017640], [40.1569, 41.1608, 2067479], [41.1608, 42.1647, 2120295], [42.1647, 43.1686, 2173831], [43.1686, 44.1726, 2225973], [44.1726, 45.1765, 2277126], [45.1765, 46.1804, 2324888], [46.1804, 47.1843, 2375966], [47.1843, 48.1882, 2422688], [48.1882, 49.1922, 2470304], [49.1922, 50.1961, 2515550], [50.1961, 51.2, 2562232], [51.2, 52.2039, 2604402], [52.2039, 53.2078, 2646158], [53.2078, 54.2118, 2685611], [54.2118, 55.2157, 2725405], [55.2157, 56.2196, 2769219], [56.2196, 57.2235, 2801003], [57.2235, 58.2275, 2836516], [58.2275, 59.2314, 2868959], [59.2314, 60.2353, 2899569], [60.2353, 
61.2392, 2928114], [61.2392, 62.2431, 2956648], [62.2431, 63.2471, 2985628], [63.2471, 64.251, 3014738], [64.251, 65.2549, 3050418], [65.2549, 66.2588, 3082767], [66.2588, 67.2627, 3113070], [67.2627, 68.2667, 3144182], [68.2667, 69.2706, 3171032], [69.2706, 70.2745, 3197708], [70.2745, 71.2784, 3224877], [71.2784, 72.2824, 3254809], [72.2824, 73.2863, 3282706], [73.2863, 74.2902, 3313171], [74.2902, 75.2941, 3340328], [75.2941, 76.298, 3369138], [76.298, 77.302, 3393056], [77.302, 78.3059, 3420210], [78.3059, 79.3098, 3446986], [79.3098, 80.3137, 3469169], [80.3137, 81.3176, 3490349], [81.3176, 82.3216, 3511811], [82.3216, 83.3255, 3529029], [83.3255, 84.3294, 3551042], [84.3294, 85.3333, 3575271], [85.3333, 86.3373, 3600291], [86.3373, 87.3412, 3623519], [87.3412, 88.3451, 3648167], [88.3451, 89.349, 3666903], [89.349, 90.3529, 3684651], [90.3529, 91.3569, 3703091], [91.3569, 92.3608, 3718889], [92.3608, 93.3647, 3736978], [93.3647, 94.3686, 3759958], [94.3686, 95.3726, 3779436], [95.3726, 96.3765, 3797707], [96.3765, 97.3804, 3819111], [97.3804, 98.3843, 3828612], [98.3843, 99.3882, 3847381], [99.3882, 100.3922, 3866771], [100.3922, 101.3961, 3879339], [101.3961, 102.4, 3902031], [102.4, 103.4039, 3929528], [103.4039, 104.4078, 3948769], [104.4078, 105.4118, 3968417], [105.4118, 106.4157, 3992482], [106.4157, 107.4196, 4016228], [107.4196, 108.4235, 4042653], [108.4235, 109.4275, 4063340], [109.4275, 110.4314, 4087918], [110.4314, 111.4353, 4109329], [111.4353, 112.4392, 4138338], [112.4392, 113.4431, 4162863], [113.4431, 114.4471, 4189909], [114.4471, 115.451, 4215519], [115.451, 116.4549, 4247141], [116.4549, 117.4588, 4273944], [117.4588, 118.4627, 4311672], [118.4627, 119.4667, 4338825], [119.4667, 120.4706, 4370836], [120.4706, 121.4745, 4402997], [121.4745, 122.4784, 4431116], [122.4784, 123.4824, 4460083], [123.4824, 124.4863, 4482474], [124.4863, 125.4902, 4504318], [125.4902, 126.4941, 4519438], [126.4941, 127.498, 4550325], [127.498, 128.502, 4580158], 
[128.502, 129.5059, 4618214], [129.5059, 130.5098, 4659317], [130.5098, 131.5137, 4686527], [131.5137, 132.5177, 4721523], [132.5177, 133.5216, 4744639], [133.5216, 134.5255, 4762751], [134.5255, 135.5294, 4779756], [135.5294, 136.5333, 4801818], [136.5333, 137.5373, 4821757], [137.5373, 138.5412, 4834936], [138.5412, 139.5451, 4845614], [139.5451, 140.549, 4873763], [140.549, 141.5529, 4898921], [141.5529, 142.5569, 4937001], [142.5569, 143.5608, 4973016], [143.5608, 144.5647, 4993681], [144.5647, 145.5686, 5002594], [145.5686, 146.5726, 4999512], [146.5726, 147.5765, 4994651], [147.5765, 148.5804, 4987965], [148.5804, 149.5843, 4967454], [149.5843, 150.5882, 4942949], [150.5882, 151.5922, 4920523], [151.5922, 152.5961, 4896259], [152.5961, 153.6, 4882397], [153.6, 154.6039, 4869893], [154.6039, 155.6078, 4869395], [155.6078, 156.6118, 4868932], [156.6118, 157.6157, 4866115], [157.6157, 158.6196, 4861393], [158.6196, 159.6235, 4858587], [159.6235, 160.6275, 4849058], [160.6275, 161.6314, 4848421], [161.6314, 162.6353, 4847538], [162.6353, 163.6392, 4853239], [163.6392, 164.6431, 4852638], [164.6431, 165.6471, 4858857], [165.6471, 166.651, 4851576], [166.651, 167.6549, 4844393], [167.6549, 168.6588, 4825204], [168.6588, 169.6628, 4824452], [169.6628, 170.6667, 4832095], [170.6667, 171.6706, 4845295], [171.6706, 172.6745, 4852993], [172.6745, 173.6784, 4852364], [173.6784, 174.6824, 4847783], [174.6824, 175.6863, 4843062], [175.6863, 176.6902, 4833583], [176.6902, 177.6941, 4825035], [177.6941, 178.698, 4813977], [178.698, 179.702, 4809204], [179.702, 180.7059, 4803096], [180.7059, 181.7098, 4794134], [181.7098, 182.7137, 4774045], [182.7137, 183.7177, 4758050], [183.7177, 184.7216, 4748434], [184.7216, 185.7255, 4742331], [185.7255, 186.7294, 4734879], [186.7294, 187.7333, 4727653], [187.7333, 188.7373, 4723189], [188.7373, 189.7412, 4716853], [189.7412, 190.7451, 4695524], [190.7451, 191.749, 4673050], [191.749, 192.7529, 4652493], [192.7529, 193.7569, 4640657], 
[193.7569, 194.7608, 4630755], [194.7608, 195.7647, 4621774], [195.7647, 196.7686, 4592518], [196.7686, 197.7726, 4554421], [197.7726, 198.7765, 4528396], [198.7765, 199.7804, 4495683], [199.7804, 200.7843, 4456919], [200.7843, 201.7882, 4421014], [201.7882, 202.7922, 4385719], [202.7922, 203.7961, 4354884], [203.7961, 204.8, 4337416], [204.8, 205.8039, 4320880], [205.8039, 206.8078, 4297270], [206.8078, 207.8118, 4265592], [207.8118, 208.8157, 4221109], [208.8157, 209.8196, 4184597], [209.8196, 210.8235, 4145936], [210.8235, 211.8275, 4102365], [211.8275, 212.8314, 4054169], [212.8314, 213.8353, 4014081], [213.8353, 214.8392, 3980732], [214.8392, 215.8431, 3950586], [215.8431, 216.8471, 3909584], [216.8471, 217.851, 3864430], [217.851, 218.8549, 3813607], [218.8549, 219.8588, 3776123], [219.8588, 220.8627, 3735486], [220.8627, 221.8667, 3706293], [221.8667, 222.8706, 3661984], [222.8706, 223.8745, 3606364], [223.8745, 224.8784, 3544293], [224.8784, 225.8824, 3477928], [225.8824, 226.8863, 3406908], [226.8863, 227.8902, 3332751], [227.8902, 228.8941, 3252660], [228.8941, 229.898, 3161730], [229.898, 230.902, 3062149], [230.902, 231.9059, 2953007], [231.9059, 232.9098, 2862108], [232.9098, 233.9137, 2760883], [233.9137, 234.9176, 2633718], [234.9176, 235.9216, 2503331], [235.9216, 236.9255, 2347287], [236.9255, 237.9294, 2194271], [237.9294, 238.9333, 2016997], [238.9333, 239.9373, 1808115], [239.9373, 240.9412, 1593507], [240.9412, 241.9451, 1395255], [241.9451, 242.949, 1210900], [242.949, 243.9529, 1038748], [243.9529, 244.9569, 875010], [244.9569, 245.9608, 714476], [245.9608, 246.9647, 562953], [246.9647, 247.9686, 431427], [247.9686, 248.9725, 326175], [248.9725, 249.9765, 231626], [249.9765, 250.9804, 155845], [250.9804, 251.9843, 104933], [251.9843, 252.9882, 72649], [252.9882, 253.9922, 47831], [253.9922, 254.9961, 34488], [254.9961, 256.0, 100798]]}, "Global": {"train": [[0.0, 1.0039, 69786029], [1.0039, 2.0078, 8188625], [2.0078, 3.0118, 7784081], 
[3.0118, 4.0157, 7084007], [4.0157, 5.0196, 6715208], [5.0196, 6.0235, 6190920], [6.0235, 7.0275, 5617900], [7.0275, 8.0314, 5203549], [8.0314, 9.0353, 4819023], [9.0353, 10.0392, 4498022], [10.0392, 11.0431, 4316214], [11.0431, 12.0471, 4201633], [12.0471, 13.051, 4025472], [13.051, 14.0549, 3816331], [14.0549, 15.0588, 3648732], [15.0588, 16.0627, 3595752], [16.0627, 17.0667, 3432452], [17.0667, 18.0706, 3261391], [18.0706, 19.0745, 3223769], [19.0745, 20.0784, 3138509], [20.0784, 21.0824, 3014583], [21.0824, 22.0863, 2963871], [22.0863, 23.0902, 2851704], [23.0902, 24.0941, 2851531], [24.0941, 25.098, 2813641], [25.098, 26.102, 2837755], [26.102, 27.1059, 2882453], [27.1059, 28.1098, 2852997], [28.1098, 29.1137, 2877331], [29.1137, 30.1176, 2940764], [30.1176, 31.1216, 2989451], [31.1216, 32.1255, 3050442], [32.1255, 33.1294, 3110379], [33.1294, 34.1333, 3190169], [34.1333, 35.1373, 3221438], [35.1373, 36.1412, 3289931], [36.1412, 37.1451, 3369534], [37.1451, 38.149, 3438731], [38.149, 39.1529, 3515442], [39.1529, 40.1569, 3594400], [40.1569, 41.1608, 3678649], [41.1608, 42.1647, 3764646], [42.1647, 43.1686, 3854712], [43.1686, 44.1726, 3943781], [44.1726, 45.1765, 4038415], [45.1765, 46.1804, 4136169], [46.1804, 47.1843, 4226284], [47.1843, 48.1882, 4312247], [48.1882, 49.1922, 4419674], [49.1922, 50.1961, 4510112], [50.1961, 51.2, 4604814], [51.2, 52.2039, 4686623], [52.2039, 53.2078, 4778060], [53.2078, 54.2118, 4860586], [54.2118, 55.2157, 4949044], [55.2157, 56.2196, 5045016], [56.2196, 57.2235, 5125709], [57.2235, 58.2275, 5214792], [58.2275, 59.2314, 5281594], [59.2314, 60.2353, 5351470], [60.2353, 61.2392, 5424900], [61.2392, 62.2431, 5504739], [62.2431, 63.2471, 5575421], [63.2471, 64.251, 5651442], [64.251, 65.2549, 5741252], [65.2549, 66.2588, 5825816], [66.2588, 67.2627, 5914027], [67.2627, 68.2667, 5999701], [68.2667, 69.2706, 6081756], [69.2706, 70.2745, 6163780], [70.2745, 71.2784, 6251108], [71.2784, 72.2824, 6341952], [72.2824, 73.2863, 
6425423], [73.2863, 74.2902, 6515680], [74.2902, 75.2941, 6599730], [75.2941, 76.298, 6690609], [76.298, 77.302, 6780442], [77.302, 78.3059, 6867430], [78.3059, 79.3098, 6959845], [79.3098, 80.3137, 7038970], [80.3137, 81.3176, 7118364], [81.3176, 82.3216, 7198626], [82.3216, 83.3255, 7273508], [83.3255, 84.3294, 7353458], [84.3294, 85.3333, 7437666], [85.3333, 86.3373, 7514222], [86.3373, 87.3412, 7596157], [87.3412, 88.3451, 7685676], [88.3451, 89.349, 7760480], [89.349, 90.3529, 7836324], [90.3529, 91.3569, 7914154], [91.3569, 92.3608, 7984592], [92.3608, 93.3647, 8058147], [93.3647, 94.3686, 8143172], [94.3686, 95.3726, 8225658], [95.3726, 96.3765, 8306471], [96.3765, 97.3804, 8385817], [97.3804, 98.3843, 8446081], [98.3843, 99.3882, 8515661], [99.3882, 100.3922, 8595649], [100.3922, 101.3961, 8665957], [101.3961, 102.4, 8780310], [102.4, 103.4039, 8823415], [103.4039, 104.4078, 8892485], [104.4078, 105.4118, 8960137], [105.4118, 106.4157, 9037720], [106.4157, 107.4196, 9118347], [107.4196, 108.4235, 9195839], [108.4235, 109.4275, 9278376], [109.4275, 110.4314, 9361333], [110.4314, 111.4353, 9441164], [111.4353, 112.4392, 9513884], [112.4392, 113.4431, 9565940], [113.4431, 114.4471, 9628529], [114.4471, 115.451, 9687381], [115.451, 116.4549, 9742042], [116.4549, 117.4588, 9796665], [117.4588, 118.4627, 9865679], [118.4627, 119.4667, 9917287], [119.4667, 120.4706, 9976651], [120.4706, 121.4745, 10036954], [121.4745, 122.4784, 10091054], [122.4784, 123.4824, 10149933], [123.4824, 124.4863, 10193695], [124.4863, 125.4902, 10233165], [125.4902, 126.4941, 10260896], [126.4941, 127.498, 10311379], [127.498, 128.502, 10367831], [128.502, 129.5059, 10434627], [129.5059, 130.5098, 10509680], [130.5098, 131.5137, 10554762], [131.5137, 132.5177, 10612940], [132.5177, 133.5216, 10656445], [133.5216, 134.5255, 10694216], [134.5255, 135.5294, 10740498], [135.5294, 136.5333, 10787178], [136.5333, 137.5373, 10825769], [137.5373, 138.5412, 10849369], [138.5412, 139.5451, 
10868066], [139.5451, 140.549, 10914409], [140.549, 141.5529, 10957929], [141.5529, 142.5569, 11003964], [142.5569, 143.5608, 11057033], [143.5608, 144.5647, 11088904], [144.5647, 145.5686, 11107076], [145.5686, 146.5726, 11118792], [146.5726, 147.5765, 11121496], [147.5765, 148.5804, 11116855], [148.5804, 149.5843, 11100191], [149.5843, 150.5882, 11082416], [150.5882, 151.5922, 11068273], [151.5922, 152.5961, 11058037], [152.5961, 153.6, 11057471], [153.6, 154.6039, 11061172], [154.6039, 155.6078, 11071208], [155.6078, 156.6118, 11091822], [156.6118, 157.6157, 11113874], [157.6157, 158.6196, 11125667], [158.6196, 159.6235, 11134457], [159.6235, 160.6275, 11120833], [160.6275, 161.6314, 11124260], [161.6314, 162.6353, 11126463], [162.6353, 163.6392, 11126756], [163.6392, 164.6431, 11131869], [164.6431, 165.6471, 11137199], [165.6471, 166.651, 11133550], [166.651, 167.6549, 11126720], [167.6549, 168.6588, 11106672], [168.6588, 169.6628, 11107759], [169.6628, 170.6667, 11131468], [170.6667, 171.6706, 11161810], [171.6706, 172.6745, 11181003], [172.6745, 173.6784, 11169112], [173.6784, 174.6824, 11153893], [174.6824, 175.6863, 11145384], [175.6863, 176.6902, 11115178], [176.6902, 177.6941, 11076945], [177.6941, 178.698, 11040201], [178.698, 179.702, 11001007], [179.702, 180.7059, 10966466], [180.7059, 181.7098, 10912534], [181.7098, 182.7137, 10858933], [182.7137, 183.7177, 10794175], [183.7177, 184.7216, 10734278], [184.7216, 185.7255, 10665200], [185.7255, 186.7294, 10611713], [186.7294, 187.7333, 10554762], [187.7333, 188.7373, 10502363], [188.7373, 189.7412, 10441654], [189.7412, 190.7451, 10365457], [190.7451, 191.749, 10260435], [191.749, 192.7529, 10166071], [192.7529, 193.7569, 10083392], [193.7569, 194.7608, 10008931], [194.7608, 195.7647, 9924333], [195.7647, 196.7686, 9828104], [196.7686, 197.7726, 9711822], [197.7726, 198.7765, 9577627], [198.7765, 199.7804, 9439302], [199.7804, 200.7843, 9293507], [200.7843, 201.7882, 9180899], [201.7882, 202.7922, 
9027950], [202.7922, 203.7961, 8905457], [203.7961, 204.8, 8814112], [204.8, 205.8039, 8716044], [205.8039, 206.8078, 8603077], [206.8078, 207.8118, 8483358], [207.8118, 208.8157, 8345030], [208.8157, 209.8196, 8210215], [209.8196, 210.8235, 8084148], [210.8235, 211.8275, 7923410], [211.8275, 212.8314, 7749873], [212.8314, 213.8353, 7584137], [213.8353, 214.8392, 7437250], [214.8392, 215.8431, 7288474], [215.8431, 216.8471, 7135785], [216.8471, 217.851, 6990555], [217.851, 218.8549, 6845667], [218.8549, 219.8588, 6705977], [219.8588, 220.8627, 6582536], [220.8627, 221.8667, 6474650], [221.8667, 222.8706, 6338251], [222.8706, 223.8745, 6181246], [223.8745, 224.8784, 6036981], [224.8784, 225.8824, 5891775], [225.8824, 226.8863, 5726395], [226.8863, 227.8902, 5552301], [227.8902, 228.8941, 5376620], [228.8941, 229.898, 5199191], [229.898, 230.902, 5016672], [230.902, 231.9059, 4819190], [231.9059, 232.9098, 4662588], [232.9098, 233.9137, 4478627], [233.9137, 234.9176, 4267266], [234.9176, 235.9216, 4050036], [235.9216, 236.9255, 3807243], [236.9255, 237.9294, 3577970], [237.9294, 238.9333, 3325567], [238.9333, 239.9373, 3023459], [239.9373, 240.9412, 2704080], [240.9412, 241.9451, 2423117], [241.9451, 242.949, 2173372], [242.949, 243.9529, 1943482], [243.9529, 244.9569, 1729961], [244.9569, 245.9608, 1518291], [245.9608, 246.9647, 1330571], [246.9647, 247.9686, 1176499], [247.9686, 248.9725, 1041896], [248.9725, 249.9765, 919637], [249.9765, 250.9804, 855355], [250.9804, 251.9843, 848342], [251.9843, 252.9882, 820422], [252.9882, 253.9922, 767235], [253.9922, 254.9961, 655912], [254.9961, 256.0, 1188037]]}}}} \ No newline at end of file diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.1_federated_statistics/federated_statistics_with_image_data/code/image_stats/demo/visualization.ipynb 
b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.1_federated_statistics/federated_statistics_with_image_data/code/image_stats/demo/visualization.ipynb deleted file mode 100644 index 71f933fb78..0000000000 --- a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.1_federated_statistics/federated_statistics_with_image_data/code/image_stats/demo/visualization.ipynb +++ /dev/null @@ -1,440 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "3f851980", - "metadata": {}, - "source": [ - "# NVFLARE Federated Statistics Visualization" - ] - }, - { - "cell_type": "markdown", - "id": "1aea2083", - "metadata": {}, - "source": [ - "#### Dependencies\n", - "\n", - "To run this example, you need to install the following dependencies:\n", - "* monai[itk]\n", - "* numpy\n", - "* pandas\n", - "* kaleido\n", - "* matplotlib\n", - "* jupyter\n", - "* notebook\n", - "\n", - "These are captured in the requirements.txt\n" - ] - }, - { - "cell_type": "markdown", - "id": "0b71dd55", - "metadata": {}, - "source": [ - "## Image Statistics Visualization\n", - "In this example, we demonstate how to visualize the results from the statistics of image data. The visualization requires json, pandas, matplotlib modules as well as nvflare visualization utlities. " - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "85f23acf", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "\n", - "import json\n", - "import pandas as pd\n", - "from nvflare.app_opt.statistics.visualization.statistics_visualization import Visualization" - ] - }, - { - "cell_type": "markdown", - "id": "151e23a8", - "metadata": {}, - "source": [ - "First, copy the resulting json file to demo directory. In this example, resulting file is called image_statistics.json. 
Then load json file\n" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "44f6bed2", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "with open('image_statistics.json', 'r') as f:\n", - " data = json.load(f)" - ] - }, - { - "cell_type": "markdown", - "id": "c4b83ddb", - "metadata": {}, - "source": [ - "Initialize the Visualization utilities\n" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "ab771712", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "vis = Visualization()\n" - ] - }, - { - "cell_type": "markdown", - "id": "49f976aa", - "metadata": {}, - "source": [ - "### Overall Statistics\n", - "vis.show_stats() will show the statistics for each features, at each site for each dataset\n", - "\n", - "vis.show_stats(data = data)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "20ea4dff", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "intensity\n", - "\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
counthistogram
site-4-train1345[[0.0, 1.0039, 7529944], [1.0039, 2.0078, 4251...
site-1-train3616[[0.0, 1.0039, 10894028], [1.0039, 2.0078, 919...
site-2-train6012[[0.0, 1.0039, 15941521], [1.0039, 2.0078, 340...
site-3-train10192[[0.0, 1.0039, 35420536], [1.0039, 2.0078, 343...
Global-train21165[[0.0, 1.0039, 69786029], [1.0039, 2.0078, 818...
\n", - "
" - ], - "text/plain": [ - " count histogram\n", - "site-4-train 1345 [[0.0, 1.0039, 7529944], [1.0039, 2.0078, 4251...\n", - "site-1-train 3616 [[0.0, 1.0039, 10894028], [1.0039, 2.0078, 919...\n", - "site-2-train 6012 [[0.0, 1.0039, 15941521], [1.0039, 2.0078, 340...\n", - "site-3-train 10192 [[0.0, 1.0039, 35420536], [1.0039, 2.0078, 343...\n", - "Global-train 21165 [[0.0, 1.0039, 69786029], [1.0039, 2.0078, 818..." - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "vis.show_stats(data = data)" - ] - }, - { - "cell_type": "markdown", - "id": "521cbf6f", - "metadata": {}, - "source": [ - "### select features statistics using white_list_features \n", - "user can optionally select only show specified features via white_list_features arguments. In these image files, we only have one feature" - ] - }, - { - "cell_type": "markdown", - "id": "9ab23bcc", - "metadata": {}, - "source": [ - "### Histogram Visualization\n", - "We can use vis.show_histograms() to visualize the histogram. Before we do that, we need set some iPython display setting to make sure the graph displayed in full cell. " - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "4bada64b", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "from IPython.display import display, HTML\n", - "display(HTML(\"\"))" - ] - }, - { - "cell_type": "markdown", - "id": "e415b49e", - "metadata": {}, - "source": [ - "The following command display histograms for numberic features. The result shows both main plot" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "53542cf9", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "vis.show_histograms(data = data, plot_type=\"main\")" - ] - }, - { - "cell_type": "markdown", - "id": "d8d537cc", - "metadata": {}, - "source": [ - "## Display Options\n", - "Similar to other statistics, we can use white_list_features to select only few features to display histograms. We can also use display_format=\"percent\" to allow all dataset and sites to be displayed in the same scale. User can set \n", - "\n", - "* display_format: \"percent\" or \"sample_count\"\n", - "* white_list_features: feature names\n", - "* plot_type : \"both\" or \"main\" or \"subplot\"\n", - "\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "id": "353db4d9", - "metadata": {}, - "source": [ - "#### show default display format with subplot\n", - "In the following, we display only feature \"Intensity\" in default display_format, with \"subplot\" plot_type" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "8f619729", - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "vis.show_histograms(data = data, plot_type=\"subplot\")" - ] - }, - { - "cell_type": "markdown", - "id": "fbf7fc73", - "metadata": {}, - "source": [ - "\n", - "#### show percent display format with default plot_type (main)\n", - "In the following, we display only feature \"Intensity\" in \"percent\" display_format, with default plot_type" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "8655ad63", - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "vis.show_histograms(data = data, display_format=\"percent\")" - ] - }, - { - "cell_type": "markdown", - "id": "fe527f64", - "metadata": {}, - "source": [ - "### Tip: Avoid repeated calculation\n" - ] - }, - { - "cell_type": "markdown", - "id": "4640b9aa", - "metadata": {}, - "source": [ - "If you intend to plot histogram main plot and subplot separately, repeated calling show_histogram with different plot_types is not efficicent, as it repeatewd calculate the same set of Dataframes. To do it efficiently, you can use the following functions instead show_histogram methods. This avoid the duplicated calculation in show_histograms. But if you intend to show both plots, the show_histogram() should be used" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "a8e6722f", - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "feature_dfs = vis.get_histogram_dataframes(data, display_format=\"percent\" )\n", - " \n", - "vis.show_dataframe_plots(feature_dfs, plot_type=\"main\")" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "17fbd7fd", - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "vis.show_dataframe_plots(feature_dfs, plot_type=\"subplot\") " - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": ".venvpt", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.2" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.1_federated_statistics/federated_statistics_with_image_data/code/image_stats/figs/image_histogram.png b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.1_federated_statistics/federated_statistics_with_image_data/code/image_stats/figs/image_histogram.png deleted file mode 100644 index 94503bd053..0000000000 Binary files a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.1_federated_statistics/federated_statistics_with_image_data/code/image_stats/figs/image_histogram.png and /dev/null differ diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.1_federated_statistics/federated_statistics_with_image_data/code/image_stats_job.py b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.1_federated_statistics/federated_statistics_with_image_data/code/image_stats_job.py index 0579a82a9b..f8573fe4ba 100644 --- 
a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.1_federated_statistics/federated_statistics_with_image_data/code/image_stats_job.py +++ b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.1_federated_statistics/federated_statistics_with_image_data/code/image_stats_job.py @@ -22,7 +22,7 @@ def define_parser(): parser = argparse.ArgumentParser() parser.add_argument("-n", "--n_clients", type=int, default=3) parser.add_argument("-d", "--data_root_dir", type=str, nargs="?", default="/tmp/nvflare/image_stats/data") - parser.add_argument("-o", "--stats_output_path", type=str, nargs="?", default="statistics/stats.json") + parser.add_argument("-o", "--stats_output_path", type=str, nargs="?", default="statistics/image_stats.json") parser.add_argument("-j", "--job_dir", type=str, nargs="?", default="/tmp/nvflare/jobs/image_stats") parser.add_argument("-w", "--work_dir", type=str, nargs="?", default="/tmp/nvflare/workspace/image_stats") diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.1_federated_statistics/federated_statistics_with_image_data/federated_statistics_with_image_data.ipynb b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.1_federated_statistics/federated_statistics_with_image_data/federated_statistics_with_image_data.ipynb index 41852256cb..a2ac9fedd4 100644 --- a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.1_federated_statistics/federated_statistics_with_image_data/federated_statistics_with_image_data.ipynb +++ 
b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.1_federated_statistics/federated_statistics_with_image_data/federated_statistics_with_image_data.ipynb @@ -7,9 +7,126 @@ "source": [ "# Federated Statistics with image data\n", "\n", - "## Calculate Image Histogram\n", + "In this example, we will compute local and global image statistics with the consideration that data is private at each of the client sites.\n", "\n", - "In this example, we will compute local and global image statistics with the consideration that data is private at each of the client sites." + "## Define target statistics configuration\n", + "\n", + "For Image statistics, we are only interested in histogram of the image intensity, so we ignore all other statistic measures. \n", + "\n", + "```python\n", + "\n", + "statistic_configs = {\"count\": {}, \"histogram\": {\"*\": {\"bins\": 20, \"range\": [0, 256]}}}\n", + "```\n", + "\n", + "## Define the local statistics generator\n", + "\n", + "Based on the above target statistics configuration, we can define the local statistics generator. 
To do this, we need to write a class that implements the `Statistics` interface:\n", + "\n", + "```python\n", + "\n", + "class Statistics(InitFinalComponent, ABC):\n", + "\n", + " def initialize(self, fl_ctx: FLContext):\n", + " def pre_run(self, statistics: List[str], num_of_bins: Optional[Dict[str, Optional[int]]],bin_ranges: Optional[Dict[str, Optional[List[float]]]]):\n", + " def features(self) -> Dict[str, List[Feature]]:\n", + " def count(self, dataset_name: str, feature_name: str) -> int:\n", + " def sum(self, dataset_name: str, feature_name: str) -> float:\n", + " def mean(self, dataset_name: str, feature_name: str) -> float:\n", + " def stddev(self, dataset_name: str, feature_name: str) -> float:\n", + " def variance_with_mean(self, dataset_name: str, feature_name: str, global_mean: float, global_count: float) -> float:\n", + " def histogram(self, dataset_name: str, feature_name: str, num_of_bins: int, global_min_value: float, global_max_value: float) -> Histogram:\n", + " def max_value(self, dataset_name: str, feature_name: str) -> float:\n", + " def min_value(self, dataset_name: str, feature_name: str) -> float:\n", + " def failure_count(self, dataset_name: str, feature_name: str) -> int:\n", + " def quantiles(self, dataset_name: str, feature_name: str, percentiles: List) -> Dict:\n", + " def finalize(self, fl_ctx: FLContext):\n", + "\n", + "```\n", + "\n", + "But since we are only interested in two metrics, count and histogram, we can ignore the other metrics and implement only count and histogram. 
Here is the skeleton code for this generator\n", + "\n", + "```python\n", + "\n", + "class ImageStatistics(Statistics):\n", + "\n", + " def __init__(self):\n", + " pass\n", + " \n", + " def initialize(self, fl_ctx: FLContext):\n", + " self.fl_ctx = fl_ctx\n", + " self.client_name = fl_ctx.get_identity_name()\n", + " \n", + " # call load data function \n", + "\n", + " def _load_data_list(self, client_name, fl_ctx: FLContext) -> bool:\n", + " pass\n", + "\n", + "\n", + " def pre_run(\n", + " self,\n", + " statistics: List[str],\n", + " num_of_bins: Optional[Dict[str, Optional[int]]],\n", + " bin_ranges: Optional[Dict[str, Optional[List[float]]]],\n", + " ):\n", + " return {}\n", + "\n", + " def features(self) -> Dict[str, List[Feature]]:\n", + " return {\"train\": [Feature(\"intensity\", DataType.FLOAT)]}\n", + "\n", + " def count(self, dataset_name: str, feature_name: str) -> int:\n", + "\n", + " # return number of images loaded\n", + " pass\n", + " \n", + "\n", + " def failure_count(self, dataset_name: str, feature_name: str) -> int:\n", + "\n", + " return self.failure_images\n", + "\n", + " def histogram(\n", + " self, dataset_name: str, feature_name: str, num_of_bins: int, global_min_value: float, global_max_value: float\n", + " ) -> Histogram:\n", + " # do histogram calculation: \n", + " return Histogram(HistogramType.STANDARD, histogram_bins)\n", + "```\n", + "\n", + "Here ```FLContext``` is the context of the current Job workflow, and \"identity\" refers to the site identity; therefore ```get_identity_name()``` will return the site name.\n", + "\n", + "You can take a look at the implementation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "31517cdb", + "metadata": {}, + "outputs": [], + "source": [ + "! 
cat code/src/image_statistics.py" + ] + }, + { + "cell_type": "markdown", + "id": "2bdda4a9", + "metadata": {}, + "source": [ + "# Define Job Configuration\n", + " \n", + "\n", + "```python\n", + "\n", + " statistic_configs = {\"count\": {}, \"histogram\": {\"*\": {\"bins\": 20, \"range\": [0, 256]}}}\n", + " \n", + " # define local stats generator\n", + " stats_generator = ImageStatistics(data_root_dir)\n", + "\n", + " job = StatsJob(\n", + " job_name=\"stats_image\",\n", + " statistic_configs=statistic_configs,\n", + " stats_generator=stats_generator,\n", + " output_path=output_path,\n", + " )\n", + "```" ] }, { @@ -45,12 +162,24 @@ "\n", "As an example, we use the dataset from the [\"COVID-19 Radiography Database\"](https://www.kaggle.com/tawsifurrahman/covid19-radiography-database).\n", "it contains png image files in four different classes: `COVID`, `Lung_Opacity`, `Normal`, and `Viral Pneumonia`.\n", - "First create a temp directory, then we download and extract to `/tmp/nvflare/image_stats/data/.`." + "First create a temp directory, then we download and extract to `/tmp/nvflare/image_stats/data/.`.\n", + "\n", + "\n" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, + "id": "ae0f0f1b", + "metadata": {}, + "outputs": [], + "source": [ + "! 
pip install kagglehub" + ] + }, + { + "cell_type": "code", + "execution_count": 1, "id": "b4e64769-17f1-4805-9399-1c141e050065", "metadata": { "tags": [] @@ -66,12 +195,28 @@ "fi\n" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "519789a2", + "metadata": {}, + "outputs": [], + "source": [ + "import kagglehub\n", + "\n", + "# Download latest version\n", + "path = kagglehub.dataset_download(\"tawsifurrahman/covid19-radiography-database\")\n", + "\n", + "print(\"Path to dataset files:\", path)\n", + "\n" + ] + }, { "cell_type": "markdown", "id": "0562f713-5892-43c7-a3d6-d277c337b5ea", "metadata": {}, "source": [ - "Download and unzip the data (you may need to log in to Kaggle or use an API key). Once you have extracted the data from the zip file, you can check the directory to make sure you have the COVID-19_Radiography_Dataset directory at the following location." + "Download and unzip the data (you may need to log in to Kaggle or use an API key). Once you have extracted the data from the zip file, check the directory to make sure you have the COVID-19_Radiography_Dataset directory at the following location." ] }, { @@ -83,7 +228,9 @@ }, "outputs": [], "source": [ - "ls -l /tmp/nvflare/image_stats/data/." + "! mv {path} /tmp/nvflare/image_stats/data/\n", + "\n", + "! tree /tmp/nvflare/image_stats/data" ] }, { @@ -104,17 +251,11 @@ { "cell_type": "code", "execution_count": null, - "id": "e1ea959f-7282-4e55-bb26-11524ec47e99", - "metadata": { - "tags": [] - }, + "id": "79ba087e", + "metadata": {}, "outputs": [], "source": [ - "from code.image_stats.utils.prepare_data import prepare_data\n", - "\n", - "prepare_data(input_dir = \"/tmp/nvflare/image_stats/data\", \n", - " input_ext = \".png\",\n", - " output_dir =\"/tmp/nvflare/image_stats/data\")\n" + "! code/data/prepare_data.sh" ] }, { @@ -140,7 +281,11 @@ "metadata": {}, "outputs": [], "source": [ - "! python3 code/image_stats_job.py" + "%cd code\n", + "\n", + "! 
python3 image_stats_job.py\n", + "\n", + "%cd -\n" ] }, { @@ -171,7 +316,8 @@ }, "outputs": [], "source": [ - "! ls -al /tmp/nvflare/workspace/image_stats/server/simulate_job/statistics/image_statistics.json" + "! ls -al /tmp/nvflare/workspace/image_stats/server/simulate_job/statistics/image_stats.json\n", + " " ] }, { @@ -182,19 +328,19 @@ }, "source": [ "## Visualization\n", - "We can visualize the results easly via the visualization notebook. Before we do that, we need to copy the data to the notebook directory \n" + "We can visualize the results easily via the visualization notebook. Before we do that, we need to copy the data to the notebook directory\n" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "id": "a3c89693-37b9-450c-85dd-8a2d78fee3fa", "metadata": { "tags": [] }, "outputs": [], "source": [ - "! cp /tmp/nvflare/workspace/image_stats/server/simulate_job/statistics/image_statistics.json image_stats/demo/." + "! cp /tmp/nvflare/workspace/image_stats/server/simulate_job/statistics/image_stats.json code/image_stats/demo/." 
] }, { @@ -202,7 +348,7 @@ "id": "d5c6f632-3326-4236-902e-8c0965688d85", "metadata": {}, "source": [ - "now we can visualize via the [visualization notebook](image_stats/demo/visualization.ipynb)" + "now we can visualize via the [visualization notebook](code/image_stats/demo/visualization.ipynb)" ] }, { @@ -213,15 +359,15 @@ }, "source": [ "## We are done !\n", - "Congratulations, you just completed the federated stats image histogram calulation\n" + "Congratulations, you have just completed the federated stats image histogram calculation.\n" ] } ], "metadata": { "kernelspec": { - "display_name": ".venvpt", + "display_name": "nvflare_env", "language": "python", - "name": "python3" + "name": "nvflare_env" }, "language_info": { "codemirror_mode": { @@ -233,7 +379,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.2" + "version": "3.10.12" } }, "nbformat": 4, diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.1_federated_statistics/federated_statistics_with_tabular_data/code/df_stats/utils/prepare_data.py b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.1_federated_statistics/federated_statistics_with_tabular_data/code/data/prepare_data.py similarity index 100% rename from examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.1_federated_statistics/federated_statistics_with_tabular_data/code/df_stats/utils/prepare_data.py rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.1_federated_statistics/federated_statistics_with_tabular_data/code/data/prepare_data.py diff --git 
a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.1_federated_statistics/federated_statistics_with_tabular_data/code/df_stats/demo/visualization.ipynb b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.1_federated_statistics/federated_statistics_with_tabular_data/code/demo/visualization.ipynb similarity index 82% rename from examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.1_federated_statistics/federated_statistics_with_tabular_data/code/df_stats/demo/visualization.ipynb rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.1_federated_statistics/federated_statistics_with_tabular_data/code/demo/visualization.ipynb index c04b07220b..ea51dfe209 100644 --- a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.1_federated_statistics/federated_statistics_with_tabular_data/code/df_stats/demo/visualization.ipynb +++ b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.1_federated_statistics/federated_statistics_with_tabular_data/code/demo/visualization.ipynb @@ -8,24 +8,6 @@ "# NVFlare Federated Statistics Visualization" ] }, - { - "cell_type": "markdown", - "id": "987e6028", - "metadata": {}, - "source": [ - "#### Dependencies\n", - "\n", - "To run the examples, you will need to install the following dependencies:\n", - "* numpy\n", - "* pandas\n", - "* wget\n", - "* matplotlib\n", - "* jupyter\n", - "* notebook\n", - "\n", - "These are captured in [requirements.txt](../../requirements.txt)." 
- ] - }, { "cell_type": "markdown", "id": "665dc17e", @@ -37,7 +19,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "c44a0217", "metadata": { "tags": [] @@ -81,7 +63,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "93c62d5e", "metadata": { "tags": [] @@ -249,31 +231,24 @@ }, { "cell_type": "markdown", - "id": "2f330eb6", + "id": "ffa1da20", "metadata": {}, "source": [ - "### Tip: Avoid repeated calculation\n", - "If you intend to plot the histogram main plot and subplot separately, repeatedly calling `show_histograms()` with different plot_types is not efficicent, as it repeatedly calculates the same set of Dataframes. To do it efficiently, you can use the following functions instead of `show_histograms()` to avoid the duplicated calculations. If you intend to show both plots, then `show_histograms()` should be used." + "Now, let's go back to [federated_statistics_with_tabular_data](../../../federated_statistics_with_tabular_data/federated_statistics_with_tabular_data.ipynb)" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "395315a4", + "cell_type": "markdown", + "id": "d3f12a4a", "metadata": {}, - "outputs": [], - "source": [ - "feature_dfs = vis.get_histogram_dataframes(data, display_format=\"percent\")\n", - " \n", - "vis.show_dataframe_plots(feature_dfs, plot_type=\"main\")" - ] + "source": [] } ], "metadata": { "kernelspec": { - "display_name": "nvflare_example", + "display_name": "nvflare_env", "language": "python", - "name": "nvflare_example" + "name": "nvflare_env" }, "language_info": { "codemirror_mode": { @@ -285,7 +260,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.2" + "version": "3.10.12" } }, "nbformat": 4, diff --git 
a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.1_federated_statistics/federated_statistics_with_tabular_data/code/df_stats/demo/df_stats.png b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.1_federated_statistics/federated_statistics_with_tabular_data/code/df_stats/demo/df_stats.png deleted file mode 100644 index da9835b735..0000000000 Binary files a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.1_federated_statistics/federated_statistics_with_tabular_data/code/df_stats/demo/df_stats.png and /dev/null differ diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.1_federated_statistics/federated_statistics_with_tabular_data/code/df_stats/demo/hist_plot.png b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.1_federated_statistics/federated_statistics_with_tabular_data/code/df_stats/demo/hist_plot.png deleted file mode 100644 index 9fef1da1ec..0000000000 Binary files a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.1_federated_statistics/federated_statistics_with_tabular_data/code/df_stats/demo/hist_plot.png and /dev/null differ diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.1_federated_statistics/federated_statistics_with_tabular_data/code/df_stats/demo/stats_df.png 
b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.1_federated_statistics/federated_statistics_with_tabular_data/code/df_stats/demo/stats_df.png deleted file mode 100644 index 90c797b243..0000000000 Binary files a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.1_federated_statistics/federated_statistics_with_tabular_data/code/df_stats/demo/stats_df.png and /dev/null differ diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.1_federated_statistics/federated_statistics_with_tabular_data/code/df_stats_job.py b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.1_federated_statistics/federated_statistics_with_tabular_data/code/df_stats_job.py index 6666b9a533..e2f3637757 100644 --- a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.1_federated_statistics/federated_statistics_with_tabular_data/code/df_stats_job.py +++ b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.1_federated_statistics/federated_statistics_with_tabular_data/code/df_stats_job.py @@ -20,11 +20,13 @@ def define_parser(): parser = argparse.ArgumentParser() - parser.add_argument("-n", "--n_clients", type=int, default=3) + parser.add_argument("-n", "--n_clients", type=int, default=2) parser.add_argument("-d", "--data_root_dir", type=str, nargs="?", default="/tmp/nvflare/df_stats/data") - parser.add_argument("-o", "--stats_output_path", type=str, nargs="?", default="statistics/adult_stats.json") + parser.add_argument("-o", "--stats_output_path", type=str, nargs="?", default="statistics/adults_stats.json") 
parser.add_argument("-j", "--job_dir", type=str, nargs="?", default="/tmp/nvflare/jobs/stats_df") - parser.add_argument("-w", "--work_dir", type=str, nargs="?", default="/tmp/nvflare/jobs/stats_df") + parser.add_argument("-w", "--work_dir", type=str, nargs="?", default="/tmp/nvflare/jobs/stats_df/work_dir") + parser.add_argument("-co", "--export_config", action="store_true", help="config only mode, export config") + parser.add_argument("-l", "--log_config", type=str, default="concise") return parser.parse_args() @@ -37,18 +39,19 @@ def main(): output_path = args.stats_output_path job_dir = args.job_dir work_dir = args.work_dir + export_config = args.export_config + log_config = args.log_config statistic_configs = { "count": {}, "mean": {}, "sum": {}, "stddev": {}, - "histogram": {"*": {"bins": 20}}, - "Age": {"bins": 20, "range": [0, 10]}, - "percentile": {"*": [25, 50, 75], "Age": [50, 95]}, + "histogram": {"*": {"bins": 20}, "Age": {"bins": 20, "range": [0, 100]}}, + "quantile": {"*": [0.1, 0.5, 0.9], "Age": [0.5, 0.9]}, } # define local stats generator - df_stats_generator = DFStatistics(data_root_dir=data_root_dir) + df_stats_generator = DFStatistics(filename="data.csv", data_root_dir=data_root_dir) job = StatsJob( job_name="stats_df", @@ -60,9 +63,11 @@ def main(): sites = [f"site-{i + 1}" for i in range(n_clients)] job.setup_clients(sites) - job.export_job(job_dir) - - job.simulator_run(work_dir) + if export_config: + print("Exporting job config...", job_dir) + job.export_job(job_dir) + else: + job.simulator_run(work_dir, log_config=log_config) if __name__ == "__main__": diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.1_federated_statistics/federated_statistics_with_tabular_data/code/src/df_statistics.py 
b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.1_federated_statistics/federated_statistics_with_tabular_data/code/src/df_statistics.py index 5078c2f0a9..942bbf692d 100644 --- a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.1_federated_statistics/federated_statistics_with_tabular_data/code/src/df_statistics.py +++ b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.1_federated_statistics/federated_statistics_with_tabular_data/code/src/df_statistics.py @@ -21,10 +21,10 @@ class DFStatistics(DFStatisticsCore): - def __init__(self, data_path): + def __init__(self, filename, data_root_dir="/tmp/nvflare/df_stats/data"): super().__init__() - self.data_root_dir = "/tmp/nvflare/df_stats/data" - self.data_path = data_path + self.data_root_dir = data_root_dir + self.filename = filename self.data: Optional[Dict[str, pd.DataFrame]] = None self.data_features = [ "Age", @@ -57,7 +57,7 @@ def load_data(self, fl_ctx: FLContext) -> Dict[str, pd.DataFrame]: self.log_info(fl_ctx, f"load data for client {client_name}") try: skip_rows = self.skip_rows[client_name] - data_path = f"{self.data_root_dir}/{fl_ctx.get_identity_name()}/{self.data_path}" + data_path = f"{self.data_root_dir}/{fl_ctx.get_identity_name()}/{self.filename}" # example of load data from CSV df: pd.DataFrame = pd.read_csv( data_path, names=self.data_features, sep=r"\s*,\s*", skiprows=skip_rows, engine="python", na_values="?" 
diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.1_federated_statistics/federated_statistics_with_tabular_data/federated_statistics_with_tabular_data.ipynb b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.1_federated_statistics/federated_statistics_with_tabular_data/federated_statistics_with_tabular_data.ipynb index a630c65dcc..4e6d4cee41 100644 --- a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.1_federated_statistics/federated_statistics_with_tabular_data/federated_statistics_with_tabular_data.ipynb +++ b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.1_federated_statistics/federated_statistics_with_tabular_data/federated_statistics_with_tabular_data.ipynb @@ -2,24 +2,28 @@ "cells": [ { "cell_type": "markdown", - "id": "26cb3afa", + "id": "64a17f22-5667-4f99-b4f6-d49116db74b0", "metadata": { "tags": [] }, "source": [ - "# Data Frame Federated Statistics \n", + "# Federated Statistics with Tabular Data\n", + "\n", + "Tabular data is the most common data type in the world; it is easy to understand and manipulate. In this chapter, we will explore federated statistics with tabular data. We will leverage pandas dataframe to calculate the statistics of the local data and the global data.\n", + "\n", + "\n", + "To illustrate with an example, we will prepare the dependencies and prepare the dataset. \n", "\n", - "In this example, we will show how to generate federated statistics for data that can be represented as Pandas Data Frame." 
+ "## Prerequisites" ] }, { "cell_type": "markdown", - "id": "64a17f22-5667-4f99-b4f6-d49116db74b0", - "metadata": { - "tags": [] - }, + "id": "497d44fa", + "metadata": {}, "source": [ - "## Install requirements\n", + "\n", + "### Install dependencies\n", "First, install the required packages:" ] }, @@ -35,6 +39,47 @@ "%pip install -r code/requirements.txt" ] }, + { + "cell_type": "markdown", + "id": "05d9890a", + "metadata": {}, + "source": [ + "\n", + "> Sidebar: \n", + "> **Installing fastdigest**\n", + ">\n", + "> If you intend to calculate quantiles, you need to install fastdigest, which is not included in the requirements.txt file. If you are not calculating quantiles, you can skip this step.\n", + ">\n", + "> ```bash\n", + "> pip install fastdigest==0.4.0\n", + "> ```\n", + ">\n", + "> On Ubuntu, you might get the following error:\n", + ">\n", + "> ```\n", + "> Cargo, the Rust package manager, is not installed or is not on PATH.\n", + "> This package requires Rust and Cargo to compile extensions. Install it through\n", + "> the system's package manager or via https://rustup.rs/\n", + "> \n", + "> Checking for Rust toolchain....\n", + "> ```\n", + ">\n", + "> This is because fastdigest (or its dependencies) requires Rust and Cargo to build. \n", + ">\n", + "> You need to install Rust and Cargo on your Ubuntu system. Follow these steps:\n", + ">\n", + "> 1. Install Rust and Cargo by running:\n", + "> ```bash\n", + "> cd NVFlare/examples/advanced/federated-statistics/df_stats\n", + "> ./install_cargo.sh\n", + "> ```\n", + ">\n", + "> 2. 
Then install fastdigest again:\n", + "> ```bash\n", + "> pip install fastdigest==0.4.0\n", + "> ```" + ] + }, { "cell_type": "markdown", "id": "94faaa6b-08fd-485c-87d5-53b4520177fe", @@ -43,10 +88,14 @@ }, "source": [ "\n", - "## Prepare data\n", + "### Prepare data\n", "\n", "In this example, we are using the UCI (University of California, Irvine) [adult dataset](https://archive.ics.uci.edu/dataset/2/adult)\n", "The original dataset already contains \"training\" and \"test\" datasets. Here we simply assume that the \"training\" and \"test\" data set each belong to a client, so we assign the adult.train dataset to site-1 and the adult.test dataset to site-2.\n", + "```\n", + "site-1: adult.train\n", + "site-2: adult.test\n", + "```\n", "\n", "Now we use the data utility to download UCI datasets to separate client package directory to /tmp/nvflare/data/ directory.\n", "Please note that the UCI's website may experience occasional downtime." @@ -61,9 +110,13 @@ }, "outputs": [], "source": [ - "from code.df_stats.utils.prepare_data import prepare_data\n", + "%cd code/data\n", "\n", - "prepare_data(data_root_dir = \"/tmp/nvflare/df_stats/data\")" + "from prepare_data import prepare_data\n", + "\n", + "prepare_data(data_root_dir = \"/tmp/nvflare/df_stats/data\")\n", + "\n", + "%cd -" ] }, { @@ -71,7 +124,7 @@ "id": "c5444d8f-4938-4759-bd43-831013043c23", "metadata": {}, "source": [ - "#### Let's take a look at the data" + "Let's take a look at the data" ] }, { @@ -122,7 +175,175 @@ "jp-MarkdownHeadingCollapsed": true }, "source": [ - "> Note **We will only calculate the statistics of numerical features, categorical features will be skipped**" + "> Note \n", + "We will only calculate the statistics of numerical features; categorical features will be skipped." 
+ ] + }, + { + "cell_type": "markdown", + "id": "ae3bb5a4", + "metadata": {}, + "source": [ + "## Define target statistics configuration\n", + "\n", + "\n", + "Let's see what statistics we want to calculate, we can capture the statistics configuration in a dictionary, the key is the statistic name, the value is the statistic configuration.\n", + "\n", + "```python\n", + "\n", + " statistic_configs = {\n", + " \"count\": {},\n", + " \"mean\": {},\n", + " \"sum\": {},\n", + " \"stddev\": {},\n", + " \"histogram\": {\"*\": {\"bins\": 20}, \"Age\": {\"bins\": 10, \"range\": [0, 100]}},\n", + " \"quantile\": {\"*\": [0.1, 0.5, 0.9], \"Age\": [0.5, 0.9]},\n", + " }\n", + "```\n", + "\n", + "For each statistic, we can configure to give additional instructions for each feature. While count, mean, sum and stddev are defined in such a way that the calculation will be the same for all features, for histogram, we can define different bins for each feature. \"*\" is a wildcard for all features.\n", + "```\n", + "\"histogram\": {\"*\": {\"bins\": 20}, \"Age\": {\"bins\": 10, \"range\": [0, 100]}},\n", + "```\n", + "Here, we define a histogram with 20 bins for all features; the range is not defined, which means the range will be calculated from the data.\n", + "We also defined 10 bins and range [0, 100] for the feature \"Age\".\n", + "\n", + "Similarly, the quantile is defined for different features with different values. 
This can be specified with the Job API\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1d17a6a9", + "metadata": {}, + "outputs": [], + "source": [ + "!cat code/df_stats_job.py" + ] + }, + { + "cell_type": "markdown", + "id": "b1857b89", + "metadata": {}, + "source": [ + "If a user only wants to calculate statistics except for quantile and histogram, then the configuration can be simplified as:\n", + "\n", + "```python\n", + "\n", + " statistic_configs = {\n", + " \"count\": {},\n", + " \"mean\": {},\n", + " \"sum\": {},\n", + " \"stddev\": {},\n", + " }\n", + "\n", + "```\n", + "\n", + "Similarly, the code can be changed to \n", + "\n", + "\n", + "```python\n", + "\n", + " # ... previous code skipped ...\n", + "\n", + "\n", + " statistic_configs = {\n", + " \"count\": {},\n", + " \"mean\": {},\n", + " \"sum\": {},\n", + " \"stddev\": {},\n", + "\n", + " }\n", + " # define local stats generator\n", + " df_stats_generator = DFStatistics(filename=\"data.csv\", data_root_dir=data_root_dir)\n", + "\n", + " job = StatsJob(\n", + " job_name=\"stats_df\",\n", + " statistic_configs=statistic_configs,\n", + " stats_generator=df_stats_generator,\n", + " output_path=output_path,\n", + " )\n", + "```\n" + ] + }, + { + "cell_type": "markdown", + "id": "f102f6b9", + "metadata": {}, + "source": [ + "## Define the local statistics generator\n", + "\n", + "Based on the above target statistics configuration, we can define the local statistics generator. 
To do this, we need to write a class that implements the `Statistics` interface:\n", + "\n", + "\n", + "```python\n", + "\n", + "class Statistics(InitFinalComponent, ABC):\n", + "\n", + " def initialize(self, fl_ctx: FLContext):\n", + " def pre_run(self, statistics: List[str], num_of_bins: Optional[Dict[str, Optional[int]]],bin_ranges: Optional[Dict[str, Optional[List[float]]]]):\n", + " def features(self) -> Dict[str, List[Feature]]:\n", + " def count(self, dataset_name: str, feature_name: str) -> int:\n", + " def sum(self, dataset_name: str, feature_name: str) -> float:\n", + " def mean(self, dataset_name: str, feature_name: str) -> float:\n", + " def stddev(self, dataset_name: str, feature_name: str) -> float:\n", + " def variance_with_mean(self, dataset_name: str, feature_name: str, global_mean: float, global_count: float) -> float:\n", + " def histogram(self, dataset_name: str, feature_name: str, num_of_bins: int, global_min_value: float, global_max_value: float) -> Histogram:\n", + " def max_value(self, dataset_name: str, feature_name: str) -> float:\n", + " def min_value(self, dataset_name: str, feature_name: str) -> float:\n", + " def failure_count(self, dataset_name: str, feature_name: str) -> int:\n", + " def quantiles(self, dataset_name: str, feature_name: str, percentiles: List) -> Dict:\n", + " def finalize(self, fl_ctx: FLContext):\n", + "\n", + "```\n", + "\n", + "\n", + "NVIDIA FLARE has implemented ```DFStatisticsCore```, which is a core class for calculating the statistics of the data frame. We can inherit this class and override the methods to calculate the statistics. 
Here are a few assumptions:\n", + "\n", + "* data can be loaded and cached in the memory.\n", + "* data has the proper column names and can be loaded into a pandas dataframe.\n", + "* The feature names can be obtained from the dataframe.\n", + "\n", + "```\n", + "def load_data(self, fl_ctx: FLContext) -> Dict[str, pd.DataFrame]:\n", + "```\n", + "which loads the data into a dictionary of pandas dataframes: the key is the dataset name and the value is the pandas dataframe.\n", + "\n", + "Let's take a look at what the user needs to do to implement the local statistics generator. With ```DFStatisticsCore```, the user needs to implement the following methods:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0ad902da", + "metadata": {}, + "outputs": [], + "source": [ + "!cat code/src/df_statistics.py\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "c9eb7c51", + "metadata": {}, + "source": [ + "## Define Job Configuration\n", + "\n", + "Each FLARE job is defined by a job configuration; the configuration includes configurations for the clients and server. Optionally, the job configuration also contains the customized job code. 
You have seen this in [Job Structure and configuration](../../../chapter-1_running_federated_learning_applications/01.6_job_structure_and_configuration/understanding_fl_job.ipynb)\n", + "\n", + "For now, we can define a FedJob via FLARE's Job API. Here is the Statistics Job we have predefined for you:\n", + "\n", + "```python\n", + "\n", + " job = StatsJob(\n", + " job_name=\"\",\n", + " statistic_configs=statistic_configs,\n", + " stats_generator=df_stats_generator,\n", + " output_path=output_path,\n", + " )\n", + "\n", + "```" ] }, { @@ -130,6 +351,9 @@ "id": "5c4da328", "metadata": {}, "source": [ + "For this example, we simply hardcoded the column names, but in practice, users can get the column names from files such as CSV files, parquet files, etc.\n", + "\n", + "\n", "## Run Job with FL Simulator" ] }, @@ -148,7 +372,10 @@ "metadata": {}, "outputs": [], "source": [ - "! python3 code/df_stats_job.py" + "%cd code\n", + "\n", + "! python3 df_stats_job.py \n", + "%cd -" ] }, { @@ -166,9 +393,10 @@ "id": "45bf6e9a-3265-4e45-8b06-c8e543605f21", "metadata": {}, "source": [ - "With the default parameters, the results are stored in workspace \"/tmp/nvflare/jobs/stats_df/\"\n", + "With the default parameters, the results are stored in workspace \"/tmp/nvflare/jobs/stats_df/work_dir\"\n", "```\n", - "/tmp/nvflare/jobs/stats_df/server/simulate_job/statistics/adults_stats.json\n", + "/tmp/nvflare/jobs/stats_df/work_dir/server/simulate_job/statistics/adults_stats.json \n", + "\n", "```" ] }, @@ -181,7 +409,7 @@ }, "outputs": [], "source": [ - "cat /tmp/nvflare/jobs/stats_df/server/simulate_job/statistics/adults_stats.json" + "ls -al /tmp/nvflare/jobs/stats_df/work_dir/server/simulate_job/statistics/adults_stats.json " ] }, { @@ -197,14 +425,14 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "a3c89693-37b9-450c-85dd-8a2d78fee3fa", "metadata": { "tags": [] }, "outputs": [], "source": [ - "! 
cp /tmp/nvflare/jobs/stats_df/server/simulate_job/statistics/adults_stats.json code/df_stats/demo/." + "! cp /tmp/nvflare/jobs/stats_df/work_dir/server/simulate_job/statistics/adults_stats.json code/demo/." ] }, { @@ -212,9 +440,17 @@ "id": "d5c6f632-3326-4236-902e-8c0965688d85", "metadata": {}, "source": [ - "now we can visualize the results with the [visualization notebook](code/df_stats/demo/visualization.ipynb)" + "Now we can visualize the results with the [visualization notebook](code/demo/visualization.ipynb)" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "ef94e26f", + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "markdown", "id": "fda06c0b-798d-480d-9b4c-a62fab95bcf0", @@ -223,15 +459,23 @@ }, "source": [ "## We are done !\n", - "Congratulations, you just completed the federated stats calulation with data represented by data frame\n" + "Congratulations, you just completed the federated stats calculation with data represented by a data frame.\n", + "\n", + "Let's move on to [federated stats with Image Data](../federated_statistics_with_image_data/federated_statistics_with_image_data.ipynb)" ] + }, + { + "cell_type": "markdown", + "id": "70459712", + "metadata": {}, + "source": [] } ], "metadata": { "kernelspec": { - "display_name": "nvflare_example", + "display_name": "nvflare_env", "language": "python", - "name": "nvflare_example" + "name": "nvflare_env" }, "language_info": { "codemirror_mode": { @@ -243,7 +487,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.2" + "version": "3.10.12" } }, "nbformat": 4, diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.2_client_api/client_api.ipynb b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.2_client_api/client_api.ipynb new file mode 100644 index 
0000000000..69eb19ebf2 --- /dev/null +++ b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.2_client_api/client_api.ipynb @@ -0,0 +1,248 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "58149c32", + "metadata": {}, + "source": [ + "# Transform Existing Code to FL Easily with the FLARE Client API" + ] + }, + { + "cell_type": "markdown", + "id": "06203527", + "metadata": {}, + "source": [ + "The FLARE Client API offers a straightforward path to transform your existing machine learning or deep learning code into federated learning applications. With just a few lines of code changes, you can adapt your training logic without restructuring your codebase or moving code into different class methods. This flexibility applies to both traditional machine learning and deep learning frameworks. For PyTorch Lightning users, the process is even more streamlined with dedicated Lightning API support.\n", + "\n", + "You can see detailed examples with actual integration across different platforms including PyTorch and TensorFlow [here:](https://github.com/NVIDIA/NVFlare/tree/main/examples/hello-world/ml-to-fl)\n", + "\n", + "In Chapter 1, you have already seen the Client API in action with pytorch. In this section, we will focus on the core concepts of the Client API and explain some of the ways it can be configured to help you use the Client API more effectively.\n", + "\n", + "Then we will see how to use the Client API with PyTorch Lightning, and traditional machine learning algorithms such as Logistic Regression, KMeans and survival analysis." + ] + }, + { + "cell_type": "markdown", + "id": "be7efa36", + "metadata": {}, + "source": [ + "## Core Concept" + ] + }, + { + "cell_type": "markdown", + "id": "76102eac", + "metadata": {}, + "source": [ + "The general workflow of the popular federated learning (FL) follows the following steps:\n", + "\n", + "1. 
**FL server initializes an initial model**\n", + "2. **For each round (global iteration):**\n", + " * FL server broadcasts the global model to clients\n", + " * Each FL client starts with this global model and performs the local training on their own data\n", + " * Each FL client then sends back their newly trained model to the FL server\n", + " * FL server aggregates all the local models and produces a new global model\n", + "\n", + "On the client side, the training workflow is as follows:\n", + "\n", + "1. Receive the model from the FL server\n", + "2. Perform local training on the received global model and/or evaluate the received global model for model selection\n", + "3. Send the new model back to the FL server" + ] + }, + { + "cell_type": "markdown", + "id": "50e2b7dd", + "metadata": {}, + "source": [ + "To convert centralized training code to federated learning, we need to\n", + "adapt the code to do the following steps:\n", + "\n", + "\n", + "1. Obtain the required information from the received `FLModel`\n", + "2. Run local training\n", + "3. 
Put the results in a new `FLModel` to be sent back\n", + "\n", + "For a general use case, there are three essential methods for the Client API:\n", + "\n", + "* ``init()``: Initializes NVFlare Client API environment.\n", + "* ``receive()``: Receives model from NVFlare side.\n", + "* ``send()``: Sends the model to NVFlare side.\n", + "\n", + "Where `FLModel` is a data structure like this:\n", + "\n", + "```python\n", + "\n", + "class FLModel:\n", + " def __init__(\n", + " self,\n", + " params_type: Union[None, str, ParamsType] = None,\n", + " params: Any = None,\n", + " optimizer_params: Any = None,\n", + " metrics: Optional[Dict] = None,\n", + " start_round: Optional[int] = 0,\n", + " current_round: Optional[int] = None,\n", + " total_rounds: Optional[int] = None,\n", + " meta: Optional[Dict] = None,\n", + " ):\n", + " \"\"\"FLModel is a standardize data structure for NVFlare to communicate with external systems.\n", + "\n", + " Args:\n", + " params_type: type of the parameters. It only describes the \"params\".\n", + " If params_type is None, params need to be None.\n", + " If params is provided but params_type is not provided, then it will be treated as FULL.\n", + " params: model parameters, for example: model weights for deep learning.\n", + " optimizer_params: optimizer parameters.\n", + " For many cases, the optimizer parameters don't need to be transferred during FL training.\n", + " metrics: evaluation metrics such as loss and scores.\n", + " current_round: the current FL rounds. A round means round trip between client/server during training.\n", + " None for inference.\n", + " total_rounds: total number of FL rounds. 
A round means round trip between client/server during training.\n", + " None for inference.\n", + " meta: metadata dictionary used to contain any key-value pairs to facilitate the process.\n", + " \"\"\"\n", + "```\n", + "\n", + "You can use the Client API to change centralized training code to\n", + "federated learning, for example:\n", + "\n", + "```python\n", + "\n", + "import nvflare.client as flare\n", + "\n", + "flare.init() # 1. Initializes NVFlare Client API environment.\n", + "input_model = flare.receive() # 2. Receives model from NVFlare side.\n", + "params = input_model.params # 3. Obtain the required information from received FLModel\n", + "\n", + "# original local training code begins\n", + "\n", + "new_params = trainer.fit(params)\n", + "\n", + "# original local training code ends\n", + "\n", + "output_model = flare.FLModel(params=new_params) # 4. Put the results in a new FLModel\n", + "flare.send(output_model) # 5. Sends the model to NVFlare side.\n", + "\n", + "```\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "494e4079", + "metadata": {}, + "source": [ + "With 5 lines of code changes, we convert the centralized training code to work in a\n", + "federated learning setting.\n", + "\n", + "After this, we can use the job templates and the Job CLI\n", + "to generate a job and export it to run on a deployed NVFlare system or directly run the job using FL Simulator.\n", + "\n", + "To see a table of the key Client APIs, see the [Client API documentation in the programming guide](https://nvflare.readthedocs.io/en/main/programming_guide/execution_api_type/client_api.html#id2).\n", + "\n", + "Please consult the [Client API Module](https://nvflare.readthedocs.io/en/main/apidocs/nvflare.client.api.html) for more in-depth information about all of the Client API functions.\n", + "\n", + "If you are using PyTorch Lightning in your training code, you can check the [Lightning API 
Module](https://nvflare.readthedocs.io/en/main/apidocs/nvflare.app_opt.lightning.api.html). Also, be sure to look through the [Convert Torch Lightning to FL notebook](../02.2_client_api/convert_torch_lightning_to_federated_learning/convert_torch_lightning_to_fl.ipynb) and related code." + ] + }, + { + "cell_type": "markdown", + "id": "4a09d80e", + "metadata": {}, + "source": [ + "## Client API with Different Implementations\n", + "\n", + "Within the Client API, we offer multiple implementations tailored to diverse requirements:\n", + "\n", + "* In-process Client API: efficient for single GPU training\n", + "* Sub-process Client API: flexible for multi-GPU or distributed PyTorch training\n", + "\n", + "\n", + "\n", + "### In-process Client API\n", + "\n", + "In this setup, the client training script operates within the same process as the NVFlare Client job. This configuration, utilizing the ```InProcessClientAPIExecutor```, offers shared memory usage and is efficient with simple configuration. \n", + "This is the default for `ScriptRunner` since by default `launch_external_process=False`. 
Use this configuration for development or single GPU training.\n", + "\n", + "### Sub-process Client API: \n", + "\n", + "Here, the client training script runs in a separate subprocess.\n", + "\n", + "Utilizing the ```ClientAPILauncherExecutor```, this option offers flexibility in communication mechanisms:\n", + " * Communication via CellPipe (default)\n", + " * Communication via FilePipe (no capability to stream metrics for experiment tracking) \n", + "\n", + "This configuration is ideal for scenarios requiring multi-GPU or distributed PyTorch training.\n", + "\n", + "Choose the option best suited to your specific requirements and workflow preferences.\n", + "\n", + "\n", + "## Client API communication patterns\n", + "\n", + "We have two different implementations of the Client API tailored to different scenarios, each linked with distinct communication patterns.\n", + "\n", + "Broadly, we present in-process and sub-process executors. The in-process executor entails both training scripts and client executor operating within the same process.\n", + "\n", + "\n", + "On the other hand, the LauncherExecutor employs a sub-process to execute training scripts, leading to the client executor and training scripts residing in separate processes. Communication between them is facilitated by either CellPipe (default) or FilePipe.\n", + "\n", + "\"Client\n", + "\n", + "\n", + "\n", + "### Choice of different Pipes\n", + "We suggest using the default setting with CellPipe for most users.\n", + "\n", + "CellPipe facilitates TCP-based cell-to-cell connections between the Executor and training script processes on the local host. The term cell represents logical endpoints. This communication enables the exchange of models, metrics, and metadata between the two processes.\n", + "\n", + "In contrast, FilePipe offers file-based communication between the Executor and training script processes, utilizing a job-specific file directory for exchanging models and metadata via files. 
While FilePipe is easier to set up than CellPipe, it’s not suitable for high-frequency metrics exchange. On the other hand, FilePipe might be a better choice for scenarios where the training script is running on a remote machine and the client executor is running on the local machine.\n", + "\n", + "\n", + "## Client API Examples\n", + "\n", + "All implementations can be easily configured using the JobAPI's `ScriptRunner`. By default, the in-process implementation is used; however, setting `launch_external_process=True` uses the sub-process implementation with pre-configured CellPipes for communication and metrics streaming.\n", + "\n", + "To find out more about the Client API and its pipe configurations, please refer to the [Client API](https://nvflare.readthedocs.io/en/2.4/programming_guide/execution_api_type/client_api.html). In this tutorial, we will only use the in-process implementation for simplicity. You can follow the examples in [ML-To-FL](https://github.com/NVIDIA/NVFlare/tree/main/examples/hello-world/ml-to-fl) for sub-process training with different pipes. 
\n", + "\n", + "In the following sections, we will see how to use the Client API with PyTorch Lightning and machine learning algorithms.\n", + "\n", + "\n", + "* [Convert PyTorch lightning to federated learning](../02.3_convert_torch_lightning_to_federated_learning/convert_torch_lightning_to_fl.ipynb)\n", + "\n", + "* [Convert logistic regression to federated learning](../02.4_convert_machine_learning_to_federated_learning/02.3.1_convert_logistic_regression_to_federated_learning/convert_logistic_regression_to_fl.ipynb)\n", + "\n", + "* [Convert Kmeans to federated learning](../02.4_convert_machine_learning_to_federated_learning/02.3.2_convert_kmeans_to_federated_learning/convert_kmeans_to_fl.ipynb)\n", + "\n", + "* [Convert survival analysis to federated learning](../02.4_convert_machine_learning_to_federated_learning/02.3.3_convert_survival_analysis_to_federated_learning/convert_survival_analysis_to_fl.ipynb)\n" + ] + }, + { + "cell_type": "markdown", + "id": "fffcd761", + "metadata": {}, + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "nvflare_example", + "language": "python", + "name": "nvflare_example" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.2" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.2_client_api/client_api_communication_pattern.png b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.2_client_api/client_api_communication_pattern.png new file mode 100644 index 0000000000..4a50383778 Binary files /dev/null and 
b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.2_client_api/client_api_communication_pattern.png differ diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.2_convert_torch_lightning_to_federated_learning/code/log_config.json b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.2_convert_torch_lightning_to_federated_learning/code/log_config.json deleted file mode 100644 index e5732b4950..0000000000 --- a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.2_convert_torch_lightning_to_federated_learning/code/log_config.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "version": 1, - "disable_existing_loggers": false, - "formatters": { - "baseFormatter": { - "()": "nvflare.fuel.utils.log_utils.BaseFormatter", - "fmt": "%(asctime)s - %(name)s - %(levelname)s - %(fl_ctx)s - %(message)s" - }, - "colorFormatter": { - "()": "nvflare.fuel.utils.log_utils.ColorFormatter", - "fmt": "%(asctime)s - %(levelname)s - %(message)s", - "datefmt": "%Y-%m-%d %H:%M:%S" - }, - "jsonFormatter": { - "()": "nvflare.fuel.utils.log_utils.JsonFormatter", - "fmt": "%(asctime)s - %(identity)s - %(name)s - %(fullName)s - %(levelname)s - %(fl_ctx)s - %(message)s" - } - }, - "filters": { - "FLFilter": { - "()": "nvflare.fuel.utils.log_utils.LoggerNameFilter", - "logger_names": ["custom", "nvflare.app_common", "nvflare.app_opt"] - } - }, - "handlers": { - "consoleHandler": { - "class": "logging.StreamHandler", - "level": "INFO", - "formatter": "colorFormatter", - "filters": ["FLFilter"], - "stream": "ext://sys.stdout" - }, - "logFileHandler": { - "class": "logging.handlers.RotatingFileHandler", - "level": "DEBUG", - "formatter": "baseFormatter", - "filename": "log.txt", - 
"mode": "a", - "maxBytes": 20971520, - "backupCount": 10 - }, - "errorFileHandler": { - "class": "logging.handlers.RotatingFileHandler", - "level": "ERROR", - "formatter": "baseFormatter", - "filename": "log_error.txt", - "mode": "a", - "maxBytes": 20971520, - "backupCount": 10 - }, - "jsonFileHandler": { - "class": "logging.handlers.RotatingFileHandler", - "level": "DEBUG", - "formatter": "jsonFormatter", - "filename": "log.json", - "mode": "a", - "maxBytes": 20971520, - "backupCount": 10 - }, - "FLFileHandler": { - "class": "logging.handlers.RotatingFileHandler", - "level": "DEBUG", - "formatter": "baseFormatter", - "filters": ["FLFilter"], - "filename": "log_fl.txt", - "mode": "a", - "maxBytes": 20971520, - "backupCount": 10, - "delay": true - } - }, - "loggers": { - "root": { - "level": "INFO", - "handlers": ["consoleHandler", "logFileHandler", "errorFileHandler", "jsonFileHandler", "FLFileHandler"] - } - } -} - - - - - - - - - diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.1_convert_logistic_regression_to_federated_learning/code/log_config.json b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.1_convert_logistic_regression_to_federated_learning/code/log_config.json deleted file mode 100644 index e5732b4950..0000000000 --- a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.1_convert_logistic_regression_to_federated_learning/code/log_config.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "version": 1, - "disable_existing_loggers": false, - "formatters": { - "baseFormatter": { - "()": "nvflare.fuel.utils.log_utils.BaseFormatter", - "fmt": "%(asctime)s - 
%(name)s - %(levelname)s - %(fl_ctx)s - %(message)s" - }, - "colorFormatter": { - "()": "nvflare.fuel.utils.log_utils.ColorFormatter", - "fmt": "%(asctime)s - %(levelname)s - %(message)s", - "datefmt": "%Y-%m-%d %H:%M:%S" - }, - "jsonFormatter": { - "()": "nvflare.fuel.utils.log_utils.JsonFormatter", - "fmt": "%(asctime)s - %(identity)s - %(name)s - %(fullName)s - %(levelname)s - %(fl_ctx)s - %(message)s" - } - }, - "filters": { - "FLFilter": { - "()": "nvflare.fuel.utils.log_utils.LoggerNameFilter", - "logger_names": ["custom", "nvflare.app_common", "nvflare.app_opt"] - } - }, - "handlers": { - "consoleHandler": { - "class": "logging.StreamHandler", - "level": "INFO", - "formatter": "colorFormatter", - "filters": ["FLFilter"], - "stream": "ext://sys.stdout" - }, - "logFileHandler": { - "class": "logging.handlers.RotatingFileHandler", - "level": "DEBUG", - "formatter": "baseFormatter", - "filename": "log.txt", - "mode": "a", - "maxBytes": 20971520, - "backupCount": 10 - }, - "errorFileHandler": { - "class": "logging.handlers.RotatingFileHandler", - "level": "ERROR", - "formatter": "baseFormatter", - "filename": "log_error.txt", - "mode": "a", - "maxBytes": 20971520, - "backupCount": 10 - }, - "jsonFileHandler": { - "class": "logging.handlers.RotatingFileHandler", - "level": "DEBUG", - "formatter": "jsonFormatter", - "filename": "log.json", - "mode": "a", - "maxBytes": 20971520, - "backupCount": 10 - }, - "FLFileHandler": { - "class": "logging.handlers.RotatingFileHandler", - "level": "DEBUG", - "formatter": "baseFormatter", - "filters": ["FLFilter"], - "filename": "log_fl.txt", - "mode": "a", - "maxBytes": 20971520, - "backupCount": 10, - "delay": true - } - }, - "loggers": { - "root": { - "level": "INFO", - "handlers": ["consoleHandler", "logFileHandler", "errorFileHandler", "jsonFileHandler", "FLFileHandler"] - } - } -} - - - - - - - - - diff --git 
a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.1_convert_logistic_regression_to_federated_learning/code/lr_fl_job.py b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.1_convert_logistic_regression_to_federated_learning/code/lr_fl_job.py deleted file mode 100644 index 4cc8c55e48..0000000000 --- a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.1_convert_logistic_regression_to_federated_learning/code/lr_fl_job.py +++ /dev/null @@ -1,50 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from src.newton_raphson_persistor import NewtonRaphsonModelPersistor -from src.newton_raphson_workflow import FedAvgNewtonRaphson - -from nvflare.app_opt.pt.job_config.base_fed_job import BaseFedJob -from nvflare.client.config import ExchangeFormat -from nvflare.job_config.script_runner import ScriptRunner - -if __name__ == "__main__": - n_clients = 4 - num_rounds = 5 - - job = BaseFedJob( - name="logistic_regression_fedavg", - model_persistor=NewtonRaphsonModelPersistor(n_features=13), - ) - - controller = FedAvgNewtonRaphson( - num_clients=n_clients, - num_rounds=num_rounds, - damping_factor=0.8, - persistor_id="newton_raphson_persistor", - ) - job.to(controller, "server") - - # Add clients - for i in range(n_clients): - runner = ScriptRunner( - script="src/newton_raphson_train.py", - script_args="--data_root /tmp/flare/dataset/heart_disease_data", - launch_external_process=True, - params_exchange_format=ExchangeFormat.RAW, - ) - job.to(runner, f"site-{i + 1}") - - job.export_job("/tmp/nvflare/jobs/job_config") - job.simulator_run("/tmp/nvflare/jobs/workdir", gpu="0", log_config="./log_config.json") diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.1_convert_logistic_regression_to_federated_learning/code/newton_raphson/app/custom/newton_raphson_train.py b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.1_convert_logistic_regression_to_federated_learning/code/newton_raphson/app/custom/newton_raphson_train.py deleted file mode 100644 index 419b9ed70b..0000000000 --- 
a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.1_convert_logistic_regression_to_federated_learning/code/newton_raphson/app/custom/newton_raphson_train.py +++ /dev/null @@ -1,184 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import argparse -import os - -import numpy as np -from sklearn.metrics import accuracy_score, precision_score - -import nvflare.client as flare -from nvflare.apis.fl_constant import FLMetaKey -from nvflare.app_common.abstract.fl_model import FLModel, ParamsType -from nvflare.app_common.np.constants import NPConstants -from nvflare.client.tracking import SummaryWriter - - -def parse_arguments(): - """ - Parse command line args for client side training. - """ - parser = argparse.ArgumentParser(description="Federated Second-Order Newton Raphson") - - parser.add_argument("--data_root", type=str, help="Path to load client side data.") - - return parser.parse_args() - - -def load_data(data_root, site_name): - """ - Load the data for each client. - - Args: - data_root: root directory storing client site data. - site_name: client site name - Returns: - A dict with client site training and validation data. 
- """ - print("loading data for client {} from: {}".format(site_name, data_root)) - train_x_path = os.path.join(data_root, "{}.train.x.npy".format(site_name)) - train_y_path = os.path.join(data_root, "{}.train.y.npy".format(site_name)) - test_x_path = os.path.join(data_root, "{}.test.x.npy".format(site_name)) - test_y_path = os.path.join(data_root, "{}.test.y.npy".format(site_name)) - - train_X = np.load(train_x_path) - train_y = np.load(train_y_path) - valid_X = np.load(test_x_path) - valid_y = np.load(test_y_path) - - return {"train_X": train_X, "train_y": train_y, "valid_X": valid_X, "valid_y": valid_y} - - -def sigmoid(inp): - return 1.0 / (1.0 + np.exp(-inp)) - - -def train_newton_raphson(data, theta): - """ - Compute gradient and hessian on local data - based on paramters received from server. - - """ - train_X = data["train_X"] - train_y = data["train_y"] - - # Add intercept, pre-pend 1s to as first - # column of train_X - train_X = np.concatenate((np.ones((train_X.shape[0], 1)), train_X), axis=1) - - # Compute probabilities from current weights - proba = sigmoid(np.dot(train_X, theta)) - - # The gradient is X^T . (y - proba) - gradient = np.dot(train_X.T, (train_y - proba)) - - # The hessian is X^T . D . X, where D is the - # diagnoal matrix with values proba * (1 - proba) - D = np.diag((proba * (1 - proba))[:, 0]) - hessian = train_X.T.dot(D).dot(train_X) - - return {"gradient": gradient, "hessian": hessian} - - -def validate(data, theta): - """ - Performs local validation. - Computes accuracy and precision scores. 
- - """ - valid_X = data["valid_X"] - valid_y = data["valid_y"] - - # Add intercept, pre-pend 1s to as first - # column of valid_X - valid_X = np.concatenate((np.ones((valid_X.shape[0], 1)), valid_X), axis=1) - - # Compute probabilities from current weights - proba = sigmoid(np.dot(valid_X, theta)) - - return {"accuracy": accuracy_score(valid_y, proba.round()), "precision": precision_score(valid_y, proba.round())} - - -def main(): - """ - This is a typical ML training loop, - augmented with Flare Client API to - perform local training on each client - side and send result to server. - - """ - args = parse_arguments() - - flare.init() - - site_name = flare.get_site_name() - print("training on client site: {}".format(site_name)) - - # Load client site data. - data = load_data(args.data_root, site_name) - - # Get metric summary writer - writer = SummaryWriter() - - while flare.is_running(): - - # Receive global model (FLModel) from server. - global_model = flare.receive() - - curr_round = global_model.current_round - print("current_round={}".format(curr_round)) - - print( - ("[ROUND {}] - client site: {}, received " "global model: {}").format(curr_round, site_name, global_model) - ) - - # Get the weights, aka parameter theta for - # logistic regression. 
- global_weights = global_model.params[NPConstants.NUMPY_KEY] - print("[ROUND {}] - global model weights: {}".format(curr_round, global_weights)) - - # Local validation before training - print(("[ROUND {}] - start validation of global " "model on client: {}").format(curr_round, site_name)) - validation_scores = validate(data, global_weights) - print( - ("[ROUND {}] - validation metric scores on " "client: {} = {}").format( - curr_round, site_name, validation_scores - ) - ) - - # Write validation metric summary - writer.add_scalar("{}/accuracy".format(site_name), validation_scores["accuracy"], curr_round) - - writer.add_scalar("{}/precision".format(site_name), validation_scores["precision"], curr_round) - - # Local training - print(("[ROUND {}] - start local training on client " "site: {}").format(curr_round, site_name)) - result_dict = train_newton_raphson(data, theta=global_weights) - - # Send result to server for aggregation. - result_model = FLModel(params=result_dict, params_type=ParamsType.FULL) - result_model.meta[FLMetaKey.NUM_STEPS_CURRENT_ROUND] = data["train_X"].shape[0] - - print( - ( - "[ROUND {}] - local newton raphson training from " "client: {} complete, sending results to server: {}" - ).format(curr_round, site_name, result_model) - ) - - flare.send(result_model) - - -if __name__ == "__main__": - main() diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.1_convert_logistic_regression_to_federated_learning/code/newton_raphson/app/custom/newton_raphson_workflow.py b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.1_convert_logistic_regression_to_federated_learning/code/newton_raphson/app/custom/newton_raphson_workflow.py deleted file mode 100644 index a4094cb7f6..0000000000 
--- a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.1_convert_logistic_regression_to_federated_learning/code/newton_raphson/app/custom/newton_raphson_workflow.py +++ /dev/null @@ -1,167 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from typing import List - -import numpy as np - -from nvflare.apis.fl_constant import FLMetaKey -from nvflare.app_common.abstract.fl_model import FLModel -from nvflare.app_common.aggregators.weighted_aggregation_helper import WeightedAggregationHelper -from nvflare.app_common.app_constant import AppConstants -from nvflare.app_common.np.constants import NPConstants -from nvflare.app_common.workflows.base_fedavg import BaseFedAvg - - -class FedAvgNewtonRaphson(BaseFedAvg): - def __init__(self, damping_factor, epsilon=1.0, *args, **kwargs): - super().__init__(*args, **kwargs) - """ - Init function for FedAvgNewtonRaphson. - - Args: - damping_factor: damping factor for Newton Raphson updates. - epsilon: a regularization factor to avoid empty hessian for - matrix inversion - """ - self.damping_factor = damping_factor - self.epsilon = epsilon - self.aggregator = WeightedAggregationHelper() - - def run(self) -> None: - """ - The run function executes the logic of federated - second order Newton Raphson optimization. 
- - """ - self.info("starting Federated Averaging Netwon Raphson ...") - - # First load the model and set up some training params. - # A `persisitor` (NewtonRaphsonModelPersistor) will load - # the model in `ModelLearnable` format, then will be - # converted `FLModel` by `ModelController`. - # - model = self.load_model() - - model.start_round = self.start_round - model.total_rounds = self.num_rounds - - self.info("Server side model loader: {}".format(model)) - - for self.current_round in range(self.start_round, self.start_round + self.num_rounds): - self.info(f"Round {self.current_round} started.") - - # Get the list of clients. - clients = self.sample_clients(self.num_clients) - - model.current_round = self.current_round - - # Send training task and current global model to clients. - # - # A `task` isntance will be created, and sent - # to clients, the model is first converted to a shareable - # and is attached to the task. - # - # After the task is finished, the result (shareable) recieved - # from the task is converted to FLModel, and is returned to the - # server. The `results` below is a list with result (FLModel) - # from all clients. - # - # The full logic of `task` is implemented in: - # https://github.com/NVIDIA/NVFlare/blob/d6827bca96d332adb3402ceceb4b67e876146067/nvflare/app_common/workflows/model_controller.py#L178 - # - self.info("sending server side global model to clients") - results = self.send_model_and_wait(targets=clients, data=model) - - # Aggregate results receieved from clients. - aggregate_results = self.aggregate(results, aggregate_fn=self.newton_raphson_aggregator_fn) - - # Update global model based on the following formula: - # weights = weights + updates, where - # updates = -damping_factor * Hessian^{-1} . Gradient - self.update_model(model, aggregate_results) - - # Save global model. 
- self.save_model(model) - - self.info("Finished FedAvg.") - - def newton_raphson_aggregator_fn(self, results: List[FLModel]): - """ - Custom aggregator function for second order Newton Raphson - optimization. - - This uses the default thread-safe WeightedAggregationHelper, - which implement a weighted average of all values received from - a `result` dictionary. - - Args: - results: a list of `FLModel`s. Each `FLModel` is received - from a client. The field `params` is a dictionary that - contains values to be aggregated: the gradient and hessian. - """ - self.info("receieved results from clients: {}".format(results)) - - # On client side the `NUM_STEPS_CURRENT_ROUND` key - # is used to track the number of samples for each client. - for curr_result in results: - self.aggregator.add( - data=curr_result.params, - weight=curr_result.meta.get(FLMetaKey.NUM_STEPS_CURRENT_ROUND, 1.0), - contributor_name=curr_result.meta.get("client_name", AppConstants.CLIENT_UNKNOWN), - contribution_round=curr_result.current_round, - ) - - aggregated_dict = self.aggregator.get_result() - self.info("aggregated result: {}".format(aggregated_dict)) - - # Compute global model update: - # update = - damping_factor * Hessian^{-1} . Gradient - # A regularization is added to avoid empty hessian. - # - reg = self.epsilon * np.eye(aggregated_dict["hessian"].shape[0]) - newton_raphson_updates = self.damping_factor * np.linalg.solve( - aggregated_dict["hessian"] + reg, aggregated_dict["gradient"] - ) - self.info("newton raphson updates: {}".format(newton_raphson_updates)) - - # Convert the aggregated result to `FLModel`, this `FLModel` - # will then be used by `update_model` method from the base class, - # to update the global model weights. 
- # - aggr_result = FLModel( - params={"newton_raphson_updates": newton_raphson_updates}, - params_type=results[0].params_type, - meta={ - "nr_aggregated": len(results), - AppConstants.CURRENT_ROUND: results[0].current_round, - AppConstants.NUM_ROUNDS: self.num_rounds, - }, - ) - return aggr_result - - def update_model(self, model, model_update, replace_meta=True) -> FLModel: - """ - Update logistic regression parameters based on - aggregated gradient and hessian. - - """ - if replace_meta: - model.meta = model_update.meta - else: - model.meta.update(model_update.meta) - - model.metrics = model_update.metrics - model.params[NPConstants.NUMPY_KEY] += model_update.params["newton_raphson_updates"] diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.1_convert_logistic_regression_to_federated_learning/code/src/newton_raphson_persistor.py b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.1_convert_logistic_regression_to_federated_learning/code/src/newton_raphson_persistor.py deleted file mode 100644 index 5b324dd50c..0000000000 --- a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.1_convert_logistic_regression_to_federated_learning/code/src/newton_raphson_persistor.py +++ /dev/null @@ -1,64 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import numpy as np - -from nvflare.app_common.np.np_model_persistor import NPModelPersistor - - -class NewtonRaphsonModelPersistor(NPModelPersistor): - """ - This class defines the persistor for Newton Raphson model. - - A persistor controls the logic behind initializing, loading - and saving of the model / parameters for each round of a - federated learning process. - - In the 2nd order Newton Raphson case, a model is just a - 1-D numpy vector containing the parameters for logistic - regression. The length of the parameter vector is defined - by the number of features in the dataset. - - """ - - def __init__(self, model_dir="models", model_name="weights.npy", n_features=13): - """ - Init function for NewtonRaphsonModelPersistor. - - Args: - model_dir: sub-folder name to save and load the global model - between rounds. - model_name: name to save and load the global model. - n_features: number of features for the logistic regression. - For the UCI ML heart Disease dataset, this is 13. - - """ - - super().__init__() - - self.model_dir = model_dir - self.model_name = model_name - self.n_features = n_features - - # A default model is loaded when no local model is available. - # This happen when training starts. - # - # A `model` for a binary logistic regression is just a matrix, - # with shape (n_features + 1, 1). - # For the UCI ML Heart Disease dataset, the n_features = 13. - # - # A default matrix with value 0s is created. 
- # - self.default_data = np.zeros((self.n_features + 1, 1), dtype=np.float32) diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.1_convert_logistic_regression_to_federated_learning/convert_logistic_regression_to_fl.ipynb b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.1_convert_logistic_regression_to_federated_learning/convert_logistic_regression_to_fl.ipynb deleted file mode 100644 index 5bf4d3f4f1..0000000000 --- a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.1_convert_logistic_regression_to_federated_learning/convert_logistic_regression_to_fl.ipynb +++ /dev/null @@ -1,341 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "e8c19632", - "metadata": {}, - "source": [ - "# Converting Logistic Regression to FL" - ] - }, - { - "cell_type": "markdown", - "id": "7f9d96ed", - "metadata": {}, - "source": [ - "## Federated Logistic Regression with Second-Order Newton-Raphson optimization\n", - "This example shows how to implement a federated binary classification via logistic regression with second-order Newton-Raphson optimization.\n", - "\n", - "The [UCI Heart Disease dataset](https://archive.ics.uci.edu/dataset/45/heart+disease) is\n", - "used in this example. 
Scripts are provided to download and process the\n", - "dataset as described\n", - "[here](https://github.com/owkin/FLamby/tree/main/flamby/datasets/fed_heart_disease).\n", - "\n", - "This dataset contains samples from 4 sites, splitted into training and\n", - "testing sets as described below:\n", - "|site | sample split |\n", - "|-------------|---------------------------------------|\n", - "|Cleveland | train: 199 samples, test: 104 samples |\n", - "|Hungary | train: 172 samples, test: 89 samples |\n", - "|Switzerland | train: 30 samples, test: 16 samples |\n", - "|Long Beach V | train: 85 samples, test: 45 samples |\n", - "\n", - "The number of features in each sample is 13." - ] - }, - { - "cell_type": "markdown", - "id": "e54f0dcc", - "metadata": {}, - "source": [ - "## Introduction\n", - "\n", - "The [Newton-Raphson\n", - "optimization](https://en.wikipedia.org/wiki/Newton%27s_method) problem\n", - "can be described as follows.\n", - "\n", - "In a binary classification task with logistic regression, the\n", - "probability of a data sample $x$ classified as positive is formulated\n", - "as:\n", - "$$p(x) = \\sigma(\\beta \\cdot x + \\beta_{0})$$\n", - "where $\\sigma(.)$ denotes the sigmoid function. We can incorporate\n", - "$\\beta_{0}$ and $\\beta$ into a single parameter vector $\\theta =\n", - "( \\beta_{0}, \\beta)$. Let $d$ be the number\n", - "of features for each data sample $x$ and let $N$ be the number of data\n", - "samples. 
We then have the matrix version of the above probability\n", - "equation:\n", - "$$p(X) = \\sigma( X \\theta )$$\n", - "Here $X$ is the matrix of all samples, with shape $N \\times (d+1)$,\n", - "having it's first column filled with value 1 to account for the\n", - "intercept $\\theta_{0}$.\n", - "\n", - "The goal is to compute parameter vector $\\theta$ that maximizes the\n", - "below likelihood function:\n", - "$$L_{\\theta} = \\prod_{i=1}^{N} p(x_i)^{y_i} (1 - p(x_i)^{1-y_i})$$\n", - "\n", - "The Newton-Raphson method optimizes the likelihood function via\n", - "quadratic approximation. Omitting the maths, the theoretical update\n", - "formula for parameter vector $\\theta$ is:\n", - "$$\\theta^{n+1} = \\theta^{n} - H_{\\theta^{n}}^{-1} \\nabla L_{\\theta^{n}}$$\n", - "where\n", - "$$\\nabla L_{\\theta^{n}} = X^{T}(y - p(X))$$\n", - "is the gradient of the likelihood function, with $y$ being the vector\n", - "of ground truth for sample data matrix $X$, and\n", - "$$H_{\\theta^{n}} = -X^{T} D X$$\n", - "is the Hessian of the likelihood function, with $D$ a diagonal matrix\n", - "where diagonal value at $(i,i)$ is $D(i,i) = p(x_i) (1 - p(x_i))$.\n", - "\n", - "In federated Newton-Raphson optimization, each client will compute its\n", - "own gradient $\\nabla L_{\\theta^{n}}$ and Hessian $H_{\\theta^{n}}$\n", - "based on local training samples. A server will aggregate the gradients\n", - "and Hessians computed from all clients, and perform the update of\n", - "parameter $\\theta$ based on the theoretical update formula described\n", - "above." 
- ] - }, - { - "cell_type": "markdown", - "id": "32003ba9", - "metadata": {}, - "source": [ - "## Implementation\n", - "\n", - "Using `nvflare`, The federated logistic regression with Newton-Raphson\n", - "optimization is implemented as follows.\n", - "\n", - "On the server side, all workflow logics are implemented in\n", - "class `FedAvgNewtonRaphson`, which can be found\n", - "[here](code/newton_raphson/app/custom/newton_raphson_workflow.py). The\n", - "`FedAvgNewtonRaphson` class inherits from the\n", - "[`BaseFedAvg`](https://github.com/NVIDIA/NVFlare/blob/main/nvflare/app_common/workflows/base_fedavg.py)\n", - "class, which itself inherits from the **ModelController**\n", - "([`ModelController`](https://github.com/NVIDIA/NVFlare/blob/main/nvflare/app_common/workflows/model_controller.py))\n", - "class. This is the preferrable approach to implement a custom\n", - "workflow, since `ModelController` decouples communication logic from\n", - "actual workflow (training & validation) logic. The mandatory\n", - "method to override in `ModelController` is the\n", - "[`run()`](https://github.com/NVIDIA/NVFlare/blob/main/nvflare/app_common/workflows/model_controller.py#L37)\n", - "method, where the orchestration of server-side workflow actually\n", - "happens. The implementation of `run()` method in\n", - "[`FedAvgNewtonRaphson`](code/newton_raphson/app/custom/newton_raphson_workflow.py)\n", - "is similar to the classic\n", - "[`FedAvg`](https://github.com/NVIDIA/NVFlare/blob/main/nvflare/app_common/workflows/fedavg.py#L44):\n", - "- Initialize the global model, this is acheived through method `load_model()`\n", - " from base class\n", - " [`ModelController`](https://github.com/NVIDIA/NVFlare/blob/main/nvflare/app_common/workflows/model_controller.py#L292),\n", - " which relies on the\n", - " [`ModelPersistor`](https://nvflare.readthedocs.io/en/main/glossary.html#persistor). 
A\n", - " custom\n", - " [`NewtonRaphsonModelPersistor`](code/newton_raphson/app/custom/newton_raphson_persistor.py)\n", - " is implemented in this example, which is based on the\n", - " [`NPModelPersistor`](https://github.com/NVIDIA/NVFlare/blob/main/nvflare/app_common/np/np_model_persistor.py)\n", - " for numpy data, since the _model_ in the case of logistic regression\n", - " is just the parameter vector $\\theta$ that can be represented by a\n", - " numpy array. Only the `__init__` method needs to be re-implemented\n", - " to provide a proper initialization for the global parameter vector\n", - " $\\theta$.\n", - "- During each training round, the global model will be sent to the\n", - " list of participating clients to perform a training task. This is\n", - " done using the\n", - " [`send_model_and_wait()`](https://github.com/NVIDIA/NVFlare/blob/main/nvflare/app_common/workflows/model_controller.py#L41)\n", - " method. Once\n", - " the clients finish their local training, results will be collected\n", - " and sent back to server as\n", - " [`FLModel`](https://nvflare.readthedocs.io/en/main/programming_guide/fl_model.html#flmodel)s.\n", - "- Results sent by clients contain their locally computed gradient and\n", - " Hessian. A [custom aggregation\n", - " function](code/newton_raphson/app/custom/newton_raphson_workflow.py)\n", - " is implemented to get the averaged gradient and Hessian, and compute\n", - " the Newton-Raphson update for the global parameter vector $\\theta$,\n", - " based on the theoretical formula shown above. The averaging of\n", - " gradient and Hessian is based on the\n", - " [`WeightedAggregationHelper`](https://github.com/NVIDIA/NVFlare/blob/main/nvflare/app_common/aggregators/weighted_aggregation_helper.py#L20),\n", - " which weighs the contribution from each client based on the number\n", - " of local training samples. 
The aggregated Newton-Raphson update is\n", - " returned as an `FLModel`.\n", - "- After getting the aggregated Newton-Raphson update, an\n", - " [`update_model()`](code/newton_raphson/app/custom/newton_raphson_workflow.py#L172)\n", - " method is implemented to actually apply the Newton-Raphson update to\n", - " the global model.\n", - "- The last step is to save the updated global model, again through\n", - " the `NewtonRaphsonModelPersistor` using `save_model()`.\n", - "\n", - "\n", - "On the client side, the local training logic is implemented\n", - "[here](code/newton_raphson/app/custom/newton_raphson_train.py). The\n", - "implementation is based on the [`Client\n", - "API`](https://nvflare.readthedocs.io/en/main/programming_guide/execution_api_type.html#client-api). This\n", - "allows user to add minimum `nvflare`-specific code to turn a typical\n", - "centralized training script into a federated client side local training\n", - "script.\n", - "- During local training, each client receives a copy of the global\n", - " model, sent by the server, using `flare.receive()` from the Client API.\n", - " The received global model is an instance of `FLModel`.\n", - "- A local validation is first performed, where validation metrics\n", - " (accuracy and precision) are streamed to server using the\n", - " [`SummaryWriter`](https://nvflare.readthedocs.io/en/main/apidocs/nvflare.client.tracking.html#nvflare.client.tracking.SummaryWriter). The\n", - " streamed metrics can be loaded and visualized using tensorboard.\n", - "- Then each client computes it's gradient and Hessian based on local\n", - " training data, using their respective theoretical formula described\n", - " above. This is implemented in the\n", - " [`train_newton_raphson()`](code/newton_raphson/app/custom/newton_raphson_train.py#L82)\n", - " method. 
Each client then sends the computed results (always in\n", - " `FLModel` format) to server for aggregation, using the Client API call\n", - " `flare.send()`.\n", - "\n", - "Each client site corresponds to a site listed in the data table above.\n", - "\n", - "A [centralized training script](code/train_centralized.py) is also\n", - "provided, which allows for comparing the federated Newton-Raphson\n", - "optimization versus the centralized version. In the centralized\n", - "version, training data samples from all 4 sites were concatenated into\n", - "a single matrix, used to optimize the model parameters. The\n", - "optimized model was then tested separately on testing data samples of\n", - "the 4 sites, using accuracy and precision as metrics.\n", - "\n", - "Comparing the federated [client-side training\n", - "code](code/newton_raphson/app/custom/newton_raphson_train.py) with the\n", - "centralized [training code](code/train_centralized.py), we can see that\n", - "the training logic remains similar: load data, perform training\n", - "(Newton-Raphson updates), and valid trained model. The only added\n", - "differences in the federated code are related to interaction with the\n", - "FL system, such as receiving and send `FLModel`." 
- ] - }, - { - "cell_type": "markdown", - "id": "c3fc55e0", - "metadata": {}, - "source": [ - "## Install requirements\n", - "First, install the required packages:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "04911ca3", - "metadata": {}, - "outputs": [], - "source": [ - "%pip install -r code/requirements.txt" - ] - }, - { - "cell_type": "markdown", - "id": "33ea8504", - "metadata": {}, - "source": [ - "## Download and prepare data\n", - "\n", - "Execute the following script\n", - "```\n", - "bash ./code/data/prepare_heart_disease_data.sh\n", - "```\n", - "This will download the heart disease dataset under\n", - "`/tmp/flare/dataset/heart_disease_data/`\n", - "\n", - "Please note that you may need to accept the data terms in order to complete the download." - ] - }, - { - "cell_type": "markdown", - "id": "d548b466", - "metadata": {}, - "source": [ - "## Centralized Logistic Regression\n", - "\n", - "Launch the following script:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8c68fe1a", - "metadata": {}, - "outputs": [], - "source": [ - "! python3 code/train_centralized.py --solver custom" - ] - }, - { - "cell_type": "markdown", - "id": "fa666b79", - "metadata": {}, - "source": [ - "Two implementations of logistic regression are provided in the\n", - "centralized training script, which can be specified by the `--solver`\n", - "argument:\n", - "- One is using `sklearn.LogisticRegression` with the `newton-cholesky`\n", - " solver\n", - "- The other one is manually implemented using the theoretical update\n", - " formulas described above.\n", - "\n", - "Both implementations were tested to converge in 4 iterations and to\n", - "give the same result.\n", - "\n", - "Example output:\n", - "```\n", - "using solver: custom\n", - "loading training data.\n", - "training data X loaded. shape: (486, 13)\n", - "training data y loaded. 
shape: (486, 1)\n", - "\n", - "site - 1\n", - "validation set n_samples: 104\n", - "accuracy: 0.75\n", - "precision: 0.7115384615384616\n", - "\n", - "site - 2\n", - "validation set n_samples: 89\n", - "accuracy: 0.7528089887640449\n", - "precision: 0.6122448979591837\n", - "\n", - "site - 3\n", - "validation set n_samples: 16\n", - "accuracy: 0.75\n", - "precision: 1.0\n", - "\n", - "site - 4\n", - "validation set n_samples: 45\n", - "accuracy: 0.6\n", - "precision: 0.9047619047619048\n", - "```" - ] - }, - { - "cell_type": "markdown", - "id": "0b72ef2b", - "metadata": {}, - "source": [ - "## Federated Logistic Regression\n", - "\n", - "Execute the following command to launch federated logistic\n", - "regression. This will run in `nvflare`'s simulator mode.\n", - "```\n", - "nvflare simulator -w ./workspace -n 4 -t 4 job/newton_raphson/\n", - "```\n", - "\n", - "Accuracy and precision for each site can be viewed in Tensorboard:\n", - "```\n", - "tensorboard --logdir=./workspace/server/simulate_job/tb_events\n", - "```\n", - "As can be seen from the figure below, per-site evaluation metrics in\n", - "federated logistic regression are on-par with the centralized version.\n", - "\n", - "\"Tensorboard\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.12" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/convert_ml_to_fl.ipynb 
b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/convert_ml_to_fl.ipynb deleted file mode 100644 index efef1032b9..0000000000 --- a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/convert_ml_to_fl.ipynb +++ /dev/null @@ -1,34 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Simple ML/DL to FL transition with NVFlare\n", - "\n", - "Converting Deep Learning (DL) models to Federated Learning (FL) entails several key steps:\n", - "\n", - " - Formulating the algorithm: This involves determining how to adapt a DL model into an FL framework, including specifying the information exchange protocol between the server and clients.\n", - "\n", - " - Code conversion: Adapting existing standalone DL code into FL-compatible code. This typically involves minimal changes, often just a few lines of code, thanks to tools like NVFlare.\n", - "\n", - " - Workflow configuration: Once the code is modified, configuring the workflow to integrate the newly adapted FL code seamlessly.\n", - "\n", - "NVFlare simplifies the process of transitioning from traditional Machine Learning (ML) or DL algorithms to FL. 
With NVFlare, the conversion process requires only minor code adjustments.\n", - "\n", - "In this section, we have the following three examples for converting traditional ML to FL:\n", - "\n", - " * [Convert Logistics Regression to federated learning](02.3.1_convert_logistic_regression_to_federated_learning/convert_logistic_regression_to_fl.ipynb)\n", - " * [Convert KMeans to federated learning](02.3.2_convert_kmeans_to_federated_learning/convert_kmeans_to_fl.ipynb)\n", - " * [Convert Survival Analysis to federated learning](02.3.3_convert_survival_analysis_to_federated_learning/convert_survival_analysis_to_fl.ipynb)" - ] - } - ], - "metadata": { - "language_info": { - "name": "python" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.2_convert_torch_lightning_to_federated_learning/code/lightning_fl_job.py b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_torch_lightning_to_federated_learning/code/lightning_fl_job.py similarity index 93% rename from examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.2_convert_torch_lightning_to_federated_learning/code/lightning_fl_job.py rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_torch_lightning_to_federated_learning/code/lightning_fl_job.py index b51724a962..aee42bb95b 100644 --- a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.2_convert_torch_lightning_to_federated_learning/code/lightning_fl_job.py +++ 
b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_torch_lightning_to_federated_learning/code/lightning_fl_job.py @@ -41,4 +41,4 @@ job.to(runner, f"site-{i + 1}") job.export_job("/tmp/nvflare/jobs/job_config") - job.simulator_run("/tmp/nvflare/jobs/workdir", gpu="0", log_config="./log_config.json") + job.simulator_run("/tmp/nvflare/jobs/workdir", gpu="0") diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.2_convert_torch_lightning_to_federated_learning/code/requirements.txt b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_torch_lightning_to_federated_learning/code/requirements.txt similarity index 100% rename from examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.2_convert_torch_lightning_to_federated_learning/code/requirements.txt rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_torch_lightning_to_federated_learning/code/requirements.txt diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.2_convert_torch_lightning_to_federated_learning/code/src/cifar10_lightning_fl.py b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_torch_lightning_to_federated_learning/code/src/cifar10_lightning_fl.py similarity index 100% rename from examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.2_convert_torch_lightning_to_federated_learning/code/src/cifar10_lightning_fl.py 
rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_torch_lightning_to_federated_learning/code/src/cifar10_lightning_fl.py diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.2_convert_torch_lightning_to_federated_learning/code/src/lit_net.py b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_torch_lightning_to_federated_learning/code/src/lit_net.py similarity index 100% rename from examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.2_convert_torch_lightning_to_federated_learning/code/src/lit_net.py rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_torch_lightning_to_federated_learning/code/src/lit_net.py diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_torch_lightning_to_federated_learning/controller_worker_flow.png b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_torch_lightning_to_federated_learning/controller_worker_flow.png new file mode 100644 index 0000000000..5deac70f5a Binary files /dev/null and b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_torch_lightning_to_federated_learning/controller_worker_flow.png differ diff --git 
a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.2_convert_torch_lightning_to_federated_learning/convert_torch_lightning_to_fl.ipynb b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_torch_lightning_to_federated_learning/convert_torch_lightning_to_fl.ipynb similarity index 88% rename from examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.2_convert_torch_lightning_to_federated_learning/convert_torch_lightning_to_fl.ipynb rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_torch_lightning_to_federated_learning/convert_torch_lightning_to_fl.ipynb index 71f9a334c6..b09b44d940 100644 --- a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.2_convert_torch_lightning_to_federated_learning/convert_torch_lightning_to_fl.ipynb +++ b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_torch_lightning_to_federated_learning/convert_torch_lightning_to_fl.ipynb @@ -7,7 +7,7 @@ "source": [ "# Converting PyTorch Lightning to FL\n", "\n", - "In this notebook, we use FedAvg and CIFAR10 PyTorch Lightning for client training code to create and run a federated learning job with NVFlare." + "In chapter 1, we have learned how to convert the PyTorch code to a federated learning job with NVFlare. 
In this section and the next section, we will learn how to convert the PyTorch Lightning code to a federated learning job with NVFlare.\n" ] }, { @@ -16,12 +16,16 @@ "metadata": {}, "source": [ "## Basic Concepts\n", + "\n", "At the heart of NVFlare lies the concept of collaboration through\n", "\"tasks.\" An FL controller assigns tasks (e.g., training on local data) to one or more FL clients, processes returned\n", "results (e.g., model weight updates), and may assign additional\n", "tasks based on these results and other factors (e.g., a pre-configured\n", "number of training rounds). The clients run executors which can listen for tasks and perform the necessary computations locally, such as model training. This task-based interaction repeats\n", - "until the experiment’s objectives are met. " + "until the experiment's objectives are met.\n", + "\n", + "\n", + "\"Controller-Executor" ] }, { @@ -30,6 +34,7 @@ "metadata": {}, "source": [ "## Federated Averaging with NVFlare\n", + "\n", "Given the flexible controller and executor concepts, it is easy to implement different computing & communication patterns with NVFlare, such as [FedAvg](https://proceedings.mlr.press/v54/mcmahan17a?ref=https://githubhelp.com) and [cyclic weight transfer](https://academic.oup.com/jamia/article/25/8/945/4956468). \n", "\n", "The controller's `run()` routine is responsible for assigning tasks and processing task results from the Executors. " @@ -114,8 +119,9 @@ "metadata": {}, "source": [ "Using NVFlare's Client Lightning API, we can easily adapt machine learning code that was written for centralized training and apply it in a federated scenario.\n", + "\n", "For general use cases, we can use the Client Lightning API patch function:\n", - "- `flare.patch(trainer)`: Patch the lightning trainer. After flare.patch, functions such as `trainer.fit()` and `trainer.validate()` will get the global model internally and automatically send the result model to the FL server." 
+ "- `flare.patch(trainer)`: Patch the lightning trainer with callbacks. After flare.patch, functions such as `trainer.fit()` and `trainer.validate()` will get the global model internally and automatically send the result model to the FL server." ] }, { @@ -163,7 +169,8 @@ "id": "5da34414-bac4-4352-8077-ab7ade998eec", "metadata": {}, "source": [ - "## Run an NVFlare Job\n", + "## Run a FedAvg PyTorch Lightning Training Job\n", + "\n", "Now that we have defined the FedAvg controller to run our federated compute workflow on the FL server, and our client training script to receive the global models, run local training, and send the results back to the FL server, we can put everything together using NVFlare's Job API." ] }, @@ -183,7 +190,7 @@ "metadata": {}, "outputs": [], "source": [ - "% pip install -r code/requirements.txt" + "! pip install -r code/requirements.txt" ] }, { @@ -196,11 +203,12 @@ ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 1, "id": "93889e62-b725-427c-8839-2771ca81d24c", "metadata": {}, + "outputs": [], "source": [ - "```python\n", "from typing import Any\n", "\n", "import torch\n", @@ -278,8 +286,7 @@ "\n", " def configure_optimizers(self):\n", " optimizer = optim.SGD(self.parameters(), lr=0.001, momentum=0.9)\n", - " return {\"optimizer\": optimizer}\n", - "```" + " return {\"optimizer\": optimizer}\n" ] }, { @@ -296,13 +303,11 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "aaa2b6f4", "metadata": {}, "outputs": [], "source": [ - "from code.src.lit_net import LitNet\n", - "\n", "from nvflare.app_common.workflows.fedavg import FedAvg\n", "from nvflare.app_opt.pt.job_config.base_fed_job import BaseFedJob\n", "from nvflare.job_config.script_runner import ScriptRunner\n", @@ -324,11 +329,15 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "6962e6cc-995e-4356-8156-3ceba2c7a249", "metadata": {}, "outputs": [], "source": [ + "from 
nvflare.app_common.workflows.fedavg import FedAvg\n", + "from nvflare.app_opt.pt.job_config.base_fed_job import BaseFedJob\n", + "from nvflare.job_config.script_runner import ScriptRunner\n", + "\n", "n_clients = 2\n", "\n", "controller = FedAvg(\n", @@ -354,21 +363,27 @@ "#### 4. Add clients\n", "Next, we can use the `ScriptRunner` and send it to each of the clients to run our training script.\n", "\n", - "Note that our script could have additional input arguments, such as batch size or data path, but we don't use them here for simplicity." + "Note that our script could have additional input arguments, such as batch size or data path, but we don't use them here for simplicity.\n", + "```python\n", + "\n", + "for i in range(n_clients):\n", + " runner = ScriptRunner(\n", + " script=\"src/cifar10_lightning_fl.py\", script_args=\"\" # f\"--batch_size 32 --data_path /tmp/data/site-{i}\"\n", + " )\n", + " job.to(runner, f\"site-{i+1}\")\n", + "\n", + "\n", + "```" ] }, { "cell_type": "code", "execution_count": null, - "id": "ad5d36fe-9ae5-43c3-80bc-2cdc66bf7a7e", + "id": "9d391acd", "metadata": {}, "outputs": [], "source": [ - "for i in range(n_clients):\n", - " runner = ScriptRunner(\n", - " script=\"src/cifar10_lightning_fl.py\", script_args=\"\" # f\"--batch_size 32 --data_path /tmp/data/site-{i}\"\n", - " )\n", - " job.to(runner, f\"site-{i+1}\")" + "!cat code/src/cifar10_lightning_fl.py" ] }, { @@ -379,17 +394,12 @@ "That's it!\n", "\n", "#### 5. Optionally export the job\n", - "Now, we could export the job and submit it to a real NVFlare deployment using the [Admin client](https://nvflare.readthedocs.io/en/main/real_world_fl/operation.html) or [FLARE API](https://nvflare.readthedocs.io/en/main/real_world_fl/flare_api.html). 
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "99a270bf-c906-425b-b999-2306cb76eb62", - "metadata": {}, - "outputs": [], - "source": [ - "job.export_job(\"/tmp/nvflare/jobs/job_config\")" + "Now, we could export the job and submit it to a real NVFlare deployment using the [Admin client](https://nvflare.readthedocs.io/en/main/real_world_fl/operation.html) or [FLARE API](https://nvflare.readthedocs.io/en/main/real_world_fl/flare_api.html). \n", + "\n", + "```python\n", + "\n", + "job.export_job(\"/tmp/nvflare/jobs/job_config\")\n", + "```" ] }, { @@ -410,7 +420,10 @@ }, "outputs": [], "source": [ - "job.simulator_run(\"/tmp/nvflare/jobs/workdir\", gpu=\"0\")" + "%cd code/\n", + "! python3 lightning_fl_job.py\n", + "%cd -\n", + "\n" ] }, { @@ -418,15 +431,23 @@ "id": "fb2e1266", "metadata": {}, "source": [ - "You can see the full code for this job in [lightning_fl_job](code/lightning_fl_job.py)." + "You can see the full code for this job in [lightning_fl_job](code/lightning_fl_job.py).\n", + "\n", + "Now that you see how easy it is to convert a PyTorch Lightning training job to a federated learning job with NVFlare, we will show you how to [convert machine learning training jobs to federated learning](../02.4_convert_machine_learning_to_federated_learning/convert_ml_to_fl.ipynb) in the next section." 
] + }, + { + "cell_type": "markdown", + "id": "9370e34a", + "metadata": {}, + "source": [] } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "nvflare_env", "language": "python", - "name": "python3" + "name": "nvflare_env" }, "language_info": { "codemirror_mode": { diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_client_api/client_api.ipynb b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_client_api/client_api.ipynb deleted file mode 100644 index 3a0c46da26..0000000000 --- a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_client_api/client_api.ipynb +++ /dev/null @@ -1,346 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "58149c32", - "metadata": {}, - "source": [ - "# Transform Existing Code to FL Easily with the FLARE Client API" - ] - }, - { - "cell_type": "markdown", - "id": "06203527", - "metadata": {}, - "source": [ - "The FLARE Client API provides an easy way to convert centralized, local training code into federated learning code with just a few lines of code changes.\n", - "\n", - "Most of the previous examples up this point have already been using the Client API, but in this section we focus on the core concepts of the Client API and explain some of the ways it can be configured to help you use the Client API more effectively.\n", - "\n", - "You can see the detailed examples with actual integration with deep learing platforms including PyTorch and TensorFlow here: https://github.com/NVIDIA/NVFlare/tree/main/examples/hello-world/ml-to-fl" - ] - }, - { - "cell_type": "markdown", - "id": "be7efa36", - "metadata": {}, - "source": [ - "## Core Concept" - ] - }, - { - "cell_type": "markdown", - "id": "76102eac", - "metadata": {}, - "source": [ - "The 
general structure of the popular federated learning (FL) workflow, \"FedAvg\" is as follows:\n", - "\n", - "1. **FL server initializes an initial model**\n", - "2. **For each round (global iteration):**\n", - " 1. FL server sends the global model to clients\n", - " 2. Each FL client starts with this global model and trains on their own data\n", - " 3. Each FL client sends back their trained model\n", - " 4. FL server aggregates all the models and produces a new global model\n", - "\n", - "On the client side, the training workflow is as follows:\n", - "\n", - "1. Receive the model from the FL server\n", - "2. Perform local training on the received global model and/or evaluate the received global model for model selection\n", - "3. Send the new model back to the FL server" - ] - }, - { - "cell_type": "markdown", - "id": "50e2b7dd", - "metadata": {}, - "source": [ - "To convert a centralized training code to federated learning, we need to\n", - "adapt the code to do the following steps:\n", - "\n", - "1. Obtain the required information from the received `fl_model`\n", - "2. Run local training\n", - "3. Put the results in a new `fl_model` to be sent back\n", - "\n", - "For a general use case, there are three essential methods for the Client API:\n", - "\n", - "* ``init()``: Initializes NVFlare Client API environment.\n", - "* ``receive()``: Receives model from NVFlare side.\n", - "* ``send()``: Sends the model to NVFlare side.\n", - "\n", - "You can use the Client API to change centralized training code to\n", - "federated learning, for example:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9f21ee16", - "metadata": {}, - "outputs": [], - "source": [ - "import nvflare.client as flare\n", - "\n", - "flare.init() # 1. Initializes NVFlare Client API environment.\n", - "input_model = flare.receive() # 2. Receives model from NVFlare side.\n", - "params = input_model.params # 3. 
Obtain the required information from received FLModel\n", - "\n", - "# original local training code begins\n", - "new_params = local_train(params)\n", - "# original local training code ends\n", - "\n", - "output_model = flare.FLModel(params=new_params) # 4. Put the results in a new FLModel\n", - "flare.send(output_model) # 5. Sends the model to NVFlare side." - ] - }, - { - "cell_type": "markdown", - "id": "494e4079", - "metadata": {}, - "source": [ - "With 5 lines of code changes, we convert the centralized training code to work in a\n", - "federated learning setting.\n", - "\n", - "After this, we can use the job templates and the Job CLI\n", - "to generate a job and export it to run on a deployed NVFlare system or directly run the job using FL Simulator.\n", - "\n", - "To see a table of the key Client APIs, see the [Client API documentation in the programming guide](https://nvflare.readthedocs.io/en/main/programming_guide/execution_api_type/client_api.html#id2).\n", - "\n", - "Please consult the [Client API Module](https://nvflare.readthedocs.io/en/main/apidocs/nvflare.client.api.html) for more in-depth information about all of the Client API functions.\n", - "\n", - "If you are using PyTorch Lightning in your training code, you can check the [Lightning API Module](https://nvflare.readthedocs.io/en/main/apidocs/nvflare.app_opt.lightning.api.html). Also, be sure to look through the [Convert Torch Lightning to FL notebook](../02.2_convert_torch_lightning_to_federated_learning/convert_torch_lightning_to_fl.ipynb) and related code." 
- ] - }, - { - "cell_type": "markdown", - "id": "4a09d80e", - "metadata": {}, - "source": [ - "## Advanced User Options: Client API with Different Implementations\n", - "\n", - "Within the Client API, we offer multiple implementations tailored to diverse requirements:\n", - "\n", - "* In-process Client API: In this setup, the client training script operates within the same process as the NVFlare Client job.\n", - "This configuration, utilizing the ```InProcessClientAPIExecutor```, offers shared memory usage and is efficient with simple configuration. \n", - "This is the default for `ScriptRunner` since by default `launch_external_process=False`. Use this configuration for development or single GPU training.\n", - "\n", - "* Sub-process Client API: Here, the client training script runs in a separate subprocess.\n", - "Utilizing the ```ClientAPILauncherExecutor```, this option offers flexibility in communication mechanisms:\n", - " * Communication via CellPipe (default)\n", - " * Communication via FilePipe (no capability to stream metrics for experiment tracking) \n", - "This configuration is ideal for scenarios requiring multi-GPU or distributed PyTorch training.\n", - "\n", - "Choose the option best suited to your specific requirements and workflow preferences.\n", - "\n", - "These implementations can be easily configured using the JobAPI's `ScriptRunner`.\n", - "By default, the ```InProcessClientAPIExecutor``` is used, however setting `launch_external_process=True` uses the ```ClientAPILauncherExecutor```\n", - "with pre-configured CellPipes for communication and metrics streaming." - ] - }, - { - "cell_type": "markdown", - "id": "b3ac92dd", - "metadata": {}, - "source": [ - "## NVFlare Client API Job with NumPy" - ] - }, - { - "cell_type": "markdown", - "id": "832a4b34", - "metadata": {}, - "source": [ - "In this example we use simple NumPy scripts to showcase the Client API with the `ScriptRunner` for both in-process and sub-process settings. 
With NumPy, only nvflare is needed so you do not have to install any additional dependencies.\n", - "\n", - "The default mode of the `ScriptRunner` uses `InProcessClientAPIExecutor` with the client training script operating within the same process as the NVFlare Client job. Below, we show a script that sends back full model parameters and then one that sends back model parameters differences before explaining metrics streaming and then showing how to launch those same scripts with the Sub-process Client API." - ] - }, - { - "cell_type": "markdown", - "id": "c1af7da6", - "metadata": {}, - "source": [ - "### Send model parameters back to the NVFlare server\n", - "\n", - "We use the mock training script in [train_full.py](code/src/train_full.py)\n", - "and send back the FLModel with `params_type=\"FULL\"`.\n", - "\n", - "After we modify our training script, we can create a job using the ScriptRunner: [np_client_api_job.py](code/np_client_api_job.py).\n", - "\n", - "The script will run the job using the simulator with the Job API by default:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7952cab8", - "metadata": {}, - "outputs": [], - "source": [ - "! python3 code/np_client_api_job.py --script code/src/train_full.py" - ] - }, - { - "cell_type": "markdown", - "id": "5076b478", - "metadata": {}, - "source": [ - "To instead export the job configuration to use in other modes, run the script with the flag `--export_config`." 
- ] - }, - { - "cell_type": "markdown", - "id": "efdceecd", - "metadata": {}, - "source": [ - "### Send model parameters differences back to the NVFlare server\n", - "\n", - "We can send model parameter differences back to the NVFlare server by calculating the parameters differences and sending it back: [train_diff.py](code/src/train_diff.py)\n", - "\n", - "Note that we set the `params_type` to `DIFF` when creating `flare.FLModel`.\n", - "\n", - "Then we can run it using the NVFlare Simulator:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "737d8b7c", - "metadata": {}, - "outputs": [], - "source": [ - "! python3 code/np_client_api_job.py --script code/src/train_diff.py" - ] - }, - { - "cell_type": "markdown", - "id": "550479f2", - "metadata": {}, - "source": [ - "### Metrics streaming\n", - "\n", - "We already showed an example with metrics streaming in section 01.5 of Chapter 1 in Part 1, but this is a simple example with the Client API for streaming the training progress to the server with `MLflowWriter`.\n", - "\n", - "NVFlare supports the following writers:\n", - "\n", - " - `SummaryWriter` mimics Tensorboard `SummaryWriter`'s `add_scalar`, `add_scalars` method\n", - " - `WandBWriter` mimics Weights And Biases's `log` method\n", - " - `MLflowWriter` mimics MLflow's tracking api\n", - "\n", - "In this example we use `MLflowWriter` in [train_metrics.py](code/src/train_metrics.py) and configure a corresponding `MLflowReceiver` in the job script [np_client_api_job.py](code/np_client_api_job.py)\n", - "\n", - "Then we can run it using the NVFlare Simulator:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d27f9b3a", - "metadata": {}, - "outputs": [], - "source": [ - "! 
python3 code/np_client_api_job.py --script code/src/train_metrics.py" - ] - }, - { - "cell_type": "markdown", - "id": "3774d943", - "metadata": {}, - "source": [ - "After the experiment is finished, you can view the results by running the mlflow command: `mlflow ui --port 5000` inside the directory `/tmp/nvflare/jobs/workdir/server/simulate_job/`." - ] - }, - { - "cell_type": "markdown", - "id": "eadee3dd", - "metadata": {}, - "source": [ - "## Sub-process Client API\n", - "\n", - "The `ScriptRunner` with `launch_external_process=True` uses the `ClientAPILauncherExecutor` for external process script execution.\n", - "This configuration is ideal for scenarios requiring third-party integrations, multi-GPU or distributed PyTorch training, or if additional processes are needed for training." - ] - }, - { - "cell_type": "markdown", - "id": "ade375cd", - "metadata": {}, - "source": [ - "### Launching the script\n", - "\n", - "When launching a script in an external process, it is launched once for the entire job.\n", - "We must ensure our training script [train_full.py](code/src/train_full.py) is in a loop to support this.\n", - "\n", - "Then we can run it using the NVFlare Simulator:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0b3ee641", - "metadata": {}, - "outputs": [], - "source": [ - "! python3 code/np_client_api_job.py --script code/src/train_full.py --launch_process" - ] - }, - { - "cell_type": "markdown", - "id": "5006b87a", - "metadata": {}, - "source": [ - "### Metrics streaming\n", - "\n", - "In this example we use `MLflowWriter` in [train_metrics.py](code/src/train_metrics.py) and configure a corresponding `MLflowReceiver` in the job script [np_client_api_job.py](code/np_client_api_job.py)\n", - "\n", - "Then we can run it using the NVFlare Simulator:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "071834d0", - "metadata": {}, - "outputs": [], - "source": [ - "! 
python3 np_client_api_job.py --script src/train_metrics.py --launch_process" - ] - }, - { - "cell_type": "markdown", - "id": "95504253", - "metadata": {}, - "source": [ - "If you want to see example code with actual integration with PyTorch and TensorFlow, you can find it in the [Hello World ML to FL](https://github.com/NVIDIA/NVFlare/tree/main/examples/hello-world/ml-to-fl) section of the examples." - ] - }, - { - "cell_type": "markdown", - "id": "c56e633e", - "metadata": {}, - "source": [ - "With this, we are at the end of Chapter 2. The [next notebook](../02.5_recap/recap.ipynb) is a reacap of this chapter." - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.12" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_client_api/code/np_client_api_job.py b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_client_api/code/np_client_api_job.py deleted file mode 100644 index b2a09c5968..0000000000 --- a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_client_api/code/np_client_api_job.py +++ /dev/null @@ -1,82 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse - -from nvflare import FedJob -from nvflare.app_common.np.np_model_persistor import NPModelPersistor -from nvflare.app_common.workflows.fedavg import FedAvg -from nvflare.app_opt.tracking.mlflow.mlflow_receiver import MLflowReceiver -from nvflare.job_config.script_runner import FrameworkType, ScriptRunner - - -def define_parser(): - parser = argparse.ArgumentParser() - parser.add_argument("--n_clients", type=int, default=2) - parser.add_argument("--num_rounds", type=int, default=5) - parser.add_argument("--script", type=str, default="src/train_full.py") - parser.add_argument("--launch_process", action=argparse.BooleanOptionalAction, default=False) - parser.add_argument("--export_config", action=argparse.BooleanOptionalAction, default=False) - - return parser.parse_args() - - -def main(): - # define local parameters - args = define_parser() - - n_clients = args.n_clients - num_rounds = args.num_rounds - script = args.script - launch_process = args.launch_process - export_config = args.export_config - - job = FedJob(name="np_client_api") - - persistor_id = job.to_server(NPModelPersistor(), "persistor") - - # Define the controller workflow and send to server - controller = FedAvg(num_clients=n_clients, num_rounds=num_rounds, persistor_id=persistor_id) - job.to_server(controller) - - # Add MLflow Receiver for metrics streaming - if script == "src/train_metrics.py": - receiver = MLflowReceiver( - tracking_uri="file:///tmp/nvflare/jobs/workdir/server/simulate_job/mlruns", - kw_args={ - "experiment_name": "nvflare-fedavg-np-experiment", - 
"run_name": "nvflare-fedavg-np-with-mlflow", - "experiment_tags": {"mlflow.note.content": "## **NVFlare FedAvg Numpy experiment with MLflow**"}, - "run_tags": {"mlflow.note.content": "## Federated Experiment tracking with MLflow.\n"}, - }, - artifact_location="artifacts", - events=["fed.analytix_log_stats"], - ) - job.to_server(receiver) - - executor = ScriptRunner( - script=script, - launch_external_process=launch_process, - framework=FrameworkType.NUMPY, - ) - job.to_clients(executor) - - if export_config: - job.export_job("/tmp/nvflare/jobs/job_config") - else: - job.simulator_run("/tmp/nvflare/jobs/workdir", n_clients=n_clients, gpu="0") - - -if __name__ == "__main__": - main() diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_client_api/code/src/train_diff.py b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_client_api/code/src/train_diff.py deleted file mode 100755 index d7dc02ee45..0000000000 --- a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_client_api/code/src/train_diff.py +++ /dev/null @@ -1,71 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import copy - -import nvflare.client as flare - - -def train(input_arr): - output_arr = copy.deepcopy(input_arr) - # mock training with plus 1 - return output_arr + 1 - - -def evaluate(input_arr): - # mock evaluation metrics - return 100 - - -def main(): - # initializes NVFlare interface - flare.init() - - # get system information - sys_info = flare.system_info() - print(f"system info is: {sys_info}") - - while flare.is_running(): - - # get model from NVFlare - input_model = flare.receive() - print(f"received weights is: {input_model.params}") - - input_numpy_array = input_model.params["numpy_key"] - - # training - output_numpy_array = train(input_numpy_array) - - # evaluation - metrics = evaluate(input_numpy_array) - - print(f"finish round: {input_model.current_round}") - - # calculate difference here - diff = output_numpy_array - input_numpy_array - - # send back the model difference - print(f"send back: {diff}") - flare.send( - flare.FLModel( - params={"numpy_key": diff}, - params_type="DIFF", - metrics={"accuracy": metrics}, - current_round=input_model.current_round, - ) - ) - - -if __name__ == "__main__": - main() diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_client_api/code/src/train_full.py b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_client_api/code/src/train_full.py deleted file mode 100755 index e1598275b5..0000000000 --- a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_client_api/code/src/train_full.py +++ /dev/null @@ -1,68 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import copy - -import nvflare.client as flare - - -def train(input_arr): - output_arr = copy.deepcopy(input_arr) - # mock training with plus 1 - return output_arr + 1 - - -def evaluate(input_arr): - # mock evaluation metrics - return 100 - - -def main(): - # initializes NVFlare interface - flare.init() - - # get system information - sys_info = flare.system_info() - print(f"system info is: {sys_info}") - - while flare.is_running(): - - # get model from NVFlare - input_model = flare.receive() - print(f"received weights is: {input_model.params}", flush=True) - - input_numpy_array = input_model.params["numpy_key"] - - # training - output_numpy_array = train(input_numpy_array) - - # evaluation - metrics = evaluate(input_numpy_array) - - print(f"finish round: {input_model.current_round}", flush=True) - - # send back the model - print(f"send back: {output_numpy_array}", flush=True) - flare.send( - flare.FLModel( - params={"numpy_key": output_numpy_array}, - params_type="FULL", - metrics={"accuracy": metrics}, - current_round=input_model.current_round, - ) - ) - - -if __name__ == "__main__": - main() diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_client_api/code/src/train_metrics.py b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_client_api/code/src/train_metrics.py deleted file mode 100755 index c6b24e877d..0000000000 --- 
a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_client_api/code/src/train_metrics.py +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import copy -import time - -import nvflare.client as flare -from nvflare.client.tracking import MLflowWriter - - -def train(input_arr, current_round, epochs=3): - writer = MLflowWriter() - output_arr = copy.deepcopy(input_arr) - num_of_data = 2000 - batch_size = 16 - num_of_batches = num_of_data // batch_size - for i in range(epochs): - for j in range(num_of_batches): - global_step = current_round * num_of_batches * epochs + i * num_of_batches + j - writer.log_metric( - key="global_step", - value=global_step, - step=global_step, - ) - print(f"logged records from epoch: {i}") - # mock training with plus 1 - output_arr += 1 - # assume each epoch takes 1 seconds - time.sleep(1.0) - return output_arr - - -def evaluate(input_arr): - # mock evaluation metrics - return 100 - - -def main(): - # initializes NVFlare interface - flare.init() - - # get system information - sys_info = flare.system_info() - print(f"system info is: {sys_info}") - - while flare.is_running(): - input_model = flare.receive() - print(f"received weights is: {input_model.params}") - - input_numpy_array = input_model.params["numpy_key"] - - # training - output_numpy_array = 
train(input_numpy_array, current_round=input_model.current_round, epochs=3) - - # evaluation - metrics = evaluate(input_numpy_array) - - print(f"finish round: {input_model.current_round}") - - # send back the model - print(f"send back: {output_numpy_array}") - flare.send( - flare.FLModel( - params={"numpy_key": output_numpy_array}, - params_type="FULL", - metrics={"accuracy": metrics}, - current_round=input_model.current_round, - ) - ) - - -if __name__ == "__main__": - main() diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.1_convert_logistic_regression_to_federated_learning/code/data/prepare_heart_disease_data.sh b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_convert_machine_learning_to_federated_learning/02.4.1_convert_logistic_regression_to_federated_learning/code/data/prepare_heart_disease_data.sh similarity index 100% rename from examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.1_convert_logistic_regression_to_federated_learning/code/data/prepare_heart_disease_data.sh rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_convert_machine_learning_to_federated_learning/02.4.1_convert_logistic_regression_to_federated_learning/code/data/prepare_heart_disease_data.sh diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.1_convert_logistic_regression_to_federated_learning/code/data/utils/convert_data_to_np.py 
b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_convert_machine_learning_to_federated_learning/02.4.1_convert_logistic_regression_to_federated_learning/code/data/utils/convert_data_to_np.py similarity index 100% rename from examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.1_convert_logistic_regression_to_federated_learning/code/data/utils/convert_data_to_np.py rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_convert_machine_learning_to_federated_learning/02.4.1_convert_logistic_regression_to_federated_learning/code/data/utils/convert_data_to_np.py diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.1_convert_logistic_regression_to_federated_learning/code/figs/tb-metrics.png b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_convert_machine_learning_to_federated_learning/02.4.1_convert_logistic_regression_to_federated_learning/code/figs/tb-metrics.png similarity index 100% rename from examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.1_convert_logistic_regression_to_federated_learning/code/figs/tb-metrics.png rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_convert_machine_learning_to_federated_learning/02.4.1_convert_logistic_regression_to_federated_learning/code/figs/tb-metrics.png diff --git 
a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.1_convert_logistic_regression_to_federated_learning/code/newton_raphson/app/config/config_fed_client.json b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_convert_machine_learning_to_federated_learning/02.4.1_convert_logistic_regression_to_federated_learning/code/newton_raphson/app/config/config_fed_client.json similarity index 100% rename from examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.1_convert_logistic_regression_to_federated_learning/code/newton_raphson/app/config/config_fed_client.json rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_convert_machine_learning_to_federated_learning/02.4.1_convert_logistic_regression_to_federated_learning/code/newton_raphson/app/config/config_fed_client.json diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.1_convert_logistic_regression_to_federated_learning/code/newton_raphson/app/config/config_fed_server.json b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_convert_machine_learning_to_federated_learning/02.4.1_convert_logistic_regression_to_federated_learning/code/newton_raphson/app/config/config_fed_server.json similarity index 100% rename from 
examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.1_convert_logistic_regression_to_federated_learning/code/newton_raphson/app/config/config_fed_server.json rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_convert_machine_learning_to_federated_learning/02.4.1_convert_logistic_regression_to_federated_learning/code/newton_raphson/app/config/config_fed_server.json diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.1_convert_logistic_regression_to_federated_learning/code/newton_raphson/app/custom/newton_raphson_persistor.py b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_convert_machine_learning_to_federated_learning/02.4.1_convert_logistic_regression_to_federated_learning/code/newton_raphson/app/custom/newton_raphson_persistor.py similarity index 100% rename from examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.1_convert_logistic_regression_to_federated_learning/code/newton_raphson/app/custom/newton_raphson_persistor.py rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_convert_machine_learning_to_federated_learning/02.4.1_convert_logistic_regression_to_federated_learning/code/newton_raphson/app/custom/newton_raphson_persistor.py diff --git 
a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.1_convert_logistic_regression_to_federated_learning/code/src/newton_raphson_train.py b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_convert_machine_learning_to_federated_learning/02.4.1_convert_logistic_regression_to_federated_learning/code/newton_raphson/app/custom/newton_raphson_train.py similarity index 97% rename from examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.1_convert_logistic_regression_to_federated_learning/code/src/newton_raphson_train.py rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_convert_machine_learning_to_federated_learning/02.4.1_convert_logistic_regression_to_federated_learning/code/newton_raphson/app/custom/newton_raphson_train.py index 419b9ed70b..1f426e4a77 100644 --- a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.1_convert_logistic_regression_to_federated_learning/code/src/newton_raphson_train.py +++ b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_convert_machine_learning_to_federated_learning/02.4.1_convert_logistic_regression_to_federated_learning/code/newton_raphson/app/custom/newton_raphson_train.py @@ -20,7 +20,6 @@ from sklearn.metrics import accuracy_score, precision_score import nvflare.client as flare -from nvflare.apis.fl_constant import FLMetaKey from nvflare.app_common.abstract.fl_model import FLModel, 
ParamsType from nvflare.app_common.np.constants import NPConstants from nvflare.client.tracking import SummaryWriter @@ -169,7 +168,7 @@ def main(): # Send result to server for aggregation. result_model = FLModel(params=result_dict, params_type=ParamsType.FULL) - result_model.meta[FLMetaKey.NUM_STEPS_CURRENT_ROUND] = data["train_X"].shape[0] + result_model.meta["sample_size"] = data["train_X"].shape[0] print( ( diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.1_convert_logistic_regression_to_federated_learning/code/src/newton_raphson_workflow.py b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_convert_machine_learning_to_federated_learning/02.4.1_convert_logistic_regression_to_federated_learning/code/newton_raphson/app/custom/newton_raphson_workflow.py similarity index 98% rename from examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.1_convert_logistic_regression_to_federated_learning/code/src/newton_raphson_workflow.py rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_convert_machine_learning_to_federated_learning/02.4.1_convert_logistic_regression_to_federated_learning/code/newton_raphson/app/custom/newton_raphson_workflow.py index a4094cb7f6..40a01d6063 100644 --- a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.1_convert_logistic_regression_to_federated_learning/code/src/newton_raphson_workflow.py +++ 
b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_convert_machine_learning_to_federated_learning/02.4.1_convert_logistic_regression_to_federated_learning/code/newton_raphson/app/custom/newton_raphson_workflow.py @@ -17,7 +17,6 @@ import numpy as np -from nvflare.apis.fl_constant import FLMetaKey from nvflare.app_common.abstract.fl_model import FLModel from nvflare.app_common.aggregators.weighted_aggregation_helper import WeightedAggregationHelper from nvflare.app_common.app_constant import AppConstants @@ -54,7 +53,6 @@ def run(self) -> None: # converted `FLModel` by `ModelController`. # model = self.load_model() - model.start_round = self.start_round model.total_rounds = self.num_rounds @@ -119,7 +117,7 @@ def newton_raphson_aggregator_fn(self, results: List[FLModel]): for curr_result in results: self.aggregator.add( data=curr_result.params, - weight=curr_result.meta.get(FLMetaKey.NUM_STEPS_CURRENT_ROUND, 1.0), + weight=curr_result.meta.get("sample_size", 1.0), contributor_name=curr_result.meta.get("client_name", AppConstants.CLIENT_UNKNOWN), contribution_round=curr_result.current_round, ) diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.1_convert_logistic_regression_to_federated_learning/code/newton_raphson/meta.json b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_convert_machine_learning_to_federated_learning/02.4.1_convert_logistic_regression_to_federated_learning/code/newton_raphson/meta.json similarity index 100% rename from 
examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.1_convert_logistic_regression_to_federated_learning/code/newton_raphson/meta.json rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_convert_machine_learning_to_federated_learning/02.4.1_convert_logistic_regression_to_federated_learning/code/newton_raphson/meta.json diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.1_convert_logistic_regression_to_federated_learning/code/requirements.txt b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_convert_machine_learning_to_federated_learning/02.4.1_convert_logistic_regression_to_federated_learning/code/requirements.txt similarity index 100% rename from examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.1_convert_logistic_regression_to_federated_learning/code/requirements.txt rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_convert_machine_learning_to_federated_learning/02.4.1_convert_logistic_regression_to_federated_learning/code/requirements.txt diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.1_convert_logistic_regression_to_federated_learning/code/train_centralized.py 
b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_convert_machine_learning_to_federated_learning/02.4.1_convert_logistic_regression_to_federated_learning/code/train_centralized.py similarity index 100% rename from examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.1_convert_logistic_regression_to_federated_learning/code/train_centralized.py rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_convert_machine_learning_to_federated_learning/02.4.1_convert_logistic_regression_to_federated_learning/code/train_centralized.py diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_convert_machine_learning_to_federated_learning/02.4.1_convert_logistic_regression_to_federated_learning/convert_logistic_regression_to_fl.ipynb b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_convert_machine_learning_to_federated_learning/02.4.1_convert_logistic_regression_to_federated_learning/convert_logistic_regression_to_fl.ipynb new file mode 100644 index 0000000000..96fef946b5 --- /dev/null +++ b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_convert_machine_learning_to_federated_learning/02.4.1_convert_logistic_regression_to_federated_learning/convert_logistic_regression_to_fl.ipynb @@ -0,0 +1,687 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "e8c19632", + "metadata": {}, + "source": [ + "# Converting Logistic Regression to Federated Learning\n", + "\n", + "\n", + "Logistic regression is a fundamental 
classification algorithm that models the probability of a binary outcome. Despite its name, it's used for classification rather than regression. The model uses the logistic (sigmoid) function to transform a linear combination of features into a probability between 0 and 1.\n", + "\n", + "The Newton-Raphson method is a powerful second-order optimization technique that uses both first-order (gradient) and second-order (Hessian) information to find the optimal model parameters. Unlike first-order methods like gradient descent, Newton's method incorporates curvature information through the Hessian matrix, often leading to faster convergence, especially near the optimum.\n", + "\n", + "In this section, we will convert logistic regression with the 2nd order Newton-Raphson optimization to Federated Learning.\n" + ] + }, + { + "cell_type": "markdown", + "id": "7f9d96ed", + "metadata": {}, + "source": [ + "## Federated Logistic Regression with Second-Order Newton-Raphson optimization\n", + "This example shows how to implement a federated binary classification via logistic regression with second-order Newton-Raphson optimization.\n", + "\n", + "The [UCI Heart Disease dataset](https://archive.ics.uci.edu/dataset/45/heart+disease) is\n", + "used in this example. Scripts are provided to download and process the\n", + "dataset as described\n", + "[here](https://github.com/owkin/FLamby/tree/main/flamby/datasets/fed_heart_disease).\n", + "\n", + "This dataset contains samples from 4 sites, split into training and\n", + "testing sets as described below:\n", + "|site | sample split |\n", + "|-------------|---------------------------------------|\n", + "|Cleveland | train: 199 samples, test: 104 samples |\n", + "|Hungary | train: 172 samples, test: 89 samples |\n", + "|Switzerland | train: 30 samples, test: 16 samples |\n", + "|Long Beach V | train: 85 samples, test: 45 samples |\n", + "\n", + "The number of features in each sample is 13."
+ ] + }, + { + "cell_type": "markdown", + "id": "e54f0dcc", + "metadata": {}, + "source": [ + "## Introduction\n", + "\n", + "The [Newton-Raphson\n", + "optimization](https://en.wikipedia.org/wiki/Newton%27s_method) problem\n", + "can be described as follows.\n", + "\n", + "In a binary classification task with logistic regression, the\n", + "probability of a data sample $x$ classified as positive is formulated\n", + "as:\n", + "$$p(x) = \\sigma(\\beta \\cdot x + \\beta_{0})$$\n", + "where $\\sigma(.)$ denotes the sigmoid function. We can incorporate\n", + "$\\beta_{0}$ and $\\beta$ into a single parameter vector $\\theta =\n", + "( \\beta_{0}, \\beta)$. Let $d$ be the number\n", + "of features for each data sample $x$ and let $N$ be the number of data\n", + "samples. We then have the matrix version of the above probability\n", + "equation:\n", + "$$p(X) = \\sigma( X \\theta )$$\n", + "Here $X$ is the matrix of all samples, with shape $N \\times (d+1)$,\n", + "having its first column filled with value 1 to account for the\n", + "intercept $\\theta_{0}$.\n", + "\n", + "The goal is to compute parameter vector $\\theta$ that maximizes the\n", + "below likelihood function:\n", + "$$L_{\\theta} = \\prod_{i=1}^{N} p(x_i)^{y_i} (1 - p(x_i))^{1-y_i}$$\n", + "\n", + "The Newton-Raphson method optimizes the likelihood function via\n", + "quadratic approximation.
Omitting the maths, the theoretical update\n", + "formula for parameter vector $\\theta$ is:\n", + "$$\\theta^{n+1} = \\theta^{n} - H_{\\theta^{n}}^{-1} \\nabla L_{\\theta^{n}}$$\n", + "where\n", + "$$\\nabla L_{\\theta^{n}} = X^{T}(y - p(X))$$\n", + "is the gradient of the likelihood function, with $y$ being the vector\n", + "of ground truth for sample data matrix $X$, and\n", + "$$H_{\\theta^{n}} = -X^{T} D X$$\n", + "is the Hessian of the likelihood function, with $D$ a diagonal matrix\n", + "where diagonal value at $(i,i)$ is $D(i,i) = p(x_i) (1 - p(x_i))$.\n", + "\n", + "In federated Newton-Raphson optimization, each client will compute its\n", + "own gradient $\\nabla L_{\\theta^{n}}$ and Hessian $H_{\\theta^{n}}$\n", + "based on local training samples. A server will aggregate the gradients\n", + "and Hessians computed from all clients, and perform the update of\n", + "parameter $\\theta$ based on the theoretical update formula described\n", + "above." + ] + }, + { + "cell_type": "markdown", + "id": "32003ba9", + "metadata": {}, + "source": [ + "## Implementation\n", + "\n", + "Using `nvflare`, the federated logistic regression with Newton-Raphson\n", + "optimization is implemented as follows.\n", + "\n", + "On the server side, all workflow logic is implemented in\n", + "class `FedAvgNewtonRaphson`, which can be found\n", + "[here](code/newton_raphson/app/custom/newton_raphson_workflow.py). The\n", + "`FedAvgNewtonRaphson` class inherits from the\n", + "[`BaseFedAvg`](https://github.com/NVIDIA/NVFlare/blob/main/nvflare/app_common/workflows/base_fedavg.py)\n", + "class, which itself inherits from the **ModelController**\n", + "([`ModelController`](https://github.com/NVIDIA/NVFlare/blob/main/nvflare/app_common/workflows/model_controller.py))\n", + "class. This is the preferable approach to implement a custom\n", + "workflow, since `ModelController` decouples communication logic from\n", + "actual workflow (training & validation) logic. 
The mandatory\n", + "method to override in `ModelController` is the\n", + "[`run()`](https://github.com/NVIDIA/NVFlare/blob/main/nvflare/app_common/workflows/model_controller.py#L37)\n", + "method, where the orchestration of server-side workflow actually\n", + "happens. The implementation of `run()` method in\n", + "[`FedAvgNewtonRaphson`](code/newton_raphson/app/custom/newton_raphson_workflow.py)\n", + "is similar to the classic\n", + "[`FedAvg`](https://github.com/NVIDIA/NVFlare/blob/main/nvflare/app_common/workflows/fedavg.py#L44):\n", + "- Initialize the global model, this is achieved through method `load_model()`\n", + " from base class\n", + " [`ModelController`](https://github.com/NVIDIA/NVFlare/blob/main/nvflare/app_common/workflows/model_controller.py#L292),\n", + " which relies on the\n", + " [`ModelPersistor`](https://nvflare.readthedocs.io/en/main/glossary.html#persistor). A\n", + " custom\n", + " [`NewtonRaphsonModelPersistor`](code/newton_raphson/app/custom/newton_raphson_persistor.py)\n", + " is implemented in this example, which is based on the\n", + " [`NPModelPersistor`](https://github.com/NVIDIA/NVFlare/blob/main/nvflare/app_common/np/np_model_persistor.py)\n", + " for numpy data, since the _model_ in the case of logistic regression\n", + " is just the parameter vector $\\theta$ that can be represented by a\n", + " numpy array. Only the `__init__` method needs to be re-implemented\n", + " to provide a proper initialization for the global parameter vector\n", + " $\\theta$.\n", + "- During each training round, the global model will be sent to the\n", + " list of participating clients to perform a training task. This is\n", + " done using the\n", + " [`send_model_and_wait()`](https://github.com/NVIDIA/NVFlare/blob/main/nvflare/app_common/workflows/model_controller.py#L41)\n", + " method. 
Once\n", + " the clients finish their local training, results will be collected\n", + " and sent back to server as\n", + " [`FLModel`](https://nvflare.readthedocs.io/en/main/programming_guide/fl_model.html#flmodel)s.\n", + "- Results sent by clients contain their locally computed gradient and\n", + " Hessian. A [custom aggregation\n", + " function](code/newton_raphson/app/custom/newton_raphson_workflow.py)\n", + " is implemented to get the averaged gradient and Hessian, and compute\n", + " the Newton-Raphson update for the global parameter vector $\\theta$,\n", + " based on the theoretical formula shown above. The averaging of\n", + " gradient and Hessian is based on the\n", + " [`WeightedAggregationHelper`](https://github.com/NVIDIA/NVFlare/blob/main/nvflare/app_common/aggregators/weighted_aggregation_helper.py#L20),\n", + " which weighs the contribution from each client based on the number\n", + " of local training samples. The aggregated Newton-Raphson update is\n", + " returned as an `FLModel`.\n", + "- After getting the aggregated Newton-Raphson update, an\n", + " [`update_model()`](code/newton_raphson/app/custom/newton_raphson_workflow.py#L172)\n", + " method is implemented to actually apply the Newton-Raphson update to\n", + " the global model.\n", + "- The last step is to save the updated global model, again through\n", + " the `NewtonRaphsonModelPersistor` using `save_model()`.\n", + "\n", + "\n", + "On the client side, the local training logic is implemented\n", + "[here](code/newton_raphson/app/custom/newton_raphson_train.py). The\n", + "implementation is based on the [`Client\n", + "API`](https://nvflare.readthedocs.io/en/main/programming_guide/execution_api_type.html#client-api). 
This\n", + "allows users to add minimal `nvflare`-specific code to turn a typical\n", + "centralized training script into a federated client side local training\n", + "script.\n", + "- During local training, each client receives a copy of the global\n", + " model, sent by the server, using `flare.receive()` from the Client API.\n", + " The received global model is an instance of `FLModel`.\n", + "- A local validation is first performed, where validation metrics\n", + " (accuracy and precision) are streamed to server using the\n", + " [`SummaryWriter`](https://nvflare.readthedocs.io/en/main/apidocs/nvflare.client.tracking.html#nvflare.client.tracking.SummaryWriter). The\n", + " streamed metrics can be loaded and visualized using tensorboard.\n", + "- Then each client computes its gradient and Hessian based on local\n", + " training data, using their respective theoretical formulas described\n", + " above. This is implemented in the\n", + " [`train_newton_raphson()`](code/newton_raphson/app/custom/newton_raphson_train.py#L82)\n", + " method. Each client then sends the computed results (always in\n", + " `FLModel` format) to server for aggregation, using the Client API call\n", + " `flare.send()`.\n", + "\n", + "Each client site corresponds to a site listed in the data table above.\n", + "\n", + "A [centralized training script](code/train_centralized.py) is also\n", + "provided, which allows for comparing the federated Newton-Raphson\n", + "optimization versus the centralized version. In the centralized\n", + "version, training data samples from all 4 sites were concatenated into\n", + "a single matrix, used to optimize the model parameters. 
The\n", + "optimized model was then tested separately on testing data samples of\n", + "the 4 sites, using accuracy and precision as metrics.\n", + "\n", + "Comparing the federated [client-side training\n", + "code](code/newton_raphson/app/custom/newton_raphson_train.py) with the\n", + "centralized [training code](code/train_centralized.py), we can see that\n", + "the training logic remains similar: load data, perform training\n", + "(Newton-Raphson updates), and validate the trained model. The only added\n", + "differences in the federated code are related to interaction with the\n", + "FL system, such as receiving and sending `FLModel`." + ] + }, + { + "cell_type": "markdown", + "id": "c3fc55e0", + "metadata": {}, + "source": [ + "## Install requirements\n", + "First, install the required packages:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "04911ca3", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -r code/requirements.txt" + ] + }, + { + "cell_type": "markdown", + "id": "33ea8504", + "metadata": {}, + "source": [ + "## Download and prepare data\n", + "\n", + "Execute the following script\n", + "```\n", + "bash ./code/data/prepare_heart_disease_data.sh\n", + "```\n", + "This will download the heart disease dataset under\n", + "`/tmp/flare/dataset/heart_disease_data/`\n", + "\n", + "Please note that you may need to accept the data terms in order to complete the download." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b395c0d9", + "metadata": {}, + "outputs": [], + "source": [ + "# Note: the download site remembers your download history and aborts the 2nd download attempt. \n", + "\n", + "! echo y | bash ./code/data/prepare_heart_disease_data.sh\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "61e13343", + "metadata": {}, + "outputs": [], + "source": [ + "! 
ls -al /tmp/flare/dataset/heart_disease_data/" + ] + }, + { + "cell_type": "markdown", + "id": "d548b466", + "metadata": {}, + "source": [ + "## Centralized Logistic Regression\n", + "\n", + "Two implementations of logistic regression are provided in the\n", + "centralized training script, which can be specified by the `--solver`\n", + "argument:\n", + "- One is using `sklearn.LogisticRegression` with the `newton-cholesky`\n", + " solver\n", + "- The other one is manually implemented using the theoretical update\n", + " formulas described above.\n", + "\n", + "Both implementations were tested to converge in 4 iterations and to\n", + "give the same result.\n", + "\n", + "Launch the following script:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8c68fe1a", + "metadata": {}, + "outputs": [], + "source": [ + "%cd code\n", + "! python3 train_centralized.py --solver custom\n", + "\n", + "%cd -" + ] + }, + { + "cell_type": "markdown", + "id": "a1c8d278", + "metadata": {}, + "source": [ + "## Federated Logistic Regression\n", + "\n", + "\n", + "To convert the centralized logistic regression to federated learning, we need to do the following:\n", + "\n", + "1. Decide what model parameters will be transmitted between the server and clients\n", + "2. Define the workflow that orchestrates the federated learning process\n", + "3. Define how to load the initial model on the server side\n", + "4. Modify the client-side training logic to handle models received from the server\n", + "5. Implement the aggregation logic for the gradients and Hessians computed by the clients\n", + "6. 
Configure the job via FLARE job API\n", + "\n", + "Let's examine each step.\n", + "\n", + "### Model Parameters\n", + "\n", + "We decided to simply capture the model parameters in the FLModel:\n", + "\n", + "\n", + "```python\n", + "\n", + "model = FLModel(params={\"gradient\": gradient, \"hessian\": hessian})\n", + "```\n", + "\n", + "We could optionally use FLModel.optimizer_params to store the Hessian, but either approach works.\n", + "\n", + "We add a few metadata fields to help with the training process. We use the training sample size as the weight, storing this information in the metadata:\n", + "\n", + "```python\n", + "\n", + "model = FLModel(params=result_dict, params_type=ParamsType.FULL)\n", + "model.meta[\"sample_size\"] = data[\"train_X\"].shape[0]\n", + "```\n", + "\n", + "### Workflow\n", + "\n", + "We decided to choose the FedAvg type of scatter and gather workflow. So we can base our class on the `BaseFedAvg` class. \n", + "\n", + "```python\n", + "\n", + "class FedAvgNewtonRaphson(BaseFedAvg):\n", + "\n", + " def __init__(self, damping_factor, epsilon=1.0, *args, **kwargs):\n", + " super().__init__(*args, **kwargs)\n", + " \"\"\"\n", + " Args:\n", + " damping_factor: damping factor for Newton Raphson updates.\n", + " epsilon: a regularization factor to avoid empty hessian for\n", + " matrix inversion\n", + " \"\"\"\n", + " self.damping_factor = damping_factor\n", + " self.epsilon = epsilon\n", + " self.aggregator = WeightedAggregationHelper()\n", + "\n", + " def run(self) -> None:\n", + " \n", + " # First load the model and set up some training params.\n", + " # A `persisitor` (NewtonRaphsonModelPersistor) will load\n", + " # the model in `ModelLearnable` format, then will be\n", + " # converted `FLModel` by `ModelController`.\n", + " #\n", + " model = self.load_model()\n", + "\n", + " model.start_round = self.start_round\n", + " model.total_rounds = self.num_rounds\n", + "\n", + " \n", + " for self.current_round in range(self.start_round, 
self.start_round + self.num_rounds):\n", + "\n", + " # Get the list of clients.\n", + " clients = self.sample_clients(self.num_clients)\n", + "\n", + " model.current_round = self.current_round\n", + "\n", + " results = self.send_model_and_wait(targets=clients, data=model)\n", + "\n", + " # Aggregate results receieved from clients.\n", + " aggregate_results = self.aggregate(results, aggregate_fn=self.newton_raphson_aggregator_fn)\n", + "\n", + " # Update global model based on the following formula:\n", + " # weights = weights + updates, where\n", + " # updates = -damping_factor * Hessian^{-1} . Gradient\n", + " self.update_model(model, aggregate_results)\n", + "\n", + " # Save global model.\n", + " self.save_model(model)\n", + "\n", + " self.info(\"Finished FedAvg.\")\n", + "\n", + "```\n", + "As you can see, the `run()` method is the only method we need to implement. It's nothing but a for loop that sends the model to the clients and aggregates the results. \n", + "\n", + "### Model Loader\n", + "\n", + "We need to decide how to load the initial model on the server side. We decide to implement a custom persistor that loads the model from a numpy file. \n", + "\n", + "```python\n", + "\n", + "class NewtonRaphsonModelPersistor(NPModelPersistor):\n", + " \"\"\"\n", + " This class defines the persistor for Newton Raphson model.\n", + "\n", + " A persistor controls the logic behind initializing, loading\n", + " and saving of the model / parameters for each round of a\n", + " federated learning process.\n", + "\n", + " In the 2nd order Newton Raphson case, a model is just a\n", + " 1-D numpy vector containing the parameters for logistic\n", + " regression. 
The length of the parameter vector is defined\n", + " by the number of features in the dataset.\n", + "\n", + " \"\"\"\n", + "\n", + " def __init__(self, model_dir=\"models\", model_name=\"weights.npy\", n_features=13):\n", + " super().__init__()\n", + "\n", + " self.model_dir = model_dir\n", + " self.model_name = model_name\n", + " self.n_features = n_features\n", + "\n", + " # A default model is loaded when no local model is available.\n", + " # This happen when training starts.\n", + " #\n", + " # A `model` for a binary logistic regression is just a matrix,\n", + " # with shape (n_features + 1, 1).\n", + " # For the UCI ML Heart Disease dataset, the n_features = 13.\n", + " #\n", + " # A default matrix with value 0s is created.\n", + " #\n", + " self.default_data = np.zeros((self.n_features + 1, 1), dtype=np.float32)\n", + "\n", + "```\n", + "\n", + "\n", + "### Client Training Logic \n", + "\n", + "Now, we need to convert the centralized training logic to the federated training logic with Client API.\n", + "\n", + "```python\n", + "\n", + "\n", + "def main():\n", + " \n", + " args = parse_arguments()\n", + "\n", + " flare.init()\n", + "\n", + " site_name = flare.get_site_name()\n", + " \n", + " # Load client site data.\n", + " data = load_data(args.data_root, site_name)\n", + "\n", + "\n", + " # keep running until the job is terminated or end of training round\n", + " while flare.is_running():\n", + "\n", + " # Receive global model (FLModel) from server.\n", + " global_model = flare.receive()\n", + "\n", + " # Get the weights, aka parameter theta for logistic regression.\n", + " global_weights = global_model.params[\"weights\"]\n", + "\n", + " # Local validation before training\n", + " validation_scores = validate(data, global_weights)\n", + "\n", + " # Local training\n", + " result_dict = train_newton_raphson(data, theta=global_weights)\n", + "\n", + " # Send result to server for aggregation.\n", + " local_model = FLModel(params=result_dict, 
params_type=ParamsType.FULL)\n", + " local_model.meta[\"sample_size\"] = data[\"train_X\"].shape[0]\n", + "\n", + " flare.send(local_model)\n", + "\n", + "```\n", + "\n", + "This is pretty straightforward. We receive the global model, perform the local training and send the result to the server. The code structure is the same as the centralized training with an additional loop for the federated training. \n", + "\n", + "We added the sample size to the meta data so we can use it in weighted aggregation as the aggregation weight.\n", + "\n", + "\n", + "### Aggregation Logic\n", + "\n", + "Now, let's look at the aggregation logic. \n", + "\n", + "```python\n", + "\n", + " def newton_raphson_aggregator_fn(self, results: List[FLModel]):\n", + " \"\"\"\n", + " This uses the default thread-safe WeightedAggregationHelper,\n", + " which implement a weighted average of all values received from\n", + " a `result` dictionary.\n", + "\n", + " Args:\n", + " results: a list of `FLModel`s. Each `FLModel` is received\n", + " from a client. The field `params` is a dictionary that\n", + " contains values to be aggregated: the gradient and hessian.\n", + " \"\"\"\n", + " \n", + " # On client side the `sample_size` key is used to track the number of samples for each client.\n", + " for curr_result in results:\n", + " self.aggregator.add(\n", + " data=curr_result.params,\n", + " weight=curr_result.meta.get(\"sample_size\", 1.0),\n", + " contributor_name=curr_result.meta.get(\"client_name\", AppConstants.CLIENT_UNKNOWN),\n", + " contribution_round=curr_result.current_round,\n", + " )\n", + "\n", + " aggregated_dict = self.aggregator.get_result()\n", + " \n", + " # Compute global model update:\n", + " # update = - damping_factor * Hessian^{-1} . 
Gradient\n", + " # A regularization is added to avoid empty hessian.\n", + " #\n", + " reg = self.epsilon * np.eye(aggregated_dict[\"hessian\"].shape[0])\n", + "\n", + " newton_raphson_updates = self.damping_factor * np.linalg.solve(\n", + " aggregated_dict[\"hessian\"] + reg, aggregated_dict[\"gradient\"]\n", + " )\n", + " \n", + " # Convert the aggregated result to `FLModel`, this `FLModel`\n", + " # will then be used by `update_model` method from the base class,\n", + " # to update the global model weights.\n", + " #\n", + " aggr_result = FLModel(\n", + " params={\"newton_raphson_updates\": newton_raphson_updates},\n", + " params_type=results[0].params_type,\n", + " meta={\n", + " \"nr_aggregated\": len(results),\n", + " AppConstants.CURRENT_ROUND: results[0].current_round,\n", + " AppConstants.NUM_ROUNDS: self.num_rounds,\n", + " },\n", + " )\n", + " return aggr_result\n", + "\n", + " def update_model(self, model, model_update, replace_meta=True) -> FLModel:\n", + " \"\"\"\n", + " Update logistic regression parameters based on\n", + " aggregated gradient and hessian.\n", + "\n", + " \"\"\"\n", + " if replace_meta:\n", + " model.meta = model_update.meta\n", + " else:\n", + " model.meta.update(model_update.meta)\n", + "\n", + " model.metrics = model_update.metrics\n", + " model.params[NPConstants.NUMPY_KEY] += model_update.params[\"newton_raphson_updates\"]\n", + "\n", + "```\n", + "Again, we just need to use FLModel to store the result and update the model. \n", + "\n", + "\n", + "### Job Configuration\n", + "\n", + "With the above steps, we have converted the centralized training to the federated training. \n", + "\n", + "Now, let's connect the pieces together and define the job configuration and run with simulator. \n", + "\n", + "In this example, we decided to use sub-process instead of in-process training. \n", + "\n", + "We manually define the job configuration and run with simulator. 
\n", + "\n", + "#### server job configuration\n", + "\n", + "The key is to define a workflow ```FedAvgNewtonRaphson``` and its corresponding arguments: number of rounds, number of clients, and damping factor. \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8c5767f4", + "metadata": {}, + "outputs": [], + "source": [ + "! cat code/newton_raphson/app/config/config_fed_server.json" + ] + }, + { + "cell_type": "markdown", + "id": "f13fdf5d", + "metadata": {}, + "source": [ + "#### client job configuration\n", + "\n", + "Notice that we used the ClientAPILauncherExecutor with a Cell Pipe, we also need a separate pipe for metrics relay" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9e9b8eef", + "metadata": {}, + "outputs": [], + "source": [ + "! cat code/newton_raphson/app/config/config_fed_client.json" + ] + }, + { + "cell_type": "markdown", + "id": "9a3fe4de", + "metadata": {}, + "source": [] + }, + { + "cell_type": "markdown", + "id": "0b72ef2b", + "metadata": {}, + "source": [ + "## Running Federated Logistic Regression Job\n", + "\n", + "Execute the following command to launch federated logistic\n", + "regression. This will run in `nvflare`'s simulator mode.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2faea343", + "metadata": {}, + "outputs": [], + "source": [ + "! 
nvflare simulator -w /tmp/nvflare/job/lr/workspace -n 4 -t 4 code/newton_raphson/" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "Accuracy and precision for each site can be viewed in Tensorboard:\n", + "```\n", + "tensorboard --logdir=/tmp/nvflare/job/lr/workspace/server/simulate_job/tb_events\n", + "```\n", + "As can be seen from the figure below, per-site evaluation metrics in\n", + "federated logistic regression are on-par with the centralized version.\n", + "\n", + "\"Tensorboard\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f0858459", + "metadata": {}, + "outputs": [], + "source": [ + "! tensorboard --logdir=/tmp/nvflare/job/lr/workspace/server/simulate_job/tb_events" + ] + }, + { + "cell_type": "markdown", + "id": "d8383681", + "metadata": {}, + "source": [ + "Now that we have converted the centralized logistic regression to federated learning, let's move on to the next example." + ] + }, + { + "cell_type": "markdown", + "id": "d990f546", + "metadata": {}, + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "nvflare_env", + "language": "python", + "name": "nvflare_env" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.2_convert_kmeans_to_federated_learning/code/figs/minibatch.png 
b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_convert_machine_learning_to_federated_learning/02.4.2_convert_kmeans_to_federated_learning/code/figs/minibatch.png similarity index 100% rename from examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.2_convert_kmeans_to_federated_learning/code/figs/minibatch.png rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_convert_machine_learning_to_federated_learning/02.4.2_convert_kmeans_to_federated_learning/code/figs/minibatch.png diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.2_convert_kmeans_to_federated_learning/code/kmeans_job.py b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_convert_machine_learning_to_federated_learning/02.4.2_convert_kmeans_to_federated_learning/code/kmeans_job.py similarity index 100% rename from examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.2_convert_kmeans_to_federated_learning/code/kmeans_job.py rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_convert_machine_learning_to_federated_learning/02.4.2_convert_kmeans_to_federated_learning/code/kmeans_job.py diff --git 
a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.2_convert_kmeans_to_federated_learning/code/requirements.txt b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_convert_machine_learning_to_federated_learning/02.4.2_convert_kmeans_to_federated_learning/code/requirements.txt similarity index 100% rename from examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.2_convert_kmeans_to_federated_learning/code/requirements.txt rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_convert_machine_learning_to_federated_learning/02.4.2_convert_kmeans_to_federated_learning/code/requirements.txt diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.2_convert_kmeans_to_federated_learning/code/src/kmeans_assembler.py b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_convert_machine_learning_to_federated_learning/02.4.2_convert_kmeans_to_federated_learning/code/src/kmeans_assembler.py similarity index 100% rename from examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.2_convert_kmeans_to_federated_learning/code/src/kmeans_assembler.py rename to 
examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_convert_machine_learning_to_federated_learning/02.4.2_convert_kmeans_to_federated_learning/code/src/kmeans_assembler.py diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.2_convert_kmeans_to_federated_learning/code/src/kmeans_fl.py b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_convert_machine_learning_to_federated_learning/02.4.2_convert_kmeans_to_federated_learning/code/src/kmeans_fl.py similarity index 100% rename from examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.2_convert_kmeans_to_federated_learning/code/src/kmeans_fl.py rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_convert_machine_learning_to_federated_learning/02.4.2_convert_kmeans_to_federated_learning/code/src/kmeans_fl.py diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.2_convert_kmeans_to_federated_learning/code/utils/prepare_data.py b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_convert_machine_learning_to_federated_learning/02.4.2_convert_kmeans_to_federated_learning/code/utils/prepare_data.py similarity index 100% rename from 
examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.2_convert_kmeans_to_federated_learning/code/utils/prepare_data.py rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_convert_machine_learning_to_federated_learning/02.4.2_convert_kmeans_to_federated_learning/code/utils/prepare_data.py diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.2_convert_kmeans_to_federated_learning/code/utils/split_data.py b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_convert_machine_learning_to_federated_learning/02.4.2_convert_kmeans_to_federated_learning/code/utils/split_data.py similarity index 100% rename from examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.2_convert_kmeans_to_federated_learning/code/utils/split_data.py rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_convert_machine_learning_to_federated_learning/02.4.2_convert_kmeans_to_federated_learning/code/utils/split_data.py diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.2_convert_kmeans_to_federated_learning/convert_kmeans_to_fl.ipynb 
b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_convert_machine_learning_to_federated_learning/02.4.2_convert_kmeans_to_federated_learning/convert_kmeans_to_fl.ipynb similarity index 99% rename from examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.2_convert_kmeans_to_federated_learning/convert_kmeans_to_fl.ipynb rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_convert_machine_learning_to_federated_learning/02.4.2_convert_kmeans_to_federated_learning/convert_kmeans_to_fl.ipynb index 9a8cb7a548..20376a251f 100644 --- a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.2_convert_kmeans_to_federated_learning/convert_kmeans_to_fl.ipynb +++ b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_convert_machine_learning_to_federated_learning/02.4.2_convert_kmeans_to_federated_learning/convert_kmeans_to_fl.ipynb @@ -14,6 +14,7 @@ "metadata": {}, "source": [ "## Introduction to Scikit-learn, tabular data, and federated k-Means\n", + "\n", "### Scikit-learn\n", "This example shows how to use [NVIDIA FLARE](https://nvflare.readthedocs.io/en/main/index.html) on tabular data.\n", "It uses [Scikit-learn](https://scikit-learn.org/),\n", diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.3_convert_survival_analysis_to_federated_learning/code/figs/km_curve_baseline.png 
b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_convert_machine_learning_to_federated_learning/02.4.3_convert_survival_analysis_to_federated_learning/code/figs/km_curve_baseline.png similarity index 100% rename from examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.3_convert_survival_analysis_to_federated_learning/code/figs/km_curve_baseline.png rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_convert_machine_learning_to_federated_learning/02.4.3_convert_survival_analysis_to_federated_learning/code/figs/km_curve_baseline.png diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.3_convert_survival_analysis_to_federated_learning/code/figs/km_curve_fl.png b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_convert_machine_learning_to_federated_learning/02.4.3_convert_survival_analysis_to_federated_learning/code/figs/km_curve_fl.png similarity index 100% rename from examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.3_convert_survival_analysis_to_federated_learning/code/figs/km_curve_fl.png rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_convert_machine_learning_to_federated_learning/02.4.3_convert_survival_analysis_to_federated_learning/code/figs/km_curve_fl.png diff --git 
a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.3_convert_survival_analysis_to_federated_learning/code/figs/km_curve_fl_he.png b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_convert_machine_learning_to_federated_learning/02.4.3_convert_survival_analysis_to_federated_learning/code/figs/km_curve_fl_he.png similarity index 100% rename from examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.3_convert_survival_analysis_to_federated_learning/code/figs/km_curve_fl_he.png rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_convert_machine_learning_to_federated_learning/02.4.3_convert_survival_analysis_to_federated_learning/code/figs/km_curve_fl_he.png diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.3_convert_survival_analysis_to_federated_learning/code/km_job.py b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_convert_machine_learning_to_federated_learning/02.4.3_convert_survival_analysis_to_federated_learning/code/km_job.py similarity index 100% rename from examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.3_convert_survival_analysis_to_federated_learning/code/km_job.py rename to 
examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_convert_machine_learning_to_federated_learning/02.4.3_convert_survival_analysis_to_federated_learning/code/km_job.py diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.3_convert_survival_analysis_to_federated_learning/code/requirements.txt b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_convert_machine_learning_to_federated_learning/02.4.3_convert_survival_analysis_to_federated_learning/code/requirements.txt similarity index 100% rename from examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.3_convert_survival_analysis_to_federated_learning/code/requirements.txt rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_convert_machine_learning_to_federated_learning/02.4.3_convert_survival_analysis_to_federated_learning/code/requirements.txt diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.3_convert_survival_analysis_to_federated_learning/code/src/kaplan_meier_train.py b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_convert_machine_learning_to_federated_learning/02.4.3_convert_survival_analysis_to_federated_learning/code/src/kaplan_meier_train.py similarity index 100% rename from 
examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.3_convert_survival_analysis_to_federated_learning/code/src/kaplan_meier_train.py rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_convert_machine_learning_to_federated_learning/02.4.3_convert_survival_analysis_to_federated_learning/code/src/kaplan_meier_train.py diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.3_convert_survival_analysis_to_federated_learning/code/src/kaplan_meier_train_he.py b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_convert_machine_learning_to_federated_learning/02.4.3_convert_survival_analysis_to_federated_learning/code/src/kaplan_meier_train_he.py similarity index 100% rename from examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.3_convert_survival_analysis_to_federated_learning/code/src/kaplan_meier_train_he.py rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_convert_machine_learning_to_federated_learning/02.4.3_convert_survival_analysis_to_federated_learning/code/src/kaplan_meier_train_he.py diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.3_convert_survival_analysis_to_federated_learning/code/src/kaplan_meier_wf.py 
b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_convert_machine_learning_to_federated_learning/02.4.3_convert_survival_analysis_to_federated_learning/code/src/kaplan_meier_wf.py similarity index 100% rename from examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.3_convert_survival_analysis_to_federated_learning/code/src/kaplan_meier_wf.py rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_convert_machine_learning_to_federated_learning/02.4.3_convert_survival_analysis_to_federated_learning/code/src/kaplan_meier_wf.py diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.3_convert_survival_analysis_to_federated_learning/code/src/kaplan_meier_wf_he.py b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_convert_machine_learning_to_federated_learning/02.4.3_convert_survival_analysis_to_federated_learning/code/src/kaplan_meier_wf_he.py similarity index 100% rename from examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.3_convert_survival_analysis_to_federated_learning/code/src/kaplan_meier_wf_he.py rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_convert_machine_learning_to_federated_learning/02.4.3_convert_survival_analysis_to_federated_learning/code/src/kaplan_meier_wf_he.py diff --git 
a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.3_convert_survival_analysis_to_federated_learning/code/utils/baseline_kaplan_meier.py b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_convert_machine_learning_to_federated_learning/02.4.3_convert_survival_analysis_to_federated_learning/code/utils/baseline_kaplan_meier.py similarity index 100% rename from examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.3_convert_survival_analysis_to_federated_learning/code/utils/baseline_kaplan_meier.py rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_convert_machine_learning_to_federated_learning/02.4.3_convert_survival_analysis_to_federated_learning/code/utils/baseline_kaplan_meier.py diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.3_convert_survival_analysis_to_federated_learning/code/utils/prepare_data.py b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_convert_machine_learning_to_federated_learning/02.4.3_convert_survival_analysis_to_federated_learning/code/utils/prepare_data.py similarity index 100% rename from examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.3_convert_survival_analysis_to_federated_learning/code/utils/prepare_data.py rename to 
examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_convert_machine_learning_to_federated_learning/02.4.3_convert_survival_analysis_to_federated_learning/code/utils/prepare_data.py diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.3_convert_survival_analysis_to_federated_learning/code/utils/prepare_he_context.py b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_convert_machine_learning_to_federated_learning/02.4.3_convert_survival_analysis_to_federated_learning/code/utils/prepare_he_context.py similarity index 100% rename from examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.3_convert_survival_analysis_to_federated_learning/code/utils/prepare_he_context.py rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_convert_machine_learning_to_federated_learning/02.4.3_convert_survival_analysis_to_federated_learning/code/utils/prepare_he_context.py diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.3_convert_survival_analysis_to_federated_learning/convert_survival_analysis_to_fl.ipynb b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_convert_machine_learning_to_federated_learning/02.4.3_convert_survival_analysis_to_federated_learning/convert_survival_analysis_to_fl.ipynb similarity index 100% rename from 
examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.3_convert_machine_learning_to_federated_learning/02.3.3_convert_survival_analysis_to_federated_learning/convert_survival_analysis_to_fl.ipynb rename to examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_convert_machine_learning_to_federated_learning/02.4.3_convert_survival_analysis_to_federated_learning/convert_survival_analysis_to_fl.ipynb diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_convert_machine_learning_to_federated_learning/convert_ml_to_fl.ipynb b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_convert_machine_learning_to_federated_learning/convert_ml_to_fl.ipynb new file mode 100644 index 0000000000..0efe327f83 --- /dev/null +++ b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.4_convert_machine_learning_to_federated_learning/convert_ml_to_fl.ipynb @@ -0,0 +1,36 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# How to Convert Centralized Machine Learning to Federated Learning\n", + "\n", + "Converting different deep learning algorithms to federated learning is very similar, but for traditional machine learning, we have to deal with them case by case. 
However, they all essentially still follow the same process using the Client API:\n", + "\n", + "* Formulating the algorithm: see how to structure the model exchange to be represented by `FLModel`\n", + "* Receive the global model\n", + "* Update and train local model \n", + "* Send the updated model back to the aggregator\n", + " \n", + "In this section, we will study three different commonly used machine learning algorithms:\n", + "\n", + "* [Convert Logistic Regression to federated learning](./02.4.1_convert_logistic_regression_to_federated_learning/convert_logistic_regression_to_fl.ipynb)\n", + "* [Convert KMeans to federated learning](./02.4.2_convert_kmeans_to_federated_learning/convert_kmeans_to_fl.ipynb)\n", + "* [Convert Survival Analysis to federated learning](./02.4.3_convert_survival_analysis_to_federated_learning/convert_survival_analysis_to_fl.ipynb)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.5_recap/recap.ipynb b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.5_recap/recap.ipynb index 77f82177a4..a6898b699f 100644 --- a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.5_recap/recap.ipynb +++ b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/chapter-2_develop_federated_learning_applications/02.5_recap/recap.ipynb @@ -5,40 +5,32 @@ "id": "7b152728-3366-4432-adb1-29aa3051dc22", "metadata": {}, "source": [ - "# Summary of Chapter 2\n", + "# Recap: Chapter 2 Summary\n", "\n", - "We covered developing federated learning applications in Chapter 2. Here is an overview:\n", "\n", - "1. 
**Federated Statistics**\n", - " - **Federated Statistics with Image Data**: How to compute local and global image statistics with the consideration that data is private at each of the client sites.\n", - " - [federated_statistics_with_image_data.ipynb](../02.1_federated_statistics/federated_statistics_with_image_data/federated_statistics_with_image_data.ipynb)\n", - " - **Federated Statistics with Tabular Data**: How to create federated statistics for data that can be represented as Pandas DataFrames.\n", - " - [federated_statistics_with_tabular_data.ipynb](../02.1_federated_statistics/federated_statistics_with_tabular_data/federated_statistics_with_tabular_data.ipynb)\n", - "\n", - "2. **Converting PyTorch Lightning to FL**\n", - " - **PyTorch Lightning to FL**: Guide on converting PyTorch Lightning scripts to federated learning.\n", - " - [convert_torch_lightning_to_fl.ipynb](../02.2_convert_torch_lightning_to_federated_learning/convert_torch_lightning_to_fl.ipynb)\n", + "We covered a lot of ground in developing federated learning applications in this Chapter. We focused on how to compute federated statistics and how to leverage the NVFLARE Client API to convert machine learning models to federated learning.\n", "\n", - "3. **Simple ML/DL to FL transition with NVFlare**\n", - " - **Converting Logistic Regression to FL**: How to implement a federated binary classification via logistic regression with second-order Newton-Raphson optimization. \n", - " - [convert_logistic_regression_to_fl.ipynb](../02.3_convert_machine_learning_to_federated_learning/02.3.1_convert_logistic_regression_to_federated_learning/convert_logistic_regression_to_fl.ipynb)\n", - " - **Converting KMeans to FL**: ADD CONTENT HERE. 
\n", - " - [convert_kmeans_to_fl.ipynb](../02.3_convert_machine_learning_to_federated_learning/02.3.2_convert_kmeans_to_federated_learning/convert_kmeans_to_fl.ipynb)\n", - " - **Secure Federated Kaplan-Meier Analysis via Time-Binning and Homomorphic Encryption**: ADD CONTENT HERE. \n", - " - [convert_survival_analysis_to_fl.ipynb](../02.3_convert_machine_learning_to_federated_learning/02.3.3_convert_survival_analysis_to_federated_learning/convert_survival_analysis_to_fl.ipynb)\n", - "\n", - "4. **Client API**\n", - " - **Client API**: Here we focus on the core concepts of the Client API and explain how to configure it to run within the same process or in a separate subprocess. \n", - " - [client_api.ipynb](../02.4_client_api/client_api.ipynb)\n", + "1. **Federated Statistics**\n", + " Users can leverage NVFLARE's statistics feature to aggregate and visualize global statistics as well as site-specific statistics.\n", "\n", - "5. **Recap of Covered Topics**\n", - " - **Summary and Recap**: A recap of the topics covered in the previous sections.\n", + " This process is fairly straightforward.\n", + " \n", + "2. **Client API**\n", + " We discussed the Client API in detail, including the types of implementations and how to configure it to run within the same process or in a separate subprocess.\n", "\n", - "Each section is designed to provide comprehensive guidance and practical examples to help you implement and customize federated learning in your applications. 
For detailed instructions and examples, refer to the respective notebooks linked in each section.\n", + " Then we moved on to PyTorch Lightning and several traditional machine learning models and how to convert them to federated learning with the Client API.\n", + " \n", + " We hope that by now you feel confident in converting most deep learning and traditional machine learning models to federated learning with NVIDIA FLARE.\n", "\n", "\n", - "Now let's move on to the [Chapter 3](../../../part-2_federated_learning_system/chapter-3_federated_computing_platform/03.0_introduction/introduction.ipynb)." + "Now let's explore the [Federated Computing Platform](../../../part-2_federated_learning_system/chapter-3_federated_computing_platform/03.0_introduction/introduction.ipynb)." ] + }, + { + "cell_type": "markdown", + "id": "bbcd8e9d", + "metadata": {}, + "source": [] } ], "metadata": { diff --git a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/part_1_introduction.ipynb b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/part_1_introduction.ipynb index bf2e5b6589..dc895cf38f 100644 --- a/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/part_1_introduction.ipynb +++ b/examples/tutorials/self-paced-training/part-1_federated_learning_introduction/part_1_introduction.ipynb @@ -11,7 +11,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "In part 1, we will use two chapters illustate how to run and develop federated learning applications. " + "We will use two chapters to illustrate how to run and develop federated learning applications. " ] }, { @@ -27,18 +27,18 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "In Part 1, we explored the fundamentals of federated learning. We will cover:\n", + "In Part 1, we will explore the fundamentals of federated learning. 
We will cover:\n", "\n", "#### Chapter 1: \n", "* How to train an image classification model with PyTorch\n", - "* How to convert a standard PyTorch training code to federated learning code\n", - "* How to customize client and server side logics\n", - "* Understanding the federated job structure and configurations\n", + "* How to convert standard PyTorch training code to federated learning code\n", + "* How to customize client and server side logic\n", + "* Understanding federated job structure and configurations\n", "\n", "#### Chapter 2: \n", - "* federated statistics for both image and tabular data.\n", - "* convert porch lightning to federated learning\n", - "* convert traditional ML training code to federated learning code,\n", + "* Federated statistics for both image and tabular data\n", + "* Converting PyTorch Lightning to federated learning\n", + "* Converting traditional ML training code to federated learning code\n", "* FLARE Client API" ] }, diff --git a/examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-3_federated_computing_platform/03.0_introduction/introduction.ipynb b/examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-3_federated_computing_platform/03.0_introduction/introduction.ipynb index 82509ddce4..0a43ccbc6a 100644 --- a/examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-3_federated_computing_platform/03.0_introduction/introduction.ipynb +++ b/examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-3_federated_computing_platform/03.0_introduction/introduction.ipynb @@ -103,7 +103,7 @@ "The core property of FLComponent is event support. FLComponent is able to fire and receive events, enabling the FLARE system to be an event-driven, pluggable system.\n", "\n", "### FLContext\n", - "One of the most important features of NVIDIA FLARE is ```nvflare.apis.fl_context``` to pass data between the FL components. 
FLContext is available to every method of all FLComponent types (Controller, Aggregator, Filter, Executor).\n", + "One of the most important features of NVIDIA FLARE is `nvflare.apis.fl_context`, which is used to pass data between the FL components. FLContext is available to every method of all FLComponent types (Controller, Aggregator, Filter, Executor).\n", "\n", "\n", "Through the FL Context, the component developer can:\n", diff --git a/examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-3_federated_computing_platform/03.1_federated_computing_architecture/system_architecture.ipynb b/examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-3_federated_computing_platform/03.1_federated_computing_architecture/system_architecture.ipynb index e6ebe72a7f..adbebcb9a5 100644 --- a/examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-3_federated_computing_platform/03.1_federated_computing_architecture/system_architecture.ipynb +++ b/examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-3_federated_computing_platform/03.1_federated_computing_architecture/system_architecture.ipynb @@ -103,7 +103,7 @@ "\n", "## Event-Based System\n", "\n", - "ALL NVIDIA FLARE's components (FLComponent) has event handling and event firing via the runtine engine. As result, user can write a FLComponent as plugin and listen to event and write any customized logics at any layers. \n", + "All of NVIDIA FLARE's components (FLComponent) have event handling and event firing via the runtime engine. As a result, users can write an FLComponent as a plugin and listen to events and write any customized logic at any layer.\n", "\n", "\n", "## Federated Learning Framework\n", @@ -126,7 +126,7 @@ "\n", "## Different type of FLARE APIs\n", "\n", - "At its Core, Flare uses controller and executor assign tasks and execute tasks for each job. 
There we have the \n", + "At its Core, Flare uses controller and executor to assign tasks and execute tasks for each job. There we have the:\n", "\n", "### Python APIs\n", "\n", @@ -150,17 +150,17 @@ " ...\n", "\n", " ```\n", - "This data structure essentially capture the model ( parameter type (Full, Diff), model paramaters (weights), optmizer parameters), metrics, metadata. This kind data structure is understandable by most data scientists. \n", + "This data structure essentially captures the model (parameter type (Full, Diff), model parameters (weights), optimizer parameters), metrics, metadata. This kind of data structure is understandable by most data scientists.\n", "\n", - "The Server side, we have ModelController -- Controller use and consume FLModel, on the client side we have Client API that receive and send model update via FLModel. You already seen this in previous chapters. \n", + "On the Server side, we have ModelController -- Controller uses and consumes FLModel, on the client side we have Client API that receives and sends model updates via FLModel. You have already seen this in previous chapters.\n", "\n", "\n", "* **Job API** -- FLARE Job API is a way to generate job configuration. Although once can direct edit configuration files, one can also use the Job API to construct the needed components and generate the job configuration. The job API can also call job.simulate_run() -- which is combined step of export job configuration and call simulator run. \n", "\n", - "* **Simulator API** -- one can directly invokve simulator_run() method start simulation in python\n", + "* **Simulator API** -- one can directly invoke simulator_run() method to start simulation in python\n", "\n", "\n", - "* **FLARE API** -- FLARE python API is equivallent FLARE Console command API. Instead of interact with FL system via Console command, we can perform most of the command functions via FLARE API. 
These includes connect to the server, checking status, monitoring jobs, submit job etc. \n", + "* **FLARE API** -- FLARE python API is equivalent to FLARE Console command API. Instead of interacting with FL system via Console command, we can perform most of the command functions via FLARE API. These include connecting to the server, checking status, monitoring jobs, submitting jobs etc.\n", "\n", "\n", "### Command Line Interface\n", @@ -194,7 +194,7 @@ "\n", "# Job Template\n", "\n", - "Job templates are set of existing job configurations with specified structure \n", + "Job templates are a set of existing job configurations with specified structure\n", "\n", "For example \n", "```\n", @@ -207,9 +207,9 @@ "\n", "```\n", "\n", - "Each job template consits \"information card\", info.conf, display card \"info.md\" and job configuration files. \n", + "Each job template consists of an \"information card\", info.conf, display card \"info.md\" and job configuration files.\n", "\n", - "The configuraiton is defined in pyhocon format so we can add comments and explain the details \n", + "The configuration is defined in pyhocon format so we can add comments and explain the details.\n", "\n", "we can take a look at one example \n", "\n", diff --git a/examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-3_federated_computing_platform/03.2_deployment_simulation/simulate_real_world_deployment.ipynb b/examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-3_federated_computing_platform/03.2_deployment_simulation/simulate_real_world_deployment.ipynb index f1dc6c6e40..f16635a56e 100644 --- a/examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-3_federated_computing_platform/03.2_deployment_simulation/simulate_real_world_deployment.ipynb +++ 
b/examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-3_federated_computing_platform/03.2_deployment_simulation/simulate_real_world_deployment.ipynb @@ -29,7 +29,7 @@ "\n", "[POC command](https://nvflare.readthedocs.io/en/main/user_guide/nvflare_cli/poc_command.html) provides a set of command to create different software packages and simulate client and server. You can also find the tutorials in [here](../../../../setup_poc.ipynb) on how to setup POC. \n", "\n", - "> With POC mode, it is ideally use terminal instead of notebook to setup, as there are some non-consistent behavior when running scripts.\n", + "> With POC mode, it is ideal to use a terminal instead of a notebook for setup, as there are some inconsistent behaviors when running scripts.\n", "\n", "### POC Prepare\n", "\n", @@ -118,20 +118,21 @@ "```\n", "\n", "\n", - "Notice the command create a working directory at ```/tmp/nvflare/poc/example_project/prod_00```\n", + "Notice the command creates a working directory at ```/tmp/nvflare/poc/example_project/prod_00```\n", "\n", - "based on the default values: \n", "\n", - "POC workspace = \"/tmp/nvflare/poc\"\n", - "project_name = \"example_project\"\n", - "and default folder \"prod_00\"\n", + "based on the default values:\n", "\n", - "All sites are with the default name with \"site-\". \n", + "* POC workspace = \"/tmp/nvflare/poc\"\n", + "* project_name = \"example_project\"\n", + "* and default folder \"prod_00\"\n", "\n", - "The actual process of generate such software package (startup kit) is called **\"provision\"**, the POC is special mode of provision, where both client and server are at localhost\n", "\n", + "All sites have default names starting with \"site-\".\n", "\n", - "Each site ( site-N and server) representing one location in the federated learning site and running federated learning client. \n", + "The actual process of generating such software packages (startup kit) is called **\"provision\"**. 
The POC is a special mode of provision, where both client and server are at localhost.\n", + "\n", + "Each site (site-N and server) represents one location in the federated learning system and runs a federated learning client.\n", "\n", "\n", "#### Simulating the real-world deployment\n", @@ -243,8 +244,7 @@ "source": [ "#### Prepare with Named Clients\n", "\n", - "If you just want to have default deployment, but specifiy the client site names ((instead of use default site-1,2 etc.) and not writing a project.yaml file, you do the following\n", - "\n", + "If you just want to have a default deployment but specify the client site names (instead of using default site-1,2 etc.) without writing a project.yaml file, you can do the following:\n", "\n", "\n", "nvflare poc prepare -c [CLIENTS ...]" @@ -331,7 +331,7 @@ "\n", "> Note: Using %%bash -bg to run the above command in a code cell may not always work\n", "\n", - "**Homework**: run the nvflare poc start command with or without -ex option\n", + "**Homework**: run the nvflare poc start command with or without the -ex option\n", "\n", "\n", "#### POC start individial site Only \n", diff --git a/examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-3_federated_computing_platform/03.3_interact_with_federated_computing_system/ ways_to_interact_with_fl_system.ipynb b/examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-3_federated_computing_platform/03.3_interact_with_federated_computing_system/ ways_to_interact_with_fl_system.ipynb index e89c853bf8..f3d01cf3b9 100644 --- a/examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-3_federated_computing_platform/03.3_interact_with_federated_computing_system/ ways_to_interact_with_fl_system.ipynb +++ b/examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-3_federated_computing_platform/03.3_interact_with_federated_computing_system/ ways_to_interact_with_fl_system.ipynb @@ -81,7 
+81,7 @@ "source": [ "Now start a FLARE system in POC mode\n", "\n", - "And use a terminal to start the POC withut admin console.\n", + "And use a terminal to start the POC without admin console.\n", "\n", "```nvflare poc start -ex admin@nvidia.com```\n", "\n", @@ -111,7 +111,7 @@ "metadata": {}, "source": [ "\n", - "You can submit job, list jobs and check results, check status of sites, list jobs, abort jobs\n" + "You can submit jobs, list jobs and check results, check status of sites, and abort jobs " ] }, { @@ -127,7 +127,8 @@ "id": "95bce19f", "metadata": {}, "source": [ - "Another way to interact with FLARE system is using FLARE python APIs. These APIs have the equivallent functions of the Admin Commands. And they can be issued directly from notebooks. \n", + "Another way to interact with FLARE system is using FLARE python APIs. These APIs have the equivalent functions of the Admin Commands. And they can be issued directly from notebooks. \n", + "\n", "\n", "Let's take a look how this can be done. \n", "\n", @@ -178,7 +179,7 @@ "id": "22612078", "metadata": {}, "source": [ - "In the terminal, you should see the training output, but here we like to use API to monitoring the job \n", + "In the terminal, you should see the training output, but here we would like to use API to monitor the job \n", "\n", "#### Monitor job\n", "The command ```monitor_job()``` allows you to follow a job until the job is done.\n", @@ -397,9 +398,9 @@ "id": "a216ce5d", "metadata": {}, "source": [ - "So far, we have learnt three different ways to interact with FLARE system. Although we used POC model to simulate the real deployment. In production, the same interaction commands can be used in production setup\n", + "So far, we have learned three different ways to interact with FLARE system. 
Although we used POC mode to simulate the real deployment, the same interaction commands can be used in a production setup.\n", "\n", - "Next, lets see how do we [monitoring FLARE system](../03.4_system_monitoring/system_monitorinig.ipynb)\n", + "Next, let's see how we [monitor the FLARE system](../03.4_system_monitoring/system_monitorinig.ipynb)\n", "\n" ] }, diff --git a/examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-3_federated_computing_platform/03.4_system_monitoring/job_example.ipynb b/examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-3_federated_computing_platform/03.4_system_monitoring/job_example.ipynb index e2fc01868f..0469518e11 100644 --- a/examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-3_federated_computing_platform/03.4_system_monitoring/job_example.ipynb +++ b/examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-3_federated_computing_platform/03.4_system_monitoring/job_example.ipynb @@ -5,8 +5,10 @@ "metadata": {}, "source": [ "# FLARE Monitoring\n", - "FLARE Monitoring provides a initial solution for tracking system metrics of your federated learning jobs.\n", - "Different from Machine learning experiment tracking, where it focused on the training metrics, the monitoring here focused on the FL system: i.e. job and system lifecycle metrics. 
\n", + "\n", + "FLARE Monitoring provides an initial solution for tracking system metrics of your federated learning jobs.\n", + "Unlike machine learning experiment tracking, which focuses on training metrics, this monitoring focuses on the FL system: i.e., job and system lifecycle metrics.\n", + "\n", "\n", "This guide will walk you through the steps to set up and use the monitoring system effectively.\n", "\n", @@ -97,7 +99,7 @@ "\n", "![setup-1](./figures/setup-1.png)\n", "\n", - "As described in the [system monitorinig introduction](./system_monitorinig.ipynb), we will make different component configurations depending on the setups.\n", + "As described in the [system monitoring introduction](./system_monitorinig.ipynb), we will create different component configurations depending on the setups.\n", "\n", "In this setup, all sites (server and clients) will share the same monitoring system with the same host and port.\n", "\n", @@ -133,27 +135,25 @@ "\n", "#### System Metrics Monitoring Configuration\n", "\n", - "We need to manually edit the configuration files for System Metrics collections.\n", + "We need to manually edit the configuration files for System Metrics collection.\n", "\n", - "For example, we need to add server to include \n", + "For example, we need to add to the server:\n", "\n", "* system metrics collector \n", "* statsd reporter\n", "\n", - "In the default POC setup, these components are added to \n", + "In the default POC setup, these components are added to:\n", "\"/tmp/nvflare/poc/example_project/prod_00/server/local/resources.json\"\n", "\n", - "For Client sides, we need to add \n", + "For client sides, we need to add:\n", "\n", "* system metrics collector \n", "* statsd reporter\n", "\n", - "\"/tmp/nvflare/poc/example_project/prod_00//local/resources.json\"\n", - "\n", + "to \"/tmp/nvflare/poc/example_project/prod_00//local/resources.json\"\n", "for the default POC setup.\n", "\n", - "\n", - "Instead of manually, go through each file, we 
wrote a small python program to do this: \n", + "Instead of manually going through each file, we wrote a small Python program to do this:\n", "\n", "```bash\n", "cd setup-1\n", @@ -202,11 +202,10 @@ "nvflare job submit -j /tmp/nvflare/jobs/job_config/fedavg\n", "```\n", "\n", - "\n", "## Monitoring View\n", "\n", - "Once you setup the system, you can view from the followingt website\n", - "for statsd-exporter, you can look at \n", + "Once you set up the system, you can view from the following websites.\n", + "For statsd-exporter, you can look at:\n", "\n", "### Statsd-exporter metrics view\n", "\n", @@ -241,15 +240,13 @@ "\n", "\n", "\n", - "## Complete steps\n", - "\n", - "Now, lets go to terminal and following all the steps to do the excersize\n", + "Now, let's go to the terminal and follow all the steps to do the exercise:\n", "\n", - "* install dependencies \n", + "* Install dependencies:\n", "\n", - " ```\n", - " pip install -r jobs/requirements.txt\n", - " \n", + " ```bash\n", + " \n", + " pip install -r jobs/requirements.txt\n", " ```\n", "\n", "* start monitoring systems (statsD, prometheus and grafana)\n", diff --git a/examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-3_federated_computing_platform/03.4_system_monitoring/system_monitorinig.ipynb b/examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-3_federated_computing_platform/03.4_system_monitoring/system_monitorinig.ipynb index 13f8b85f0b..b9dc31430c 100644 --- a/examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-3_federated_computing_platform/03.4_system_monitoring/system_monitorinig.ipynb +++ b/examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-3_federated_computing_platform/03.4_system_monitoring/system_monitorinig.ipynb @@ -15,8 +15,9 @@ "source": [ "# NVFLARE System Metrics Monitoring\n", "\n", - "FLARE Monitoring provides a initial solution for tracking system metrics of your 
federated learning jobs.\n", - "Different from Machine learning experiment tracking, where it focused on the training metrics, the monitoring here focused on the FL system: i.e. job and system lifecycle metrics.\n", + "FLARE Monitoring provides an initial solution for tracking system metrics of your federated learning jobs.\n", + "Different from Machine learning experiment tracking, where it focuses on the training metrics, the monitoring here focuses on the FL system: i.e. job and system lifecycle metrics.\n", + "\n", "\n", "This guide describes how to set up NVFLARE metrics publishing to StatsD Exporter, which then can be scraped by Prometheus and visualized with Grafana.\n", "\n", @@ -35,7 +36,7 @@ "4. **Set up Grafana** to visualize the metrics from Prometheus.\n", "\n", "> side notes: \n", - " don't confuse statsd-exporter and statsd-reporter. statsd-exporter is a libary (such as from datadog) used to receive metrics and for prometheus to scrape from; statsd-reporter is the FLARE component which export metrics to statsd-exporter. \n", + "Don't confuse statsd-exporter and statsd-reporter. statsd-exporter is a library (such as from datadog) used to receive metrics and for prometheus to scrape from; statsd-reporter is the FLARE component which exports metrics to statsd-exporter. \n", "\n", "\n", "\n", @@ -148,7 +149,6 @@ "2. **JobMetricsCollector**: This component collects job-level metrics and publishes them to the databus. It can be added to the workflow components on both client and server sites.\n", "3. **SysMetricsCollector**: This component collects system-level metrics running in the parent process of the server and clients. The metrics will be published to the databus.\n", "4. **RemoteMetricsReceiver**: This component receives the federated metrics streamed from client sides and publish the metriics. \n", - "4. 
**RemoteMetricsReceiver**: This component receives the federated metrics streamed from client sides and publishes the metrics.\n", "\n", "\n", "### Components Configuration\n", @@ -222,7 +222,6 @@ "}\n", "``` \n", "\n", - "tags can be key, value pair, they are used for group metrics in the report. Here we used \"site\" to indicate origin of the metrics, the \"dev\" env. to indicating the dev environment. \n", "Tags can be key-value pairs used for grouping metrics in the report. Here we used \"site\" to indicate the origin of the metrics and \"env\" to indicate the development environment.\n", "\n", "\n", @@ -234,7 +233,6 @@ "\n", "```//local/resources.json```\n", "\n", - "by rename ```resources.json.default``` to ```resources.json```\n", "by renaming ```resources.json.default``` to ```resources.json```\n", "in ```//local/resources.json```\n", "\n", @@ -266,7 +264,9 @@ "\n", "\n", "#### 2. Clients Forward Metrics to Server Site\n", + "\n", "In this setup, all client-side metrics will not directly post to the StatsD Exporter. Instead, the metrics are streamed to the server site. Therefore, the client side will need the following components:\n", + "\n", "In this setup, all client-side metrics will not be directly posted to the StatsD Exporter. Instead, the metrics are streamed to the server site. 
Therefore, the client side will need the following components:\n", "- **JobMetricsCollector**\n", "- **SysMetricsCollector**\n", diff --git a/examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-3_federated_computing_platform/03.5_recap/recap.ipynb b/examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-3_federated_computing_platform/03.5_recap/recap.ipynb index 08c8ad0df8..b264d9cf81 100644 --- a/examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-3_federated_computing_platform/03.5_recap/recap.ipynb +++ b/examples/tutorials/self-paced-training/part-2_federated_learning_system/chapter-3_federated_computing_platform/03.5_recap/recap.ipynb @@ -5,15 +5,25 @@ "id": "7c3c89fa-6355-4d34-8d85-a9447152996f", "metadata": {}, "source": [ - "# Chapter 3 Summary\n", + "# Recap Chapter 3: Federated Computing Platform\n", "\n", + "In this chapter, we explored key components and features of FLARE's federated computing platform:\n", "\n", - "In this chapter, we explored:\n", + "1. System Architecture\n", + " * Core components and their interactions\n", + " * high level system architecture design\n", "\n", - "* FLARE's overall system architecture\n", - "* POC mode to simulate deployment locally\n", - "* Different ways to interact with the FLARE system\n", - "* Monitoring the system with StatsD, Prometheus, and Grafana\n" + "2. POC (Proof of Concept) Mode\n", + " * Local deployment simulation\n", + " \n", + "3. System Interaction Methods\n", + " * Command-line interface (CLI)\n", + " * Python API\n", + " * FLARE Console\n", + "\n", + "4. 
System Monitoring and Observability\n", + " * System event metrics with StatsD, prometheus and Grafana dashboards\n", + " " ] }, { diff --git a/examples/tutorials/self-paced-training/part-3_security_and_privacy/chapter-6_Security_in_federated_compute_system/06.1_identity_security/identity_security.ipynb b/examples/tutorials/self-paced-training/part-3_security_and_privacy/chapter-6_Security_in_federated_compute_system/06.1_identity_security/identity_security.ipynb index b6e2936d42..9db2e404a4 100644 --- a/examples/tutorials/self-paced-training/part-3_security_and_privacy/chapter-6_Security_in_federated_compute_system/06.1_identity_security/identity_security.ipynb +++ b/examples/tutorials/self-paced-training/part-3_security_and_privacy/chapter-6_Security_in_federated_compute_system/06.1_identity_security/identity_security.ipynb @@ -182,7 +182,7 @@ "\n", "For more details please refer [documentation](https://nvflare.readthedocs.io/en/main/user_guide/security/identity_security.html)\n", "\n", - " \n" + "Now, lets move on to [site_security](../06.2_site_security_privacy_policy/site_policy.ipynb)\n" ] }, { diff --git a/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.0_introduction/hori_vert.png b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.0_introduction/hori_vert.png new file mode 100644 index 0000000000..3274ce26ab Binary files /dev/null and b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.0_introduction/hori_vert.png differ diff --git a/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.0_introduction/introduction.ipynb b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.0_introduction/introduction.ipynb index 93567ed2aa..e915ba4b8d 100644 --- 
a/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.0_introduction/introduction.ipynb +++ b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.0_introduction/introduction.ipynb @@ -1,9 +1,67 @@ { "cells": [ + { + "cell_type": "markdown", + "id": "cae3fb2d-8949-4e6d-a2af-0282ee973285", + "metadata": {}, + "source": [ + "# Federated Learning for XGBoost \n", + "This chapter demonstrates how to use NVFlare to train an XGBoost model in a federated learning setting. \n", + "Several potential variations of federated XGBoost are illustrated, including:\n", + "- non-secure horizontal collaboration with histogram-based and tree-based mechanisms.\n", + "- non-secure vertical collaboration with histogram-based mechanism.\n", + "- secure horizontal and vertical collaboration with histogram-based mechanism and homomorphic encryption.\n", + "\n", + "Let's first visit the basics of XGBoost and the collaboration modes." + ] + }, + { + "cell_type": "markdown", + "id": "6da8f899-2804-4d6e-ab03-a02ce115d32f", + "metadata": {}, + "source": [ + "## XGBoost \n", + "XGBoost is a machine learning algorithm that uses decision/regression trees to perform classification and regression tasks, \n", + "mapping a vector of feature values to its label prediction. It is especially powerful for tabular data, so even in the age of LLM, \n", + "it is still widely used for many tabular data use cases. It is also preferred for its explainability and efficiency.\n", + "\n", + "In these examples, we use [DMLC XGBoost](https://github.com/dmlc/xgboost), which is an optimized distributed gradient boosting library. 
\n", + "It offers advanced features like GPU accelerated capabilities, and distributed/federated learning support.\n", + "\n", + "## Collaboration Modes and Data Split\n", + "Essentially there are two collaboration modes: horizontal and vertical:\n", + "![hori_vert](./hori_vert.png)" + ] + }, + { + "cell_type": "markdown", + "id": "64466b4a-6f69-4663-b6f8-14a050a21634", + "metadata": {}, + "source": [ + "- In horizontal case, each participant has access to the same features (columns - \"x_1 x_2\") and label (\"y\") of different data samples (rows - 1/2/3 for Client A v.s. 4/5/6 for Client B). \n", + "In this case, everyone holds equal status as \"label owner\"\n", + "- In vertical case, each client has access to different features (columns - \"x_1 x_2 x_3\" for Client A v.s. \"x_4 x_5\" for Client B) of the same data samples (rows - 1/2/3).\n", + "We assume that only one is the \"label owner\" (or we call it as the \"active party\") - Client B owns label \"y\" \n", + "\n", + "To simulate the above two collaboration modes, we split the dataset both horizontally and vertically, and \n", + "we give site-1 the label column for simplicity.\n", + "\n", + "## Federated Training of XGBoost\n", + "Continue with this chapter for two scenarios:\n", + "### [Federated XGBoost without Encryption](../10.1_fed_xgboost/fed_xgboost.ipynb)\n", + "This section provides instructions for running federated XGBoost without homomorphic encryption, covering both histogram-based and tree-based horizontal collaboration, as well as histogram-based vertical collaboration.\n", + "\n", + "### [Secure Federated XGBoost with Homomorphic Encryption](../10.2_secure_fed_xgboost/secure_fed_xgboost.ipynb)\n", + "This section includes instructions on running secure federated XGBoost with homomorphic encryption under \n", + "histogram-based horizontal and vertical collaboration. 
Note that as tree-based methods exchange the local trained models (trees), rather than intermediate gradients / histograms, considering that the final model will be made available to all parties at the end of the federated learning, they do not have the same security concerns as histogram-based methods. Therefore under our current setting, we do not consider Homomorphic Encryption for tree-based methods.\n", + "\n", + "We will then finish this chapter with a [recap](../10.3_recap/recap.ipynb)" + ] + }, { "cell_type": "code", "execution_count": null, - "id": "159f19a6-8dc7-4b35-a1ec-7fa327e4239f", + "id": "58cd3833-4dc5-4a53-b188-3db7104b356b", "metadata": {}, "outputs": [], "source": [] @@ -11,9 +69,9 @@ ], "metadata": { "kernelspec": { - "display_name": "nvflare_example", + "display_name": "Python 3 (ipykernel)", "language": "python", - "name": "nvflare_example" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -25,7 +83,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.2" + "version": "3.11.5" } }, "nbformat": 4, diff --git a/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.1_fed_xgboost/Running_fed_xgboost_applications.ipynb b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.1_fed_xgboost/Running_fed_xgboost_applications.ipynb deleted file mode 100644 index f9529bb0bb..0000000000 --- a/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.1_fed_xgboost/Running_fed_xgboost_applications.ipynb +++ /dev/null @@ -1,33 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "e74b79f7-0541-47ee-bdf7-b9f3423b0094", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "nvflare_example", - "language": "python", - "name": "nvflare_example" - }, - 
"language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.2" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.1_fed_xgboost/fed_xgboost.ipynb b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.1_fed_xgboost/fed_xgboost.ipynb new file mode 100644 index 0000000000..e7ccbd7fda --- /dev/null +++ b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.1_fed_xgboost/fed_xgboost.ipynb @@ -0,0 +1,456 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "d14f9dec-9990-46a8-b4ce-2ff81275454e", + "metadata": {}, + "source": [ + "# Federated XGBoost\n", + "Several mechanisms have been proposed for training an XGBoost model in a federated learning setting.\n", + "In this section, we illustrate the use of NVFlare to carry out *horizontal* federated learning using two approaches: histogram-based collaboration and tree-based collaboration.\n", + "And *vertical* federated learning using histogram-based collaboration." 
+ ] + }, + { + "cell_type": "markdown", + "id": "2fce92a0-79a3-4cf2-adee-bb9c4125613c", + "metadata": {}, + "source": [ + "## Horizontal Federated XGBoost\n", + "Under horizontal setting, each participant joining the federated learning will have part of \n", + "the whole data samples / instances / records, while each sample has all the features.\n", + "\n", + "### Histogram-based Collaboration\n", + "The histogram-based collaboration federated XGBoost approach leverages NVFlare integration of [federated learning support](https://github.com/dmlc/xgboost/issues/7778) in the XGBoost open-source library,\n", + "which allows the existing *distributed* XGBoost training algorithm to operate in a federated manner,\n", + "with the federated clients acting as the distinct workers in the distributed XGBoost algorithm.\n", + "\n", + "In distributed XGBoost, individual workers share and aggregate gradient information about their respective portions of the training data,\n", + "as required to optimize tree node splitting when building the successive boosted trees.\n", + "\n", + "![hori_hist](./figs/hori_hist.png)\n", + "\n", + "The shared information is in the form of quantile sketches of feature values as well as corresponding sample gradient and sample Hessian histograms (\"Local G/H\"), based on which the global information can be computed (\"Global G/H\").\n", + "\n", + "Under federated histogram-based collaboration, information of precisely the same structure is exchanged among the clients.\n", + "The main differences are that the data is partitioned across the workers according to client data ownership, rather than being arbitrarily partitionable, and all communication is via an aggregating federated [gRPC](https://grpc.io) server instead of direct client-to-client communication.\n", + "Histograms from different clients, in particular, are aggregated in the server and then communicated back to the clients.\n", + "\n", + "### Tree-based Collaboration\n", + "Under tree-based
collaboration, individual trees are independently trained on each client's local data without aggregating the global sample gradient histogram information. \n", + "Trained trees are collected and passed to the server / other clients for aggregation and / or further boosting rounds.\n", + "\n", + "Compared with histogram-based collaboration, the major difference is that the histogram-based methods exchange the intermediate results for tree-boosting, while tree-based methods exchange the final tree model.\n", + "\n", + "Under this setting, we can further distinguish between two types of tree-based collaboration: cyclic and bagging.\n", + "\n", + "#### Cyclic Training\n", + "\"Cyclic XGBoost\" is one way of performing tree-based federated boosting with \n", + "multiple sites: \n", + "\n", + "![hori_cyclic](./figs/cyclic.png)\n", + "\n", + "At each round of tree boosting, instead of relying on the whole \n", + "data statistics collected from all clients, the boosting relies on only one client's \n", + "local data. The resulting tree sequence is then forwarded to the next client for \n", + "next round's boosting. One full \"cycle\" will be complete when all clients have been covered.\n", + "\n", + "#### Bagging Aggregation\n", + "\n", + "\"Bagging XGBoost\" is another way of performing tree-based federated boosting with multiple sites: \n", + "\n", + "![hori_cyclic](./figs/tree.png)\n", + "\n", + "At each round of tree boosting, all sites start from the same \"global model\", and boost a number of trees (in the current example, 1 tree) based on their local data. The resulting trees are then sent to the server. A bagging aggregation scheme is applied to all the submitted trees to update the global model, which is further distributed to all clients for next round's boosting. 
\n", + "\n", + "This scheme bears certain similarity to the [Random Forest mode](https://xgboost.readthedocs.io/en/stable/tutorials/rf.html) of XGBoost, where a `num_parallel_tree` is boosted based on random row/col splits, rather than a single tree. Under federated learning setting, such split is fixed to clients rather than random and without column subsampling. \n", + "\n", + "In addition to basic uniform shrinkage setting where all clients have the same learning rate, based on our research, we enabled scaled shrinkage across clients for weighted aggregation according to each client's data size, which is shown to significantly improve the model's performance on non-uniform quantity splits.\n", + "\n", + "Specifically, the global model is updated by aggregating the trees from all clients as a forest, and the global model is then broadcast back to all clients for local prediction and further training.\n", + "\n", + "The XGBoost Booster API is leveraged to create in-memory Booster objects that persist across rounds to cache predictions from trees added in previous rounds and retain other data structures needed for training." + ] + }, + { + "cell_type": "markdown", + "id": "851e4197-db50-4054-84da-4faf4129e22c", + "metadata": {}, + "source": [ + "## Vertical Federated XGBoost\n", + "Under vertical setting, each participant joining the federated learning will \n", + "have part of the whole features, while each site has all the overlapping instances.\n", + "\n", + "### Private Set Intersection (PSI)\n", + "In this tutorial, we assume that all parties hold the same population but different features. \n", + "\n", + "In reality, however, not every site will have the same set of data samples (rows), and we shall use PSI to first compare encrypted versions of the sites' datasets in order to jointly compute the intersection based on common IDs. 
To learn more about our PSI protocol implementation, see our [psi example](https://github.com/NVIDIA/NVFlare/tree/main/examples/advanced/psi/README.md).\n", + "\n", + "### Histogram-based Collaboration\n", + "Similar to its horizontal counterpart, under vertical collaboration, the gradients for each sample will be first computed with label information by the active party; then the gradients will be broadcasted to all passive parties, where they will be used to compute local feature histograms, and find the local best splits with their corresponding gain values; at the last stage, all local best splits will be synced to find the global best split, with which the next split of the tree can be determined. \n", + "\n", + "By exchanging gradient and split information among all sites and update the global model accordingly, vertical histogram-based method can result in the exact same model as the centralized training. \n", + "\n", + "![vert_hist](./figs/vert_hist.png)\n", + "\n", + "We leverage the [vertical federated learning support](https://github.com/dmlc/xgboost/issues/8424) in the XGBoost open-source library. This allows for the distributed XGBoost algorithm to operate in a federated manner on vertically split data." 
+ ] + }, + { + "cell_type": "markdown", + "id": "f942f5b9-bd00-4536-8c9c-ae566e303001", + "metadata": {}, + "source": [ + "## Setup\n", + "Install required packages for data download and training" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "85d420de-1a0d-4964-bb1d-a79531c660ac", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -r requirements.txt" + ] + }, + { + "cell_type": "markdown", + "id": "a8718f40-04e8-42e9-85f1-e256d00f4a5c", + "metadata": {}, + "source": [ + "## Data Preparation\n", + "Download and Store Data\n", + "To run the examples, we first download the dataset and stored in /tmp/nvflare/dataset/creditcard.csv with the following:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c0e134c6-26b0-4535-bdc0-0be1801efd4e", + "metadata": {}, + "outputs": [], + "source": [ + "import kagglehub\n", + "path = kagglehub.dataset_download(\"mlg-ulb/creditcardfraud\")\n", + "! mkdir -p /tmp/nvflare/dataset/\n", + "! cp {path}/creditcard.csv /tmp/nvflare/dataset/" + ] + }, + { + "cell_type": "markdown", + "id": "14be5301-ba4c-4139-9749-01064b45fd0d", + "metadata": {}, + "source": [ + "### Data Split\n", + "To prepare data for further experiments, we perform the following steps:\n", + "1. Split the dataset into training/validation and testing sets. \n", + "2. Split the training/validation set: \n", + " * Into \"train\" and \"valid\" for baseline centralized training.\n", + " * Into \"train\" and \"valid\" for each client under horizontal setting. \n", + " * Into \"train\" and \"valid\" for each client under vertical setting.\n", + "\n", + "Data splits used in this example can be generated with" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "10141e9b-b060-4ae6-9c43-303ea3f5ef3f", + "metadata": {}, + "outputs": [], + "source": [ + "! 
bash prepare_data.sh" + ] + }, + { + "cell_type": "markdown", + "id": "317fab1c-f229-4ad3-a263-5ca0974d9245", + "metadata": {}, + "source": [ + "This will generate data splits for 3 clients under all experimental settings.\n", + "\n", + "From the prints, we can see we have in total `182276` rows (data samples) for training, each with `31` columns (30 features + 1 label) \n", + "\n", + "For vertical splits, site-wise column assignments are: \n", + "- site-1 split cols [0:12]\n", + "- site-2 split cols [12:21]\n", + "- site-3 split cols [21:31]\n", + "\n", + "For horizontal splits, site-wise row assignments are:\n", + "- site-1 split rows [0:60758]\n", + "- site-2 split rows [60758:121516]\n", + "- site-3 split rows [121516:182276]\n", + "\n", + "> **_NOTE:_** In this section, we have divided the dataset into separate columns for each site,\n", + "> assuming that the datasets from different sites have already been joined using Private Set\n", + "> Intersection (PSI). In practice, each site initially has its own separate dataset. To\n", + "> combine these datasets accurately, PSI is needed to match records with the same ID across\n", + "> different sites. 
\n", + "\n", + "> **_NOTE:_** The generated data files will be stored in the folder `/tmp/nvflare/dataset/xgb_dataset/`" + ] + }, + { + "cell_type": "markdown", + "id": "7fd0b0d8-2e38-498a-becc-d34ce4b57229", + "metadata": {}, + "source": [ + "## Experiments\n", + "We first run the centralized training to get the baseline performance, then run the federated XGBoost training using NVFlare Simulator via [JobAPI](https://nvflare.readthedocs.io/en/main/programming_guide/fed_job_api.html).\n", + "\n", + "### Centralized Baseline\n", + "For centralized training, we train the XGBoost model on the whole dataset.\n", + "\n", + "Let's first examine the data used for centralized baseline:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2a56fee6-1659-4cb8-8273-4b1e63dcf491", + "metadata": {}, + "outputs": [], + "source": [ + "!tree /tmp/nvflare/dataset/xgb_dataset/base_xgb_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c21892e7-b481-41ba-9470-001c86f0b4ab", + "metadata": {}, + "outputs": [], + "source": [ + "import csv\n", + "\n", + "def print_first_rows_csv(file_path, num_rows=1):\n", + " with open(file_path, 'r') as file:\n", + " csv_reader = csv.reader(file)\n", + " for i, row in enumerate(csv_reader):\n", + " if i >= num_rows:\n", + " break\n", + " print(','.join(row))\n", + "\n", + "file_path = '/tmp/nvflare/dataset/xgb_dataset/base_xgb_data/train.csv'\n", + "print_first_rows_csv(file_path)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9b64a449-45f2-45af-8f7f-161a767a2152", + "metadata": {}, + "outputs": [], + "source": [ + "! python train_base.py " + ] + }, + { + "cell_type": "markdown", + "id": "2ffb6344-3bed-4089-a546-1818588f5751", + "metadata": {}, + "source": [ + "The results by default will be stored in the folder `/tmp/nvflare/workspace/fedxgb/train_base/`." 
+ ] + }, + { + "cell_type": "markdown", + "id": "6a0c9c8a-c90f-4fcb-b3ed-601e79f7f1f9", + "metadata": {}, + "source": [ + "### Horizontal Experiments\n", + "Let's take a look at the dataset for horizontal experiments." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5645c9d6-5711-4a3c-bf1b-f777b0460831", + "metadata": {}, + "outputs": [], + "source": [ + "!tree /tmp/nvflare/dataset/xgb_dataset/horizontal_xgb_data/" + ] + }, + { + "cell_type": "markdown", + "id": "739b0cb0-2760-4d3f-85e9-382470ffa04c", + "metadata": {}, + "source": [ + "First row of site-1 data, should be identical to the first row of baseline data:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1c07cdf1-fda6-47c3-aa31-e963a20a9a72", + "metadata": {}, + "outputs": [], + "source": [ + "file_path = '/tmp/nvflare/dataset/xgb_dataset/horizontal_xgb_data/site-1/train.csv'\n", + "print_first_rows_csv(file_path)" + ] + }, + { + "cell_type": "markdown", + "id": "7400fd5e-c094-4ae3-9f00-7c10aee71d78", + "metadata": {}, + "source": [ + "The following cases will be covered:\n", + "- Histogram-based collaboration\n", + "- Tree-based collaboration with cyclic training \n", + "- Tree-based collaboration with bagging training \n", + "\n", + "The experiments can be run with:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1a6189d1-285d-4473-9f9f-71b40c794c19", + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "! python xgb_fl_job.py --training_algo histogram --data_split_mode horizontal\n", + "! python xgb_fl_job.py --training_algo cyclic --data_split_mode horizontal\n", + "! 
python xgb_fl_job.py --training_algo bagging --data_split_mode horizontal" + ] + }, + { + "cell_type": "markdown", + "id": "e13d0737-80fd-4a7e-8349-223fcd7badf8", + "metadata": {}, + "source": [ + "### Vertical Experiment\n", + "Let's take a look at the dataset for vertical experiments:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "400c3500-98e1-44f1-9d17-72daeb213499", + "metadata": {}, + "outputs": [], + "source": [ + "!tree /tmp/nvflare/dataset/xgb_dataset/vertical_xgb_data/" + ] + }, + { + "cell_type": "markdown", + "id": "f5243e40-6fe1-483b-b6dd-3104bd03ba2d", + "metadata": {}, + "source": [ + "First row of site-1/2/3 data combined together, should be identical to the first row of baseline data:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6553a564-4fe9-413b-b8e9-07f15dc806e7", + "metadata": {}, + "outputs": [], + "source": [ + "file_path = '/tmp/nvflare/dataset/xgb_dataset/vertical_xgb_data/site-1/train.csv'\n", + "print_first_rows_csv(file_path)\n", + "file_path = '/tmp/nvflare/dataset/xgb_dataset/vertical_xgb_data/site-2/train.csv'\n", + "print_first_rows_csv(file_path)\n", + "file_path = '/tmp/nvflare/dataset/xgb_dataset/vertical_xgb_data/site-3/train.csv'\n", + "print_first_rows_csv(file_path)" + ] + }, + { + "cell_type": "markdown", + "id": "818d9f5d-e0f1-4d6c-881a-ea44d9712f14", + "metadata": {}, + "source": [ + "Histogram-based collaboration will be performed for vertical setting:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c38ad75c-dccf-44c9-889b-758225a26d08", + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "! 
python xgb_fl_job.py --training_algo histogram --data_split_mode vertical" + ] + }, + { + "cell_type": "markdown", + "id": "8017a2c8-5e4b-4d3b-995d-4226b6845f59", + "metadata": {}, + "source": [ + "## Results\n", + "We can visualize the results via tensorboard records:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d6611189-245c-4a1a-b182-0d080b8e094a", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext tensorboard\n", + "%tensorboard --logdir /tmp/nvflare/workspace/fedxgb/works" + ] + }, + { + "cell_type": "markdown", + "id": "45684795-ce6c-466f-ad2a-86146f8247ea", + "metadata": {}, + "source": [ + "For reference, the training curves for the four settings are below:\n", + "\n", + "![training_curves](./figs/training_curves.png)\n", + "\n", + "As shown, for this task, histogram-based methods, both vertical and horizontal, result in almost identical curves, and achieve better results as compared with bagging / cyclic.\n", + "Bagging and cyclic also converge to the same training accuracy at the end of training. \n", + "\n", + "Also as expected, the vertical histogram-based method achieves performance identical to baseline training." 
+ ] + }, + { + "cell_type": "markdown", + "id": "8f38ab06-2fdc-4dfe-947c-83397b6db8e2", + "metadata": {}, + "source": [ + "Now let's move on to the next section [Secure Federated XGBoost with Homomorphic Encryption](../10.2_secure_fed_xgboost/secure_fed_xgboost.ipynb) to see how to protect data privacy during histogram-based collaborations with federated learning and encryption." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a8e5d474-c4be-4d99-8881-3fac805ae82c", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.2" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.1_fed_xgboost/fed_xgboost_introduction.ipynb b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.1_fed_xgboost/fed_xgboost_introduction.ipynb deleted file mode 100644 index 2ad53ed423..0000000000 --- a/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.1_fed_xgboost/fed_xgboost_introduction.ipynb +++ /dev/null @@ -1,33 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "39b5b0cf-ea5d-4eaa-b028-a5e6110a217e", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "nvflare_example", - "language": "python", - "name": "nvflare_example" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": 
"python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.2" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.1_fed_xgboost/figs/cyclic.png b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.1_fed_xgboost/figs/cyclic.png new file mode 100644 index 0000000000..53db525101 Binary files /dev/null and b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.1_fed_xgboost/figs/cyclic.png differ diff --git a/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.1_fed_xgboost/figs/hori_hist.png b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.1_fed_xgboost/figs/hori_hist.png new file mode 100644 index 0000000000..28763ebc45 Binary files /dev/null and b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.1_fed_xgboost/figs/hori_hist.png differ diff --git a/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.1_fed_xgboost/figs/training_curves.png b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.1_fed_xgboost/figs/training_curves.png new file mode 100644 index 0000000000..1ead6f97ce Binary files /dev/null and b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.1_fed_xgboost/figs/training_curves.png differ diff --git a/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.1_fed_xgboost/figs/tree.png b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.1_fed_xgboost/figs/tree.png new 
file mode 100644 index 0000000000..cab87a94dc Binary files /dev/null and b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.1_fed_xgboost/figs/tree.png differ diff --git a/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.1_fed_xgboost/figs/vert_hist.png b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.1_fed_xgboost/figs/vert_hist.png new file mode 100644 index 0000000000..430e243be4 Binary files /dev/null and b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.1_fed_xgboost/figs/vert_hist.png differ diff --git a/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.1_fed_xgboost/prepare_data.sh b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.1_fed_xgboost/prepare_data.sh new file mode 100644 index 0000000000..29e6ed5962 --- /dev/null +++ b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.1_fed_xgboost/prepare_data.sh @@ -0,0 +1,35 @@ +DATASET_PATH="/tmp/nvflare/dataset/creditcard.csv" +SPLIT_PATH="/tmp/nvflare/dataset/xgb_dataset/" + +if [ ! -f "${DATASET_PATH}" ] +then + echo "Please check if you saved CreditCard dataset in ${DATASET_PATH}" +fi + +echo "Generating CreditCard data splits, reading from ${DATASET_PATH}" + +echo "Split data to training/validation v.s. 
testing" +python3 utils/prepare_data_traintest_split.py \ +--data_path "${DATASET_PATH}" \ +--test_ratio 0.2 \ +--out_folder "${SPLIT_PATH}" + +echo "Split training/validation data" +OUTPUT_PATH="${SPLIT_PATH}/base_xgb_data" +python3 utils/prepare_data_base.py \ +--data_path "${SPLIT_PATH}/train.csv" \ +--out_path "${OUTPUT_PATH}" + +echo "Split training/validation data vertically" +OUTPUT_PATH="${SPLIT_PATH}/vertical_xgb_data" +python3 utils/prepare_data_vertical.py \ +--data_path "${SPLIT_PATH}/train.csv" \ +--site_num 3 \ +--out_path "${OUTPUT_PATH}" + +echo "Split training/validation data horizontally" +OUTPUT_PATH="${SPLIT_PATH}/horizontal_xgb_data" +python3 utils/prepare_data_horizontal.py \ +--data_path "${SPLIT_PATH}/train.csv" \ +--site_num 3 \ +--out_path "${OUTPUT_PATH}" diff --git a/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.1_fed_xgboost/requirements.txt b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.1_fed_xgboost/requirements.txt new file mode 100644 index 0000000000..38ed8c98d9 --- /dev/null +++ b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.1_fed_xgboost/requirements.txt @@ -0,0 +1,9 @@ +pandas +torch +scikit-learn +tensorboard +kagglehub +shap +matplotlib +# require xgboost 2.2 version, for now need to install a nightly build +https://s3-us-west-2.amazonaws.com/xgboost-nightly-builds/federated-secure/xgboost-2.2.0.dev0%2B4601688195708f7c31fcceeb0e0ac735e7311e61-py3-none-manylinux_2_28_x86_64.whl \ No newline at end of file diff --git a/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.1_fed_xgboost/train_base.py b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.1_fed_xgboost/train_base.py new file mode 100644 index 0000000000..cd15b31bdc --- /dev/null +++ 
b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.1_fed_xgboost/train_base.py @@ -0,0 +1,153 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import os + +import matplotlib.pyplot as plt +import pandas as pd +import shap +import xgboost as xgb + +PRINT_SAMPLE = False + + +def train_base_args_parser(): + parser = argparse.ArgumentParser(description="Train baseline XGBoost model") + parser.add_argument("--gpu", type=int, default=0, help="Whether to use gpu for training, 0 for cpu, 1 for gpu") + parser.add_argument( + "--data_train_root", + type=str, + default="/tmp/nvflare/dataset/xgb_dataset/base_xgb_data", + help="Path to training data folder", + ) + parser.add_argument( + "--data_test_file", + type=str, + default="/tmp/nvflare/dataset/xgb_dataset/test.csv", + help="Path to testing data file", + ) + parser.add_argument( + "--out_path", + type=str, + default="/tmp/nvflare/workspace/fedxgb/train_base", + help="Output path for the data split file", + ) + return parser + + +def load_test_data(data_path: str): + df = pd.read_csv(data_path) + # Split to feature and label + X = df.iloc[:, 1:] + y = df.iloc[:, 0] + return X, y + + +def main(): + parser = train_base_args_parser() + args = parser.parse_args() + if not os.path.exists(args.out_path): + os.makedirs(args.out_path) + + # Specify file path, rank 0 as the label owner, others as 
the feature owner + train_path = f"{args.data_train_root}/train.csv" + valid_path = f"{args.data_train_root}/valid.csv" + + # Load file directly to tell the match from loading with DMatrix + df_train = pd.read_csv(train_path, header=None) + if PRINT_SAMPLE: + # print number of rows and columns for each worker + print(f"Direct load: nrow={df_train.shape[0]}, ncol={df_train.shape[1]}") + # print one sample row of the data + print(f"Direct load: one sample row of the data: \n {df_train.iloc[0]}") + + # Load file, file will not be sharded in federated mode. + label = "&label_column=0" + # for Vertical XGBoost, read from csv with label_column and set data_split_mode to 1 for column mode + dtrain = xgb.DMatrix(train_path + f"?format=csv{label}") + dvalid = xgb.DMatrix(valid_path + f"?format=csv{label}") + + if PRINT_SAMPLE: + # print number of rows and columns for each worker + print(f"DMatrix: nrow={dtrain.num_row()}, ncol={dtrain.num_col()}") + # print one sample row of the data + data_sample = dtrain.get_data()[0] + print(f"DMatrix: one sample row of the data: \n {data_sample}") + + # Specify parameters via map, definition are same as c++ version + if args.gpu: + device = "cuda:0" + else: + device = "cpu" + param = { + "max_depth": 3, + "eta": 0.1, + "objective": "binary:logistic", + "eval_metric": "auc", + "tree_method": "hist", + "device": device, + "nthread": 1, + } + + # Specify validations set to watch performance + watchlist = [(dvalid, "eval"), (dtrain, "train")] + num_round = 30 + + # Run training, all the features in training API is available. 
+ bst = xgb.train(param, dtrain, num_round, evals=watchlist) + + # Save the model + bst.save_model(f"{args.out_path}/model.base.json") + xgb.collective.communicator_print("Finished training\n") + + # save feature importance score to file + score = bst.get_score(importance_type="gain") + with open(f"{args.out_path}/feat_importance.base.txt", "w") as f: + for key in score: + f.write(f"{key}: {score[key]}\n") + + # Load test data + X_test, y_test = load_test_data(args.data_test_file) + # construct xgboost DMatrix + dmat_test = xgb.DMatrix(X_test, label=y_test) + + # Explain the model + explainer = shap.TreeExplainer(bst) + explanation = explainer(dmat_test) + + # save the beeswarm plot to png file + shap.plots.beeswarm(explanation, show=False) + img = plt.gcf() + img.savefig(f"{args.out_path}/shap.base.png") + + # dump tree and save to text file + dump = bst.get_dump() + with open(f"{args.out_path}/tree_dump.base.txt", "w") as f: + for tree in dump: + f.write(tree) + + # plot tree and save to png file + xgb.plot_tree(bst, num_trees=0, rankdir="LR") + fig = plt.gcf() + fig.set_size_inches(18, 5) + plt.savefig(f"{args.out_path}/tree.base.png", dpi=100) + + # export tree to dataframe + tree_df = bst.trees_to_dataframe() + tree_df.to_csv(f"{args.out_path}/tree_df.base.csv") + + +if __name__ == "__main__": + main() diff --git a/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.1_fed_xgboost/utils/prepare_data_base.py b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.1_fed_xgboost/utils/prepare_data_base.py new file mode 100644 index 0000000000..a0f452b57f --- /dev/null +++ b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.1_fed_xgboost/utils/prepare_data_base.py @@ -0,0 +1,73 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import os +import shutil + +import numpy as np +import pandas as pd + + +def data_split_args_parser(): + parser = argparse.ArgumentParser(description="Generate data split for dataset") + parser.add_argument("--data_path", type=str, help="Path to data file") + parser.add_argument( + "--out_path", + type=str, + default="./dataset", + help="Output path for the data split file", + ) + return parser + + +def split_num_proportion(n, site_num): + split = [] + ratio_vec = np.ones(site_num) + total = sum(ratio_vec) + left = n + for site in range(site_num - 1): + x = int(n * ratio_vec[site] / total) + left = left - x + split.append(x) + split.append(left) + return split + + +def main(): + parser = data_split_args_parser() + args = parser.parse_args() + + df = pd.read_csv(args.data_path, header=None) + + rows_total, cols_total = df.shape[0], df.shape[1] + + print(f"rows_total: {rows_total}, cols_total: {cols_total}") + + if os.path.exists(args.out_path): + shutil.rmtree(args.out_path) + + os.makedirs(args.out_path, exist_ok=True) + + # assign first 80% rows to train + df_train = df.iloc[: int(0.8 * df.shape[0]), :] + # assign last 20% rows to valid + df_valid = df.iloc[int(0.8 * df.shape[0]) :, :] + # save train and valid data + df_train.to_csv(path_or_buf=os.path.join(args.out_path, "train.csv"), index=False, header=False) + df_valid.to_csv(path_or_buf=os.path.join(args.out_path, "valid.csv"), index=False, 
header=False) + + +if __name__ == "__main__": + main() diff --git a/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.1_fed_xgboost/utils/prepare_data_horizontal.py b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.1_fed_xgboost/utils/prepare_data_horizontal.py new file mode 100644 index 0000000000..92a2c88615 --- /dev/null +++ b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.1_fed_xgboost/utils/prepare_data_horizontal.py @@ -0,0 +1,89 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import os +import shutil + +import numpy as np +import pandas as pd + + +def data_split_args_parser(): + parser = argparse.ArgumentParser(description="Generate data split for dataset") + parser.add_argument("--data_path", type=str, help="Path to data file") + parser.add_argument("--site_num", type=int, default=3, help="Total number of sites") + parser.add_argument( + "--out_path", + type=str, + default="./dataset", + help="Output path for the data split file", + ) + return parser + + +def split_num_proportion(n, site_num): + split = [] + ratio_vec = np.ones(site_num) + total = sum(ratio_vec) + left = n + for site in range(site_num - 1): + x = int(n * ratio_vec[site] / total) + left = left - x + split.append(x) + split.append(left) + return split + + +def main(): + parser = data_split_args_parser() + args = parser.parse_args() + + df = pd.read_csv(args.data_path, header=None) + + rows_total, cols_total = df.shape[0], df.shape[1] + + print(f"site_num: {args.site_num}") + print(f"rows_total: {rows_total}, cols_total: {cols_total}") + + # split row + site_row_size = split_num_proportion(int(0.8 * rows_total), args.site_num) + print(f"site_row_size: {site_row_size}") + + if os.path.exists(args.out_path): + shutil.rmtree(args.out_path) + + # assign first 80% rows to train + df_train = df.iloc[: int(0.8 * rows_total), :] + # assign last 20% rows to valid + df_valid = df.iloc[int(0.8 * rows_total) :, :] + + for site in range(args.site_num): + row_start = sum(site_row_size[:site]) + row_end = sum(site_row_size[: site + 1]) + + df_split = df_train.iloc[row_start:row_end, :] + print(f"site-{site + 1} split rows [{row_start}:{row_end}]") + + data_path = os.path.join(args.out_path, f"site-{site + 1}") + if not os.path.exists(data_path): + os.makedirs(data_path, exist_ok=True) + + # save train and valid data + df_split.to_csv(path_or_buf=os.path.join(data_path, "train.csv"), index=False, header=False) + df_valid.to_csv(path_or_buf=os.path.join(data_path, 
"valid.csv"), index=False, header=False) + + +if __name__ == "__main__": + main() diff --git a/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.1_fed_xgboost/utils/prepare_data_traintest_split.py b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.1_fed_xgboost/utils/prepare_data_traintest_split.py new file mode 100644 index 0000000000..c913d52dd7 --- /dev/null +++ b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.1_fed_xgboost/utils/prepare_data_traintest_split.py @@ -0,0 +1,71 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import os + +import pandas as pd +from sklearn.model_selection import train_test_split + + +def data_split_args_parser(): + parser = argparse.ArgumentParser(description="Generate training/testing split for dataset") + parser.add_argument("--data_path", type=str, help="Path to data file") + parser.add_argument("--test_ratio", type=float, help="Ratio of testing set") + parser.add_argument( + "--out_folder", + type=str, + default="~/dataset", + help="Output folder for training/testing data", + ) + return parser + + +def main(): + parser = data_split_args_parser() + args = parser.parse_args() + + df = pd.read_csv(args.data_path) + df_pos = df[df.Class == 1] + df_neg = df[df.Class == 0] + # print the number of positive and negative samples + print("Number of positive samples: ", len(df_pos)) + print("Number of negative samples: ", len(df_neg)) + + # Split data into training and testing sets + X_pos = df_pos.drop(["Class"], axis=1) + y_pos = df_pos.Class + X_neg = df_neg.drop(["Class"], axis=1) + y_neg = df_neg.Class + X = pd.concat([X_pos, X_neg]) + y = pd.concat([y_pos, y_neg]) + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=args.test_ratio, random_state=77) + df_train = pd.concat([y_train, X_train], axis=1) + df_test = pd.concat([y_test, X_test], axis=1) + + # print the number of positive and negative samples in training and testing sets + print("Number of positive samples in training set: ", len(df_train[df_train.Class == 1])) + print("Number of negative samples in training set: ", len(df_train[df_train.Class == 0])) + print("Number of positive samples in testing set: ", len(df_test[df_test.Class == 1])) + print("Number of negative samples in testing set: ", len(df_test[df_test.Class == 0])) + + # Save training and testing sets + if not os.path.exists(args.out_folder): + os.makedirs(args.out_folder, exist_ok=True) + df_train.to_csv(path_or_buf=os.path.join(args.out_folder, "train.csv"), index=False, header=False) + 
df_test.to_csv(path_or_buf=os.path.join(args.out_folder, "test.csv"), index=False, header=False) + + +if __name__ == "__main__": + main() diff --git a/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.1_fed_xgboost/utils/prepare_data_vertical.py b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.1_fed_xgboost/utils/prepare_data_vertical.py new file mode 100644 index 0000000000..dcf131a829 --- /dev/null +++ b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.1_fed_xgboost/utils/prepare_data_vertical.py @@ -0,0 +1,93 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import os +import shutil + +import numpy as np +import pandas as pd + + +def data_split_args_parser(): + parser = argparse.ArgumentParser(description="Generate data split for dataset") + parser.add_argument("--data_path", type=str, help="Path to data file") + parser.add_argument("--site_num", type=int, default=3, help="Total number of sites") + parser.add_argument( + "--out_path", + type=str, + default="./dataset", + help="Output path for the data split file", + ) + return parser + + +def split_num_proportion(n, site_num): + split = [] + ratio_vec = np.ones(site_num) + total = sum(ratio_vec) + left = n + for site in range(site_num - 1): + x = int(n * ratio_vec[site] / total) + left = left - x + split.append(x) + split.append(left) + return split + + +def main(): + parser = data_split_args_parser() + args = parser.parse_args() + + df = pd.read_csv(args.data_path, header=None) + + rows_total, cols_total = df.shape[0], df.shape[1] + + print(f"site_num: {args.site_num}") + print(f"rows_total: {rows_total}, cols_total: {cols_total}") + + # split col + cols_labelowner = int(cols_total * 0.4) + site_col_size = split_num_proportion(cols_total - cols_labelowner, args.site_num - 1) + site_col_size.insert(0, cols_labelowner) + print(f"site_col_size: {site_col_size}") + + if os.path.exists(args.out_path): + shutil.rmtree(args.out_path) + + for site in range(args.site_num): + col_start = sum(site_col_size[:site]) + col_end = sum(site_col_size[: site + 1]) + + df_split = df.iloc[:, col_start:col_end] + print(f"site-{site + 1} split cols [{col_start}:{col_end}]") + + data_path = os.path.join(args.out_path, f"site-{site + 1}") + if not os.path.exists(data_path): + os.makedirs(data_path, exist_ok=True) + + # assign first 80% rows to train + df_train = df_split.iloc[: int(0.8 * df_split.shape[0]), :] + # assign last 20% rows to valid + df_valid = df_split.iloc[int(0.8 * df_split.shape[0]) :, :] + + print(f"rows_training: {int(0.8 * df_split.shape[0])}") + + # 
save train and valid data + df_train.to_csv(path_or_buf=os.path.join(data_path, "train.csv"), index=False, header=False) + df_valid.to_csv(path_or_buf=os.path.join(data_path, "valid.csv"), index=False, header=False) + + +if __name__ == "__main__": + main() diff --git a/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.1_fed_xgboost/xgb_fl_job.py b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.1_fed_xgboost/xgb_fl_job.py new file mode 100644 index 0000000000..093ee1c0ca --- /dev/null +++ b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.1_fed_xgboost/xgb_fl_job.py @@ -0,0 +1,203 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 

import argparse

from nvflare.app_common.widgets.convert_to_fed_event import ConvertToFedEvent
from nvflare.app_opt.tracking.tb.tb_receiver import TBAnalyticsReceiver
from nvflare.app_opt.tracking.tb.tb_writer import TBWriter
from nvflare.app_opt.xgboost.histogram_based_v2.csv_data_loader import CSVDataLoader
from nvflare.job_config.api import FedJob

# Supported training algorithms. In this file only the keys are used (as the
# allowed values of --training_algo); the values label each algorithm as
# "tree-based" or "histogram-based".
ALGO_DIR_MAP = {
    "bagging": "tree-based",
    "cyclic": "tree-based",
    "histogram": "histogram-based",
}


def define_parser(argv=None):
    """Parse the command-line options of the federated XGBoost job script.

    Args:
        argv: optional list of arguments; ``None`` (the default) makes
            argparse read ``sys.argv``.

    Returns:
        argparse.Namespace: parsed options (``data_root``, ``site_num``,
        ``round_num``, ``training_algo``, ``nthread``, ``tree_method``,
        ``data_split_mode``).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--data_root",
        type=str,
        default="/tmp/nvflare/dataset/xgb_dataset",
        help="Path to dataset files for each site",
    )
    parser.add_argument("--site_num", type=int, default=3, help="Total number of sites")
    parser.add_argument("--round_num", type=int, default=30, help="Total number of training rounds")
    parser.add_argument(
        "--training_algo", type=str, default="histogram", choices=list(ALGO_DIR_MAP.keys()), help="Training algorithm"
    )
    parser.add_argument("--nthread", type=int, default=16, help="nthread for xgboost")
    parser.add_argument(
        "--tree_method", type=str, default="hist", help="tree_method for xgboost - use hist for best perf"
    )
    parser.add_argument(
        "--data_split_mode",
        type=str,
        default="horizontal",
        choices=["horizontal", "vertical"],
        help="dataset split mode, horizontal or vertical",
    )
    return parser.parse_args(argv)


def _get_job_name(args) -> str:
    """Return the job name derived from site count, split mode and algorithm."""
    return f"fedxgb_{args.site_num}_sites_{args.data_split_mode}_{args.training_algo}"


def _get_data_path(args) -> str:
    """Return the dataset folder for the selected split mode under ``data_root``."""
    return f"{args.data_root}/{args.data_split_mode}_xgb_data"


def main(argv=None):
    """Assemble the federated XGBoost job, export it and run it in the simulator.

    The server receives the controller for the selected algorithm (plus its
    persistor / shareable generator / aggregator, or a TensorBoard receiver
    for the histogram algorithm). Each client ``site-1`` .. ``site-N``
    receives an executor and a CSV data loader; histogram clients also get a
    TensorBoard writer and an event converter.

    Args:
        argv: optional list of command-line arguments; ``None`` reads
            ``sys.argv``.
    """
    args = define_parser(argv)
    job_name = _get_job_name(args)
    dataset_path = _get_data_path(args)
    site_num = args.site_num
    job = FedJob(name=job_name, min_clients=site_num)
    # The histogram controller takes the split mode as an integer flag.
    data_split_mode = 0 if args.data_split_mode == "horizontal" else 1

    # Define the controller workflow and send to server
    if args.training_algo == "histogram":
        from nvflare.app_opt.xgboost.histogram_based_v2.fed_controller import XGBFedController

        controller = XGBFedController(
            num_rounds=args.round_num,
            data_split_mode=data_split_mode,
            secure_training=False,
            xgb_options={"early_stopping_rounds": 3, "use_gpus": False},
            xgb_params={
                "max_depth": 3,
                "eta": 0.1,
                "objective": "binary:logistic",
                "eval_metric": "auc",
                # Honor the CLI flags; these used to be hard-coded to
                # "hist" / 1, which silently ignored --tree_method/--nthread.
                "tree_method": args.tree_method,
                "nthread": args.nthread,
            },
        )

        from nvflare.app_opt.xgboost.histogram_based_v2.fed_executor import FedXGBHistogramExecutor

        # One executor object is shared by every site for the histogram algorithm.
        executor = FedXGBHistogramExecutor(
            data_loader_id="dataloader",
            metrics_writer_id="metrics_writer",
        )
        # Add tensorboard receiver to server
        tb_receiver = TBAnalyticsReceiver(
            tb_folder="tb_events",
        )
        job.to_server(tb_receiver, id="tb_receiver")
    elif args.training_algo == "bagging":
        from nvflare.app_common.workflows.scatter_and_gather import ScatterAndGather

        controller = ScatterAndGather(
            min_clients=args.site_num,
            num_rounds=args.round_num,
            start_round=0,
            aggregator_id="aggregator",
            persistor_id="persistor",
            shareable_generator_id="shareable_generator",
            wait_time_after_min_received=0,
            train_timeout=0,
            allow_empty_global_weights=True,
            task_check_period=0.01,
            persist_every_n_rounds=0,
            snapshot_every_n_rounds=0,
        )
        from nvflare.app_opt.xgboost.tree_based.model_persistor import XGBModelPersistor

        persistor = XGBModelPersistor(save_name="xgboost_model.json")
        from nvflare.app_opt.xgboost.tree_based.shareable_generator import XGBModelShareableGenerator

        shareable_generator = XGBModelShareableGenerator()
        from nvflare.app_opt.xgboost.tree_based.bagging_aggregator import XGBBaggingAggregator

        aggregator = XGBBaggingAggregator()
        job.to_server(persistor, id="persistor")
        job.to_server(shareable_generator, id="shareable_generator")
        job.to_server(aggregator, id="aggregator")
    elif args.training_algo == "cyclic":
        from nvflare.app_common.workflows.cyclic_ctl import CyclicController

        controller = CyclicController(
            # The requested round count is divided by the number of sites.
            num_rounds=args.round_num // args.site_num,
            task_assignment_timeout=60,
            persistor_id="persistor",
            shareable_generator_id="shareable_generator",
            task_check_period=0.01,
            persist_every_n_rounds=0,
            snapshot_every_n_rounds=0,
        )
        from nvflare.app_opt.xgboost.tree_based.model_persistor import XGBModelPersistor

        persistor = XGBModelPersistor(save_name="xgboost_model.json", load_as_dict=False)
        from nvflare.app_opt.xgboost.tree_based.shareable_generator import XGBModelShareableGenerator

        shareable_generator = XGBModelShareableGenerator()
        job.to_server(persistor, id="persistor")
        job.to_server(shareable_generator, id="shareable_generator")
    # send controller to server
    job.to_server(controller, id="xgb_controller")

    # Add executor and other components to clients
    for site_id in range(1, site_num + 1):
        site_name = f"site-{site_id}"
        if args.training_algo in ["bagging", "cyclic"]:
            # Bagging combines trees from all sites; cyclic uses one at a time.
            num_client_bagging = args.site_num if args.training_algo == "bagging" else 1

            from nvflare.app_opt.xgboost.tree_based.executor import FedXGBTreeExecutor

            # Tree-based algorithms get a fresh executor per site.
            executor = FedXGBTreeExecutor(
                data_loader_id="dataloader",
                training_mode=args.training_algo,
                num_client_bagging=num_client_bagging,
                num_local_parallel_tree=1,
                local_subsample=1,
                local_model_path="model.json",
                global_model_path="model_global.json",
                learning_rate=0.1,
                objective="binary:logistic",
                max_depth=3,
                lr_scale=1,
                eval_metric="auc",
                # Honor the CLI flags (previously hard-coded to "hist" / 1).
                tree_method=args.tree_method,
                nthread=args.nthread,
            )
        job.to(executor, site_name)

        dataloader = CSVDataLoader(folder=dataset_path)
        job.to(dataloader, site_name, id="dataloader")

        if args.training_algo in ["histogram"]:
            metrics_writer = TBWriter(event_type="analytix_log_stats")
            job.to(metrics_writer, site_name, id="metrics_writer")

            event_to_fed = ConvertToFedEvent(
                events_to_convert=["analytix_log_stats"],
                fed_event_prefix="fed.",
            )
            job.to(event_to_fed, site_name, id="event_to_fed")

    # Export job config and run the job
    job.export_job("/tmp/nvflare/workspace/fedxgb/jobs/")
    job.simulator_run(f"/tmp/nvflare/workspace/fedxgb/works/{job_name}")


if __name__ == "__main__":
    main()

# Patch text that shared this physical line with the script above (next entries of the patch; the last one continues on the following line), kept verbatim:
# diff --git a/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.2_secure_fed_xgboost/enhancing_fed_xgboost_security.ipynb b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.2_secure_fed_xgboost/enhancing_fed_xgboost_security.ipynb deleted file mode 100644 index 758b75c807..0000000000 --- a/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.2_secure_fed_xgboost/enhancing_fed_xgboost_security.ipynb +++ /dev/null @@ -1,33 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "4793c97e-7897-4e70-860e-0cbce1c33b95", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "nvflare_example", - "language": "python", - "name": "nvflare_example" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.2" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.2_secure_fed_xgboost/figs/secure_hori.png b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.2_secure_fed_xgboost/figs/secure_hori.png new file mode 100644 index 0000000000..6c4ba4d270 Binary files /dev/null and
b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.2_secure_fed_xgboost/figs/secure_hori.png differ diff --git a/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.2_secure_fed_xgboost/figs/secure_vert.png b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.2_secure_fed_xgboost/figs/secure_vert.png new file mode 100644 index 0000000000..0ed982476b Binary files /dev/null and b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.2_secure_fed_xgboost/figs/secure_vert.png differ diff --git a/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.2_secure_fed_xgboost/figs/tree.base.png b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.2_secure_fed_xgboost/figs/tree.base.png new file mode 100644 index 0000000000..c72d9c2ab2 Binary files /dev/null and b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.2_secure_fed_xgboost/figs/tree.base.png differ diff --git a/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.2_secure_fed_xgboost/figs/tree.vert.secure.0.png b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.2_secure_fed_xgboost/figs/tree.vert.secure.0.png new file mode 100644 index 0000000000..34b0dc7e8e Binary files /dev/null and b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.2_secure_fed_xgboost/figs/tree.vert.secure.0.png differ diff --git a/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.2_secure_fed_xgboost/figs/tree.vert.secure.1.png 
b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.2_secure_fed_xgboost/figs/tree.vert.secure.1.png new file mode 100644 index 0000000000..a1d2e7f339 Binary files /dev/null and b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.2_secure_fed_xgboost/figs/tree.vert.secure.1.png differ diff --git a/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.2_secure_fed_xgboost/figs/tree.vert.secure.2.png b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.2_secure_fed_xgboost/figs/tree.vert.secure.2.png new file mode 100644 index 0000000000..baea53f63b Binary files /dev/null and b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.2_secure_fed_xgboost/figs/tree.vert.secure.2.png differ diff --git a/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.2_secure_fed_xgboost/horizontal_fed_xgboost_with_he.ipynb b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.2_secure_fed_xgboost/horizontal_fed_xgboost_with_he.ipynb deleted file mode 100644 index 4a263b78da..0000000000 --- a/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.2_secure_fed_xgboost/horizontal_fed_xgboost_with_he.ipynb +++ /dev/null @@ -1,33 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "4fdffd32-8f84-4db6-8bde-56484d7fa9c0", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "nvflare_example", - "language": "python", - "name": "nvflare_example" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": 
# Patch text that shared this physical line with the script below (tail of the previous patch entry and this file's patch header), kept verbatim:
# "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.2" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.2_secure_fed_xgboost/prepare_data.sh b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.2_secure_fed_xgboost/prepare_data.sh new file mode 100644 index 0000000000..29e6ed5962 --- /dev/null +++ b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.2_secure_fed_xgboost/prepare_data.sh @@ -0,0 +1,35 @@

# Generate every data split used by the secure federated XGBoost example:
# a train/test split, then base, vertical and horizontal splits of the
# training portion for 3 sites.
DATASET_PATH="/tmp/nvflare/dataset/creditcard.csv"
SPLIT_PATH="/tmp/nvflare/dataset/xgb_dataset/"

if [ ! -f "${DATASET_PATH}" ]
then
    echo "Please check if you saved CreditCard dataset in ${DATASET_PATH}"
    # Stop here: every step below reads the dataset (or files derived from it),
    # so continuing would only produce a cascade of follow-up errors.
    exit 1
fi

echo "Generating CreditCard data splits, reading from ${DATASET_PATH}"

echo "Split data to training/validation v.s. testing"
python3 utils/prepare_data_traintest_split.py \
--data_path "${DATASET_PATH}" \
--test_ratio 0.2 \
--out_folder "${SPLIT_PATH}"

# The remaining steps all split the train.csv written by the step above.
echo "Split training/validation data"
OUTPUT_PATH="${SPLIT_PATH}/base_xgb_data"
python3 utils/prepare_data_base.py \
--data_path "${SPLIT_PATH}/train.csv" \
--out_path "${OUTPUT_PATH}"

echo "Split training/validation data vertically"
OUTPUT_PATH="${SPLIT_PATH}/vertical_xgb_data"
python3 utils/prepare_data_vertical.py \
--data_path "${SPLIT_PATH}/train.csv" \
--site_num 3 \
--out_path "${OUTPUT_PATH}"

echo "Split training/validation data horizontally"
OUTPUT_PATH="${SPLIT_PATH}/horizontal_xgb_data"
python3 utils/prepare_data_horizontal.py \
--data_path "${SPLIT_PATH}/train.csv" \
--site_num 3 \
--out_path "${OUTPUT_PATH}"

# Patch text that shared this physical line with the script above (header and first part of the next file in the patch), kept verbatim:
# diff --git a/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.2_secure_fed_xgboost/project.yml b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.2_secure_fed_xgboost/project.yml new file mode 100644 index 0000000000..8d7072ec46 --- /dev/null +++ b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.2_secure_fed_xgboost/project.yml @@ -0,0 +1,75 @@ +api_version: 3 +name: example_project +description: NVIDIA FLARE sample project yaml file + +participants: + # change overseer.example.com to the FQDN of the overseer + - name: overseer + type: overseer + org: nvidia + protocol: https + api_root: /api/v1 + port: 8443 + # change example.com to the FQDN of the server + - name: server1 + type: server + org: nvidia + fed_learn_port: 8002 + admin_port: 8003 + - name: site-1 + type: client + org: nvidia + # listening_host will enable creating one pair of cert/private key for this client + # so it can behave like a server for client api. The value must be a hostname that + # client api can reach via network.
+ # listening_host: site-1-lh + - name: site-2 + type: client + org: nvidia + - name: admin@nvidia.com + type: admin + org: nvidia + role: project_admin + +# The same methods in all builders are called in their order defined in builders section +builders: + - path: nvflare.lighter.impl.workspace.WorkspaceBuilder + args: + template_file: master_template.yml + - path: nvflare.lighter.impl.template.TemplateBuilder + - path: nvflare.lighter.impl.static_file.StaticFileBuilder + args: + # config_folder can be set to inform NVIDIA FLARE where to get configuration + config_folder: config + + # scheme for communication driver (currently supporting the default, grpc, only). + # scheme: grpc + + # app_validator is used to verify if uploaded app has proper structures + # if not set, no app_validator is included in fed_server.json + # app_validator: PATH_TO_YOUR_OWN_APP_VALIDATOR + + # when docker_image is set to a docker image name, docker.sh will be generated on server/client/admin + # docker_image: + + # download_job_url is set to http://download.server.com/ as default in fed_server.json. You can override this + # to different url. + # download_job_url: http://download.server.com/ + + overseer_agent: + path: nvflare.ha.overseer_agent.HttpOverseerAgent + # if overseer_exists is true, args here are ignored. Provisioning + # tool will fill role, name and other local parameters automatically. + # if overseer_exists is false, args in this section will be used. 
+ overseer_exists: true + # args: + # sp_end_point: example1.com.8002:8003 + + - path: nvflare.lighter.impl.cert.CertBuilder + - path: nvflare.lighter.impl.he.HEBuilder + args: + poly_modulus_degree: 8192 + coeff_mod_bit_sizes: [60, 40, 40] + scale_bits: 40 + scheme: CKKS + - path: nvflare.lighter.impl.signature.SignatureBuilder diff --git a/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.2_secure_fed_xgboost/requirements.txt b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.2_secure_fed_xgboost/requirements.txt new file mode 100644 index 0000000000..32af1e00dc --- /dev/null +++ b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.2_secure_fed_xgboost/requirements.txt @@ -0,0 +1,11 @@ +pandas +torch +scikit-learn +tensorboard +kagglehub +shap +matplotlib +tenseal +# require xgboost 2.2 version, for now need to install a nightly build +https://s3-us-west-2.amazonaws.com/xgboost-nightly-builds/federated-secure/xgboost-2.2.0.dev0%2B4601688195708f7c31fcceeb0e0ac735e7311e61-py3-none-manylinux_2_28_x86_64.whl +ipcl_python @ git+https://github.com/intel/pailliercryptolib_python.git@development \ No newline at end of file diff --git a/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.2_secure_fed_xgboost/run_training_standalone.sh b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.2_secure_fed_xgboost/run_training_standalone.sh new file mode 100644 index 0000000000..164638f252 --- /dev/null +++ b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.2_secure_fed_xgboost/run_training_standalone.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash + +workspace_root="/tmp/nvflare/workspace/fedxgb_secure/train_standalone/" +if [ ! 
-e "$workspace_root" ]; then + mkdir -p "$workspace_root" + echo "Directory created: $workspace_root" +else + echo "Directory already exists: $workspace_root" +fi + +dataset_path="/tmp/nvflare/dataset/xgb_dataset/" + +echo "Training baseline CPU" +python3 ./train_standalone/train_base.py --out_path "$workspace_root/base_cpu" --gpu 0 +echo "Training baseline GPU" +python3 ./train_standalone/train_base.py --out_path "$workspace_root/base_gpu" --gpu 1 +echo "Training horizontal CPU non-encrypted" +python3 ./train_standalone/train_federated.py --data_train_root "$dataset_path/horizontal_xgb_data" --out_path "$workspace_root/hori_cpu_non_enc" --vert 0 --gpu 0 --enc 0 +echo "Training horizontal CPU encrypted" +python3 ./train_standalone/train_federated.py --data_train_root "$dataset_path/horizontal_xgb_data" --out_path "$workspace_root/hori_cpu_enc" --vert 0 --gpu 0 --enc 1 +echo "Training horizontal GPU non-encrypted" +python3 ./train_standalone/train_federated.py --data_train_root "$dataset_path/horizontal_xgb_data" --out_path "$workspace_root/hori_gpu_non_enc" --vert 0 --gpu 1 --enc 0 +echo "Training horizontal GPU encrypted" +python3 ./train_standalone/train_federated.py --data_train_root "$dataset_path/horizontal_xgb_data" --out_path "$workspace_root/hori_gpu_enc" --vert 0 --gpu 1 --enc 1 +echo "Training vertical CPU non-encrypted" +python3 ./train_standalone/train_federated.py --data_train_root "$dataset_path/vertical_xgb_data" --out_path "$workspace_root/vert_cpu_non_enc" --vert 1 --gpu 0 --enc 0 +echo "Training vertical CPU encrypted" +python3 ./train_standalone/train_federated.py --data_train_root "$dataset_path/vertical_xgb_data" --out_path "$workspace_root/vert_cpu_enc" --vert 1 --gpu 0 --enc 1 +echo "Training vertical GPU non-encrypted" +python3 ./train_standalone/train_federated.py --data_train_root "$dataset_path/vertical_xgb_data" --out_path "$workspace_root/vert_gpu_non_enc" --vert 1 --gpu 1 --enc 0 +echo "Training vertical GPU encrypted" +python3 
./train_standalone/train_federated.py --data_train_root "$dataset_path/vertical_xgb_data" --out_path "$workspace_root/vert_gpu_enc" --vert 1 --gpu 1 --enc 1 \ No newline at end of file diff --git a/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.2_secure_fed_xgboost/secure_fed_xgboost.ipynb b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.2_secure_fed_xgboost/secure_fed_xgboost.ipynb new file mode 100644 index 0000000000..2acbf89b6a --- /dev/null +++ b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.2_secure_fed_xgboost/secure_fed_xgboost.ipynb @@ -0,0 +1,315 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "d8a97688-34ef-4425-9b25-d4c9fb086ad5", + "metadata": {}, + "source": [ + "# Secure Federated XGBoost with Homomorphic Encryption\n", + "This section illustrates the use of [NVIDIA FLARE](https://nvflare.readthedocs.io/en/main/index.html) enabling secure federated [XGBoost](https://github.com/dmlc/xgboost) under both horizontal and vertical collaborations.\n", + "The examples are based on a [finance dataset](https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud) to perform fraud detection.\n", + "\n", + "## Secure Federated Training of XGBoost\n", + "In last section, we visited several mechanisms for training an XGBoost model in a federated learning setting, including histogram-based vertical, histogram-based horizontal, and tree-based horizontal methods. \n", + "\n", + "In this example, we further extend the existing histogram-based horizontal and vertical federated learning approaches to support secure federated learning using homomorphic encryption. 
Depending on the characteristics of the data to be encrypted, we can choose between [CKKS](https://github.com/OpenMined/TenSEAL) and [Paillier](https://github.com/intel/pailliercryptolib_python).\n", + "\n", + "In the following, we illustrate both *histogram-based* *horizontal* and *vertical* federated XGBoost, *with* homomorphic encryption. We leverage the [vertical federated learning with secure features support](https://github.com/dmlc/xgboost/issues/9987) and [horizontal federated learning with secure features support](https://github.com/dmlc/xgboost/issues/10170) in the XGBoost open-source library.\n", + "\n", + "### Secure Vertical Federated Training of XGBoost\n", + "For vertical XGBoost, the active party holds the label, which can be considered “the most valuable asset” for the whole process, and should not be accessed by passive parties. Therefore, the active party in this case is the “major contributor” from a model training perspective, with a concern of leaking this information to passive clients. In this case, the security protection is mainly against passive clients over the label information. \n", + "\n", + "To protect label information for vertical collaboration, at every round of XGBoost after the active party computes the gradients for each sample, the gradients will be encrypted before sending to passive parties (Figure 1). Upon receiving the encrypted gradients (ciphertext), they will be accumulated according to the specific feature distribution at each passive party. The resulting cumulative histograms will be returned to the active party, decrypted, and further used for tree building by the active party.\n", + "\n", + "![secure_vert_hist](./figs/secure_vert.png)\n", + "\n", + "### Secure Horizontal Federated Training of XGBoost\n", + "For horizontal XGBoost, each party holds “equal status” (whole feature and label for partial population), while the federated server performs aggregation, without owning any data. 
Hence in this case, clients have a concern of leaking information to the server, and to each other. Hence, the information to be protected is each client’s local histograms.\n", + "\n", + "To protect the local histograms for horizontal collaboration, the histograms will be encrypted before sending to the federated server for aggregation. The aggregation will then be performed over ciphertexts and the encrypted global histograms will be returned to clients, where they will be decrypted and used for tree building. In this way, the server will have no access to the plaintext histograms, while each client will only learn the global histogram after aggregation, rather than individual local histograms.\n", + "\n", + "![secure_hori_hist](./figs/secure_hori.png)\n", + "\n", + "### Encryption with proper HE schemes\n", + "With multiple libraries covering various HE schemes both with and without GPU support, it is important to properly choose the most efficient scheme for the specific needs of a particular federated XGBoost setting. Let’s look at one example: assume N=5 number of participants, M=200K total number of data samples, J=30 total number of features, and each feature histogram has K=256 slots. Depending on the type of federated learning application (vertical or horizontal), we will need different algorithms. \n", + "\n", + "For vertical application, the encryption target is the individual g/h numbers, and the computation is to add the encrypted numbers according to which histogram slots they fall into. As the number of g/h is the same as the sample number, for each boosting round in theory:\n", + "\n", + "The total encryption needed will be M * 2 = 400k (g and h), and each time encrypts a single number\n", + "The total encrypted addition needed will be (M – K) * 2 * J ≈ 12m\n", + "In this case, an optimal scheme choice would be Paillier because the encryption needs to be performed over a single number.
Using schemes targeting vectors like CKKS would be a significant waste of space. \n", + "\n", + "For horizontal application, on the other hand, the encryption target is the local histograms G/H, and the computation is to add local histograms together to form the global histogram. For each boosting round:\n", + "\n", + "The total encryption needed will be N * 2 = 10 (G and H), and each time encrypts a vector of length J * K = 7680\n", + "The total encrypted addition needed will be (N – 1) * 2 = 8\n", + "In this case, an optimal scheme choice would be CKKS because it is able to handle a histogram vector (with length 7680, for example) in one shot.\n", + "\n", + "We provide encryption solutions both with CPU-only, and with efficient GPU acceleration. " + ] + }, + { + "cell_type": "markdown", + "id": "39f5d29c-0f4b-484e-99d7-75adf79a996c", + "metadata": {}, + "source": [ + "## Setup\n", + "Install required packages for data download and training" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "360512ae-bf5b-4868-893f-1e8df392bc47", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -r requirements.txt" + ] + }, + { + "cell_type": "markdown", + "id": "c66f4223-63a2-438c-97cd-c0bfbdd6c442", + "metadata": {}, + "source": [ + "## Encryption Plugins\n", + "The secure XGBoost requires encryption plugins to work. The plugins are distributed with the NVFlare package. If you build NVFlare from source, you need\n", + "to build the plugins following the instructions in this [README](https://github.com/NVIDIA/NVFlare/blob/main/integration/xgboost/encryption_plugins/README.md)\n", + "\n", + "The build process will generate 2 .so files: libcuda_paillier.so and libnvflare.so.
Configure the path accordingly following the instructions in \n", + "[XGBoost User Guide](https://nvflare.readthedocs.io/en/main/user_guide/federated_xgboost/secure_xgboost_user_guide.html)" + ] + }, + { + "cell_type": "markdown", + "id": "6de13fdf-da09-4672-a1fe-e5653ceb47bc", + "metadata": {}, + "source": [ + "## Data Preparation\n", + "We follow the same data preparation process as regular federated XGBoost without secure features. Download and store data: to run the examples, we use the same data as in the last section. We download the dataset and store it in /tmp/nvflare/dataset/creditcard.csv with the following:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "51e4ed08-fcf2-4774-a802-260ffc74870d", + "metadata": {}, + "outputs": [], + "source": [ + "import kagglehub\n", + "path = kagglehub.dataset_download(\"mlg-ulb/creditcardfraud\")\n", + "! mkdir -p /tmp/nvflare/dataset/\n", + "! cp {path}/creditcard.csv /tmp/nvflare/dataset/" + ] + }, + { + "cell_type": "markdown", + "id": "37c92744-b97b-4745-9efb-d61230c6cacb", + "metadata": {}, + "source": [ + "### Data Split\n", + "To prepare data for further experiments, we perform the following steps:\n", + "1. Split the dataset into training/validation and testing sets. \n", + "2. Split the training/validation set: \n", + " * Into \"train\" and \"valid\" for baseline centralized training.\n", + " * Into \"train\" and \"valid\" for each client under horizontal setting. \n", + " * Into \"train\" and \"valid\" for each client under vertical setting.\n", + "\n", + "Data splits used in this example can be generated with" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "62c65eb5-aa3f-4d6f-bb63-97a9653115cd", + "metadata": {}, + "outputs": [], + "source": [ + "!
bash prepare_data.sh" + ] + }, + { + "cell_type": "markdown", + "id": "39e955ae-d064-4e69-b8fd-c3e685fd83f8", + "metadata": {}, + "source": [ + "This will generate data splits for 3 clients under all experimental settings.\n", + "\n", + "> **_NOTE:_** In this section, we have divided the dataset into separate columns for each site,\n", + "> assuming that the datasets from different sites have already been joined using Private Set\n", + "> Intersection (PSI). However, in practice, each site initially has its own separate dataset. To\n", + "> combine these datasets accurately, you need to use PSI to match records with the same ID across\n", + "> different sites. \n", + "\n", + "> **_NOTE:_** The generated data files will be stored in the folder `/tmp/nvflare/dataset/xgb_dataset/`" + ] + }, + { + "cell_type": "markdown", + "id": "45a8e32b-22e2-44d0-ae02-e1611eea0c0c", + "metadata": {}, + "source": [ + "## Run Baseline and Standalone Experiments\n", + "First, we run the baseline centralized training and standalone federated XGBoost training for comparison.\n", + "In this case, we utilized the `mock` plugin to simulate the homomorphic encryption process. \n", + "For more details regarding federated XGBoost and the interface-plugin design,\n", + "please refer to our [documentation](https://nvflare.readthedocs.io/en/main/user_guide/federated_xgboost/secure_xgboost_user_guide.html).\n", + "\n", + "To run all experiments, we provide a script for all settings." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "be18fcc9-73fc-4c9d-8083-20bb2ea94fbd", + "metadata": {}, + "outputs": [], + "source": [ + "! bash run_training_standalone.sh" + ] + }, + { + "cell_type": "markdown", + "id": "2be6ddce-bf4b-499e-b135-5592c5328967", + "metadata": {}, + "source": [ + "This will cover baseline centralized training, federated xgboost run in the same machine\n", + "(server and clients are running in different processes) with and without secure feature." 
+ ] + }, + { + "cell_type": "markdown", + "id": "feb613fa-045a-4fc5-aa6b-f7a2779619f7", + "metadata": {}, + "source": [ + "## Run Federated Experiments with NVFlare\n", + "We then run the federated XGBoost training using NVFlare Simulator via [JobAPI](https://nvflare.readthedocs.io/en/main/programming_guide/fed_job_api.html), without and with homomorphic encryption. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "297293bf-bf99-4a93-ab91-1757c96e61bb", + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "! python xgb_fl_job.py --data_root /tmp/nvflare/dataset/xgb_dataset/horizontal_xgb_data --data_split_mode horizontal\n", + "! python xgb_fl_job.py --data_root /tmp/nvflare/dataset/xgb_dataset/horizontal_xgb_data --data_split_mode horizontal --secure True\n", + "! python xgb_fl_job.py --data_root /tmp/nvflare/dataset/xgb_dataset/vertical_xgb_data --data_split_mode vertical\n", + "! python xgb_fl_job.py --data_root /tmp/nvflare/dataset/xgb_dataset/vertical_xgb_data --data_split_mode vertical --secure True" + ] + }, + { + "cell_type": "markdown", + "id": "701ad13a-0ca5-47bf-846a-c02ba88ef68a", + "metadata": {}, + "source": [ + "Secure horizontal needs additional tenseal context provisioning:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e5cf9590-89a6-454d-bdd8-076af767efaf", + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "! nvflare provision -p project.yml -w /tmp/nvflare/workspace/fedxgb_secure/train_fl/works/horizontal_secure\n", + "! nvflare simulator /tmp/nvflare/workspace/fedxgb_secure/train_fl/jobs/horizontal_secure -w /tmp/nvflare/workspace/fedxgb_secure/train_fl/works/horizontal_secure/example_project/prod_00/site-1 -n 3 -t 3" + ] + }, + { + "cell_type": "markdown", + "id": "ead8cbd7-58da-49d6-ae94-9ad013521829", + "metadata": {}, + "source": [ + "## Results\n", + "Comparing the AUC results with centralized baseline, we have four observations:\n", + "1. 
The performance of the model trained with homomorphic encryption is identical to its counterpart without encryption.\n", + "2. Vertical federated learning (both secure and non-secure) has the same performance as the centralized baseline.\n", + "3. Horizontal federated learning (both secure and non-secure) has slightly different performance from the centralized baseline. This is because under horizontal FL, the local histogram quantiles are based on the local data distribution, which may not be the same as the global distribution.\n", + "4. GPU leads to different results compared to CPU, which is expected as the GPU involves some data conversions.\n", + "\n", + "Below are sample results for CPU training:\n", + "\n", + "The AUC of vertical learning (both secure and non-secure):\n", + "```\n", + "[0]\teval-auc:0.90515\ttrain-auc:0.92747\n", + "[1]\teval-auc:0.90516\ttrain-auc:0.92748\n", + "[2]\teval-auc:0.90518\ttrain-auc:0.92749\n", + "```\n", + "The AUC of horizontal learning (both secure and non-secure):\n", + "```\n", + "[0]\teval-auc:0.89789\ttrain-auc:0.92732\n", + "[1]\teval-auc:0.89791\ttrain-auc:0.92733\n", + "[2]\teval-auc:0.89791\ttrain-auc:0.92733\n", + "```\n", + "\n", + "Comparing the tree models with centralized baseline, we have the following observations:\n", + "1. Vertical federated learning (non-secure) has exactly the same tree model as the centralized baseline.\n", + "2. Vertical federated learning (secure) has the same tree structures as the centralized baseline; however, it produces different tree records at different parties - because each party holds different feature subsets, as illustrated below.\n", + "3. 
Horizontal federated learning (both secure and non-secure) has different tree models from the centralized baseline.\n", + "\n", + "| ![Tree Structures](./figs/tree.base.png) |\n", + "|:-------------------------------------------------:|\n", + "| *Baseline Model* |\n", + "| ![Tree Structures](./figs/tree.vert.secure.0.png) |\n", + "| *Secure Vertical Model at Party 0* |\n", + "| ![Tree Structures](./figs/tree.vert.secure.1.png) |\n", + "| *Secure Vertical Model at Party 1* |\n", + "| ![Tree Structures](./figs/tree.vert.secure.2.png) |\n", + "| *Secure Vertical Model at Party 2* |\n", + "\n", + "In this case we can notice that Party 0 holds Features 7 and 10, Party 1 holds Features 14, 17, and 12, and Party 2 holds none of the effective features for this tree - parties that do not hold a feature will not, and should not, know its split value.\n", + "\n", + "By combining the feature splits at all parties, the tree structures will be identical to the centralized baseline model.\n", + "\n", + "When comparing the training and validation accuracy as well as the model outputs,\n", + "experiments conducted with NVFlare produce results that are identical\n", + "to those obtained from standalone scripts.\n", + "\n", + "For more information on the secure xgboost user guide please refer to\n", + "https://nvflare.readthedocs.io/en/main/user_guide/federated_xgboost/secure_xgboost_user_guide.html" + ] + }, + { + "cell_type": "markdown", + "id": "8fa00500-6ae1-49fa-82ca-1c5ab13b1339", + "metadata": {}, + "source": [ + "Now that we have covered federated XGBoost under various settings (histogram-based and tree-based, horizontal and vertical, regular and secure), let's have a [recap](../10.3_recap/recap.ipynb)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4da46255-f85d-4ecb-9832-177725afdc62", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.2" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.2_secure_fed_xgboost/train_standalone/train_base.py b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.2_secure_fed_xgboost/train_standalone/train_base.py new file mode 100644 index 0000000000..c3b2fedac1 --- /dev/null +++ b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.2_secure_fed_xgboost/train_standalone/train_base.py @@ -0,0 +1,153 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import os + +import matplotlib.pyplot as plt +import pandas as pd +import shap +import xgboost as xgb + +PRINT_SAMPLE = False + + +def train_base_args_parser(): + parser = argparse.ArgumentParser(description="Train baseline XGBoost model") + parser.add_argument("--gpu", type=int, default=0, help="Whether to use gpu for training, 0 for cpu, 1 for gpu") + parser.add_argument( + "--data_train_root", + type=str, + default="/tmp/nvflare/dataset/xgb_dataset/base_xgb_data", + help="Path to training data folder", + ) + parser.add_argument( + "--data_test_file", + type=str, + default="/tmp/nvflare/dataset/xgb_dataset/test.csv", + help="Path to testing data file", + ) + parser.add_argument( + "--out_path", + type=str, + default="/tmp/nvflare/workspace/fedxgb_secure/train_standalone/base", + help="Output path for the data split file", + ) + return parser + + +def load_test_data(data_path: str): + df = pd.read_csv(data_path) + # Split to feature and label + X = df.iloc[:, 1:] + y = df.iloc[:, 0] + return X, y + + +def main(): + parser = train_base_args_parser() + args = parser.parse_args() + if not os.path.exists(args.out_path): + os.makedirs(args.out_path) + + # Specify file path, rank 0 as the label owner, others as the feature owner + train_path = f"{args.data_train_root}/train.csv" + valid_path = f"{args.data_train_root}/valid.csv" + + # Load file directly to tell the match from loading with DMatrix + df_train = pd.read_csv(train_path, header=None) + if PRINT_SAMPLE: + # print number of rows and columns for each worker + print(f"Direct load: nrow={df_train.shape[0]}, ncol={df_train.shape[1]}") + # print one sample row of the data + print(f"Direct load: one sample row of the data: \n {df_train.iloc[0]}") + + # Load file, file will not be sharded in federated mode. 
+ label = "&label_column=0" + # for Vertical XGBoost, read from csv with label_column and set data_split_mode to 1 for column mode + dtrain = xgb.DMatrix(train_path + f"?format=csv{label}") + dvalid = xgb.DMatrix(valid_path + f"?format=csv{label}") + + if PRINT_SAMPLE: + # print number of rows and columns for each worker + print(f"DMatrix: nrow={dtrain.num_row()}, ncol={dtrain.num_col()}") + # print one sample row of the data + data_sample = dtrain.get_data()[0] + print(f"DMatrix: one sample row of the data: \n {data_sample}") + + # Specify parameters via map, definition are same as c++ version + if args.gpu: + device = "cuda:0" + else: + device = "cpu" + param = { + "max_depth": 3, + "eta": 0.1, + "objective": "binary:logistic", + "eval_metric": "auc", + "tree_method": "hist", + "device": device, + "nthread": 1, + } + + # Specify validations set to watch performance + watchlist = [(dvalid, "eval"), (dtrain, "train")] + num_round = 3 + + # Run training, all the features in training API is available. 
+ bst = xgb.train(param, dtrain, num_round, evals=watchlist) + + # Save the model + bst.save_model(f"{args.out_path}/model.base.json") + xgb.collective.communicator_print("Finished training\n") + + # save feature importance score to file + score = bst.get_score(importance_type="gain") + with open(f"{args.out_path}/feat_importance.base.txt", "w") as f: + for key in score: + f.write(f"{key}: {score[key]}\n") + + # Load test data + X_test, y_test = load_test_data(args.data_test_file) + # construct xgboost DMatrix + dmat_test = xgb.DMatrix(X_test, label=y_test) + + # Explain the model + explainer = shap.TreeExplainer(bst) + explanation = explainer(dmat_test) + + # save the beeswarm plot to png file + shap.plots.beeswarm(explanation, show=False) + img = plt.gcf() + img.savefig(f"{args.out_path}/shap.base.png") + + # dump tree and save to text file + dump = bst.get_dump() + with open(f"{args.out_path}/tree_dump.base.txt", "w") as f: + for tree in dump: + f.write(tree) + + # plot tree and save to png file + xgb.plot_tree(bst, num_trees=0, rankdir="LR") + fig = plt.gcf() + fig.set_size_inches(18, 5) + plt.savefig(f"{args.out_path}/tree.base.png", dpi=100) + + # export tree to dataframe + tree_df = bst.trees_to_dataframe() + tree_df.to_csv(f"{args.out_path}/tree_df.base.csv") + + +if __name__ == "__main__": + main() diff --git a/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.2_secure_fed_xgboost/train_standalone/train_federated.py b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.2_secure_fed_xgboost/train_standalone/train_federated.py new file mode 100644 index 0000000000..1b57aa067f --- /dev/null +++ b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.2_secure_fed_xgboost/train_standalone/train_federated.py @@ -0,0 +1,209 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import multiprocessing +import os +import time + +import matplotlib.pyplot as plt +import pandas as pd +import shap +import xgboost as xgb +import xgboost.federated + +PRINT_SAMPLE = False + + +def train_federated_args_parser(): + parser = argparse.ArgumentParser(description="Train federated XGBoost model") + parser.add_argument("--world_size", type=int, default=3, help="Total number of clients") + parser.add_argument("--gpu", type=int, default=0, help="Whether to use gpu for training, 0 for cpu, 1 for gpu") + parser.add_argument( + "--vert", type=int, default=0, help="Horizontal or vertical training, 0 for horizontal, 1 for vertical" + ) + parser.add_argument( + "--enc", type=int, default=0, help="Whether to use encryption plugin, 0 for non-encrypted, 1 for encrypted" + ) + parser.add_argument( + "--data_train_root", + type=str, + default="/tmp/nvflare/dataset/xgb_dataset/base_xgb_data", + help="Path to training data folder", + ) + parser.add_argument( + "--data_test_file", + type=str, + default="/tmp/nvflare/dataset/xgb_dataset/test.csv", + help="Path to testing data file", + ) + parser.add_argument( + "--out_path", + type=str, + default="/tmp/nvflare/workspace/fedxgb_secure/train_standalone/federated", + help="Output path for the data split file", + ) + return parser + + +def load_test_data(data_path: str): + df = pd.read_csv(data_path) + # Split to feature and label + X = df.iloc[:, 1:] + y = 
df.iloc[:, 0] + return X, y + + +def run_server(port: int, world_size: int) -> None: + xgboost.federated.run_federated_server(port=port, n_workers=world_size) + + +def run_worker(port: int, world_size: int, rank: int, args) -> None: + if args.enc: + plugin = {"name": "mock"} + else: + plugin = {} + communicator_env = { + "dmlc_communicator": "federated", + "federated_server_address": f"localhost:{port}", + "federated_world_size": world_size, + "federated_rank": rank, + "federated_plugin": plugin, + } + + # Always call this before using distributed module + with xgb.collective.CommunicatorContext(**communicator_env): + # Specify file path, rank 0 as the label owner, others as the feature owner + train_path = f"{args.data_train_root}/site-{rank + 1}/train.csv" + valid_path = f"{args.data_train_root}/site-{rank + 1}/valid.csv" + + # Load file directly to tell the match from loading with DMatrix + df_train = pd.read_csv(train_path, header=None) + if PRINT_SAMPLE: + # print number of rows and columns for each worker + print(f"Direct load: rank={rank}, nrow={df_train.shape[0]}, ncol={df_train.shape[1]}") + # print one sample row of the data + print(f"Direct load: rank={rank}, one sample row of the data: \n {df_train.iloc[0]}") + + # Load file, file will not be sharded in federated mode. 
+ if args.vert: + data_split_mode = 1 + if rank == 0: + label = "&label_column=0" + else: + label = "" + else: + data_split_mode = 0 + label = "&label_column=0" + dtrain = xgb.DMatrix(train_path + f"?format=csv{label}", data_split_mode=data_split_mode) + dvalid = xgb.DMatrix(valid_path + f"?format=csv{label}", data_split_mode=data_split_mode) + + if PRINT_SAMPLE: + # print number of rows and columns for each worker + print(f"DMatrix: rank={rank}, nrow={dtrain.num_row()}, ncol={dtrain.num_col()}") + # print one sample row of the data + data_sample = dtrain.get_data()[0] + print(f"DMatrix: rank={rank}, one sample row of the data: \n {data_sample}") + + # Specify parameters via map, definition are same as c++ version + if args.gpu: + device = "cuda:0" + else: + device = "cpu" + param = { + "max_depth": 3, + "eta": 0.1, + "objective": "binary:logistic", + "eval_metric": "auc", + "tree_method": "hist", + "device": device, + "nthread": 1, + } + + # Specify validations set to watch performance + watchlist = [(dvalid, "eval"), (dtrain, "train")] + num_round = 3 + + # Run training, all the features in training API is available. 
+ bst = xgb.train(param, dtrain, num_round, evals=watchlist) + + # Save the model + rank = xgb.collective.get_rank() + bst.save_model(f"{args.out_path}/model.{rank}.json") + xgb.collective.communicator_print("Finished training\n") + + # save feature importance score to file + score = bst.get_score(importance_type="gain") + with open(f"{args.out_path}/feat_importance.{rank}.txt", "w") as f: + for key in score: + f.write(f"{key}: {score[key]}\n") + + # Load test data + X_test, y_test = load_test_data(args.data_test_file) + # construct xgboost DMatrix + dmat_test = xgb.DMatrix(X_test, label=y_test) + + # Explain the model + explainer = shap.TreeExplainer(bst) + explanation = explainer(dmat_test) + + # save the beeswarm plot to png file + shap.plots.beeswarm(explanation, show=False) + img = plt.gcf() + img.savefig(f"{args.out_path}/shap.{rank}.png") + + # dump tree and save to text file + dump = bst.get_dump() + with open(f"{args.out_path}/tree_dump.{rank}.txt", "w") as f: + for tree in dump: + f.write(tree) + + # plot tree and save to png file + xgb.plot_tree(bst, num_trees=0, rankdir="LR") + fig = plt.gcf() + fig.set_size_inches(18, 5) + plt.savefig(f"{args.out_path}/tree.{rank}.png", dpi=100) + + # export tree to dataframe + tree_df = bst.trees_to_dataframe() + tree_df.to_csv(f"{args.out_path}/tree_df.{rank}.csv") + + +def main(): + parser = train_federated_args_parser() + args = parser.parse_args() + if not os.path.exists(args.out_path): + os.makedirs(args.out_path) + + port = 1111 + world_size = args.world_size + + server = multiprocessing.Process(target=run_server, args=(port, world_size)) + server.start() + time.sleep(1) + if not server.is_alive(): + raise Exception("Error starting Federated Learning server") + + workers = [] + for rank in range(world_size): + worker = multiprocessing.Process(target=run_worker, args=(port, world_size, rank, args)) + workers.append(worker) + worker.start() + for worker in workers: + worker.join() + server.terminate() + + +if 
__name__ == "__main__": + main() diff --git a/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.2_secure_fed_xgboost/vertical_fed_xgboost_with_he.ipynb b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.2_secure_fed_xgboost/vertical_fed_xgboost_with_he.ipynb deleted file mode 100644 index 3c7b730b81..0000000000 --- a/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.2_secure_fed_xgboost/vertical_fed_xgboost_with_he.ipynb +++ /dev/null @@ -1,33 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "a30f033f-279e-4111-9e70-bce717231f14", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "nvflare_example", - "language": "python", - "name": "nvflare_example" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.2" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.2_secure_fed_xgboost/xgb_fl_job.py b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.2_secure_fed_xgboost/xgb_fl_job.py new file mode 100644 index 0000000000..ffe21af304 --- /dev/null +++ b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.2_secure_fed_xgboost/xgb_fl_job.py @@ -0,0 +1,130 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse + +from nvflare.app_common.widgets.convert_to_fed_event import ConvertToFedEvent +from nvflare.app_opt.tracking.tb.tb_receiver import TBAnalyticsReceiver +from nvflare.app_opt.tracking.tb.tb_writer import TBWriter +from nvflare.app_opt.xgboost.histogram_based_v2.csv_data_loader import CSVDataLoader +from nvflare.app_opt.xgboost.histogram_based_v2.fed_controller import XGBFedController +from nvflare.app_opt.xgboost.histogram_based_v2.fed_executor import FedXGBHistogramExecutor +from nvflare.job_config.api import FedJob + + +def define_parser(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--data_root", + type=str, + help="Path to dataset files for each site", + ) + parser.add_argument("--site_num", type=int, default=3, help="Total number of sites") + parser.add_argument("--round_num", type=int, default=3, help="Total number of training rounds") + parser.add_argument("--nthread", type=int, default=16, help="nthread for xgboost") + parser.add_argument( + "--tree_method", type=str, default="hist", help="tree_method for xgboost - use hist for best perf" + ) + parser.add_argument( + "--data_split_mode", + type=str, + default="horizontal", + choices=["horizontal", "vertical"], + help="dataset split mode, horizontal or vertical", + ) + parser.add_argument( + "--secure", + type=bool, + default=False, + help="Whether to use secure training", + ) + return parser.parse_args() + + +def _get_job_name(args) -> str: + if args.secure: + return f"{args.data_split_mode}_secure" + else: + return f"{args.data_split_mode}" + + +def main(): + 
args = define_parser() + job_name = _get_job_name(args) + dataset_path = args.data_root + site_num = args.site_num + job = FedJob(name=job_name, min_clients=site_num) + if args.data_split_mode == "horizontal": + data_split_mode = 0 + else: + data_split_mode = 1 + # Define the controller workflow and send to server + controller = XGBFedController( + num_rounds=args.round_num, + data_split_mode=data_split_mode, + secure_training=args.secure, + xgb_options={"early_stopping_rounds": 3, "use_gpus": False}, + xgb_params={ + "max_depth": 3, + "eta": 0.1, + "objective": "binary:logistic", + "eval_metric": "auc", + "tree_method": "hist", + "nthread": 1, + }, + client_ranks={"site-1": 0, "site-2": 1, "site-3": 2}, + in_process=True, + ) + job.to_server(controller, id="xgb_controller") + + # Add tensorboard receiver to server + tb_receiver = TBAnalyticsReceiver( + tb_folder="tb_events", + ) + job.to_server(tb_receiver, id="tb_receiver") + + # Add executor and other components to clients + for site_id in range(1, site_num + 1): + # Define the executor for clients + executor = FedXGBHistogramExecutor( + data_loader_id="dataloader", + metrics_writer_id="metrics_writer", + in_process=True, + ) + job.to(executor, f"site-{site_id}", id="executor") + + dataloader = CSVDataLoader(folder=dataset_path) + job.to(dataloader, f"site-{site_id}", id="dataloader") + + metrics_writer = TBWriter(event_type="analytix_log_stats") + job.to(metrics_writer, f"site-{site_id}", id="metrics_writer") + + event_to_fed = ConvertToFedEvent( + events_to_convert=["analytix_log_stats"], + fed_event_prefix="fed.", + ) + job.to(event_to_fed, f"site-{site_id}", id="event_to_fed") + + # Export job config and run the job + job.export_job("/tmp/nvflare/workspace/fedxgb_secure/train_fl/jobs/") + + # Run the job except for secure horizontal + if args.data_split_mode == "horizontal" and args.secure: + print("Secure horizontal is not supported in this version") + else: + 
job.simulator_run(f"/tmp/nvflare/workspace/fedxgb_secure/train_fl/works/{job_name}") + + +if __name__ == "__main__": + main() diff --git a/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.3_recap/recap.ipynb b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.3_recap/recap.ipynb index 9206f8351b..844ede5574 100644 --- a/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.3_recap/recap.ipynb +++ b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-10_federated_XGBoost/10.3_recap/recap.ipynb @@ -1,9 +1,39 @@ { "cells": [ + { + "cell_type": "markdown", + "id": "d9df414c-f4d6-4527-b201-a978ed73a3b7", + "metadata": {}, + "source": [ + "# Summary of Chapter 10" + ] + }, + { + "cell_type": "markdown", + "id": "47959d8f-e03c-4a46-a469-213dfe1909b6", + "metadata": {}, + "source": [ + "In this chapter, we visited Federated XGBoost enabled by functionalities from both XGBoost and NVFlare.\n", + "\n", + "Specifically, the following items have been covered:\n", + "\n", + "1. **[Federated XGBoost](../10.1_fed_xgboost/fed_xgboost.ipynb)**: histogram-based and tree-based horizontal collaborations, and histogram-based vertical collaboration, are covered\n", + "2. **[Secure Federated XGBoost](../10.2_secure_fed_xgboost/secure_fed_xgboost.ipynb)**: secure federated XGBoost with homomorphic encryption under \n", + "histogram-based horizontal and vertical collaboration.\n", + "\n", + "Key takeaways of this section are:\n", + "\n", + "1. NVFlare enables federated training of XGBoost models, under both horizontal and vertical collaboration modes.\n", + "2. For horizontal collaboration, federated XGBoost can be performed with tree-based and histogram-based methods, while for vertical collaboration, federated XGBoost relies on histogram-based method.\n", + "3. 
To protect key information, homomorphic encryption can be added to histogram-based pipelines to ensure data confidentiality under different configurations and needs.\n", + "\n", + "With NVFlare providing full support of federated XGBoost with secure features, users can benefit from efficient and safe federated pipelines of tree-based machine learning algorithms for their applications.\n" + ] + }, { "cell_type": "code", "execution_count": null, - "id": "001060e8-93da-46c9-a848-2ae40e46e1d3", + "id": "b3b7bd34-89df-486b-9ee3-34ae4427e2a3", "metadata": {}, "outputs": [], "source": [] @@ -11,9 +41,9 @@ ], "metadata": { "kernelspec": { - "display_name": "nvflare_example", + "display_name": "Python 3 (ipykernel)", "language": "python", - "name": "nvflare_example" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -25,7 +55,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.2" + "version": "3.13.2" } }, "nbformat": 4, diff --git a/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-9_flare_low_level_apis/09.2_tasks_and_data_share/modules.py b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-9_flare_low_level_apis/09.2_tasks_and_data_share/modules.py index 62ccfb06f2..f7c0a44985 100644 --- a/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-9_flare_low_level_apis/09.2_tasks_and_data_share/modules.py +++ b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-9_flare_low_level_apis/09.2_tasks_and_data_share/modules.py @@ -22,7 +22,6 @@ class HelloController(Controller): - def control_flow(self, abort_signal: Signal, fl_ctx: FLContext): # Create the task with name "hello" task = Task(name="hello", data=Shareable()) @@ -43,7 +42,6 @@ def stop_controller(self, fl_ctx: FLContext): class HelloExecutor(Executor): - def execute( self, task_name: str, @@ -57,7 +55,6 @@ def execute( class 
HelloDataController(Controller): - def control_flow(self, abort_signal: Signal, fl_ctx: FLContext): # Prepare any extra parameters to send to the clients data = DXO( @@ -84,7 +81,6 @@ def stop_controller(self, fl_ctx: FLContext): class HelloDataExecutor(Executor): - def execute( self, task_name: str, @@ -100,7 +96,6 @@ def execute( class HelloResponseController(Controller): - def control_flow(self, abort_signal: Signal, fl_ctx: FLContext): # Prepare any extra parameters to send to the clients dxo = DXO( @@ -136,7 +131,6 @@ def _process_client_response(self, client_task, fl_ctx: FLContext) -> None: class HelloResponseExecutor(Executor): - def execute( self, task_name: str, diff --git a/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-9_flare_low_level_apis/09.3_p2p_communication/modules.py b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-9_flare_low_level_apis/09.3_p2p_communication/modules.py index 6efc10c997..211a1fcfd4 100644 --- a/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-9_flare_low_level_apis/09.3_p2p_communication/modules.py +++ b/examples/tutorials/self-paced-training/part-4_advanced_federated_learning/chapter-9_flare_low_level_apis/09.3_p2p_communication/modules.py @@ -23,7 +23,6 @@ class BasicController(Controller): - def control_flow(self, abort_signal: Signal, fl_ctx: FLContext): self.broadcast_and_wait( task=Task(name="talk", data=Shareable()), @@ -40,7 +39,6 @@ def stop_controller(self, fl_ctx: FLContext): class P2PExecutor(Executor): - def execute( self, task_name: str, diff --git a/nvflare/app_common/abstract/statistics_spec.py b/nvflare/app_common/abstract/statistics_spec.py index b34dd93f68..2d5a056398 100644 --- a/nvflare/app_common/abstract/statistics_spec.py +++ b/nvflare/app_common/abstract/statistics_spec.py @@ -318,7 +318,7 @@ def failure_count(self, dataset_name: str, feature_name: str) -> int: """ return 0 - def percentiles(self, 
dataset_name: str, feature_name: str, percentiles: List) -> Dict: + def quantiles(self, dataset_name: str, feature_name: str, percentiles: List) -> Dict: """Return failed count for given dataset and feature. To perform data privacy min_count check, failure_count is always required. diff --git a/nvflare/app_common/app_constant.py b/nvflare/app_common/app_constant.py index a7c34b8444..fa9fcd2718 100644 --- a/nvflare/app_common/app_constant.py +++ b/nvflare/app_common/app_constant.py @@ -163,7 +163,7 @@ class StatisticsConstants(AppConstants): STATS_VAR = "var" STATS_STDDEV = "stddev" STATS_HISTOGRAM = "histogram" - STATS_PERCENTILE = "percentile" + STATS_QUANTILE = "quantile" STATS_MAX = "max" STATS_MIN = "min" STATS_FEATURES = "stats_features" @@ -174,8 +174,7 @@ class StatisticsConstants(AppConstants): STATS_BIN_RANGE = "range" STATS_TARGET_STATISTICS = "statistics" - STATS_PERCENTILES_KEY = "percentiles" - STATS_CENTROIDS_KEY = "centroids" + STATS_DIGEST_COORD = "digest_coord" FED_STATS_PRE_RUN = "fed_stats_pre_run" FED_STATS_TASK = "fed_stats" @@ -196,7 +195,7 @@ class StatisticsConstants(AppConstants): STATS_MEAN, STATS_MIN, STATS_MAX, - STATS_PERCENTILE, + STATS_QUANTILE, ], STATS_2nd_STATISTICS: [STATS_HISTOGRAM, STATS_VAR, STATS_STDDEV], } diff --git a/nvflare/app_common/executors/statistics/statistics_task_handler.py b/nvflare/app_common/executors/statistics/statistics_task_handler.py index e3b1bb45a7..560b9a464d 100644 --- a/nvflare/app_common/executors/statistics/statistics_task_handler.py +++ b/nvflare/app_common/executors/statistics/statistics_task_handler.py @@ -23,7 +23,7 @@ from nvflare.app_common.app_constant import StatisticsConstants as StC from nvflare.app_common.statistics.numeric_stats import filter_numeric_features from nvflare.app_common.statistics.statisitcs_objects_decomposer import fobs_registration -from nvflare.app_common.statistics.statistics_config_utils import get_feature_bin_range, get_target_percents +from 
nvflare.app_common.statistics.statistics_config_utils import get_feature_bin_range, get_target_quantiles from nvflare.fuel.utils import fobs from nvflare.security.logging import secure_format_exception @@ -96,7 +96,7 @@ def statistic_functions(self) -> dict: StC.STATS_HISTOGRAM: self.get_histogram, StC.STATS_MAX: self.get_max_value, StC.STATS_MIN: self.get_min_value, - StC.STATS_PERCENTILE: self.get_percentiles_and_centroids, + StC.STATS_QUANTILE: self.get_quantiles_and_centroids, } def _populate_result_statistics(self, statistics_result, ds_features, tm: StatisticConfig, shareable, fl_ctx, fn): @@ -319,7 +319,7 @@ def get_bin_range( return bin_range - def get_percentiles_and_centroids( + def get_quantiles_and_centroids( self, dataset_name: str, feature_name: str, @@ -328,8 +328,8 @@ def get_percentiles_and_centroids( fl_ctx: FLContext, ) -> dict: percentile_config = statistic_configs.config - target_percents = get_target_percents(percentile_config, feature_name) - result = self.stats_generator.percentiles(dataset_name, feature_name, target_percents) + target_percents = get_target_quantiles(percentile_config, feature_name) + result = self.stats_generator.quantiles(dataset_name, feature_name, target_percents) return result def _get_global_value_from_input(self, statistic_key: str, dataset_name: str, feature_name: str, inputs): diff --git a/nvflare/app_common/filters/svt_privacy.py b/nvflare/app_common/filters/svt_privacy.py index 5d168153a6..8381445072 100644 --- a/nvflare/app_common/filters/svt_privacy.py +++ b/nvflare/app_common/filters/svt_privacy.py @@ -45,10 +45,10 @@ def __init__( super().__init__(supported_data_kinds=[DataKind.WEIGHTS, DataKind.WEIGHT_DIFF], data_kinds_to_filter=data_kinds) - self.frac = fraction # fraction of the model to upload - self.eps_1 = epsilon + self.fraction = fraction # fraction of the model to upload + self.epsilon = epsilon self.eps_2 = None # to be derived from eps_1 - self.eps_3 = noise_var + self.noise_var = noise_var 
self.gamma = gamma self.tau = tau self.replace = replace @@ -76,21 +76,21 @@ def process_dxo(self, dxo: DXO, shareable: Shareable, fl_ctx: FLContext) -> Unio ) # precompute thresholds - n_upload = np.minimum(np.ceil(np.float64(delta_w.size) * self.frac), np.float64(delta_w.size)) + n_upload = np.minimum(np.ceil(np.float64(delta_w.size) * self.fraction), np.float64(delta_w.size)) # eps_1: threshold with noise - lambda_rho = self.gamma * 2.0 / self.eps_1 + lambda_rho = self.gamma * 2.0 / self.epsilon threshold = self.tau + np.random.laplace(scale=lambda_rho) # eps_2: query with noise - self.eps_2 = self.eps_1 * (2.0 * n_upload) ** (2.0 / 3.0) + self.eps_2 = self.epsilon * (2.0 * n_upload) ** (2.0 / 3.0) lambda_nu = self.gamma * 4.0 * n_upload / self.eps_2 self.logger.info( "total params: %s, epsilon: %s, " "perparam budget %s, threshold tau: %s + f(eps_1) = %s, " "clip gamma: %s", delta_w.size, - self.eps_1, - self.eps_1 / n_upload, + self.epsilon, + self.epsilon / n_upload, self.tau, threshold, self.gamma, @@ -107,7 +107,7 @@ def process_dxo(self, dxo: DXO, shareable: Shareable, fl_ctx: FLContext) -> Unio self.log_info(fl_ctx, "selected {} responses, requested {}".format(len(accepted), n_upload)) accepted = np.random.choice(accepted, size=np.int64(n_upload), replace=self.replace) # eps_3 return with noise - noise = np.random.laplace(scale=self.gamma * 2.0 / self.eps_3, size=accepted.shape) + noise = np.random.laplace(scale=self.gamma * 2.0 / self.noise_var, size=accepted.shape) self.log_info(fl_ctx, "noise max: {}, median {}".format(np.max(np.abs(noise)), np.median(np.abs(noise)))) delta_w[accepted] = np.clip(delta_w[accepted] + noise, a_min=-self.gamma, a_max=self.gamma) candidate_idx = list(set(np.arange(delta_w.size)) - set(accepted)) diff --git a/nvflare/app_common/statistics/numeric_stats.py b/nvflare/app_common/statistics/numeric_stats.py index 046f0ce0b4..3c6273f75d 100644 --- a/nvflare/app_common/statistics/numeric_stats.py +++ 
b/nvflare/app_common/statistics/numeric_stats.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,17 +15,18 @@ from math import sqrt from typing import Dict, List, TypeVar -from tdigest import TDigest - from nvflare.app_common.abstract.statistics_spec import Bin, BinRange, DataType, Feature, Histogram, HistogramType from nvflare.app_common.app_constant import StatisticsConstants as StC -from nvflare.app_common.statistics.statistics_config_utils import get_target_percents +from nvflare.app_opt.statistics.quantile_stats import get_quantiles +from nvflare.fuel.utils.log_utils import get_module_logger T = TypeVar("T") +logger = get_module_logger(name=__name__) + def get_global_feature_data_types( - client_feature_dts: Dict[str, Dict[str, List[Feature]]] + client_feature_dts: Dict[str, Dict[str, List[Feature]]], ) -> Dict[str, Dict[str, DataType]]: global_feature_data_types = {} for client_name in client_feature_dts: @@ -85,14 +86,8 @@ def get_global_stats( ds_stddev[ds_name][feature] = round(sqrt(feature_vars[feature]), precision) global_metrics[StC.STATS_STDDEV] = ds_stddev - elif metric == StC.STATS_PERCENTILE: - global_digest = {} - for client_name in stats: - - global_digest = aggregate_centroids(stats[client_name], global_digest) - - percent_config = statistic_configs.get(StC.STATS_PERCENTILE) - global_metrics[metric] = compute_percentiles(global_digest, percent_config, precision) + elif metric == StC.STATS_QUANTILE: + global_metrics[metric] = get_quantiles(stats, statistic_configs, precision) return global_metrics @@ -231,42 +226,3 @@ def filter_numeric_features(ds_features: Dict[str, List[Feature]]) -> Dict[str, numeric_ds_features[ds_name] = n_features return numeric_ds_features - - -def aggregate_centroids(metrics: Dict[str, 
Dict[str, Dict]], g_digest: dict) -> dict: - for ds_name in metrics: - if ds_name not in g_digest: - g_digest[ds_name] = {} - - feature_metrics = metrics[ds_name] - for feature_name in feature_metrics: - if feature_metrics[feature_name] is not None: - centroids: List = feature_metrics[feature_name].get(StC.STATS_CENTROIDS_KEY) - if feature_name not in g_digest[ds_name]: - g_digest[ds_name][feature_name] = TDigest() - - for centroid in centroids: - mean = centroid.get("m") - count = centroid.get("c") - g_digest[ds_name][feature_name].update(mean, count) - - return g_digest - - -def compute_percentiles(g_digest: Dict[str, Dict[str, TDigest]], quantile_config: Dict, precision: int = 4) -> dict: - g_ds_metrics = {} - for ds_name in g_digest: - if ds_name not in g_ds_metrics: - g_ds_metrics[ds_name] = {} - - feature_metrics = g_digest[ds_name] - for feature_name in feature_metrics: - digest = feature_metrics[feature_name] - percentiles = get_target_percents(quantile_config, feature_name) - percentile_values = {} - for percentile in percentiles: - percentile_values[percentile] = round(digest.percentile(percentile), precision) - - g_ds_metrics[ds_name][feature_name] = percentile_values - - return g_ds_metrics diff --git a/nvflare/app_common/statistics/statistics_config_utils.py b/nvflare/app_common/statistics/statistics_config_utils.py index b128df2394..76b208c2fc 100644 --- a/nvflare/app_common/statistics/statistics_config_utils.py +++ b/nvflare/app_common/statistics/statistics_config_utils.py @@ -30,7 +30,7 @@ def get_feature_bin_range(feature_name: str, hist_config: dict) -> Optional[List return bin_range -def get_target_percents(percentile_config: dict, feature_name: str): +def get_target_quantiles(percentile_config: dict, feature_name: str): if feature_name in percentile_config: percents = percentile_config.get(feature_name) elif "*" in percentile_config: diff --git a/nvflare/app_common/workflows/statistics_controller.py 
b/nvflare/app_common/workflows/statistics_controller.py index a835c95900..a43b8a531a 100644 --- a/nvflare/app_common/workflows/statistics_controller.py +++ b/nvflare/app_common/workflows/statistics_controller.py @@ -70,7 +70,7 @@ def __init__( "*": {"bins": 20}, "Age": {"bins": 10, "range": [0, 120]} }, - percentile: { + quantile: { "*": [25, 50, 75, 90], "Age": [50, 75, 95] } @@ -211,7 +211,7 @@ def _get_all_statistic_configs(self) -> List[StatisticConfig]: StC.STATS_MEAN: StatisticConfig(StC.STATS_MEAN, {}), StC.STATS_VAR: StatisticConfig(StC.STATS_VAR, {}), StC.STATS_STDDEV: StatisticConfig(StC.STATS_STDDEV, {}), - StC.STATS_PERCENTILE: StatisticConfig(StC.STATS_PERCENTILE, {}), + StC.STATS_QUANTILE: StatisticConfig(StC.STATS_QUANTILE, {}), } if StC.STATS_HISTOGRAM in self.statistic_configs: @@ -409,14 +409,12 @@ def _combine_all_statistics(self): hist: Histogram = self.client_statistics[statistic][client][ds][feature_name] buckets = StatisticsController._apply_histogram_precision(hist.bins, self.precision) result[feature_name][statistic][client][ds] = buckets - elif statistic == StC.STATS_PERCENTILE: - percentiles = self.client_statistics[statistic][client][ds][feature_name][ - StC.STATS_PERCENTILES_KEY - ] - formatted_percentiles = {} - for p in percentiles: - formatted_percentiles[p] = round(percentiles.get(p), self.precision) - result[feature_name][statistic][client][ds] = formatted_percentiles + elif statistic == StC.STATS_QUANTILE: + quantiles = self.client_statistics[statistic][client][ds][feature_name][StC.STATS_QUANTILE] + formatted_quantiles = {} + for p in quantiles: + formatted_quantiles[p] = round(quantiles.get(p), self.precision) + result[feature_name][statistic][client][ds] = formatted_quantiles else: result[feature_name][statistic][client][ds] = round( self.client_statistics[statistic][client][ds][feature_name], self.precision @@ -434,9 +432,9 @@ def _combine_all_statistics(self): if statistic == StC.STATS_HISTOGRAM: hist: Histogram = 
self.global_statistics[statistic][ds][feature_name] result[feature_name][statistic][StC.GLOBAL][ds] = hist.bins - elif statistic == StC.STATS_PERCENTILE: - percentiles = self.global_statistics[statistic][ds][feature_name] - result[feature_name][statistic][StC.GLOBAL][ds] = percentiles + elif statistic == StC.STATS_QUANTILE: + quantiles = self.global_statistics[statistic][ds][feature_name] + result[feature_name][statistic][StC.GLOBAL][ds] = quantiles else: result[feature_name][statistic][StC.GLOBAL].update( {ds: self.global_statistics[statistic][ds][feature_name]} diff --git a/nvflare/app_opt/pt/job_config/base_fed_job.py b/nvflare/app_opt/pt/job_config/base_fed_job.py index 0225064967..952e47244f 100644 --- a/nvflare/app_opt/pt/job_config/base_fed_job.py +++ b/nvflare/app_opt/pt/job_config/base_fed_job.py @@ -16,9 +16,9 @@ from torch import nn as nn +from nvflare.apis.analytix import ANALYTIC_EVENT_TYPE from nvflare.app_common.abstract.model_locator import ModelLocator from nvflare.app_common.abstract.model_persistor import ModelPersistor -from nvflare.app_common.tracking.tracker_types import ANALYTIC_EVENT_TYPE from nvflare.app_common.widgets.convert_to_fed_event import ConvertToFedEvent from nvflare.app_common.widgets.intime_model_selector import IntimeModelSelector from nvflare.app_common.widgets.streaming import AnalyticsReceiver diff --git a/nvflare/app_opt/statistics/df/df_core_statistics.py b/nvflare/app_opt/statistics/df/df_core_statistics.py index 5404889e8e..ad0c0ae4d0 100644 --- a/nvflare/app_opt/statistics/df/df_core_statistics.py +++ b/nvflare/app_opt/statistics/df/df_core_statistics.py @@ -12,23 +12,25 @@ # See the License for the specific language governing permissions and # limitations under the License. 
from abc import ABC +from math import sqrt from typing import Dict, List, Optional import numpy as np import pandas as pd from pandas.core.series import Series -from tdigest import TDigest from nvflare.app_common.abstract.statistics_spec import BinRange, Feature, Histogram, HistogramType, Statistics from nvflare.app_common.app_constant import StatisticsConstants from nvflare.app_common.statistics.numpy_utils import dtype_to_data_type, get_std_histogram_buckets +from nvflare.fuel.utils.import_utils import optional_import class DFStatisticsCore(Statistics, ABC): - def __init__(self): + def __init__(self, max_bin=None): # assumption: the data can be loaded and cached in the memory self.data: Optional[Dict[str, pd.DataFrame]] = None super(DFStatisticsCore, self).__init__() + self.max_bin = max_bin def features(self) -> Dict[str, List[Feature]]: results: Dict[str, List[Feature]] = {} @@ -92,24 +94,25 @@ def min_value(self, dataset_name: str, feature_name: str) -> float: df = self.data[dataset_name] return df[feature_name].min() - def percentiles(self, dataset_name: str, feature_name: str, percents: List) -> Dict: - digest = self._prepare_t_digest(dataset_name, feature_name) + def quantiles(self, dataset_name: str, feature_name: str, percents: List) -> Dict: + TDigest, flag = optional_import("fastdigest", name="TDigest") results = {} + if not flag: + results[StatisticsConstants.STATS_QUANTILE] = {} + return results + + df = self.data[dataset_name] + data = df[feature_name] + max_bin = self.max_bin if self.max_bin else round(sqrt(len(data))) + digest = TDigest(data) + digest.compress(max_bin) + p_results = {} for p in percents: - v = round(digest.percentile(p), 4) + v = round(digest.quantile(p), 4) p_results[p] = v - results[StatisticsConstants.STATS_PERCENTILES_KEY] = p_results + results[StatisticsConstants.STATS_QUANTILE] = p_results - # Extract centroids (mean, count) from the digest to used for merge for the global - x = digest.centroids_to_list() - 
results[StatisticsConstants.STATS_CENTROIDS_KEY] = x + # Extract the Q-Digest into a dictionary + results[StatisticsConstants.STATS_DIGEST_COORD] = digest.to_dict() return results - - def _prepare_t_digest(self, dataset_name: str, feature_name: str) -> TDigest: - df = self.data[dataset_name] - data = df[feature_name] - digest = TDigest() - for value in data: - digest.update(value) - return digest diff --git a/nvflare/app_opt/statistics/quantile_stats.py b/nvflare/app_opt/statistics/quantile_stats.py new file mode 100644 index 0000000000..1daef0e6b7 --- /dev/null +++ b/nvflare/app_opt/statistics/quantile_stats.py @@ -0,0 +1,98 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Dict + +from nvflare.app_common.app_constant import StatisticsConstants as StC +from nvflare.fuel.utils.log_utils import get_module_logger + +try: + from fastdigest import TDigest + + TDIGEST_AVAILABLE = True +except ImportError: + TDIGEST_AVAILABLE = False + + +logger = get_module_logger(name="quantile_stats") + + +def get_quantiles(stats: Dict, statistic_configs: Dict, precision: int): + + logger.info(f"get_quantiles: stats: {TDIGEST_AVAILABLE=}") + + if not TDIGEST_AVAILABLE: + return {} + + global_digest = {} + for client_name in stats: + global_digest = merge_quantiles(stats[client_name], global_digest) + + quantile_config = statistic_configs.get(StC.STATS_QUANTILE) + return compute_quantiles(global_digest, quantile_config, precision) + + +def get_target_quantiles(quantile_config: dict, feature_name: str): + if feature_name in quantile_config: + percents = quantile_config.get(feature_name) + elif "*" in quantile_config: + percents = quantile_config.get("*") + else: + raise ValueError(f"feature: {feature_name} target percents are not defined.") + + return percents + + +def merge_quantiles(metrics: Dict[str, Dict[str, Dict]], g_digest: dict) -> dict: + + if not TDIGEST_AVAILABLE: + return g_digest + + for ds_name in metrics: + if ds_name not in g_digest: + g_digest[ds_name] = {} + + feature_metrics = metrics[ds_name] + for feature_name in feature_metrics: + if feature_metrics[feature_name] is not None: + digest_dict: Dict = feature_metrics[feature_name].get(StC.STATS_DIGEST_COORD) + feature_digest = TDigest.from_dict(digest_dict) + if feature_name not in g_digest[ds_name]: + g_digest[ds_name][feature_name] = feature_digest + else: + g_digest[ds_name][feature_name] = g_digest[ds_name][feature_name].merge(feature_digest) + + return g_digest + + +def compute_quantiles(g_digest: dict, quantile_config: Dict, precision: int) -> Dict: + g_ds_metrics = {} + if not TDIGEST_AVAILABLE: + return g_digest + + for ds_name in g_digest: + if ds_name not in 
g_ds_metrics: + g_ds_metrics[ds_name] = {} + + feature_metrics = g_digest[ds_name] + for feature_name in feature_metrics: + digest = feature_metrics[feature_name] + percentiles = get_target_quantiles(quantile_config, feature_name) + quantile_values = {} + for percentile in percentiles: + quantile_values[percentile] = round(digest.quantile(percentile), precision) + + g_ds_metrics[ds_name][feature_name] = quantile_values + + return g_ds_metrics diff --git a/nvflare/app_opt/tf/job_config/base_fed_job.py b/nvflare/app_opt/tf/job_config/base_fed_job.py index bf77cd1092..4140bf7e56 100644 --- a/nvflare/app_opt/tf/job_config/base_fed_job.py +++ b/nvflare/app_opt/tf/job_config/base_fed_job.py @@ -16,8 +16,8 @@ import tensorflow as tf +from nvflare.apis.analytix import ANALYTIC_EVENT_TYPE from nvflare.app_common.abstract.model_persistor import ModelPersistor -from nvflare.app_common.tracking.tracker_types import ANALYTIC_EVENT_TYPE from nvflare.app_common.widgets.convert_to_fed_event import ConvertToFedEvent from nvflare.app_common.widgets.intime_model_selector import IntimeModelSelector from nvflare.app_common.widgets.streaming import AnalyticsReceiver diff --git a/nvflare/app_opt/xgboost/tree_based/executor.py b/nvflare/app_opt/xgboost/tree_based/executor.py index f4ff475c64..2e1bb45101 100644 --- a/nvflare/app_opt/xgboost/tree_based/executor.py +++ b/nvflare/app_opt/xgboost/tree_based/executor.py @@ -174,7 +174,9 @@ def _local_boost_bagging(self, fl_ctx: FLContext): if self.writer: # note: writing auc before current training step, for passed in global model self.writer.add_scalar( - "AUC", auc, int((self.bst.num_boosted_rounds() - self.num_local_round - 1) / self.num_client_bagging) + "train_metrics", + auc, + int((self.bst.num_boosted_rounds() - self.num_local_round - 1) / self.num_client_bagging), ) return bst @@ -193,7 +195,7 @@ def _local_boost_cyclic(self, fl_ctx: FLContext): f"Client {self.client_id} AUC after training: {auc}", ) if self.writer: - 
self.writer.add_scalar("AUC", auc, self.bst.num_boosted_rounds() - 1) + self.writer.add_scalar("train_metrics", auc, self.bst.num_boosted_rounds() - 1) return self.bst def train( diff --git a/nvflare/dashboard/application/__init__.py b/nvflare/dashboard/application/__init__.py index a100099f7b..5e548fb0f3 100644 --- a/nvflare/dashboard/application/__init__.py +++ b/nvflare/dashboard/application/__init__.py @@ -40,9 +40,13 @@ def init_app(): if credential is None: print("Please set env var NVFL_CREDENTIAL") exit(1) - email = credential.split(":")[0] - pwd = credential.split(":")[1] - Store.seed_user(email, pwd) + parts = credential.split(":") + if len(parts) != 3: + print(f"Invalid value '{credential}' for env var NVFL_CREDENTIAL: it must be email:password:org") + email = parts[0] + pwd = parts[1] + org = parts[2] + Store.seed_user(email, pwd, org) with open(os.path.join(web_root, ".db_init_done"), "ab") as f: f.write(bytes()) return app diff --git a/nvflare/dashboard/application/blob.py b/nvflare/dashboard/application/blob.py index 3f2275b75a..cf0b34b49d 100644 --- a/nvflare/dashboard/application/blob.py +++ b/nvflare/dashboard/application/blob.py @@ -13,301 +13,143 @@ # limitations under the License. 
import io -import json import os +import re import subprocess import tempfile -from nvflare.lighter import tplt_utils, utils - -from .cert import CertPair, Entity, deserialize_ca_key, make_cert +from nvflare.lighter.constants import PropKey +from nvflare.lighter.entity import Project as ProvProject +from nvflare.lighter.impl.aws import AWSBuilder +from nvflare.lighter.impl.azure import AzureBuilder +from nvflare.lighter.impl.cert import CertBuilder +from nvflare.lighter.impl.signature import SignatureBuilder +from nvflare.lighter.impl.static_file import StaticFileBuilder +from nvflare.lighter.impl.workspace import WorkspaceBuilder +from nvflare.lighter.provisioner import Provisioner + +from .cert import deserialize_ca_key from .models import Client, Project, User - -lighter_folder = os.path.dirname(utils.__file__) -template = utils.load_yaml(os.path.join(lighter_folder, "templates", "master_template.yml")) -supported_csps = ["aws", "azure"] -for csp in supported_csps: - csp_template_file = os.path.join(lighter_folder, "templates", f"{csp}_template.yml") - if os.path.exists(csp_template_file): - template.update(utils.load_yaml(csp_template_file)) +from .store import Store, inc_dl -def get_csp_start_script_name(csp): - return f"{csp}_start.sh" +class DummyLogger: + """This dummy logger is used to suppress all log messages generated by the Provisioner, except for errors. + We print error messages to stdout. 
+ """ + def info(self, msg: str): + pass -def gen_overseer(key): - project = Project.query.first() - entity = Entity(project.overseer) - issuer = Entity(project.short_name) - signing_cert_pair = CertPair(issuer, project.root_key, project.root_cert) - cert_pair = make_cert(entity, signing_cert_pair) - with tempfile.TemporaryDirectory() as tmp_dir: - overseer_dir = os.path.join(tmp_dir, entity.name) - dest_dir = os.path.join(overseer_dir, "startup") - os.mkdir(overseer_dir) - os.mkdir(dest_dir) - utils._write( - os.path.join(dest_dir, "start.sh"), - template["start_ovsr_sh"], - "t", - exe=True, - ) - utils._write( - os.path.join(dest_dir, "gunicorn.conf.py"), - utils.sh_replace(template["gunicorn_conf_py"], {"port": "8443"}), - "t", - exe=False, - ) - utils._write_pki(type="overseer", dest_dir=dest_dir, cert_pair=cert_pair, root_cert=project.root_cert) - run_args = ["zip", "-rq", "-P", key, "tmp.zip", "."] - subprocess.run(run_args, cwd=tmp_dir) - fileobj = io.BytesIO() - with open(os.path.join(tmp_dir, "tmp.zip"), "rb") as fo: - fileobj.write(fo.read()) - fileobj.seek(0) - return fileobj, f"{entity.name}.zip" + def error(self, msg: str): + print(f"ERROR: {msg}") + def debug(self, msg: str): + pass -def gen_server(key, first_server=True): - project = Project.query.first() - if first_server: - entity = Entity(project.server1) - fl_port = 8002 - admin_port = 8003 - else: - entity = Entity(project.server2) - fl_port = 8102 - admin_port = 8103 - issuer = Entity(project.short_name) - signing_cert_pair = CertPair(issuer, project.root_key, project.root_cert) - cert_pair = make_cert(entity, signing_cert_pair) + def warning(self, msg: str): + pass - config = json.loads(template["fed_server"]) - server_0 = config["servers"][0] - server_0["name"] = project.short_name - server_0["service"]["target"] = f"{entity.name}:{fl_port}" - server_0["service"]["scheme"] = project.scheme if hasattr(project, "scheme") else "grpc" - server_0["admin_host"] = entity.name - 
server_0["admin_port"] = admin_port - if project.ha_mode: - overseer_agent = {"path": "nvflare.ha.overseer_agent.HttpOverseerAgent"} - overseer_agent["args"] = { - "role": "server", - "overseer_end_point": f"https://{project.overseer}:8443/api/v1", - "project": project.short_name, - "name": entity.name, - "fl_port": str(fl_port), - "admin_port": str(admin_port), - } - else: - overseer_agent = {"path": "nvflare.ha.dummy_overseer_agent.DummyOverseerAgent"} - overseer_agent["args"] = {"sp_end_point": f"{project.server1}:8002:8003"} - config["overseer_agent"] = overseer_agent - replacement_dict = { - "admin_port": admin_port, - "fed_learn_port": fl_port, - "config_folder": "config", - "ha_mode": "true" if project.ha_mode else "false", - "docker_image": project.app_location.split(" ")[-1] if project.app_location else "nvflare/nvflare", - "org_name": "", - "type": "server", - "cln_uid": "", +def _get_provisioner(root_dir: str, scheme, docker_image=None): + overseer_agent = { + "path": "nvflare.ha.dummy_overseer_agent.DummyOverseerAgent", + "overseer_exists": False, + "args": {"sp_end_point": "server:8002:8003"}, } - tplt = tplt_utils.Template(template) + builders = [ + WorkspaceBuilder(), + StaticFileBuilder( + config_folder="config", + scheme=scheme, + docker_image=docker_image, + overseer_agent=overseer_agent, + ), + AWSBuilder(), + AzureBuilder(), + CertBuilder(), + SignatureBuilder(), + ] + return Provisioner(root_dir, builders) + + +def gen_server_blob(key): + return _gen_kit(key) + + +def _gen_kit(download_key, prepare_target_cb=None, **cb_kwargs): + # validate download_key + allowed_pattern = r"^[A-Za-z0-9]+$" + if not re.match(allowed_pattern, download_key): + raise RuntimeError(f"ERROR: detected unsafe download key: {download_key}") + + u = Store.get_user(1) + super_user = u.get("user") + fl_port = 8002 + admin_port = 8003 with tempfile.TemporaryDirectory() as tmp_dir: - server_dir = os.path.join(tmp_dir, entity.name) - dest_dir = os.path.join(server_dir, 
"startup") - os.mkdir(server_dir) - os.mkdir(dest_dir) - utils._write_common( - type="server", - dest_dir=dest_dir, - template=template, - tplt=tplt, - replacement_dict=replacement_dict, - config=config, + project = Project.query.first() + scheme = project.scheme if hasattr(project, "scheme") else "grpc" + docker_image = project.app_location.split(" ")[-1] if project.app_location else "nvflare/nvflare" + provisioner = _get_provisioner(tmp_dir, scheme, docker_image) + + # the root key is protected by password + root_pri_key = deserialize_ca_key(project.root_key) + + prov_project = ProvProject( + project.short_name, + project.description, + props={ + "api_version": 3, + }, + root_private_key=root_pri_key, + serialized_root_cert=project.root_cert, ) - utils._write_pki(type="server", dest_dir=dest_dir, cert_pair=cert_pair, root_cert=project.root_cert) - if not project.ha_mode: - for csp in supported_csps: - utils._write( - os.path.join(dest_dir, get_csp_start_script_name(csp)), - tplt.get_start_sh(csp=csp, type="server", entity=entity), - "t", - exe=True, - ) - signatures = utils.sign_all(dest_dir, deserialize_ca_key(project.root_key)) - json.dump(signatures, open(os.path.join(dest_dir, "signature.json"), "wt")) - - # local folder creation - dest_dir = os.path.join(server_dir, "local") - os.mkdir(dest_dir) - utils._write_local(type="server", dest_dir=dest_dir, template=template) - - # workspace folder file - utils._write( - os.path.join(server_dir, "readme.txt"), - template["readme_fs"], - "t", - ) - run_args = ["zip", "-rq", "-P", key, "tmp.zip", "."] - subprocess.run(run_args, cwd=tmp_dir) - fileobj = io.BytesIO() - with open(os.path.join(tmp_dir, "tmp.zip"), "rb") as fo: - fileobj.write(fo.read()) - fileobj.seek(0) - return fileobj, f"{entity.name}.zip" - - -def gen_client(key, id): - project = Project.query.first() - client = Client.query.get(id) - entity = Entity(client.name, client.organization.name) - issuer = Entity(project.short_name) - signing_cert_pair = 
CertPair(issuer, project.root_key, project.root_cert) - cert_pair = make_cert(entity, signing_cert_pair) - config = json.loads(template["fed_client"]) - config["servers"][0]["name"] = project.short_name - config["servers"][0]["service"]["scheme"] = project.scheme if hasattr(project, "scheme") else "grpc" - replacement_dict = { - "client_name": entity.name, - "config_folder": "config", - "docker_image": project.app_location.split(" ")[-1] if project.app_location else "nvflare/nvflare", - "org_name": entity.org, - "type": "client", - "cln_uid": f"uid={entity.name}", - } - for k in ["client_name", "org_name", "cln_uid"]: - value = replacement_dict[k] - escaped_value = value.replace("'", "\\'") - replacement_dict[k] = escaped_value - - if project.ha_mode: - overseer_agent = {"path": "nvflare.ha.overseer_agent.HttpOverseerAgent"} - overseer_agent["args"] = { - "role": "client", - "overseer_end_point": f"https://{project.overseer}:8443/api/v1", - "project": project.short_name, - "name": entity.name, - } - else: - overseer_agent = {"path": "nvflare.ha.dummy_overseer_agent.DummyOverseerAgent"} - overseer_agent["args"] = {"sp_end_point": f"{project.server1}:8002:8003"} - config["overseer_agent"] = overseer_agent - - tplt = tplt_utils.Template(template) - with tempfile.TemporaryDirectory() as tmp_dir: - client_dir = os.path.join(tmp_dir, entity.name) - dest_dir = os.path.join(client_dir, "startup") - os.mkdir(client_dir) - os.mkdir(dest_dir) - - utils._write_pki(type="client", dest_dir=dest_dir, cert_pair=cert_pair, root_cert=project.root_cert) - utils._write_common( - type="client", - dest_dir=dest_dir, - template=template, - tplt=tplt, - replacement_dict=replacement_dict, - config=config, + # use org of superuser + org = super_user.get("organization", "nvflare") + server_name = project.server1 + server = prov_project.set_server( + name=server_name, + org=org, + props={ + "fed_learn_port": fl_port, + "admin_port": admin_port, + "default_host": server_name, + }, ) - for csp 
in supported_csps: - utils._write( - os.path.join(dest_dir, get_csp_start_script_name(csp)), - tplt.get_start_sh(csp=csp, type="client", entity=entity), - "t", - exe=True, - ) - - signatures = utils.sign_all(dest_dir, deserialize_ca_key(project.root_key)) - json.dump(signatures, open(os.path.join(dest_dir, "signature.json"), "wt")) - - # local folder creation - dest_dir = os.path.join(client_dir, "local") - os.mkdir(dest_dir) - utils._write_local(type="client", dest_dir=dest_dir, template=template, capacity=client.capacity.capacity) - - # workspace folder file - utils._write( - os.path.join(client_dir, "readme.txt"), - template["readme_fc"], - "t", - ) + target = server + if prepare_target_cb is not None: + target = prepare_target_cb(prov_project, **cb_kwargs) - run_args = ["zip", "-rq", "-P", key, "tmp.zip", "."] - subprocess.run(run_args, cwd=tmp_dir) + ctx = provisioner.provision(prov_project, logger=DummyLogger()) + result_dir = ctx.get_result_location() + ent_dir = os.path.join(result_dir, target.name) + subprocess.run(["zip", "-rq", "-P", download_key, "tmp.zip", "."], cwd=ent_dir) fileobj = io.BytesIO() - with open(os.path.join(tmp_dir, "tmp.zip"), "rb") as fo: + with open(os.path.join(ent_dir, "tmp.zip"), "rb") as fo: fileobj.write(fo.read()) fileobj.seek(0) - return fileobj, f"{entity.name}.zip" + return fileobj, f"{target.name}.zip" -def gen_user(key, id): - project = Project.query.first() - server_name = project.server1 - user = User.query.get(id) - entity = Entity(user.email, user.organization.name, user.role.name) - issuer = Entity(project.short_name) - signing_cert_pair = CertPair(issuer, project.root_key, project.root_cert) - cert_pair = make_cert(entity, signing_cert_pair) +def gen_client_blob(key, id): + return _gen_kit(key, _prepare_client, client_id=id) - config = json.loads(template["fed_admin"]) - replacement_dict = {"admin_name": entity.name, "cn": server_name, "admin_port": "8003", "docker_image": ""} - if project.ha_mode: - overseer_agent = 
{"path": "nvflare.ha.overseer_agent.HttpOverseerAgent"} - overseer_agent["args"] = { - "role": "admin", - "overseer_end_point": f"https://{project.overseer}:8443/api/v1", - "project": project.short_name, - "name": entity.name, - } - else: - overseer_agent = {"path": "nvflare.ha.dummy_overseer_agent.DummyOverseerAgent"} - overseer_agent["args"] = {"sp_end_point": f"{project.server1}:8002:8003"} - config["admin"].update({"overseer_agent": overseer_agent}) +def _prepare_client(prov_project: ProvProject, client_id): + client = Client.query.get(client_id) + inc_dl(Client, client_id) + return prov_project.add_client(name=client.name, org=client.organization.name, props={}) - with tempfile.TemporaryDirectory() as tmp_dir: - user_dir = os.path.join(tmp_dir, entity.name) - dest_dir = os.path.join(user_dir, "startup") - os.mkdir(user_dir) - os.mkdir(dest_dir) - utils._write(os.path.join(dest_dir, "fed_admin.json"), json.dumps(config, indent=2), "t") - utils._write( - os.path.join(dest_dir, "fl_admin.sh"), - utils.sh_replace(template["fl_admin_sh"], replacement_dict), - "t", - exe=True, - ) - utils._write_pki(type="client", dest_dir=dest_dir, cert_pair=cert_pair, root_cert=project.root_cert) - signatures = utils.sign_all(dest_dir, deserialize_ca_key(project.root_key)) - json.dump(signatures, open(os.path.join(dest_dir, "signature.json"), "wt")) +def gen_user_blob(key, id): + return _gen_kit(key, _prepare_user, user_id=id) - # local folder creation - dest_dir = os.path.join(user_dir, "local") - os.mkdir(dest_dir) - # workspace folder file - utils._write( - os.path.join(user_dir, "readme.txt"), - template["readme_am"], - "t", - ) - utils._write( - os.path.join(user_dir, "system_info.ipynb"), - utils.sh_replace(template["adm_notebook"], replacement_dict), - "t", - ) - run_args = ["zip", "-rq", "-P", key, "tmp.zip", "."] - subprocess.run(run_args, cwd=tmp_dir) - fileobj = io.BytesIO() - with open(os.path.join(tmp_dir, "tmp.zip"), "rb") as fo: - fileobj.write(fo.read()) - 
fileobj.seek(0) - return fileobj, f"{entity.name}.zip" +def _prepare_user(prov_project: ProvProject, user_id): + user = User.query.get(user_id) + inc_dl(User, user_id) + admin = prov_project.add_admin(name=user.email, org=user.organization.name, props={PropKey.ROLE: user.role.name}) + return admin diff --git a/nvflare/dashboard/application/cert.py b/nvflare/dashboard/application/cert.py index d04b4d27ad..c8027ff55b 100644 --- a/nvflare/dashboard/application/cert.py +++ b/nvflare/dashboard/application/cert.py @@ -12,15 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. -import datetime import os from dataclasses import dataclass -from cryptography import x509 from cryptography.hazmat.backends import default_backend -from cryptography.hazmat.primitives import hashes, serialization -from cryptography.hazmat.primitives.asymmetric import rsa -from cryptography.x509.oid import NameOID +from cryptography.hazmat.primitives import serialization + +from nvflare.lighter.utils import Identity, generate_cert, generate_keys, serialize_cert, serialize_pri_key dashboard_pp = os.environ.get("NVFL_DASHBOARD_PP") if dashboard_pp is not None: @@ -45,73 +43,6 @@ class CertPair: ser_cert: str = None -def serialize_pri_key(pri_key, passphrase=None): - if passphrase is None or not isinstance(passphrase, bytes): - return pri_key.private_bytes( - encoding=serialization.Encoding.PEM, - format=serialization.PrivateFormat.TraditionalOpenSSL, - encryption_algorithm=serialization.NoEncryption(), - ) - else: - return pri_key.private_bytes( - encoding=serialization.Encoding.PEM, - format=serialization.PrivateFormat.TraditionalOpenSSL, - encryption_algorithm=serialization.BestAvailableEncryption(password=passphrase), - ) - - -def serialize_cert(cert): - return cert.public_bytes(serialization.Encoding.PEM) - - -def generate_keys(): - pri_key = rsa.generate_private_key(public_exponent=65537, key_size=2048, backend=default_backend()) - 
pub_key = pri_key.public_key() - return pri_key, pub_key - - -def x509_name(cn_name, org_name=None, role=None): - name = [x509.NameAttribute(NameOID.COMMON_NAME, cn_name)] - if org_name is not None: - name.append(x509.NameAttribute(NameOID.ORGANIZATION_NAME, org_name)) - if role: - name.append(x509.NameAttribute(NameOID.UNSTRUCTURED_NAME, role)) - return x509.Name(name) - - -def generate_cert(subject, issuer, signing_pri_key, subject_pub_key, valid_days=360, ca=False): - x509_subject = x509_name(subject.name, subject.org, subject.role) - x509_issuer = x509_name(issuer.name, issuer.org, issuer.role) - builder = ( - x509.CertificateBuilder() - .subject_name(x509_subject) - .issuer_name(x509_issuer) - .public_key(subject_pub_key) - .serial_number(x509.random_serial_number()) - .not_valid_before(datetime.datetime.utcnow()) - .not_valid_after( - # Our certificate will be valid for 360 days - datetime.datetime.utcnow() - + datetime.timedelta(days=valid_days) - # Sign our certificate with our private key - ) - .add_extension(x509.SubjectAlternativeName([x509.DNSName(subject.name)]), critical=False) - ) - if ca: - builder = ( - builder.add_extension( - x509.SubjectKeyIdentifier.from_public_key(subject_pub_key), - critical=False, - ) - .add_extension( - x509.AuthorityKeyIdentifier.from_issuer_public_key(subject_pub_key), - critical=False, - ) - .add_extension(x509.BasicConstraints(ca=True, path_length=None), critical=False) - ) - return builder.sign(signing_pri_key, hashes.SHA256(), default_backend()) - - def _pack(entity, pri_key, cert, passphrase=None): ser_pri_key = serialize_pri_key(pri_key, passphrase) ser_cert = serialize_cert(cert) @@ -121,17 +52,16 @@ def _pack(entity, pri_key, cert, passphrase=None): def make_root_cert(subject: Entity): pri_key, pub_key = generate_keys() - cert = generate_cert(subject=subject, issuer=subject, signing_pri_key=pri_key, subject_pub_key=pub_key, ca=True) + cert = generate_cert( + subject=Identity(subject.name, subject.org, 
subject.role), + issuer=Identity(subject.name, subject.org, subject.role), + signing_pri_key=pri_key, + subject_pub_key=pub_key, + ca=True, + ) return _pack(subject, pri_key, cert, passphrase=dashboard_pp) -def make_cert(subject: Entity, issuer_cert_pair: CertPair): - pri_key, pub_key = generate_keys() - issuer_pri_key = deserialize_ca_key(issuer_cert_pair.ser_pri_key) - cert = generate_cert(subject, issuer_cert_pair.owner, issuer_pri_key, pub_key, valid_days=360, ca=False) - return _pack(subject, pri_key, cert, passphrase=None) - - def deserialize_ca_key(ser_pri_key): pri_key = serialization.load_pem_private_key(ser_pri_key, password=dashboard_pp, backend=default_backend()) return pri_key diff --git a/nvflare/dashboard/application/clients.py b/nvflare/dashboard/application/clients.py index cf940bfa21..50c215e6b0 100644 --- a/nvflare/dashboard/application/clients.py +++ b/nvflare/dashboard/application/clients.py @@ -18,6 +18,7 @@ from nvflare.dashboard.application.constants import FLARE_DASHBOARD_NAMESPACE +from .blob import gen_client_blob from .store import Store, check_role @@ -82,7 +83,7 @@ def client_blob(id): c, p = check_role(creator_id, get_jwt(), get_jwt_identity()) if p or c: pin = request.json.get("pin") - fileobj, filename = Store.get_client_blob(pin, id) + fileobj, filename = gen_client_blob(pin, id) response = make_response(fileobj.read()) response.headers.set("Content-Type", "zip") response.headers.set("Content-Disposition", f'attachment; filename="{filename}"') diff --git a/nvflare/dashboard/application/project.py b/nvflare/dashboard/application/project.py index 09c007731b..076c2aa966 100644 --- a/nvflare/dashboard/application/project.py +++ b/nvflare/dashboard/application/project.py @@ -20,6 +20,7 @@ from nvflare.dashboard.application.constants import FLARE_DASHBOARD_NAMESPACE from . 
import jwt +from .blob import gen_server_blob from .store import Store @@ -108,12 +109,7 @@ def login(): def overseer_blob(): claims = get_jwt() if claims.get("role") == "project_admin": - pin = request.json.get("pin") - fileobj, filename = Store.get_overseer_blob(pin) - response = make_response(fileobj.read()) - response.headers.set("Content-Type", "zip") - response.headers.set("Content-Disposition", f'attachment; filename="{filename}"') - return response + return jsonify({"status": "unauthorized"}), 403 else: return jsonify({"status": "unauthorized"}), 403 @@ -124,7 +120,7 @@ def server_blob(id): claims = get_jwt() if claims.get("role") == "project_admin": pin = request.json.get("pin") - fileobj, filename = Store.get_server_blob(pin, id == 1) + fileobj, filename = gen_server_blob(pin) response = make_response(fileobj.read()) response.headers.set("Content-Type", "zip") response.headers.set("Content-Disposition", f'attachment; filename="{filename}"') diff --git a/nvflare/dashboard/application/store.py b/nvflare/dashboard/application/store.py index ea62d8a2a6..80deaf70b0 100644 --- a/nvflare/dashboard/application/store.py +++ b/nvflare/dashboard/application/store.py @@ -17,7 +17,6 @@ from werkzeug.security import check_password_hash, generate_password_hash -from .blob import gen_client, gen_overseer, gen_server, gen_user from .cert import Entity, make_root_cert from .models import Capacity, Client, Organization, Project, Role, User, db @@ -64,12 +63,12 @@ def ready(cls): return user.approval_state >= 100 if user else False @classmethod - def seed_user(cls, email, pwd): + def seed_user(cls, email, pwd, org): seed_user = { "name": "super_name", "email": email, "password": pwd, - "organization": "", + "organization": org, "role": "project_admin", "approval_state": 200, } @@ -134,16 +133,6 @@ def get_project(cls): project_dict = cls._add_registered_info(project_dict) return add_ok({"project": project_dict}) - @classmethod - def get_overseer_blob(cls, key): - fileobj, 
filename = gen_overseer(key) - return fileobj, filename - - @classmethod - def get_server_blob(cls, key, first_server=True): - fileobj, filename = gen_server(key, first_server) - return fileobj, filename - @classmethod def get_orgs(cls): all_orgs = Organization.query.all() @@ -256,12 +245,6 @@ def delete_client(cls, id): db.session.commit() return add_ok({}) - @classmethod - def get_client_blob(cls, key, id): - fileobj, filename = gen_client(key, id) - inc_dl(Client, id) - return fileobj, filename - @classmethod def create_user(cls, req): name = req.get("name", "") @@ -374,9 +357,3 @@ def delete_user(cls, id): db.session.commit() return add_ok({}) - - @classmethod - def get_user_blob(cls, key, id): - fileobj, filename = gen_user(key, id) - inc_dl(User, id) - return fileobj, filename diff --git a/nvflare/dashboard/application/users.py b/nvflare/dashboard/application/users.py index 3728c128ec..21afc3c559 100644 --- a/nvflare/dashboard/application/users.py +++ b/nvflare/dashboard/application/users.py @@ -18,6 +18,7 @@ from nvflare.dashboard.application.constants import FLARE_DASHBOARD_NAMESPACE +from .blob import gen_user_blob from .store import Store, check_role @@ -83,7 +84,7 @@ def user_blob(id): if not c and not p: return jsonify({"status": "unauthorized"}), 403 pin = request.json.get("pin") - fileobj, filename = Store.get_user_blob(pin, id) + fileobj, filename = gen_user_blob(pin, id) response = make_response(fileobj.read()) response.headers.set("Content-Type", "zip") response.headers.set("Content-Disposition", f'attachment; filename="{filename}"') diff --git a/nvflare/fuel/flare_api/api_spec.py b/nvflare/fuel/flare_api/api_spec.py index d11e526c3b..35190e9840 100644 --- a/nvflare/fuel/flare_api/api_spec.py +++ b/nvflare/fuel/flare_api/api_spec.py @@ -197,6 +197,29 @@ def download_job_result(self, job_id: str) -> str: """ pass + @abstractmethod + def list_job_components(self, job_id: str) -> List[str]: + """Get the list of additional job components for the 
specified job. + + Args: + job_id (str): ID of the job + + Returns: a list of the additional job components + + """ + pass + + def download_job_components(self, job_id: str) -> str: + """Download additional job components (e.g., ERRORLOG_site-1) for a specified job. + + Args: + job_id (str): ID of the job + + Returns: folder path to the location of the downloaded additional job components + + """ + pass + @abstractmethod def abort_job(self, job_id: str): """Abort the specified job diff --git a/nvflare/fuel/flare_api/flare_api.py b/nvflare/fuel/flare_api/flare_api.py index fbe73cae0b..d2ecbc2ca2 100644 --- a/nvflare/fuel/flare_api/flare_api.py +++ b/nvflare/fuel/flare_api/flare_api.py @@ -370,6 +370,36 @@ def download_job_result(self, job_id: str) -> str: location = meta.get(MetaKey.LOCATION) return location + def list_job_components(self, job_id: str) -> List[str]: + """Get the list of additional job components for the specified job. + + Args: + job_id (str): ID of the job + + Returns: a list of the additional job components + + """ + self._validate_job_id(job_id) + result = self._do_command(AdminCommandNames.LIST_JOB + " " + job_id) + meta = result[ResultKey.META] + job_components_list = meta.get(MetaKey.JOB_COMPONENTS, []) + return job_components_list + + def download_job_components(self, job_id: str) -> str: + """Download additional job components (e.g., ERRORLOG_site-1) for a specified job. + + Args: + job_id (str): ID of the job + + Returns: folder path to the location of the downloaded additional job components + + """ + self._validate_job_id(job_id) + result = self._do_command(AdminCommandNames.DOWNLOAD_JOB_COMPONENTS + " " + job_id) + meta = result[ResultKey.META] + location = meta.get(MetaKey.LOCATION) + return location + def abort_job(self, job_id: str): """Abort the specified job. 
diff --git a/nvflare/lighter/constants.py b/nvflare/lighter/constants.py index 594d23c2b7..434d58b33b 100644 --- a/nvflare/lighter/constants.py +++ b/nvflare/lighter/constants.py @@ -18,7 +18,6 @@ class WorkDir: WORKSPACE = "workspace" WIP = "wip_dir" STATE = "state_dir" - RESOURCES = "resources_dir" CURRENT_PROD_DIR = "current_prod_dir" @@ -63,10 +62,10 @@ class PropKey: class CtxKey(WorkDir, PropKey): PROJECT = "__project__" TEMPLATE = "__template__" + TEMP_FILES_LOADED = "__temp_files_loaded__" PROVISION_MODE = "__provision_model__" LOGGER = "__logger__" LAST_PROD_STAGE = "last_prod_stage" - TEMPLATE_FILES = "template_files" SERVER_NAME = "server_name" ROOT_CERT = "root_cert" ROOT_PRI_KEY = "root_pri_key" @@ -135,6 +134,11 @@ class TemplateSectionKey: HELM_CHART_DEPLOYMENT_SERVER = "helm_chart_deployment_server" RELAY_RESOURCES_JSON = "relay_resources_json" FED_RELAY = "fed_relay" + CLOUD_SCRIPT_HEADER = "cloud_script_header" + AZURE_START_SVR_HEADER_SH = "azure_start_svr_header_sh" + AZURE_START_COMMON_SH = "azure_start_common_sh" + AZURE_START_CLN_HEADER_SH = "azure_start_cln_header_sh" + AWS_START_SH = "aws_start_sh" class ProvFileName: @@ -174,6 +178,8 @@ class ProvFileName: CUSTOM_CA_CERT_FILE_NAME = "customRootCA.pem" RELAY_RESOURCES_JSON = "relay__resources.json" FED_RELAY_JSON = "fed_relay.json" + AZURE_START_SH = "azure_start.sh" + AWS_START_SH = "aws_start.sh" class CertFileBasename: diff --git a/nvflare/lighter/ctx.py b/nvflare/lighter/ctx.py index 6576f0b243..c0eba3fca1 100644 --- a/nvflare/lighter/ctx.py +++ b/nvflare/lighter/ctx.py @@ -13,11 +13,13 @@ # limitations under the License. 
import json import os -from typing import Optional +from typing import List, Optional, Union import yaml +import nvflare.lighter as prov from nvflare.lighter import utils +from nvflare.lighter.utils import load_yaml from .constants import CtxKey, PropKey, ProvisionMode from .entity import Entity, Project @@ -30,9 +32,8 @@ def __init__(self, workspace_root_dir: str, project: Project): wip_dir = os.path.join(workspace_root_dir, "wip") state_dir = os.path.join(workspace_root_dir, "state") - resources_dir = os.path.join(workspace_root_dir, "resources") - self.update({CtxKey.WIP: wip_dir, CtxKey.STATE: state_dir, CtxKey.RESOURCES: resources_dir}) - dirs = [workspace_root_dir, resources_dir, wip_dir, state_dir] + self.update({CtxKey.WIP: wip_dir, CtxKey.STATE: state_dir}) + dirs = [workspace_root_dir, wip_dir, state_dir] utils.make_dirs(dirs) # set commonly used data into ctx @@ -44,18 +45,30 @@ def __init__(self, workspace_root_dir: str, project: Project): fed_learn_port = server.get_prop(PropKey.FED_LEARN_PORT, 8002) self[CtxKey.FED_LEARN_PORT] = fed_learn_port self[CtxKey.SERVER_NAME] = server.name + self[CtxKey.TEMP_FILES_LOADED] = [] + self[CtxKey.TEMPLATE] = {} def get_project(self) -> Project: return self.get(CtxKey.PROJECT) - def set_template(self, template: dict): - self[CtxKey.TEMPLATE] = template + def load_templates(self, temp_files: Union[str, List[str]]): + if isinstance(temp_files, str): + temp_files = [temp_files] + elif not isinstance(temp_files, list): + raise ValueError(f"temp_files must be str or List[str] but got {type(temp_files)}") - def get_template(self): - return self.get(CtxKey.TEMPLATE) + prov_folder = os.path.dirname(prov.__file__) + temp_folder = os.path.join(prov_folder, "templates") + + loaded = self[CtxKey.TEMP_FILES_LOADED] + template = self[CtxKey.TEMPLATE] + for f in temp_files: + if f not in loaded: + template.update(load_yaml(os.path.join(temp_folder, f))) + loaded.append(f) def get_template_section(self, section_key: str): - 
template = self.get_template() + template = self.get(CtxKey.TEMPLATE) if not template: raise RuntimeError("template is not available") @@ -98,9 +111,6 @@ def get_local_dir(self, entity: Entity): def get_state_dir(self): return self.get(CtxKey.STATE) - def get_resources_dir(self): - return self.get(CtxKey.RESOURCES) - def get_workspace(self): return self.get(CtxKey.WORKSPACE) @@ -113,7 +123,7 @@ def json_load_template_section(self, section_key: str): def build_from_template( self, dest_dir: str, - temp_section: str, + temp_section: Union[str, List[str]], file_name, replacement=None, mode="t", @@ -140,12 +150,20 @@ def build_from_template( def build_section_from_template( self, - temp_section: str, + temp_section: Union[str, List[str]], replacement=None, content_modify_cb=None, **cb_kwargs, ): - section = self.get_template_section(temp_section) + if isinstance(temp_section, str): + temp_section = [temp_section] + elif not isinstance(temp_section, list): + raise ValueError(f"temp_section must be str or List[str] but got {type(temp_section)}") + + section = "" + for s in temp_section: + section += self.get_template_section(s) + if replacement: section = utils.sh_replace(section, replacement) if content_modify_cb: diff --git a/nvflare/lighter/entity.py b/nvflare/lighter/entity.py index 3175063ff3..a2bec1bcfa 100644 --- a/nvflare/lighter/entity.py +++ b/nvflare/lighter/entity.py @@ -315,7 +315,7 @@ def __init__( participants=None, props: dict = None, serialized_root_cert=None, - serialized_root_private_key=None, + root_private_key=None, ): """A container class to hold information about this FL project. 
@@ -327,7 +327,7 @@ def __init__( participants: if provided, list of participants of the project props: properties of the project serialized_root_cert: if provided, the root cert to be used for the project - serialized_root_private_key: if provided, the root private key for signing certs of sites and admins + root_private_key: if provided, the root private key for signing certs of sites and admins Raises: ValueError: when participant criteria is violated @@ -335,12 +335,12 @@ def __init__( Entity.__init__(self, "project", name, props) if serialized_root_cert: - if not serialized_root_private_key: - raise ValueError("missing serialized_root_private_key while serialized_root_cert is provided") + if not root_private_key: + raise ValueError("missing root_private_key while serialized_root_cert is provided") self.description = description self.serialized_root_cert = serialized_root_cert - self.serialized_root_private_key = serialized_root_private_key + self.root_private_key = root_private_key self.server = None self.overseer = None self.clients = [] @@ -377,6 +377,7 @@ def set_server(self, name: str, org: str, props: dict): self._check_unique_name(name) self.server = Participant(ParticipantType.SERVER, name, org, props, self) self.all_names[name] = True + return self.server def get_server(self): """Get the server definition. Only one server is supported! 
@@ -424,8 +425,10 @@ def add_participant(self, participant: Participant): def add_client(self, name: str, org: str, props: dict): self._check_unique_name(name) - self.clients.append(Participant(ParticipantType.CLIENT, name, org, props, self)) + client = Participant(ParticipantType.CLIENT, name, org, props, self) + self.clients.append(client) self.all_names[name] = True + return client def get_clients(self): return self.clients @@ -446,6 +449,7 @@ def add_admin(self, name: str, org: str, props: dict): raise ValueError(f"missing role in admin '{name}'") self.admins.append(admin) self.all_names[name] = True + return admin def get_admins(self): return self.admins diff --git a/nvflare/lighter/impl/aws.py b/nvflare/lighter/impl/aws.py new file mode 100644 index 0000000000..f1f0ba3036 --- /dev/null +++ b/nvflare/lighter/impl/aws.py @@ -0,0 +1,65 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from nvflare.lighter.constants import ProvFileName, TemplateSectionKey +from nvflare.lighter.spec import Builder, Project, ProvisionContext + + +class AWSBuilder(Builder): + def __init__(self): + Builder.__init__(self) + + def initialize(self, project: Project, ctx: ProvisionContext): + ctx.load_templates(["master_template.yml", "aws_template.yml"]) + + def build(self, project: Project, ctx: ProvisionContext): + # build server + server = project.get_server() + dest_dir = ctx.get_kit_dir(server) + replacement = { + "type": "server", + "inbound_rule": "aws ec2 authorize-security-group-ingress --region ${REGION} --group-id $sg_id --protocol tcp --port 8002-8003 --cidr 0.0.0.0/0 >> ${LOGFILE}.sec_grp.log", + "cln_uid": "", + "server_name": server.name, + "ORG": server.org, + } + ctx.build_from_template( + dest_dir=dest_dir, + file_name=ProvFileName.AWS_START_SH, + temp_section=[ + TemplateSectionKey.CLOUD_SCRIPT_HEADER, + TemplateSectionKey.AWS_START_SH, + ], + replacement=replacement, + exe=True, + ) + + for client in project.get_clients(): + dest_dir = ctx.get_kit_dir(client) + replacement = { + "type": "client", + "inbound_rule": "", + "cln_uid": f"uid={client.name}", + "ORG": client.org, + } + + ctx.build_from_template( + dest_dir=dest_dir, + file_name=ProvFileName.AWS_START_SH, + temp_section=[ + TemplateSectionKey.CLOUD_SCRIPT_HEADER, + TemplateSectionKey.AWS_START_SH, + ], + replacement=replacement, + exe=True, + ) diff --git a/nvflare/lighter/impl/azure.py b/nvflare/lighter/impl/azure.py new file mode 100644 index 0000000000..7973bc2212 --- /dev/null +++ b/nvflare/lighter/impl/azure.py @@ -0,0 +1,51 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from nvflare.lighter.constants import ProvFileName, TemplateSectionKey +from nvflare.lighter.spec import Builder, Project, ProvisionContext + + +class AzureBuilder(Builder): + def __init__(self): + Builder.__init__(self) + + def initialize(self, project: Project, ctx: ProvisionContext): + ctx.load_templates(["master_template.yml", "azure_template.yml"]) + + def build(self, project: Project, ctx: ProvisionContext): + # build server + server = project.get_server() + dest_dir = ctx.get_kit_dir(server) + ctx.build_from_template( + dest_dir=dest_dir, + file_name=ProvFileName.AZURE_START_SH, + temp_section=[ + TemplateSectionKey.CLOUD_SCRIPT_HEADER, + TemplateSectionKey.AZURE_START_SVR_HEADER_SH, + TemplateSectionKey.AZURE_START_COMMON_SH, + ], + exe=True, + ) + + for participant in project.get_clients(): + dest_dir = ctx.get_kit_dir(participant) + ctx.build_from_template( + dest_dir=dest_dir, + file_name=ProvFileName.AZURE_START_SH, + temp_section=[ + TemplateSectionKey.CLOUD_SCRIPT_HEADER, + TemplateSectionKey.AZURE_START_CLN_HEADER_SH, + TemplateSectionKey.AZURE_START_COMMON_SH, + ], + exe=True, + ) diff --git a/nvflare/lighter/impl/cert.py b/nvflare/lighter/impl/cert.py index 9157ff2038..d380baaf86 100644 --- a/nvflare/lighter/impl/cert.py +++ b/nvflare/lighter/impl/cert.py @@ -12,21 +12,19 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import datetime import json import os from cryptography import x509 from cryptography.hazmat.backends import default_backend -from cryptography.hazmat.primitives import hashes, serialization -from cryptography.hazmat.primitives.asymmetric import rsa +from cryptography.hazmat.primitives import serialization from cryptography.x509.oid import NameOID from nvflare.lighter.constants import CertFileBasename, CtxKey, ParticipantType, PropKey from nvflare.lighter.ctx import ProvisionContext from nvflare.lighter.entity import Participant, Project from nvflare.lighter.spec import Builder -from nvflare.lighter.utils import serialize_cert, serialize_pri_key +from nvflare.lighter.utils import Identity, generate_cert, generate_keys, serialize_cert, serialize_pri_key class _CertState: @@ -115,7 +113,16 @@ def initialize(self, project: Project, ctx: ProvisionContext): self.persistent_state = _CertState(state_dir) state = self.persistent_state - if state.is_available: + if project.root_private_key: + # using project provided credentials + self.serialized_cert = project.serialized_root_cert + self.root_cert = x509.load_pem_x509_certificate(self.serialized_cert, default_backend()) + self.pri_key = project.root_private_key + self.pub_key = self.pri_key.public_key() + self.subject = self.root_cert.subject + self.issuer = self.subject.get_attributes_for_oid(NameOID.COMMON_NAME)[0].value + state.is_available = True + elif state.is_available: state_root_cert = state.get_root_cert() self.serialized_cert = state_root_cert.encode("ascii") self.root_cert = x509.load_pem_x509_certificate(self.serialized_cert, default_backend()) @@ -132,7 +139,7 @@ def initialize(self, project: Project, ctx: ProvisionContext): def _build_root(self, subject, subject_org): assert isinstance(self.persistent_state, _CertState) if not self.persistent_state.is_available: - pri_key, pub_key = self._generate_keys() + pri_key, pub_key = generate_keys() self.issuer = subject self.root_cert = self._generate_cert(subject, 
subject_org, self.issuer, pri_key, pub_key, ca=True) self.pri_key = pri_key @@ -248,7 +255,7 @@ def build(self, project: Project, ctx: ProvisionContext): self._build_write_cert_pair(admin, CertFileBasename.CLIENT, ctx) def get_pri_key_cert(self, participant: Participant): - pri_key, pub_key = self._generate_keys() + pri_key, pub_key = generate_keys() subject = participant.subject subject_org = participant.org if participant.type == ParticipantType.ADMIN: @@ -269,13 +276,7 @@ def get_pri_key_cert(self, participant: Participant): return pri_key, cert @staticmethod - def _generate_keys(): - pri_key = rsa.generate_private_key(public_exponent=65537, key_size=2048, backend=default_backend()) - pub_key = pri_key.public_key() - return pri_key, pub_key - def _generate_cert( - self, subject, subject_org, issuer, @@ -286,58 +287,25 @@ def _generate_cert( role=None, server: Participant = None, ): - x509_subject = self._x509_name(subject, subject_org, role) - x509_issuer = self._x509_name(issuer) - - builder = ( - x509.CertificateBuilder() - .subject_name(x509_subject) - .issuer_name(x509_issuer) - .public_key(subject_pub_key) - .serial_number(x509.random_serial_number()) - .not_valid_before(datetime.datetime.utcnow()) - .not_valid_after( - # Our certificate will be valid for 360 days - datetime.datetime.utcnow() - + datetime.timedelta(days=valid_days) - # Sign our certificate with our private key - ) - ) - if ca: - builder = ( - builder.add_extension( - x509.SubjectKeyIdentifier.from_public_key(subject_pub_key), - critical=False, - ) - .add_extension( - x509.AuthorityKeyIdentifier.from_issuer_public_key(subject_pub_key), - critical=False, - ) - .add_extension(x509.BasicConstraints(ca=True, path_length=None), critical=False) - ) + server_default_host = None + server_additional_hosts = None if server: # This is to generate a server cert. 
# Use SubjectAlternativeName for all host names - default_host = server.get_default_host() - host_names = server.get_prop(PropKey.HOST_NAMES) - sans = [x509.DNSName(default_host)] - if host_names: - for h in host_names: - if h != default_host: - sans.append(x509.DNSName(h)) - builder = builder.add_extension(x509.SubjectAlternativeName(sans), critical=False) - else: - builder = builder.add_extension(x509.SubjectAlternativeName([x509.DNSName(subject)]), critical=False) - return builder.sign(signing_pri_key, hashes.SHA256(), default_backend()) - - def _x509_name(self, cn_name, org_name=None, role=None): - name = [x509.NameAttribute(NameOID.COMMON_NAME, cn_name)] - if org_name is not None: - name.append(x509.NameAttribute(NameOID.ORGANIZATION_NAME, org_name)) - if role: - name.append(x509.NameAttribute(NameOID.UNSTRUCTURED_NAME, role)) - return x509.Name(name) + server_default_host = server.get_default_host() + server_additional_hosts = server.get_prop(PropKey.HOST_NAMES) + + return generate_cert( + subject=Identity(subject, subject_org, role), + issuer=Identity(issuer), + signing_pri_key=signing_pri_key, + subject_pub_key=subject_pub_key, + valid_days=valid_days, + ca=ca, + server_default_host=server_default_host, + server_additional_hosts=server_additional_hosts, + ) def finalize(self, project: Project, ctx: ProvisionContext): assert isinstance(self.persistent_state, _CertState) diff --git a/nvflare/lighter/impl/docker_launcher.py b/nvflare/lighter/impl/docker_launcher.py index b876628041..aa47865881 100644 --- a/nvflare/lighter/impl/docker_launcher.py +++ b/nvflare/lighter/impl/docker_launcher.py @@ -168,6 +168,9 @@ def _build_client(self, client: Participant, ctx: ProvisionContext): exe=True, ) + def initialize(self, project: Project, ctx: ProvisionContext): + ctx.load_templates("docker_launcher_template.yml") + def build(self, project: Project, ctx: ProvisionContext): compose = ctx.yaml_load_template_section(TemplateSectionKey.COMPOSE_YAML) self.services = 
compose.get("services") diff --git a/nvflare/lighter/impl/helm_chart.py b/nvflare/lighter/impl/helm_chart.py index 570e1113eb..2e236b11c7 100644 --- a/nvflare/lighter/impl/helm_chart.py +++ b/nvflare/lighter/impl/helm_chart.py @@ -33,6 +33,7 @@ def __init__(self, docker_image): self.helm_chart_templates_directory = None def initialize(self, project: Project, ctx: ProvisionContext): + ctx.load_templates("master_template.yml") self.helm_chart_directory = os.path.join(ctx.get_wip_dir(), ProvFileName.HELM_CHART_DIR) os.mkdir(self.helm_chart_directory) diff --git a/nvflare/lighter/impl/static_file.py b/nvflare/lighter/impl/static_file.py index 08458e085b..ad410265bf 100644 --- a/nvflare/lighter/impl/static_file.py +++ b/nvflare/lighter/impl/static_file.py @@ -43,7 +43,6 @@ def __init__( download_job_url="", docker_image="", overseer_agent: dict = None, - components="", ): """Build all static files from template. @@ -67,7 +66,6 @@ def __init__( self.download_job_url = download_job_url self.app_validator = app_validator self.overseer_agent = overseer_agent - self.components = components def _build_overseer(self, overseer: Participant, ctx: ProvisionContext): dest_dir = ctx.get_kit_dir(overseer) @@ -829,6 +827,7 @@ def _determine_client_hierarchy(project: Project, ctx: ProvisionContext): ctx[CtxKey.CLIENT_MAP] = client_map def initialize(self, project: Project, ctx: ProvisionContext): + ctx.load_templates("master_template.yml") self._determine_relay_hierarchy(project, ctx) self._determine_client_hierarchy(project, ctx) diff --git a/nvflare/lighter/impl/workspace.py b/nvflare/lighter/impl/workspace.py index a868956861..c1ebff15b3 100644 --- a/nvflare/lighter/impl/workspace.py +++ b/nvflare/lighter/impl/workspace.py @@ -15,10 +15,9 @@ import os import shutil -import nvflare.lighter as prov from nvflare.lighter.constants import CtxKey from nvflare.lighter.spec import Builder, Project, ProvisionContext -from nvflare.lighter.utils import load_yaml, make_dirs +from 
nvflare.lighter.utils import make_dirs class WorkspaceBuilder(Builder): @@ -47,23 +46,7 @@ def __init__(self, template_file=None): Args: template_file: one or more template file names """ - self.template_files = template_file - - def _build_template(self, ctx: ProvisionContext): - prov_folder = os.path.dirname(prov.__file__) - temp_folder = os.path.join(prov_folder, "templates") - - temp_files_to_load = self.template_files - if not temp_files_to_load: - # load everything - temp_files_to_load = [f for f in os.listdir(temp_folder) if os.path.isfile(f)] - elif isinstance(temp_files_to_load, str): - temp_files_to_load = [temp_files_to_load] - - template = dict() - for f in temp_files_to_load: - template.update(load_yaml(os.path.join(temp_folder, f))) - ctx.set_template(template) + self.template_files = template_file # obsolete def initialize(self, project: Project, ctx: ProvisionContext): workspace_dir = ctx.get_workspace() @@ -74,7 +57,6 @@ def initialize(self, project: Project, ctx: ProvisionContext): if stage > last: last = stage ctx[CtxKey.LAST_PROD_STAGE] = last - self._build_template(ctx) def build(self, project: Project, ctx: ProvisionContext): participants = project.get_all_participants() diff --git a/nvflare/lighter/provisioner.py b/nvflare/lighter/provisioner.py index 94894b54fe..057347001c 100644 --- a/nvflare/lighter/provisioner.py +++ b/nvflare/lighter/provisioner.py @@ -89,8 +89,6 @@ def provision(self, project: Project, mode=None, logger=None) -> ProvisionContex workspace_root_dir = os.path.join(self.root_dir, project.name) ctx = ProvisionContext(workspace_root_dir, project) - if self.template: - ctx.set_template(self.template) if not mode: mode = ProvisionMode.NORMAL diff --git a/nvflare/lighter/utils.py b/nvflare/lighter/utils.py index 8cee919cca..909009cb51 100644 --- a/nvflare/lighter/utils.py +++ b/nvflare/lighter/utils.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import datetime import json import os import random @@ -22,23 +23,105 @@ from cryptography import x509 from cryptography.hazmat.backends import default_backend from cryptography.hazmat.primitives import hashes, serialization -from cryptography.hazmat.primitives.asymmetric import padding +from cryptography.hazmat.primitives.asymmetric import padding, rsa +from cryptography.x509.oid import NameOID from nvflare.lighter.tool_consts import NVFLARE_SIG_FILE, NVFLARE_SUBMITTER_CRT_FILE -def serialize_pri_key(pri_key): - return pri_key.private_bytes( - encoding=serialization.Encoding.PEM, - format=serialization.PrivateFormat.TraditionalOpenSSL, - encryption_algorithm=serialization.NoEncryption(), +class Identity: + def __init__(self, name: str, org: str = None, role: str = None): + self.name = name + self.org = org + self.role = role + + +def generate_cert( + subject: Identity, + issuer: Identity, + signing_pri_key, + subject_pub_key, + valid_days=360, + ca=False, + server_default_host=None, + server_additional_hosts=None, +): + if isinstance(server_additional_hosts, str): + server_additional_hosts = [server_additional_hosts] + + x509_subject = x509_name(subject.name, subject.org, subject.role) + x509_issuer = x509_name(issuer.name, issuer.org, issuer.role) + + builder = ( + x509.CertificateBuilder() + .subject_name(x509_subject) + .issuer_name(x509_issuer) + .public_key(subject_pub_key) + .serial_number(x509.random_serial_number()) + .not_valid_before(datetime.datetime.utcnow()) + .not_valid_after(datetime.datetime.utcnow() + datetime.timedelta(days=valid_days)) ) + if ca: + builder = ( + builder.add_extension( + x509.SubjectKeyIdentifier.from_public_key(subject_pub_key), + critical=False, + ) + .add_extension( + x509.AuthorityKeyIdentifier.from_issuer_public_key(subject_pub_key), + critical=False, + ) + .add_extension(x509.BasicConstraints(ca=True, path_length=None), critical=False) + ) + + if server_default_host: + # This is to generate a server cert. 
+ # Use SubjectAlternativeName for all host names + sans = [x509.DNSName(server_default_host)] + if server_additional_hosts: + for h in server_additional_hosts: + if h != server_default_host: + sans.append(x509.DNSName(h)) + builder = builder.add_extension(x509.SubjectAlternativeName(sans), critical=False) + else: + builder = builder.add_extension(x509.SubjectAlternativeName([x509.DNSName(subject.name)]), critical=False) + return builder.sign(signing_pri_key, hashes.SHA256(), default_backend()) + + +def serialize_pri_key(pri_key, passphrase=None): + if passphrase is None or not isinstance(passphrase, bytes): + return pri_key.private_bytes( + encoding=serialization.Encoding.PEM, + format=serialization.PrivateFormat.TraditionalOpenSSL, + encryption_algorithm=serialization.NoEncryption(), + ) + else: + return pri_key.private_bytes( + encoding=serialization.Encoding.PEM, + format=serialization.PrivateFormat.TraditionalOpenSSL, + encryption_algorithm=serialization.BestAvailableEncryption(password=passphrase), + ) def serialize_cert(cert): return cert.public_bytes(serialization.Encoding.PEM) +def generate_keys(): + pri_key = rsa.generate_private_key(public_exponent=65537, key_size=2048, backend=default_backend()) + pub_key = pri_key.public_key() + return pri_key, pub_key + + +def x509_name(cn_name, org_name=None, role=None): + name = [x509.NameAttribute(NameOID.COMMON_NAME, cn_name)] + if org_name is not None: + name.append(x509.NameAttribute(NameOID.ORGANIZATION_NAME, org_name)) + if role: + name.append(x509.NameAttribute(NameOID.UNSTRUCTURED_NAME, role)) + return x509.Name(name) + + def load_crt(path): with open(path, "rb") as f: return load_crt_bytes(f.read()) @@ -326,69 +409,3 @@ def _write(file_full_path, content, mode, exe=False): def write(file_full_path, content, mode, exe=False): _write(file_full_path, content, mode, exe) - - -def _write_common(type, dest_dir, template, tplt, replacement_dict, config): - mapping = {"server": "svr", "client": "cln"} - 
write(os.path.join(dest_dir, f"fed_{type}.json"), json.dumps(config, indent=2), "t") - write( - os.path.join(dest_dir, "docker.sh"), - sh_replace(template[f"docker_{mapping[type]}_sh"], replacement_dict), - "t", - exe=True, - ) - write( - os.path.join(dest_dir, "start.sh"), - sh_replace(template[f"start_{mapping[type]}_sh"], replacement_dict), - "t", - exe=True, - ) - write( - os.path.join(dest_dir, "sub_start.sh"), - sh_replace(tplt.get_sub_start_sh(), replacement_dict), - "t", - exe=True, - ) - write( - os.path.join(dest_dir, "stop_fl.sh"), - template["stop_fl_sh"], - "t", - exe=True, - ) - - -def _write_local(type, dest_dir, template, capacity=""): - write( - os.path.join(dest_dir, "log_config.json.default"), - template["log_config"], - "t", - ) - write( - os.path.join(dest_dir, "privacy.json.sample"), - template["sample_privacy"], - "t", - ) - write( - os.path.join(dest_dir, "authorization.json.default"), - template["default_authz"], - "t", - ) - if type == "server": - resources = json.loads(template["local_server_resources"]) - elif type == "client": - resources = json.loads(template["local_client_resources"]) - for component in resources["components"]: - if "nvflare.app_common.resource_managers.gpu_resource_manager.GPUResourceManager" == component["path"]: - component["args"] = json.loads(capacity) - break - write( - os.path.join(dest_dir, "resources.json.default"), - json.dumps(resources, indent=2), - "t", - ) - - -def _write_pki(type, dest_dir, cert_pair, root_cert): - write(os.path.join(dest_dir, f"{type}.crt"), cert_pair.ser_cert, "b", exe=False) - write(os.path.join(dest_dir, f"{type}.key"), cert_pair.ser_pri_key, "b", exe=False) - write(os.path.join(dest_dir, "rootCA.pem"), root_cert, "b", exe=False) diff --git a/setup.cfg b/setup.cfg index b772acd3b7..386c4e42e9 100644 --- a/setup.cfg +++ b/setup.cfg @@ -36,7 +36,6 @@ install_requires = docker>=6.0 websockets>=10.4 pyhocon - tdigest [options.extras_require] HE = diff --git 
a/tests/unit_test/app_common/statistics/quantile_test.py b/tests/unit_test/app_common/statistics/quantile_test.py new file mode 100644 index 0000000000..38a85fe964 --- /dev/null +++ b/tests/unit_test/app_common/statistics/quantile_test.py @@ -0,0 +1,266 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import json +from typing import List + +import numpy as np +import pandas as pd +import pytest + +from nvflare.apis.fl_context import FLContext +from nvflare.app_common.app_constant import StatisticsConstants +from nvflare.app_opt.statistics.df.df_core_statistics import DFStatisticsCore +from nvflare.app_opt.statistics.quantile_stats import compute_quantiles, merge_quantiles + +try: + from fastdigest import TDigest + + TDIGEST_AVAILABLE = True +except ImportError: + TDIGEST_AVAILABLE = False + + +class MockDFStats(DFStatisticsCore): + def __init__(self, given_median: int): + super().__init__() + self.median = given_median + self.data = {"train": None} + + def initialize(self, fl_ctx: FLContext): + self.load_data() + + def load_data(self): + data = np.concatenate( + (np.arange(0, self.median), [self.median], np.arange(self.median + 1, self.median * 2 + 1)) + ) + + # Shuffle the data to make it unordered + np.random.shuffle(data) + + # Create the DataFrame + df = pd.DataFrame(data, columns=["Feature"]) + self.data = {"train": df} + + +class MockDFStats2(DFStatisticsCore): + def __init__(self, data_array: 
List[int]): + super().__init__() + self.raw_data = data_array + self.data = {"train": None} + + def initialize(self, fl_ctx: FLContext): + self.load_data() + + def load_data(self): + # Create the DataFrame + df = pd.DataFrame(self.raw_data, columns=["Feature"]) + self.data = {"train": df} + + +class TestQuantile: + @pytest.mark.skipif(not TDIGEST_AVAILABLE, reason="fastdigest is not installed") + def test_tdigest1(self): + # Small dataset + data = [1, 2, 3, 4, 5] + fd = TDigest(data) + + # Insert values + np_data = pd.Series(data) + + assert fd.quantile(0.25) == np_data.quantile(0.25) + assert fd.quantile(0.5) == np_data.quantile(0.5) + assert fd.quantile(0.75) == np_data.quantile(0.75) + + @pytest.mark.skipif(not TDIGEST_AVAILABLE, reason="fastdigest is not installed") + def test_tdigest2(self): + # Small dataset + data = [-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5] + fd = TDigest(data) + # Insert values + np_data = pd.Series(data) + + assert fd.quantile(0.25) == np_data.quantile(0.25) + assert fd.quantile(0.5) == np_data.quantile(0.5) + assert fd.quantile(0.75) == np_data.quantile(0.75) + + @pytest.mark.skipif(not TDIGEST_AVAILABLE, reason="fastdigest is not installed") + def test_tdigest3(self): + # Small dataset + data = [-50.0, -40.4, -30.3, -20.3, -10.1, 0, 1.1, 2.2, 3.3, 4.4, 5.5] + fd = TDigest(data) + + np_data = pd.Series(data) + + assert round(fd.quantile(0.25), 2) == np_data.quantile(0.25) + assert round(fd.quantile(0.5), 2) == np_data.quantile(0.5) + assert round(fd.quantile(0.75), 2) == np_data.quantile(0.75) + + @pytest.mark.skipif(not TDIGEST_AVAILABLE, reason="fastdigest is not installed") + def test_tdigest4(self): + # Small dataset + data = [-5, -4, -3, -2, -1, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0] + fd = TDigest(data) + + np_data = pd.Series(data) + + assert round(fd.quantile(0.25), 2) == np_data.quantile(0.25) + assert round(fd.quantile(0.5), 2) == np_data.quantile(0.5) + assert round(fd.quantile(0.75), 2) == np_data.quantile(0.75) + + 
@pytest.mark.skipif(not TDIGEST_AVAILABLE, reason="fastdigest is not installed") + def test_tdigest5(self): + # Small dataset + data1 = [x for x in range(-5, 0)] + data2 = [x * 1.0 for x in range(0, 6)] + data = data1 + data2 + fd = TDigest(data) + + assert fd.quantile(0.5) == 0 + assert fd.quantile(0.1) == -4 + assert fd.quantile(0.9) == 4 + + @pytest.mark.skipif(not TDIGEST_AVAILABLE, reason="fastdigest is not installed") + def test_tdigest6(self): + # Small dataset + data1 = [x for x in range(-10000, 0)] + data2 = [x * 1.0 for x in range(0, 10000 + 1)] + fd = TDigest(data1) + merged_fd = fd.merge(TDigest(data2)) + + fdx = TDigest(data1 + data2) + + np_data = pd.Series(data1 + data2) + + assert fdx.quantile(0.5) == np_data.quantile(0.5) + assert merged_fd.quantile(0.5) == np_data.quantile(0.5) + + @pytest.mark.skipif(not TDIGEST_AVAILABLE, reason="fastdigest is not installed") + def test_tdigest7(self): + median = 10 + data = np.concatenate((np.arange(0, median), [median], np.arange(median + 1, median * 2 + 1))) + # Shuffle the data to make it unordered + np.random.shuffle(data) + + fd = TDigest(data) + + v = fd.quantile(0.5) + + print(sorted(data), v, median) + + assert v == median + + @pytest.mark.skipif(not TDIGEST_AVAILABLE, reason="fastdigest is not installed") + def test_tdigest8(self): + + median = 10 + data = np.concatenate((np.arange(0, median), [median], np.arange(median + 1, median * 2 + 1))) + # Shuffle the data to make it unordered + np.random.shuffle(data) + data1 = [0, 1, 2, 3, 4, 5] + data2 = [100, 110, 120, 130, 140, 150] + + fd1 = TDigest(data1) + fd2 = TDigest(data2) + + fd = TDigest(data) + + fd.merge(fd1) + fd.merge(fd2) + + v = fd.quantile(0.5) + + print(sorted(data), v, median) + + assert v == median + + @pytest.mark.skipif(not TDIGEST_AVAILABLE, reason="fastdigest is not installed") + def test_tdigest_merge_serde(self): + + median = 10 + data = np.concatenate((np.arange(0, median), [median], np.arange(median + 1, median * 2 + 1))) + # 
Shuffle the data to make it unordered + np.random.shuffle(data) + data1 = [0, 1, 2, 3, 4, 5] + data2 = [100, 110, 120, 130, 140, 150] + + fd1 = TDigest(data1) + fd2 = TDigest(data2) + + fd = TDigest(data) + + fd.merge(fd1.from_dict(fd1.to_dict())) + fd.merge(fd2.from_dict(fd2.to_dict())) + + v = fd.quantile(0.5) + + print(sorted(data), v, median) + + assert v == median + + @pytest.mark.skipif(not TDIGEST_AVAILABLE, reason="fastdigest is not installed") + def test_tdigest_compress(self): + + digest = TDigest(range(101)) + print(f"Before: {len(digest)} centroids") + + before_median = digest.quantile(0.5) + before_25 = digest.quantile(0.25) + before_75 = digest.quantile(0.75) + + digest.compress(10) # compress to 10 (or fewer) centroids + + print(f" After: {len(digest)} centroids") + + print(json.dumps(digest.to_dict(), indent=2)) + + after_median = digest.quantile(0.5) + after_25 = digest.quantile(0.25) + after_75 = digest.quantile(0.75) + + assert before_median == after_median + assert before_25 == after_25 + assert before_75 == after_75 + assert len(digest) == 10 + + @pytest.mark.skipif(not TDIGEST_AVAILABLE, reason="fastdigest is not installed") + def test_percentile_metrics(self): + stats_generator = MockDFStats(given_median=100) + stats_generator.load_data() + percentiles = stats_generator.quantiles("train", "Feature", percents=[0.5]) + result = percentiles.get(StatisticsConstants.STATS_QUANTILE) + digest_dict = percentiles.get(StatisticsConstants.STATS_DIGEST_COORD) + assert digest_dict is not None + assert result is not None + print(sorted(stats_generator.data["train"]["Feature"])) + + assert result.get(0.5) == stats_generator.median + + @pytest.mark.skipif(not TDIGEST_AVAILABLE, reason="fastdigest package not installed") + def test_percentile_metrics_aggregation(self): + stats_generators = [ + MockDFStats2(data_array=[0, 1, 2, 3, 4, 5, 6]), + MockDFStats(given_median=10), + MockDFStats2(data_array=[100, 110, 120, 130, 140, 150, 160]), + ] + global_digest = {} 
+ for g in stats_generators: # each site/client + g.load_data() + local_quantiles = g.quantiles("train", "Feature", percents=[0.5]) + local_metrics = {"train": {"Feature": local_quantiles}} + global_digest = merge_quantiles(local_metrics, global_digest) + + result = compute_quantiles(global_digest, {"Feature": [0.5]}, 2) + + expected_median = 10 + assert result["train"]["Feature"].get(0.5) == expected_median diff --git a/tests/unit_test/app_opt/statistics/__init__.py b/tests/unit_test/app_opt/statistics/__init__.py deleted file mode 100644 index d9155f923f..0000000000 --- a/tests/unit_test/app_opt/statistics/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/tests/unit_test/app_opt/statistics/percentiles_test.py b/tests/unit_test/app_opt/statistics/percentiles_test.py deleted file mode 100644 index 8a4aa6a428..0000000000 --- a/tests/unit_test/app_opt/statistics/percentiles_test.py +++ /dev/null @@ -1,89 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import List - -import numpy as np -import pandas as pd - -from nvflare.apis.fl_context import FLContext -from nvflare.app_common.app_constant import StatisticsConstants -from nvflare.app_common.statistics.numeric_stats import aggregate_centroids, compute_percentiles -from nvflare.app_opt.statistics.df.df_core_statistics import DFStatisticsCore - - -class MockDFStats(DFStatisticsCore): - def __init__(self, given_median: int): - super().__init__() - self.median = given_median - self.data = {"train": None} - - def initialize(self, fl_ctx: FLContext): - self.load_data() - - def load_data(self): - data = np.concatenate( - (np.arange(0, self.median), [self.median], np.arange(self.median + 1, self.median * 2 + 1)) - ) - - # Shuffle the data to make it unordered - np.random.shuffle(data) - - # Create the DataFrame - df = pd.DataFrame(data, columns=["Feature"]) - self.data = {"train": df} - - -class MockDFStats2(DFStatisticsCore): - def __init__(self, data_array: List[int]): - super().__init__() - self.raw_data = data_array - self.data = {"train": None} - - def initialize(self, fl_ctx: FLContext): - self.load_data() - - def load_data(self): - # Create the DataFrame - df = pd.DataFrame(self.raw_data, columns=["Feature"]) - self.data = {"train": df} - - -class TestPercentiles: - def test_percentile_metrics(self): - stats_generator = MockDFStats(given_median=100) - stats_generator.load_data() - percentiles = stats_generator.percentiles("train", "Feature", percents=[50]) - result = percentiles.get(StatisticsConstants.STATS_PERCENTILES_KEY) - 
print(f"{percentiles=}") - assert result is not None - assert result.get(50) == stats_generator.median - - def test_percentile_metrics_aggregation(self): - stats_generators = [ - MockDFStats2(data_array=[0, 1, 2, 3, 4, 5]), - MockDFStats(given_median=10), - MockDFStats2(data_array=[100, 110, 120, 130, 140, 150]), - ] - global_digest = {} - result = {} - for g in stats_generators: # each site/client - g.load_data() - local_percentiles = g.percentiles("train", "Feature", percents=[50]) - local_metrics = {"train": {"Feature": local_percentiles}} - aggregate_centroids(local_metrics, global_digest) - result = compute_percentiles(global_digest, {"Feature": [50]}, 2) - - expected_median = 10 - assert result["train"]["Feature"].get(50) == expected_median diff --git a/tests/unit_test/dashboard/conftest.py b/tests/unit_test/dashboard/conftest.py index 26b110674a..d4f07f6dee 100644 --- a/tests/unit_test/dashboard/conftest.py +++ b/tests/unit_test/dashboard/conftest.py @@ -32,7 +32,7 @@ def app(): if os.path.exists(sqlite_file): os.remove(sqlite_file) os.environ["DATABASE_URL"] = f"sqlite:///{sqlite_file}" - os.environ["NVFL_CREDENTIAL"] = f"{TEST_USER}:{TEST_PW}" + os.environ["NVFL_CREDENTIAL"] = f"{TEST_USER}:{TEST_PW}:nvidia" app = init_app() app.config.update( {