Merge branch 'main' into gnn_enc

NVIDIA · Oct 9, 2024 · a35ae31 · a35ae31
2 parents 9cc9fc3 + d2d1f96
commit a35ae31
Show file tree

Hide file tree

Showing 100 changed files with 2,927 additions and 4,239 deletions.
diff --git a/ci/run_integration.sh b/ci/run_integration.sh
@@ -45,8 +45,14 @@ remove_pipenv() {
 
 integration_test_tf() {
     echo "Run TF integration test..."
-    # not using pipenv because we need tensorflow package from the container
-    python -m pip install -e .[dev]
+    # since running directly in container, point python to python3.12
+    ln -sfn /usr/bin/python3.12 /usr/bin/python
+    ln -sfn /usr/bin/python3.12 /usr/bin/python3
+    # somehow the base container has blinker which should be removed
+    apt remove -y python3-blinker python-blinker-doc || true
+    # pipenv does not work with TensorFlow so using pip
+    python3.12 -m pip install -e .[dev]
+    python3.12 -m pip install tensorflow[and-cuda]
     export PYTHONPATH=$PWD
     testFolder="tests/integration_test"
     clean_up_snapshot_and_job

diff --git a/docker/Dockerfile b/docker/Dockerfile
@@ -1,3 +1,7 @@
+# For Running NVIDIA FLARE in a Docker container, see
+# https://nvflare.readthedocs.io/en/main/quickstart.html#containerized-deployment-with-docker
+# This Dockerfile is primarily for building Docker images to publish for dashboard.
+
 FROM python:3.8
 RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get -y install zip
 COPY nvflare /opt/NVFlare/nvflare

diff --git a/docs/examples/federated_statistics_overview.rst b/docs/examples/federated_statistics_overview.rst
@@ -177,7 +177,3 @@ Summary
 We provided federated statistics operators that can easily aggregate and visualize the local statistics for different data sites and features.
 We hope this feature will make it easier to perform federated data analysis. For more details, please look at :github_nvflare_link:`Federated Statistics (Github) <examples/advanced/federated-statistics/README.md>`
 
-Previous Versions of Federated XGBoost
---------------------------------------
-
-   - `Federated XGBoost for 2.2 <https://github.com/NVIDIA/NVFlare/tree/2.2/examples/xgboost>`_
diff --git a/docs/examples/hello_world_examples.rst b/docs/examples/hello_world_examples.rst
@@ -5,11 +5,11 @@ Hello World examples can be run from the :github_nvflare_link:`hello_world noteb
 
 .. toctree::
 
-   Deep Learning to Federated Learning (GitHub) <https://github.com/NVIDIA/NVFlare/blob/main/examples/hello-world/ml-to-fl>
-   Step-by-Step Examples (GitHub) <https://github.com/NVIDIA/NVFlare/blob/main/examples/hello-world/step-by-step>
+   :github_nvflare_link:`Deep Learning to Federated Learning (GitHub) <examples/hello-world/ml-to-fl>`
+   :github_nvflare_link:`Step-by-Step Examples (GitHub) <examples/hello-world/step-by-step>`
    hello_fedavg_numpy
    hello_cross_val
-   Hello Cyclic Weight Transfer (GitHub) <https://github.com/NVIDIA/NVFlare/blob/main/examples/hello-world/hello-cyclic>
+   :github_nvflare_link:`Hello Cyclic Weight Transfer (GitHub) <examples/hello-world/hello-cyclic>`
    hello_pt_job_api
    hello_tf_job_api
-   Hello Client Controlled Workflow (GitHub) <https://github.com/NVIDIA/NVFlare/blob/main/examples/hello-world/hello-ccwf>
+   :github_nvflare_link:`Hello Client Controlled Workflow (GitHub) <examples/hello-world/hello-ccwf>`
diff --git a/docs/examples/medical_image_analysis.rst b/docs/examples/medical_image_analysis.rst
@@ -4,6 +4,6 @@ Medical Image Analysis
 
 .. toctree::
 
-   Hello MONAI Bundle (GitHub) <https://github.com/NVIDIA/NVFlare/tree/main/integration/monai/examples/spleen_ct_segmentation_sim>
-   Differential Privacy for BraTS18 Segmentation (GitHub) <https://github.com/NVIDIA/NVFlare/tree/main/examples/advanced/brats18>
-   Prostate Segmentation from Multi-source Data (GitHub) <https://github.com/NVIDIA/NVFlare/tree/main/examples/advanced/prostate>
+   :github_nvflare_link:`Hello MONAI Bundle (GitHub) <integration/monai/examples/spleen_ct_segmentation_sim>`
+   :github_nvflare_link:`Differential Privacy for BraTS18 Segmentation (GitHub) <examples/advanced/brats18>`
+   :github_nvflare_link:`Prostate Segmentation from Multi-source Data (GitHub) <examples/advanced/prostate>`
diff --git a/docs/examples/tutorial_notebooks.rst b/docs/examples/tutorial_notebooks.rst
@@ -4,6 +4,6 @@ Tutorial Notebooks
 
 .. toctree::
 
-   FL Simulator Notebook (GitHub) <https://github.com/NVIDIA/NVFlare/blob/main/examples/tutorials/flare_simulator.ipynb>
-   Hello FLARE API Notbook (GitHub) <https://github.com/NVIDIA/NVFlare/blob/main/examples/tutorials/flare_api.ipynb>
-   NVFLARE in POC Mode (GitHub) <https://github.com/NVIDIA/NVFlare/blob/main/examples/tutorials/setup_poc.ipynb>
+   :github_nvflare_link:`FL Simulator Notebook (GitHub) <examples/tutorials/flare_simulator.ipynb>`
+   :github_nvflare_link:`Hello FLARE API Notebook (GitHub) <examples/tutorials/flare_api.ipynb>`
+   :github_nvflare_link:`NVFLARE POC Mode in detail Notebook (GitHub) <examples/tutorials/setup_poc.ipynb>`
diff --git a/docs/index.rst b/docs/index.rst
@@ -10,7 +10,7 @@ NVIDIA FLARE
    fl_introduction
    flare_overview
    whats_new
-   getting_started
+   Getting Started <quickstart>
 
 .. toctree::
    :maxdepth: -1

diff --git a/docs/quickstart.rst b/docs/quickstart.rst
@@ -93,7 +93,7 @@ Clone NVFlare repo to get examples, and switch to either the main branch or the
 
   $ git clone https://github.com/NVIDIA/NVFlare.git
   $ cd NVFlare
-  $ git switch 2.4
+  $ git switch 2.5
 
 Note on branches:
 

diff --git a/docs/release_notes/flare_250.rst b/docs/release_notes/flare_250.rst
@@ -60,10 +60,13 @@ learnt by the central aggregation server.
 With our encryption plugins working with XGBoost, NVFlare now supports all secure federated schemes for XGBoost model training, with
 both CPU and GPU.
 
+Please check the `federated xgboost with nvflare user guide <https://nvflare.readthedocs.io/en/main/user_guide/federated_xgboost.html>`_
+and the :github_nvflare_link:`example <examples/advanced/xgboost_secure>`.
+
 Tensorflow support
 ==================
-With community contributions, we add FedOpt, FedProx and Scaffold algorithms using Tensorflow to create parity with Pytorch. You
-can them :github_nvflare_link:`here <nvflare/app_opt/tf>`.
+With community contributions, we add FedOpt, FedProx and Scaffold algorithms using Tensorflow.
+You can check the code :github_nvflare_link:`here <nvflare/app_opt/tf>` and the :github_nvflare_link:`example <examples/getting_started/tf>`.
 
 FOBS Auto Registration
 ======================
@@ -121,7 +124,7 @@ FedOpt, FedProx, Scaffold implementation for Tensorflow.
 
 FedBN: Federated Learning on Non-IID Features via Local Batch Normalization
 ---------------------------------------------------------------------------
-The `FedBN example <https://github.com/NVIDIA/NVFlare/tree/main/research/fed-bn>`_ showcases a federated learning algorithm designed
+The :github_nvflare_link:`FedBN example <research/fed-bn>` showcases a federated learning algorithm designed
 to address the feature shift problem when aggregating models across different data distributions.
 
 In this work, we propose an effective method that uses local batch normalization to alleviate the feature shift before averaging models.
@@ -131,7 +134,7 @@ are supported by a convergence analysis that shows in a simplified setting that
 
 End-to-end Federated XGBoost examples
 -------------------------------------
-In `this example <https://github.com/NVIDIA/NVFlare/blob/5fc5ff31f35be63330dec38e1c4e80a6f84586ed/examples/advanced/finance-end-to-end/xgboost.ipynb>`__,
+In :github_nvflare_link:`this example <examples/advanced/finance-end-to-end/xgboost.ipynb>`,
 we show the end-to-end process of feature engineering, pre-processing and training in federated settings. You
 can use FLARE to perform federated ETL and then training. 
 

diff --git a/docs/resources/Dockerfile b/docs/resources/Dockerfile
@@ -1,7 +1,7 @@
-ARG PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:24.03-py3
+ARG PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:24.07-py3
 FROM ${PYTORCH_IMAGE}
 
-ARG NVF_VERSION=2.4
+ARG NVF_VERSION=main
 ENV NVF_BRANCH=${NVF_VERSION}
 
 RUN python3 -m pip install -U pip

diff --git a/docs/user_guide/federated_xgboost/secure_xgboost_user_guide.rst b/docs/user_guide/federated_xgboost/secure_xgboost_user_guide.rst
@@ -21,7 +21,7 @@ It supports federated training in the following 4 modes:
 When running with NVFlare, all the GRPC connections in XGBoost are local and the messages are forwarded to other clients through NVFlare's CellNet communication.
 The local GRPC ports are selected automatically by NVFlare.
 
-The encryption is handled in XGBoost by encryption plugins, which are external components that can be installed at runtime. The plugins are bundled with NVFlare.
+The encryption is handled in XGBoost by encryption plugins, which are external components that can be installed at runtime.
 
 Prerequisites
 =============
@@ -130,15 +130,14 @@ For vertical (column-split) training, the datasets on all clients contain differ
 XGBoost Plugin Configuration
 ============================
 XGBoost requires an encryption plugin to handle secure training.
-Two plugins are initially shipped with NVFlare,
 
 - **cuda_paillier**: The default plugin. This plugin uses GPU for cryptographic operations.
 - **nvflare**: This plugin forwards data locally to NVFlare process for encryption.
 
 .. note::
 
-   All clients must use the same plugin. When different plugins are used,
-   the XGBoost’s behavior is undetermined. It may cause the client to crash.
+   All clients must use the same plugin. When different plugins are used in different clients,
+   the behavior of federated XGBoost is undetermined, which can cause the job to crash.
 
 The **cuda_paillier** plugin requires NVIDIA GPUs that support compute capability 7.0 or higher. Also, CUDA
 12.2 or 12.4 must be installed. Please refer to https://developer.nvidia.com/cuda-gpus for more information.
@@ -226,10 +225,10 @@ The server_context.tenseal file is not needed.
 Building Encryption Plugins
 ===========================
 
-In case the included plugin files don't work for your environment, the plugins can be built from the source code.
+The plugins need to be built from the source code for your specific environment.
 
 To build the plugins, check out the NVFlare source code from https://github.com/NVIDIA/NVFlare and follow the
-instructions in this document: https://github.com/NVIDIA/NVFlare/blob/main/integration/xgboost/encryption_plugins/README.md.
+instructions in :github_nvflare_link:`this document <integration/xgboost/encryption_plugins/README.md>`.
 
 Job Configuration
 =================

diff --git a/docs/user_guide/flower_integration/flower_job_structure.rst b/docs/user_guide/flower_integration/flower_job_structure.rst
@@ -29,7 +29,7 @@ The ``pyproject.toml`` file exists in the job's ``custom`` folder. It is an impo
 client app definition and configuration information. Such information is used by the Flower system to find the
 server app and the client app, and to pass app-specific configuration to the apps.
 
-Here is an example of ``pyproject.toml``, taken from https://github.com/NVIDIA/NVFlare/blob/main/examples/hello-world/hello-flower/jobs/hello-flwr-pt/app/custom/pyproject.toml.
+Here is an example of ``pyproject.toml``, taken from :github_nvflare_link:`this example <examples/hello-world/hello-flower/jobs/hello-flwr-pt/app/custom/pyproject.toml>`.
 
 .. code-block:: toml
 

diff --git a/docs/user_guide/security/data_privacy_protection.rst b/docs/user_guide/security/data_privacy_protection.rst
@@ -14,4 +14,4 @@ general-purpose data :ref:`filtering mechanism <filters>` for processing task da
 This mechanism has been used for the purpose of data privacy protection on the client side. For example, differential
 privacy filters can be applied to model weights before sending to the server for aggregation.
 
-NVFLARE has implemented some commonly used privacy protection filters: https://github.com/NVIDIA/NVFlare/tree/main/nvflare/app_common/filters
+NVFLARE has implemented :github_nvflare_link:`some commonly used privacy protection filters <nvflare/app_common/filters>`.
diff --git a/docs/user_guide/security/serialization.rst b/docs/user_guide/security/serialization.rst
@@ -5,4 +5,4 @@ Message Serialization
 NVFLARE uses a secure mechanism called FOBS (Flare OBject Serializer) for message serialization and
 deserialization when exchanging data between the server and clients.
 
-See `<https://github.com/NVIDIA/NVFlare/blob/main/nvflare/fuel/utils/fobs/README.rst>`_ for usage guidelines.
+See :github_nvflare_link:`README <nvflare/fuel/utils/fobs/README.rst>` for usage guidelines.
diff --git a/examples/advanced/bionemo/downstream/downstream_nvflare.ipynb b/examples/advanced/bionemo/downstream/downstream_nvflare.ipynb
@@ -75,7 +75,7 @@
    "source": [
     "### Download Model Checkpoints\n",
     "\n",
-    "In order to download pretrained models from the NGC registry, **please ensure that you have installed and configured the NGC CLI**, check the [Quickstart Guide](https://docs.nvidia.com/bionemo-framework/latest/quickstart-fw.html) for more info. The following code will download the pretrained model `esm2nv_650M_converted.nemo` from the NGC registry."
+    "In order to download pretrained models from the NGC registry, **please ensure that you have installed and configured the NGC CLI**, check the [Quickstart Guide](https://docs.nvidia.com/bionemo-framework/latest) for more info. The following code will download the pretrained model `esm2nv_650M_converted.nemo` from the NGC registry."
    ]
   },
   {

diff --git a/...advanced/bionemo/downstream/sabdab/jobs/central_sabdab_esm1nv/app/custom/base_config.yaml b/...advanced/bionemo/downstream/sabdab/jobs/central_sabdab_esm1nv/app/custom/base_config.yaml
@@ -100,7 +100,7 @@ model:
     ngc_registry_target: uniref50_2022_05
     ngc_registry_version: v23.06
     data_prefix: "" # must be null or ""
-    num_workers: 8
+    num_workers: 2
     dataloader_type: single # cyclic
     reset_position_ids: False # Reset position ids after end-of-document token
     reset_attention_mask: False # Reset attention mask after end-of-document token
@@ -154,7 +154,7 @@ model:
       batch_size: 128
       num_epochs: 10
       shuffle: True
-      num_workers: 8
+      num_workers: 2
       task_name: secondary_structure
       dataset_path: /data/FLIP/${model.dwnstr_task_validation.dataset.task_name}
       dataset:

diff --git a/...onemo/downstream/sabdab/jobs/central_sabdab_esm1nv/app/custom/downstream_flip_sabdab.yaml b/...onemo/downstream/sabdab/jobs/central_sabdab_esm1nv/app/custom/downstream_flip_sabdab.yaml
@@ -13,7 +13,7 @@ encoder_frozen: False
 trainer:
   devices: 1 # number of GPUs or CPUs
   num_nodes: 1 
-  max_epochs: 200
+  max_epochs: 20
   val_check_interval: 0.0
   limit_val_batches: 0.0 # number of batches in validation step, use fraction for fraction of data, 0 to disable
   limit_test_batches: 0.0 # number of batches in test step, use fraction for fraction of data, 0 to disable
@@ -54,7 +54,7 @@ model:
     target_column: ["Y"] #["3state"np.sum(test_df['Y']==0), "resolved"] # names of label columns in csv file
     target_sizes: [2] # number of classes in each label for classifications or 1 for regression
     num_classes: 2
-    num_workers: 8
+    num_workers: 2
     shuffle: True # shuffle training dataset
     max_seq_length: ${model.seq_length}
     emb_batch_size: ${model.micro_batch_size}

diff --git a/...anced/bionemo/downstream/sabdab/jobs/central_sabdab_esm1nv/app/custom/pretrain_small.yaml b/...anced/bionemo/downstream/sabdab/jobs/central_sabdab_esm1nv/app/custom/pretrain_small.yaml
@@ -15,7 +15,7 @@ model:
       test: x[000..049]
       val: x[000..049]
     micro_batch_size: ${model.micro_batch_size}
-    num_workers: 10
+    num_workers: 2
 
     # Supported kwargs (with default values):
     #     text_mmap (newline_int=10, header_lines=0, workers=None, sort_dataset_paths=True)

diff --git a/...ced/bionemo/downstream/sabdab/jobs/fedavg_sabdab_esm1nv/app/config/config_fed_server.conf b/...ced/bionemo/downstream/sabdab/jobs/fedavg_sabdab_esm1nv/app/config/config_fed_server.conf
@@ -35,7 +35,7 @@
             min_clients = 6
 
             # number of global round of the training.
-            num_rounds = 50
+            num_rounds = 20
 
             # starting round is 0-based
             start_round = 0

diff --git a/.../advanced/bionemo/downstream/sabdab/jobs/fedavg_sabdab_esm1nv/app/custom/base_config.yaml b/.../advanced/bionemo/downstream/sabdab/jobs/fedavg_sabdab_esm1nv/app/custom/base_config.yaml
@@ -100,7 +100,7 @@ model:
     ngc_registry_target: uniref50_2022_05
     ngc_registry_version: v23.06
     data_prefix: "" # must be null or ""
-    num_workers: 8
+    num_workers: 2
     dataloader_type: single # cyclic
     reset_position_ids: False # Reset position ids after end-of-document token
     reset_attention_mask: False # Reset attention mask after end-of-document token
@@ -154,7 +154,7 @@ model:
       batch_size: 128
       num_epochs: 10
       shuffle: True
-      num_workers: 8
+      num_workers: 2
       task_name: secondary_structure
       dataset_path: /data/FLIP/${model.dwnstr_task_validation.dataset.task_name}
       dataset:

diff --git a/...ionemo/downstream/sabdab/jobs/fedavg_sabdab_esm1nv/app/custom/downstream_flip_sabdab.yaml b/...ionemo/downstream/sabdab/jobs/fedavg_sabdab_esm1nv/app/custom/downstream_flip_sabdab.yaml
@@ -54,7 +54,7 @@ model:
     target_column: ["Y"] #["3state"np.sum(test_df['Y']==0), "resolved"] # names of label columns in csv file
     target_sizes: [2] # number of classes in each label for classifications or 1 for regression
     num_classes: 2
-    num_workers: 8
+    num_workers: 2
     shuffle: True # shuffle training dataset
     max_seq_length: ${model.seq_length}
     emb_batch_size: ${model.micro_batch_size}

diff --git a/...vanced/bionemo/downstream/sabdab/jobs/fedavg_sabdab_esm1nv/app/custom/pretrain_small.yaml b/...vanced/bionemo/downstream/sabdab/jobs/fedavg_sabdab_esm1nv/app/custom/pretrain_small.yaml
@@ -15,7 +15,7 @@ model:
       test: x[000..049]
       val: x[000..049]
     micro_batch_size: ${model.micro_batch_size}
-    num_workers: 10
+    num_workers: 2
 
     # Supported kwargs (with default values):
     #     text_mmap (newline_int=10, header_lines=0, workers=None, sort_dataset_paths=True)

diff --git a/...s/advanced/bionemo/downstream/sabdab/jobs/local_sabdab_esm1nv/app/custom/base_config.yaml b/...s/advanced/bionemo/downstream/sabdab/jobs/local_sabdab_esm1nv/app/custom/base_config.yaml
@@ -100,7 +100,7 @@ model:
     ngc_registry_target: uniref50_2022_05
     ngc_registry_version: v23.06
     data_prefix: "" # must be null or ""
-    num_workers: 8
+    num_workers: 2
     dataloader_type: single # cyclic
     reset_position_ids: False # Reset position ids after end-of-document token
     reset_attention_mask: False # Reset attention mask after end-of-document token
@@ -154,7 +154,7 @@ model:
       batch_size: 128
       num_epochs: 10
       shuffle: True
-      num_workers: 8
+      num_workers: 2
       task_name: secondary_structure
       dataset_path: /data/FLIP/${model.dwnstr_task_validation.dataset.task_name}
       dataset:

diff --git a/...bionemo/downstream/sabdab/jobs/local_sabdab_esm1nv/app/custom/downstream_flip_sabdab.yaml b/...bionemo/downstream/sabdab/jobs/local_sabdab_esm1nv/app/custom/downstream_flip_sabdab.yaml
@@ -13,7 +13,7 @@ encoder_frozen: False
 trainer:
   devices: 1 # number of GPUs or CPUs
   num_nodes: 1 
-  max_epochs: 200
+  max_epochs: 20
   val_check_interval: 0.0
   limit_val_batches: 0.0 # number of batches in validation step, use fraction for fraction of data, 0 to disable
   limit_test_batches: 0.0 # number of batches in test step, use fraction for fraction of data, 0 to disable
@@ -54,7 +54,7 @@ model:
     target_column: ["Y"] #["3state"np.sum(test_df['Y']==0), "resolved"] # names of label columns in csv file
     target_sizes: [2] # number of classes in each label for classifications or 1 for regression
     num_classes: 2
-    num_workers: 8
+    num_workers: 2
     shuffle: True # shuffle training dataset
     max_seq_length: ${model.seq_length}
     emb_batch_size: ${model.micro_batch_size}

diff --git a/...dvanced/bionemo/downstream/sabdab/jobs/local_sabdab_esm1nv/app/custom/pretrain_small.yaml b/...dvanced/bionemo/downstream/sabdab/jobs/local_sabdab_esm1nv/app/custom/pretrain_small.yaml
@@ -15,7 +15,7 @@ model:
       test: x[000..049]
       val: x[000..049]
     micro_batch_size: ${model.micro_batch_size}
-    num_workers: 10
+    num_workers: 2
 
     # Supported kwargs (with default values):
     #     text_mmap (newline_int=10, header_lines=0, workers=None, sort_dataset_paths=True)

diff --git a/examples/advanced/bionemo/downstream/sabdab/run_sim_sabdab.py b/examples/advanced/bionemo/downstream/sabdab/run_sim_sabdab.py
@@ -14,15 +14,14 @@
 
 from nvflare import SimulatorRunner
 
-n_clients = 6
-
 # Choose from one of the available jobs
 job_name = "central_sabdab_esm1nv"
-# job_name = "local_sabdab_esm1nv"
-# job_name = "fedavg_sabdab_esm1nv"
+n_clients = 1
+# job_name = "local_sabdab_esm1nv"; n_clients = 6
+# job_name = "fedavg_sabdab_esm1nv"; n_clients = 6
 
 simulator = SimulatorRunner(
-    job_folder=f"jobs/{job_name}", workspace=f"/tmp/nvflare/results/{job_name}", n_clients=n_clients, threads=1
+    job_folder=f"jobs/{job_name}", workspace=f"/tmp/nvflare/results/{job_name}", n_clients=n_clients, threads=n_clients
 )
 run_status = simulator.run()
 print("Simulator finished with run_status", run_status)
diff --git a/...anced/bionemo/downstream/scl/jobs/fedavg_scl_finetune_esm2nv/app1/custom/base_config.yaml b/...anced/bionemo/downstream/scl/jobs/fedavg_scl_finetune_esm2nv/app1/custom/base_config.yaml
@@ -140,7 +140,7 @@ model:
     ngc_registry_target: uniref50_2022_05
     ngc_registry_version: v23.06
     data_prefix: "" # must be null or ""
-    num_workers: 8
+    num_workers: 2
     dataloader_type: single # cyclic
     reset_position_ids: False # Reset position ids after end-of-document token
     reset_attention_mask: False # Reset attention mask after end-of-document token
@@ -239,7 +239,7 @@ model:
       batch_size: 128
       num_epochs: 10
       shuffle: True
-      num_workers: 8
+      num_workers: 2
       task_name: secondary_structure
       dataset_path: ${oc.env:BIONEMO_HOME}/data/FLIP/${model.dwnstr_task_validation.dataset.task_name}
       dataset:

diff --git a/...onemo/downstream/scl/jobs/fedavg_scl_finetune_esm2nv/app1/custom/downstream_flip_scl.yaml b/...onemo/downstream/scl/jobs/fedavg_scl_finetune_esm2nv/app1/custom/downstream_flip_scl.yaml
@@ -47,7 +47,7 @@ model:
     target_column: ["TARGET"] #["3state", "resolved"] # names of label columns in csv file
     target_sizes: [10] # number of classes in each label for classifications or 1 for regression
     num_classes: 10
-    num_workers: 8
+    num_workers: 2
     shuffle: True # shuffle training dataset
     max_seq_length: ${model.seq_length}
     emb_batch_size: ${model.micro_batch_size}