diff --git a/.github/DISCUSSION_TEMPLATE/q-a.yml b/.github/DISCUSSION_TEMPLATE/q-a.yml index 67a4da6eb5..529fcb9c48 100644 --- a/.github/DISCUSSION_TEMPLATE/q-a.yml +++ b/.github/DISCUSSION_TEMPLATE/q-a.yml @@ -9,26 +9,26 @@ body: attributes: label: Python version (`python3 -V`) description: Python version (`python3 -V`) - placeholder: "3.8" + placeholder: "3.10" validations: required: true - type: input attributes: label: NVFlare version (`python3 -m pip list | grep "nvflare"`) description: NVFlare version (`python3 -m pip list | grep "nvflare"`) - placeholder: "2.3.8" + placeholder: "2.5.0" validations: required: true - type: input attributes: label: NVFlare branch (if running examples, please use the branch that corresponds to the NVFlare version, `git branch`) description: NVFlare branch (if running examples, please use the branch that corresponds to the NVFlare version, `git branch`) - placeholder: "2.3" + placeholder: "2.5" - type: input attributes: label: Operating system description: Operating system (Ubuntu, MacOS, Windows, etc) - placeholder: "Ubuntu 20.04" + placeholder: "Ubuntu 22.04" validations: required: true - type: checkboxes diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index 4a3f5cd198..54de1b1d76 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -24,9 +24,9 @@ A clear and concise description of what you expected to happen. If applicable, add screenshots to help explain your problem. **Desktop (please complete the following information):** - - OS: [e.g. ubuntu 16.04] - - Python Version [e.g. 3.8] - - NVFlare Version [e.g. 2.1] + - OS: [e.g. ubuntu 22.04] + - Python Version [e.g. 3.10] + - NVFlare Version [e.g. 2.5] **Additional context** Add any other context about the problem here. 
diff --git a/.github/workflows/blossom-ci.yml b/.github/workflows/blossom-ci.yml index 844cdf1c93..a6999935bf 100644 --- a/.github/workflows/blossom-ci.yml +++ b/.github/workflows/blossom-ci.yml @@ -42,24 +42,26 @@ jobs: args: ${{ env.args }} # This job only runs for pull request comments - if: contains('\ - chesterxgchen,\ - pxLi,\ - IsaacYangSLA,\ - YanxuanLiu,\ - yhwen,\ - YuanTingHsieh,\ - holgerroth,\ - yhwen,\ - nvkevlu,\ - nvidianz,\ - yanchengnv,\ - ZiyueXu77,\ - Can-Zhao,\ - guopengf,\ - SYangster,\ - yinqingh,\ - ', format('{0},', github.actor)) && github.event.comment.body == '/build' + if: | + github.event.comment.body == '/build' && + ( + github.actor == 'chesterxgchen' || + github.actor == 'pxLi' || + github.actor == 'IsaacYangSLA' || + github.actor == 'YanxuanLiu' || + github.actor == 'yhwen' || + github.actor == 'YuanTingHsieh' || + github.actor == 'holgerroth' || + github.actor == 'yhwen' || + github.actor == 'nvkevlu' || + github.actor == 'nvidianz' || + github.actor == 'yanchengnv' || + github.actor == 'ZiyueXu77' || + github.actor == 'Can-Zhao' || + github.actor == 'guopengf' || + github.actor == 'SYangster' || + github.actor == 'yinqingh' + ) steps: - name: Check if comment is issued by authorized person run: blossom-ci @@ -74,7 +76,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout code - uses: actions/checkout@v2 + uses: actions/checkout@v4 with: repository: ${{ fromJson(needs.Authorization.outputs.args).repo }} ref: ${{ fromJson(needs.Authorization.outputs.args).ref }} diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 193f7b48e5..0425542192 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -36,7 +36,7 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 # Initializes the CodeQL tools for scanning. 
- name: Initialize CodeQL diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml new file mode 100644 index 0000000000..5501c295eb --- /dev/null +++ b/.github/workflows/deploy.yml @@ -0,0 +1,57 @@ +name: Deploy to gh-pages + +on: + # Trigger the workflow if any web/** files are modified + push: + branches: + - "main" + - "2.5" + paths: + - 'web/**' + workflow_dispatch: + +env: + site_path: ./web + version_path: / + +# Allow this job to clone the repo and create a page deployment +permissions: + contents: write + pages: write + id-token: write + +jobs: + build: + runs-on: ubuntu-latest + steps: + - name: Update version_path for non-main branches + if: ${{ github.ref_type == 'branch' && github.ref_name != 'main'}} + run: echo version_path=/version/${{ github.ref_name }}/ >> $GITHUB_ENV + + - name: Checkout your repository + uses: actions/checkout@v4 + + - name: Setup Node + uses: actions/setup-node@v4 + with: + node-version: 20 + cache: npm + cache-dependency-path: "${{ env.site_path }}/package-lock.json" + + - name: Install dependencies + run: npm install + working-directory: ${{ env.site_path }} + + - name: Build project + run: npm run build + env: + PUBLIC_GH_BRANCH: ${{ github.ref_name }} + working-directory: ${{ env.site_path }} + + - name: Deploy + uses: JamesIves/github-pages-deploy-action@v4.6.4 + with: + branch: gh-pages + folder: ${{ env.site_path }}/dist + target-folder: ${{ env.version_path }} + clean-exclude: version diff --git a/.github/workflows/markdown-links-check.yml b/.github/workflows/markdown-links-check.yml index 0f0c3f4505..1a8686ea30 100644 --- a/.github/workflows/markdown-links-check.yml +++ b/.github/workflows/markdown-links-check.yml @@ -17,19 +17,17 @@ name: Check Markdown links on: push: - branches: [ "main", "dev" ] pull_request: - # The branches below must be a subset of the branches above - branches: [ "main", "dev" ] jobs: markdown-link-check: runs-on: ubuntu-latest steps: - - uses: actions/checkout@master + - uses: actions/checkout@v4 - uses: gaurav-nelson/github-action-markdown-link-check@1.0.15 with: max-depth: -1 use-verbose-mode: 'yes' + config-file: '.github/workflows/mlc_config.json' check-modified-files-only: 'yes' - base-branch: 'dev' + base-branch: 'main' diff --git a/.github/workflows/mlc_config.json b/.github/workflows/mlc_config.json new file mode 100644 index 0000000000..3cff6d5621 --- /dev/null +++ b/.github/workflows/mlc_config.json @@ -0,0 +1,7 @@ +{ + "ignorePatterns": [ + { + "pattern": "catalog.ngc.nvidia.com" + } + ] +} \ No newline at end of file diff --git a/.github/workflows/premerge.yml b/.github/workflows/premerge.yml index bc9048752a..1056a1a3d6 100644 --- a/.github/workflows/premerge.yml +++ b/.github/workflows/premerge.yml @@ -17,8 +17,6 @@ name: pre-merge on: # quick tests for pull requests and the releasing branches push: - branches: - - dev pull_request: workflow_dispatch: @@ -29,17 +27,17 @@ jobs: fail-fast: false matrix: os: [ ubuntu-22.04, ubuntu-20.04 ] - python-version: [ "3.8", "3.9", "3.10" ] + python-version: [ "3.9", "3.10", "3.11", "3.12" ] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | - python -m pip install --upgrade pip - pip install -e .[dev] + python3 -m pip install --upgrade pip + python3 -m pip install --no-cache-dir -e .[dev] - name: Run unit test run: ./runtest.sh @@ 
-49,17 +47,17 @@ jobs: fail-fast: false matrix: os: [ ubuntu-22.04, ubuntu-20.04 ] - python-version: [ "3.8", "3.9", "3.10" ] + python-version: [ "3.9", "3.10", "3.11", "3.12" ] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | - python -m pip install --upgrade pip - pip install -e .[dev] - pip install build twine torch torchvision + python3 -m pip install --upgrade pip + python3 -m pip install --no-cache-dir -e .[dev] + python3 -m pip install --no-cache-dir build twine torch torchvision - name: Run wheel build run: python3 -m build --wheel diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000000..e005367822 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "integration/xgboost/encryption_plugins/cuda_plugin/CGBN"] + path = integration/xgboost/encryption_plugins/cuda_plugin/CGBN + url = https://github.com/NVlabs/CGBN.git diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 41450770c5..2ce1d0e0e6 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -9,7 +9,7 @@ version: 2 build: os: ubuntu-22.04 tools: - python: "3.8" + python: "3.10" # Build documentation in the docs/ directory with Sphinx sphinx: @@ -26,6 +26,6 @@ sphinx: python: install: - method: pip - path: .[doc] + path: .[dev] # system_packages: true diff --git a/3rdParty/astro.LICENSE.txt b/3rdParty/astro.LICENSE.txt new file mode 100644 index 0000000000..5e8ad18f09 --- /dev/null +++ b/3rdParty/astro.LICENSE.txt @@ -0,0 +1,59 @@ +MIT License + +Copyright (c) 2021 Fred K. Schott + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+ +""" +This license applies to parts of the `packages/create-astro` and `packages/astro` subdirectories originating from the https://github.com/sveltejs/kit repository: + +Copyright (c) 2020 [these people](https://github.com/sveltejs/kit/graphs/contributors) + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +""" + +""" +This license applies to parts of the `packages/create-astro` and `packages/astro` subdirectories originating from the https://github.com/vitejs/vite repository: + +MIT License + +Copyright (c) 2019-present, Yuxi (Evan) You and Vite contributors + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +""" \ No newline at end of file diff --git a/3rdParty/flash-attn.LICENSE.txt b/3rdParty/flash-attn.LICENSE.txt new file mode 100644 index 0000000000..5860e4b33f --- /dev/null +++ b/3rdParty/flash-attn.LICENSE.txt @@ -0,0 +1,29 @@ +BSD 3-Clause License + +Copyright (c) 2022, the respective contributors, as shown by the AUTHORS file. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. 
+ +* Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/3rdParty/flowbite.LICENSE.txt b/3rdParty/flowbite.LICENSE.txt new file mode 100644 index 0000000000..df5d19b412 --- /dev/null +++ b/3rdParty/flowbite.LICENSE.txt @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2023 Bergside Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/3rdParty/glide.LICENSE.txt b/3rdParty/glide.LICENSE.txt new file mode 100644 index 0000000000..d09bebecf1 --- /dev/null +++ b/3rdParty/glide.LICENSE.txt @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2013-present Jędrzej Chałubek + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/3rdParty/peft.LICENSE.txt b/3rdParty/peft.LICENSE.txt new file mode 100644 index 0000000000..261eeb9e9f --- /dev/null +++ b/3rdParty/peft.LICENSE.txt @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." 
+ + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. 
We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/3rdParty/prism.LICENSE.txt b/3rdParty/prism.LICENSE.txt new file mode 100644 index 0000000000..1941f980b3 --- /dev/null +++ b/3rdParty/prism.LICENSE.txt @@ -0,0 +1,21 @@ +MIT LICENSE + +Copyright (c) 2012 Lea Verou + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. \ No newline at end of file diff --git a/3rdParty/tailwindcss.LICENSE.txt b/3rdParty/tailwindcss.LICENSE.txt new file mode 100644 index 0000000000..2ba9d4cd1d --- /dev/null +++ b/3rdParty/tailwindcss.LICENSE.txt @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) Tailwind Labs, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
\ No newline at end of file diff --git a/3rdParty/trl.LICENSE.txt b/3rdParty/trl.LICENSE.txt new file mode 100644 index 0000000000..261eeb9e9f --- /dev/null +++ b/3rdParty/trl.LICENSE.txt @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. 
Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index dc3554870a..32843e03e6 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -47,10 +47,11 @@ To collaborate efficiently, please read through this section and follow them. #### Checking the coding style We check code style using flake8 and isort. A bash script (`runtest.sh`) is provided to run all tests locally. +You can use `runtest.sh -f` to use black to fix your code style automatically as well. License information: all source code files should start with this paragraph: ``` -# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -83,13 +84,13 @@ python3 -m pip install nvflare[dev] To build the docs, please run. ```bash -./build_docs.sh --html +./build_doc.sh --html ``` Once built, you can view the docs in `docs/_build folder`. To clean the docs, please run ```bash -./build_docs.sh --clean +./build_doc.sh --clean ``` #### Signing your work diff --git a/MANIFEST.in b/MANIFEST.in index 0c67f7d41e..09b9d627da 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,2 +1,4 @@ include versioneer.py include nvflare/_version.py +include nvflare/libs/*.so +include nvflare/fuel/utils/*.json diff --git a/README.md b/README.md index a4685ccd75..a98270f4e1 100644 --- a/README.md +++ b/README.md @@ -1,43 +1,61 @@ -**NV**IDIA **F**ederated **L**earning **A**pplication **R**untime **E**nvironment +[![Blossom-CI](https://github.com/NVIDIA/nvflare/workflows/Blossom-CI/badge.svg?branch=main)](https://github.com/NVIDIA/nvflare/actions) +[![documentation](https://readthedocs.org/projects/nvflare/badge/?version=main)](https://nvflare.readthedocs.io/en/main/?badge=main) +[![license](https://img.shields.io/badge/License-Apache%202.0-brightgreen.svg)](./LICENSE) +[![pypi](https://badge.fury.io/py/nvflare.svg)](https://badge.fury.io/py/nvflare) +[![pyversion](https://img.shields.io/pypi/pyversions/nvflare.svg)](https://badge.fury.io/py/nvflare) +[![downloads](https://static.pepy.tech/badge/nvflare)](https://pepy.tech/project/nvflare) -[NVIDIA FLARE](https://nvflare.readthedocs.io/en/main/index.html) is a domain-agnostic, open-source, extensible SDK that -allows researchers and data scientists to adapt existing ML/DL workflows(PyTorch, TensorFlow, Scikit-learn, XGBoost etc.) -to a federated paradigm. It enables platform developers to build a secure, privacy-preserving offering -for a distributed multi-party collaboration. +# NVIDIA FLARE -**NVIDIA FLARE** is built on a componentized architecture that allows you to take federated learning workloads -from research and simulation to real-world production deployment. Key components include: +[NVIDIA FLARE](https://nvidia.github.io/NVFlare/) (**NV**IDIA **F**ederated **L**earning **A**pplication **R**untime **E**nvironment) +is a domain-agnostic, open-source, extensible SDK that allows researchers and data scientists to adapt existing ML/DL workflows to a federated paradigm. 
+It enables platform developers to build a secure, privacy-preserving offering for a distributed multi-party collaboration. -* Support both deep learning and traditional machine algorithms +## Features +FLARE is built on a componentized architecture that allows you to take federated learning workloads +from research and simulation to real-world production deployment. + +Application Features +* Support both deep learning and traditional machine learning algorithms (e.g., PyTorch, TensorFlow, Scikit-learn, XGBoost, etc.) * Support horizontal and vertical federated learning -* Built-in FL algorithms (e.g., FedAvg, FedProx, FedOpt, Scaffold, Ditto ) -* Support multiple training workflows (e.g., scatter & gather, cyclic) and validation workflows (global model evaluation, cross-site validation) +* Built-in Federated Learning algorithms (e.g., FedAvg, FedProx, FedOpt, Scaffold, Ditto, etc.) +* Support multiple server and client-controlled training workflows (e.g., scatter & gather, cyclic) and validation workflows (global model evaluation, cross-site validation) * Support both data analytics (federated statistics) and machine learning lifecycle management -* Privacy preservation with differential privacy, homomorphic encryption -* Security enforcement through federated authorization and privacy policy -* Easily customizable and extensible -* Deployment on cloud and on premise -* Simulator for rapid development and prototyping -* Dashboard UI for simplified project management and deployment +* Privacy preservation with differential privacy, homomorphic encryption, private set intersection (PSI) + +From Simulation to Real-World +* FLARE Client API to transition seamlessly from ML/DL to FL with minimal code changes +* Simulator and POC mode for rapid development and prototyping +* Fully customizable and extensible components with modular design +* Deployment on cloud and on-premise +* Dashboard for project management and deployment +* Security enforcement through federated authorization and privacy policy * Built-in support for system resiliency and fault tolerance +> _Take a look at [NVIDIA FLARE Overview](https://nvflare.readthedocs.io/en/main/flare_overview.html) for a complete overview, and [What's New](https://nvflare.readthedocs.io/en/main/whats_new.html) for the latest changes._ + ## Installation -To install the [current release](https://pypi.org/project/nvflare/), you can simply run: +To install the [current release](https://pypi.org/project/nvflare/): ``` $ python3 -m pip install nvflare ``` -## Getting started +## Getting Started You can quickly get started using the [FL simulator](https://nvflare.readthedocs.io/en/main/getting_started.html#the-fl-simulator). - A detailed [getting started](https://nvflare.readthedocs.io/en/main/getting_started.html) guide is available in the [documentation](https://nvflare.readthedocs.io/en/main/index.html). -Examples and notebook tutorials are located [here](./examples). +Examples and notebook tutorials are located at [NVFlare/examples](./examples). + +## Community + +We welcome community contributions! Please refer to the [contributing guidelines](https://github.com/NVIDIA/NVFlare/blob/main/CONTRIBUTING.md) for more details. + +Ask and answer questions, share ideas, and engage with other community members at [NVFlare Discussions](https://github.com/NVIDIA/NVFlare/discussions).
-## Related talks and publications +## Related Talks and Publications -For a list of talks, blogs, and publications related to NVIDIA FLARE, see [here](https://nvflare.readthedocs.io/en/main/publications_and_talks.html). +Take a look at our growing list of [talks, blogs, and publications](https://nvflare.readthedocs.io/en/main/publications_and_talks.html) related to NVIDIA FLARE. ## License -NVIDIA FLARE has Apache 2.0 license, as found in [LICENSE](https://github.com/NVIDIA/NVFlare/blob/main/LICENSE) file. +NVIDIA FLARE is released under an [Apache 2.0 license](https://github.com/NVIDIA/NVFlare/blob/main/LICENSE). diff --git a/build_doc.sh b/build_doc.sh index e91c1a5331..384b6d1fa6 100755 --- a/build_doc.sh +++ b/build_doc.sh @@ -49,7 +49,7 @@ function clean_docs() { } function build_html_docs() { - pip install -e .[doc] + pip install -e .[dev] sphinx-apidoc --module-first -f -o docs/apidocs/ nvflare "*poc" "*private" sphinx-build -b html docs docs/_build } diff --git a/ci/run_integration.sh b/ci/run_integration.sh index aa4f79002c..4c8bae5bb9 100755 --- a/ci/run_integration.sh +++ b/ci/run_integration.sh @@ -45,8 +45,14 @@ remove_pipenv() { integration_test_tf() { echo "Run TF integration test..." - # not using pipenv because we need tensorflow package from the container - python -m pip install -e .[dev] + # since running directly in container, point python to python3.12 + ln -sfn /usr/bin/python3.12 /usr/bin/python + ln -sfn /usr/bin/python3.12 /usr/bin/python3 + # somehow the base container has blinker which should be removed + apt remove -y python3-blinker python-blinker-doc || true + # pipenv does not work with TensorFlow so using pip + python3.12 -m pip install -e .[dev] + python3.12 -m pip install tensorflow[and-cuda] export PYTHONPATH=$PWD testFolder="tests/integration_test" clean_up_snapshot_and_job diff --git a/commit_message.txt b/commit_message.txt new file mode 100644 index 0000000000..49b8cb0549 --- /dev/null +++ b/commit_message.txt @@ -0,0 +1,34 @@ +Developer Certificate of Origin +Version 1.1 + +Copyright (C) 2004, 2006 The Linux Foundation and its contributors. + +Everyone is permitted to copy and distribute verbatim copies of this +license document, but changing it is not allowed. + + +Developer's Certificate of Origin 1.1 + +By making a contribution to this project, I certify that: + +(a) The contribution was created in whole or in part by me and I + have the right to submit it under the open source license + indicated in the file; or + +(b) The contribution is based upon previous work that, to the best + of my knowledge, is covered under an appropriate open source + license and I have the right under that license to submit that + work with modifications, whether created in whole or in part + by me, under the same open source license (unless I am + permitted to submit under a different license), as indicated + in the file; or + +(c) The contribution was provided directly to me by some other + person who certified (a), (b) or (c) and I have not modified + it. + +(d) I understand and agree that this project and the contribution + are public and that a record of the contribution (including all + personal information I submit with it, including my sign-off) is + maintained indefinitely and may be redistributed consistent with + this project or the open source license(s) involved. 
diff --git a/docker/Dockerfile b/docker/Dockerfile index c30293bc0c..0192caa113 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,3 +1,7 @@ +# For Running NVIDIA FLARE in a Docker container, see +# https://nvflare.readthedocs.io/en/main/quickstart.html#containerized-deployment-with-docker +# This Dockerfile is primarily for building Docker images to publish for dashboard. + FROM python:3.8 RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get -y install zip COPY nvflare /opt/NVFlare/nvflare diff --git a/docs/_static/css/additions.css b/docs/_static/css/additions.css index 999ff74614..a8490da9b4 100644 --- a/docs/_static/css/additions.css +++ b/docs/_static/css/additions.css @@ -1,3 +1,6 @@ .wy-menu-vertical li.toctree-l4.current li.toctree-l5>a{display:block;background:#b1b1b1;padding:.4045em 7.3em} .wy-menu-vertical li.toctree-l5.current li.toctree-l6>a{display:block;background:#a9a9a9;padding:.4045em 8.8em} -.wy-menu-vertical li.toctree-l5{font-size: .9em;} \ No newline at end of file +.wy-menu-vertical li.toctree-l5{font-size: .9em;} +.wy-menu > .caption > span.caption-text { + color: #76b900; + } \ No newline at end of file diff --git a/docs/conf.py b/docs/conf.py index fa388e92eb..57a8f9e1c7 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -44,7 +44,7 @@ def resolve_xref(self, env, fromdocname, builder, typ, target, node, contnode): # -- Project information ----------------------------------------------------- project = "NVIDIA FLARE" -copyright = "2023, NVIDIA" +copyright = "2024, NVIDIA" author = "NVIDIA" # The full version, including alpha/beta/rc tags @@ -114,6 +114,7 @@ def resolve_xref(self, env, fromdocname, builder, typ, target, node, contnode): html_scaled_image_link = False html_show_sourcelink = True html_favicon = "favicon.ico" +html_logo = "resources/nvidia_logo.png" # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, diff --git a/docs/contributing.rst b/docs/contributing.rst index 466f9a3289..45f500a26a 100644 --- a/docs/contributing.rst +++ b/docs/contributing.rst @@ -97,14 +97,14 @@ To build the docs, please run. .. code:: bash - ./build_docs --html + ./build_doc.sh --html Once built, you can view the docs in ``docs/_build folder``. To clean the docs, please run .. code:: bash - ./build_docs --clean + ./build_doc.sh --clean Signing your work ^^^^^^^^^^^^^^^^^ diff --git a/docs/example_applications_algorithms.rst b/docs/example_applications_algorithms.rst index 976e88396d..52e0b65e2b 100644 --- a/docs/example_applications_algorithms.rst +++ b/docs/example_applications_algorithms.rst @@ -3,90 +3,179 @@ #################### Example Applications #################### -NVIDIA FLARE has several tutorials and examples to help you get started with federated learning and to explore certain features in -:github_nvflare_link:`the examples directory `. +NVIDIA FLARE has several tutorials and examples to help you get started with federated learning and to explore certain features in the +:github_nvflare_link:`examples directory `. + +1. Hello World Examples +======================= +Can be run from the :github_nvflare_link:`hello_world notebook `. .. toctree:: - :maxdepth: -1 - :hidden: + :maxdepth: 1 + :hidden: + + examples/hello_world_examples + +1.1. 
Deep Learning to Federated Learning +---------------------------------------- + + * :github_nvflare_link:`Deep Learning to Federated Learning (GitHub) ` - Example for converting Deep Learning (DL) to Federated Learning (FL) using the Client API. + +1.2. Workflows +-------------- + + * :ref:`Hello FedAvg with NumPy ` - Example using the FedAvg workflow with a NumPy trainer + * :ref:`Hello Cross-Site Validation ` - Example using the Cross Site Eval workflow, also demonstrates running cross site validation using the previous training results. + * :github_nvflare_link:`Hello Cyclic Weight Transfer (GitHub) ` - Example using the CyclicController workflow to implement `Cyclic Weight Transfer `_ with TensorFlow as the deep learning training framework + * :github_nvflare_link:`Swarm Learning ` - Example using Swarm Learning and Client-Controlled Cross-site Evaluation workflows. + * :github_nvflare_link:`Client-Controlled Cyclic Weight Transfer ` - Example using Client-Controlled Cyclic workflow using Client API. + +1.3. Deep Learning +------------------ + + * :ref:`Hello PyTorch ` - Example image classifier using FedAvg and PyTorch as the deep learning training framework + * :ref:`Hello TensorFlow ` - Example image classifier using FedAvg and TensorFlow as the deep learning training frameworks + + +2. Step-By-Step Example Series +============================== + +:github_nvflare_link:`Step-by-Step Examples (GitHub) ` - Step-by-step example series with CIFAR-10 (image data) and HIGGS (tabular data) to showcase different FLARE features, workflows, and APIs. + +2.1 CIFAR-10 Image Data Examples +-------------------------------- + + * :github_nvflare_link:`image_stats ` - federated statistics (histograms) of CIFAR10. + * :github_nvflare_link:`sag ` - scatter and gather (SAG) workflow with PyTorch using the Client API. + * :github_nvflare_link:`sag_deploy_map ` - scatter and gather workflow with deploy_map configuration for deployment of apps to different sites using the Client API. + * :github_nvflare_link:`sag_model_learner ` - scatter and gather workflow illustrating how to write client code using the ModelLearner. + * :github_nvflare_link:`sag_executor ` - scatter and gather workflow demonstrating how to write client-side executors. + * :github_nvflare_link:`sag_mlflow ` - MLflow experiment tracking logs with the Client API in scatter & gather workflows. + * :github_nvflare_link:`sag_he ` - homomorphic encryption using Client API and POC -he mode. + * :github_nvflare_link:`cse ` - cross-site evaluation using the Client API. + * :github_nvflare_link:`cyclic ` - cyclic weight transfer workflow with server-side controller. + * :github_nvflare_link:`cyclic_ccwf ` - client-controlled cyclic weight transfer workflow with client-side controller. + * :github_nvflare_link:`swarm ` - swarm learning and client-side cross-site evaluation with Client API. + +2.2 HIGGS Tabular Data Examples +------------------------------- + + * :github_nvflare_link:`tabular_stats ` - federated stats tabular histogram calculation. + * :github_nvflare_link:`sklearn_linear ` - federated linear model (logistic regression on binary classification) learning on tabular data. + * :github_nvflare_link:`sklearn_svm ` - federated SVM model learning on tabular data. + * :github_nvflare_link:`sklearn_kmeans ` - federated k-Means clustering on tabular data. + * :github_nvflare_link:`xgboost ` - federated horizontal xgboost learning on tabular data with bagging collaboration. + + +3.
Tutorial Notebooks +===================== + + * :github_nvflare_link:`Intro to the FL Simulator ` - Shows how to use the :ref:`fl_simulator` to run a local simulation of an NVFLARE deployment to test and debug an application without provisioning a real FL project. + * :github_nvflare_link:`Hello FLARE API ` - Goes through the different commands of the :ref:`flare_api` to show the syntax and usage of each. + * :github_nvflare_link:`NVFLARE in POC Mode ` - Shows how to use :ref:`POC mode ` to test the features of a full FLARE deployment on a single machine. + * :github_nvflare_link:`Job CLI Tutorial ` - Walks through the different commands of the Job CLI and showcases syntax and example usages. - examples/hello_world_examples - examples/tutorial_notebooks - examples/fl_algorithms - examples/traditional_ml_examples - examples/medical_image_analysis - examples/federated_statistics - Federated Site Policies (GitHub) - examples/tensorboard_streaming - examples/fl_experiment_tracking_mlflow +4. Federated Learning Algorithms +================================ + * :github_nvflare_link:`Federated Learning with CIFAR-10 (GitHub) ` - Includes examples of using FedAvg, FedProx, FedOpt, SCAFFOLD, homomorphic encryption, and streaming of TensorBoard metrics to the server during training -The following tutorials and quickstart guides walk you through some of these examples: + .. toctree:: + :maxdepth: 2 - 1. **Hello World** introduction to NVFlare. + examples/fl_algorithms - 1.1. Deep Learning to Federated Learning - * :github_nvflare_link:`Deep Learning to Federated Learning (GitHub) ` - Example for converting Deep Learning (DL) to Federated Learning (FL). +5. Privacy Preserving Algorithms +================================ +Privacy preserving algorithms in NVIDIA FLARE are implemented as :ref:`filters ` that can be applied as data is sent or received between peers. - 1.2. Step-by-Step Examples - * :github_nvflare_link:`Step-by-Step Examples (GitHub) ` - Step-by-step examples for running a federated learning project with NVFlare. + * :github_nvflare_link:`Federated Learning with CIFAR-10 (GitHub) ` - Includes examples of using FedAvg, FedProx, FedOpt, SCAFFOLD, homomorphic encryption, and streaming of TensorBoard metrics to the server during training + * :github_nvflare_link:`Differential Privacy for BraTS18 segmentation (GitHub) `- Example using SVT Differential Privacy for BraTS18 segmentation. - 2. **Hello World Examples** which can be run from the :github_nvflare_link:`hello_world notebook `. +6. Traditional ML examples +========================== - 2.1. Workflows - * :ref:`Hello Scatter and Gather ` - Example using the Scatter And Gather (SAG) workflow with a Numpy trainer - * :ref:`Hello Cross-Site Validation ` - Example using the Cross Site Model Eval workflow with a Numpy trainer - * :github_nvflare_link:`Hello Cyclic Weight Transfer (GitHub) ` - Example using the CyclicController workflow to implement `Cyclic Weight Transfer `_ with TensorFlow as the deep learning training framework + * :github_nvflare_link:`Federated Linear Model with Scikit-learn (GitHub) ` - For an example of using NVIDIA FLARE with `scikit-learn `_, a widely used open-source machine learning library that supports supervised and unsupervised learning. + * :github_nvflare_link:`Federated K-Means Clustering with Scikit-learn (GitHub) ` - NVIDIA FLARE with `scikit-learn `_ and k-Means. + * :github_nvflare_link:`Federated SVM with Scikit-learn (GitHub) ` - NVIDIA FLARE with `scikit-learn `_ and `SVM `_. 
+ * :github_nvflare_link:`Federated Horizontal XGBoost (GitHub) ` - Includes examples of histogram-based and tree-based algorithms. Tree-based algorithms also includes bagging and cyclic approaches + * :github_nvflare_link:`Federated Learning for Random Forest based on XGBoost (GitHub) ` - Example of using NVIDIA FLARE with `scikit-learn `_ and `Random Forest `_. + * :github_nvflare_link:`Federated Vertical XGBoost (GitHub) ` - Example using Private Set Intersection and XGBoost on vertically split HIGGS data. - 2.2. Deep Learning - * :ref:`Hello PyTorch ` - Example image classifier using FedAvg and PyTorch as the deep learning training framework - * :ref:`Hello TensorFlow ` - Example image classifier using FedAvg and TensorFlow as the deep learning training frameworks +7. Medical Image Analysis +========================= - 3. **Tutorial notebooks** + * :github_nvflare_link:`MONAI Integration (GitHub) ` - For an example of using NVIDIA FLARE to train a 3D medical image analysis model using federated averaging (FedAvg) and MONAI Bundle `MONAI `_ + * :github_nvflare_link:`Federated Learning with Differential Privacy for BraTS18 segmentation (GitHub) ` - Illustrates the use of differential privacy for training brain tumor segmentation models using federated learning + * :github_nvflare_link:`Federated Learning for Prostate Segmentation from Multi-source Data (GitHub) ` - Example of training a multi-institutional prostate segmentation model using `FedAvg `_, `FedProx `_, and `Ditto `_ - * :github_nvflare_link:`Intro to the FL Simulator ` - Shows how to use the :ref:`fl_simulator` to run a local simulation of an NVFLARE deployment to test and debug an application without provisioning a real FL project. - * :github_nvflare_link:`Hello FLARE API ` - Goes through the different commands of the :ref:`flare_api` to show the syntax and usage of each. - * :github_nvflare_link:`NVFLARE in POC Mode ` - Shows how to use :ref:`POC mode ` to test the features of a full FLARE deployment on a single machine. +8. Federated Statistics +======================= - 4. **FL algorithms** + * :ref:`Federated Statistic Overview ` - Discuss the overall federated statistics features. + * :github_nvflare_link:`Federated Statistics for medical imaging (Github) ` - Example of gathering local image histogram to compute the global dataset histograms. + * :github_nvflare_link:`Federated Statistics for tabular data with DataFrame (Github) ` - Example of gathering local statistics summary from Pandas DataFrame to compute the global dataset statistics. + * :github_nvflare_link:`Federated Statistics with Monai Statistics integration for Spleen CT Image (Github) ` - Example demonstrated Monai statistics integration and few other features in federated statistics + + .. toctree:: + :maxdepth: 1 + :hidden: - * :github_nvflare_link:`Federated Learning with CIFAR-10 (GitHub) ` - Includes examples of using FedAvg, FedProx, FedOpt, SCAFFOLD, homomorphic encryption, and streaming of TensorBoard metrics to the server during training - * :ref:`Federated XGBoost ` - Includes examples of histogram-based and tree-based algorithms. Tree-based algorithms also includes bagging and cyclic approaches + examples/federated_statistics_overview - 5. **Traditional ML examples** +9. 
Federated Site Policies +========================== - * :github_nvflare_link:`Federated Linear Model with Scikit-learn (GitHub) ` - For an example of using NVIDIA FLARE with `scikit-learn `_, a widely used open-source machine learning library that supports supervised and unsupervised learning. - * :github_nvflare_link:`Federated K-Means Clustering with Scikit-learn (GitHub) ` - NVIDIA FLARE with `scikit-learn `_ and k-Means. - * :github_nvflare_link:`Federated SVM with Scikit-learn (GitHub) ` - NVIDIA FLARE with `scikit-learn `_ and `SVM `_. - * :github_nvflare_link:`Federated Learning for Random Forest based on XGBoost (GitHub) ` - Example of using NVIDIA FLARE with `scikit-learn `_ and `Random Forest `_. + * :github_nvflare_link:`Federated Policies (Github) ` - Discuss the federated site policies for authorization, resource and data privacy management + * :github_nvflare_link:`Custom Authentication (Github) ` - Show the custom authentication policy and secure mode. + * :github_nvflare_link:`Job-Level Authorization (Github) ` - Show the job-level authorization policy and secure mode. + * :github_nvflare_link:`KeyCloak Site Authentication Integration (Github) ` - Demonstrate KeyCloak integration for supporting site-specific authentication. - 6. **Medical Image Analysis** +10. Experiment Tracking +======================= - * :github_nvflare_link:`MONAI Integration (GitHub) ` - For an example of using NVIDIA FLARE to train a 3D medical image analysis model using federated averaging (FedAvg) and MONAI Bundle `MONAI `_ - * :github_nvflare_link:`Federated Learning with Differential Privacy for BraTS18 segmentation (GitHub) ` - Illustrates the use of differential privacy for training brain tumor segmentation models using federated learning - * :github_nvflare_link:`Federated Learning for Prostate Segmentation from Multi-source Data (GitHub) ` - Example of training a multi-institutional prostate segmentation model using `FedAvg `_, `FedProx `_, and `Ditto `_ + * :github_nvflare_link:`FL Experiment Tracking with TensorBoard Streaming ` - :ref:`(documentation) ` - Example building on Hello PyTorch with TensorBoard streaming from clients to server + * :github_nvflare_link:`FL Experiment Tracking with MLflow ` - :ref:`(documentation) `- Example integrating Hello PyTorch with MLflow with streaming from clients to server + * :github_nvflare_link:`FL Experiment Tracking with Weights and Biases ` - Example integrating Hello PyTorch with Weights and Biases streaming capability from clients to server. + * :github_nvflare_link:`MONAI FLARE Integration Experiment Tracking ` - Example using FLARE and MONAI integration with experiment tracking streaming from clients to server. - 7. **Federated Statistics** + .. toctree:: + :maxdepth: 1 + :hidden: - * :ref:`Federated Statistic Overview ` - Discuss the overall federated statistics features - * :github_nvflare_link:`Federated Statistics for medical imaging (Github) ` - Example of gathering local image histogram to compute the global dataset histograms. - * :github_nvflare_link:`Federated Statistics for tabular data with DataFrame (Github) ` - Example of gathering local statistics summary from Pandas DataFrame to compute the global dataset statistics. - * :github_nvflare_link:`Federated Statistics with Monai Statistics integration for Spleen CT Image (Github) ` - Example demonstrated Monai statistics integration and few other features in federated statistics + examples/tensorboard_streaming + examples/fl_experiment_tracking_mlflow - 8. 
**Federated Site Policies** +11. Natural Language Processing (NLP) +====================================== - * :github_nvflare_link:`Federated Policies (Github) ` - Discuss the federated site policies for authorization, resource and data privacy management + * :github_nvflare_link:`NLP-NER (Github) ` - Illustrates both `BERT `_ and `GPT-2 `_ models from `Hugging Face `_ (`BERT-base-uncased `_, `GPT-2 `_) on a Named Entity Recognition (NER) task using the `NCBI disease dataset `_. - 9. **Experiment tracking** +12. FL Hierarchical Unification Bridge (HUB) +============================================ - * :ref:`FL Experiment Tracking with TensorBoard Streaming ` - Example building on Hello PyTorch with TensorBoard streaming from clients to server - * :ref:`FL Experiment Tracking with MLflow ` - Example integrating Hello PyTorch with MLflow with streaming from clients to server + * :github_nvflare_link:`FL HUB ` - Example for FL HUB allowing hierarchical interaction between several levels of FLARE FL systems. - 10. **NLP** +13. Federated Large Language Model (LLM) +======================================== - * :github_nvflare_link:`NLP-NER (Github) ` - Illustrates both `BERT `_ and `GPT-2 `_ models from `Hugging Face `_ (`BERT-base-uncased `_, `GPT-2 `_) on a Named Entity Recognition (NER) task using the `NCBI disease dataset `_. + * :github_nvflare_link:`Parameter Efficient Fine Turning ` - Example utilizing NeMo's PEFT methods to adapt a LLM to a downstream task. + * :github_nvflare_link:`Prompt-Tuning Example ` - Example for using FLARE with NeMo for prompt learning. + * :github_nvflare_link:`Supervised Fine Tuning (SFT) ` - Example to fine-tune all parameters of a LLM on supervised data. + * :github_nvflare_link:`LLM Tuning via HuggingFace SFT Trainer ` - Example for using FLARE with a HuggingFace trainer for LLM tuning tasks. + + +14. Graph Neural Network (GNN) +============================== + + * :github_nvflare_link:`Protein Classification ` - Example using GNNs for Protein Classification using `PPI `_ dataset using GraphSAGE. + * :github_nvflare_link:`Financial Transaction Classification ` - Example using GNNs for Financial Transaction Classification with `Elliptic++ `_ dataset using GraphSAGE. + +15. Financial Applications +========================== + + * :github_nvflare_link:`Financial Application with Federated XGBoost Methods ` Example using XGBoost in various ways to train a federated model to perform fraud detection with a finance dataset. -For the complete collection of example applications, see https://github.com/NVIDIA/NVFlare/tree/main/examples. Setting up a virtual environment for examples and notebooks =========================================================== @@ -162,60 +251,10 @@ Most hello-* examples use a custom folder within the FL application. Note that using a custom folder in the app needs to be :ref:`allowed ` when using secure provisioning. By default, this option is disabled in the secure mode. POC mode, however, will work with custom code by default. -In contrast, the :github_nvflare_link:`CIFAR-10 `, -:github_nvflare_link:`prostate segmentation `, -and :github_nvflare_link:`BraTS18 segmentation ` examples assume that the +In contrast, the :github_nvflare_link:`CIFAR-10 `, +:github_nvflare_link:`prostate segmentation `, +and :github_nvflare_link:`BraTS18 segmentation ` examples assume that the learner code is already installed on the client's system and available in the PYTHONPATH. Hence, the app folders do not include the custom code there. 
The PYTHONPATH is set in the ``run_poc.sh`` or ``run_secure.sh`` scripts of the example. Running these scripts as described in the README will make the learner code available to the clients. - - -.. _fl_algorithms: - -Federated Learning Algorithms -============================= - -Federated Averaging -------------------- -In NVIDIA FLARE, FedAvg is implemented through the :ref:`scatter_and_gather_workflow`. In the federated averaging workflow, -a set of initial weights is distributed to client workers who perform local training. After local training, clients -return their local weights as a Shareables that are aggregated (averaged). This new set of global average weights is -redistributed to clients and the process repeats for the specified number of rounds. - -FedProx -------- -`FedProx `_ implements a :class:`Loss function ` -to penalize a client's local weights based on deviation from the global model. An example configuration can be found in -cifar10_fedprox of the :github_nvflare_link:`CIFAR-10 example `. - -FedOpt ------- -`FedOpt `_ implements a :class:`ShareableGenerator ` -that can use a specified Optimizer and Learning Rate Scheduler when updating the global model. An example configuration -can be found in cifar10_fedopt of :github_nvflare_link:`CIFAR-10 example `. - -SCAFFOLD --------- -`SCAFFOLD `_ uses a slightly modified version of the CIFAR-10 Learner implementation, -namely the `CIFAR10ScaffoldLearner`, which adds a correction term during local training following the `implementation `_ -as described in `Li et al. `_ - -Ditto ------ -`Ditto `_ uses a slightly modified version of the prostate Learner implementation, -namely the `ProstateDittoLearner`, which decouples local personalized model from global model via an additional model -training and a controllable prox term. See the :github_nvflare_link:`prostate segmentation example ` -for an example with ditto in addition to FedProx, FedAvg, and centralized training. - -Federated XGBoost ------------------ - -* :github_nvflare_link:`Federated XGBoost (GitHub) ` - Includes examples of histogram-based and tree-based algorithms. Tree-based algorithms also includes bagging and cyclic approaches - -Federated Analytics -------------------- - -* :github_nvflare_link:`Federated Statistics for medical imaging (Github) ` - Example of gathering local image histogram to compute the global dataset histograms. -* :github_nvflare_link:`Federated Statistics for tabular data with DataFrame (Github) ` - Example of gathering local statistics summary from Pandas DataFrame to compute the global dataset statistics. -* :github_nvflare_link:`Federated Statistics with Monai Statistics integration for Spleen CT Image (Github) ` - Example demonstrated Monai statistics integration and few other features in federated statistics diff --git a/docs/examples/federated_statistics.rst b/docs/examples/federated_statistics.rst deleted file mode 100644 index 322c09e6f8..0000000000 --- a/docs/examples/federated_statistics.rst +++ /dev/null @@ -1,10 +0,0 @@ -******************** -Federated Statistics -******************** - -.. 
toctree:: - - federated_statistics_overview - Federated Statistics for medical imaging (Github) - Federated Statistics for tabular data with DataFrame (Github) - Federated Statistics with Monai Statistics integration for Spleen CT Image (Github) diff --git a/docs/examples/federated_statistics_overview.rst b/docs/examples/federated_statistics_overview.rst index 2ab9edcea1..48ccc0a9ed 100644 --- a/docs/examples/federated_statistics_overview.rst +++ b/docs/examples/federated_statistics_overview.rst @@ -57,16 +57,16 @@ The main steps are: * provide server side configuration to specify target statistics and their configurations and output location * implement the local statistics generator (statistics_spec) * provide client side configuration to specify data input location - * The detailed example instructions can be found in :github_nvflare_link:`Data frame statistics `_ + * The detailed example instructions can be found in :github_nvflare_link:`Data frame statistics ` COVID 19 Radiology Image Examples --------------------------------- The second example provided is an image histogram example. Different from the tabular data example, the image example shows the following: -* The :github_nvflare_link:`image_statistics.py ` only needs to calculate the count and histogram target statistics, then user only needs to provide the calculation count, failure_count and histogram functions. There is no need to implement other metrics functions (sum, mean,std_dev etc.) ( get_failure_count by default return 0 ) +* The :github_nvflare_link:`image_statistics.py ` only needs to calculate the count and histogram target statistics, so the user only needs to provide the count, failure_count, and histogram functions. There is no need to implement the other metrics functions (sum, mean, std_dev, etc.); get_failure_count returns 0 by default. * Each site's dataset contains several thousand images, and the local histogram is the aggregate histogram of all the image histograms. * The image files are large, so we can't load everything into memory and then calculate the statistics; we need to iterate through the files for each calculation. For a single feature, as in this example, this is fine. If there are multiple features, such as multiple channels, reloading the image into memory for each channel's histogram calculation is wasteful. -* Unlike :github_nvflare_link:`Data frame statistics `_, the histogram bin's global range is pre-defined by user [0, 256] where in Data frame statistics, besides "Age", all other features histogram global bin range is dynamically estimated based on local min/max values +* Unlike :github_nvflare_link:`Data frame statistics `, the histogram bin's global range is pre-defined by the user as [0, 256], whereas in Data frame statistics, besides "Age", all other features' histogram global bin ranges are dynamically estimated based on local min/max values Here are some of the image histograms (the underlying image files have only 1 channel) @@ -175,9 +175,5 @@ Some of the local statistics (such as count, failure count, sum etc.) can be cal Summary ======= We provided federated statistics operators that can easily aggregate and visualize the local statistics for different data sites and features. -We hope this feature will make it easier to perform federated data analysis. For more details, please look at :github_nvflare_link:`Federated Statistics (Github) ` +We hope this feature will make it easier to perform federated data analysis. 
For more details, please look at :github_nvflare_link:`Federated Statistics (Github) ` -Previous Versions of Federated XGBoost -------------------------------------- - - - `Federated XGBoost for 2.2 `_ diff --git a/docs/examples/fl_algorithms.rst b/docs/examples/fl_algorithms.rst index f2d2e85e82..10b1ecfea3 100644 --- a/docs/examples/fl_algorithms.rst +++ b/docs/examples/fl_algorithms.rst @@ -1,8 +1,64 @@ +.. _fl_algorithms: + ******************** FL Algorithms ******************** -.. toctree:: +Federated Averaging +------------------- +In NVIDIA FLARE, FedAvg is implemented through the :ref:`scatter_and_gather_workflow`. In the federated averaging workflow, +a set of initial weights is distributed to client workers who perform local training. After local training, clients +return their local weights as Shareables that are aggregated (averaged). This new set of global average weights is +redistributed to clients and the process repeats for the specified number of rounds. + +FedProx ------- `FedProx `_ implements a :class:`Loss function ` to penalize a client's local weights based on deviation from the global model. An example configuration can be found in cifar10_fedprox of the :github_nvflare_link:`CIFAR-10 example `. + +FedOpt ------ `FedOpt `_ implements a :class:`ShareableGenerator ` that can use a specified Optimizer and Learning Rate Scheduler when updating the global model. An example configuration can be found in cifar10_fedopt of :github_nvflare_link:`CIFAR-10 example `. + +SCAFFOLD -------- `SCAFFOLD `_ uses a slightly modified version of the CIFAR-10 Learner implementation, namely the `CIFAR10ScaffoldLearner`, which adds a correction term during local training following the `implementation `_ as described in `Li et al. `_. An example configuration can be found in cifar10_scaffold of :github_nvflare_link:`CIFAR-10 example `. + +Ditto ----- `Ditto `_ uses a slightly modified version of the prostate Learner implementation, namely the `ProstateDittoLearner`, which decouples the local personalized model from the global model via an additional model training and a controllable prox term. See the :github_nvflare_link:`prostate segmentation example ` for an example with Ditto in addition to FedProx, FedAvg, and centralized training. + +Federated XGBoost ----------------- + +NVFlare supports federated learning using the popular gradient boosting library XGBoost. It uses the XGBoost library with its federated plugin (xgboost version >= 1.7.0rc1) to perform the learning. + +Using XGBoost with NVFlare has the following benefits compared with running federated XGBoost directly: + +* The XGBoost instance's life-cycle is managed by NVFlare. Both the XGBoost client and server are started/stopped automatically by the NVFlare workflow. +* For histogram-based XGBoost, the federated server can be configured automatically with an auto-assigned port number. +* When mutual TLS is used, the certificates are managed by NVFlare using the existing provisioning process. +* No need to manually configure each instance. Instance-specific parameters like :code:`rank` are assigned automatically by the NVFlare controller. + +* :github_nvflare_link:`Federated Horizontal XGBoost (GitHub) ` - Includes examples of histogram-based and tree-based algorithms. Tree-based algorithms also include bagging and cyclic approaches. +* :github_nvflare_link:`Federated Vertical XGBoost (GitHub) ` - Example using Private Set Intersection and XGBoost on vertically split HIGGS data. 
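The linked examples ultimately wrap ordinary XGBoost training code. As a rough illustration (not taken from the examples; the file names and parameter values below are placeholders), the client-side call they build on looks like the following, with NVFlare supplying the federated server address, rank, and communicator setup before training starts:

.. code-block:: python

    import xgboost as xgb

    # Illustrative only: each site trains on its own local slice of the data.
    dtrain = xgb.DMatrix("local_higgs_train.csv?format=csv&label_column=0")
    dvalid = xgb.DMatrix("local_higgs_valid.csv?format=csv&label_column=0")

    params = {
        "objective": "binary:logistic",
        "eval_metric": "auc",
        "tree_method": "hist",  # histogram-based training
    }

    # In the federated examples, NVFlare starts the federated server, assigns each
    # site its rank, and configures the communicator before this call runs, so the
    # same xgb.train() call participates in joint training instead of local-only training.
    bst = xgb.train(params, dtrain, num_boost_round=100, evals=[(dvalid, "validation")])
    bst.save_model("local_model.json")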
+ +Federated Analytics +------------------- + +* :github_nvflare_link:`Federated Statistics for medical imaging (Github) ` - Example of gathering local image histogram to compute the global dataset histograms. +* :github_nvflare_link:`Federated Statistics for tabular data with DataFrame (Github) ` - Example of gathering local statistics summary from Pandas DataFrame to compute the global dataset statistics. +* :github_nvflare_link:`Federated Statistics with Monai Statistics integration for Spleen CT Image (Github) ` - Example demonstrated Monai statistics integration and few other features in federated statistics - Federated Learning with CIFAR-10 (GitHub) - xgboost diff --git a/docs/examples/fl_experiment_tracking_mlflow.rst b/docs/examples/fl_experiment_tracking_mlflow.rst index 14b6e36860..1000e899c7 100644 --- a/docs/examples/fl_experiment_tracking_mlflow.rst +++ b/docs/examples/fl_experiment_tracking_mlflow.rst @@ -51,12 +51,10 @@ with the appropriate path to the directory containing the "pt" directory with cu Adding MLflow Logging to Configurations ------------------------------------------------ -Inside the config folder there are two files, ``config_fed_client.json`` and ``config_fed_server.json``. +Inside the config folder there are two files, ``config_fed_client.conf`` and ``config_fed_server.conf``. -.. literalinclude:: ../../examples/advanced/experiment-tracking/mlflow/jobs/hello-pt-mlflow/app/config/config_fed_client.json - :language: json - :linenos: - :caption: config_fed_client.json +.. literalinclude:: ../../examples/advanced/experiment-tracking/mlflow/jobs/hello-pt-mlflow/app/config/config_fed_client.conf + :caption: config_fed_client.conf Take a look at the components section of the client config at line 24. The first component is the ``pt_learner`` which contains the initialization, training, and validation logic. @@ -69,10 +67,8 @@ within NVFlare with the information to track. Finally, :class:`ConvertToFedEvent` converts local events to federated events. This changes the event ``analytix_log_stats`` into a fed event ``fed.analytix_log_stats``, which will then be streamed from the clients to the server. -.. literalinclude:: ../../examples/advanced/experiment-tracking/mlflow/jobs/hello-pt-mlflow/app/config/config_fed_server.json - :language: json - :linenos: - :caption: config_fed_server.json +.. literalinclude:: ../../examples/advanced/experiment-tracking/mlflow/jobs/hello-pt-mlflow/app/config/config_fed_server.conf + :caption: config_fed_server.conf Under the component section in the server config, we have the :class:`MLflowReceiver`. This component receives diff --git a/docs/examples/hello_cross_val.rst b/docs/examples/hello_cross_val.rst index dc5e89b322..f8c62f82a5 100644 --- a/docs/examples/hello_cross_val.rst +++ b/docs/examples/hello_cross_val.rst @@ -110,36 +110,15 @@ and adding a ``random_epsilon`` before returning the results packaged with a DXO NVIDIA FLARE can be used with any data packaged inside a :ref:`Shareable ` object (subclasses ``dict``), and :ref:`DXO ` is recommended as a way to manage that data in a standard way. -Application Configuration -^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Inside the config folder there are two files, ``config_fed_client.json`` and ``config_fed_server.json``. - -.. 
literalinclude:: ../../examples/hello-world/hello-numpy-cross-val/jobs/hello-numpy-cross-val/app/config/config_fed_server.json - :language: json - :linenos: - :caption: config_fed_server.json - -The server now has a second workflow configured after Scatter and Gather, :class:`CrossSiteModelEval`. - -The components "model_locator" and "formatter" have been added to work with the cross site model evaluation workflow, -and the rest is the same as in :doc:`Hello Scatter and Gather `. - +Cross site validation! +---------------------- -.. literalinclude:: ../../examples/hello-world/hello-numpy-cross-val/jobs/hello-numpy-cross-val/app/config/config_fed_client.json - :language: json - :linenos: - :caption: config_fed_client.json +We can run it using NVFlare simulator -The client configuration now has more tasks and an additional Executor ``NPValidator`` configured to handle the "validate" task. -The "submit_model" task has been added to the ``NPTrainer`` Executor to work with the :class:`CrossSiteModelEval` -workflow to get the client models. +.. code-block:: bash -Cross site validation! ----------------------- + python3 job_train_and_cse.py -.. |ExampleApp| replace:: hello-numpy-cross-val -.. include:: run_fl_system.rst During the first phase, the model will be trained. @@ -154,23 +133,18 @@ This can produce a lot of results. All the results will be kept in the job's wor Understanding the Output ^^^^^^^^^^^^^^^^^^^^^^^^ -After starting the server and clients, you should begin to see -some outputs in each terminal tracking the progress of the FL run. -As each client finishes training, it will start the cross site validation process. -During this you'll see several important outputs the track the progress of cross site validation. +You can find the running logs and results inside the simulator's workspace: -The server shows the log of each client requesting models, the models it sends and the results received. -Since the server could be responding to many clients at the same time, it may -require careful examination to make proper sense of events from the jumbled logs. +.. code-block:: bash + ls /tmp/nvflare/jobs/workdir/ + server/ site-1/ site-2/ startup/ -.. include:: access_result.rst -.. note:: - You could see the cross-site validation results - at ``[DOWNLOAD_DIR]/[JOB_ID]/workspace/cross_site_val/cross_val_results.json`` +The cross site validation results: -.. include:: shutdown_fl_system.rst +.. code-block:: bash + cat /tmp/nvflare/jobs/workdir/server/simulate_job/cross_site_val/cross_val_results.json Congratulations! @@ -186,3 +160,4 @@ Previous Versions of Hello Cross-Site Validation - `hello-numpy-cross-val for 2.1 `_ - `hello-numpy-cross-val for 2.2 `_ - `hello-numpy-cross-val for 2.3 `_ + - `hello-numpy-cross-val for 2.4 `_ diff --git a/docs/examples/hello_fedavg_numpy.rst b/docs/examples/hello_fedavg_numpy.rst new file mode 100644 index 0000000000..ead5e8490b --- /dev/null +++ b/docs/examples/hello_fedavg_numpy.rst @@ -0,0 +1,191 @@ +.. _hello_fedavg_numpy: + +Hello FedAvg with NumPy +======================= + +Before You Start +---------------- + +Before jumping into this guide, make sure you have an environment with +`NVIDIA FLARE `_ installed. + +You can follow :ref:`getting_started` on the general concept of setting up a +Python virtual environment (the recommended environment) and how to install NVIDIA FLARE. + + +Introduction +------------- + +This tutorial is meant solely to demonstrate how the NVIDIA FLARE system works, without introducing any actual deep +learning concepts. 
+ +Through this exercise, you will learn how to use NVIDIA FLARE with numpy to perform basic +computations across two clients with the included :class:`FedAvg` workflow, +which sends the model to the clients then aggregates the results that come back. + +Due to the simplified weights, you will be able to clearly see and understand +the results of the FL aggregation and the model persistor process. + +The setup of this exercise consists of one **server** and two **clients**. +The model is set to the starting weights ``[[1, 2, 3], [4, 5, 6], [7, 8, 9]]``. + +The following steps compose one cycle of weight updates, called a **round**: + + #. Clients are responsible for adding a delta to the weights to calculate new weights for the model. + #. These updates are then sent to the server which will aggregate them to produce a model with new weights. + #. Finally, the server sends this updated version of the model back to each client, so the clients can continue to calculate the next model weights in future rounds. + +For this exercise, we will be working with the ``hello-fedavg-numpy`` in the examples folder. + +Let's get started. First clone the repo, if you haven't already: + +.. code-block:: shell + + $ git clone https://github.com/NVIDIA/NVFlare.git + +Remember to activate your NVIDIA FLARE Python virtual environment from the installation guide. +Ensure numpy is installed. + +.. code-block:: shell + + (nvflare-env) $ python3 -m pip install numpy + +Now that you have all your dependencies installed, let's look into the ``fedavg_script_executor_hello-numpy.py`` script which +builds the job with the Job API. + + +NVIDIA FLARE Job API +-------------------- + +The ``fedavg_script_executor_hello-numpy.py`` script builds the job with the Job API. The following sections are the key lines to focus on: + +Define a FedJob +^^^^^^^^^^^^^^^^ +:class:`FedJob` allows you to generate job configurations in a Pythonic way. It is initialized with the +name for the job, which will also be used as the directory name if the job is exported. + +.. code-block:: python + + from nvflare import FedAvg, FedJob, ScriptExecutor + + job = FedJob(name="hello-fedavg-numpy") + +Define the Controller Workflow +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Define the controller workflow and send to server. We use :class:`FedAvg` and specify the number of +clients and rounds, then use the :func:`to` routine to send the component to the server for the job. + +.. code-block:: python + + n_clients = 2 + num_rounds = 3 + + controller = FedAvg( + num_clients=n_clients, + num_rounds=num_rounds, + ) + job.to(controller, "server") + +Add Clients +^^^^^^^^^^^^ +Next, we can use the :class:`ScriptExecutor` and send it to each of the +clients to run our training script. We will examine the training script ``hello-numpy_fl.py`` in the next main section. + +The :func:`to` routine sends the component to the specified client for the job. Here, our clients +are named "site-0" and "site-1" and we are using the same training script for both. + +.. 
code-block:: python + + from nvflare.client.config import ExchangeFormat + + train_script = "src/hello-numpy_fl.py" + + for i in range(n_clients): + executor = ScriptExecutor( + task_script_path=train_script, task_script_args="", params_exchange_format=ExchangeFormat.NUMPY + ) + job.to(executor, f"site-{i}") + + +Optionally Export the Job or Run in Simulator +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +With all the components needed for the job, you can export the job to a directory with :func:`export` +if you want to look at what is built and configured for each client. You can use the exported job to submit it to a real NVFlare deployment +using the :ref:`FLARE Console ` or :ref:`flare_api`. + +.. code-block:: python + + job.export_job("/tmp/nvflare/jobs/job_config") + +This is optional if you just want to run the job in a simulator environment directly, as :class:`FedJob` has +a :func:`simulator_run` function. + +.. code-block:: python + + job.simulator_run("/tmp/nvflare/jobs/workdir") + +The results are saved in the specified directory provided as an argument to the :func:`simulator_run` function. + + +NVIDIA FLARE Client Training Script +------------------------------------ +The training script ``hello-numpy_fl.py`` is the main script that will be run on the clients. It contains print statements to +help you follow the output while the FL system is running. + +On the client side, the training workflow is as follows: + + 1. Receive the model from the FL server (for this example we initialize the model in the client code to the numpy array [[1, 2, 3], [4, 5, 6], [7, 8, 9]] if the model params are empty). + 2. Perform training on the received global model and calculate metrics. + 3. Send the new model back to the FL server. + +Using NVFlare's Client API, there are three essential methods to help achieve this workflow: + + - `init()`: Initializes NVFlare Client API environment. + - `receive()`: Receives model from the FL server. + - `send()`: Sends the model to the FL server. + +The following code snippet highlights how these methods are used in the training script: + +.. code-block:: python + + import nvflare.client as flare + + flare.init() # 1. Initializes NVFlare Client API environment. + input_model = flare.receive() # 2. Receives model from the FL server. + params = input_model.params # 3. Obtain the required information from the received model. + + # original local training code + new_params = train(params) + + output_model = flare.FLModel(params=new_params) # 4. Put the results in a new `FLModel` + flare.send(output_model) # 5. Sends the model to the FL server. + +This has been simplified to ignore dealing with data formats to focus on the NVFlare Client API, but you can find the full training +script ``hello-numpy_fl.py`` in the ``src`` directory of :github_nvflare_link:`examples/hello-world/hello-fedavg-numpy `. + + +Running the Job API Script +--------------------------- +Now that you have a good understanding of the training script, you can run the job with the ``fedavg_script_executor_hello-numpy.py`` script: + +.. code-block:: shell + + (nvflare-env) $ python3 fedavg_script_executor_hello-numpy.py + +This will run the job in a simulator environment and you should be able to see the output as the job proceeds to completion. + +You've successfully run your first numpy federated learning system. + +You now have a decent grasp of the main FL concepts, and are ready to start exploring how NVIDIA FLARE can be applied to many other tasks. 
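For reference, the snippets above combine into a complete job script along these lines (a consolidated sketch of what ``fedavg_script_executor_hello-numpy.py`` does; see the example source for the authoritative version):

.. code-block:: python

    from nvflare import FedAvg, FedJob, ScriptExecutor
    from nvflare.client.config import ExchangeFormat

    n_clients = 2
    num_rounds = 3
    train_script = "src/hello-numpy_fl.py"

    job = FedJob(name="hello-fedavg-numpy")

    # Server side: the FedAvg controller workflow
    job.to(FedAvg(num_clients=n_clients, num_rounds=num_rounds), "server")

    # Client side: the same training script on every site
    for i in range(n_clients):
        executor = ScriptExecutor(
            task_script_path=train_script,
            task_script_args="",
            params_exchange_format=ExchangeFormat.NUMPY,
        )
        job.to(executor, f"site-{i}")

    job.export_job("/tmp/nvflare/jobs/job_config")  # optional: inspect the generated configs
    job.simulator_run("/tmp/nvflare/jobs/workdir")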
+ +The full application for this exercise can be found in +:github_nvflare_link:`examples/hello-world/hello-fedavg-numpy `. + +Previous Versions of this Example (previously Hello Scatter and Gather) +----------------------------------------------------------------------- + + - `hello-numpy-sag for 2.0 `_ + - `hello-numpy-sag for 2.1 `_ + - `hello-numpy-sag for 2.2 `_ + - `hello-numpy-sag for 2.3 `_ + - `hello-numpy-sag for 2.4 `_ diff --git a/docs/examples/hello_pt.rst b/docs/examples/hello_pt.rst deleted file mode 100644 index c9ef27d1c2..0000000000 --- a/docs/examples/hello_pt.rst +++ /dev/null @@ -1,243 +0,0 @@ -.. _hello_pt: - -Hello PyTorch -============= - -Before You Start ----------------- - -Feel free to refer to the :doc:`detailed documentation <../programming_guide>` at any point -to learn more about the specifics of `NVIDIA FLARE `_. - -Make sure you have an environment with NVIDIA FLARE installed. - -You can follow :ref:`getting_started` on the general concept of setting up a -Python virtual environment (the recommended environment) and how to install NVIDIA FLARE. - - -Introduction -------------- - -Through this exercise, you will integrate NVIDIA FLARE with the popular -deep learning framework `PyTorch `_ and learn how to use NVIDIA FLARE to train a convolutional -network with the CIFAR10 dataset using the included Scatter and Gather workflow. - -The setup of this exercise consists of one **server** and two **clients**. - -The following steps compose one cycle of weight updates, called a **round**: - - #. Clients are responsible for generating individual weight-updates for the model using their own CIFAR10 dataset. - #. These updates are then sent to the server which will aggregate them to produce a model with new weights. - #. Finally, the server sends this updated version of the model back to each client. - -For this exercise, we will be working with the ``hello-pt`` application in the examples folder. -Custom FL applications can contain the folders: - - #. **custom**: contains the custom components (``simple_network.py``, ``cifar10trainer.py``) - #. **config**: contains client and server configurations (``config_fed_client.json``, ``config_fed_server.json``) - #. **resources**: contains the logger config (``log.config``) - -Now that you have a rough idea of what is going on, let's get started. First clone the repo: - -.. code-block:: shell - - $ git clone https://github.com/NVIDIA/NVFlare.git - -Now remember to activate your NVIDIA FLARE Python virtual environment from the installation guide. - -Since you will use PyTorch and torchvision for this exercise, let's go ahead and install both libraries: - -.. code-block:: shell - - (nvflare-env) $ python3 -m pip install torch torchvision - - -.. note:: - - There is a pending fix related to Pillow, PyTorch==1.9 and Numpy. If you see exception related to - ``enumerate(self.train_loader)``, downgrade your Pillow to 8.2.0. - - .. code-block:: shell - - (nvflare-env) $ python3 -m pip install torch torchvision Pillow==8.2.0 - -If you would like to go ahead and run the exercise now, you can skip directly to :ref:`hands-on`. - -NVIDIA FLARE Client -------------------- - -Neural Network -^^^^^^^^^^^^^^^ - -With all the required dependencies installed, you are ready to run a Federated Learning -with two clients and one server. The training procedure and network -architecture are modified from -`Training a Classifier `_. - - -Let's see what an extremely simplified CIFAR10 training looks like: - -.. 
literalinclude:: ../../examples/hello-world/hello-pt/jobs/hello-pt/app/custom/simple_network.py - :language: python - :caption: simple_network.py - -This ``SimpleNetwork`` class is your convolutional neural network to train with the CIFAR10 dataset. -This is not related to NVIDIA FLARE, so we implement it in a file called ``simple_network.py``. - -Dataset & Setup -^^^^^^^^^^^^^^^^ - -Now implement the custom class ``Cifar10Trainer`` as an NVIDIA FLARE Executor in a file -called ``cifar10trainer.py``. - -In a real FL experiment, each client would have their own dataset used for their local training. -For simplicity's sake, you can download the same CIFAR10 dataset from the Internet via torchvision's datasets module. -Additionally, you need to set up the optimizer, loss function and transform to process the data. -You can think of all of this code as part of your local training loop, as every deep learning training has a similar setup. - -Since you will encapsulate every training-related step in the ``Cifar10Trainer`` class, -let's put this preparation stage into the ``__init__`` method: - -.. literalinclude:: ../../examples/hello-world/hello-pt/jobs/hello-pt/app/custom/cifar10trainer.py - :language: python - - -Local Train -^^^^^^^^^^^ - -Now that you have your network and dataset setup, in the ``Cifar10Trainer`` class. -Let's also implement a local training loop in a method called ``local_train``: - -.. literalinclude:: ../../examples/hello-world/hello-pt/jobs/hello-pt/app/custom/cifar10trainer.py - :language: python - :pyobject: Cifar10Trainer._local_train - - -.. note:: - - Everything up to this point is completely independent of NVIDIA FLARE. It is just purely a PyTorch - deep learning exercise. You will now build the NVIDIA FLARE application based on this PyTorch code. - - -Integrate NVIDIA FLARE with Local Train -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -NVIDIA FLARE makes it easy to integrate your local train code into the NVIDIA FLARE API. - -The simplest way to do this is to subclass the ``Executor`` class and -implement one method ``execute``, which is called every time the client receives -an updated model from the server with the task "train" (the server will broadcast the "train" task in the Scatter and -Gather workflow we will configure below). -We can then call our local train inside the ``execute`` method. - -.. note:: - - The ``execute`` method inside the ``Executor`` class is where all of the client side computation occurs. - In these exercises, we update the weights by training on a local dataset, however, it is important to remember that NVIDIA FLARE is not restricted to just deep learning. - The type of data passed between the server and the clients, and the computations that the clients perform can be anything, as long as all of the FL Components agree on the same format. - -Take a look at the following code: - -.. literalinclude:: ../../examples/hello-world/hello-pt/jobs/hello-pt/app/custom/cifar10trainer.py - :language: python - :pyobject: Cifar10Trainer.execute - -The concept of ``Shareable`` is described in :ref:`shareable `. -Essentially, every NVIDIA FLARE client receives the model weights from the server in ``shareable`` format. -It is then passed into the ``execute`` method, and returns a new ``shareable`` back to the server. -The data is managed by using DXO (see :ref:`data_exchange_object` for details). 
- -Thus, the first thing is to retrieve the model weights delivered by server via ``shareable``, and this can be seen in -the first part of the code block above before ``local_train`` is called. - -We then perform a local train so the client's model is trained with its own dataset. - -After finishing the local train, the train method builds a new ``shareable`` with newly-trained weights -and metadata and returns it back to the NVIDIA FLARE server for aggregation. - -There is additional logic to handle the "submit_model" task, but that is for the CrossSiteModelEval workflow, -so we will be addressing that in a later example. - -FLContext -^^^^^^^^^ - -The ``FLContext`` is used to set and retrieve FL related information among the FL components via ``set_prop()`` and -``get_prop()`` as well as get services provided by the underlying infrastructure. You can find more details in the -:ref:`documentation `. - -NVIDIA FLARE Server & Application ---------------------------------- - -In this exercise, you can use the default settings, which leverage NVIDIA FLARE built-in components for NVIDIA FLARE server. - -These built-in components are commonly used in most deep learning scenarios. - -However, you are encouraged to build your own components to fully customize NVIDIA FLARE to meet your environment, - which we will demonstrate in the following exercises. - - -Application Configuration -^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Inside the config folder there are two files, ``config_fed_client.json`` and ``config_fed_server.json``. - -.. literalinclude:: ../../examples/hello-world/hello-pt/jobs/hello-pt/app/config/config_fed_client.json - :language: json - :linenos: - :caption: config_fed_client.json - -Take a look at line 8. - -This is the ``Cifar10Trainer`` you just implemented. - -The NVIDIA FLARE client loads this application configuration and picks your implementation. - -You can easily change it to another class so your NVIDIA FLARE client has different training logic. - -The tasks "train" and "submit_model" have been configured to work with the ``Cifar10Trainer`` Executor. -The "validate" task for ``Cifar10Validator`` and the "submit_model" task are used for the ``CrossSiteModelEval`` workflow, -so we will be addressing that in a later example. - - -.. literalinclude:: ../../examples/hello-world/hello-pt/jobs/hello-pt/app/config/config_fed_server.json - :language: json - :linenos: - :caption: config_fed_server.json - -The server application configuration, like said before, leverages NVIDIA FLARE built-in components. -Remember, you are encouraged to change them to your own classes whenever you have different application logic. - -Note that on line 12, ``persistor`` points to ``PTFileModelPersistor``. -NVIDIA FLARE provides a built-in PyTorch implementation for a model persistor, -however for other frameworks/libraries, you will have to implement your own. - -The Scatter and Gather workflow is implemented by :class:`ScatterAndGather` -and is configured to make use of the components with id "aggregator", "persistor", and "shareable_generator". -The workflow code is all open source now, so feel free to study and use it as inspiration -to write your own workflows to support your needs. - -.. _hands-on: - -Train the Model, Federated! ---------------------------- - -.. |ExampleApp| replace:: hello-pt -.. include:: run_fl_system.rst - -.. include:: access_result.rst - -.. include:: shutdown_fl_system.rst - -Congratulations! -You've successfully built and run your first federated learning system. 
- -The full source code for this exercise can be found in -:github_nvflare_link:`examples/hello-pt `. - -Previous Versions of Hello PyTorch ----------------------------------- - - - `hello-pt for 2.0 `_ - - `hello-pt for 2.1 `_ - - `hello-pt for 2.2 `_ - - `hello-pt for 2.3 `_ diff --git a/docs/examples/hello_pt_job_api.rst b/docs/examples/hello_pt_job_api.rst new file mode 100644 index 0000000000..810fc43d6a --- /dev/null +++ b/docs/examples/hello_pt_job_api.rst @@ -0,0 +1,263 @@ +.. _hello_pt_job_api: + +Hello PyTorch with Job API +========================== + +Before You Start +---------------- + +Feel free to refer to the :doc:`detailed documentation <../programming_guide>` at any point +to learn more about the specifics of `NVIDIA FLARE `_. + +We recommend you first finish the :doc:`Hello FedAvg with NumPy ` exercise since it introduces the +federated learning concepts of `NVIDIA FLARE `_. + +Make sure you have an environment with NVIDIA FLARE installed. + +You can follow :ref:`getting_started` on the general concept of setting up a +Python virtual environment (the recommended environment) and how to install NVIDIA FLARE. + +Introduction +------------- + +Through this exercise, you will integrate NVIDIA FLARE with the popular +deep learning framework `PyTorch `_ and learn how to use NVIDIA FLARE to train a convolutional +network with the CIFAR10 dataset using the included :class:`FedAvg` workflow. + +The setup of this exercise consists of one **server** and two **clients**. + +The following steps compose one cycle of weight updates, called a **round**: + + #. Clients are responsible for generating individual weight-updates for the model using their own CIFAR10 dataset. + #. These updates are then sent to the server which will aggregate them to produce a model with new weights. + #. Finally, the server sends this updated version of the model back to each client. + +For this exercise, we will be working with the ``hello-pt`` application in the examples folder. + +Let's get started. First clone the repo: + +.. code-block:: shell + + $ git clone https://github.com/NVIDIA/NVFlare.git + +Remember to activate your NVIDIA FLARE Python virtual environment from the installation guide. + +Since you will use PyTorch and torchvision for this exercise, let's go ahead and install both libraries: + +.. code-block:: shell + + (nvflare-env) $ python3 -m pip install torch torchvision + +If you would like to go ahead and run the exercise now, you can run the ``fedavg_script_executor_hello-pt.py`` script which +builds the job with the Job API and runs the job with the FLARE Simulator. + +NVIDIA FLARE Job API +-------------------- + +The ``fedavg_script_executor_hello-pt.py`` script for this hello-pt example is very similar to the ``fedavg_script_executor_hello-numpy.py`` script +for the :doc:`Hello FedAvg with NumPy ` exercise. Other than changes to the names of the job and client script, the only difference +is a line to define the initial global model for the server: + +.. code-block:: python + + # Define the initial global model and send to server + job.to(SimpleNetwork(), "server") + + +NVIDIA FLARE Client Training Script +------------------------------------ +The training script for this example, ``hello-pt_cifar10_fl.py``, is the main script that will be run on the clients. It contains the PyTorch specific +logic for training. + +Neural Network +^^^^^^^^^^^^^^^ + +The training procedure and network architecture are modified from +`Training a Classifier `_. 
+ +Let's see what an extremely simplified CIFAR10 training looks like: + +.. literalinclude:: ../../examples/hello-world/hello-pt/src/simple_network.py + :language: python + :caption: simple_network.py + +This ``SimpleNetwork`` class is your convolutional neural network to train with the CIFAR10 dataset. +This is not related to NVIDIA FLARE, so we implement it in a file called ``simple_network.py``. + +Dataset & Setup +^^^^^^^^^^^^^^^^ + +In a real FL experiment, each client would have their own dataset used for their local training. +You can download the CIFAR10 dataset from the Internet via torchvision's datasets module, so for simplicity's sake, this is +the dataset we will be using on each client. +Additionally, you need to set up the optimizer, loss function and transform to process the data. +You can think of all of this code as part of your local training loop, as every deep learning training has a similar setup. + +In the ``hello-pt_cifar10_fl.py`` script, we take care of all of this setup before the ``flare.init()``. + +Local Train +^^^^^^^^^^^ + +Now with the network and dataset setup, let's also implement the local training loop with the NVFlare's Client API: + +.. code-block:: python + + flare.init() + + summary_writer = SummaryWriter() + while flare.is_running(): + input_model = flare.receive() + + model.load_state_dict(input_model.params) + + steps = epochs * len(train_loader) + for epoch in range(epochs): + running_loss = 0.0 + for i, batch in enumerate(train_loader): + images, labels = batch[0].to(device), batch[1].to(device) + optimizer.zero_grad() + + predictions = model(images) + cost = loss(predictions, labels) + cost.backward() + optimizer.step() + + running_loss += cost.cpu().detach().numpy() / images.size()[0] + + output_model = flare.FLModel(params=model.cpu().state_dict(), meta={"NUM_STEPS_CURRENT_ROUND": steps}) + + flare.send(output_model) + + +The code above is simplified from the ``hello-pt_cifar10_fl.py`` script to focus on the three essential methods of the NVFlare's Client API to +achieve the training workflow: + + - `init()`: Initializes NVFlare Client API environment. + - `receive()`: Receives model from the FL server. + - `send()`: Sends the model to the FL server. + +NVIDIA FLARE Server & Application +--------------------------------- +In this example, the server runs :class:`FedAvg` with the default settings. + +If you export the job with the :func:`export` function, you will see the +configurations for the server and each client. The server configuration is ``config_fed_server.json`` in the config folder +in app_server: + +.. 
code-block:: json + + { + "format_version": 2, + "workflows": [ + { + "id": "controller", + "path": "nvflare.app_common.workflows.fedavg.FedAvg", + "args": { + "num_clients": 2, + "num_rounds": 2 + } + } + ], + "components": [ + { + "id": "json_generator", + "path": "nvflare.app_common.widgets.validation_json_generator.ValidationJsonGenerator", + "args": {} + }, + { + "id": "model_selector", + "path": "nvflare.app_common.widgets.intime_model_selector.IntimeModelSelector", + "args": { + "aggregation_weights": {}, + "key_metric": "accuracy" + } + }, + { + "id": "receiver", + "path": "nvflare.app_opt.tracking.tb.tb_receiver.TBAnalyticsReceiver", + "args": { + "events": [ + "fed.analytix_log_stats" + ] + } + }, + { + "id": "persistor", + "path": "nvflare.app_opt.pt.file_model_persistor.PTFileModelPersistor", + "args": { + "model": { + "path": "src.simple_network.SimpleNetwork", + "args": {} + } + } + }, + { + "id": "model_locator", + "path": "nvflare.app_opt.pt.file_model_locator.PTFileModelLocator", + "args": { + "pt_persistor_id": "persistor" + } + } + ], + "task_data_filters": [], + "task_result_filters": [] + } + +This is automatically created by the Job API. The server application configuration leverages NVIDIA FLARE built-in components. + +Note that ``persistor`` points to ``PTFileModelPersistor``. This is automatically configured when the model SimpleNetwork is added +to the server with the :func:`to` function. The Job API detects that the model is a PyTorch model +and automatically configures :class:`PTFileModelPersistor` +and :class:`PTFileModelLocator`. + + +Client Configuration +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The client configuration is ``config_fed_client.json`` in the config folder of each client app folder: + +.. code-block:: json + + { + "format_version": 2, + "executors": [ + { + "tasks": [ + "*" + ], + "executor": { + "path": "nvflare.app_common.executors.script_executor.ScriptExecutor", + "args": { + "task_script_path": "src/hello-pt_cifar10_fl.py" + } + } + } + ], + "components": [ + { + "id": "event_to_fed", + "path": "nvflare.app_common.widgets.convert_to_fed_event.ConvertToFedEvent", + "args": { + "events_to_convert": [ + "analytix_log_stats" + ] + } + } + ], + "task_data_filters": [], + "task_result_filters": [] + } + +The ``task_script_path`` is set to the path of the client training script. + +The full source code for this exercise can be found in +:github_nvflare_link:`examples/hello-world/hello-pt `. + +Previous Versions of Hello PyTorch +---------------------------------- + + - `hello-pt for 2.0 `_ + - `hello-pt for 2.1 `_ + - `hello-pt for 2.2 `_ + - `hello-pt for 2.3 `_ + - `hello-pt for 2.4 `_ diff --git a/docs/examples/hello_scatter_and_gather.rst b/docs/examples/hello_scatter_and_gather.rst index 14e29168c3..da5d3a2e5d 100644 --- a/docs/examples/hello_scatter_and_gather.rst +++ b/docs/examples/hello_scatter_and_gather.rst @@ -163,7 +163,7 @@ You've successfully built and run your first numpy federated learning system. You now have a decent grasp of the main FL concepts, and are ready to start exploring how NVIDIA FLARE can be applied to many other tasks. The full application for this exercise can be found in -:github_nvflare_link:`examples/hello-numpy-sag `, +:github_nvflare_link:`examples/hello-world/hello-numpy-sag `, with the client and server components implemented in the :github_nvflare_link:`nvflare/app-common/np ` folder of the NVFlare code tree. 
Previous Versions of Hello Scatter and Gather diff --git a/docs/examples/hello_tf2.rst b/docs/examples/hello_tf2.rst deleted file mode 100644 index fec2c1e477..0000000000 --- a/docs/examples/hello_tf2.rst +++ /dev/null @@ -1,261 +0,0 @@ -.. _hello_tf2: - -Hello TensorFlow 2 -================== - -Before You Start ----------------- - -We recommend you first finish either the :doc:`hello_pt` or the :doc:`hello_scatter_and_gather` exercise. - -Those guides go more in depth in explaining the federated learning aspect of `NVIDIA FLARE `_. - -Here we assume you have already installed NVIDIA FLARE inside a python virtual environment -and have already cloned the repo. - -Introduction -------------- - -Through this exercise, you will integrate NVIDIA FLARE with the popular deep learning framework -`TensorFlow 2 `_ and learn how to use NVIDIA FLARE to train a convolutional -network with the MNIST dataset using the Scatter and Gather workflow. -You will also be introduced to some new components and concepts, including filters, aggregators, and event handlers. - -The setup of this exercise consists of one **server** and two **clients**. - -The following steps compose one cycle of weight updates, called a **round**: - - #. Clients are responsible for generating individual weight-updates for the model using their own MNIST dataset. - #. These updates are then sent to the server which will aggregate them to produce a model with new weights. - #. Finally, the server sends this updated version of the model back to each client. - -For this exercise, we will be working with the ``hello-tf2`` application in the examples folder. -Custom FL applications can contain the folders: - - #. **custom**: contains the custom components (``tf2_net.py``, ``trainer.py``, ``filter.py``, ``tf2_model_persistor.py``) - #. **config**: contains client and server configurations (``config_fed_client.json``, ``config_fed_server.json``) - #. **resources**: contains the logger config (``log.config``) - -Let's get started. -Since this task is using TensorFlow, let's go ahead and install the library inside our virtual environment: - -.. code-block:: shell - - (nvflare-env) $ python3 -m pip install tensorflow - - -NVIDIA FLARE Client -------------------- - -Neural Network -^^^^^^^^^^^^^^^ - -With all the required dependencies installed, you are ready to run a Federated Learning system -with two clients and one server. - -Before you start, let's see what a simplified MNIST network looks like. - -.. literalinclude:: ../../examples/hello-world/hello-tf2/jobs/hello-tf2/app/custom/tf2_net.py - :language: python - :lines: 15- - :lineno-start: 15 - :linenos: - :caption: tf2_net.py - -This ``Net`` class is the convolutional neural network to train with MNIST dataset. -This is not related to NVIDIA FLARE, so implement it in a file called ``tf2_net.py``. - -Dataset & Setup -^^^^^^^^^^^^^^^^ - -Now you have to implement the class ``Trainer``, which is a subclass of ``Executor`` in NVIDIA FLARE, -in a file called ``trainer.py``. - -Before you can really start a training, you need to set up your dataset. -In this exercise, you can download it from the Internet via ``tf.keras``'s datasets module, -and split it in half to create a separate dataset for each client. -Additionally, you must setup the optimizer, loss function and transform to process the data. - -Since every step will be encapsulated in the ``SimpleTrainer`` class, -let's put this preparation stage into one method ``setup``: - -.. 
literalinclude:: ../../examples/hello-world/hello-tf2/jobs/hello-tf2/app/custom/trainer.py - :language: python - :lines: 41-71 - :lineno-start: 41 - :linenos: - - -How can you ensure this setup method is called before the client receives the model from the server? - -The Trainer class is also a :ref:`FLComponent `, which always receives ``Event`` whenever -NVIDIA FLARE enters or leaves a certain stage. - -In this case, there is an ``Event`` called ``EventType.START_RUN`` which perfectly matches these requirements. -Because our trainer is a subclass of ``FLComponent``, you can implement the handler to handle the event and call the setup method: - -.. literalinclude:: ../../examples/hello-world/hello-tf2/jobs/hello-tf2/app/custom/trainer.py - :language: python - :lines: 37-39 - :lineno-start: 37 - :linenos: - -.. note:: - - This is a new concept you haven't learned in previous two exercises. - - The concepts of ``event`` and ``handler`` are very powerful because you are free to - add your logic so it can run at different time and process various events. - - The entire list of events fired by NVIDIA FLARE is shown at :ref:`Event types `. - - -You have everything you need, now let's implement the last method called ``execute``, which is -called every time the client receives an updated model from the server with the Task we will configure. - - -Link NVIDIA FLARE with Local Train -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Take a look at the following code: - -.. literalinclude:: ../../examples/hello-world/hello-tf2/jobs/hello-tf2/app/custom/trainer.py - :language: python - :pyobject: SimpleTrainer.execute - -Every NVIDIA FLARE client receives the model weights from the server in the :ref:`shareable `. -This application uses the ``exclude_var`` filter, so make sure to replace the missing layer with weights from the clients' previous training round: - -.. literalinclude:: ../../examples/hello-world/hello-tf2/jobs/hello-tf2/app/custom/trainer.py - :language: python - :lines: 111-115 - :lineno-start: 111 - :linenos: - -Now update the local model with those received weights: - -.. literalinclude:: ../../examples/hello-world/hello-tf2/jobs/hello-tf2/app/custom/trainer.py - :language: python - :lines: 118 - :lineno-start: 118 - :linenos: - -Then perform a simple :code:`self.model.fit` so the client's model is trained with its own dataset: - -.. literalinclude:: ../../examples/hello-world/hello-tf2/jobs/hello-tf2/app/custom/trainer.py - :language: python - :lines: 122-127 - :lineno-start: 122 - :linenos: - -After finishing the local train, the train method uses the newly-trained weights to build a new ``DXO`` to update the -``Shareable`` with and then returns it back to the NVIDIA FLARE server. - - -NVIDIA FLARE Server & Application ---------------------------------- - -Filter -^^^^^^^ - -:ref:`filter ` can be used for additional data processing in the ``Shareable``, for both -inbound and outbound data from the client and/or server. - -For this exercise, we use a basic ``exclude_var`` filter to exclude the variable/layer ``flatten`` from the task result -as it goes outbound from the client to the server. The excluded layer is replaced with all zeros of the same shape, -which reduces compression size and ensures that the clients' weights for this variable are not shared with the server. - -.. 
literalinclude:: ../../examples/hello-world/hello-tf2/jobs/hello-tf2/app/custom/filter.py - :language: python - :lines: 15- - :lineno-start: 15 - :linenos: - :caption: filter.py - -The filtering procedure occurs in the one required method, process, which receives and returns a shareable. -The parameters for what is excluded and the inbound/outbound option are all set in ``config_fed_client.json`` -(shown later below) and passed in through the constructor. - - -Model Aggregator -^^^^^^^^^^^^^^^^ - -The :ref:`model aggregator ` is used by the server to aggregate the clients' models into one model -within the Scatter and Gather workflow. - -In this exercise, we perform a simple average over the two clients' weights with the -:class:`InTimeAccumulateWeightedAggregator` -and configure for it to be used in ``config_fed_server.json`` (shown later below). - -Model Persistor -^^^^^^^^^^^^^^^ - -The model persistor is used to load and save models on the server. - -.. literalinclude:: ../../examples/hello-world/hello-tf2/jobs/hello-tf2/app/custom/tf2_model_persistor.py - :language: python - :lines: 15- - :lineno-start: 15 - :linenos: - :caption: tf2_model_persistor.py - -In this exercise, we simply serialize the model weights dictionary using pickle and -save it to a log directory calculated in initialize. -The file is saved on the FL server and the weights file name is defined in ``config_fed_server.json``. -Depending on the frameworks and tools, the methods of saving the model may vary. - -FLContext is used throughout these functions to provide various useful FL-related information. -You can find more details in the :ref:`documentation `. - -Application Configuration -^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Finally, inside the config folder there are two files, ``config_fed_client.json`` and ``config_fed_server.json``. - -.. literalinclude:: ../../examples/hello-world/hello-tf2/jobs/hello-tf2/app/config/config_fed_server.json - :language: json - :linenos: - :caption: config_fed_server.json - - -Note how the :class:`ScatterAndGather` workflow is -configured to use the included ``aggregator`` :class:`InTimeAccumulateWeightedAggregator` -and ``shareable_generator`` :class:`FullModelShareableGenerator`. -The ``persistor`` is configured to use ``TF2ModelPersistor`` in the custom directory of this hello_tf2 app with full -Python module paths. - - -.. literalinclude:: ../../examples/hello-world/hello-tf2/jobs/hello-tf2/app/config/config_fed_client.json - :language: json - :linenos: - :caption: config_fed_client.json - - -Here, ``executors`` is configured with the Trainer implementation ``SimpleTrainer``. -Also, we set up ``filter.ExcludeVars`` as a ``task_result_filters`` and pass in ``["flatten"]`` as the argument. -Both of these are configured for the only Task that will be broadcast in the Scatter and Gather workflow, "train". - -Train the Model, Federated! ---------------------------- - -.. |ExampleApp| replace:: hello-tf2 -.. include:: run_fl_system.rst - -.. include:: access_result.rst - -.. include:: shutdown_fl_system.rst - -Congratulations! - -You've successfully built and run a federated learning system using TensorFlow 2. - -The full source code for this exercise can be found in -:github_nvflare_link:`examples/hello-tf2 `. 
- -Previous Versions of Hello TensorFlow 2 ---------------------------------------- - - - `hello-tf2 for 2.0 `_ - - `hello-tf2 for 2.1 `_ - - `hello-tf2 for 2.2 `_ - - `hello-tf2 for 2.3 `_ diff --git a/docs/examples/hello_tf_job_api.rst b/docs/examples/hello_tf_job_api.rst new file mode 100644 index 0000000000..9b7b456ce3 --- /dev/null +++ b/docs/examples/hello_tf_job_api.rst @@ -0,0 +1,223 @@ +.. _hello_tf_job_api: + +Hello TensorFlow with Job API +============================== + +Before You Start +---------------- +Feel free to refer to the :doc:`detailed documentation <../programming_guide>` at any point +to learn more about the specifics of `NVIDIA FLARE `_. + +We recommend you first finish the :doc:`Hello FedAvg with NumPy ` exercise since it introduces the +federated learning concepts of `NVIDIA FLARE `_. + +Make sure you have an environment with NVIDIA FLARE installed. + +You can follow :ref:`getting_started` on the general concept of setting up a +Python virtual environment (the recommended environment) and how to install NVIDIA FLARE. + +Here we assume you have already installed NVIDIA FLARE inside a python virtual environment +and have already cloned the repo. + +Introduction +------------- +Through this exercise, you will integrate NVIDIA FLARE with the popular deep learning framework +`TensorFlow `_ and learn how to use NVIDIA FLARE to train a convolutional +network with the MNIST dataset using the :class:`FedAvg` workflow. + +You will also be introduced to some new components and concepts, including filters, aggregators, and event handlers. + +The setup of this exercise consists of one **server** and two **clients**. + +The following steps compose one cycle of weight updates, called a **round**: + + #. Clients are responsible for generating individual weight-updates for the model using their own MNIST dataset. + #. These updates are then sent to the server which will aggregate them to produce a model with new weights. + #. Finally, the server sends this updated version of the model back to each client. + +For this exercise, we will be working with the ``hello-tf`` application in the examples folder. + +Let's get started. Since this task is using TensorFlow, let's go ahead and install the library inside our virtual environment: + +.. code-block:: shell + + (nvflare-env) $ python3 -m pip install tensorflow + +With all the required dependencies installed, you are ready to run a Federated Learning system +with two clients and one server. If you would like to go ahead and run the exercise now, you can run +the ``fedavg_script_executor_hello-tf.py`` script which builds the job with the Job API and runs the +job with the FLARE Simulator. + +NVIDIA FLARE Job API +-------------------- +The ``fedavg_script_executor_hello-tf.py`` script for this hello-tf example is very similar to the ``fedavg_script_executor_hello-numpy.py`` script +for the :doc:`Hello FedAvg with NumPy ` example and also the script for the :doc:`Hello PyTorch ` +example. Other than changes to the names of the job and client script, the only difference is the line to define the initial global model +for the server: + +.. code-block:: python + + # Define the initial global model and send to server + job.to(TFNet(), "server") + + +NVIDIA FLARE Client Training Script +------------------------------------ +The training script for this example, ``hello-tf_fl.py``, is the main script that will be run on the clients. It contains the TensorFlow specific +logic for training. 
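Before walking through the individual pieces, the sketch below shows roughly how such a client script is structured with the Client API. It is an illustrative skeleton rather than the verbatim ``hello-tf_fl.py``: the inline stand-in network, the layer-name keying of the exchanged weights, and the single local epoch are simplifying assumptions.

.. code-block:: python

    import tensorflow as tf

    import nvflare.client as flare

    # Stand-in for the TFNet defined in src/tf_net.py, kept inline so the sketch is self-contained.
    model = tf.keras.Sequential(
        [
            tf.keras.layers.Flatten(input_shape=(28, 28), name="flatten"),
            tf.keras.layers.Dense(128, activation="relu", name="dense_1"),
            tf.keras.layers.Dense(10, name="dense_2"),
        ]
    )
    model.compile(
        optimizer="adam",
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=["accuracy"],
    )

    (train_images, train_labels), _ = tf.keras.datasets.mnist.load_data()
    train_images = train_images / 255.0

    flare.init()  # initialize the Client API

    while flare.is_running():
        input_model = flare.receive()  # FLModel carrying the current global weights

        # Load the received weights into the local model (keyed by layer name in this sketch).
        for name, weights in input_model.params.items():
            model.get_layer(name).set_weights(weights)

        model.fit(train_images, train_labels, epochs=1)  # local training on this client's data

        # Send the newly trained weights back to the server.
        params = {layer.name: layer.get_weights() for layer in model.layers if layer.get_weights()}
        flare.send(flare.FLModel(params=params))

The following sections look at the actual example code for each of these steps.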
+ +Neural Network +^^^^^^^^^^^^^^^ +Let's see what a simplified MNIST network looks like. + +.. literalinclude:: ../../examples/hello-world/hello-tf/src/tf_net.py + :language: python + :lines: 15- + :lineno-start: 15 + :linenos: + :caption: tf_net.py + +This ``TFNet`` class is the convolutional neural network to train with MNIST dataset. +This is not related to NVIDIA FLARE, and it is implemented in a file called ``tf_net.py``. + +Dataset & Setup +^^^^^^^^^^^^^^^^ +Before starting training, you need to set up your dataset. +In this exercise, it is downloaded from the Internet via ``tf.keras``'s datasets module +and split in half to create a separate dataset for each client. Note that this is just for an example since in a real-world scenario, +you will likely have different datasets for each client. + +Additionally, the optimizer and loss function need to be configured. + +All of this happens before the ``while flare.is_running():`` line in ``hello-tf_fl.py``. + +.. literalinclude:: ../../examples/hello-world/hello-tf/src/hello-tf_fl.py + :language: python + :lines: 29-57 + :lineno-start: 29 + :linenos: + :caption: hello-tf_fl.py + +Client Local Train +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +The client code gets the weights from the input_model received from the server then performs a simple :code:`self.model.fit` +so the client's model is trained with its own dataset: + +.. literalinclude:: ../../examples/hello-world/hello-tf/src/hello-tf_fl.py + :language: python + :lines: 58-91 + :lineno-start: 58 + :linenos: + +After finishing the local training, the newly-trained weights are sent back to the NVIDIA FLARE server in the params of +:mod:`FLModel`. + + +NVIDIA FLARE Server & Application +--------------------------------- +In this example, the server runs :class:`FedAvg` with the default settings. + +If you export the job with the :func:`export` function, you will see the +configurations for the server and each client. The server configuration is ``config_fed_server.json`` in the config folder +in app_server: + +.. code-block:: json + + { + "format_version": 2, + "workflows": [ + { + "id": "controller", + "path": "nvflare.app_common.workflows.fedavg.FedAvg", + "args": { + "num_clients": 2, + "num_rounds": 3 + } + } + ], + "components": [ + { + "id": "json_generator", + "path": "nvflare.app_common.widgets.validation_json_generator.ValidationJsonGenerator", + "args": {} + }, + { + "id": "model_selector", + "path": "nvflare.app_common.widgets.intime_model_selector.IntimeModelSelector", + "args": { + "aggregation_weights": {}, + "key_metric": "accuracy" + } + }, + { + "id": "persistor", + "path": "nvflare.app_opt.tf.model_persistor.TFModelPersistor", + "args": { + "model": { + "path": "src.tf_net.TFNet", + "args": {} + } + } + } + ], + "task_data_filters": [], + "task_result_filters": [] + } + +This is automatically created by the Job API. The server application configuration leverages NVIDIA FLARE built-in components. + +Note that ``persistor`` points to ``TFModelPersistor``. This is automatically configured when the model is added +to the server with the :func:`to` function. The Job API detects that the model is a TensorFlow model +and automatically configures :class:`TFModelPersistor`. + + +Client Configuration +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The client configuration is ``config_fed_client.json`` in the config folder of each client app folder: + +.. 
code-block:: json + + { + "format_version": 2, + "executors": [ + { + "tasks": [ + "*" + ], + "executor": { + "path": "nvflare.app_common.executors.script_executor.ScriptExecutor", + "args": { + "task_script_path": "src/hello-tf_fl.py" + } + } + } + ], + "components": [ + { + "id": "event_to_fed", + "path": "nvflare.app_common.widgets.convert_to_fed_event.ConvertToFedEvent", + "args": { + "events_to_convert": [ + "analytix_log_stats" + ] + } + } + ], + "task_data_filters": [], + "task_result_filters": [] + } + +The ``task_script_path`` is set to the path of the client training script. + +The full source code for this exercise can be found in +:github_nvflare_link:`examples/hello-tf `. + +Previous Versions of Hello TensorFlow (previously Hello TensorFlow 2) +--------------------------------------------------------------------- + + - `hello-tf2 for 2.0 `_ + - `hello-tf2 for 2.1 `_ + - `hello-tf2 for 2.2 `_ + - `hello-tf2 for 2.3 `_ + - `hello-tf2 for 2.4 `_ diff --git a/docs/examples/hello_world_examples.rst b/docs/examples/hello_world_examples.rst index 780c154340..4a2b855dc4 100644 --- a/docs/examples/hello_world_examples.rst +++ b/docs/examples/hello_world_examples.rst @@ -5,11 +5,11 @@ Hello World examples can be run from the :github_nvflare_link:`hello_world noteb .. toctree:: - Deep Learning to Federated Learning (GitHub) - Step-by-Step Examples (GitHub) - hello_scatter_and_gather + :github_nvflare_link:`Deep Learning to Federated Learning (GitHub) ` + :github_nvflare_link:`Step-by-Step Examples (GitHub) ` + hello_fedavg_numpy hello_cross_val - Hello Cyclic Weight Transfer (GitHub) - hello_pt - hello_tf2 - Hello Client Controlled Workflow (GitHub) + :github_nvflare_link:`Hello Cyclic Weight Transfer (GitHub) ` + hello_pt_job_api + hello_tf_job_api + :github_nvflare_link:`Hello Client Controlled Workflow (GitHub) ` diff --git a/docs/examples/medical_image_analysis.rst b/docs/examples/medical_image_analysis.rst index 6f7470b594..91485498f8 100644 --- a/docs/examples/medical_image_analysis.rst +++ b/docs/examples/medical_image_analysis.rst @@ -4,6 +4,6 @@ Medical Image Analysis .. toctree:: - Hello MONAI Bundle (GitHub) - Differential Privacy for BraTS18 Segmentation (GitHub) - Prostate Segmentation from Multi-source Data (GitHub) + github_nvflare_link:`Hello MONAI Bundle (GitHub) ` + github_nvflare_link:`Differential Privacy for BraTS18 Segmentation (GitHub) ` + github_nvflare_link:`Prostate Segmentation from Multi-source Data (GitHub) ` diff --git a/docs/examples/tensorboard_streaming.rst b/docs/examples/tensorboard_streaming.rst index 2e195ea09d..30c9993738 100644 --- a/docs/examples/tensorboard_streaming.rst +++ b/docs/examples/tensorboard_streaming.rst @@ -10,13 +10,13 @@ In this exercise, you will learn how to stream TensorBoard events from the clien to the server in order to visualize live training metrics from a central place on the server. This exercise will be working with the ``tensorboard`` example in the advanced examples folder under experiment-tracking, -which builds upon :doc:`hello_pt` by adding TensorBoard streaming. +which builds upon :doc:`hello_pt_job_api` by adding TensorBoard streaming. The setup of this exercise consists of one **server** and two **clients**. .. note:: - This exercise differs from :doc:`hello_pt`, as it uses the ``Learner`` API along with the ``LearnerExecutor``. + This exercise differs from :doc:`hello_pt_job_api`, as it uses the ``Learner`` API along with the ``LearnerExecutor``. 
In short, the execution flow is abstracted away into the ``LearnerExecutor``, allowing you to only need to implement the required methods in the ``Learner`` class. This will not be the focus of this guide, however you can learn more at :class:`Learner` and :class:`LearnerExecutor`. diff --git a/docs/examples/traditional_ml_examples.rst b/docs/examples/traditional_ml_examples.rst deleted file mode 100644 index 9cd2c758f5..0000000000 --- a/docs/examples/traditional_ml_examples.rst +++ /dev/null @@ -1,10 +0,0 @@ -*********************** -Traditional ML Examples -*********************** - -.. toctree:: - - Federated Linear Model with Scikit-learn (GitHub) - Federated K-Means Clustering with Scikit-learn (GitHub) - Federated SVM with Scikit-learn (GitHub) - Federated Learning for Random Forest based on XGBoost (GitHub) diff --git a/docs/examples/tutorial_notebooks.rst b/docs/examples/tutorial_notebooks.rst index b6fb68a997..be16d31714 100644 --- a/docs/examples/tutorial_notebooks.rst +++ b/docs/examples/tutorial_notebooks.rst @@ -4,6 +4,6 @@ Tutorial Notebooks .. toctree:: - FL Simulator Notebook (GitHub) - Hello FLARE API Notbook (GitHub) - NVFLARE in POC Mode (GitHub) + :github_nvflare_link:`FL Simulator Notebook (GitHub) ` + :github_nvflare_link:`Hello FLARE API Notebook (GitHub) ` + :github_nvflare_link:`NVFLARE POC Mode in detail Notebook (GitHub) ` diff --git a/docs/examples/xgboost.rst b/docs/examples/xgboost.rst deleted file mode 100644 index 71c7d6edc7..0000000000 --- a/docs/examples/xgboost.rst +++ /dev/null @@ -1,33 +0,0 @@ -.. _federated_xgboost: - -Federated XGBoost -================= - -Overview --------- - -NVFlare supports federated learning using popular gradient boosting library XGBoost. -It uses XGBoost library with federated plugin (xgboost version >= 1.7.0rc1) to perform the learning. - -Using XGBoost with NVFlare has following benefits compared with running federated XGBoost directly, - -* XGBoost instance's life-cycle is managed by NVFlare. Both XGBoost client and server - are started/stopped automatically by NVFlare workflow. -* For histogram-based XGBoost federated server can be configured automatically with auto-assigned port number. -* When mutual TLS is used, the certificates are managed by NVFlare using existing - provisioning process. -* No need to manually configure each instance. Instance specific parameters - like code:`rank` are assigned automatically by the NVFlare controller. - -Examples --------- - -Basic components to run XGBoost are already included with NVFlare distribution. -Most XGBoost jobs can be created without custom code. - -Please refer to :code:`NVFlare/examples/advanced/xgboost` for more details. - -Previous Versions of Federated XGBoost --------------------------------------- - - - `Federated XGBoost for 2.2 `_ diff --git a/docs/fl_introduction.rst b/docs/fl_introduction.rst new file mode 100644 index 0000000000..04cb9a9cd5 --- /dev/null +++ b/docs/fl_introduction.rst @@ -0,0 +1,64 @@ +.. _fl_introduction: + +########################### +What is Federated Learning? +########################### + +Federated Learning is a distributed learning paradigm where training occurs across multiple clients, each with their own local datasets. +This enables the creation of common robust models without sharing sensitive local data, helping solve issues of data privacy and security. + +How does Federated Learning Work? 
+=================================
+The federated learning (FL) server orchestrates the collaboration of multiple clients by first sending an initial model to the FL clients.
+The clients perform training on their local datasets, then send the model updates back to the FL server for aggregation to form a global model.
+This process forms a single round of federated learning, and after a number of rounds, a robust global model can be developed.
+
+.. image:: resources/fl_diagram.png
+    :height: 500px
+    :align: center
+
+FL Terms and Definitions
+========================
+
+- FL server: manages job lifecycle, orchestrates workflow, assigns tasks to clients, performs aggregation
+- FL client: executes tasks, performs local computation/learning with local dataset, submits result back to FL server
+- FL algorithms: FedAvg, FedOpt, FedProx, etc., implemented as workflows
+
+.. note::
+
+   Here we describe the centralized version of FL, where the FL server has the role of the aggregator node. However, in a decentralized version such as
+   swarm learning, FL clients can serve as the aggregator node instead.
+
+- Types of FL
+
+  - horizontal FL: clients hold different data samples over the same features
+  - vertical FL: clients hold different features over an overlapping set of data samples
+  - swarm learning: a decentralized subset of FL where orchestration and aggregation are performed by the clients
+
+Main Benefits
+=============
+
+Enhanced Data Privacy and Security
+----------------------------------
+Federated learning facilitates data privacy and data locality by ensuring that the data remains at each site.
+Additionally, privacy preserving techniques such as homomorphic encryption and differential privacy filters can also be leveraged to further protect the transferred data.
+
+Improved Accuracy and Diversity
+-------------------------------
+By training with a variety of data sources across different clients, a robust and generalizable global model can be developed to better represent heterogeneous datasets.
+
+Scalability and Network Efficiency
+----------------------------------
+With the ability to perform training at the edge, federated learning can be highly scalable across the globe.
+Additionally, transferring only the model weights rather than entire datasets enables efficient use of network resources.
+
+Applications
+============
+An important application of federated learning is in the healthcare sector, where data privacy regulations and patient record confidentiality make training models challenging.
+Federated learning can help break down these healthcare data silos to allow hospitals and medical institutions to collaborate and pool their medical knowledge without the need to share their data.
+Some common use cases involve classification and detection tasks, drug discovery with federated protein LLMs, and federated analytics on medical devices.
+
+Furthermore, there are many other areas and industries such as financial fraud detection, autonomous vehicles, HPC, mobile applications, etc.
+where the ability to use distributed data silos while maintaining data privacy is essential for the development of better models.
+
+Read on to learn how FLARE is built as a flexible federated computing framework to enable federated learning from research to production.
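Before moving on, here is a minimal, framework-agnostic sketch that makes the round structure described above concrete. This is illustrative NumPy pseudocode rather than FLARE API code; the toy ``local_train`` step and the sample-count weighting are assumptions for demonstration only.

.. code-block:: python

    import numpy as np

    def local_train(global_weights, local_data):
        # Stand-in for a client's local training; in practice this would run
        # several epochs of SGD on the client's own dataset.
        return global_weights + 0.1 * np.random.randn(*global_weights.shape), len(local_data)

    def fedavg_round(global_weights, client_datasets):
        # One federated round: every client trains locally, then the server
        # averages the returned weights, weighted by each client's sample count.
        updates, sizes = [], []
        for data in client_datasets:
            weights, n_samples = local_train(global_weights, data)
            updates.append(weights)
            sizes.append(n_samples)
        total = sum(sizes)
        return sum(w * (n / total) for w, n in zip(updates, sizes))

    global_model = np.zeros(10)  # initial global model sent by the server
    client_data = [np.random.randn(100, 10), np.random.randn(200, 10)]  # two clients' local datasets
    for _ in range(3):  # a few rounds of federated averaging
        global_model = fedavg_round(global_model, client_data)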
\ No newline at end of file diff --git a/docs/flare_overview.rst b/docs/flare_overview.rst index e183d7fce3..15eaafa8d3 100644 --- a/docs/flare_overview.rst +++ b/docs/flare_overview.rst @@ -5,105 +5,142 @@ NVIDIA FLARE Overview ##################### **NVIDIA FLARE** (NVIDIA Federated Learning Application Runtime Environment) is a domain-agnostic, open-source, -extensible SDK that allows researchers and data scientists to adapt existing ML/DL workflow to a federated paradigm. +extensible SDK that allows researchers, data scientists and data engineers to adapt existing ML/DL and compute workflows to a federated paradigm. +With the FLARE platform, developers can create a secure and privacy-preserving solution for decentralized data computing, facilitating distributed multi-party collaboration. -With Nvidia FLARE platform developers can build a secure, privacy preserving offering -for a distributed multi-party collaboration. +Key Features +============ -NVIDIA FLARE SDK is built for robust, production scale for real-world federated learning deployments. +Federated Computing +------------------- -It includes: +At its core, FLARE serves as a federated computing framework, with applications such as Federated Learning and Federated Analytics built upon this foundation. +Notably, it is agnostic to datasets, workloads, and domains. In contrast to centralized data lake solutions that necessitate copying data to a central location, FLARE brings computing capabilities directly to distributed datasets. +This approach ensures that data remains within the compute node, with only pre-approved, selected results shared among collaborators. +Moreover, FLARE is system agnostic, offering easy integration with various data processing frameworks through the implementation of the FLARE client. +This client facilitates deployment in sub-processes, Docker containers, Kubernetes pods, HPC, or specialized systems. - * A runtime environment enabling data scientists and researchers to easily carry out FL experiments in a - real-world scenario. Nvidia FLARE supports multiple task execution, maximizing data scientist's productivity. - - * System capabilities to start up federated learning with high availability infrastructure. - - * Built-in implementations of: +Built for productivity +---------------------- - * Federated training workflows (scatter-and-gather, Cyclic) - * Federated evaluation workflows (global model evaluation, cross site model validation); - * Learning algorithms (FedAvg, FedOpt, FedProx) - * Privacy preserving algorithms (homomorphic encryption, differential privacy) +FLARE is designed for maximum productivity, providing a range of tools to enhance user experience and research efficiency at different stages of the development process: - * Extensible management tools for: +- **FLARE Client API:** Enables users to transition seamlessly from ML/DL to FL with just a few lines of code changes. +- **Simulator CLI:** Allows users to simulate federated learning or computing jobs in multi-process settings within a single computer, offering quick response and debugging. The same job can be deployed directly to production. +- **POC CLI:** Facilitates the simulation of federated learning or computing jobs in multi-process settings within one computer. Different processes represent server, clients, and an admin console, providing users with a realistic sense of the federated network. It also allows users to simulate project deployment on a single host. 
+- **Job CLI:** Permits users to create and submit jobs directly in POC or production environments. +- **FLARE API:** Enables users to run jobs directly from Python code or notebooks. +- **FLARE Dashboard:** Allows users to set up, approve, and distribute deployment artifacts among collaborators. +- **Preflight Check Tool:** Enables users to verify if the federated system is correctly set up before running any jobs. +- **Cloud Deployment CLI:** Allows users to start and deploy FLARE on cloud service providers (AWS or Azure) with a single CLI command. +- **ML Experiment Tracking Support:** Enables users to log to TensorBoard, MLFlow, and Weights & Biases for comprehensive experiment tracking. - * Secure provisioning (TLS certificates) - * Orchestration (Admin Console) | (Admin APIs) - * Monitoring of federated learning experiments (Aux APIs; Tensorboard visualization) - - * A rich set of programmable APIs allowing researchers to create new federated workflows, - learning & privacy preserving algorithms. +Built for security & privacy +---------------------------- +FLARE prioritizes robust security and privacy preservation: -High-level System Architecture -============================== -As outlined above, NVIDIA FLARE includes components that allow researchers and developers to build and deploy -end-to-end federated learning applications. +- **Secure Provisioning:** Utilizes TLS certificates to ensure a secure environment. +- **Event-based Security Plugin Mechanism:** Enables local, user-defined authentication and authorization for enhanced security. +- **Authorization Policy Control:** Empowers local entities to control authorization policies within the federated framework. +- **Data and Result Filter Mechanism:** Enhances data protection through a filter mechanism. +- **Audit Logs:** Provides audit logs for increased transparency and accountability. +- **Federated Learning Algorithms:** Incorporates advanced algorithms for privacy preservation, including Differential Privacy, Homomorphic Encryption, and Multi-party Private Set Intersection (PSI). -The high-level architecture is shown in the diagram below. +Built for concurrency & scalability +----------------------------------- -This includes the foundational components of the NVIDIA FLARE API and tools for privacy preservation and -secure management of the platform. +FLARE is designed for optimal concurrency, supporting resource-based multi-job execution when the necessary resources are available. This concurrent run capability enhances the efficiency of job execution within the framework. +Additionally, for setups involving devices across multiple regions, FLARE offers FL HUB (Hierarchical Unified Bridge) features. These features enable the establishment of a tiered federated learning system, enhancing flexibility and scalability in multi-region configurations. -On top of this foundation are the building blocks for federated learning applications, -with a set of federation workflows and learning algorithms. +Built for customization +----------------------- -Alongside this central stack are tools that allow experimentation and proof-of-concept development -with the FL Simulator (POC mode), along with a set of tools used to deploy and manage production workflows. +FLARE is structured in layers, with each layer composed of customizable components. This modular design ensures that every layer is easily pluggable, allowing for seamless customization. -.. 
image:: resources/FL_stack.jpg - :height: 300px +Rich examples repository +------------------------ +FLARE provides a wealth of built-in implementations for various federated learning workflows, along with numerous examples, showcasing diverse algorithms. Examples include: +- **Federated training workflows** + + - Server-side controlled flow: scatter-and-gather, cyclic-weight transfer, federated evaluation, cross-site-model evaluation + - Client-side controlled flow: cyclic-weight transfer, swarm-learning, cross-site-model evaluation + - Split Learning + +- **Learning algorithms** (FedAvg, FedOpt, FedProx, Scaffold, Ditto, FedSM, Fed AutoRL etc.) +- **Privacy preserving algorithms** (homomorphic encryption, differential privacy) +- **Federated Learning Examples** + + - Large Language Model + - Medical Image Analysis + - Federated Statistics + - Traditional Machine Learning (scikit-learn, linear model, SVM, Kmeans, Random Forest) + - Federated XGBoost (horizontal and vertical) + - NLP + - GNN + - Federated Multi-Party PSI + +- **Feature Tutorials** + + - Simulator, FLARE API, POC mode, Job CLI + - ML-to-FL FLARE Client API + - Step-by-step progressive series + +Built for integration +--------------------- + +FLARE offers multiple integration options with third-party systems, with the Flare Agent providing a seamless and straightforward approach. +This allows FLARE to collaborate effortlessly with third-party systems, irrespective of the programming languages and technologies they are built upon. + +Built for production +-------------------- + +The FLARE SDK is designed for robust, production-scale deployment in real-world federated learning and computing scenarios. +It has found applications in various industries, including healthcare, financial services, and self-driving cars. +FLARE has been successfully deployed in both cloud and on-premise environments. + +High-level System Architecture +============================== + +As detailed above, FLARE incorporates components that empower researchers and developers to construct and deploy end-to-end federated learning applications. +The high-level architecture, depicted in the diagram below, encompasses the foundational layer of the FLARE communication, messaging streaming layers, and tools dedicated to privacy preservation and secure platform management. +Atop this foundation lie the building blocks for federated learning applications, featuring a suite of federation workflows and learning algorithms. +Adjacent to this central stack are tools facilitating experimentation and simulation with the FL Simulator and POC CLI, complemented by a set of tools designed for the deployment and management of production workflows. + +.. image:: resources/flare_overview.png + :height: 500px Design Principles ================= -* Keep it simple - less is more -* Design to specification -* Build for real-world scenarios -* Keep the system general-purpose -* Client system friendly +- Less is more +- Design to specification +- Build for real-world scenarios +- Keep the system general-purpose +- Client system friendly **Less is more** - -We strive to solve unique challenges by doing less while enabling others to do more. -We can't solve whole worlds' problems, but by building an open platform we can enable -others to solve world's problems. - -This design principle means we intentionally limit the scope of the implementation, -only building the necessary components. 
For a given implementation, we follow specifications -in a way that allows others to easily customize and extend. - +We strive to solve unique challenges by doing less while enabling others to do more. +We can't solve whole worlds' problems, but by building an open platform we can enable others to solve world's problems. +This design principle means we intentionally limit the scope of the implementation, only building the necessary components. +For a given implementation, we follow specifications in a way that allows others to easily customize and extend. **Design to Specification** +Every component and API is specification-based, so that alternative implementations can be constructed by following the spec. +This allows pretty much every component to be customized. +We strive to be open-minded in reference implementations, encouraging developers and end-users to extend and customize to meet the needs of their specific workflows. -Every component and API is specification-based, so that alternative implementations can be -constructed by following the spec. This allows pretty much every component to be customized. - -We strive to be open-minded in reference implementations, encouraging developers and end-users -to extend and customize to meet the needs of their specific workflows. - - -**Build for real-world scenarios** - -We build to handle real-world use cases where unexpected events or misbehaving code can be -handled in a way that allows components or the system as a whole to fail gracefully. -The reference implementations of the default components are designed to solve real-world -problems in a straightforward way. - +**Built for real-world scenarios** +We build to handle real-world use cases where unexpected events or misbehaving code can be handled in a way that allows components or the system as a whole to fail gracefully. +The reference implementations of the default components are designed to solve real-world problems in a straightforward way. **Keep the system general-purpose** - -We design ths system to be general purpose, to enable different "federated" computing use cases. +We design the system to be general purpose, to enable different “federated” computing use cases. We carefully package the components into different layers with minimal dependencies between layers. -In this way, implementations for specific use cases should not demand modifications to the -underlying system core. - +In this way, implementations for specific use cases should not demand modifications to the underlying system core. **Client system friendly** - We design the system so that it can run anywhere with minimal environmental dependencies. -We also strive to build the system in a way that does not interfere with the deployment environment, -allowing FLARE to be easily integrated into your own applications or platforms. +We also strive to build the system in a way that does not interfere with the deployment environment, allowing FLARE to be easily integrated into your own applications or platforms. diff --git a/docs/getting_started.rst b/docs/getting_started.rst index 1abb87c58d..9cb8d7dca7 100644 --- a/docs/getting_started.rst +++ b/docs/getting_started.rst @@ -4,381 +4,4 @@ Getting Started ############### -.. _quickstart: - -Getting Started: Quick Start -============================ - -Install NVFLARE ---------------- - -.. code-block:: shell - - $ python3 -m pip install nvflare - -Clone NVFLARE repo to get examples, switch main branch (latest stable branch) - -.. 
code-block:: shell - - $ git clone https://github.com/NVIDIA/NVFlare.git - $ cd NVFlare - $ git switch main - - -Note on branches: - -* The `main `_ branch is the default (unstable) development branch - -* The 2.0, 2.1, 2.2, and 2.3 etc. branches are the branches for each major release and minor patches - - -Quick Start with Simulator --------------------------- -Making sure the NVFLARE environment is set up correctly following :ref:`installation`, you can run an example application with :ref:`starting_fl_simulator` -using the following script: - -.. code-block:: shell - - nvflare simulator -w /tmp/nvflare/hello-numpy-sag -n 2 -t 2 examples/hello-world/hello-numpy-sag/jobs/hello-numpy-sag - -Now you can watch the simulator run two clients (n=2) with two threads (t=2) -and logs are saved in the `/tmp/nvflare/hello-numpy-sag` workspace. - -Getting Started Guide -===================== - -This Getting Started guide is geared towards new users of NVIDIA FLARE and walks through installation, the FL Simulator, -and a simple "hello world" application. - -Once you're familiar with the platform, the :ref:`Example Applications ` are a great next step. -These examples introduce some of the key concepts of the platform and showcase the integration of popular libraries -and frameworks like Numpy, Pytorch, Tensorflow, and MONAI. - -Any FLARE application used with the FL Simulator can also be run in a real-world, distributed FL deployment. -The :ref:`Real-World FL ` section describes some of the considerations and tools used for -establishing a secure, distributed FL workflow. - -.. _installation: - -Installation -============= - -Python Version --------------- - -NVIDIA FLARE requires Python 3.8+. - -Install NVIDIA FLARE in a virtual environment ---------------------------------------------- - -It is highly recommended to install NVIDIA FLARE in a virtual environment if you are not using :ref:`containerized_deployment`. -This guide briefly describes how to create a virtual environment with venv. - -Virtual Environments and Packages -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Python's official document explains the main idea about virtual environments. -The module used to create and manage virtual environments is called `venv `_. -You can find more information there. We only describe a few necessary steps for a virtual environment for NVIDIA FLARE. - - -Depending on your OS and the Python distribution, you may need to install the Python's venv package separately. For example, in Ubuntu -20.04, you need to run the following commands to continue creating a virtual environment with venv. Note that in newer versions of Ubuntu, -you may need to make sure you are using Python 3.8 and not a newer version. - -.. code-block:: shell - - $ sudo apt update - $ sudo apt-get install python3-venv - - -Once venv is installed, you can use it to create a virtual environment with: - -.. code-block:: shell - - $ python3 -m venv nvflare-env - -This will create the ``nvflare-env`` directory in current working directory if it doesn't exist, -and also create directories inside it containing a copy of the Python interpreter, -the standard library, and various supporting files. - - -Activate the virtualenv by running the following command: - -.. code-block:: shell - - $ source nvflare-env/bin/activate - - -You may find that the pip and setuptools versions in the venv need updating: - -.. 
code-block:: shell - - (nvflare-env) $ python3 -m pip install -U pip - (nvflare-env) $ python3 -m pip install -U setuptools - - -Install Stable Release ----------------------- - -Stable releases are available on `NVIDIA FLARE PyPI `_: - -.. code-block:: shell - - $ python3 -m pip install nvflare - - -.. _containerized_deployment: - -Containerized Deployment with Docker -==================================== - -Running NVIDIA FLARE in a Docker container is sometimes a convenient way to ensure a -uniform OS and software environment across client and server systems. This can be used -as an alternative to the bare-metal Python virtual environment described above and will -use a similar installation to simplify transitioning between a bare metal and containerized -environment. - -To get started with a containerized deployment, you will first need to install a supported -container runtime and the NVIDIA Container Toolkit to enable support for GPUs. System requirements -and instructions for this can be found in the `NVIDIA Container Toolkit Install Guide `_. - -A simple Dockerfile is used to capture the base requirements and dependencies. In -this case, we're building an environment that will support PyTorch-based workflows, -in particular the :github_nvflare_link:`Hello PyTorch ` -example. The base for this build is the NGC PyTorch container. On this base image, -we will install the necessary dependencies and clone the NVIDIA FLARE GitHub -source code into the root workspace directory. - -Let's first create a folder called ``build`` and then create a file inside named ``Dockerfile``: - -.. code-block:: shell - - mkdir build - cd build - touch Dockerfile - -Using any text editor to edit the Dockerfile and paste the following: - -.. literalinclude:: resources/Dockerfile - :language: dockerfile - -We can then build the new container by running docker build in the directory containing -this Dockerfile, for example tagging it nvflare-pt: - -.. code-block:: shell - - docker build -t nvflare-pt . -f Dockerfile - -This will result in a docker image, ``nvflare-pt:latest``. You can run this container with Docker, -in this example mounting a local ``my-workspace`` directory into the container for use as a persistent -workspace: - -.. code-block:: shell - - mkdir my-workspace - docker run --rm -it --gpus all \ - --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 \ - -w $(pwd -P)/my-workspace:/workspace/my-workspace \ - nvflare-pt:latest - -Once the container is running, you can also exec into the container, for example if you need another -terminal to start additional FLARE clients. First find the ``CONTAINER ID`` using ``docker ps``, and then -use that ID to exec into the container: - -.. code-block:: shell - - docker ps # use the CONTAINER ID in the output - docker exec -it /bin/bash - -This container can be used to run the FL Simulator or any FL server or client. When using the -FL Simulator (described in the next section), you can simply mount in any directories needed for -your FLARE application code, and run the Simulator within the Docker container with -all dependencies installed. - -Ways to Run NVFLARE -=================== -NVFLARE can currently support running with the FL Simulator, POC mode, or Production mode. - -FL Simulator is lightweight and uses threads to simulate different clients. -The code used for the simulator can be directly used in production mode. - -POC mode is a quick way to get set up to run locally on one machine. 
The FL server and each client -run on different processes or dockers. - -Production mode is secure with TLS certificates - depending the choice the deployment, you can further choose: - - - HA or non-HA - - Local or remote - - On-premise or on cloud - -Using non-HA, secure, local mode (all clients and server running on the same host), production mode is very similar to POC mode except it is secure. - -Which mode should I choose for running NVFLARE? - - - For a quick research run, use the FL Simulator - - For simulating real cases within the same machine, use POC or production (local, non-HA, secure) mode. POC has convenient ``nvflare poc`` commands for ease of use. - - For all other cases, use production mode. - -.. _starting_fl_simulator: - -The FL Simulator -========================= - -After installing the nvflare pip package, you have access to the NVFlare CLI including the FL Simulator. -The Simulator allows you to start a FLARE server and any number of connected clients on your local -workstation or laptop, and to quickly deploy an application for testing and debugging. - -Basic usage for the :ref:`FL Simulator ` is available with ``nvflare simulator -h``: - -.. code-block:: shell - - $ nvflare simulator -h - usage: nvflare simulator [-h] [-w WORKSPACE] [-n N_CLIENTS] [-c CLIENTS] [-t THREADS] [-gpu GPU] [-m MAX_CLIENTS] job_folder - - positional arguments: - job_folder - - optional arguments: - -h, --help show this help message and exit - -w WORKSPACE, --workspace WORKSPACE - WORKSPACE folder - -n N_CLIENTS, --n_clients N_CLIENTS - number of clients - -c CLIENTS, --clients CLIENTS - client names list - -t THREADS, --threads THREADS - number of parallel running clients - -gpu GPU, --gpu GPU list of GPU Device Ids, comma separated - -m MAX_CLIENTS, --max_clients MAX_CLIENTS - max number of clients - - -Before we get into the Simulator, we'll walk through a few additional setup steps in the next section required -to run an example application. - - -Running an example application -================================ - -Any of the :ref:`example_applications` can be used with the FL Simulator. We'll demonstrate the steps here -using the hello-pt example. - -First, we need to clone the NVFlare repo to get the source code for the examples: - -.. code-block:: shell - - $ git clone https://github.com/NVIDIA/NVFlare.git - - -Please make sure to switch to the correct branch that matches the NVFlare library version you installed. - -.. code-block:: shell - - $ git switch [nvflare version] - - -We can then copy the necessary files (the exercise code in the examples directory of the NVFlare repository) -to a working directory: - -.. code-block:: shell - - mkdir simulator-example - cp -rf NVFlare/examples/hello-world/hello-pt simulator-example/ - -The hello-pt application requires a few dependencies to be installed. As in the installation section, -we can install these in the Python virtual environment by running: - -.. code-block:: shell - - source nvflare-env/bin/activate - python3 -m pip install -r simulator-example/requirements.txt - -If using the Dockerfile above to run in a container, these dependencies have already been installed. - -Next, we can create a workspace for the Simulator to use for outputs of the application run, and launch -the simulator using ``simulator-example/hello-pt/jobs/hello-pt`` as the input job directory. In this example, we'll -run on two clients using two threads: - -.. 
code-block:: shell - - mkdir simulator-example/workspace - nvflare simulator -w simulator-example/workspace -n 2 -t 2 simulator-example/hello-pt/jobs/hello-pt - -Now you will see output streaming from the server and client processes as they execute the federated -application. Once the run completes, your workspace directory will contain the input application configuration -and codes, logs of the output, site and global models, cross-site validation results. - -.. code-block:: shell - - $ tree -L 3 simulator-example/workspace/ - simulator-example/workspace/ - ├── audit.log - ├── local - │ └── log.config - ├── simulate_job - │ ├── app_server - │ │ ├── config - │ │ ├── custom - │ │ └── FL_global_model.pt - │ ├── app_site-1 - │ │ ├── audit.log - │ │ ├── config - │ │ ├── custom - │ │ └── log.txt - │ ├── app_site-2 - │ │ ├── audit.log - │ │ ├── config - │ │ ├── custom - │ │ └── log.txt - │ ├── cross_site_val - │ │ ├── cross_val_results.json - │ │ ├── model_shareables - │ │ └── result_shareables - │ ├── log.txt - │ ├── models - │ │ └── local_model.pt - │ └── tb_events - │ ├── site-1 - │ └── site-2 - └── startup - - -Now that we've explored an example application with the FL Simulator, we can look at what it takes to bring -this type of application to a secure, distributed deployment in the :ref:`Real World Federated Learning ` -section. - - -.. _setting_up_poc: - -Setting Up the Application Environment in POC Mode -================================================== - -To get started with a proof of concept (POC) setup after :ref:`installation`, run this command to generate a poc folder -with an overseer, server, two clients, and one admin client: - -.. code-block:: shell - - $ nvflare poc prepare -n 2 - -For more details, see :ref:`poc_command`. - -.. _starting_poc: - -Starting the Application Environment in POC Mode -================================================ - -Once you are ready to start the FL system, you can run the following command -to start the server and client systems and an admin console: - -.. code-block:: - - nvflare poc start - -To start the server and client systems without an admin console: - -.. code-block:: - - nvflare poc start -ex admin@nvidia.com - -For more details, see :ref:`poc_command`. +See :ref:`installation`. diff --git a/docs/index.rst b/docs/index.rst index f4d1f140ea..16e1fb8788 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -5,16 +5,29 @@ NVIDIA FLARE .. toctree:: :maxdepth: -1 :hidden: + :caption: Introduction + fl_introduction flare_overview whats_new - key_features - getting_started + Getting Started + +.. toctree:: + :maxdepth: -1 + :hidden: + :caption: Guides + example_applications_algorithms real_world_fl user_guide - best_practices programming_guide + best_practices + +.. toctree:: + :maxdepth: -1 + :hidden: + :caption: Miscellaneous + faq publications_and_talks contributing @@ -22,7 +35,7 @@ NVIDIA FLARE glossary NVIDIA FLARE (NVIDIA Federated Learning Application Runtime Environment) is a domain-agnostic, open-source, extensible SDK that allows -researchers and data scientists to adaptexisting ML/DL workflows (PyTorch, RAPIDS, Nemo, TensorFlow) to a federated paradigm; and enables +researchers and data scientists to adapt existing ML/DL workflows (PyTorch, RAPIDS, Nemo, TensorFlow) to a federated paradigm; and enables platform developers to build a secure, privacy preserving offering for a distributed multi-party collaboration. 
NVIDIA FLARE is built on a componentized architecture that gives you the flexibility to take federated learning workloads from research @@ -35,18 +48,21 @@ and simulation to real-world production deployment. Some of the key components - **Management tools** for secure provisioning and deployment, orchestration, and management - **Specification-based API** for extensibility -Learn more in the :ref:`FLARE Overview `, :ref:`Key Features `, :ref:`What's New `, and the -:ref:`User Guide ` and :ref:`Programming Guide `. +Learn more about FLARE features in the :ref:`FLARE Overview ` and :ref:`What's New `. Getting Started =============== -For first-time users and FL researchers, FLARE provides the :ref:`fl_simulator` that allows you to build, test, and deploy applications locally. -The :ref:`Getting Started guide ` covers installation and walks through an example application using the FL Simulator. +For first-time users and FL researchers, FLARE provides the :ref:`FL Simulator ` that allows you to build, test, and deploy applications locally. +The :ref:`Getting Started ` guide covers installation and walks through an example application using the FL Simulator. +Additional examples can be found at the :ref:`Examples Applications `, which showcase different federated learning workflows and algorithms on various machine learning and deep learning tasks. +FLARE for Users +=============== +If you want to learn how to interact with the FLARE system, please refer to the :ref:`User Guide `. When you are ready to for a secure, distributed deployment, the :ref:`Real World Federated Learning ` section covers the tools and process required to deploy and operate a secure, real-world FLARE project. FLARE for Developers ==================== -When you're ready to build your own application, the :ref:`Programming Best Practices `, :ref:`FAQ`, and -:ref:`Programming Guide ` give an in depth look at the FLARE platform and APIs. +When you're ready to build your own application, the :ref:`Programming Guide `, :ref:`Programming Best Practices `, :ref:`FAQ`, and :ref:`API Reference ` +give an in depth look at the FLARE platform and APIs. diff --git a/docs/key_features.rst b/docs/key_features.rst deleted file mode 100644 index 1e4284ac1f..0000000000 --- a/docs/key_features.rst +++ /dev/null @@ -1,75 +0,0 @@ -.. _key_features: - -############ -Key Features -############ - -See :ref:`whats_new` for the new key features added for each release. - -Key Features of the FLARE Platform -================================== -NVIDIA FLARE provides a set of commonly-used algorithms to illustrate best practices and allow simplified development of -common Federated Learning Workflows. - -Training Workflows ------------------- - - :ref:`Scatter and Gather (SAG) ` is a reference implementation of the default - workflow in previous versions of NVIDIA FLARE. SAG implements a hub and spoke model in which the central server - Controller broadcasts Tasks to be Executed on the client Workers. After the client Executors return their Task's - Shareable result (e.g., client model weights from DL training), the server Controller aggregates the results, for - example with a federated weighted average. - - :ref:`Cyclic ` is a reference implementation of a cyclic workflow, in which the central server issues a - series of tasks to be scheduled for cyclic execution among a group of clients. 
The client worker Executor passes - the Task's Shareable result to the next client for further execution, and so on, until the final client returns - the final Shareable to the server. - -Evaluation Workflows --------------------- - - :ref:`Cross site model validation ` is a workflow that allows validation of each - client model and the server global model against each client dataset. - - Data is not shared, rather the collection of models is distributed to each client site to run local validation. - - The results of local validation are collected by the server to construct an all-to-all matrix of - model performance vs. client dataset. - - - :ref:`Global model evaluation ` is a subset of cross-site model validation in which - the server's global model is distributed to each client for evaluation on the client's local dataset. - -Privacy Preservation Algorithms -------------------------------- -Privacy preserving algorithms in NVIDIA FLARE are implemented as :ref:`filters ` -that can be applied as data is sent or received between peers. - - - Differential privacy: - - - Exclude specific variables (:class:`ExcludeVars`) - - truncate weights by percentile (:class:`PercentilePrivacy`) - - apply sparse vector techniques (:class:`SVTPrivacy`). - - - Homomorphic encryption: NVIDIA FLARE provides homomorphic encryption and decryption - filters that can be used by clients to encrypt Shareable data before sending it to a peer. - - The server does not have a decryption key but using HE can operate on the encrypted data to aggregate - and return the encrypted aggregated data to clients. - - Clients can then decrypt the data with their local key and continue local training. - -Learning Algorithms -------------------- - - - Fed average (implemented through the :ref:`scatter_and_gather_workflow`) - In the federated averaging workflow, - a set of initial weights is distributed to client Workers who perform local training. After local training, - clients return their local weights as a Shareables that are aggregated (averaged). This new set of global average - weights is redistributed to clients and the process repeats for the specified number of rounds. - - `FedProx `_ (example configuration can be found in cifar10_fedprox of :github_nvflare_link:`CIFAR-10 example `) - - implements a :class:`Loss function ` to penalize a client's local weights based on deviation from the global model. - - `FedOpt `_ (example configuration can be found in cifar10_fedopt of :github_nvflare_link:`CIFAR-10 example `) - - implements a :class:`ShareableGenerator ` that - can use a specified Optimizer and Learning Rate Scheduler when updating the global model. - -Example Applications ---------------------- - -NVIDIA FLARE provide a rich set of :ref:`example applications ` to walk your through the whole -process. diff --git a/docs/programming_guide.rst b/docs/programming_guide.rst index ecd029649b..1053f84270 100644 --- a/docs/programming_guide.rst +++ b/docs/programming_guide.rst @@ -35,8 +35,10 @@ Please refer to :ref:`application` for more details. .. 
toctree:: :maxdepth: 1 + programming_guide/fed_job_api programming_guide/workflows_and_controllers - programming_guide/executor + programming_guide/execution_api_type + programming_guide/fl_model programming_guide/shareable programming_guide/data_exchange_object programming_guide/fl_context diff --git a/docs/programming_guide/component_configuration.rst b/docs/programming_guide/component_configuration.rst index a257316eb2..0821b59380 100644 --- a/docs/programming_guide/component_configuration.rst +++ b/docs/programming_guide/component_configuration.rst @@ -23,7 +23,7 @@ Once the component is loaded, you can find it by ``component_id``, which is spec Component configuration and lookup ================================== To understand component configuration, we can look at the job configuration and see how the components are defined and -used. Below is the server side configuration for :ref:`hello_pt`. +used. Below is the server side configuration for :ref:`hello_pt_job_api`. .. code-block:: json @@ -139,7 +139,7 @@ For example: { "id": "shareable_generator", - "name": "PTFedOptModelShareableGenerator", + "path": "nvflare.app_opt.pt.fedopt.PTFedOptModelShareableGenerator", "args": { "device": "cpu", "source_model": "model", diff --git a/docs/programming_guide/controllers/client_controlled_workflows.rst b/docs/programming_guide/controllers/client_controlled_workflows.rst index 823abfe997..1ec932536d 100644 --- a/docs/programming_guide/controllers/client_controlled_workflows.rst +++ b/docs/programming_guide/controllers/client_controlled_workflows.rst @@ -70,6 +70,7 @@ All client controlled workflows must have a server side controller that extends result_clients_policy: str = DefaultValuePolicy.ALL, max_status_report_interval: float = Constant.PER_CLIENT_STATUS_REPORT_TIMEOUT, progress_timeout: float = Constant.WORKFLOW_PROGRESS_TIMEOUT, + private_p2p: bool = True, ): Init args for ServerSideController @@ -192,9 +193,11 @@ purpose of status reporting is to let the server know that the job is still prog When the end-of-workflow message is received from the server, it stops the execution of the current learning task, if any. .. _ccwf_cyclic_learning: + *************** Cyclic Learning *************** + With Cyclic Learning, the learning process is done in several rounds. In each round, participating clients do training in turns, following a predetermined sequential order. Each client trains from the result received from the previous client in the sequence. @@ -233,10 +236,11 @@ Cyclic Learning: Server Side Controller starting_client: str = "", max_status_report_interval: float = Constant.PER_CLIENT_STATUS_REPORT_TIMEOUT, progress_timeout: float = Constant.WORKFLOW_PROGRESS_TIMEOUT, - rr_order: str = RROrder.FIXED, + private_p2p: bool = True, + cyclic_order: str = CyclicOrder.FIXED, ): -The only extra init arg is ``rr_order``, which specifies how the round-robin sequence is to be computed for each round: fixed order or random order. +The only extra init arg is ``cyclic_order``, which specifies how the cyclic sequence is to be computed for each round: fixed order or random order. Of all the init args, only the ``num_rounds`` must be explicitly specified. All others can take default values: @@ -363,6 +367,7 @@ Cyclic Learning: config_fed_client.json The ``cyclic_learn`` and ``cyclic_rcv_final_learn_result`` contain model data. You can apply ``task_data_filters`` if privacy is a concern (the OUT filter for the sending client, and IN filters for the receiving client). .. 
_ccwf_swarm_learning: + ************** Swarm Learning ************** @@ -606,6 +611,7 @@ Swarm Learning: config_fed_client.json Client assigned tasks contain model data. You can apply task_data_filters if privacy is a concern (the OUT filter for the sending client, and IN filters for the receiving client). .. _ccwf_cross_site_evaluation: + ********************* Cross Site Evaluation ********************* diff --git a/docs/programming_guide/controllers/controllers.rst b/docs/programming_guide/controllers/controllers.rst index 32aa55a77a..cf4a8f4368 100644 --- a/docs/programming_guide/controllers/controllers.rst +++ b/docs/programming_guide/controllers/controllers.rst @@ -11,7 +11,7 @@ Controller/Worker Interactions NVFlare's collaborative computing is achieved through the Controller/Worker interactions. The following diagram shows how the Controller and Worker interact. -.. image:: ../resources/Controller.png +.. image:: ../../resources/Controller.png :height: 300px The Controller is a python object that controls or coordinates the Workers to get a job done. The controller is run on @@ -73,7 +73,9 @@ The Controller's Task Manager manages the task's lifecycle: .. note:: - In NVIDIA FLARE 2.0, the underlying communication is by gRPC: the client always initiates communication by sending - a request to the server and a receiving response. When we say "server sends task to the client", it is only - conceptual. With gRPC, the client sends the "ask for next task" request to the server, and the server responds with - the task data. + In NVIDIA FLARE, the underlying communication is facilitated through gRPC: + the client always initiates communication by sending a request to the server and receiving a response. + When referring to the scenario where the "server sends a task to the client," + it is important to note that this is a conceptual representation. + In reality, with gRPC, the client initiates the interaction by sending a "request for the next task" to the server, + and the server responds by providing the task data. diff --git a/docs/programming_guide/controllers/cross_site_model_evaluation.rst b/docs/programming_guide/controllers/cross_site_model_evaluation.rst index 456e8fc138..75936806d5 100644 --- a/docs/programming_guide/controllers/cross_site_model_evaluation.rst +++ b/docs/programming_guide/controllers/cross_site_model_evaluation.rst @@ -23,7 +23,7 @@ example that implements the :class:`cross site model evaluation workflow` to write the results to a JSON file on the server. -Example with Cross Site Model Evaluation / Federated Evaluation Workflow -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -See the :github_nvflare_link:`Hello Numpy Cross-Site Validation ` for an example application with -the cross site model evaluation / federated evaluation workflow. +Examples with Cross Site Model Evaluation / Federated Evaluation Workflow +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +See :github_nvflare_link:`Hello Numpy Cross-Site Validation ` and +:github_nvflare_link:`Step-by-step Cross-site Evaluation ` for examples using server-controlled cross-site evaluation workflows. 
diff --git a/docs/programming_guide/controllers/initialize_global_weights.rst b/docs/programming_guide/controllers/initialize_global_weights.rst index aed6b035d0..6634d3133e 100644 --- a/docs/programming_guide/controllers/initialize_global_weights.rst +++ b/docs/programming_guide/controllers/initialize_global_weights.rst @@ -26,7 +26,7 @@ Two changes are needed: The updated file should look like the following: -.. literalinclude:: ../resources/init_weights_1_config_fed_server.json +.. literalinclude:: ../../resources/init_weights_1_config_fed_server.json :language: json diff --git a/docs/programming_guide/controllers/model_controller.rst b/docs/programming_guide/controllers/model_controller.rst new file mode 100644 index 0000000000..8eb9693834 --- /dev/null +++ b/docs/programming_guide/controllers/model_controller.rst @@ -0,0 +1,270 @@ +.. _model_controller: + +################### +ModelController API +################### + +The FLARE :mod:`ModelController` API provides an easy way for users to write and customize FLModel-based controller workflows. + +* Highly flexible with a simple API (run routine and basic communication and utility functions) +* :ref:`fl_model` for the communication data structure, everything else is pure Python +* Option to support pre-existing components and FLARE-specific functionalities + +.. note:: + + The ModelController API is a high-level API meant to simplify writing workflows. + If users prefer or need the full flexibility of the Controller with all the capabilities of FLARE functions, refer to the :ref:`controllers`. + + +Core Concepts +============= + +As an example, we can take a look at the popular federated learning workflow, "FedAvg" which has the following steps: + +#. FL server initializes an initial model +#. For each round (global iteration): + + #. FL server sends the global model to clients + #. Each FL client starts with this global model and trains on their own data + #. Each FL client sends back their trained model + #. FL server aggregates all the models and produces a new global model + + +To implement this workflow using the ModelController there are a few essential parts: + +* Import and subclass the :class:`nvflare.app_common.workflows.model_controller.ModelController`. +* Implement the ``run()`` routine for the workflow logic. +* Utilize ``send_model()`` / ``send_model_and_wait()`` for communication to send tasks with FLModel to target clients, and receive FLModel results. +* Customize workflow using predefined utility functions and components, or implement your own logics. + + +Here is an example of the FedAvg workflow using the :class:`BaseFedAvg` base class: + +.. 
code-block:: python + + # BaseFedAvg subclasses ModelController and defines common functions and variables such as aggregate(), update_model(), self.start_round, self.num_rounds + class FedAvg(BaseFedAvg): + + # run routine that user must implement + def run(self) -> None: + self.info("Start FedAvg.") + + # load model (by default uses persistor, can provide custom method) + model = self.load_model() + model.start_round = self.start_round + model.total_rounds = self.num_rounds + + # for each round (global iteration) + for self.current_round in range(self.start_round, self.start_round + self.num_rounds): + self.info(f"Round {self.current_round} started.") + model.current_round = self.current_round + + # obtain self.num_clients clients + clients = self.sample_clients(self.num_clients) + + # send model to target clients with default train task, wait to receive results + results = self.send_model_and_wait(targets=clients, data=model) + + # use BaseFedAvg aggregate function + aggregate_results = self.aggregate( + results, aggregate_fn=self.aggregate_fn + ) # using default aggregate_fn with `WeightedAggregationHelper`. Can overwrite self.aggregate_fn with signature Callable[List[FLModel], FLModel] + + # update global model with aggregation results + model = self.update_model(model, aggregate_results) + + # save model (by default uses persistor, can provide custom method) + self.save_model(model) + + self.info("Finished FedAvg.") + + +Below is a comprehensive table overview of the :class:`ModelController` API: + + +.. list-table:: ModelController API + :widths: 25 35 50 + :header-rows: 1 + + * - API + - Description + - API Doc Link + * - run + - Run routine for workflow. + - :func:`run` + * - send_model_and_wait + - Send a task with data to targets (blocking) and wait for results.. + - :func:`send_model_and_wait` + * - send_model + - Send a task with data to targets (non-blocking) with callback. + - :func:`send_model` + * - sample_clients + - Returns a list of num_clients clients. + - :func:`sample_clients` + * - save_model + - Save model with persistor. + - :func:`save_model` + * - load_model + - Load model from persistor. + - :func:`load_model` + + +Communication +============= + +The ModelController uses a task based communication where tasks are sent to targets, and targets execute the tasks and return results. +The :ref:`fl_model` is standardized data structure object that is sent along with each task, and :ref:`fl_model` responses are received for the results. + +.. note:: + + The :ref:`fl_model` object can be any type of data depending on the specific task. + For example, in the "train" and "validate" tasks we send the model parameters along with the task so the target clients can train and validate the model. + However in many other tasks that do not involve sending the model (e.g. "submit_model"), the :ref:`fl_model` can contain any type of data (e.g. metadata, metrics etc.) or may not be needed at all. + + +send_model_and_wait +------------------- +:func:`send_model_and_wait` is the core communication function which enables users to send tasks to targets, and wait for responses. + +The ``data`` is an :ref:`fl_model` object, and the ``task_name`` is the task for the target executors to execute (Client API executors by default support "train", "validate", and "submit_model", however executors can be written for any arbitrary task name). + +``targets`` can be chosen from client names obtained with ``sample_clients()``. 
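+
+As a rough sketch (separate from the BaseFedAvg example above), a call inside a ``run()``
+routine might look like the following; the task name, client count, and timeout value here
+are illustrative only, not required settings:
+
+.. code-block:: python
+
+    # sample participating clients for this round
+    clients = self.sample_clients(3)
+
+    # `model` is the current FLModel (e.g. obtained from self.load_model());
+    # send it as a "train" task and block until results arrive or the timeout expires
+    results = self.send_model_and_wait(
+        task_name="train",
+        data=model,
+        targets=clients,
+        timeout=600,
+    )
+
+    # each entry in `results` is an FLModel response from one client
+    for result in results:
+        self.info(f"received metrics: {result.metrics}")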
+ +Returns the :ref:`fl_model` responses from the target clients once the task is completed (``min_responses`` have been received, or ``timeout`` time has passed). + +send_model +---------- +:func:`send_model` is the non-blocking version of +:func:`send_model_and_wait` with a user-defined callback when receiving responses. + +A callback with the signature ``Callable[[FLModel], None]`` can be passed in, which will be called when a response is received from each target. + +The task is standing until either ``min_responses`` have been received, or ``timeout`` time has passed. +Since this call is asynchronous, the Controller :func:`get_num_standing_tasks` method can be used to get the number of standing tasks for synchronization purposes. + +For example, in the :github_nvflare_link:`CrossSiteEval ` workflow, the tasks are asynchronously sent with :func:`send_model` to get each client's model. +Then through a callback, the clients' models are sent to the other clients for validation. +Finally, the workflow waits for all standing tasks to complete with :func:`get_num_standing_tasks`. +Below is an example of how these functions can be used. For more details view the implementation of :github_nvflare_link:`CrossSiteEval `. + + +.. code-block:: python + + class CrossSiteEval(ModelController): + ... + def run(self) -> None: + ... + # Create submit_model task and broadcast to all participating clients + self.send_model( + task_name=AppConstants.TASK_SUBMIT_MODEL, + data=data, + targets=self._participating_clients, + timeout=self._submit_model_timeout, + callback=self._receive_local_model_cb, + ) + ... + # Wait for all standing tasks to complete, since we used non-blocking `send_model()` + while self.get_num_standing_tasks(): + if self.abort_signal.triggered: + self.info("Abort signal triggered. Finishing cross site validation.") + return + self.debug("Checking standing tasks to see if cross site validation finished.") + time.sleep(self._task_check_period) + + self.save_results() + self.info("Stop Cross-Site Evaluation.") + + def _receive_local_model_cb(self, model: FLModel): + # Send this model to all clients to validate + model.meta[AppConstants.MODEL_OWNER] = model_name + self.send_model( + task_name=AppConstants.TASK_VALIDATION, + data=model, + targets=self._participating_clients, + timeout=self._validation_timeout, + callback=self._receive_val_result_cb, + ) + ... + + +Saving & Loading +================ + +persistor +--------- +The :func:`save_model` and :func:`load_model` +functions utilize the configured :class:`ModelPersistor` set in the ModelController ``persistor_id: str = "persistor"`` init argument. + +custom save & load +------------------ +Users can also choose to instead create their own custom save and load functions rather than use a persistor. + +For example we can use PyTorch's save and load functions for the model parameters, and save the FLModel metadata with :mod:`FOBS` separately to different filepaths. + +.. code-block:: python + + import torch + from nvflare.fuel.utils import fobs + + class MyController(ModelController): + ... 
+ def save_model(self, model, filepath=""): + params = model.params + # PyTorch save + torch.save(params, filepath) + + # save FLModel metadata + model.params = {} + fobs.dumpf(model, filepath + ".metadata") + model.params = params + + def load_model(self, filepath=""): + # PyTorch load + params = torch.load(filepath) + + # load FLModel metadata + model = fobs.loadf(filepath + ".metadata") + model.params = params + return model + + +Note: for non-primitive data types such as ``torch.nn.Module`` (used for the initial PyTorch model), +we must configure a corresponding FOBS decomposer for serialization and deserialization. +Read more at :ref:`serialization`. + +.. code-block:: python + + from nvflare.app_opt.pt.decomposers import TensorDecomposer + + fobs.register(TensorDecomposer) + + +Additional Functionalities +========================== + +In some cases, more advanced FLARE-specific functionalities may be of use. + +The :mod:`BaseModelController` class provides access to the engine ``self.engine`` and FLContext ``self.fl_ctx`` if needed. +Functions such as ``get_component()`` and ``build_component()`` can be used to load or dynamically build components. + +Furthermore, the underlying :mod:`Controller` class offers additional communication functions and task related utilities. +Many of our pre-existing workflows are based on this lower-level Controller API. +For more details refer to the :ref:`controllers` section. + +Examples +======== + +Examples of basic workflows using the ModelController API: + +* :github_nvflare_link:`Cyclic ` +* :github_nvflare_link:`BaseFedAvg ` +* :github_nvflare_link:`FedAvg ` + +Advanced examples: + +* :github_nvflare_link:`Scaffold ` +* :github_nvflare_link:`FedOpt ` +* :github_nvflare_link:`PTFedAvgEarlyStopping ` +* :github_nvflare_link:`Kaplan-Meier ` +* :github_nvflare_link:`Logistic Regression Newton Raphson ` +* :github_nvflare_link:`FedBPT ` diff --git a/docs/programming_guide/controllers/scatter_and_gather_workflow.rst b/docs/programming_guide/controllers/scatter_and_gather_workflow.rst index ad5b1d9507..44c9d232a8 100644 --- a/docs/programming_guide/controllers/scatter_and_gather_workflow.rst +++ b/docs/programming_guide/controllers/scatter_and_gather_workflow.rst @@ -7,7 +7,7 @@ of NVIDIA FLARE with a Server aggregating results from Clients that have produce At the core, the control_flow of :class:`nvflare.app_common.workflows.scatter_and_gather.ScatterAndGather` is a for loop: -.. image:: ../resources/fed_sag_round.png +.. image:: ../../resources/fed_sag_round.png :height: 400px Trainer diff --git a/docs/programming_guide/execution_api_type.rst b/docs/programming_guide/execution_api_type.rst new file mode 100644 index 0000000000..0f04d6d894 --- /dev/null +++ b/docs/programming_guide/execution_api_type.rst @@ -0,0 +1,99 @@ +.. _execution_api_type: + +####################### +From Local to Federated +####################### + +In the FLARE system, a federated learning algorithm is defined in a Job format +(for details, please refer to :ref:`job`). + +A Job consists of multiple "workflows" and "executors." + +The simplified job execution flow is as follows: + +- The workflow schedules a task for the FL clients. +- Each FL client performs the received task and sends the result back. +- The workflow receives the results and determines if it is done. +- If it is not done, it schedules a new task +- If it is done, it proceeds to the next workflow in the Job. 
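+
+As a rough illustration of how these pieces fit together, FLARE's FedJob API can be used to
+assemble a job with one server-side workflow and one executor-backed training script per client.
+This is only a minimal sketch; the workflow, script name, and argument values below are
+illustrative choices, not requirements:
+
+.. code-block:: python
+
+    from nvflare.app_common.workflows.fedavg import FedAvg
+    from nvflare.job_config.api import FedJob
+    from nvflare.job_config.script_runner import ScriptRunner
+
+    job = FedJob(name="hello-fedavg")
+
+    # one workflow on the server
+    job.to(FedAvg(num_clients=2, num_rounds=3), "server")
+
+    # one executor (wrapping a local training script) on each client
+    for i in range(2):
+        job.to(ScriptRunner(script="train.py"), f"site-{i + 1}")
+
+    # export the job configuration, or run it directly in the FL Simulator
+    job.export_job("/tmp/nvflare/jobs")
+    job.simulator_run("/tmp/nvflare/workspace")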
+ +Users need to adapt their local training or computing logic into FLARE's task +execution abstractions to make their training or computing federated. + +We offer various levels of abstraction for writing task execution code, +catering to use cases that span from complete customizability to easy user adaptation. + +Execution API Type +================== + +Below is a general overview of the key ideas and use cases for each type: + +Client API +---------- + +The :ref:`client_api` provides the most straightforward way to write FL code, +and can easily be used to convert centralized code with minimal code changes. +The Client API uses the :class:`FLModel` +object for data transfer and supports common tasks such as train, validate, and submit_model. +Option for using PyTorch Lightning is also available. +For Client API executors, the in-process and external-process executors are provided for different use cases. + +We recommend users start with the Client API, and to consider the other types +for more specific cases as required. + +ModelLearner +------------ + +The :ref:`model_learner` is designed to simplify writing learning logic by +minimizing FLARE-specific concepts. +The :class:`ModelLearner` +defines familiar learning functions for training and validation, +and uses the :class:`FLModel` +object for transferring learning information. +The ModelLearner also contains several convenient capabilities, +such as lifecycle and logging information. + +The ModelLearner is best used when working with standard machine learning code +that can fit well into the train and validate methods and can be easily adapted +to the ModelLearner subclass and method structure. + +Executor +-------- + +:ref:`executor` is the most flexible for defining custom logic and tasks, +as with a custom executor and controller, any form of computation can be performed. +However, Executors must deal directly with FLARE-specific communication concepts +such as :class:`Shareable`, :class:`DXO`, +and :class:`FLContext`. +As a result, many higher-level APIs are built on top of Executors in order to +abstract these concepts away for easier user adaptation. + +Overall, writing an Executor is most useful when implementing tasks and logic +that do not fit within the structure of higher-level APIs or other predefined Executors. + +3rd-Party System Integration +---------------------------- + +There are cases where users have a pre-existing ML/DL training system +infrastructure that cannot be easily adapted to the FLARE client. + +The :ref:`3rd_party_integration` pattern allows for a seamless integration +between the FLARE system and a third-party external training system. + +With the use of the :mod:`FlareAgent ` and +:mod:`TaskExchanger `, +users can easily enable any 3rd-party system to receive tasks and submit results back to the server. + +Please use the following chart to decide which abstraction to use: + +.. image:: ../resources/task_execution_decision_chart.png + +For more details about each type, refer to each page below. + +.. toctree:: + :maxdepth: 1 + + execution_api_type/3rd_party_integration + execution_api_type/client_api + execution_api_type/model_learner + execution_api_type/executor diff --git a/docs/programming_guide/execution_api_type/3rd_party_integration.rst b/docs/programming_guide/execution_api_type/3rd_party_integration.rst new file mode 100644 index 0000000000..093490897f --- /dev/null +++ b/docs/programming_guide/execution_api_type/3rd_party_integration.rst @@ -0,0 +1,350 @@ +.. 
_3rd_party_integration: + +############################ +3rd-Party System Integration +############################ + +NVFLARE supports a seamless integration between the FLARE system and a +third-party external training system. +This is especially useful with pre-existing ML/DL training system +infrastructure that cannot be easily adapted to the FLARE client. + +The FL Client uses the :class:`TaskExchanger` +executor to receive tasks, and submit results to the FLARE server. +The 3rd-party system uses the :class:`FlareAgent` to +interact with the TaskExchanger to get tasks and submit results. + +This integration pattern is illustrated in the diagram below: + +.. image:: ../../resources/3rd_party_integration_diagram.png + :height: 400px + +Requirements +============ + +- The key to enabling this integration is the "agent_id" that must be made known to both systems. + The FL client gets this information from the job's config_fed_client, and the + 3rd-party trainer gets this from its own launch process. +- It is assumed that the customer already has a way to dynamically generate the + "agent_id" for each job, and start its trainer process with this information. +- Each FL client must be able to open an address (host:port) to allow the trainer to connect to. + Depending on where the trainer is running, the connection may or may not need to be in secure mode (TLS). +- We will need to modify the "project.yml" for NVFlare provision system + and generate new package folders for each participating sites +- The trainer must be a Python program that can integrate with the NVFLARE library. +- The trainer must be able to connect to the server, as well as the address that + is dynamically opened by the FL client. + +Prepare the Trainer +=================== + +Let's prepare the trainer code first, we will modify the "project.yml" in the +next section for project setup. + +You need to modify your trainer code to integrate with the :class:`FlareAgent` API. +This API provides simple ``get_task()`` and ``submit_result()`` methods to interact with the FL client. + +We will go through the steps one by one: + +1. Create Agent +--------------- + +The :class:`FlareAgent` is responsible +for interacting with the FL client to exchange task data. + +If using FLModel, :class:`FlareAgentWithFLModel` +subclasses FlareAgent and provides conversion from shareables to task using the FLModel data structure. + +If using CellPipe, a convenient class :class:`FlareAgentWithCellPipe` +can be used. + +Please refer to their API page for detailed explanations of each argument: + + - :class:`FlareAgent` + - :class:`FlareAgentWithFLModel` + - :class:`FlareAgentWithCellPipe` + +You can create the FlareAgentWithCellPipe as the following code: + +.. code-block:: python + + from nvflare.client.flare_agent import FlareAgentWithCellPipe + + agent = FlareAgentWithCellPipe( + root_url="grpc://server:8002", + site_name=args.site_name, + agent_id=args.agent_id, + workspace_dir=args.workspace, + secure_mode=True, + submit_result_timeout=2.0, + heartbeat_timeout=120.0, + ) + +2. Start Agent +-------------- + +After we create the agent, we need to start it. +We can call ``agent.start()`` to start the agent. +This call must be made before trying to get tasks. + +For example: + +.. code-block:: python + + agent.start() + +3. Process Tasks +---------------- + +The training is a continuous process of getting a task, executing the task, +and submitting the task result. + +Call ``agent.get_task()`` to get a Task object from the FL client. 
+This is a blocking call and returns only when a task is available. +If there are no more tasks available (i.e. end of the job), ``AgentClosed`` +exception will be raised, and signaling to end the training. + +The :class:`Task` object contains 3 pieces of +information: task_name, task_id, and data. +The task_name tells you what the task is (e.g. train). +The task_id is a UUID of the task instance. +The data contains model data to be trained on. + +Once the task is completed, the result can be submitted to the FL client by calling ``agent.submit_result()``. +A return code (``rc``) must be provided to indicate whether the task was executed successfully. +If the ``rc`` is not RC.OK, then the job will be aborted. + +For example: + +.. code-block:: python + + while True: + print("getting task ...") + try: + task = agent.get_task() + except AgentClosed: + print("agent closed - exit") + break + + print(f"got task: {task}") + rc, meta, result = train(task.data) # perform train task + submitted = agent.submit_result(TaskResult(data=result, meta=meta, return_code=rc)) + print(f"result submitted: {submitted}") + +4. Stop Agent +------------- + +At the end of the training, ``agent.stop()`` must be called to end the program gracefully. +If this call is missed, the program may not exit properly. + +.. code-block:: python + + agent.stop() + + +5. Putting Together +------------------- + +Now we learn all the necessary steps, we can put together into the following +example code of this usage pattern: + +.. literalinclude:: ../../resources/3rd_party_trainer.py + :language: python + + +Notes: + +- This pattern of (``start``, ``get_task``, ``submit_result``, and ``stop``) is strictly enforced. + If the pattern is not followed (e.g. ``get_task``, then ``get_task`` again without ``submit_result``), + you will get a ``CallStateError`` exception. +- The only way to know that the job is ended is the ``AgentClosed`` exception from the ``get_task`` call. + This exception is raised when the FL client tells the agent that the job is done; + or when the FL client is considered dead (missing heartbeats for the configured period of time). +- If your training algorithm runs into an unrecoverable error and wants to end the job, + you should use a proper return code (e.g. ``RC.EXECUTION_EXCEPTION``). + +Project Setup +============= + +After we prepare the trainer code we can follow the steps below to properly +set up the project and jobs. + +Step One - Provision +-------------------- + +The FL client site will behave like both client and server for connecting from the perspective of the trainer. +This requires the client site to have two sets of TLS credentials. +Make sure to specify the "listening_host" for the client in the project.yml when provisioning the project. + +.. note:: + We assume you understand NVFlare provision, if not please read :ref:`provisioning`. + +An example looks like: + +.. code-block:: yaml + + participants: + # change example.com to the FQDN of the server + - name: server + type: server + org: nvidia + fed_learn_port: 8002 + admin_port: 8003 + - name: site_1 + type: client + org: nvidia + listening_host: localhost + - name: site_2 + type: client + org: nvidia + listening_host: localhost + +Once the project is provisioned, check the "startup" kit generated for the clients. 
+You should see the following files, among others:
+
+client.crt, client.key, server.crt, server.key, rootCA.pem
+
+Note that the specified listening_host of a site must be a hostname that
+the external trainer can reach over the network.
+
+Step Two - Prepare Job Configuration
+------------------------------------
+
+For each job, configure the config_fed_client.json to use
+:class:`TaskExchanger` as the executor.
+
+.. code-block:: json
+
+    {
+      "format_version": 2,
+      "executors": [
+        {
+          "tasks": [
+            "train"
+          ],
+          "executor": {
+            "path": "nvflare.app_common.executors.task_exchanger.TaskExchanger",
+            "args": {
+              "pipe_id": "pipe",
+              "peer_read_timeout": 30,
+              "heartbeat_timeout": 60
+            }
+          }
+        }
+      ],
+      "task_result_filters": [],
+      "task_data_filters": [],
+      "components": [
+        {
+          "id": "pipe",
+          "path": "nvflare.fuel.utils.pipe.cell_pipe.CellPipe",
+          "args": {
+            "mode": "PASSIVE",
+            "site_name": "{SITE_NAME}",
+            "token": "{SITE_NAME}",
+            "root_url": "{ROOT_URL}",
+            "secure_mode": "{SECURE_MODE}",
+            "workspace_dir": "{WORKSPACE}"
+          }
+        }
+      ]
+    }
+
+Make sure the parameters of the :class:`TaskExchanger`
+are configured properly, and change the default values as needed.
+
+Please refer to the API page for a detailed explanation of each argument:
+:class:`TaskExchanger`
+
+Step Three - Trainer Setup
+--------------------------
+
+For each client site, you will have an FL client and a trainer process.
+
+To make our integration work, please follow these steps to
+set up the trainer process on each client site:
+
+ - Make sure the trainer process has access to a local file system.
+ - Create a "workspace" folder that is going to be used by this trainer process.
+   This workspace will be used for all jobs.
+ - Copy the "startup" folder of the client site to this "workspace" folder.
+   If needed, any additional config files required by the trainer can also
+   be placed in this "workspace" folder.
+ - Create the trainer script following the steps in the above section.
+   Please set the FlareAgentWithCellPipe's "workspace_dir" to the path of
+   this "workspace" folder that you just created.
+   Please make sure the "agent_id" value of FlareAgentWithCellPipe is the same
+   as the "token" value in the job configuration above.
+
+Verification
+============
+
+The FL client (TaskExchanger) and your trainer process (FlareAgentWithCellPipe)
+do not have to be started at exactly the same time.
+
+Whichever is started first will wait for the other for ``heartbeat_timeout`` seconds.
+Once they both are started and connected, you can verify they are directly
+connected using the Admin console's ``cells`` command.
+
+The following example shows two clients (site-1, site-2) connected to their
+external trainers via the agent_id/token "ext_trainer":
+
+.. code-block:: shell
+
+    > cells
+    server
+    server.10d1d3b7-fb50-4c83-9575-e510f32c5d21
+    site-1
+    site-1.10d1d3b7-fb50-4c83-9575-e510f32c5d21
+    site-2
+    site-2.10d1d3b7-fb50-4c83-9575-e510f32c5d21
+    site-1_ext_trainer_active
+    site-2_ext_trainer_active
+    site-2_ext_trainer_passive
+    site-1_ext_trainer_passive
+    Total Cells: 10
+
+
+The ``cells`` command lists all cells.
+
+Notice that the job ``10d1d3b7-fb50-4c83-9575-e510f32c5d21`` is running on both
+"site-1" and "site-2" clients.
+
+Also notice that there are two pairs of corresponding cells
+(site-1_ext_trainer_active, site-1_ext_trainer_passive)
+and (site-2_ext_trainer_active, site-2_ext_trainer_passive).
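+
+As a rough recap of the trainer-side setup described above, preparing and launching the
+trainer on a client site could look like the following; the paths, site name, agent_id
+value, and trainer script name are examples only, and your actual launch process will differ:
+
+.. code-block:: shell
+
+    # create a workspace for the trainer and copy the site's startup kit into it
+    mkdir -p /opt/trainer/workspace
+    cp -r /path/to/provisioned/site-1/startup /opt/trainer/workspace/
+
+    # launch the trainer with the same agent_id/token used by the FL client
+    python my_trainer.py \
+        --workspace /opt/trainer/workspace \
+        --site_name site-1 \
+        --agent_id ext_trainer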
+ + +Optional - Setup for Adhoc Direct Connection between FL Client and Trainer +========================================================================== + +FL client and the trainer can always talk to each other via the server, +but it could be slow, especially if the server is located far away. +The enable adhoc direct connections between the FL client and the trainer, +configure the comm_config.json on the client site as follows: + +.. code-block:: json + + { + "allow_adhoc_conns": true, + "use_aio_grpc": true, + "adhoc": { + "scheme": "tcp", + "resources": { + "host": "localhost", + "secure": true + } + } + } + +This file must be placed into the site's "local" folder within its workspace. + +Pay attention to the following: + +- For most cases, the "scheme" should be set to "tcp" to get the best performance. + If "tcp" cannot be used, you can use "grpc". +- In "resources": + + - If FL client and the trainer are within the same trusted network, + you can set "secure" to false; otherwise set it to true. + - The value of the "host" must match the "listening_host" value of the site used in provision. diff --git a/docs/programming_guide/execution_api_type/client_api.rst b/docs/programming_guide/execution_api_type/client_api.rst new file mode 100644 index 0000000000..fb002f8934 --- /dev/null +++ b/docs/programming_guide/execution_api_type/client_api.rst @@ -0,0 +1,284 @@ +.. _client_api: + +########## +Client API +########## + +The FLARE Client API provides an easy way for users to convert their centralized, +local training code into federated learning code with the following benefits: + +* Only requires a few lines of code changes, without the need to restructure the code or implement a new class +* Reduces the number of new FLARE specific concepts exposed to users +* Easy adaptation from existing local training code using different frameworks + (PyTorch, PyTorch Lightning, HuggingFace) + +Core concept +============ + +The general structure of the popular federated learning (FL) workflow, "FedAvg" is as follows: + +#. FL server initializes an initial model +#. For each round (global iteration): + + #. FL server sends the global model to clients + #. Each FL client starts with this global model and trains on their own data + #. Each FL client sends back their trained model + #. FL server aggregates all the models and produces a new global model + +On the client side, the training workflow is as follows: + +#. Receive the model from the FL server +#. Perform local training on the received global model and/or evaluate the + received global model for model selection +#. Send the new model back to the FL server + +To convert a centralized training code to federated learning, we need to +adapt the code to do the following steps: + +#. Obtain the required information from received :ref:`fl_model` +#. Run local training +#. Put the results in a new :ref:`fl_model` to be sent back + +For a general use case, there are three essential methods for the Client API: + +* ``init()``: Initializes NVFlare Client API environment. +* ``receive()``: Receives model from NVFlare side. +* ``send()``: Sends the model to NVFlare side. + +Users can use the Client API to change their centralized training code to +federated learning, for example: + +.. code-block:: python + + import nvflare.client as flare + + flare.init() # 1. Initializes NVFlare Client API environment. + input_model = flare.receive() # 2. Receives model from NVFlare side. + params = input_model.params # 3. 
Obtain the required information from received FLModel + + # original local training code begins + new_params = local_train(params) + # original local training code ends + + output_model = flare.FLModel(params=new_params) # 4. Put the results in a new FLModel + flare.send(output_model) # 5. Sends the model to NVFlare side. + +With 5 lines of code changes, we convert the centralized training code to +federated learning setting. + +After this, we can utilize the job templates and the :ref:`job_cli` +to generate a job so it can be run using :ref:`fl_simulator` +or submit to a deployed NVFlare system. + +Below is a table overview of key Client APIs. + +.. list-table:: Client API + :widths: 25 25 50 + :header-rows: 1 + + * - API + - Description + - API Doc Link + * - init + - Initializes NVFlare Client API environment. + - :func:`init` + * - receive + - Receives model from NVFlare side. + - :func:`receive` + * - send + - Sends the model to NVFlare side. + - :func:`send` + * - system_info + - Gets NVFlare system information. + - :func:`system_info` + * - get_job_id + - Gets job id. + - :func:`get_job_id` + * - get_site_name + - Gets site name. + - :func:`get_site_name` + * - is_running + - Returns whether the NVFlare system is up and running. + - :func:`is_running` + * - is_train + - Returns whether the current task is a training task. + - :func:`is_train` + * - is_evaluate + - Returns whether the current task is an evaluate task. + - :func:`is_evaluate` + * - is_submit_model + - Returns whether the current task is a submit_model task. + - :func:`is_submit_model` + +.. list-table:: Lightning APIs + :widths: 25 25 50 + :header-rows: 1 + + * - API + - Description + - API Doc Link + * - patch + - Patches the PyTorch Lightning Trainer for usage with FLARE. + - :func:`patch` + +.. list-table:: Metrics Logger + :widths: 25 25 50 + :header-rows: 1 + + * - API + - Description + - API Doc Link + * - SummaryWriter + - SummaryWriter mimics the usage of Tensorboard's SummaryWriter. + - :class:`SummaryWriter` + * - WandBWriter + - WandBWriter mimics the usage of weights and biases. + - :class:`WandBWriter` + * - MLflowWriter + - MLflowWriter mimics the usage of MLflow. + - :class:`MLflowWriter` + +Please check Client API Module :mod:`nvflare.client.api` for more in-depth +information about all of the Client API functionalities. + +If you are using PyTorch Lightning in your training code, you can check the +Lightning API Module :mod:`nvflare.app_opt.lightning.api`. + +.. note:: + The decorator API has been deprecated since release 2.5.0. + Please use the Client API instead. + + +Client API communication patterns +================================= + +.. image:: ../../resources/client_api.png + :height: 300px + +We offer various implementations of Client APIs tailored to different scenarios, each linked with distinct communication patterns. + +In-process Client API +--------------------- + +The in-process executor entails both the training script and client executor operating within the same process. +The training script will be launched once at the event of START_RUN and will keep on running till the END_RUN event. +Communication between them occurs through an efficient in-memory databus. + +When the training process involves either a single GPU or no GPUs, and the training script doesn't integrate third-party +training systems, the in-process executor is preferable (when available). 
+ +Sub-process Client API +---------------------- + +On the other hand, the LauncherExecutor employs the SubprocessLauncher to use a sub-process to execute the training script. This results in the client executor +and training script residing in separate processes. The "launch_once" option is provided to the SubprocessLauncher to control +whether to launch the external script everytime when getting the task from server, or just launch the script once at the event +of START_RUN and keeps running till the END_RUN event. Communication between them is facilitated by either CellPipe +(default) or FilePipe. + +For scenarios involving multi-GPU training or the utilization of external training infrastructure, opting for the Launcher executor might be more suitable. + + +Choice of different Pipes +========================= +In the 2.5.x release, for most users, we recommend utilizing the default setting with the in-process executor +(defaulting to memory-based data exchanges). +Conversely, in the 2.4.x release, we suggest using the default setting with CellPipe for most users. + +CellPipe facilitates TCP-based cell-to-cell connections between the Executor and training script processes on +the local host. The term cell represents logical endpoints. This communication enables the exchange of models, metrics, +and metadata between the two processes. + +In contrast, FilePipe offers file-based communication between the Executor and training script processes, +utilizing a job-specific file directory for exchanging models and metadata via files. While FilePipe is easier to set up +than CellPipe, it's not suitable for high-frequency metrics exchange. + + +Configuration +============= + +Different configurations are available for each type of executor. + +in-process executor configuration +--------------------------------- +This configuration specifically caters to PyTorch applications, providing serialization and deserialization +(aka Decomposers) for commonly used PyTorch objects. For non-PyTorch applications, the generic +:class:`InProcessClientAPIExecutor` can be employed. + +.. literalinclude:: ../../../job_templates/sag_pt_in_proc/config_fed_client.conf + + +subprocess launcher Executor configuration +------------------------------------------ +In the config_fed_client in the FLARE app, in order to launch the training script we use the +:class:`SubprocessLauncher` component. +The defined ``script`` is invoked, and ``launch_once`` can be set to either +launch once for the whole job (launch_once = True), or launch a process for each task received from the server (launch_once = False) + +``launch_once`` dictates how many times the training scripts are invoked during the overall training process. +When set to False, the executor essentially invokes ``python .py`` every round of training. +Typically, launch_once is set to True. + +A corresponding :class:`ClientAPILauncherExecutor` +is used as the executor to handle the tasks and perform the data exchange using the pipe. +For the Pipe component we provide implementations of :class:`FilePipe` +and :class:`CellPipe`. + +.. literalinclude:: ../../../job_templates/sag_pt/config_fed_client.conf + +For example configurations, take a look at the :github_nvflare_link:`job_templates ` +directory for templates using the launcher and Client API. + +.. 
note::
+    In the case that the user does not need to launch the process and instead
+    has their own existing external training system, this would involve using
+    the :ref:`3rd_party_integration`, which is based on the same underlying mechanisms.
+
+Examples
+========
+
+For examples of using the Client API with different frameworks,
+please refer to :github_nvflare_link:`examples/hello-world/ml-to-fl `.
+
+For additional examples, also take a look at the
+:github_nvflare_link:`step-by-step series `
+that uses the Client API to write the
+:github_nvflare_link:`train script `.
+
+
+Selection of Job Templates
+==========================
+To help users quickly set up job configurations, we provide many job templates. You can pick the job template that is closest to your use case
+and adapt it to your needs by modifying the needed variables.
+
+Use the command ``nvflare job list_templates`` to find all the job templates NVFlare provides.
+
+.. image:: ../../resources/list_templates_results.png
+    :height: 300px
+
+Looking at the ``Execution API Type``, you will find ``client_api``, which indicates that the specified job template uses the
+Client API configuration. You can further narrow down the selection by the choice of machine learning framework (PyTorch, scikit-learn, or XGBoost),
+in-process or not, the type of model (GNN, NeMo LLM), the workflow pattern (swarm learning, or standard FedAvg with scatter and gather (SAG)), etc.
+
+
+Custom Data Class Serialization/Deserialization
+===============================================
+
+To pass data in the form of a custom class, you can leverage the serialization tool inside NVFlare.
+
+For example:
+
+.. code-block:: python
+
+    class CustomClass:
+        def __init__(self, x, y):
+            self.x = x
+            self.y = y
+
+If you are using classes derived from ``Enum`` or dataclass, they will be handled by the default decomposers.
+For other custom classes, you will need to write a dedicated custom decomposer and ensure it is registered
+using ``fobs.register`` on both the server side and client side, as well as in train.py.
+
+Please note that for the custom data class to work, it must be placed in a separate file from train.py.
+
+For more details on serialization, please refer to :ref:`serialization`.
diff --git a/docs/programming_guide/executor.rst b/docs/programming_guide/execution_api_type/executor.rst
similarity index 86%
rename from docs/programming_guide/executor.rst
rename to docs/programming_guide/execution_api_type/executor.rst
index 90ad35bed7..2c50c69245 100644
--- a/docs/programming_guide/executor.rst
+++ b/docs/programming_guide/execution_api_type/executor.rst
@@ -1,16 +1,16 @@
 .. _executor:
 
-Executors
-=========
+Executor
+========
 
-.. image:: ../resources/Executor.png
+.. image:: ../../resources/Executor.png
     :height: 300px
 
-An :class:`Executor` in NVIDIA FLARE is a type of FLComponent for FL clients that has an
-``execute`` method that produces a Shareable from an input Shareable. The ``execute`` method also takes a str for
-task_name, FLContext, and abort_signal.
+An :class:`Executor` is an FLComponent for FL clients used for executing tasks,
+wherein the ``execute`` method receives and returns a Shareable object given a task name,
+``FLContext``, and ``abort_signal``.
 
-.. literalinclude:: ../../nvflare/apis/executor.py
+.. literalinclude:: ../../../nvflare/apis/executor.py
     :language: python
     :lines: 24-
 
@@ -64,7 +64,7 @@ sub-worker processes. Any component which listens to the event in the sub-worker
 accordingly.
Also, any event fired by the FL component in the sub-worker processes will be relayed by the MultiProcessExecutor to all other components to handle. -.. image:: ../resources/multi_process_executor.png +.. image:: ../../resources/multi_process_executor.png :height: 400px MultiProcessExecutor keeps the same FL Executor API signature. When turning the FL executor into @@ -93,7 +93,7 @@ processes to use. "local_epochs": 5, "steps_aggregation": 0, "model_reader_writer": { - "name": "PTModelReaderWriter" + "path": "nvflare.app_opt.pt.model_reader_writer.PTModelReaderWriter" } } } diff --git a/docs/programming_guide/execution_api_type/model_learner.rst b/docs/programming_guide/execution_api_type/model_learner.rst new file mode 100644 index 0000000000..292d0e78c3 --- /dev/null +++ b/docs/programming_guide/execution_api_type/model_learner.rst @@ -0,0 +1,202 @@ +.. _model_learner: + +############# +Model Learner +############# + +Introduction +============ + +The goal of :github_nvflare_link:`ModelLearner ` is to make it easier to write learning logic by minimizing FLARE specific concepts exposed to the user. + +The central concept of the ModelLearner is :github_nvflare_link:`FLModel `, which defines a structure to support federated learning functions with familiar learning terms. +To create a concrete model learner, the researcher will implement the training and validation methods only with the FLModel object. +The researcher no longer needs to deal with FLARE specific concepts such as Shareable and FLContext, though they are still available for advanced cases where FLModel is not enough. + +How to Create Model Learner +=========================== + +To create a concrete model learner, you extend from the ModelLearner class. The following shows the example of NPLearner: + +.. code-block:: python + + from nvflare.app_common.abstract.model_learner import ModelLearner + from nvflare.app_common.abstract.fl_model import FLModel, ParamsType + + + class NPLearner(ModelLearner): + +The following methods must be implemented: + +.. code-block:: python + + def initialize(self) + def train(self, model: FLModel) -> Union[str, FLModel]: + def get_model(self, model_name: str) -> Union[str, FLModel]: + def validate(self, model: FLModel) -> Union[str, FLModel]: + def configure(self, model: FLModel) + def abort(self) + def finalize(self) + +Please see the docstrings of these methods for explanation at :class:`ModelLearner`. + +Initialization and Finalization +------------------------------- + +In the case that the ModelLearner requires initialization, put your initialization logic in the ``initialize`` method, which is called only once before the learning job starts. +The ModelLearner base class provides many convenience methods that you may use in the initialization logic. + +Similarly your ModelLearner may need to be properly ended. +If so, put such logic in the ``finalize`` method, which is called only once when the learning job is finished. + +Learning Logic +-------------- + +Your learning logic is implemented in the ``train`` and ``validate`` methods. All learning information is contained in the FLModel object. +Similarly the result of the learning methods is either a FLModel object (when processing succeeds) or a str for the ReturnCode when processing fails for some reason. + +You should check the FLModel object's params_type to ensure that it has the params you expected. 
+ +If possible, you should periodically check whether the ModelLearner has been asked to abort in your learning logic, especially before or after a long-running step. +You can do so by calling the ``self.is_aborted()`` method. The typical usage pattern is: + +.. code-block:: python + + if self.is_aborted(): + return ReturnCode.TASK_ABORTED + + +If you run into a case that prevents the learning logic from proceeding, you can simply return a proper ReturnCode from the learning method. + +Return Requested Model +---------------------- + +The ModelLearner may be asked to return a specified type of model (e.g. best model). +For example, when training is done, the server may ask you to return the best local model so then it can send it to other sites to validate. +To support this, you need to implement the ``get_model`` method and return the requested model. + +Dynamic Configuration +--------------------- + +If you want to configure the ModelLearner dynamically based on information sent from the server (instead of statically based on locally configured information), you can do so by implementing the ``configure`` method. +The FLModel object should specify the config parameters for the model learning functions. + +Abort Gracefully +---------------- + +The ModelLearner may be asked to abort during the execution of its learning methods (e.g. the user may issue the ``abort_job`` command, or the server's controller may decide to abort the task). +Depending on the framework your learning method uses (e.g. MONAI, Ignite, TensorFlow, etc.), you may need to do something to make the training framework abort gracefully. +In this case, you will put such logic in the ``abort`` method. + +The ``abort`` method is optional. You don't need to implement this method if your training framework cannot be interrupted or does not need to be interrupted. + +Logging Methods +--------------- + +The ModelLearner base class provides convenience methods for logging: + +.. code-block:: python + + def debug(self, msg: str) + def info(self, msg: str) + def error(self, msg: str) + def warning(self, msg: str) + def exception(self, msg: str) + def critical(self, msg: str) + +You can use these methods to create log messages at different log levels in your learning logic. + +Get Additional Component +------------------------ + +FLARE runtime provides many service components (e.g. stats logging, security, config service) that you can use in your learner implementation. +You can get these objects via this method provided by the ModelLearner class: + +.. code-block:: python + + def get_component(self, component_id: str) -> Any + +You usually should call this when initializing the learner. + +Here is an example of using an AnalyticsSender client component in CIFAR10ModelLearner: + +.. code-block:: python + + self.writer = self.get_component( + self.analytic_sender_id + ) + +Get Contextual Information +-------------------------- + +The FLModel object contains essential information about the learning task. There is still other contextual information that you may need: + +- site_name: the name of the training site +- engine: the FLARE engine that provides additional information and services +- workspace: the workspace that you can use to retrieve and/or write data to +- job_id: the ID of the job +- app_root: the root directory of the current job in the workspace. 
+- shareable: the Shareable object that comes with the task +- fl_ctx: the FLContext object that comes with the task + +These are directly available in your learner object (self). + +The ModelLearner base class also provides additional convenience methods for you to get properties in the Shareable and FLContext objects: + +.. code-block:: python + + def get_shareable_header(self, key: str, default=None) + def get_context_prop(self, key: str, default=None) + +How to Install Model Learner +============================ + +Once your model learner is developed, you need to install it to the training client. +The model learner must work with the ModelLearnerExecutor that FLARE provides. +The following example shows how the model learner is configured in the job's ``config_fed_client.json``: + +.. code-block:: json + + { + "format_version": 2, + "executors": [ + { + "tasks": [ + "train" + ], + "executor": { + "name": "LearnerExecutor", + "path": "nvflare.app_common.executors.model_learner_executor.ModelLearnerExecutor", + "args": { + "learner_id": "np_learner" + } + } + } + ], + "task_result_filters": [ + ], + "task_data_filters": [ + ], + "components": [ + { + "id": "np_learner", + "path": "np_learner.NPLearner", + "args": { + } + } + ] + } + +Pay attention to the following: + +- The ``path`` of the ``executor`` must be ``nvflare.app_common.executors.model_learner_executor.ModelLearnerExecutor``. +- The ``learner_id`` in the ``executor`` and the ``id`` in the ``components`` must match (In this example it is ``np_learner``). +- The path of the ``np_learner`` component must point to your model learner implementation. + +More Resources +============== + +In addition to the :github_nvflare_link:`ModelLearner ` and :github_nvflare_link:`FLModel ` APIs, also take a look at some examples using the ModelLearner: + +- :github_nvflare_link:`Step-by-step ModelLearner ` +- :github_nvflare_link:`CIFAR10 ModelLearner ` diff --git a/docs/programming_guide/experiment_tracking.rst b/docs/programming_guide/experiment_tracking.rst index 06a274ee81..10a2fe27b7 100644 --- a/docs/programming_guide/experiment_tracking.rst +++ b/docs/programming_guide/experiment_tracking.rst @@ -4,139 +4,17 @@ Experiment Tracking ################### -*********************** -Overview and Approaches -*********************** +FLARE seamlessly integrates with leading experiment tracking systems—MLflow, Weights & Biases, and TensorBoard—to facilitate comprehensive monitoring of metrics. -In a federated computing setting, the data is distributed across multiple devices or systems, and training is run -on each device independently while preserving each client's data privacy. +You can choose between decentralized and centralized tracking configurations: -Assuming a federated system consisting of one server and many clients and the server coordinating the ML training of clients, -we can interact with ML experiment tracking tools in two different ways: +- **Decentralized tracking**: Each client manages its own metrics and experiment tracking server locally, maintaining training metric privacy. However, this setup limits the ability to compare data across different sites. +- **Centralized tracking**: All metrics are streamed to a central FL server, which then pushes the data to a selected tracking system. 
This setup supports effective cross-site metric comparisons - - Client-side experiment tracking: Each client will directly send the log metrics/parameters to the ML experiment - tracking server (like MLflow or Weights and Biases) or local file system (like tensorboard) - - Aggregated experiment tracking: Clients will send the log metrics/parameters to FL server, and the FL server will - send the metrics to ML experiment tracking server or local file system +We provide solutions for different client execution types. For the Client API, use the corresponding experiment tracking APIs. For Executors or Learners, use the experiment tracking LogWriters. -Each approach will have its use cases and unique challenges. In NVFLARE, we developed a server-side approach (in the -provided examples, the Receiver is on the FL server, but it could also be on the FL client): +.. toctree:: + :maxdepth: 1 - - Clients don't need to have access to the tracking server, avoiding the additional - authentication for every client. In many cases, the clients may be from different organizations - and different from the host organization of the experiment tracking server. - - Since we reduced connections to the tracking server from N clients to just one server, the traffic to the tracking server - can be highly reduced. In some cases, such as in MLFLow, the events can be buffered in the server and sent to the tracking - server in batches, further reducing the traffic to the tracking server. The buffer may add additional latency, so you can - disable the buffering if you can set the buffer flush time to 0 assuming the tracking server can take the traffic. - - Another key benefit of using server-side experiment tracking is that we separate the metrics data collection - from the metrics data delivery to the tracking server. Clients are only responsible for collecting metrics, and only the server needs to - know about the tracking server. This allows us to have different tools for data collection and data delivery. - For example, if the client has training code with logging in Tensorboard syntax, without changing the code, the server can - receive the logged data and deliver the metrics to MLflow. - - Server-side experiment tracking also can organize different clients' results into different experiment runs so they can be easily - compared side-by-side. - -************************************** -Tools, Sender, LogWriter and Receivers -************************************** - -With the "experiment_tracking" examples in the advanced examples directory, you can see how to track and visualize -experiments in real time and compare results by leveraging several experiment tracking solutions: - - - `Tensorboard `_ - - `MLflow `_ - - `Weights and Biases `_ - -.. note:: - - The user needs to sign up at Weights and Biases to access the service, NVFlare can not provide access. - -In the Federated Learning phase, users can choose an API syntax that they are used to from one -of above tools. NVFlare has developed components that mimic these APIs called -:class:`LogWriters `. All clients experiment logs -are streamed over to the FL server (with :class:`ConvertToFedEvent`), -where the actual experiment logs are recorded. The components that receive -these logs are called Receivers based on :class:`AnalyticsReceiver `. -The receiver component leverages the experiment tracking tool and records the logs during the experiment run. 
- -In a normal setting, we would have pairs of sender and receivers, such as: - - - TBWriter <-> TBReceiver - - MLflowWriter <-> MLflowReceiver - - WandBWriter <-> WandBReceiver - -You can also mix and match any combination of LogWriter and Receiver so you can write the ML code using one API -but use any experiment tracking tool or tools (you can use multiple receivers for the same log data sent from one sender). - -.. image:: ../resources/experiment_tracking.jpg - -************************* -Experiment logs streaming -************************* - -On the client side, when a :class:`LogWriters ` writes the -metrics, instead of writing to files, it actually generates an NVFLARE event (of type `analytix_log_stats` by default). -The `ConvertToFedEvent` widget will turn the local event `analytix_log_stats` into a -fed event `fed.analytix_log_stats`, which will be delivered to the server side. - -On the server side, the :class:`AnalyticsReceiver ` is configured -to process `fed.analytix_log_stats` events, which writes received log data to the appropriate tracking solution. - -**************************************** -Support custom experiment tracking tools -**************************************** - -There are many different experiment tracking tools, and you might want to write a custom writer and/or receiver for your needs. - -There are three things to consider for developing a custom experiment tracking tool. - -Data Type -========= - -Currently, the supported data types are metrics, params, and text. If you require other data types, may sure you add -the type to :class:`AnalyticsDataType `. - -Writer -====== - -Implement LogWriter interface with the API syntax. For each tool, we mimic the API syntax of the underlying tool, -so users can use what they are familiar with without learning a new API. -For example, for Tensorboard, TBWriter uses add_scalar() and add_scalars(); for MLflow, the syntax is -log_metric(), log_metrics(), log_parameter(), and log_parameters(); for W&B, the writer just has log(). -The data collected with these calls will all send to the AnalyticsSender to deliver to the FL server. - -Receiver -======== - -Implement AnalyticsReceiver interface and determine how to represent different sites' logs. In all three implementations -(Tensorboard, MLflow, WandB), each site's log is represented as one run. Depending on the individual tool, the implementation -can be different. For example, for both Tensorboard and MLflow, we create different runs for each client and map to the -site name. In the WandB implementation, we have to leverage multiprocess and let each run in a different process. - -***************** -Examples Overview -***************** - -The :github_nvflare_link:`experiment tracking examples ` -illustrate how to leverage different writers and receivers. All examples are based upon the hello-pt example. - -The example in the "tensorboard" directory shows how to use the Tensorboard Tracking Tool (for both the -sender and receiver). See :ref:`tensorboard_streaming` for details. - -Under the "mlflow" directory, the "hello-pt-mlflow" job shows how to use MLflow for tracking with both the MLflow sender -and receiver. The "hello-pt-tb-mlflow" job shows how to use the Tensorboard Sender, while the receiver is MLflow. -See :ref:`experiment_tracking_mlflow` for details. - -Under the :github_nvflare_link:`wandb ` directory, the -"hello-pt-wandb" job shows how to use Weights and Biases for experiment tracking with -the WandBWriter and WandBReceiver to log metrics. 
- -MONAI Integration -================= - -:github_nvflare_link:`Integration with MONAI ` uses the `NVFlareStatsHandler` -:class:`LogWriterForMetricsExchanger ` to connect to -:class:`MetricsRetriever `. See the job -:github_nvflare_link:`spleen_ct_segmentation_local ` -for more details on this configuration. + experiment_tracking/experiment_tracking_apis + experiment_tracking/experiment_tracking_log_writer diff --git a/docs/programming_guide/experiment_tracking/experiment_tracking_apis.rst b/docs/programming_guide/experiment_tracking/experiment_tracking_apis.rst new file mode 100644 index 0000000000..8eabe53aa2 --- /dev/null +++ b/docs/programming_guide/experiment_tracking/experiment_tracking_apis.rst @@ -0,0 +1,212 @@ +.. _experiment_tracking_apis: + +######################## +Experiment Tracking APIs +######################## + +.. figure:: ../../resources/experiment_tracking_diagram.png + :height: 500px + +To track training metrics such as accuracy or loss or AUC, we need to log these metrics with one of the experiment tracking systems. +Here we will discuss the following topics: + +- Logging metrics with MLflow, TensorBoard, or Weights & Biases +- Streaming metrics to the FL server +- Streaming to FL clients + +Logging metrics with MLflow, TensorBoard, or Weights & Biases +============================================================= + +Integrate MLflow logging to efficiently stream metrics to the MLflow server with just three lines of code: + +.. code-block:: python + + from nvflare.client.tracking import MLflowWriter + + mlflow = MLflowWriter() + + mlflow.log_metric("loss", running_loss / 2000, global_step) + +In this setup, we use ``MLflowWriter`` instead of using the MLflow API directly. +This abstraction is important, as it enables users to flexibly redirect your logging metrics to any destination, which we discuss in more detail later. + +The use of MLflow, TensorBoard, or Weights & Biases syntax will all work to stream the collected metrics to any supported experiment tracking system. +Choosing to use TBWriter, MLflowWriter, or WandBWriter is user preference based on your existing code and requirements. + +- ``MLflowWriter`` uses the Mlflow API operation syntax ``log_metric()`` +- ``TBWriter`` uses the TensorBoard SummaryWriter operation ``add_scalar()`` +- ``WandBWriter`` uses the Weights & Biases API operation ``log()`` + +Here are the APIs: + +.. code-block:: python + + class TBWriter(LogWriter): + def add_scalar(self, tag: str, scalar: float, global_step: Optional[int] = None, **kwargs): + def add_scalars(self, tag: str, scalars: dict, global_step: Optional[int] = None, **kwargs): + + + class WandBWriter(LogWriter): + def log(self, metrics: Dict[str, float], step: Optional[int] = None): + + + class MLflowWriter(LogWriter): + def log_param(self, key: str, value: any) -> None: + def log_params(self, values: dict) -> None: + def log_metric(self, key: str, value: float, step: Optional[int] = None) -> None: + def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> None: + def log_text(self, text: str, artifact_file_path: str) -> None: + def set_tag(self, key: str, tag: any) -> None: + def set_tags(self, tags: dict) -> None: + + +After you've modified the training code, you can use the NVFlare's job configuration to configure the system to stream the logs appropriately. + +Streaming metrics to FL server +============================== + +All metric key values are captured as events, with the flexibility to stream them to the most suitable destinations. 
+Let's add the ``ConvertToFedEvent`` to convert these metrics events to federated events so they will be sent to the server. + +Add this component to config_fed_client.json: + +.. code-block:: json + + { + "id": "event_to_fed", + "name": "ConvertToFedEvent", + "args": {"events_to_convert": ["analytix_log_stats"], "fed_event_prefix": "fed."} + } + +If using the subprocess Client API with the ClientAPILauncherExecutor (rather than the in-process Client API with the InProcessClientAPIExecutor), +we need to add the ``MetricRelay`` to fire fed events, a ``CellPipe`` for metrics, and an ``ExternalConfigurator`` for client api initialization. + +.. code-block:: yaml + + { + id = "metric_relay" + path = "nvflare.app_common.widgets.metric_relay.MetricRelay" + args { + pipe_id = "metrics_pipe" + event_type = "fed.analytix_log_stats" + read_interval = 0.1 + } + }, + { + id = "metrics_pipe" + path = "nvflare.fuel.utils.pipe.cell_pipe.CellPipe" + args { + mode = "PASSIVE" + site_name = "{SITE_NAME}" + token = "{JOB_ID}" + root_url = "{ROOT_URL}" + secure_mode = "{SECURE_MODE}" + workspace_dir = "{WORKSPACE}" + } + }, + { + id = "config_preparer" + path = "nvflare.app_common.widgets.external_configurator.ExternalConfigurator" + args { + component_ids = ["metric_relay"] + } + } + + +On the server, configure the experiment tracking system in ``config_fed_server.conf`` using one of the following receivers. +Note that any of the receivers can be used regardless of the which writer is used. + +- ``MLflowReceiver`` for MLflow +- ``TBAnalyticsReceiver`` for TensorBoard +- ``WandBReceiver`` for Weights & Biases + +For example, here we add the ``MLflowReceiver`` component to the components configuration array: + +.. code-block:: yaml + + { + "id": "mlflow_receiver_with_tracking_uri", + "path": "nvflare.app_opt.tracking.mlflow.mlflow_receiver.MLflowReceiver", + "args": { + tracking_uri = "file:///{WORKSPACE}/{JOB_ID}/mlruns" + "kwargs": { + "experiment_name": "hello-pt-experiment", + "run_name": "hello-pt-with-mlflow", + "experiment_tags": { + "mlflow.note.content": "markdown for the experiment" + }, + "run_tags": { + "mlflow.note.content": "markdown describes details of experiment" + } + }, + "artifact_location": "artifacts" + } + } + +Notice the args{} are user defined, such as tracking_uri, experiment_name, tags etc., and will be specific to which receiver is configured. + +The MLflow tracking URL argument ``tracking_uri`` is None by default, which uses the MLflow default URL, ``http://localhost:5000``. +To make this accessible from another machine, make sure to change it to the correct URL, or point to to the ``mlruns`` directory in the workspace. + +:: + + tracking_uri = + +:: + + tracking_uri = "file:///{WORKSPACE}/{JOB_ID}/mlruns" + +You can change other arguments such as experiments, run_name, tags (using Markdown syntax), and artifact location. + +Start the MLflow server with one of the following commands: + +:: + + mlflow server --host 127.0.0.1 --port 5000 + +:: + + mlflow ui -port 5000 + +For more information with an example walkthrough, see the :github_nvflare_link:`FedAvg with SAG with MLflow tutorial `. + + +Streaming metrics to FL clients +=============================== + +If streaming metrics to the FL server isn't preferred due to privacy or other concerns, users can alternatively stream metrics to the FL client. +In such cases, there's no need to add the ``ConvertToFedEvent`` component on the client side. 
+Additionally, since we're not streaming to the server side, there's no requirement to configure receivers in the server configuration.
+
+Instead, to receive records on the client side, configure the metrics receiver in the client configuration rather than the server configuration.
+
+For example, in the TensorBoard configuration, add this component to ``config_fed_client.conf``:
+
+.. code-block:: yaml
+
+    {
+      "id": "tb_analytics_receiver",
+      "name": "TBAnalyticsReceiver",
+      "args": {"events": ["analytix_log_stats"]}
+    }
+
+Note that the ``events`` argument is ``analytix_log_stats``, not ``fed.analytix_log_stats``, indicating that this is a local event.
+
+If using the ``MetricRelay`` component, we can similarly change its ``event_type`` value from ``fed.analytix_log_stats`` to ``analytix_log_stats`` to follow the local event naming convention.
+We then must set the ``MetricRelay`` argument ``fed_event`` to ``false`` to fire local events rather than the default fed events.
+
+.. code-block:: yaml
+
+    {
+      id = "metric_relay"
+      path = "nvflare.app_common.widgets.metric_relay.MetricRelay"
+      args {
+        pipe_id = "metrics_pipe"
+        event_type = "analytix_log_stats"
+        # how fast should it read from the peer
+        read_interval = 0.1
+        fed_event = false
+      }
+    }
+
+Then, the metrics will stream to the client.
diff --git a/docs/programming_guide/experiment_tracking/experiment_tracking_log_writer.rst b/docs/programming_guide/experiment_tracking/experiment_tracking_log_writer.rst
new file mode 100644
index 0000000000..58fc744f55
--- /dev/null
+++ b/docs/programming_guide/experiment_tracking/experiment_tracking_log_writer.rst
@@ -0,0 +1,152 @@
+.. _experiment_tracking_log_writer:
+
+##############################
+Experiment Tracking Log Writer
+##############################
+
+.. note::
+
+    This page covers experiment tracking using :class:`LogWriters `,
+    which are configured and used with :ref:`executor` or :ref:`model_learner` on the FLARE-side code.
+    If using the Client API, please refer to :ref:`experiment_tracking_apis` and :ref:`client_api` for adding experiment tracking to your custom training code.
+
+***********************
+Overview and Approaches
+***********************
+
+In a federated computing setting, the data is distributed across multiple devices or systems, and training is run
+on each device independently while preserving each client's data privacy.
+
+Assuming a federated system consisting of one server and many clients, with the server coordinating the ML training of clients,
+we can interact with ML experiment tracking tools in two different ways:
+
+  - Client-side experiment tracking: Each client will directly send the log metrics/parameters to the ML experiment
+    tracking server (like MLflow or Weights and Biases) or a local file system (like TensorBoard)
+  - Aggregated experiment tracking: Clients will send the log metrics/parameters to the FL server, and the FL server will
+    send the metrics to the ML experiment tracking server or a local file system
+
+This is enabled by Receivers, which can be configured on the FL server, FL client, or on both. Each approach will have its use cases and unique challenges.
+Here we provide examples and describe the server-side approach:
+
+  - Clients don't need to have access to the tracking server, avoiding the additional
+    authentication for every client. In many cases, the clients may be from different organizations
+    and different from the host organization of the experiment tracking server.
+ - Since we reduced connections to the tracking server from N FL clients to just one FL server, the traffic to the tracking server + can be highly reduced. In some cases, such as in MLFLow, the events can be buffered in the server and sent to the tracking + server in batches, further reducing the traffic to the tracking server. The buffer may add additional latency, so you can + disable the buffering if you can set the buffer flush time to 0 assuming the tracking server can take the traffic. + - Another key benefit of using server-side experiment tracking is that we separate the metrics data collection + from the metrics data delivery to the tracking server. Clients are only responsible for collecting metrics, and only the server needs to + know about the tracking server. This allows us to have different tools for data collection and data delivery. + For example, if the client has training code with logging in Tensorboard syntax, without changing the code, the server can + receive the logged data and deliver the metrics to MLflow. + - Server-side experiment tracking also can organize different clients' results into different experiment runs so they can be easily + compared side-by-side. + +************************************** +Tools, Sender, LogWriter and Receivers +************************************** + +With the "experiment_tracking" examples in the advanced examples directory, you can see how to track and visualize +experiments in real time and compare results by leveraging several experiment tracking solutions: + + - `Tensorboard `_ + - `MLflow `_ + - `Weights and Biases `_ + +.. note:: + + The user needs to sign up at Weights and Biases to access the service, NVFlare can not provide access. + +In the Federated Learning phase, users can choose an API syntax that they are used to from one +of above tools. NVFlare has developed components that mimic these APIs called +:class:`LogWriters `. All clients experiment logs +are streamed over to the FL server (with :class:`ConvertToFedEvent`), +where the actual experiment logs are recorded. The components that receive +these logs are called Receivers based on :class:`AnalyticsReceiver `. +The receiver component leverages the experiment tracking tool and records the logs during the experiment run. + +In a normal setting, we would have pairs of sender and receivers, with some provided implementations in :mod:`nvflare.app_opt.tracking`: + + - TBWriter <-> TBAnalyticsReceiver + - MLflowWriter <-> MLflowReceiver + - WandBWriter <-> WandBReceiver + +You can also mix and match any combination of LogWriter and Receiver so you can write the ML code using one API +but use any experiment tracking tool or tools (you can use multiple receivers for the same log data sent from one sender). + +.. image:: ../../resources/experiment_tracking.jpg + +************************* +Experiment logs streaming +************************* + +On the client side, when a :class:`LogWriters ` writes the +metrics, instead of writing to files, it actually generates an NVFLARE event (of type `analytix_log_stats` by default). +The `ConvertToFedEvent` widget will turn the local event `analytix_log_stats` into a +fed event `fed.analytix_log_stats`, which will be delivered to the server side. + +On the server side, the :class:`AnalyticsReceiver ` is configured +to process `fed.analytix_log_stats` events, which writes received log data to the appropriate tracking solution. 
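+
+As a minimal sketch of this flow (component ids are illustrative; see the experiment tracking examples for complete job configurations), ``config_fed_client.json`` could pair a ``TBWriter`` with the ``ConvertToFedEvent`` widget:
+
+.. code-block:: json
+
+    {
+      "id": "analytic_sender",
+      "name": "TBWriter",
+      "args": {"event_type": "analytix_log_stats"}
+    },
+    {
+      "id": "event_to_fed",
+      "name": "ConvertToFedEvent",
+      "args": {"events_to_convert": ["analytix_log_stats"], "fed_event_prefix": "fed."}
+    }
+
+while ``config_fed_server.json`` would configure the matching receiver for the resulting fed events:
+
+.. code-block:: json
+
+    {
+      "id": "tb_analytics_receiver",
+      "name": "TBAnalyticsReceiver",
+      "args": {"events": ["fed.analytix_log_stats"]}
+    }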
+ +**************************************** +Support custom experiment tracking tools +**************************************** + +There are many different experiment tracking tools, and you might want to write a custom writer and/or receiver for your needs. + +There are three things to consider for developing a custom experiment tracking tool. + +Data Type +========= + +Currently, the supported data types are listed in :class:`AnalyticsDataType `, and other data types can be added as needed. + +Writer +====== +Implement :class:`LogWriter ` interface with the API syntax. For each tool, we mimic the API syntax of the underlying tool, +so users can use what they are familiar with without learning a new API. +For example, for Tensorboard, TBWriter uses add_scalar() and add_scalars(); for MLflow, the syntax is +log_metric(), log_metrics(), log_parameter(), and log_parameters(); for W&B, the writer just has log(). +The data collected with these calls will all send to the AnalyticsSender to deliver to the FL server. + +Receiver +======== + +Implement :class:`AnalyticsReceiver ` interface and determine how to represent different sites' logs. In all three implementations +(Tensorboard, MLflow, WandB), each site's log is represented as one run. Depending on the individual tool, the implementation +can be different. For example, for both Tensorboard and MLflow, we create different runs for each client and map to the +site name. In the WandB implementation, we have to leverage multiprocess and let each run in a different process. + +***************** +Examples Overview +***************** + +The :github_nvflare_link:`experiment tracking examples ` +illustrate how to leverage different writers and receivers. All examples are based upon the hello-pt example. + +TensorBoard +=========== +The example in the "tensorboard" directory shows how to use the Tensorboard Tracking Tool (for both the +sender and receiver). See :ref:`tensorboard_streaming` for details. + +MLflow +====== +Under the "mlflow" directory, the "hello-pt-mlflow" job shows how to use MLflow for tracking with both the MLflow sender +and receiver. The "hello-pt-tb-mlflow" job shows how to use the Tensorboard Sender, while the receiver is MLflow. +See :ref:`experiment_tracking_mlflow` for details. + +Weights & Biases +================ +Under the :github_nvflare_link:`wandb ` directory, the +"hello-pt-wandb" job shows how to use Weights and Biases for experiment tracking with +the WandBWriter and WandBReceiver to log metrics. + +MONAI Integration +================= + +:github_nvflare_link:`Integration with MONAI ` uses the `NVFlareStatsHandler` +:class:`LogWriterForMetricsExchanger ` to connect to +:class:`MetricsRetriever `. See the job +:github_nvflare_link:`spleen_ct_segmentation_local ` +for more details on this configuration. diff --git a/docs/programming_guide/fed_job_api.rst b/docs/programming_guide/fed_job_api.rst new file mode 100644 index 0000000000..8f9e3cec01 --- /dev/null +++ b/docs/programming_guide/fed_job_api.rst @@ -0,0 +1,413 @@ +.. _fed_job_api: + +########## +FedJob API +########## + +The FLARE :class:`FedJob` API allows users to Pythonically define and create job configurations. + +Core Concepts +============= + +* Use the :func:`to` routine to assign objects (e.g. Controller, ScriptRunner, Executor, PTModel, Filters, Components etc.) to the server or clients. +* Objects can define how they are added to the job by implementing ``add_to_fed_job``, otherwise they are added as components. 
+* Export the job to a configuration with :func:`export_job`. +* Run the job in the simulator with :func:`simulator_run`. + +Table overview of the :class:`FedJob` API: + +.. list-table:: FedJob API + :widths: 25 35 50 + :header-rows: 1 + + * - API + - Description + - API Doc Link + * - to + - Assign object to target. + - :func:`to` + * - to_server + - Assign object to server. + - :func:`to_server` + * - to_clients + - Assign object to all clients. + - :func:`to_clients` + * - set_up_client + - To be used in FedJob subclasses. Setup routine called by FedJob when first sending object to a client target. + - :func:`set_up_client` + * - as_id + - Return generated uuid of object. Object will be added as component if referenced. + - :func:`as_id` + * - simulator_run + - Run the job with the simulator. + - :func:`simulator_run` + * - export_job + - Export the job configuration. + - :func:`export_job` + + +Here is an example of how to create a simple cifar10_fedavg job using the :class:`FedJob` API. +We assign a FedAvg controller and the initial PyTorch model to the server, and assign a ScriptExecutor for our training script to the clients. +Then we use the simulator to run the job: + +.. code-block:: python + + from src.net import Net + + from nvflare.app_common.widgets.intime_model_selector import IntimeModelSelector + from nvflare.app_common.workflows.fedavg import FedAvg + from nvflare.app_opt.pt.job_config.model import PTModel + + from nvflare.job_config.api import FedJob + from nvflare.job_config.script_runner import ScriptRunner + + if __name__ == "__main__": + n_clients = 2 + num_rounds = 2 + train_script = "src/cifar10_fl.py" + + # Create the FedJob + job = FedJob(name="cifar10_fedavg") + + # Define the FedAvg controller workflow and send to server + controller = FedAvg( + num_clients=n_clients, + num_rounds=num_rounds, + ) + job.to_server(controller) + + # Define the initial global model with PTModel wrapper and send to server + job.to_server(PTModel(Net())) + + # Add model selection widget and send to server + job.to_server(IntimeModelSelector(key_metric="accuracy")) + + # Send ScriptRunner to all clients + runner = ScriptRunner( + script=train_script, script_args="f--batch_size 32 --data_path /tmp/data/site-{i}" + ) + job.to_clients(runner) + + # job.export_job("/tmp/nvflare/jobs/job_config") + job.simulator_run("/tmp/nvflare/jobs/workdir", n_clients=n_clients) + + +Initializing the FedJob +======================= + +Initialize the :class:`FedJob` object with the following arguments: + +* ``name`` (str): for job name. +* ``min_clients`` (int): required for the job, will be set in the meta.json. +* ``mandatory_clients`` (List[str]): to run the job, will be set in the meta.json. + +Example: + +.. code-block:: python + + job = FedJob(name="cifar10_fedavg", min_clients=2, mandatory_clients=["site-1", "site-2"]) + +Assigning objects with :func:`to` +===================================================================== + +Assign objects with :func:`to` for a specific ``target``, +:func:`to_server` for the server, and +:func:`to_clients` for all the clients. + +These functions have the following parameters which are used depending on the type of object: + +* ``obj`` (any): The object to be assigned. The obj will be given a default id if none is provided based on its type. +* ``target`` (str): (For :func:`to`) The target location of the object. Can be “server” or a client name, e.g. “site-1”. 
+* ``**kwargs``: if the object implements the ``add_to_fed_job`` method, ``kwargs`` are additional args to be passed to this function. See the specific object's section for more details. + +.. note:: + + In order for the FedJob to use the values of arguments passed into the ``obj``, the arguments must be set as instance variables of the same name (or prefixed with "_") in the constructor. + +Below we cover in-depth how different types of objects are handled when using :func:`to`: + + +Controller +---------- + +If the object is a :class:`Controller` sent to the server, the controller is added to the server app workflows. + +Example: + +.. code-block:: python + + controller = FedAvg( + num_clients=n_clients, + num_rounds=num_rounds, + ) + job.to(controller, "server") + +If the object is a :class:`Controller` sent to a client, the controller is added to the client app components as a client-side controller. +The controller can then be used by the :class:`ClientControllerExecutor`. + +ScriptRunner +------------ + +The :class:`ScriptRunner` can be added to clients and is used to run or launch a script. +The ``tasks`` parameter specifies the tasks the script is defined the handle (defaults to "[*]" for all tasks). + +ScriptRunner args: + +* ``script``: the script to run, will automatically be added to the custom folder. +* ``script_args``: arguments appended to the end of script. +* ``launch_external_process``: two modes, default in-process (launch_external_process=False) and ex-process (launch_external_process=True). +* ``command``: in the ex-process mode, command is prepended to the script (defaults to "python3"). +* ``framework``: determines what :class:`FrameworkType` to use for the script. + + +Example: + +.. code-block:: python + + # in-process: runs `__main__` of "src/cifar10_fl.py" with argv "--batch_size 32" + in_process_runner = ScriptRunner( + script="src/cifar10_fl.py", + script_args="--batch_size 32" + ) + job.to(in_process_runner, "site-1", tasks=["train"]) + + # subprocess: runs `python3 -u custom/src/cifar10_fl.py --batch_size 32` + external_process_runner = ScriptRunner( + script="src/cifar10_fl.py", + script_args="--batch_size 32", + launch_external_process=True, + command="python3 -u" + ) + job.to(external_process_runner, "site-2", tasks=["train"]) + + +For more details on how the ScriptRunner internally configures the ``InProcessClientAPIExecutor`` or ``ClientAPILauncherExecutor``, refer to its +:func:`add_to_fed_job` implementation. +A dictionary of component ids added is also returned to be used if needed. + + +Executor +-------- + +If the object is an :class:`Executor`, it must be sent to a client. The executor is added to the client app executors. +The ``tasks`` parameter specifies the tasks that the executor is defined the handle (defaults to "[*]" for all tasks). + +Example: + +.. code-block:: python + + executor = MyExecutor() + job.to(executor, "site-1", tasks=["train"]) + + +Resource (str) +-------------- + +If the object is a str, it is treated as an external resource and will be included in the custom directory. + +* If the object is a script, it will be copied to the custom directory. +* If the object is a directory, the directory will be copied flat to the custom directory. + +Example: + +.. 
 code-block:: python
+
+    job.to("src/cifar10_fl.py", "site-1")  # script
+    job.to("content_dir", "site-1")  # directory
+
+
+Filter
+------
+
+If the object is a :class:`Filter`,
+
+* Users must specify the ``filter_type`` as either FilterType.TASK_RESULT (flow from executor to controller) or FilterType.TASK_DATA (flow from controller to executor).
+* The filter will be added to task_data_filters or task_result_filters accordingly and be applied to the specified ``tasks`` (defaults to "[*]" for all tasks).
+
+Example:
+
+.. code-block:: python
+
+    pp_filter = PercentilePrivacy(percentile=10, gamma=0.01)
+    job.to(pp_filter, "site-1", tasks=["train"], filter_type=FilterType.TASK_RESULT)
+
+
+Model Wrappers
+--------------
+
+Model Wrappers :class:`PTModel` and :class:`TFModel` are used for adding a model with a persistor.
+
+* :class:`PTModel`: for PyTorch models (torch.nn.Module) we add a :class:`PTFileModelPersistor` and :class:`PTFileModelLocator`, and return a dictionary for these added component ids.
+* :class:`TFModel`: for TensorFlow models (tf.keras.Model) we add a :class:`TFModelPersistor` and return the added persistor id.
+
+Example:
+
+.. code-block:: python
+
+    component_ids = job.to(PTModel(Net()), "server")
+
+For other types of models, the model and persistor can be added explicitly as components.
+
+
+Components
+----------
+For any object that does not fall under any of the previous types and does not implement ``add_to_fed_job``, it is added as a component with an ``id``.
+
+* The ``id`` can be either specified as a parameter, or it will be automatically assigned.
+* If adding a component with a previously used id, then the id will be incremented (e.g. "component_id1", "component_id2") and returned.
+* Components may reference other components by id.
+
+Example:
+
+.. code-block:: python
+
+    job.to_server(IntimeModelSelector(key_metric="accuracy"))
+
+
+If an id generated by :func:`as_id` is referenced by another added object, the referenced object will also be added as a component.
+In the example below, comp2 is assigned to the server. Since comp1 was referenced in comp2 with :func:`as_id`, comp1 will also be added as a component to the server.
+
+Example:
+
+.. code-block:: python
+
+    comp1 = Component1()
+    comp2 = Component2(sub_component_id=job.as_id(comp1))
+    job.to(comp2, "server")
+
+
+add_to_fed_job
+===============
+
+If the obj implements the ``add_to_fed_job`` method, it will be called with the kwargs. The implementation of add_to_fed_job is specific to the obj being added.
+This method must follow this signature:
+
+.. code-block:: python
+
+    add_to_fed_job(job, ctx, ...)
+
+Many of the object types covered in the above sections have implemented add_to_fed_job, as they either handle special cases or serve as wrappers that add additional related components.
+
+As shown in the table below, the Object Developer FedJob API provides functions to add components, Controllers, Executors, Filters, and resources.
+The Job Context ``ctx`` should simply be passed to these "add_xxx" methods, and does not need to be accessed directly.
+Additionally, the check_kwargs function can check and enforce required arguments in the kwargs.
+
+.. note::
+
+    When adding other components, a good practice is to return the ids of the extra components added in case they might be needed elsewhere.
+
+
+Example of :class:`TFModel` :func:`add_to_fed_job`:
+
+.. code-block:: python
+
+    def add_to_fed_job(self, job, ctx):
+        """This method is used by Job API. 
+ + Args: + job: the Job object to add to + ctx: Job Context + + Returns: + dictionary of ids of component added + """ + if isinstance(self.model, tf.keras.Model): # if model, create a TF persistor + persistor = TFModelPersistor(model=self.model) + persistor_id = job.add_component(comp_id="persistor", obj=persistor, ctx=ctx) + return persistor_id + else: + raise ValueError( + f"Unable to add {self.model} to job with TFModelPersistor. Expected tf.keras.Model but got {type(self.model)}." + ) + + +.. list-table:: FedJob Object Developer API + :widths: 25 35 50 + :header-rows: 1 + + * - API + - Description + - API Doc Link + * - add_component + - Add a component to the job. + - :func:`add_component` + * - add_controller + - Add a Controller object to the job. + - :func:`add_controller` + * - add_executor + - Add an executor to the job. + - :func:`add_executor` + * - add_filter + - Add a filter to the job. + - :func:`add_filter` + * - add_resources + - Add resources to the job. + - :func:`add_resources` + * - check_kwargs + - Check kwargs for arguments. Raise Error if required arg is missing, or unexpected arg is given. + - :func:`check_kwargs` + + +Job Pattern Inheritance +======================== + +Job inheritance can be useful when there are common patterns that can be reused in many jobs. + +When subclassing FedJob, any number of objects can be sent to the server in the __init__, +and :func:`set_up_client` can be implemented to send objects to clients. +``set_up_client`` is called by FedJob when first sending object to a client target, as the specific client targets can vary. + +For example of a Job pattern, we can use :class:`FedAvgJob` to simplify the creation of a FedAvg job. +The FedAvgJob automatically adds the FedAvg controller, PTFileModelPersistor and IntimeModelSelector, resulting in the following experience: + +.. code-block:: python + + job = FedAvgJob(name="cifar10_fedavg", num_rounds=num_rounds, n_clients=n_clients, initial_model=Net()) + +For more examples of job patterns, see: + +* :class:`BaseFedJob` +* :class:`FedAvgJob` (pytorch) +* :class:`FedAvgJob` (tensorflow) +* :class:`CCWFJob` +* :class:`FlowerJob` + +.. note:: + + Some of the default components included in these patterns are different, always refer to the + exported job configs for a full list of components used at every site. + + +Running the Job +=============== + +Simulator +--------- + +Run the FedJob with the simulator with :func:`simulator_run` in the ``workspace``, with ``n_clients``, ``threads``, and ``gpu`` assignments. + +.. note:: + + Only set ``n_clients`` if you have not specified clients using :func:`to`. + +Example: + +.. code-block:: python + + job.simulator_run(workspace="/tmp/nvflare/jobs/workdir", n_clients=2, threads=2, gpu="0,1") + + +Export Configuration +-------------------- +We can export the job configuration with :func:`export_job` to the ``job_root`` directory to be used in other modes. + +Example: + +.. code-block:: python + + job.export_job(job_root="/tmp/nvflare/jobs/job_config") + +Examples +======== + +To see examples of how the FedJob API can be used for different applications, refer the :github_nvflare_link:`Getting Started ` and :github_nvflare_link:`Job API ` examples. diff --git a/docs/programming_guide/fl_model.rst b/docs/programming_guide/fl_model.rst new file mode 100644 index 0000000000..702af3a4de --- /dev/null +++ b/docs/programming_guide/fl_model.rst @@ -0,0 +1,17 @@ +.. 
_fl_model:
+
+FLModel
+=======
+
+We define a standard data structure :mod:`FLModel`
+that captures the common attributes needed for exchanging learning results.
+
+This is particularly useful when the NVFlare system needs to exchange learning
+information with external training scripts/systems.
+
+The external training script/system only needs to extract the required
+information from the received FLModel, run local training, and put the results
+in a new FLModel to be sent back.
+
+For a detailed explanation of each attribute, please refer to the API doc:
+:mod:`FLModel`
diff --git a/docs/programming_guide/resources/te.py b/docs/programming_guide/resources/te.py
deleted file mode 100644
index 3f511a6c34..0000000000
--- a/docs/programming_guide/resources/te.py
+++ /dev/null
@@ -1,9 +0,0 @@
-def _get_model_weights(self) -> Shareable:
-    # Get state dict and send as weights
-    new_weights = self.model.state_dict()
-    new_weights = {k: v.cpu().numpy() for k, v in new_weights.items()}
-
-    outgoing_dxo = DXO(
-        data_kind=DataKind.WEIGHTS, data=new_weights, meta={MetaKey.NUM_STEPS_CURRENT_ROUND: self._n_iterations}
-    )
-    return outgoing_dxo.to_shareable()
diff --git a/docs/programming_guide/serialization.rst b/docs/programming_guide/serialization.rst
deleted file mode 100644
index b951886789..0000000000
--- a/docs/programming_guide/serialization.rst
+++ /dev/null
@@ -1 +0,0 @@
-See :ref:`serialization`.
\ No newline at end of file
diff --git a/docs/programming_guide/workflows_and_controllers.rst b/docs/programming_guide/workflows_and_controllers.rst
index 052327365e..8b8bd6ce24 100644
--- a/docs/programming_guide/workflows_and_controllers.rst
+++ b/docs/programming_guide/workflows_and_controllers.rst
@@ -7,15 +7,49 @@ A workflow has one or more controllers, each implementing a specific coordinatio
 CrossSiteValidation controller implements a strategy to let every client site evaluate every other site's model.
 You can put together a workflow that uses any number of controllers.
 
-Before version 2.4, all federating learning workflows (fed-average, cyclic controller, cross-site evaluation) were server controlled,
-implemented with the server-side :ref:`controllers `. In these workflows,
-FL clients get tasks assigned by the controller, execute the tasks,
-and submit results back to the server. The first section covers the server-side
-controller API for server-controlled workflows. The second section covers :ref:`client_controlled_workflows` for
-workflows that are controlled by the clients.
+We provide the FLModel-based :ref:`model_controller`, which offers a straightforward way for users to write controllers.
+We also have the original :ref:`Controller API ` with more FLARE-specific functionalities, which many of our existing workflows are based upon.
+
+We have implemented several server-controlled federated learning workflows (fed-average, cyclic controller, cross-site evaluation) with the server-side controllers.
+In these workflows, FL clients get tasks assigned by the controller, execute the tasks, and submit results back to the server.
+
+In certain cases where the server cannot be trusted, it should not be involved in communicating sensitive information.
+To address this concern, NVFlare introduces Client Controlled Workflows (CCWF) to facilitate peer-to-peer communication among clients.
+
+
+Controllers can be configured in ``config_fed_server.json`` in the workflows section:
+
+.. 
code-block:: json + + workflows = [ + { + id = "fedavg_ctl", + name = "FedAvg", + args { + min_clients = 2, + num_rounds = 3, + persistor_id = "persistor" + } + } + ] + +To configure controllers using the JobAPI, define the controller and send it to the server. +This code will automatically generate the server configuration for the controller: + +.. code-block:: python + + controller = FedAvg( + num_clients=2, + num_rounds=3, + persistor_id = "persistor" + ) + job.to(controller, "server") + +Please refer to the following sections for more details about the different types of controllers. .. toctree:: :maxdepth: 3 + controllers/model_controller controllers/controllers controllers/client_controlled_workflows diff --git a/docs/publications_and_talks.rst b/docs/publications_and_talks.rst index 63bd2ae260..422512da72 100644 --- a/docs/publications_and_talks.rst +++ b/docs/publications_and_talks.rst @@ -7,14 +7,21 @@ Publications Non-exhaustive list of papers and publications related to NVIDIA FLARE, including papers using NVIDIA FLARE's predecessor libraries included in the `Clara Train SDK `__. +Publications: 2024 +------------------ +* **2024-02** `Empowering Federated Learning for Massive Models with NVIDIA FLARE `__ (Accepted to `FL@FM-TheWebConf'24 `__)) + Publications: 2023 ------------------ +* **2023-10** `Communication-Efficient Vertical Federated Learning with Limited Overlapping Samples `__ (`ICCV 2023 `__) +* **2023-10** `FedBPT: Efficient Federated Black-box Prompt Tuning for Large Language Models `__ (preprint) * **2023-10** `ConDistFL: Conditional Distillation for Federated Learning from Partially Annotated Data `__ (`DeCaF @ MICCAI 2023 `__) * **2023-06** `Fair Federated Medical Image Segmentation via Client Contribution Estimation `__ (`CVPR 2023 `__) * **2023-03** `FLARE: Federated Learning from Simulation to Real-World `__ (`IEEE Data Eng. Bull. March 2023, Vol. 46, No. 1, `__) Publications: 2022 ------------------ +* **2022-11** `Federated Learning with Azure Machine Learning `__ (Video) * **2022-10** `Auto-FedRL: Federated Hyperparameter Optimization for Multi-institutional Medical Image Segmentation `__ (`ECCV 2022 `__) * **2022-10** `Joint Multi Organ and Tumor Segmentation from Partial Labels Using Federated Learning `__ (`DeCaF @ MICCAI 2022 `__) * **2022-10** `Split-U-Net: Preventing Data Leakage in Split Learning for Collaborative Multi-modal Brain Tumor Segmentation `__ (`DeCaF @ MICCAI 2022 `__) @@ -43,29 +50,55 @@ Blogs & Videos ============== NVIDIA FLARE related blogs and other media. +Blogs & Videos: 2024 +-------------------- +* **2024-03** `Turning Machine Learning to Federated Learning in Minutes with NVIDIA FLARE 2.4 `__ (NVIDIA Technical Blog) +* **2024-02** `Scalable Federated Learning with NVIDIA FLARE for Enhanced LLM Performance `__ (NVIDIA Technical Blog) + +Blogs & Videos: 2023 +-------------------- +* **2023-09** `Preventing Health Data Leaks with Federated Learning Using NVIDIA FLARE `__ (Roche Technical Blog) +* **2023-07** `Adapting LLMs to Downstream Tasks Using Federated Learning on Distributed Datasets `__ (NVIDIA Technical Blog) +* **2023-06** `Boost Your AI Workflows with Federated Learning Enabled by NVIDIA FLARE `__ (NVIDIA Technical Blog) +* **2023-06** `Applying Federated Learning to Traditional Machine Learning Methods `__ (NVIDIA Technical Blog) +* **2023-02** `AI/ML for Business Executives `__ (Medium Blog) +* **2023-01** `FATE, Flower, PySyft & Co. 
— Federated Learning Frameworks in Python `__ (Medium Blog) + Blogs & Videos: 2022 -------------------- + * **2022-10** `Federated Learning from Simulation to Production with NVIDIA FLARE `__ (NVIDIA Technical Blog) +* **2022-08** `Using Federated Learning to Bridge Data Silos in Financial Services `__ (NVIDIA Technical Blog) * **2022-06** `Experimenting with Novel Distributed Applications Using NVIDIA Flare 2.1 `__ (NVIDIA Technical Blog) +* **2022-03** `Flywheel & NVIDIA FLARE Demo `__ (`Flywheel `__ Video) Blogs & Videos: 2021 -------------------- * **2021-11** `Creating Robust and Generalizable AI Models with NVIDIA FLARE `__ (NVIDIA Technical Blog) -* **2021-09** `Federated Learning for Medical AI and Triaging COVID-19 Patients `__ (NVIDIA video) +* **2021-11** `Federated Learning for Healthcare AI: NVIDIA and Rhino Health Accelerate Research Collaborations `__ (`Rhino Health `__ Video) +* **2021-11** `Federated Learning With FLARE: NVIDIA Brings Collaborative AI to Healthcare and Beyond `__ (NVIDIA Blog) +* **2021-09** `Federated Learning for Medical AI and Triaging COVID-19 Patients `__ (NVIDIA Video) +* **2021-09** `Federated Learning for Healthcare Using NVIDIA Clara `__ (NVIDIA White Paper) * **2021-06** `Federated Learning with Homomorphic Encryption `__ (NVIDIA Technical Blog) +* **2021-05** `Applying a MLOps approach to Federated learning using ML Flow with NV Flare: A Healthcare use case `__ (Medium Blog) Blogs & Videos: 2019 -------------------- * **2019-12** `Federated Learning powered by NVIDIA Clara `__ (NVIDIA Technical Blog) * **2019-10** `What is federated learning - in Chinese `__ (NVIDIA Technical Blog) -* **2019-10** `NVIDIA Research: First Privacy-Preserving Federated Learning System for Medical Imaging `__ (NVIDIA video) +* **2019-10** `NVIDIA Research: First Privacy-Preserving Federated Learning System for Medical Imaging `__ (NVIDIA Video) Talks ===== Recent talks and Webinars covering federated learning research and NVIDIA FLARE. +Talks: 2024 +----------- +* **2024-03** `Empowering Federated Learning for Massive Models with NVIDIA FLARE `__ (`SFBigAnalytics Meetup `__) + Talks: 2023 ----------- +* **2023-07** `Federated Learning `__ (`MONAI MIDL Meetup 2023 `__) * **2023-01** `MONAI Federated Learning APIs (and their use with NVIDIA FLARE) `__ (`MONAI Bootcamp 2023 `__) Talks: 2022 diff --git a/docs/quickstart.rst b/docs/quickstart.rst index a9a7b819bb..f7bf6b1892 100644 --- a/docs/quickstart.rst +++ b/docs/quickstart.rst @@ -2,4 +2,422 @@ Quickstart ########## -See :ref:`quickstart`. +.. _installation: + +Installation +============= + +.. note:: + The server and client versions of nvflare must match, we do not support cross-version compatibility. + +Supported Operating Systems +--------------------------- +- Linux +- OSX (Note: some optional dependencies are not compatible, such as tenseal and openmined.psi) + +Python Version +-------------- + +NVIDIA FLARE requires Python 3.8+. + +Install NVIDIA FLARE in a virtual environment +--------------------------------------------- + +It is highly recommended to install NVIDIA FLARE in a virtual environment if you are not using :ref:`containerized_deployment`. +This guide briefly describes how to create a virtual environment with venv. + +Virtual Environments and Packages +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Python's official document explains the main idea about virtual environments. +The module used to create and manage virtual environments is called `venv `_. +You can find more information there. 
We only describe a few necessary steps for a virtual environment for NVIDIA FLARE. + + +Depending on your OS and the Python distribution, you may need to install the Python's venv package separately. For example, in Ubuntu +20.04, you need to run the following commands to continue creating a virtual environment with venv. + +.. code-block:: shell + + $ sudo apt update + $ sudo apt-get install python3-venv + + +Once venv is installed, you can use it to create a virtual environment with: + +.. code-block:: shell + + $ python3 -m venv nvflare-env + +This will create the ``nvflare-env`` directory in current working directory if it doesn't exist, +and also create directories inside it containing a copy of the Python interpreter, +the standard library, and various supporting files. + + +Activate the virtualenv by running the following command: + +.. code-block:: shell + + $ source nvflare-env/bin/activate + + +You may find that the pip and setuptools versions in the venv need updating: + +.. code-block:: shell + + (nvflare-env) $ python3 -m pip install -U pip + (nvflare-env) $ python3 -m pip install -U setuptools + +Install Stable Release of NVFlare +--------------------------------- + +Stable releases are available on `NVIDIA FLARE PyPI `_: + +.. code-block:: shell + + $ python3 -m pip install nvflare + +.. note:: + + In addition to the dependencies included when installing nvflare, many of our example applications have additional packages that must be installed. + Make sure to install from any requirement.txt files before running the examples. If you already have a specific version of nvflare installed in your + environment, you may want to remove nvflare in the requirements to avoid reinstalling nvflare. + See :github_nvflare_link:`nvflare/app_opt ` for modules and components with optional dependencies. + +Cloning the NVFlare Repository and Checking Out a Branch +--------------------------------------------------------- + +Clone NVFlare repo to get examples, and switch to either the main branch or the latest stable branch: + +.. code-block:: shell + + $ git clone https://github.com/NVIDIA/NVFlare.git + $ cd NVFlare + $ git switch 2.5 + +Note on branches: + +* The `main `_ branch is the default (unstable) development branch + +* The 2.1, 2.2, 2.3, 2.4, 2.5, etc. branches are the branches for each major release and there are tags based on these with a third digit for minor patches + +Install NVFlare from source +---------------------------- + +Navigate to the NVFlare repository and use pip install with development mode (can be useful to access latest nightly features or test custom builds for example): + +.. code-block:: shell + + $ git clone https://github.com/NVIDIA/NVFlare.git + $ cd NVFlare + $ pip install -e . + + +.. _containerized_deployment: + +Containerized Deployment with Docker +==================================== + +Running NVIDIA FLARE in a Docker container is sometimes a convenient way to ensure a +uniform OS and software environment across client and server systems. This can be used +as an alternative to the bare-metal Python virtual environment described above and will +use a similar installation to simplify transitioning between a bare metal and containerized +environment. + +To get started with a containerized deployment, you will first need to install a supported +container runtime and the NVIDIA Container Toolkit to enable support for GPUs. System requirements +and instructions for this can be found in the `NVIDIA Container Toolkit Install Guide `_. 
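+
+Once the toolkit is installed, you can optionally verify that GPUs are visible from inside a container before building the FLARE image (the image used here is only an example for this check):
+
+.. code-block:: shell
+
+  # should list the same GPUs as running nvidia-smi directly on the host
+  docker run --rm --gpus all ubuntu nvidia-smi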
+
+A simple Dockerfile is used to capture the base requirements and dependencies. In
+this case, we're building an environment that will support PyTorch-based workflows,
+in particular the :github_nvflare_link:`Hello PyTorch `
+example. The base for this build is the NGC PyTorch container. On this base image,
+we will install the necessary dependencies and clone the NVIDIA FLARE GitHub
+source code into the root workspace directory.
+
+Let's first create a folder called ``build`` and then create a file inside named ``Dockerfile``:
+
+.. code-block:: shell
+
+  mkdir build
+  cd build
+  touch Dockerfile
+
+Use any text editor to edit the Dockerfile and paste the following:
+
+.. literalinclude:: resources/Dockerfile
+    :language: dockerfile
+
+.. note::
+
+    For nvflare version 2.5 set PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:24.07-py3
+
+We can then build the new container by running docker build in the directory containing
+this Dockerfile, for example tagging it nvflare-pt:
+
+.. code-block:: shell
+
+  docker build -t nvflare-pt . -f Dockerfile
+
+This will result in a docker image, ``nvflare-pt:latest``. You can run this container with Docker,
+in this example mounting a local ``my-workspace`` directory into the container for use as a persistent
+workspace:
+
+.. code-block:: shell
+
+  mkdir my-workspace
+  docker run --rm -it --gpus all \
+    --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 \
+    -v $(pwd -P)/my-workspace:/workspace/my-workspace \
+    nvflare-pt:latest
+
+Once the container is running, you can also exec into the container, for example if you need another
+terminal to start additional FLARE clients. First find the ``CONTAINER ID`` using ``docker ps``, and then
+use that ID to exec into the container:
+
+.. code-block:: shell
+
+  docker ps  # use the CONTAINER ID in the output
+  docker exec -it <CONTAINER_ID> /bin/bash
+
+This container can be used to run the FL Simulator or any FL server or client. When using the
+FL Simulator (described in the next section), you can simply mount in any directories needed for
+your FLARE application code, and run the Simulator within the Docker container with
+all dependencies installed.
+
+For a notebook showcasing this example, see the :github_nvflare_link:`NVIDIA FLARE with Docker example `.
+
+Ways to Run NVFlare
+===================
+NVFlare can currently support running with the FL Simulator, POC mode, or Production mode.
+
+FL Simulator is lightweight and uses threads to simulate different clients.
+The code used for the simulator can be directly used in production mode.
+
+Starting in 2.5, NVFlare supports running the FL Simulator with the Job API. The :ref:`Job API ` allows
+you to build jobs programmatically and then export them or directly run them with the simulator.
+
+POC mode is a quick way to get set up to run locally on one machine. The FL server and each client
+run in separate processes or Docker containers.
+
+Production mode is secure with TLS certificates. Depending on the deployment choice, you can further choose:
+
+  - HA or non-HA
+  - Local or remote
+  - On-premise or on cloud (See :ref:`cloud_deployment`)
+
+Using non-HA, secure, local mode (all clients and server running on the same host), production mode is very similar to POC mode except it is secure.
+
+Which mode should I choose for running NVFLARE? (Note: the same jobs can be run in any of the modes, and the same project.yml deployment options can be run in both POC mode and production.)
+
+.. 
list-table:: NVIDIA FLARE Modes + :header-rows: 1 + + * - **Mode** + - **Documentation** + - **Description** + * - Simulator + - :ref:`fl_simulator` + - | The FL Simulator is a light weight simulation where the job run is automated on a + | single system. Useful for quickly running a job or experimenting with research + | or FL algorithms. + * - POC + - :ref:`poc_command` + - | POC mode establishes and connects distinct server and client "systems" which can + | then be orchestrated using the FLARE Console all from a single machine. Users can + | also experiment with various deployment options (project.yml), which can be used + | in production modes. + * - Production + - :ref:`provisioned_setup` + - | Real world production mode involves a distributed deployment with generated startup + | kits from the provisioning process. Provides provisioning tool, dashboard, and + | various deployment options. + +.. _starting_fl_simulator: + +The FL Simulator +========================= + +After installing the nvflare pip package, you have access to the NVFlare CLI including the FL Simulator. +The Simulator allows you to start a FLARE server and any number of connected clients on your local +workstation or laptop, and to quickly deploy an application for testing and debugging. + +Basic usage for the :ref:`FL Simulator ` is available with ``nvflare simulator -h``: + +.. code-block:: shell + + $ nvflare simulator -h + usage: nvflare simulator [-h] [-w WORKSPACE] [-n N_CLIENTS] [-c CLIENTS] [-t THREADS] [-gpu GPU] [-m MAX_CLIENTS] job_folder + + positional arguments: + job_folder + + optional arguments: + -h, --help show this help message and exit + -w WORKSPACE, --workspace WORKSPACE + WORKSPACE folder + -n N_CLIENTS, --n_clients N_CLIENTS + number of clients + -c CLIENTS, --clients CLIENTS + client names list + -t THREADS, --threads THREADS + number of parallel running clients + -gpu GPU, --gpu GPU list of GPU Device Ids, comma separated + -m MAX_CLIENTS, --max_clients MAX_CLIENTS + max number of clients + + +Before we get into the Simulator, we'll walk through a few additional setup steps in the next section required +to run an example application. + + +Running an example application +================================ + +Any of the :ref:`example_applications` can be used with the FL Simulator. We'll demonstrate the steps here +using the hello-pt example. + +First, we need to clone the NVFlare repo to get the source code for the examples: + +.. code-block:: shell + + $ git clone https://github.com/NVIDIA/NVFlare.git + + +Please make sure to switch to the correct branch that matches the NVFlare library version you installed. + +.. code-block:: shell + + $ git switch [nvflare version] + + +We can then copy the necessary files (the exercise code in the examples directory of the NVFlare repository) +to a working directory: + +.. code-block:: shell + + mkdir simulator-example + cp -rf NVFlare/examples/hello-world/hello-pt simulator-example/ + +The hello-pt application requires a few dependencies to be installed. As in the installation section, +we can install these in the Python virtual environment by running: + +.. code-block:: shell + + source nvflare-env/bin/activate + python3 -m pip install -r simulator-example/hello-pt/requirements.txt + +If using the Dockerfile above to run in a container, these dependencies have already been installed. + +Next, we can directly run the ``fedavg_script_runner_pt.py`` script which is configured to build a job +with the Job API and then run it with the FL Simulator. + +.. 
code-block:: shell + + cd simulator-example/hello-pt + python3 fedavg_script_runner_pt.py + +Now you will see output streaming from the server and client processes as they execute the federated +application. Once the run completes, your workspace directory (by default ``/tmp/nvflare/jobs/workdir``), +will contain the input application configuration +and codes, logs of the output, site and global models, cross-site validation results. + +.. code-block:: shell + + $ tree -L 4 /tmp/nvflare/jobs/workdir + /tmp/nvflare/jobs/workdir + ├── server + │ ├── local + │ │ └── log.config + │ ├── log.txt + │ ├── pool_stats + │ │ └── simulator_cell_stats.json + │ ├── simulate_job + │ │ ├── app_server + │ │ │ ├── FL_global_model.pt + │ │ │ ├── config + │ │ │ └── custom + │ │ ├── cross_site_val + │ │ │ └── cross_val_results.json + │ │ ├── meta.json + │ │ └── tb_events + │ │ ├── site-1 + │ │ └── site-2 + │ └── startup + ├── site-1 + │ ├── cifar_net.pth + │ ├── local + │ │ └── log.config + │ ├── log.txt + │ ├── simulate_job + │ │ ├── app_site-1 + │ │ │ ├── config + │ │ │ └── custom + │ │ └── meta.json + │ └── startup + ├── site-2 + │ ├── cifar_net.pth + │ ├── local + │ │ └── log.config + │ ├── log.txt + │ ├── simulate_job + │ │ ├── app_site-2 + │ │ │ ├── config + │ │ │ └── custom + │ │ └── meta.json + │ └── startup + └── startup + + +Now that we've explored an example application with the FL Simulator, we can look at what it takes to bring +this type of application to a secure, distributed deployment in the :ref:`Real World Federated Learning ` +section. + + +.. _setting_up_poc: + +Setting Up the Application Environment in POC Mode +================================================== + +To get started with a proof of concept (POC) setup after :ref:`installation`, run this command to generate a poc folder +with an overseer, server, two clients, and one admin client: + +.. code-block:: shell + + $ nvflare poc prepare -n 2 + +For more details, see :ref:`poc_command`. + +.. _starting_poc: + +Starting the Application Environment in POC Mode +================================================ + +Once you are ready to start the FL system, you can run the following command +to start the server and client systems and an admin console: + +.. code-block:: + + nvflare poc start + +To start the server and client systems without an admin console: + +.. code-block:: + + nvflare poc start -ex admin@nvidia.com + +We can use the :ref:`job_cli` to easily submit a job to the POC system. (Note: We can run the same jobs we ran with the simulator in POC mode. If using the :ref:`fed_job_api`, simply export the job configuration with ``job.export_job()``.) + +.. code-block:: + + nvflare job submit -j NVFlare/examples/hello-world/hello-numpy-sag/jobs/hello-numpy-sag + +.. code-block:: + + nvflare poc stop + +.. code-block:: + + nvflare poc clean + +For more details, see :ref:`poc_command`. diff --git a/docs/real_world_fl.rst b/docs/real_world_fl.rst index d9867adb19..c2b0b78e62 100644 --- a/docs/real_world_fl.rst +++ b/docs/real_world_fl.rst @@ -30,5 +30,6 @@ to see the capabilities of the system and how it can be operated. 
real_world_fl/job real_world_fl/workspace real_world_fl/cloud_deployment + real_world_fl/kubernetes real_world_fl/notes_on_large_models user_guide/security/identity_security diff --git a/docs/real_world_fl/cloud_deployment.rst b/docs/real_world_fl/cloud_deployment.rst index 0a7cca8e7a..3d438628e4 100644 --- a/docs/real_world_fl/cloud_deployment.rst +++ b/docs/real_world_fl/cloud_deployment.rst @@ -44,11 +44,11 @@ To run NVFlare dashboard on Azure, run: .. note:: - The script also requires sshpass and jq. Both can be installed on Ubuntu, with: + The script also requires sshpass, dig and jq. All can be installed on Ubuntu, with: .. code-block:: shell - sudo apt install sshpass jq + sudo apt install sshpass bind9-dnsutils jq Users only need to enter an email address and press Enter. This user needs to remember this email and the temporary password that will be provided, as this is the login credentials for the NVFLARE Dashboard once the Dashboard is up and running. @@ -101,11 +101,11 @@ To run NVFlare dashboard on AWS, run: .. note:: - The script also requires sshpass and jq. They can be installed on Ubuntu, with: + The script also requires sshpass, dig and jq. They can be installed on Ubuntu, with: .. code-block:: shell - sudo apt install sshpass jq + sudo apt install sshpass bind9-dnsutils jq AWS manages authentications via AWS access_key and access_secret, you will need to have these credentials before you can start creating AWS infrastructure. @@ -128,9 +128,10 @@ You can accept all default values by pressing ENTER. .. code-block:: none - This script requires az (Azure CLI), sshpass and jq. Now checking if they are installed. + This script requires az (Azure CLI), sshpass dig and jq. Now checking if they are installed. Checking if az exists. => found Checking if sshpass exists. => found + Checking if dig exists. => found Checking if jq exists. => found Cloud VM image, press ENTER to accept default Canonical:0001-com-ubuntu-server-focal:20_04-lts-gen2:latest: Cloud VM size, press ENTER to accept default Standard_B2ms: @@ -190,9 +191,10 @@ You can accept all default values by pressing ENTER. .. code-block:: - This script requires aws (AWS CLI), sshpass and jq. Now checking if they are installed. + This script requires aws (AWS CLI), sshpass, dig and jq. Now checking if they are installed. Checking if aws exists. => found Checking if sshpass exists. => found + Checking if dig exists. => found Checking if jq exists. => found If the server requires additional dependencies, please copy the requirements.txt to /home/nvflare/workspace/aws/nvflareserver/startup. Press ENTER when it's done or no additional dependencies. @@ -213,10 +215,15 @@ The configuration file provided is formatted as follows: .. code-block:: shell - AMI_IMAGE=ami-04bad3c587fe60d89 + AMI_IMAGE=ami-03c983f9003cb9cd1 EC2_TYPE=t2.small REGION=us-west-2 +.. note:: + + For the AWS AMIs, we recommend the following images for each version of Ubuntu: + 20.04:ami-04bad3c587fe60d89, 22.04:ami-03c983f9003cb9cd1, 24.04:ami-0406d1fdd021121cd + Deploy FL Client in the Cloud ============================= As an organization admin for an FL project, you are responsible for setting up your FL Client system. You will receive a Client startup kit either from email, sftp @@ -267,7 +274,7 @@ eg. ``--config my_config.txt``. The configuration file is formatted as follows: .. 
code-block:: shell - AMI_IMAGE=ami-04bad3c587fe60d89 + AMI_IMAGE=ami-03c983f9003cb9cd1 EC2_TYPE=t2.small REGION=us-west-2 diff --git a/docs/real_world_fl/flare_api.rst b/docs/real_world_fl/flare_api.rst index f40e3276c6..8165724061 100644 --- a/docs/real_world_fl/flare_api.rst +++ b/docs/real_world_fl/flare_api.rst @@ -42,7 +42,7 @@ Like with FLAdminAPI previously, :class:`AdminAPI:/workspace/nvfl/ + +And the same for site-1, site-2, admin@nvidia.com. + +This will make the entire startup kits available at the nvflare-pv-claim of the cluster so that NVIDIA FLARE system +can mount that nvflare-pv-claim and access the startup kits. + +After copying those folders to nvflare-pv-claim, you can shutdown the helper pod. The nvflare-pv-claim and its contents will stay and is +available to server/client/admin pods. + +Start Server Pod +================ + +The NVIDIA FLARE server consists of two portions for Kubernetes clusters. As you might know, +the server needs computation to handle model updates, aggregations and other operations. It also needs to provide a service for clients and admins +to connect. Therefore, the followings are two separate yaml files that work together to create the NVIDIA FLARE server in EKS. + +.. code-block:: yaml + + apiVersion: apps/v1 + kind: Deployment + metadata: + labels: + run: nvflare + name: nvflare + spec: + replicas: 1 + selector: + matchLabels: + run: nvflare + template: + metadata: + labels: + run: nvflare + spec: + containers: + - args: + - -u + - -m + - nvflare.private.fed.app.server.server_train + - -m + - /workspace/nvfl/server + - -s + - fed_server.json + - --set + - secure_train=true + - config_folder=config + - org=nvidia + command: + - /usr/local/bin/python3 + image: nvflare/nvflare:2.4.0 + imagePullPolicy: Always + name: nvflare + volumeMounts: + - name: nvfl + mountPath: /workspace/nvfl/ + volumes: + - name: nvfl + persistentVolumeClaim: + claimName: nvflare-pv-claim + + +.. code-block:: yaml + + apiVersion: v1 + kind: Service + metadata: + labels: + run: server + name: server + spec: + ports: + - port: 8002 + protocol: TCP + targetPort: 8002 + name: flport + - port: 8003 + protocol: TCP + targetPort: 8003 + name: adminport + selector: + run: nvflare + + +Note that the pod will use nvflare/nvflare:2.4.0 container image from dockerhub.com. This image only includes the necessary dependencies to start +NVIDIA FLARE system. If you require additional dependencies, such as Torch or MONAI, you will need to build and publish your own image and update +the yaml file accordingly. + +Start Client Pods +================= + +For the client pods, we only need one yaml file for eacch client. The following is the deployment yaml file for site-1. + +.. 
code-block:: yaml + + apiVersion: apps/v1 + kind: Deployment + metadata: + labels: + run: site1 + name: site1 + spec: + replicas: 1 + selector: + matchLabels: + run: site1 + template: + metadata: + labels: + run: site1 + spec: + containers: + - args: + - -u + - -m + - nvflare.private.fed.app.client.client_train + - -m + - /workspace/nvfl/site-1 + - -s + - fed_client.json + - --set + - secure_train=true + - uid=site-1 + - config_folder=config + - org=nvidia + command: + - /usr/local/bin/python3 + image: nvflare/nvflare:2.4.0 + imagePullPolicy: Always + name: site1 + volumeMounts: + - name: nvfl + mountPath: /workspace/nvfl/ + volumes: + - name: nvfl + persistentVolumeClaim: + claimName: nvflare-pv-claim + +Once the client is up and running, you can check the server log with ``kubectl logs`` and the log should show the clients registered. + +Start and Connect to Admin Pods +=============================== + +We can also run the admin console inside the EKS cluster to submit jobs to the NVIDIA FLARE running in the EKS cluster. Start the admin pod +with the following yaml file. + +.. code-block:: yaml + + apiVersion: apps/v1 + kind: Deployment + metadata: + labels: + run: admin + name: admin + spec: + replicas: 1 + selector: + matchLabels: + run: admin + template: + metadata: + labels: + run: admin + spec: + containers: + - args: + - "50000" + command: + - /usr/bin/sleep + image: nvflare/nvflare:2.4.0 + imagePullPolicy: Always + name: admin + volumeMounts: + - name: nvfl + mountPath: /workspace/nvfl/ + volumes: + - name: nvfl + persistentVolumeClaim: + claimName: nvflare-pv-claim + +Once the admin pod is running, you can enter the pod with ``kubectl exec`` , cd to ``/workspace/nvfl/admin@nvidia.com/startup`` and run ``fl_admin.sh``. + + +Note that you need to copy the job from your local machine to the EKS cluster so that the ``transfer`` directory of admin@nvidia.com contains the jobs +you would like to run in that EKS cluster. + diff --git a/docs/real_world_fl/migrating_to_flare_api.rst b/docs/real_world_fl/migrating_to_flare_api.rst index 755cc3cc54..97ece87210 100644 --- a/docs/real_world_fl/migrating_to_flare_api.rst +++ b/docs/real_world_fl/migrating_to_flare_api.rst @@ -91,19 +91,40 @@ This section has a summary of the commands then goes through each command and sh and the new way with FLARE API. .. 
csv-table:: - :header: Command for FLAdminAPI,Command for FLARE API,Differences + :header: FLAdminAPI,FLARE API,Version Added,Notes :widths: 15, 15, 30, 30 - check_status(),get_system_info(),Simplified and reformatted output, see below for details - submit_job(),submit_job(),Simplified output, see below for details - list_job(),list_job(),Simplified output, see below for details - wait_until_server_status(),monitor_job(),Changed the arg names and function, see below for details - download_job(),download_job_result(),Simplified output, see below for details - clone_job(),clone_job(),Simplified output, see below for details - abort_job(),abort_job(),Simplified output, see below for details - delete_job(),delete_job(),Simplified output, see below for details - All other commands,api.do_command(),The underlying AdminAPI's do_command() can be used for all other previous commands - + check_status,get_system_info,2.3.0,Simplified and reformatted output (see below for details) + submit_job,submit_job,2.3.0,Simplified output (see below for details) + list_job,list_job,2.3.0,Simplified output (see below for details) + wait_until_server_status,monitor_job,2.3.0,Changed the arg names and function (see below for details) + download_job,download_job_result,2.3.0,Simplified output (see below for details) + clone_job,clone_job,2.3.0,Simplified output (see below for details) + abort_job,abort_job,2.3.0,Simplified output (see below for details) + delete_job,delete_job,2.3.0,Simplified output (see below for details) + check_status,get_client_job_status,2.4.0,only for client + restart,restart,2.4.0, + shutdown,shutdown,2.4.0, + set_timeout,set_timeout,2.4.0,changed to session-based + list_sp,list_sp,2.4.0, + get_active_sp,get_active_sp,2.4.0, + promote_sp,promote_sp,2.4.0, + get_available_apps_to_upload,get_available_apps_to_upload,2.4.0, + shutdown_system,shutdown_system,2.4.0, + ls_target,ls_target,2.4.0, + cat_target,cat_target,2.4.0, + ,tail_target,2.4.0,added for consistency + tail_target_log,tail_target_log,2.4.0, + ,head_target,2.4.0,new + ,head_target_log,2.4.0,new + grep_target,grep_target,2.4.0, + get_working_directory,get_working_directory,2.4.0, + show_stats,show_stats,2.4.0,return structure changed + show_errors,show_errors,2.4.0,return structure changed + reset_errors,reset_errors,2.4.0, + get_connected_client_list,get_connected_client_list,2.4.0, + abort,,2.4.0,obsolete + remove_client,,2.4.0,not exposed Get System Info from Check Status ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -182,7 +203,7 @@ path to the job to submit as a string. For ``submit_job()`` with FLAdminAPI: .. code-block:: python - path_to_example_job = "/workspace/NVFlare/examples/hello-numpy-sag" + path_to_example_job = "/workspace/NVFlare/examples/hello-world/hello-numpy-sag" runner.api.submit_job(path_to_example_job) .. code-block:: bash @@ -205,7 +226,7 @@ value to use later. .. code-block:: python - path_to_example_job = "/workspace/NVFlare/examples/hello-numpy-sag" + path_to_example_job = "/workspace/NVFlare/examples/hello-world/hello-numpy-sag" job_id = sess.submit_job(path_to_example_job) print(job_id + " was submitted") @@ -421,14 +442,7 @@ With the FLARE API, ``delete_job()``: sess.delete_job(job_id) - -.. _migrating_all_other_fladminapi_commands_to_flare_api: - Migrating All Other FLAdminAPI Commands to FLARE API ---------------------------------------------------- -For all other commands, there are not yet specific commands in the FLARE API. 
With the underlying api, however, you -can submit any previous command that could be executed from the Admin Console with ``do_command()``: - -.. code-block:: python - - sess.api.do_command(COMMAND_AS_STRING) +The remaining FLAdminAPI commands have been added to the FLARE API in 2.4.0. +For more details, see the notes in the table above, and the :mod:`FLARE API` definitions. diff --git a/docs/real_world_fl/operation.rst b/docs/real_world_fl/operation.rst index 99ebb64308..23dde1eb5c 100644 --- a/docs/real_world_fl/operation.rst +++ b/docs/real_world_fl/operation.rst @@ -1,12 +1,12 @@ .. _operating_nvflare: ###################################################### -Operating NVFLARE - Admin Client, Commands, FLAdminAPI +Operating NVFLARE - Admin Client, Commands, FLARE API ###################################################### The FL system is operated by the packages of type admin configured at provisioning. The admin packages contain key and certificate files to connect and authenticate with the server, and the administration can be done through an included -command prompt with the Admin Console by running ``fl_admin.sh`` or programmatically through the FLAdminAPI or new :ref:`flare_api`. +command prompt with the Admin Console by running ``fl_admin.sh`` or programmatically through the :ref:`flare_api`. Admin command prompt ==================== @@ -34,7 +34,7 @@ commands shown as examples of how they may be run with a description. clone_job,``clone_job job_id``,Creates a copy of the specified job with a new job_id abort,``abort job_id client``,Aborts the job for the specified job_id for all clients. Individual client jobs can be aborted by specifying *clientname*. ,``abort job_id server``,Aborts the server job for the specified job_id. - download_job,``download_job job_id``,Download folder from the job store containing the job and workspace + download_job,``download_job job_id``,Download folder from the job store containing the job and workspace. Please note that for larger jobs there may be extra delay for workspace creation in the job store (If you try to download the job before that you may not be able to get the workspace data) delete_job,``delete_job job_id``,Delete the job from the job store cat,``cat server startup/fed_server.json -ns``,Show content of a file (-n: number all output lines; -s: suppress repeated empty output lines) ,``cat clientname startup/docker.sh -bT``,Show content of a file (-b: number nonempty output lines; -T: display TAB characters as ^I) @@ -73,12 +73,12 @@ commands shown as examples of how they may be run with a description. whitespace before the filename. For example, you may run ``sys_info server >serverinfo.txt``. To only save the file output without printing it, use two greater-than symbols ">>" instead: ``sys_info server >>serverinfo.txt``. -The FLARE API is the redesigned FLAdminAPI with a better user experience introduced in version 2.3.0. If you do not have -existing code using the FLAdminAPI, it is recommended to just use the FLARE API. +The FLARE API is the redesigned FLAdminAPI with a better user experience introduced in version 2.3.0. +We recommend using the FLARE API as the FLAdminAPI is now deprecated and will soon sunset. .. 
toctree:: :maxdepth: 1 - FLAdminAPI flare_api + FLAdminAPI migrating_to_flare_api diff --git a/docs/real_world_fl/overview.rst b/docs/real_world_fl/overview.rst index 8f0572c1ec..637489b58b 100644 --- a/docs/real_world_fl/overview.rst +++ b/docs/real_world_fl/overview.rst @@ -68,7 +68,7 @@ For advanced users, you can customize your provision with additional behavior th - **Zip**: To create password protected zip archives for the startup kits, see :ref:`distribution_builder` - **Docker-compose**: Provision to launch NVIDIA FLARE system via docker containers. You can customize the provisioning process and ask the provisioner to generate a docker-compose file. This can be found in :ref:`docker_compose`. - **Docker**: Provision to launch NVIDIA FLARE system via docker containers. If you just want to use docker files, see :ref:`containerized_deployment`. - - **Helm**: To change the provisioning tool to generate an NVIDIA FLARE Helm chart for Kubernetes deployment, see :ref:` helm_chart`. + - **Helm**: To change the provisioning tool to generate an NVIDIA FLARE Helm chart for Kubernetes deployment, see :ref:`helm_chart`. - **CUSTOM**: you can build custom builders specific to your needs like in :ref:`distribution_builder`. Package distribution @@ -159,6 +159,12 @@ See how to deploy to Azure and AWS clouds can be found in :ref:`cloud_deployment Deploy to Google Cloud will be made available in a future release. +Kubernetes Deployment +===================== +As mentioned above, you can run NVIDIA FLARE in the public cloud. If you prefer to deploy NVIDIA FLARE in Amazon Elastic Kubernetes Service (EKS), +you can find the deployment guide in :ref:`eks_deployment`. + + Starting Federated Learning Servers ============================================= The FL Server will coordinate the federated learning training and be the main hub all clients and admin diff --git a/docs/release_notes/flare_220.rst b/docs/release_notes/flare_220.rst index 4450da646f..de685629b4 100644 --- a/docs/release_notes/flare_220.rst +++ b/docs/release_notes/flare_220.rst @@ -19,7 +19,7 @@ To accomplish these goals, a set of key new tools and features were developed, i - :ref:`preflight_check` - Site-policy management - Federated XGboost - - Federated Statistics + - Federated Statistics - MONAI Integration The sections below provide an overview of these features. For more detailed documentation and usage information, refer to the :ref:`User Guide ` and :ref:`Programming Guide `. diff --git a/docs/release_notes/flare_230.rst b/docs/release_notes/flare_230.rst index 92ae350f88..6721de4b86 100644 --- a/docs/release_notes/flare_230.rst +++ b/docs/release_notes/flare_230.rst @@ -41,7 +41,7 @@ Prior to FLARE 2.3.0, model initialization was performed on the server-side. The model was either initialized from a model file or custom model initiation code. Pre-defining a model file required extra steps of pre-generating and saving the model file and then sending it over to the server. Running custom model initialization code on server could be a security risk. -FLARE 2.3.0 introuduces another way to initialize the model on the client side. The FL Server can select +FLARE 2.3.0 introduces another way to initialize the model on the client side. The FL Server can select the initial model based on a user-chosen strategy. Here is an example using client-side model initialization: https://github.com/NVIDIA/NVFlare/tree/main/examples/hello-world/hello-pt. You can read more about this feature in :ref:`initialize_global_weights_workflow`. 
@@ -67,7 +67,7 @@ Federated Private Set Intersection (PSI) In order to support vertical learning use cases such as secure user-id matching and feature over-lapping discovery, we have developed a multi-party private set intersection (PSI) operator that allows for the secure discovery of data intersections. Our approach leverages OpenMined's two-party -`Private Set Intersection Cardinality protocol `_, which is basedon ECDH and Bloom Filters, and we have +`Private Set Intersection Cardinality protocol `_, which is based on ECDH and Bloom Filters, and we have made this protocol available for multi-party use. More information on our approach and how to use the PSI operator can be found in the :github_nvflare_link:`PSI Example `. @@ -111,7 +111,7 @@ in which an online RL agent can dynamically adjust the hyperparameters of each c Quantifying Data Leakage in Federated Learning ---------------------------------------------- -This research :github_nvflare_link:`example `_ contains the tools necessary to recreate the chest X-ray experiments described in +This research :github_nvflare_link:`example ` contains the tools necessary to recreate the chest X-ray experiments described in `Do Gradient Inversion Attacks Make Federated Learning Unsafe? `_, accepted to IEEE Transactions on Medical Imaging. It presents new ways to measure and visualize potential data leakage in FL using a new FLARE filter that can quantify the data leakage for each client and visualize it as a function of the FL training rounds. diff --git a/docs/release_notes/flare_240.rst b/docs/release_notes/flare_240.rst index 8d07cc58fd..4741b386f0 100644 --- a/docs/release_notes/flare_240.rst +++ b/docs/release_notes/flare_240.rst @@ -1,6 +1,6 @@ -########################## +************************** What's New in FLARE v2.4.0 -########################## +************************** Usability Improvements ====================== @@ -23,7 +23,7 @@ Here is a brief example of a common pattern when using the Client API for a clie # initialize NVFlare client API flare.init() - # run continously when launching once + # run continuously when launching once while flare.is_running(): # receive FLModel from NVFlare @@ -52,7 +52,8 @@ The 3rd-Party Integration Pattern In certain scenarios, users face challenges when attempting to moving the training logic to the FLARE client side due to pre-existing ML/DL training system infrastructure. In the 2.4.0 release, we introduce the Third-Party Integration Pattern, which allows the FLARE system and a third-party external training system to seamlessly exchange model parameters without requiring a tightly integrated system. -See the documentation (coming soon) for more details. +See the :ref:`3rd_party_integration` documentation for more details. + Job Templates and CLI --------------------- @@ -62,16 +63,16 @@ Furthermore, the Job CLI also offers users a convenient method for submitting jo ``nvflare job list_templates|create|submit|show_variables`` -Also explore the continously growing :github_nvflare_link:`Job Template directory ` we have created for commonly used configurations. +Also explore the continuously growing :github_nvflare_link:`Job Template directory ` we have created for commonly used configurations. For more in-depth information on Job Templates and the Job CLI, refer to the :ref:`job_cli` documentation and :github_nvflare_link:`tutorials `. ModelLearner ------------ -The ModelLearner is introduced for a simplifed user experience in cases requiring a Learner-pattern. 
+The ModelLearner is introduced for a simplified user experience in cases requiring a Learner-pattern. Users exclusively interact with the FLModel object, which includes weights, optimizer, metrics, and metadata, while FLARE-specific concepts remain hidden to users. The ModelLearner defines standard learning functions, such as ``train()``, ``validate()``, and ``submit_model()`` that can be subclassed for easy adaptation. -See the API definitions of :github_nvflare_link:`ModelLearner ` and +See the :ref:`model_learner` documentation and API definitions of :github_nvflare_link:`ModelLearner ` and :github_nvflare_link:`FLModel ` for more detail. Step-by-Step Example Series @@ -82,25 +83,25 @@ Each example will build upon previous ones to showcase different features, workf **CIFAR10 Examples:** -- stats: federated statistics (histograms) of CIFAR10. +- image_stats: federated statistics (histograms) of CIFAR10. - sag: scatter and gather (SAG) workflow with PyTorch with Client API. -- sag_with_deploy_map: scatter and gather workflow with deploy_map configuration, for deployment of apps to different sites using the Client API. -- cse: cross-site evaluation using the Client API. +- sag_deploy_map: scatter and gather workflow with deploy_map configuration, for deployment of apps to different sites using the Client API. - sag_model_learner: scatter and gather workflow illustrating how to write client code using the ModelLearner. - sag_executor: scatter and gather workflow demonstrating show to write client-side executors. +- sag_mlflow: MLflow experiment tracking logs with the Client API in scatter & gather workflows. +- sag_he: homomorphic encryption using Client API and POC -he mode. +- cse: cross-site evaluation using the Client API. - cyclic: cyclic weight transfer workflow with server-side controller. - cyclic_ccwf: client-controlled cyclic weight transfer workflow with client-side controller. - swarm: swarm learning and client-side cross-site evaluation with Client API. -- sag_with_mlflow (coming soon): MLFlow experiment tracking logs with the Client API in scatter & gather workflows. -- sag_with_he (coming soon): scatter and gather workflow with Client API and Homomorphic Encryption (HE) -**HIGGS Examples (coming soon):** +**HIGGS Examples:** -- stats -- scikit_learn linear -- kmeans -- svm -- xgboost +- tabular_stats: federated statistics tabular histogram calculation. +- scikit_learn: federated linear model (logistic regression on binary classification) learning on tabular data. +- sklearn_svm: federated SVM model learning on tabular data. +- sklearn_kmeans: federated k-Means clustering on tabular data. +- xgboost: federated horizontal xgboost learning on tabular data with bagging collaboration. Streaming APIs ============== @@ -108,7 +109,7 @@ To support large language models (LLMs), the 2.4.0 release introduces the stream The addition of a new streaming layer designed to handle large objects allows us to divide the large model into 1M chunks and stream them to the target. We provide built-in streamers for Objects, Bytes, Files, and Blobs, providing a versatile solution for efficient object streaming between different endpoints. -See the :ref:`notes_on_large_models` documentation for more insights on working with large models in FLARE. +Refer to the :mod:`nvflare.fuel.f3.stream_cell` api for more details, and the :ref:`notes_on_large_models` documentation for insights on working with large models in FLARE. 
Expanding Federated Learning Workflows ====================================== @@ -132,13 +133,13 @@ Client-side controlled workflow Three commonly used types of client-side controlled workflows are provided: - :ref:`ccwf_cyclic_learning`: the model is passed from client to client. -- :ref:`ccwf_swarm_learning`: randomly select clients as client-side controller and aggregrators, where then Scatter and Gather with FedAvg is performed. +- :ref:`ccwf_swarm_learning`: randomly select clients as client-side controller and aggregators, where then Scatter and Gather with FedAvg is performed. - :ref:`ccwf_cross_site_evaluation`: allow clients to evaluate other sites' models. -See :github_nvflare_link:`swarm learning ` for examples using these client-controlled workflows. +See :github_nvflare_link:`swarm learning ` and :github_nvflare_link:`client-controlled cyclic ` for examples using these client-controlled workflows. -MLFlow and WandB Experiment Tracking Support -============================================ +MLFlow and Weights & Biases Experiment Tracking Support +======================================================= We expand our experiment tracking support with MLFLow and Weights & Biases systems. The detailed documentation on these features can be found in :ref:`experiment_tracking`, and examples can be found at FL Experiment Tracking with :github_nvflare_link:`MLFlow ` and @@ -167,7 +168,7 @@ Improved Job Configuration File Processing - OS Environment Variables - OS environment variables can be referenced via the dollar sign - Parameterized Variable Definition - for creating configuration templates that can be reused and resolved into different concrete configurations -See more details in the enhanced job config file processing documentation (coming soon) +See more details in the :ref:`configurations` documentation. POC Command Upgrade =================== @@ -201,13 +202,13 @@ FL HUB: Hierarchical Unification Bridge ======================================= The FL HUB is a new experimental feature designed to support multiple FLARE systems working together in a hierarchical manner. In Federated Computing, the number of edge devices is usually large with often just a single server, which can cause performance issues. -A solution to this problem is to use a hierachical FLARE system, where tiered FLARE systems connect together to form a tree-like structure. +A solution to this problem is to use a hierarchical FLARE system, where tiered FLARE systems connect together to form a tree-like structure. Each leaf of clients (edge devices) only connect to its server, where this server also serves as the client for the parent tier FLARE system. One potential use case is with global studies, where the client machine may be located across different regions. Rather than requiring every region's client machines connect to only a single FL server in that region, the FL HUB could enable a more performant tiered multi-server setup. -Learn more about the FL Hub in the :ref:`hierarchy_unification_bridge` documenation and the :github_nvflare_link:`code `. +Learn more about the FL Hub in the :ref:`hierarchy_unification_bridge` documentation and the :github_nvflare_link:`code `. Misc. Features ============== @@ -229,13 +230,13 @@ Misc. Features - Run Model Evaluation Without Training - In the 2.4.0 release, users can now run cross-validation without having to re-run the training. - - `Enable re-run cross-validation without training workflow (WIP) `_. 
+ - See the example for :github_nvflare_link:`run cross-site validation without training `. - Communication Enhancements - We added the application layer ping between Client Job process and Server parent process to replace the gRPC timeout. Previously, we noticed if the gRPC timeout is set too long, the cloud provider (eg. Azure Cloud) will kill the connection after 4 minutes. - If the timeout setup is too short (such as 2 mins), the underlying gRPC will report too many pings. + If the timeout setup is too short (such as 2 minutes), the underlying gRPC will report too many pings. The application level ping will avoid both issues to make sure the server/client is aware of the status of the processes. - FLARE provides two drivers for gRPC based communication- asyncio (AIO) and regular (non-AIO) versions of gRPC library. One notable benefit of the AIO gRPC is its ability to handle many more concurrent connections on the server side. @@ -256,16 +257,16 @@ We've added several examples to demonstrate how to work with federated LLM: - :github_nvflare_link:`Parameter Efficient Fine Turning ` utilizing NeMo's PEFT methods to adapt a LLM to a downstream task. - :github_nvflare_link:`Prompt-Tuning Example ` for using FLARE with NeMo for prompt learning. - :github_nvflare_link:`Supervised Fine Tuning (SFT) ` to fine-tune all parameters of a LLM on supervised data. +- :github_nvflare_link:`LLM Tuning via HuggingFace SFT Trainer ` for using FLARE with a HuggingFace trainer for LLM tuning tasks. Vertical Federated XGBoost -------------------------- -With the 2.0 release of :github_nvflare_link:`XGBoost `. +With the 2.0 release of `XGBoost `_, we are able to demonstrate the :github_nvflare_link:`vertical xgboost example `. We use Private Set Intersection and XGBoost's new federated learning support to perform classification on vertically split HIGGS data (where sites share overlapping data samples but contain different features). -GNN Examples ------------- -We added two examples using GraphSage to demonstrate how to train `Federated GNN on -Graph Dataset using Inductive Learning `_. +Graph Neural Networks (GNNs) +---------------------------- +We added two examples using GraphSage to demonstrate how to train :github_nvflare_link:`Federated GNN on Graph Dataset using Inductive Learning `. **Protein Classification:** to classify protein roles based on their cellular functions from gene ontology. The dataset we are using is PPI (`protein-protein interaction `_) graphs, where each graph represents a specific human tissue. @@ -277,13 +278,202 @@ For this financial application, we use the `Elliptic++ `_. -Finanical Application Examples +Financial Application Examples ------------------------------ To demonstrate how to perform Fraud Detection in financial applications, we introduced an :github_nvflare_link:`example ` illustrating how to use XGBoost in various ways to train a model in a federated manner with a `finance dataset `_. We illustrate both vertical and horizontal federated learning with XGBoost, along with histogram and tree-based approaches. +KeyCloak Site Authentication Integration +---------------------------------------- +FLARE is agnostic to the 3rd party authentication mechanism, and each client can have its own authentication system. +We demonstrate FLARE's support of site-specific authentication using KeyCloak. +The :github_nvflare_link:`KeyCloak Site Authentication Integration ` example is configured so the admin user will need additional user authentication to submit and run a job. 
+
+
+**********************************
 Migration to 2.4.0: Notes and Tips
-==================================
+**********************************
+
+FLARE 2.4.0 introduces a few API and behavior changes. This migration guide will help you to migrate from the previous NVFLARE version to the current version.
+
+Job Format: meta.json
+=====================
+In FLARE 2.4.0, users must have a meta.json configuration file defined in their jobs.
+Legacy app definitions should be updated to the job format to include a meta.json file with a deployment map and any number of app folders (containing config/ and custom/).
+Here is a basic job structure with a single app:
+
+.. code-block:: shell
+
+    ├── my_job
+    │   ├── app
+    │   │   ├── config
+    │   │   │   ├── config_client.json
+    │   │   │   └── config_server.json
+    │   │   └── custom
+    │   └── meta.json
+
+Here is the default meta.json, which can be edited accordingly:
+
+.. code-block:: json
+
+    {
+        "name": "my_job",
+        "resource_spec": {},
+        "min_clients" : 2,
+        "deploy_map": {
+            "app": [
+                "@ALL"
+            ]
+        }
+    }
+
+FLARE API Parity
+================
+In FLARE 2.3.0, an initial version of the FLARE API was implemented as a redesigned FLAdminAPI; however, we only included a subset of the functions.
+In FLARE 2.4.0, the FLARE API has been enhanced to include the remaining functions of the FLAdminAPI, so that the FLAdminAPI can be sunset.
+
+See :ref:`migrating_to_flare_api` for more details on the added functions.
+
+Timeout Handling
+----------------
+
+In the 2.4.0 release, improvements have been made to the timeout handling for commands involving Admin Server communication with FL Clients and awaiting responses.
+Previously, a fixed global timeout value was used on the Admin Server; however, this value was sometimes not enough if a command took a long time
+(e.g., the ``cat server log.txt`` command may take time to transfer a large log file).
+In this case, the user could use the ``set_timeout`` command to change the default timeout value of the Admin Server; however, this command had the drawback of being global and would affect all users.
+The global effect of this command meant one user setting a very small timeout value could cause all user commands to fail.
+
+To address this, the ``set_timeout`` command has been changed to be session specific.
+Additionally, a new ``unset_timeout`` command has been added to revert to the Admin Server's default timeout for the session.
+
+Changes to ``show_stats`` and ``show_errors``
+---------------------------------------------
+
+The old structure puts the server's result dict directly at the top level of the overall result dict, while each client's result dict is placed as an item keyed on the client name.
+To make it consistent between server and client results, we've changed to put the server's result as an item keyed on "server".
+If any code is based on the old return structure of FLAdminAPI, please update it accordingly.
+
+.. 
code-block:: json + + { + "server": { # new "server" key for server result dict + "ScatterAndGather": { + "tasks": { + "train": [ + "site-1", + "site-2" + ] + }, + "phase": "train", + "current_round": 2, + "num_rounds": 50 + }, + "ServerRunner": { + "job_id": "3ad5bdef-db12-4ffb-9362-0ff163973f7d", + "status": "started", + "workflow": "scatter_and_gather" + } + }, + "site-1": { + "ClientRunner": { + "job_id": "3ad5bdef-db12-4ffb-9362-0ff163973f7d", + "current_task_name": "None", + "status": "started" + } + }, + "site-2": { + "ClientRunner": { + "job_id": "3ad5bdef-db12-4ffb-9362-0ff163973f7d", + "current_task_name": "train", + "status": "started" + } + } + } + +POC Command Upgrade +=================== +The POC command has been upgraded in 2.4.0: + +- Remove ``--`` for action commands, change to subcommands +- POC is now using "production mode", the admin user name is now "admin@nvidia.com" instead of "admin" from previous releases. +- new ``-d`` docker and ``-he`` Homomorphic encryption options +- ``nvflare poc prepare`` generates ``.nvflare/config.conf`` to store location of POC workspace, takes precedent over environment variable ``NVFLARE_POC_WORKSPACE`` +- In the previous version, the startup kits are located directly under default POC workspace at ``/tmp/nvflare/poc``. In the 2.4.0, the startup kit is now under ``/tmp/nvflare/poc/example_project/prod_00/`` to follow the production provision default structure. +- Multi-org and multi-role support + +.. code-block:: none + + nvflare poc -h + usage: nvflare poc [-h] [--prepare] [--start] [--stop] [--clean] {prepare,prepare-jobs-dir,start,stop,clean} ... + + optional arguments: + -h, --help show this help message and exit + --prepare deprecated, suggest use 'nvflare poc prepare' + --start deprecated, suggest use 'nvflare poc start' + --stop deprecated, suggest use 'nvflare poc stop' + --clean deprecated, suggest use 'nvflare poc clean' + + poc: + {prepare,prepare-jobs-dir,start,stop,clean} + poc subcommand + prepare prepare poc environment by provisioning local project + prepare-jobs-dir prepare jobs directory + start start services in poc mode + stop stop services in poc mode + clean clean up poc workspace + +Refer to :ref:`poc_command` for more details. + +Secure Messaging +================ + +A new ``secure`` argument has been added for ``send_aux_request()`` in :class:`ServerEngineSpec`, +and :class:`ClientEngineExecutorSpec`. + +``secure`` is an optional boolean to determine whether the aux request should be sent in a secure way. +One such use case is for secure peer-to-peer messaging, such as in the client-controlled workflows. + +.. code-block:: python + + @abstractmethod + def send_aux_request( + self, + targets: Union[None, str, List[str]], + topic: str, + request: Shareable, + timeout: float, + fl_ctx: FLContext, + optional=False, + secure: bool = False, + ) -> dict: + """Send a request to Server via the aux channel. + Implementation: simply calls the ClientAuxRunner's send_aux_request method. + Args: + targets: aux messages targets. None or empty list means the server. + topic: topic of the request + request: request to be sent + timeout: number of secs to wait for replies. 0 means fire-and-forget. 
+            fl_ctx: FL context
+            optional: whether the request is optional
+            secure: should the request sent in the secure way
+        Returns:
+            a dict of reply Shareable in the format of:
+                { site_name: reply_shareable }
+        """
+        pass
+
+Stats Result Format
+===================
+In :class:`StatisticsController`,
+the result dictionary format originally concatenated "site" and "dataset" to support visualization.
+In 2.4.0, this has been changed so that "site" and "dataset" have their own keys in the result dictionary.
+
+``result = {feature: {statistic: {site-dataset: value}}}``
+
+to
+
+``result = {feature: {statistic: {site: {dataset: value}}}}``
-Coming Soon
\ No newline at end of file
+To continue to support the visualization needs, the site-dataset concatenation logic has instead been moved to
+:class:`Visualization`.
diff --git a/docs/release_notes/flare_250.rst b/docs/release_notes/flare_250.rst
new file mode 100644
index 0000000000..eb41a60fba
--- /dev/null
+++ b/docs/release_notes/flare_250.rst
@@ -0,0 +1,506 @@
+**************************
+What's New in FLARE v2.5.0
+**************************
+
+User Experience Improvements
+============================
+NVFlare 2.5.0 offers several new sets of APIs that allow for end-to-end ease of use and can greatly improve researchers' and data
+scientists' experience working with FLARE. The new APIs cover client, server, and job construction with an end-to-end pythonic user experience.
+
+Model Controller API
+--------------------
+The new :ref:`model_controller` greatly simplifies the experience of developing new federated learning workflows. Users can simply subclass
+the ModelController to develop new workflows. The new API doesn't require users to know the details of NVFlare constructs except for the FLModel
+class, which is simply a data structure that contains model weights, optimization parameters, and metadata.
+
+You can easily construct a new workflow with basic python code, and when ready, the send_and_wait() communication function is all you need for
+communication between the clients and the server.
+
+Client API
+----------
+We introduced another :ref:`client_api` implementation,
+:class:`InProcessClientAPIExecutor`.
+This has the same interface and syntax as the previous Client API using
+:class:`SubprocessLauncher`, except that all communication is in memory.
+
+Using this in-process client API, we build a :class:`ScriptExecutor`,
+which is directly used in the new Job API.
+
+Compared with SubProcessLauncherClientAPI, the in-process client API offers better efficiency and is easier to configure. All
+the operations will be carried out within the memory space of the executor.
+
+SubProcessLauncherClientAPI can be used for cases where a separate training process is required.
+
+Job API
+-------
+The new Job API, or :ref:`fed_job_api`, combined with the Client API and Model Controller API, gives users an end-to-end pythonic
+user experience. The job configuration, which was required prior to this release, can now be generated automatically, so the
+user doesn't need to edit the configuration files manually.
+
+We provide many examples to demonstrate the power of the new Job APIs, making it very easy to experiment with new federated
+learning algorithms or create new applications.
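+
+To make this concrete, below is a rough sketch of what such a job-construction script can look like. It is a
+simplified sketch based on the hello-world examples, not a definitive implementation: the module paths, the
+``ScriptRunner`` name (which may appear as ``ScriptExecutor`` depending on the exact 2.5 version), and the
+``src/train.py`` script and ``Net`` model are assumptions to be adapted to your own code.
+
+.. code-block:: python
+
+    from nvflare.app_common.workflows.fedavg import FedAvg
+    from nvflare.app_opt.pt.job_config.model import PTModel
+    from nvflare.job_config.api import FedJob
+    from nvflare.job_config.script_runner import ScriptRunner
+
+    from src.net import Net  # your own PyTorch model definition
+
+    n_clients = 2
+
+    job = FedJob(name="hello-pt_fedavg")
+
+    # Server side: a FedAvg controller and the initial PyTorch model
+    job.to(FedAvg(num_clients=n_clients, num_rounds=2), "server")
+    job.to(PTModel(Net()), "server")
+
+    # Client side: each site runs the training script via the Client API
+    for i in range(n_clients):
+        job.to(ScriptRunner(script="src/train.py"), f"site-{i + 1}")
+
+    # Export the job configuration, or run it directly with the FL Simulator
+    job.export_job("/tmp/nvflare/jobs/job_config")
+    job.simulator_run("/tmp/nvflare/jobs/workdir")
+
+The same job object can be exported as a standard job configuration for POC or production deployment, or run
+directly in the FL Simulator.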
+
+Flower Integration
+==================
+Integration between NVFlare and the `Flower `_ framework aims to provide researchers the ability to leverage
+the strengths of both frameworks by enabling Flower projects to seamlessly run on top of NVFlare. Through this
+integration, applications crafted within the Flower framework can run within the FLARE runtime environment
+without requiring any modifications. This initial integration streamlines the process and ensures smooth
+interoperability between the two platforms, making FL applications more efficient and more accessible.
+Please find details `here `__. A hello-world example is available
+:github_nvflare_link:`here `.
+
+Secure XGBoost
+==============
+The latest features from XGBoost introduce support for secure federated learning via homomorphic encryption. For vertical federated
+XGBoost learning, the gradients of each sample are protected by encryption such that the label information
+will not be leaked to unintended parties, while for horizontal federated XGBoost learning, the local gradient histograms will not be
+learned by the central aggregation server.
+
+With our encryption plugins working with XGBoost, NVFlare now supports all secure federated schemes for XGBoost model training, on
+both CPU and GPU.
+
+Please check the `federated xgboost with nvflare user guide `
+and the :github_nvflare_link:`example `.
+
+Tensorflow support
+==================
+With community contributions, we added FedOpt, FedProx, and Scaffold algorithms using Tensorflow.
+You can check the code :github_nvflare_link:`here ` and the :github_nvflare_link:`example `.
+
+FOBS Auto Registration
+======================
+FOBS, the secure mechanism NVFlare uses for message serialization and deserialization, is enhanced with new auto registration features.
+These changes will reduce the number of decomposers that users have to register. The changes are:
+
+  - Auto registering of decomposers on deserialization. The decomposer class is stored in the serialized data and the decomposers are
+    registered automatically when deserializing. If a component only receives serialized data but it doesn't perform serialization,
+    decomposer registering is not needed anymore.
+
+  - Data Class decomposer auto registering on serialization. If a decomposer is not found for a class, FOBS will try to treat the class
+    as a Data Class and register DataClassDecomposer for it. This works in most cases but not all. (A short usage sketch is shown
+    below, after the BioNeMo example.)
+
+
+New Examples
+============
+Secure Federated Kaplan-Meier Analysis
+--------------------------------------
+The :github_nvflare_link:`Secure Federated Kaplan-Meier Analysis via Time-Binning and Homomorphic Encryption example `
+illustrates two features:
+
+  - How to perform Kaplan-Meier survival analysis in a federated setting without and with secure features via time-binning and Homomorphic Encryption (HE).
+  - How to use the FLARE ModelController API to construct a workflow to facilitate HE under simulator mode.
+
+BioNeMo example for Drug Discovery
+----------------------------------
+`BioNeMo `_ is NVIDIA's generative AI platform for drug discovery.
+We included several examples of running BioNeMo in a federated learning environment using NVFlare:
+
+  - The :github_nvflare_link:`task fitting example ` includes a notebook that shows how to obtain protein-learned representations in the form of embeddings using the ESM-1nv pre-trained model.
+  - The :github_nvflare_link:`downstream example ` shows three different downstream tasks for fine-tuning a BioNeMo ESM-style model.
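+
+Here is the short usage sketch of the FOBS auto-registration behavior mentioned in the FOBS Auto Registration
+section above. It is an illustration only: ``TrainingStats`` is a hypothetical user-defined data class, not part
+of NVFlare.
+
+.. code-block:: python
+
+    from dataclasses import dataclass
+
+    from nvflare.fuel.utils import fobs
+
+
+    @dataclass
+    class TrainingStats:  # hypothetical user-defined data class
+        round_num: int
+        accuracy: float
+
+
+    # No explicit fobs.register(...) call is needed: on dumps(), FOBS falls back to a
+    # Data Class decomposer for TrainingStats, and on loads(), the decomposer class
+    # recorded in the payload is registered automatically.
+    payload = fobs.dumps(TrainingStats(round_num=3, accuracy=0.91))
+    restored = fobs.loads(payload)
+
+Classes that are not plain data classes still need an explicit decomposer, as noted above.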
+
+Federated Logistic Regression with NR optimization
+--------------------------------------------------
+The :github_nvflare_link:`Federated Logistic Regression with Second-Order Newton-Raphson optimization example `
+shows how to implement federated binary classification via logistic regression with second-order Newton-Raphson optimization.
+
+Hierarchical Federated Statistics
+---------------------------------
+:github_nvflare_link:`Hierarchical Federated Statistics ` is helpful when there
+are multiple organizations involved. For example, in medical device applications, device usage statistics can be
+viewed from the device, the device-hosting site, and the hospital or manufacturer points of view.
+Manufacturers would like to see the usage stats of their product (device) across different sites and hospitals. Hospitals
+may like to see the overall stats of devices, including different products from different manufacturers. In such cases, hierarchical
+federated statistics are very helpful.
+
+FedAvg Early Stopping Example
+------------------------------
+The :github_nvflare_link:`FedAvg Early Stopping example ` demonstrates that with the new server-side model
+controller API, it is very easy to change the control conditions and adjust workflows with a few lines of python code.
+
+Tensorflow Algorithms & Examples
+--------------------------------
+FedOpt, FedProx, and Scaffold implementations for Tensorflow.
+
+FedBN: Federated Learning on Non-IID Features via Local Batch Normalization
+---------------------------------------------------------------------------
+The :github_nvflare_link:`FedBN example ` showcases a federated learning algorithm designed
+to address the feature shift problem when aggregating models across different data distributions.
+
+FedBN uses local batch normalization to alleviate the feature shift before averaging models, and outperforms both
+classical FedAvg and FedProx in extensive experiments. These empirical results
+are supported by a convergence analysis that shows, in a simplified setting, that FedBN has a faster convergence rate than FedAvg.
+
+
+End-to-end Federated XGBoost examples
+-------------------------------------
+In :github_nvflare_link:`this example `,
+we show the end-to-end process of feature engineering, pre-processing, and training in federated settings. You
+can use FLARE to perform federated ETL and then training.
+
+Developer Tutorial Page
+=======================
+To let users quickly learn Federated Learning with FLARE, we developed a `tutorial web page `_ with
+both code and videos that let you interactively learn how to convert and run FL in a few minutes. We also
+created a tutorial catalog to help you easily search and find the examples you are interested in.
+
+**********************************
+Migration to 2.5.0: Notes and Tips
+**********************************
+
+FLARE 2.5.0 introduces some API and behavior changes. This migration guide will help you to migrate from the previous NVFlare version
+to the current version.
+
+Deprecate "name" to only use "path"
+===================================
+In 2.5.0, the "name" field in configurations is deprecated. You need to change the "name" field to "path" and use the full path. For
+example,
+
+.. code-block:: json
+
+    "name": "TBAnalyticsReceiver"
+
+needs to be updated to:
+
+.. 
code-block:: json + + "path": "nvflare.app_opt.tracking.tb.tb_receiver.TBAnalyticsReceiver" + +XGBoost v1 - v2 +=============== + +XGBoost support is enhanced in 2.5.0 to support secure training using Homomorphic Encryption (HE). The user interface is also simplified by +setting the XGBoost parameters in the controller so all clients get the same parameters. + +The main changes are: + + - The xgboost params have been moved from the client configuration to server. + - New split_mode and secure_training parameters + - New :class:`CSVDataLoader` + +Sample configuration files for 2.5.0 +------------------------------------- + +config_fed_server.json +"""""""""""""""""""""" + +.. code-block:: json + + { + "format_version": 2, + "num_rounds": 3, + "workflows": [ + { + "id": "xgb_controller", + "path": "nvflare.app_opt.xgboost.histogram_based_v2.fed_controller.XGBFedController", + "args": { + "num_rounds": "{num_rounds}", + "split_mode": 1, + "secure_training": false, + "xgb_options": { + "early_stopping_rounds": 2 + }, + "xgb_params": { + "max_depth": 3, + "eta": 0.1, + "objective": "binary:logistic", + "eval_metric": "auc", + "tree_method": "hist", + "nthread": 1 + }, + "client_ranks": { + "site-1": 0, + "site-2": 1 + }, + "in_process": true + } + } + ] + } + +config_fed_client.json +"""""""""""""""""""""" + +.. code-block:: json + + { + "format_version": 2, + "executors": [ + { + "tasks": [ + "config", + "start" + ], + "executor": { + "id": "Executor", + "path": "nvflare.app_opt.xgboost.histogram_based_v2.fed_executor.FedXGBHistogramExecutor", + "args": { + "data_loader_id": "dataloader", + "in_process": true + } + } + } + ], + "components": [ + { + "id": "dataloader", + "path": "nvflare.app_opt.xgboost.histogram_based_v2.secure_data_loader.SecureDataLoader", + "args": { + "rank": 0, + "folder": "/tmp/nvflare/dataset/vertical_xgb_data" + } + } + ] + } + +Simulator workspace structure +============================= + +In 2.4.0, the server and all the clients shared the same simulator workspace root of ``simulate_job``. The server and each client had +their own app_XXXX job definition, but the same root folder for the workspace may result in conflicting model file locations. + +.. raw:: html + +
+ Example folder structure for 2.4.0 + +.. code-block:: none + + simulator/ + ├── local + │ └── log.config + ├── simulate_job + │ ├── app_server + │ │ ├── FL_global_model.pt + │ │ ├── __init__.py + │ │ ├── config + │ │ │ ├── config_fed_client.json + │ │ │ ├── config_fed_server.json + │ │ │ ├── config_train.json + │ │ │ ├── config_validation.json + │ │ │ ├── dataset_0.json + │ │ │ └── environment.json + │ │ ├── custom + │ │ │ ├── __init__.py + │ │ │ ├── add_shareable_parameter.py + │ │ │ ├── client_aux_handler.py + │ │ │ ├── client_send_aux.py + │ │ │ ├── client_trainer.py + │ │ │ ├── fed_avg_responder.py + │ │ │ ├── model_shareable_manager.py + │ │ │ ├── print_shareable_parameter.py + │ │ │ ├── server_aux_handler.py + │ │ │ ├── server_send_aux.py + │ │ │ └── supervised_fitter.py + │ │ ├── docs + │ │ │ ├── Readme.md + │ │ │ └── license.txt + │ │ ├── eval + │ │ └── models + │ ├── app_site-1 + │ │ ├── __init__.py + │ │ ├── config + │ │ │ ├── config_fed_client.json + │ │ │ ├── config_fed_server.json + │ │ │ ├── config_train.json + │ │ │ ├── config_validation.json + │ │ │ ├── dataset_0.json + │ │ │ └── environment.json + │ │ ├── custom + │ │ │ ├── __init__.py + │ │ │ ├── add_shareable_parameter.py + │ │ │ ├── client_aux_handler.py + │ │ │ ├── client_send_aux.py + │ │ │ ├── client_trainer.py + │ │ │ ├── fed_avg_responder.py + │ │ │ ├── model_shareable_manager.py + │ │ │ ├── print_shareable_parameter.py + │ │ │ ├── server_aux_handler.py + │ │ │ ├── server_send_aux.py + │ │ │ └── supervised_fitter.py + │ │ ├── docs + │ │ │ ├── Readme.md + │ │ │ └── license.txt + │ │ ├── eval + │ │ ├── log.txt + │ │ └── models + │ ├── app_site-2 + │ │ ├── __init__.py + │ │ ├── config + │ │ │ ├── config_fed_client.json + │ │ │ ├── config_fed_server.json + │ │ │ ├── config_train.json + │ │ │ ├── config_validation.json + │ │ │ ├── dataset_0.json + │ │ │ └── environment.json + │ │ ├── custom + │ │ │ ├── __init__.py + │ │ │ ├── add_shareable_parameter.py + │ │ │ ├── client_aux_handler.py + │ │ │ ├── client_send_aux.py + │ │ │ ├── client_trainer.py + │ │ │ ├── fed_avg_responder.py + │ │ │ ├── model_shareable_manager.py + │ │ │ ├── print_shareable_parameter.py + │ │ │ ├── server_aux_handler.py + │ │ │ ├── server_send_aux.py + │ │ │ └── supervised_fitter.py + │ │ ├── docs + │ │ │ ├── Readme.md + │ │ │ └── license.txt + │ │ ├── eval + │ │ ├── log.txt + │ │ └── models + │ ├── log.txt + │ ├── meta.json + │ └── pool_stats + │ └── simulator_cell_stats.json + └── startup + ├── client_context.tenseal + └── server_context.tenseal + +.. raw:: html + +
+
+
+In 2.5.0, the server and each client have their own workspace subfolder under the simulator workspace, and the ``simulate_job``
+folder is located within each site's workspace. Each site is therefore fully isolated, so model file locations no longer conflict. This workspace
+structure is consistent with the workspace layout of POC and real-world deployments.
+
+.. raw:: html
+
+ Example folder structure for 2.5.0 + +.. code-block:: none + + simulator/ + ├── server + │ ├── local + │ │ └── log.config + │ ├── log.txt + │ ├── pool_stats + │ │ └── simulator_cell_stats.json + │ ├── simulate_job + │ │ ├── app_server + │ │ │ ├── FL_global_model.pt + │ │ │ └── config + │ │ │ ├── config_fed_client.conf + │ │ │ └── config_fed_server.conf + │ │ ├── artifacts + │ │ │ ├── 39d0b7edb17b437dbf77da2e402b2a4d + │ │ │ │ └── artifacts + │ │ │ │ └── running_loss_reset.txt + │ │ │ └── b10ff3e54b0d464c8aab8cf0b751f3cf + │ │ │ └── artifacts + │ │ │ └── running_loss_reset.txt + │ │ ├── cross_site_val + │ │ │ ├── cross_val_results.json + │ │ │ ├── model_shareables + │ │ │ │ ├── SRV_FL_global_model.pt + │ │ │ │ ├── site-1 + │ │ │ │ └── site-2 + │ │ │ └── result_shareables + │ │ │ ├── site-1_SRV_FL_global_model.pt + │ │ │ ├── site-1_site-1 + │ │ │ ├── site-1_site-2 + │ │ │ ├── site-2_SRV_FL_global_model.pt + │ │ │ ├── site-2_site-1 + │ │ │ └── site-2_site-2 + │ │ ├── meta.json + │ │ ├── mlruns + │ │ │ ├── 0 + │ │ │ │ └── meta.yaml + │ │ │ └── 470289463842501388 + │ │ │ ├── 39d0b7edb17b437dbf77da2e402b2a4d + │ │ │ │ ├── artifacts + │ │ │ │ ├── meta.yaml + │ │ │ │ ├── metrics + │ │ │ │ │ ├── running_loss + │ │ │ │ │ ├── train_loss + │ │ │ │ │ └── validation_accuracy + │ │ │ │ ├── params + │ │ │ │ │ ├── learning_rate + │ │ │ │ │ ├── loss + │ │ │ │ │ └── momentum + │ │ │ │ └── tags + │ │ │ │ ├── client + │ │ │ │ ├── job_id + │ │ │ │ ├── mlflow.note.content + │ │ │ │ ├── mlflow.runName + │ │ │ │ └── run_name + │ │ │ ├── b10ff3e54b0d464c8aab8cf0b751f3cf + │ │ │ │ ├── artifacts + │ │ │ │ ├── meta.yaml + │ │ │ │ ├── metrics + │ │ │ │ │ ├── running_loss + │ │ │ │ │ ├── train_loss + │ │ │ │ │ └── validation_accuracy + │ │ │ │ ├── params + │ │ │ │ │ ├── learning_rate + │ │ │ │ │ ├── loss + │ │ │ │ │ └── momentum + │ │ │ │ └── tags + │ │ │ │ ├── client + │ │ │ │ ├── job_id + │ │ │ │ ├── mlflow.note.content + │ │ │ │ ├── mlflow.runName + │ │ │ │ └── run_name + │ │ │ ├── meta.yaml + │ │ │ └── tags + │ │ │ └── mlflow.note.content + │ │ └── tb_events + │ │ ├── site-1 + │ │ │ ├── events.out.tfevents.1724447288.yuhongw-mlt.86138.3 + │ │ │ ├── metrics_running_loss + │ │ │ │ └── events.out.tfevents.1724447288.yuhongw-mlt.86138.5 + │ │ │ └── metrics_train_loss + │ │ │ └── events.out.tfevents.1724447288.yuhongw-mlt.86138.4 + │ │ └── site-2 + │ │ ├── events.out.tfevents.1724447288.yuhongw-mlt.86138.0 + │ │ ├── metrics_running_loss + │ │ │ └── events.out.tfevents.1724447288.yuhongw-mlt.86138.2 + │ │ └── metrics_train_loss + │ │ └── events.out.tfevents.1724447288.yuhongw-mlt.86138.1 + │ └── startup + ├── site-1 + │ ├── local + │ │ └── log.config + │ ├── log.txt + │ ├── simulate_job + │ │ ├── app_site-1 + │ │ │ └── config + │ │ │ ├── config_fed_client.conf + │ │ │ └── config_fed_server.conf + │ │ ├── meta.json + │ │ └── models + │ │ └── local_model.pt + │ └── startup + ├── site-2 + │ ├── local + │ │ └── log.config + │ ├── log.txt + │ ├── simulate_job + │ │ ├── app_site-2 + │ │ │ └── config + │ │ │ ├── config_fed_client.conf + │ │ │ └── config_fed_server.conf + │ │ ├── meta.json + │ │ └── models + │ │ └── local_model.pt + │ └── startup + └── startup + +.. raw:: html + +
+
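+
+For reference, a workspace with the structure shown above is produced by a two-client simulator run; the job folder and workspace paths below are illustrative only:
+
+.. code-block:: shell
+
+    nvflare simulator jobs/my_job -w /tmp/nvflare/simulator -n 2 -t 2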
+ +Allow Simulator local resources configuration +============================================== +In 2.4.0, we only support the ``log.config`` setting file within the simulator workspace ``startup`` folder to be used to change the log format. + +In 2.5.0, we enable the full ``local`` and ``startup`` contents to be configured under the simulator workspace. All the POC real world application +local settings can be placed within the ``workspace/local`` folder and be deployed to each site. The ``log.config`` file is also moved to +this ``workspace/local`` folder. diff --git a/docs/resources/3rd_party_integration_diagram.png b/docs/resources/3rd_party_integration_diagram.png new file mode 100644 index 0000000000..5f99832968 Binary files /dev/null and b/docs/resources/3rd_party_integration_diagram.png differ diff --git a/docs/resources/3rd_party_trainer.py b/docs/resources/3rd_party_trainer.py new file mode 100644 index 0000000000..1ffdd085bb --- /dev/null +++ b/docs/resources/3rd_party_trainer.py @@ -0,0 +1,59 @@ +import argparse +import logging + +from nvflare.client.flare_agent import AgentClosed, FlareAgentWithCellPipe + +NUMPY_KEY = "numpy_key" + + +def main(): + + logging.basicConfig() + logging.getLogger().setLevel(logging.INFO) + + parser = argparse.ArgumentParser() + parser.add_argument("--workspace", "-w", type=str, help="workspace folder", required=False, default=".") + parser.add_argument("--site_name", "-s", type=str, help="flare site name", required=True) + parser.add_argument("--agent_id", "-a", type=str, help="agent id", required=True) + + args = parser.parse_args() + + # 1. create the agent + agent = FlareAgentWithCellPipe( + root_url="grpc://server:8002", + site_name=args.site_name, + agent_id=args.agent_id, + workspace_dir=args.workspace, + secure_mode=True, + submit_result_timeout=2.0, + heartbeat_timeout=120.0, + ) + + # 2. start the agent + agent.start() + + # 3. processing tasks + while True: + print("getting task ...") + try: + task = agent.get_task() + except AgentClosed: + print("agent closed - exit") + break + + print(f"got task: {task}") + result = train(task.data) # perform train task + submitted = agent.submit_result(result) + print(f"result submitted: {submitted}") + + # 4. 
stop the agent + agent.stop() + + +def train(model): + print(f"training on {model}") + return model + + +if __name__ == "__main__": + main() diff --git a/docs/resources/Dockerfile b/docs/resources/Dockerfile index c2c992a651..4d82cee85f 100644 --- a/docs/resources/Dockerfile +++ b/docs/resources/Dockerfile @@ -1,7 +1,7 @@ -ARG PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:23.02-py3 +ARG PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:24.07-py3 FROM ${PYTORCH_IMAGE} -ARG NVF_VERSION=2.3 +ARG NVF_VERSION=main ENV NVF_BRANCH=${NVF_VERSION} RUN python3 -m pip install -U pip diff --git a/docs/resources/FLARE_as_flower_communicator.png b/docs/resources/FLARE_as_flower_communicator.png new file mode 100644 index 0000000000..3cd927e392 Binary files /dev/null and b/docs/resources/FLARE_as_flower_communicator.png differ diff --git a/docs/resources/client_api.png b/docs/resources/client_api.png new file mode 100644 index 0000000000..edc340b010 Binary files /dev/null and b/docs/resources/client_api.png differ diff --git a/docs/resources/controller_executor_no_filter.png b/docs/resources/controller_executor_no_filter.png new file mode 100644 index 0000000000..ff4f17c15a Binary files /dev/null and b/docs/resources/controller_executor_no_filter.png differ diff --git a/docs/resources/experiment_tracking_diagram.png b/docs/resources/experiment_tracking_diagram.png new file mode 100644 index 0000000000..58f5d2fc5e Binary files /dev/null and b/docs/resources/experiment_tracking_diagram.png differ diff --git a/docs/programming_guide/resources/fed_sag_round.png b/docs/resources/fed_sag_round.png similarity index 100% rename from docs/programming_guide/resources/fed_sag_round.png rename to docs/resources/fed_sag_round.png diff --git a/docs/resources/fed_xgb_detail.png b/docs/resources/fed_xgb_detail.png new file mode 100644 index 0000000000..ca1818e084 Binary files /dev/null and b/docs/resources/fed_xgb_detail.png differ diff --git a/docs/resources/fl_diagram.png b/docs/resources/fl_diagram.png new file mode 100644 index 0000000000..cb5442732f Binary files /dev/null and b/docs/resources/fl_diagram.png differ diff --git a/docs/resources/flare_flower_communication.png b/docs/resources/flare_flower_communication.png new file mode 100644 index 0000000000..b886866550 Binary files /dev/null and b/docs/resources/flare_flower_communication.png differ diff --git a/docs/resources/flare_overview.png b/docs/resources/flare_overview.png new file mode 100644 index 0000000000..b7562c0b3b Binary files /dev/null and b/docs/resources/flare_overview.png differ diff --git a/docs/programming_guide/resources/init_weights_1_config_fed_server.json b/docs/resources/init_weights_1_config_fed_server.json similarity index 82% rename from docs/programming_guide/resources/init_weights_1_config_fed_server.json rename to docs/resources/init_weights_1_config_fed_server.json index a4b8708bf1..614f50753d 100644 --- a/docs/programming_guide/resources/init_weights_1_config_fed_server.json +++ b/docs/resources/init_weights_1_config_fed_server.json @@ -8,7 +8,7 @@ "components": [ { "id": "persistor", - "name": "PTFileModelPersistor" + "path": "nvflare.app_opt.pt.file_model_persistor.PTFileModelPersistor" }, { "id": "shareable_generator", @@ -36,14 +36,14 @@ "workflows": [ { "id": "pre_train", - "name": "InitializeGlobalWeights", + "path": "nvflare.app_common.workflows.initialize_global_weights.InitializeGlobalWeights", "args": { "task_name": "get_weights" } }, { "id": "scatter_and_gather", - "name": "ScatterAndGather", + "path": 
"nvflare.app_common.workflows.scatter_and_gather.ScatterAndGather", "args": { "min_clients": 2, "num_rounds": 2, @@ -58,7 +58,7 @@ }, { "id": "cross_site_validate", - "name": "CrossSiteModelEval", + "path": "nvflare.app_common.workflows.cross_site_model_eval.CrossSiteModelEval", "args": { "model_locator_id": "model_locator" } diff --git a/docs/resources/list_templates_results.png b/docs/resources/list_templates_results.png new file mode 100644 index 0000000000..49da02ec5c Binary files /dev/null and b/docs/resources/list_templates_results.png differ diff --git a/docs/resources/loose_xgb.png b/docs/resources/loose_xgb.png new file mode 100644 index 0000000000..1dd907a239 Binary files /dev/null and b/docs/resources/loose_xgb.png differ diff --git a/docs/resources/nvflare_overview.svg b/docs/resources/nvflare_overview.svg new file mode 100644 index 0000000000..493abc5760 --- /dev/null +++ b/docs/resources/nvflare_overview.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/docs/resources/nvidia_logo.png b/docs/resources/nvidia_logo.png new file mode 100644 index 0000000000..578592cfc3 Binary files /dev/null and b/docs/resources/nvidia_logo.png differ diff --git a/docs/resources/processor_interface_design.png b/docs/resources/processor_interface_design.png new file mode 100644 index 0000000000..ff84dae5d2 Binary files /dev/null and b/docs/resources/processor_interface_design.png differ diff --git a/docs/resources/secure_horizontal_xgb.png b/docs/resources/secure_horizontal_xgb.png new file mode 100644 index 0000000000..c09c847e2c Binary files /dev/null and b/docs/resources/secure_horizontal_xgb.png differ diff --git a/docs/resources/secure_vertical_xgb.png b/docs/resources/secure_vertical_xgb.png new file mode 100644 index 0000000000..f8e596d6dc Binary files /dev/null and b/docs/resources/secure_vertical_xgb.png differ diff --git a/docs/resources/system_architecture.png b/docs/resources/system_architecture.png new file mode 100644 index 0000000000..fbab53ff97 Binary files /dev/null and b/docs/resources/system_architecture.png differ diff --git a/docs/resources/task_execution_decision_chart.png b/docs/resources/task_execution_decision_chart.png new file mode 100644 index 0000000000..c40bae817e Binary files /dev/null and b/docs/resources/task_execution_decision_chart.png differ diff --git a/docs/resources/tight_xgb.png b/docs/resources/tight_xgb.png new file mode 100644 index 0000000000..8bc5adb533 Binary files /dev/null and b/docs/resources/tight_xgb.png differ diff --git a/docs/resources/xgb_communicator.jpg b/docs/resources/xgb_communicator.jpg new file mode 100644 index 0000000000..c04c547674 Binary files /dev/null and b/docs/resources/xgb_communicator.jpg differ diff --git a/docs/user_guide.rst b/docs/user_guide.rst index fa6eaec77b..a4c8787537 100644 --- a/docs/user_guide.rst +++ b/docs/user_guide.rst @@ -4,27 +4,24 @@ User Guide ########## -This user guide has information about various features in NVIDIA FLARE. +This user guide provides instructions on how to utilize various features in NVIDIA FLARE. For information about operating an FL system, see :ref:`Real-World Federated Learning `. -For more details on what you can do with apps with custom components and -the flexibility that the Controller and Worker APIs bring, see the :ref:`programming_guide`. - -In version 2.2, the commands for NVIDIA FLARE have been consolidated to be under the ``nvflare`` command for -better ease of use. 
This includes the FL Simulator, the POC command, ``provision``, and preflight check, all of -which are explained in more detail in their own sections linked below. +For a more in-depth exploration of the capabilities offered by apps with custom workflows and algorithms, +please refer to the :ref:`programming_guide`. .. toctree:: :maxdepth: 1 user_guide/nvflare_cli - user_guide/client_api + user_guide/configurations user_guide/dashboard_api user_guide/dashboard_ui user_guide/nvflare_security user_guide/docker_compose user_guide/helm_chart - user_guide/logging_configuration user_guide/confidential_computing user_guide/hierarchy_unification_bridge + user_guide/federated_xgboost + user_guide/flower_integration diff --git a/docs/user_guide/client_api.rst b/docs/user_guide/client_api.rst deleted file mode 100644 index d13e685d95..0000000000 --- a/docs/user_guide/client_api.rst +++ /dev/null @@ -1,88 +0,0 @@ -.. _client_api: - -########## -Client API -########## - -NVFlare Client API provides an easy way for users to convert their centralized, local -training code into a federated learning code. - -It brings the following benefits: - -* Enable a quicker start by reducing the number of new NVFlare specific concepts - a user has to learn when first working with Federated Learning using NVFlare. - -* Enable easy adaptation from existing local training code using different framework - (pytorch, pytorch lightning, huggingface) to run the application in a - federated setting by just few lines of code changes - -************ -Core concept -************ - -Federated learning's concept is for each participating site to get a good model (better than -locally trained model) without sharing the data. - -It is done by sharing model parameters or parameter differences (certain filters can be used to -ensure privacy-preserving and protects against gradient inversion attacks) to each other. - -The aggregators will take in all these model parameters submitted by each site and produce a -new global model. - -We hope that this new global model will be better than locally trained model since it -conceptually trained on more data. - -One of the popular federated learning workflow, "FedAvg" is like this: - -#. Server site initialize an initial model -#. For each round: - - #. server sends the global model to clients - #. each client starts with this global model and train on their own data - #. each client sends back their trained model - #. server aggregates all the models and produces a new global model - -On the client side, the training workflow is: - -#. get a model from server side -#. local training -#. send a trained model to server side - -To be able to support different training frameworks, we define a standard data structure called "FLModel" -for the local training code to exchange information with NVFlare system. - -We explain its attributes below: - -.. literalinclude:: ../../nvflare/app_common/abstract/fl_model.py - :language: python - :lines: 41-67 - :linenos: - :caption: fl_model.py - -Users only need to get the required information from this data structure, -run local training, and put the results back into this data structure to be aggregated on the aggregator side. - - -For a general use case, there are three essential methods for the Client API: - -* `init()`: Initializes NVFlare Client API environment. -* `receive()`: Receives model from NVFlare side. -* `send()`: Sends the model to NVFlare side. 
- - -Users can use these APIs to change their centralized training code to federate learning, for example: - -.. code-block:: python - - import nvflare.client as flare - - flare.init() - input_model = flare.receive() - new_params = local_train(input_model.params) - output_model = flare.FLModel(params=new_params) - flare.send(output_model) - -Please refer to (:mod:`nvflare.client` for all the APIs) - -For more examples of using Client API with different frameworks, -please refer to :github_nvflare_link:`examples/hello-world/ml-to-fl `. diff --git a/docs/user_guide/configurations.rst b/docs/user_guide/configurations.rst new file mode 100644 index 0000000000..9d8998a183 --- /dev/null +++ b/docs/user_guide/configurations.rst @@ -0,0 +1,29 @@ +.. _configurations: + +################### +Configuration Files +################### + +**Supported Configuration File Formats** + +- `JSON `_ +- `YAML `_ +- `Pyhocon `_ - a JSON variant and HOCON (Human-Optimized Config Object Notation) parser for python. + Supports comments, variable substitution, and inheritance. +- `OmegaConf `_ - a YAML based hierarchical configuration. + +Users have the flexibility to use a single format or combine several formats, as exemplified by using config_fed_client.conf and config_fed_server.json together. +If multiple configuration formats coexist, then their usage will be prioritized based on the following search order: + +``.json -> .conf -> .yml -> .yaml`` + +See the sections below for more in-depth information about the different capabilities and types of configuration files: + +.. toctree:: + :maxdepth: 1 + + configurations/variable_resolution + configurations/job_configuration + configurations/communication_configuration + configurations/logging_configuration + \ No newline at end of file diff --git a/docs/user_guide/configurations/communication_configuration.rst b/docs/user_guide/configurations/communication_configuration.rst new file mode 100644 index 0000000000..dbe1ad2a4b --- /dev/null +++ b/docs/user_guide/configurations/communication_configuration.rst @@ -0,0 +1,342 @@ +.. _communication_configuration: + +########################### +Communication Configuration +########################### + +FLARE's communication system is based on the CellNet technology. +CellNet supports logical communication. Each site in the system is called a communication cell, or simply a cell. +All cells form a communication network called CellNet and each cell has a unique ID called Fully Qualified Cell Name (FQCN). +Any cell can communicate with any other cells via their FQCNs, regardless how the messages are routed. + +FLARE is a multi-job system in that multiple jobs can be executed at the same time. +When a FLARE system is started, the CellNet only comprises of the server and one client cell for each site. +All client cells are connected to the server cell. This topology is the backbone of the communication system and cells are called Parent Cells. + +When a job is deployed, the job is done by new cells dedicated to the execution of the job, one cell at each site (server and clients). +These cells are called Job Cells which are started when the job is deployed, and stopped when the job is finished. + +This communication system provides many powerful features (multiple choices of communication drivers, large message streaming, ad-hoc direct connections, etc.). +However, for these features to work well, they need to be configured properly. 
+ +This document describes all aspects that can be configured and how to do configure them properly. + +The following aspects of the communication system can be configured: + +- Parameters of communication drivers +- Selection of gRPC driver implementation (asyncio vs. non-asyncio) +- Configuration of ad-hoc connections +- Configuration of internal connections +- Messaging parameters + +General Configuration +===================== + +The communication system is configured with the comm_config.json file. This file is to be maintained by Operation Staff of each FL site (servers and FL clients). +This file must be placed in the site's "local" folder: + +``/local/comm_config.json`` + +Some aspects of the communication system are configured with simple variables (e.g. max_message_size). +Variables can be defined in comm_config.json or via OS system environment variables. + +To define a variable in comm_config.json, simply set it as the first-level element: + +.. code-block:: json + + { + "max_message_size": 2000000000 + } + +You can also define the variable using an OS environment variable. The name of the env var the var name converted into uppercase and prefixed with ``NVFLARE_``. +For example, the env var name for max_message_size is: ``NVFLARE_MAX_MESSAGE_SIZE``. + +If you define the same variable both in the file and as an environment variable, the value defined in the file takes precedence. + +The following is an example of the comm_config.json: + +.. code-block:: json + + { + "allow_adhoc_conns": false, + "backbone_conn_gen": 2, + "max_message_size": 2000000000, + "internal": { + "scheme": "tcp", + "resources": { + "host": "localhost" + } + }, + "adhoc": { + "scheme": "tcp", + "resources": { + "host": "localhost", + "secure": false + } + }, + "grpc": { + "options": [ + [ + "grpc.max_send_message_length", 1073741824 + ], + [ + "grpc.max_receive_message_length", 1073741824 + ], + [ + "grpc.keepalive_time_ms", 120000 + ], + [ + "grpc.http2.max_pings_without_data", 0 + ] + ] + } + } + + +Configuration of Communication Drivers +====================================== + +A communication driver is identified by its scheme (tcp, http, grpc, etc.). +The details of the driver can be configured with a section named with the scheme in the config file. In the example above, the "grpc" section defines the gRPC driver's options. + +Note that different drivers have different configuration parameters. + +GRPC Configuration +------------------ + +The GRPC driver's details are defined in the "options" section within the "grpc" section. Please see GRPC documentation for details of available options. + +Note that since FLARE has built general messaging management for all drivers, you shouldn't need to configure GRPC options in most cases. + +GRPC Driver Selection +--------------------- + +GRPC is the default scheme for communication between FL clients and the server. +FLARE provides two implementations of GRPC drivers, one uses GRPC's asyncio version (AIO), another uses GRPC's non-asyncio version (non-AIO). +The default driver is the non-AIO version. + +According to GRPC documentation, the AIO GRPC is slightly more efficient. +But the main advantage is that it can handle many more simultaneous connections on the server side, and there is no need to configure the "num_workers" parameter. + +Unfortunately the AIO GRPC client-side library is not stable under difficult network conditions where disconnects happen frequently. +The non-AIO GRPC library seems very stable. 
+ +If your network is stable and you have many clients and/or many concurrent jobs, you should consider using the AIO version of the GRPC driver. +This is done by setting use_aio_grpc to true: + +``"use_aio_grpc": true`` + +On the server side if you use the non-AIO gRPC driver, the default maximum number of workers is 100, meaning that there can be at most 100 concurrent connections to the server. +If this is not enough, you will need to use the AIO gRPC driver. + +Ad-hoc Connections +================== + +By default, all sites only connect to the server. When a site needs to talk to another site, messages will be relayed through the server. +To improve communication speed, it could be configured to allow the two sites to communicate directly, if network policies of the sites permit. +A direct connection between two sites (cells) is called an ad-hoc connection. + +First of all, the ad-hoc connection must be enabled. This is done by setting the allow_adhoc_conns variable to true (default value is false). + +``"allow_adhoc_conns": true`` + +Secondly, in the "adhoc" section, you can further specify what scheme to use for ad-hoc connections, as well as resources for establishing the connections. + +.. code-block:: json + + "adhoc": { + "scheme": "tcp", + "resources": { + "host": "localhost", + "secure": false, + "ports": [8008, 9008] + } + } + +In this example, we use tcp for ad-hoc connections, and we will listen on port number 8008 or 9008. +Note that the ad-hoc connection's port number is dynamically determined based on the port information in the config. + +Config Properties +----------------- + +Scheme +^^^^^^ + +You specify the communication driver with the "scheme" property. Available schemes are grpc, http and tcp. + +If not specified, the default scheme is "tcp". + +Host +^^^^ + +You specify the host of the connection with the "host" property. This value is part of the URL for the connector to connect to. + +Secure +^^^^^^ + +The "secure" property to specifies whether the ad-hoc connections will use SSL. + +Note that if secure is set to true for a site, then the site must have a "server certificate", even if the site is a FL Client. +The site's "server certificate" is generated during the provision process, if you configure the "listening_host" property for the site in project.yml. + +In secure communication mode, this host name must match the Common Name of the site's "server certificate", which is the same as the "listening_host" property for the site in project.yml. + +The default value of "secure" is false. + +Port Numbers +^^^^^^^^^^^^ + +You can specify port numbers to be used for connecting to the host. If not specified, an available port number will be dynamically assigned at the time the ad-hoc listener is created. + +To specify a single port number using the "port" property: + +``"port": 8008`` + +To specify a list of port numbers using the "ports" property: + +``"ports": [8008, 8009, 8010]`` + +To specify a list of port number ranges using the "ports" property. The following example specifies two ranges of port numbers, one from 8008 to 9008, another from 18000 to 19000. + +``"ports": [8008-9008, 18000-19000]`` + + +Internal Connections +==================== + +As described earlier, job cells are started when a job is deployed. There is one job cell at each site (server and FL clients). +Job cells at one site are connected to the Parent cell of the same site. Such job-cell/parent-cell connections are called internal connections, since they are internal within the same site. 
+ +By default, internal connections use tcp drivers on dynamically determined port numbers. +Since internal connections are used between processes running on the same host, they don't require SSL. + +If this default setup does not work for you, you can configure it to your liking in the "internal" section. For example: + +.. code-block:: json + + "internal": { + "scheme": "grpc", + "resources": { + "host": "localhost", + "secure": false, + "ports": [8008, 9008] + } + } + +In this example, we changed to use "grpc" as the communication scheme. + +The syntax and meanings of the properties are exactly the same as the "adhoc" configurations. + +Messaging Parameters +==================== + +FLARE's messaging functions should work well with default configuration settings. However you may find it necessary to tune some parameters under some circumstances. +This section describes all parameters that you can configure. + +The messaging parameters can be specified in /local/comm_config.json file as first-level elements, or by using environment variables as described in the beginning of this document. + +This is an example of comm_config.json file with default values for all the parameters, + +.. code-block:: json + + { + "comm_driver_path": "", + "heartbeat_interval": 60, + "streaming_chunk_size": 1048576, + "streaming_read_timeout": 60, + "streaming_max_out_seq_chunks": 16, + "streaming_window_size": 16777216, + "streaming_ack_interval": 4194304, + "streaming_ack_wait": 10 + } + +When large amount of data are exchanged on busy hosts like in LLM training, following parameters are recommended in /local/comm_config.json on both servers and clients, + +.. code-block:: json + + { + "streaming_read_timeout": 3000, + "streaming_ack_wait": 6000 + } + +The communication_timeout parameter should be adjusted as following on clients in /local/resources.json, + +.. code-block:: json + + { + "format_version": 2, + "client": { + "communication_timeout": 6000 + }, + } + +Here are the detailed description of each messaging parameter, + +comm_driver_path +---------------- + +FLARE supports custom communication drivers. The paths to search for the drivers need to be configured using parameter "comm_driver_path". +The parameter should be a list separated by colon. For example, + +``"comm_driver_path": "/opt/drivers:/home/nvflare/drivers"`` + +heartbeat_interval +------------------ + +To keep the connection alive, FLARE exchanges a short message (PING/PONG) for each connection if no traffic is detected for a period of time. +This is controlled through the parameter "heartbeat_interval". The unit is seconds and the default value is 60. + +``"heartbeat_interval": 30`` + +This parameter needs to be changed if the network closes idle connection too aggressively. + +FLARE supports streaming of large messages. With streaming, the message is sliced into chunks and each chunk is sent as an individual message. +On the receiving end, the chunks are combined into the original large message. The following parameters control the general streaming behavior, + +streaming_chunk_size +-------------------- + +The chunk size in bytes. The default value is 1M. When deciding chunk size the following factors must be considered: +- Each chunk is sent with headers so there is some overhead (around 50 bytes) so try to avoid small chunks (< 1K). +- The relaying server has to buffer the whole chunk so the memory usage will be higher with bigger chunks. 
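+
+For illustration, assuming a fast and reliable network where larger chunks are acceptable, the chunk size could be raised as a first-level element of ``comm_config.json`` as described earlier (the value below is only an example, not a recommendation):
+
+.. code-block:: json
+
+    {
+        "streaming_chunk_size": 4194304
+    }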
+ +streaming_read_timeout +---------------------- + +The receiver of streaming times out after this value while waiting for the next chunk. The unit is seconds and the default is 60. + +This timeout is used to detect dead senders. On a very slow network or extremely busy host, this value may need to be increased. + +streaming_max_out_seq_chunks +---------------------------- + +The chunks may arrive on the receiving end out of sequence. +The receiver keeps out-of-sequence chunks in a reassembly buffer while waiting for the expected chunk to arrive. +The streaming terminates with error if the number of chunks in the reassembly buffer is larger than this value. The default is 16. + +The streaming implements a sliding-window protocol for flow-control. The receiver sends ACKs after the chunks are retrieved by the reader. +The window is all the chunks sent but not being acknowledged by the receiver. Once the window reaches a certain size, the sender pauses and waits for more ACKs. +Following parameters are used to control the flow-control behavior. + +streaming_window_size +--------------------- + +The sliding window size in bytes. The default is 16M. + +The larger the window size, the smoother the flow of data but the memory usage will be higher. + +streaming_ack_interval +---------------------- + +This parameter controls how often the receiver sends ACKs to the sender. +he unit is bytes and the default value is 4M (1/4 of the window size). + +The smaller the value, the smoother the sliding window moves, however it generates more messages. + +streaming_ack_wait +------------------ + +The number of seconds that the sender waits for the next ACK. +The default value is 10 seconds. + +This timeout is used to detect dead receivers. On a very slow network, this value may need to be increased. diff --git a/docs/user_guide/configurations/job_configuration.rst b/docs/user_guide/configurations/job_configuration.rst new file mode 100644 index 0000000000..d72e342c21 --- /dev/null +++ b/docs/user_guide/configurations/job_configuration.rst @@ -0,0 +1,143 @@ +.. _job_configuration: + +Predefined Job Configuration Variables +====================================== + +The following are predefined variables that can be configured in job config files. +The default values of these variables are usually good enough. However, you may change them to different values in some specific cases. + +Runner Sync +----------- + +When a job is deployed, dedicated job-specific processes are created throughout the system for the execution of the job. +Specifically, a dedicated server process is created to perform server-side logic; and dedicated client processes (one process for each site) are created to perform client-side logic. +This design allows multiple jobs to be running in their isolated space at the same time. The success or failure of a job won't interfere with the execution of other jobs. + +The task-based interactions between a FL client and the FL server is done with the ClientRunner on the client side and the ServerRunner on the server side. +When the job is deployed, the order of the job process creation is not guaranteed - the server-side job process may be started before or after any client-side job process. + +To ensure that the ClientRunner does not start to fetch tasks from the ServerRunner, the two runners need to be synchronized first. +Specifically, the ClientRunner keeps sending a "runner sync" request to the ServerRunner until a response is received. 
+ +The behavior of the "runner sync" process can be configured with two variables: + +runner_sync_timeout +^^^^^^^^^^^^^^^^^^^ + +This variable is for the client-side configuration (config_fed_client.json). + +This runner_sync_timeout specifies the timeout value for the "runner sync" request. +If a response is not received from the Server within this specified value, then another "runner sync" request will be sent. + +The default value is 2.0 seconds. + +max_runner_sync_tries +^^^^^^^^^^^^^^^^^^^^^ + +This variable is for the client-side configuration (config_fed_client.json). + +This variable specifies the max number of "runner sync" messages to be sent before receiving a response from the server. +If a response is still not received after this many tries, the client's job process will terminate. + +The default value is 30. + +The default settings of these two variables mean that if the ClientRunner and the ServerRunner are not synched within one minute, the client will terminate. +If one minute is not enough, you can extend these two variables to meet your requirement. + +Task Check +---------- + +After the client is finished with the assigned task, it will send the result to the server, and before sending the result, the client asks the server whether the task is still valid. +This is particularly useful when the result is large and the communication network is slow. If the task is no longer valid, then the client won't need to send the result any more. +The client keeps sending the "task check" request to the server until a response is received. + +The behavior of "task check" process can be configured with two variables: + +task_check_timeout +^^^^^^^^^^^^^^^^^^ + +This variable is for the client-side configuration (config_fed_client.json). + +This variable specifies the timeout value for the "task check" request. +If a response is not received from the Server within this specified value, then another "task check" request will be sent. + +The default value is 5.0 seconds. + +task_check_interval +^^^^^^^^^^^^^^^^^^^ + +This variable is for the client-side configuration (config_fed_client.json). + +This variable specifies how long to wait before sending another "task check" request if a response is not received from the server for the previous request. + +The default value is 5.0 seconds. + +Get Task +-------- + +The client sends the "get task" request to the server to get the next assigned task. +You can set the get_task_timeout variable to specify how long to wait for the response from the server. +If a response is not received from the server within the specified time, the client will try again. + +It is crucial to set this variable to a proper value. +If this value is too short for the server to deliver the response to the client in time, then the server may get repeated requests for the same task. +This can cause the server to run out of memory (since there could be many messages inflight to the same client). + +The default value of this variable is 30 seconds. You change its value by setting it in the config_fed_client.json: + +``get_task_timeout: 60.0`` + +Submit Task Result +------------------ + +The client submits the task result to the server after the task is completed. You can set the submit_task_result_timeout variable to specify how long to wait for the response from the server. If a response is not received from the server within the specified time, the client will try to send the result again until it succeeds. + +It is crucial to set this variable to a proper value. 
If this value is too short for the server to accept the result and deliver a response to the client in time, then the server may get repeated task results for the same task. This can cause the server to run out of memory (since there could be many messages coming to the server). + +The default value of this variable is 30 seconds. You change its value by setting it in the config_fed_client.json: + +``submit_task_result_timeout: 120.0`` + +Job Heartbeat +------------- + +A task could take the client a long time to finish. +During this time, there is no interaction between the client-side job process and the server-side job process. +In some network environments, this long-time silence could cause the underlying network to drop connections, which could cause some system functions to fail (e.g. any server-initiated messages may not be delivered to the client in a timely fashion). +To prevent this problem, the client's job process sends periodical heartbeats to the server. +The behavior of the heartbeat is controlled by: + +job_heartbeat_interval +^^^^^^^^^^^^^^^^^^^^^^ + +This variable is for the client-side configuration (config_fed_client.json). +This variable specifies how often to send a heartbeat message to the server. + +The default value is 30.0 seconds. You can tune this value up or down depending on your communication network's behavior. + +Graceful Job Completion +----------------------- + +Many components could be involved in the execution of a job. At the end of the job, all components should end gracefully. +For example, a stats report component may still have pending stats records to be processed when the job is done. +If the job process (server-side or client-side) is abruptly terminated when the job's workflow is done, then the pending records would be lost. + +To enable graceful completion of components, FLARE will fire the ``EventType.CHECK_END_RUN_READINESS event``. +A component that may have pending tasks can listen to this event and indicate whether it is ready to end. +FLARE will repeat the event until all components are ready to end; or until a configured max time is reached. + +end_run_readiness_timeout +^^^^^^^^^^^^^^^^^^^^^^^^^ + +This variable is for both the server-side (config_fed_server.json) and client-side configuration (config_fed_client.json). +This variable specifies the max time to wait for all components to become ready to end. + +The default value is 5.0 seconds + +end_run_readiness_check_interval +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +This variable is for both the server-side (config_fed_server.json) and client-side configuration (config_fed_client.json). +This variable specifies how long to wait before checking component readiness again. + +The default value is 0.5 seconds. \ No newline at end of file diff --git a/docs/user_guide/logging_configuration.rst b/docs/user_guide/configurations/logging_configuration.rst similarity index 100% rename from docs/user_guide/logging_configuration.rst rename to docs/user_guide/configurations/logging_configuration.rst diff --git a/docs/user_guide/configurations/variable_resolution.rst b/docs/user_guide/configurations/variable_resolution.rst new file mode 100644 index 0000000000..22344321fe --- /dev/null +++ b/docs/user_guide/configurations/variable_resolution.rst @@ -0,0 +1,335 @@ +.. _variable_resolution: + +Variable Resolution in Job Configuration +======================================== + +FLARE jobs are defined with configuration files: ``config_fed_client.json`` and ``config_fed_server.json``. 
+These two files configure the components (Python objects) used for the server process and the FL client processes. +The component configuration includes information about the class path of the Python object, and arguments for the object's constructor. +The configuration files are processed at the beginning of the server/client job processes to create those components. + +Here is a typical example of a job configuration: + +.. code-block:: json + + { + "format_version": 2, + "executors": [ + { + "tasks": [ + "train" + ], + "executor": { + "path": "nvflare.app_common.np.np_trainer.NPTrainer", + "args": { + "sleep_time": 1.5, + "model_dir": "model" + } + } + } + ], + "task_result_filters": [], + "task_data_filters": [], + "components": [ + ] + } + +As shown in the example above, the ``executor`` component has two args (sleep_time and model_dir) and both are specified explicitly. + +Variable Resolution +------------------- + +Sometimes, users want to experiment with different arg values of the component, and want to manage those experimental args in a common place (e.g. beginning of the config file) instead of searching for the args from the file to modify them. +This is particularly true if the user has multiple components to experiment with. + +FLARE makes this possible with a mechanism called Variable Resolution. +Instead of hard-coding values for each config arg, users can simply use a Variable Reference as the value of the arg, and then define the value of the variable in a separate place (e.g. beginning of the config file). + +The following shows the configuration of the above example using variable resolution: + +.. code-block:: json + + { + "format_version": 2, + "result_dir": "result", + "sleep_time": 1.5, + "executors": [ + { + "tasks": [ + "train" + ], + "executor": { + "path": "nvflare.app_common.np.np_trainer.NPTrainer", + "args": { + "sleep_time": "{sleep_time}", + "model_dir": "{result_dir}" + } + } + } + ], + "task_result_filters": [], + "task_data_filters": [], + "components": [ + ] + } + + +As you can see from the example, the Variable Definition (Var Def) is a simple JSON element that defines a value for a Variable Name (Var Name). +The Variable Reference (Var Ref) is a string that embeds the referenced Variable Name within curly brackets: ``{VarName}``. + +A var ref can be used within a string with other information. +For example, you could define the ``model_dir`` arg to include a prefix: +``/tmp/fl_work/{result_dir}`` + +You could reference multiple variables in one arg value: +``{root_dir}/{result_dir}`` + +If the arg value contains nothing but a single var ref, it is called a Simple Var Ref (SVR). +Other uses, such as var ref with other info, or multiple var refs, are called Complex Var Ref (CVR). +There is an important difference between a SVR and a CVR when the ref is resolved to compute the arg value: +a SVR will be resolved to its true type of the corresponding variable definition; whereas a CVR is always resolved into a string with the values of the referenced variables. +The SVR can reference both primitive variables (number, boolean, string) and non-primitives (list and dict), whereas you can only use primitive variables with a CVR! + +Predefined System Variables +--------------------------- + +Referenced variables must be defined. For user-defined variables, usually users define them somewhere in the config file (e.g. at the beginning of the file) as first-level elements, as shown in the above example. 
+ +FLARE predefined the following System Variables that are also available for you to use in the job config: + +- SITE_NAME - the name of the site (server ot FL client) +- WORKSPACE - the directory of the site's workspace +- JOB_ID - Job ID +- ROOT_URL - the url for connecting to the FL server +- SECURE_MODE - whether the communication is in secure mode + +Note that system variables are named in UPPERCASE letters. To avoid potential name conflict between user-defined variables and system variables, please name all user-defined variables with lowercase letters. + +The next example will show the use of system variables in CellPipe configuration. + +OS Environment Variables +------------------------ + +OS environment variables can be referenced in job configuration via the dollar sign: + +``{$EnvVarName}`` + +With this, you can make your job config controlled by OS environment variables. +For example, you can use an environment variable (e.g. NVFLARE_MODEL_DIR) to specify where the trained model will be stored such that system operators can change the model location without needing to change job configurations. +Note that if a variable with the name ``$VarName`` is already defined in the job config, then this definition takes precedence over the corresponding OS environment variable, if any. + +The following example shows how to use an OS environment variable to control the location of model_dir: + +.. code-block:: json + + { + "format_version": 2, + "executors": [ + { + "tasks": [ + "train" + ], + "executor": { + "path": "nvflare.app_common.np.np_trainer.NPTrainer", + "args": { + "model_dir": "{$NVFLARE_MODEL_DIR}" + } + } + } + ], + "task_result_filters": [], + "task_data_filters": [], + "components": [ + ] + } + +Just like any other var definitions, OS environment variables can be referenced in both SVR and CVR. + +Parameterized Variable Definitions +---------------------------------- + +Before discussing this advanced topic, let's first show an example of job configuration that does not use this technique for comparison: + +.. code-block:: json + + { + "format_version": 2, + "pipe_token": "pipe_123", + "executors": [ + { + "tasks": [ + "train" + ], + "executor": { + "path": "nvflare.app_common.executors.task_exchanger.TaskExchanger", + "args": { + "pipe_id": "task_pipe" + } + } + } + ], + "task_result_filters": [], + "task_data_filters": [], + "components": [ + { + "id": "task_pipe", + "path": "nvflare.fuel.utils.pipe.cell_pipe.CellPipe", + "args": { + "mode": "passive", + "site_name": "{SITE_NAME}", + "token": "{pipe_token}", + "root_url": "{ROOT_URL}", + "secure_mode": "{SECURE_MODE}", + "workspace_dir": "{WORKSPACE}" + } + }, + { + "id": "metric_pipe", + "path": "nvflare.fuel.utils.pipe.cell_pipe.CellPipe", + "args": { + "mode": "passive", + "site_name": "{SITE_NAME}", + "token": "{pipe_token}", + "root_url": "{ROOT_URL}", + "secure_mode": "{SECURE_MODE}", + "workspace_dir": "{WORKSPACE}" + } + }, + { + "id": "metric_receiver", + "path": "nvflare.widgets.metric_receiver.MetricReceiver", + "args": { + "pipe_id": "metric_pipe" + } + } + ] + } + + +This job requires two pipes, one for task exchange (task_pipe), another for metrics collection (metric_pipe). +If you look at their configuration closely, you will see that: there are many args to configure, and the configs of the two pipes are identical except for their ``id`` values. It is tedious and error-prone to configure many args in multiple places. + +One way to improve is to make use of SVR for the args of the two pipes: + +.. 
code-block:: json + + { + "format_version": 2, + "pipe_token": "pipe_123", + "executors": [ + { + "tasks": [ + "train" + ], + "executor": { + "path": "nvflare.app_common.executors.task_exchanger.TaskExchanger", + "args": { + "pipe_id": "task_pipe" + } + } + } + ], + "task_result_filters": [], + "task_data_filters": [], + "pipe_args": { + "mode": "passive", + "site_name": "{SITE_NAME}", + "token": "{pipe_token}", + "root_url": "{ROOT_URL}", + "secure_mode": "{SECURE_MODE}", + "workspace_dir": "{WORKSPACE}" + }, + "components": [ + { + "id": "task_pipe", + "path": "nvflare.fuel.utils.pipe.cell_pipe.CellPipe", + "args": "{pipe_args}" + }, + { + "id": "metric_pipe", + "path": "nvflare.fuel.utils.pipe.cell_pipe.CellPipe", + "args": "{pipe_args}" + }, + { + "id": "metric_receiver", + "path": "nvflare.widgets.metric_receiver.MetricReceiver", + "args": { + "pipe_id": "metric_pipe" + } + } + ] + } + +In this version of the example, the args for the two pipes are moved into the var def ``pipe_args``, and the components' ``args`` simply reference the var def. +This is better than the original version, but the path of the two pipes still must be repeated for both components. + +Using Parameterized Variable Definition, we can further improve it: + +.. code-block:: json + + { + "format_version": 2, + "pipe_token": "pipe_123", + "executors": [ + { + "tasks": [ + "train" + ], + "executor": { + "path": "nvflare.app_common.executors.task_exchanger.TaskExchanger", + "args": { + "pipe_id": "task_pipe" + } + } + } + ], + "task_result_filters": [], + "task_data_filters": [], + "@pipe_def": { + "id": "{pipe_id}", + "path": "nvflare.fuel.utils.pipe.cell_pipe.CellPipe", + "args": { + "mode": "passive", + "site_name": "{SITE_NAME}", + "token": "{pipe_token}", + "root_url": "{ROOT_URL}", + "secure_mode": "{SECURE_MODE}", + "workspace_dir": "{WORKSPACE}" + } + }, + "components": [ + "{@pipe_def:pipe_id=task_pipe}", + "{@pipe_def:pipe_id=metric_pipe}", + { + "id": "metric_receiver", + "path": "nvflare.widgets.metric_receiver.MetricReceiver", + "args": { + "pipe_id": "metric_pipe" + } + } + ] + } + +As you can see here, ``@pipe_def`` is a parameterized variable definition (PVD). +The name of a PVD must start with the ``@`` sign. The PVD is usually defined with references to other variables, and the values can be provided at the time the PVD is referenced. +In this example, the ``@pipe_def`` PVD defines a pipe configuration template that can be resolved to a concrete pipe config. +In the ``components`` section, this PVD is used for the config of the two pipes: task_pipe and metric_pipe. + +A PVD can only be referenced with SVR (simple variable reference). +To reference a PVD, you provide values for any variables in the PVD. +In this example, the ``pipe_id`` is the variable that takes two different values for the two different pipes. + +The reference to a PVD is in this general format: + +``{PvdName:N1=V1:N2=V2:...}`` + +The PvdName is the name of the PVD. +You supply the value of each variable in the PVD using N=V, where N is the name of the variable, and V is the value. +Note that the V can even reference other variables! + +Note that if there is a value defined for N outside of the reference, the supplied value in the reference takes precedence. 
+For example, if your reference supplied a value for ``pipe_token``, then the value you supplied will take precedence over the one defined at the beginning of the file: + +``"{@pipe_def:pipe_id=task_pipe:pipe_token=pipe_789}"`` + +In this case, the value of the ``pipe_token`` when creating the pipe ``task_pipe`` will be ``pipe_789``, instead of ``pipe_123`` as defined at the beginning of the file. \ No newline at end of file diff --git a/docs/user_guide/dashboard_ui.rst b/docs/user_guide/dashboard_ui.rst index e9fe31e413..0525b4e624 100644 --- a/docs/user_guide/dashboard_ui.rst +++ b/docs/user_guide/dashboard_ui.rst @@ -24,7 +24,7 @@ for the server(s) and overseer (if :ref:`HA mode ` is enabled Member and Lead User Experience =============================== -The website should alredy be set up when ``Member`` or ``Lead`` users are invited to sign up by the project admin. +The website should already be set up when ``Member`` or ``Lead`` users are invited to sign up by the project admin. .. _dashboard_homepage: diff --git a/docs/user_guide/federated_xgboost.rst b/docs/user_guide/federated_xgboost.rst new file mode 100644 index 0000000000..4f38f0587b --- /dev/null +++ b/docs/user_guide/federated_xgboost.rst @@ -0,0 +1,28 @@ +############################## +Federated XGBoost with NVFlare +############################## + +XGBoost (https://github.com/dmlc/xgboost) is an open-source project that +implements machine learning algorithms under the Gradient Boosting framework. +It is an optimized distributed gradient boosting library designed to be highly +efficient, flexible and portable. +This implementation uses MPI (message passing interface) for client +communication and synchronization. + +MPI requires the underlying communication network to be perfect - a single +message drop causes the training to fail. + +This is usually achieved via a highly reliable special-purpose network like NCCL. + +The open-source XGBoost supports federated paradigm, where clients are in different +locations and communicate with each other with gRPC over internet connections. + +We introduce federated XGBoost with NVFlare for a more reliable federated setup. + +.. toctree:: + :maxdepth: 1 + + federated_xgboost/reliable_xgboost_design + federated_xgboost/reliable_xgboost_timeout + federated_xgboost/secure_xgboost_design + federated_xgboost/secure_xgboost_user_guide diff --git a/docs/user_guide/federated_xgboost/reliable_xgboost_design.rst b/docs/user_guide/federated_xgboost/reliable_xgboost_design.rst new file mode 100644 index 0000000000..99747dcd11 --- /dev/null +++ b/docs/user_guide/federated_xgboost/reliable_xgboost_design.rst @@ -0,0 +1,65 @@ +################################# +Reliable Federated XGBoost Design +################################# + + +************************* +Flare as XGBoost Launcher +************************* + +NVFLARE serves as a launchpad to start the XGBoost system. +Once started, the XGBoost system runs independently of FLARE, +as illustrated in the following figure. + +.. figure:: ../../resources/loose_xgb.png + :height: 500px + +There are a few potential problems with this approach: + + - As we know, MPI requires a perfect communication network, + whereas the simple gRPC over the internet could be unstable. + + - For each job, the XGBoost Server must open a port for clients to connect to. + This adds burden to request IT for the additional port in the real-world situation. 
+ Even if a fixed port is allowed to open, and we reuse that port, + multiple XGBoost jobs can not be run at the same time, + since each XGBoost job requires a different port number. + + +***************************** +Flare as XGBoost Communicator +***************************** + +FLARE provides a highly flexible, scalable and reliable communication mechanism. +We enhance the reliability of federated XGBoost by using FLARE as the communicator of XGBoost, +as shown here: + +.. figure:: ../../resources/tight_xgb.png + :height: 500px + +Detailed Design +=============== + +The open-source Federated XGBoost (c++) uses gRPC as the communication protocol. +To use FLARE as the communicator, we simply route XGBoost's gRPC messages through FLARE. +To do so, we change the server endpoint of each XGBoost client to a local gRPC server +(LGS) within the FLARE client. + +.. figure:: ../../resources/fed_xgb_detail.png + :height: 500px + +As shown in this diagram, there is a local GRPC server (LGS) for each site +that serves as the server endpoint for the XGBoost client on the site. +Similarly, there is a local GRPC Client (LGC) on the FL Server that +interacts with the XGBoost Server. The message path between the XGBoost Client and +the XGBoost Server is as follows: + + 1. The XGBoost client generates a gRPC message and sends it to the LGS in FLARE Client + 2. FLARE Client forwards the message to the FLARE Server. This is a reliable FLARE message. + 3. FLARE Server uses the LGC to send the message to the XGBoost Server. + 4. XGBoost Server sends the response back to the LGC in FLARE Server. + 5. FLARE Server sends the response back to the FLARE Client. + 6. FLARE Client sends the response back to the XGBoost Client via the LGS. + +Please note that the XGBoost Client (c++) component could be running as a separate process +or within the same process of FLARE Client. diff --git a/docs/user_guide/federated_xgboost/reliable_xgboost_timeout.rst b/docs/user_guide/federated_xgboost/reliable_xgboost_timeout.rst new file mode 100644 index 0000000000..efecba5647 --- /dev/null +++ b/docs/user_guide/federated_xgboost/reliable_xgboost_timeout.rst @@ -0,0 +1,96 @@ +.. _reliable_xgboost_timeout: + +############################################ +Reliable Federated XGBoost Timeout Mechanism +############################################ + +NVFlare introduces a tightly-coupled integration between XGBoost and NVFlare. +NVFlare implements the :class:`ReliableMessage` +mechanism to make XGBoost's server/client interactions more robust over +unstable internet connections. + +Unstable internet connection is the situation where the connections between +the communication endpoints have random disconnects/reconnects and unstable speed. +It is not meant to be an extended internet outage. + +ReliableMessage does not mean guaranteed delivery. +It only means that it will try its best to deliver the message to the peer. +If one attempt fails, it will keep trying until either the message is +successfully delivered or a specified "transaction timeout" is reached. + +***************** +Timeout Mechanism +***************** + +In runtime, the FLARE System is configured with a few important timeout parameters. + +ReliableMessage Timeout +======================= + +There are two timeout values to control the behavior of ReliableMessage (RM). + +Per-message Timeout +------------------- + +Essentially RM tries to resend the message until delivered successfully. +Each resend of the message requires a timeout value. 
+This value should be defined based on the message size, overall network speed,
+and the amount of time needed to process the message in a normal situation.
+For example, if an XGBoost message takes no more than 5 seconds to be
+sent, processed, and replied to, then the per-message timeout should be set to 5 seconds.
+
+.. note::
+
+    Note that the initial XGBoost message might take more than 100 seconds,
+    depending on the dataset size.
+
+Transaction Timeout
+-------------------
+
+This value defines how long you want RM to keep retrying until done, in case
+of an unstable connection.
+This value should be defined based on the overall stability of the connection,
+the nature of the connection, and how quickly the connection is restored.
+For occasional connection glitches, this value does not have to be very big
+(e.g. 20 seconds).
+However, if the outage can be long (say 60 seconds or longer), then this value
+should be set large enough to cover the expected outage.
+
+.. note::
+
+    Note that even if you think the connection is restored (e.g. you replugged
+    the internet cable or reactivated Wi-Fi), the underlying connection
+    layer may take much longer to actually restore connections (e.g. up to
+    a few minutes)!
+
+.. note::
+
+    Note: if the transaction timeout is <= per-message timeout, then the
+    message will be sent through simple messaging - no retry will be done
+    in case of failure.
+
+XGBoost Client Operation Timeout
+================================
+
+To prevent an XGBoost client from running forever, the XGBoost/FLARE
+integration lets you define a parameter (max_client_op_interval) on the
+server side to control the max amount of time permitted for a client to be
+silent (i.e. no messages sent to the server).
+The default value of this parameter is 900 seconds, meaning that if no XGB
+message is received from the client for over 900 seconds, then that client
+is considered dead, and the whole job is aborted.
+
+***************************
+Configure Timeouts Properly
+***************************
+
+These timeout values are related. For example, if the transaction timeout
+is greater than the server timeout, then it won't be that effective since
+the server will treat the client as dead once the server timeout is reached
+anyway. Similarly, it does not make sense to have the transaction timeout > the XGBoost
+client op timeout.
+
+In general, follow this rule:
+
+Per-message Timeout < Transaction Timeout < XGBoost Client Operation Timeout
diff --git a/docs/user_guide/federated_xgboost/secure_xgboost_design.rst b/docs/user_guide/federated_xgboost/secure_xgboost_design.rst
new file mode 100644
index 0000000000..6f69255f59
--- /dev/null
+++ b/docs/user_guide/federated_xgboost/secure_xgboost_design.rst
@@ -0,0 +1,85 @@
+###############################
+Secure Federated XGBoost Design
+###############################
+
+Collaboration Modes and Secure Patterns
+=======================================
+
+Horizontal Secure
+-----------------
+
+For horizontal XGBoost, each party holds "equal status" - the whole feature set and labels for a part of the population - while the federated server performs aggregation without owning any data.
+Hence in this case, the federated server is the "minor contributor" from the model training perspective, and the clients are concerned about leaking information to the server.
+Under this setting, the protection is mainly against the federated server over local histograms.
+
+To protect the local histograms for horizontal collaboration, the local histograms will be encrypted before being sent to the federated server for aggregation.
+The aggregation will then be performed over ciphertexts, and the encrypted global histograms will be returned to the clients, where they will be decrypted and used for tree building.
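+
+To make this pattern concrete, below is a small, illustrative sketch of ciphertext aggregation using
+the TenSEAL CKKS scheme (the scheme used for horizontal secure training in the user guide). It is not
+the actual NVFlare/XGBoost plugin code; for simplicity a single context is used here, whereas in the
+real system the federated server never holds the secret key.
+
+.. code-block:: python
+
+    import tenseal as ts
+
+    # Illustrative CKKS context, similar to the one provisioned by HEBuilder.
+    ctx = ts.context(ts.SCHEME_TYPE.CKKS, poly_modulus_degree=8192, coeff_mod_bit_sizes=[60, 40, 40])
+    ctx.global_scale = 2 ** 40
+    ctx.generate_galois_keys()
+
+    # Each client encrypts its local histogram before sending it to the federated server.
+    site_1_hist = ts.ckks_vector(ctx, [1.0, 2.0, 3.0])
+    site_2_hist = ts.ckks_vector(ctx, [0.5, 1.5, 2.5])
+
+    # The server aggregates ciphertexts without ever seeing the plaintext histograms.
+    global_hist_enc = site_1_hist + site_2_hist
+
+    # Clients decrypt the aggregated histogram and use it for tree building.
+    global_hist = global_hist_enc.decrypt()  # approximately [1.5, 3.5, 5.5]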
+
+Vertical Secure
+---------------
+
+For vertical XGBoost, the active party holds the label, which cannot be accessed by passive parties and can be considered the most valuable asset for the whole process.
+Therefore, the active party in this case is the "major contributor" from the model training perspective, and it is concerned about leaking this information to the passive clients.
+In this case, the security protection is mainly against the passive clients over the label information.
+
+To protect the label information for vertical collaboration, at every round of XGBoost, the gradients computed by the active party for each sample are encrypted before being sent to the passive parties.
+Upon receiving the encrypted gradients (ciphertext), each passive party accumulates them according to its local feature distributions.
+The resulting cumulative histograms are returned to the active party, decrypted, and then used for tree building at the active party.
+
+Decoupled Encryption with Processor Interface
+=============================================
+
+In our current design, XGBoost communication is routed through the NVIDIA FLARE Communicator layer via local gRPC handlers.
+From the communication perspective, the messages previously exchanged directly within XGBoost are now handled by the FL communicator - they become "external communications" to and from XGBoost via the FL system.
+This gives us the flexibility to perform message operations both within XGBoost (before entering the FL communicator) and within the FL system (by the FL communicator).
+
+.. figure:: ../../resources/xgb_communicator.jpg
+    :height: 500px
+
+With NVFlare, the XGBoost plugin will be implemented in C++, while the FL system communicator will be implemented in Python. A processor interface is designed and developed to properly connect the two by taking plugins implemented for a specific HE method and collaboration mode:
+
+.. figure:: ../../resources/processor_interface_design.png
+    :height: 500px
+
+Processor Interface Design
+
+    1. Upon receiving specific MPI calls from XGBoost, each corresponding party calls the processor interface for data processing (serialization, etc.), providing the necessary information: g/h pairs, or local G/H histograms
+    2. The processor interface performs the necessary processing (and encryption), and sends the results back as a processed buffer
+    3. Each party then forwards the message to the local gRPC handler on the FL system side
+    4. After FL communication involving message routing and computation, each party receives the result buffer upon MPI calls
+    5. Each FL party then sends the received buffer to the processor interface for interpretation
+    6. The interface performs the necessary processing (deserialization, etc.), recovers the proper information, and sends the result back to XGBoost for further computation
+
+
+Note that encryption/decryption can be performed either by the processor interface (C++) or at the local gRPC handler (Python), depending on the particular HE library and scheme being used.
+
+System Design
+=============
+With the secure solutions, communication patterns, and processor interface described above, we provide example designs below for secure federated XGBoost - both vertical and horizontal.
+
+For the vertical pipeline:
+
+    1. The active party first computes g/h with the label information it owns
+    2. The g/h data will be sent to the processor interface, encrypted with the C++-based encryption util library, and sent to the passive parties via FL communication
+    3. Each passive party provides indexing information for histogram computation according to its local feature distributions, and the processor interface performs aggregation with the received E(g/h)
+    4. The resulting E(G/H) will be sent to the active party via FL message routing
+    5. The result is decrypted by the processor interface on the active party side, and tree building can be performed with the global histogram information
+
+.. figure:: ../../resources/secure_vertical_xgb.png
+    :height: 500px
+
+Secure Vertical Federated XGBoost with XGBoost-side Encryption
+In this case, the "heavy-lifting" jobs - encryption, secure aggregation, etc. - are done by the processor interface.
+
+For the horizontal pipeline:
+
+    1. All parties send their local G/H histograms to the FL side via the processor interface; in this design the processor interface only performs buffer preparation without any complicated processing steps
+    2. Before being sent to the federated server, the G/H histograms will be encrypted at the local gRPC handler with the Python-based encryption util library
+    3. The federated server will perform secure aggregation over the received partial E(G/H) and distribute the global E(G/H) to each client, where the global histograms will be decrypted and used for further tree building
+
+.. figure:: ../../resources/secure_horizontal_xgb.png
+    :height: 500px
+
+Secure Horizontal Federated XGBoost with FL-side Encryption
+In this case, the encryption is done on the FL system side.
+
diff --git a/docs/user_guide/federated_xgboost/secure_xgboost_user_guide.rst b/docs/user_guide/federated_xgboost/secure_xgboost_user_guide.rst
new file mode 100644
index 0000000000..51e0e9fb52
--- /dev/null
+++ b/docs/user_guide/federated_xgboost/secure_xgboost_user_guide.rst
@@ -0,0 +1,504 @@
+##########################
+NVFlare XGBoost User Guide
+##########################
+
+Overview
+========
+NVFlare supports federated training with XGBoost. It provides the following advantages over doing the training natively with XGBoost:
+
+- Secure training with Homomorphic Encryption (HE)
+- Lifecycle management of XGBoost processes
+- Reliable messaging, which can overcome network glitches
+- Training over complicated networks with relays
+
+It supports federated training in the following 4 modes:
+
+1. Row split without encryption
+2. Column split without encryption
+3. Row split with HE (Requires at least 3 clients. With 2 clients, the other client's histogram can be deduced.)
+4. Column split with HE
+
+When running with NVFlare, all the gRPC connections in XGBoost are local and the messages are forwarded to other clients through NVFlare's CellNet communication.
+The local gRPC ports are selected automatically by NVFlare.
+
+The encryption is handled in XGBoost by encryption plugins, which are external components that can be installed at runtime.
+
+Prerequisites
+=============
+Required Python Packages
+------------------------
+
+NVFlare 2.5.0 or above,
+
+.. code-block:: bash
+
+    pip install nvflare~=2.5.0
+
+XGBoost 2.2 or above, which can be installed from the binary build using this command,
+
+.. code-block:: bash
+
+    pip install https://s3-us-west-2.amazonaws.com/xgboost-nightly-builds/federated-secure/xgboost-2.2.0.dev0%2B4601688195708f7c31fcceeb0e0ac735e7311e61-py3-none-manylinux_2_28_x86_64.whl
+
+or, in case you need to get the most current build of XGBoost,
+
+.. code-block:: bash
+
+    pip install https://s3-us-west-2.amazonaws.com/xgboost-nightly-builds/federated-secure/`curl -s https://s3-us-west-2.amazonaws.com/xgboost-nightly-builds/federated-secure/meta.json | grep -o 'xgboost-2\.2.*whl'|sed -e 's/+/%2B/'`
+
+The ``TenSEAL`` package is needed for horizontal secure training,
+
+.. code-block:: bash
+
+    pip install tenseal
+
+The ``ipcl_python`` package is required for vertical secure training if the **nvflare** plugin is used. This package is not needed if the **cuda_paillier** plugin is used.
+
+.. code-block:: bash
+
+    pip install ipcl-python
+
+This package is only available for Python 3.8 on PyPI. For other versions of Python, it needs to be installed from GitHub,
+
+.. code-block:: bash
+
+    pip install git+https://github.com/intel/pailliercryptolib_python.git@development
+
+System Environments
+-------------------
+To support secure training, several homomorphic encryption libraries are used. Those libraries require an Intel CPU or an NVIDIA GPU.
+
+Linux is the preferred OS. It's tested extensively under Ubuntu 22.04.
+
+The following docker image is recommended for GPU training:
+
+::
+
+    nvcr.io/nvidia/pytorch:24.03-py3
+
+.. _xgb_provisioning:
+
+NVFlare Provisioning
+--------------------
+For horizontal secure training, the NVFlare system must be provisioned with a homomorphic encryption context. The HEBuilder in ``project.yml`` is used to achieve this.
+An example configuration can be found at :github_nvflare_link:`secure_project.yml `.
+
+This is a snippet of the ``secure_project.yml`` file with the HEBuilder:
+
+.. code-block:: yaml
+
+    api_version: 3
+    name: secure_project
+    description: NVIDIA FLARE sample project yaml file for CIFAR-10 example
+
+    participants:
+
+      ...
+
+    builders:
+      - path: nvflare.lighter.impl.workspace.WorkspaceBuilder
+        args:
+          template_file: master_template.yml
+      - path: nvflare.lighter.impl.template.TemplateBuilder
+      - path: nvflare.lighter.impl.static_file.StaticFileBuilder
+        args:
+          config_folder: config
+          overseer_agent:
+            path: nvflare.ha.dummy_overseer_agent.DummyOverseerAgent
+            overseer_exists: false
+            args:
+              sp_end_point: localhost:8102:8103
+              heartbeat_interval: 6
+      - path: nvflare.lighter.impl.he.HEBuilder
+        args:
+          poly_modulus_degree: 8192
+          coeff_mod_bit_sizes: [60, 40, 40]
+          scale_bits: 40
+          scheme: CKKS
+      - path: nvflare.lighter.impl.cert.CertBuilder
+      - path: nvflare.lighter.impl.signature.SignatureBuilder
+
+
+Data Preparation
+================
+Data must be properly formatted for federated XGBoost training based on the split mode (row or column).
+
+For horizontal (row-split) training, the datasets on all clients must share the same columns.
+
+For vertical (column-split) training, the datasets on all clients contain different columns, but must share overlapping rows. For more details on vertical split preprocessing, refer to the :github_nvflare_link:`Vertical XGBoost Example `.
+
+XGBoost Plugin Configuration
+============================
+XGBoost requires an encryption plugin to handle secure training.
+
+- **cuda_paillier**: The default plugin. This plugin uses the GPU for cryptographic operations.
+- **nvflare**: This plugin forwards data locally to the NVFlare process for encryption.
+
+.. note::
+
+    All clients must use the same plugin. When different plugins are used in different clients,
+    the behavior of federated XGBoost is undetermined, which can cause the job to crash.
+
+The **cuda_paillier** plugin requires NVIDIA GPUs that support compute capability 7.0 or higher. Also, CUDA 12.2 or 12.4 must be installed.
+Please refer to https://developer.nvidia.com/cuda-gpus for more information.
+
+The two included plugins differ only in vertical secure training. For horizontal secure training, both
+plugins work exactly the same by forwarding the data to NVFlare for encryption.
+
+Here are the plugin configurations needed for each training mode.
+
+Vertical (Non-secure)
+---------------------
+No plugin is needed.
+
+Horizontal (Non-secure)
+-----------------------
+No plugin is needed.
+
+Vertical Secure
+---------------
+Both plugins can be used for vertical secure training.
+
+The default cuda_paillier plugin is preferred because it uses the GPU for faster cryptographic operations.
+
+.. note::
+
+    The **cuda_paillier** plugin requires NVIDIA GPUs that support compute capability 7.0 or higher. Please refer to https://developer.nvidia.com/cuda-gpus for more information.
+
+If you see the following errors in the log, it means either no GPU is detected or the GPU does not meet the requirements:
+
+::
+
+    CUDA runtime API error no kernel image is available for execution on the device at line 241 in file /my_home/nvflare-internal/processor/src/cuda-plugin/paillier.h
+    2024-07-01 12:19:15,683 - SimulatorClientRunner - ERROR - run_client_thread error: EOFError:
+
+
+In this case, the nvflare plugin can be used to perform encryption on CPUs, which requires the ipcl-python package.
+The plugin can be configured in the ``local/resources.json`` file on clients:
+
+.. code-block:: json
+
+    {
+        "federated_plugin": {
+            "name": "nvflare",
+            "path": "/tmp/libnvflare.so"
+        }
+    }
+
+Here, **name** is the plugin name and **path** is the full path of the plugin, including the library file name.
+The **path** is optional; the default is the library distributed with NVFlare for the plugin.
+
+The following environment variables can be used to override the values in the JSON,
+
+.. code-block:: bash
+
+    export NVFLARE_XGB_PLUGIN_NAME=nvflare
+    export NVFLARE_XGB_PLUGIN_PATH=/tmp/libnvflare.so
+
+.. note::
+
+    When running with the NVFlare simulator, the plugin must be configured using environment variables,
+    as it does not support resources.json.
+
+Horizontal Secure
+-----------------
+The plugin setup is the same as for vertical secure training.
+
+This mode requires the TenSEAL package for all plugins.
+The provisioning of NVFlare systems must include the TenSEAL context.
+See :ref:`xgb_provisioning` for details.
+
+For the simulator, the TenSEAL context generated by provisioning needs to be copied to the startup folder,
+
+``simulator_workspace/startup/client_context.tenseal``
+
+For example,
+
+.. code-block:: bash
+
+    nvflare provision -p secure_project.yml -w /tmp/poc_workspace
+    mkdir -p /tmp/simulator_workspace/startup
+    cp /tmp/poc_workspace/example_project/prod_00/site-1/startup/client_context.tenseal /tmp/simulator_workspace/startup
+
+The server_context.tenseal file is not needed.
+
+Building Encryption Plugins
+===========================
+
+The plugins need to be built from the source code for your specific environment.
+
+To build the plugins, check out the NVFlare source code from https://github.com/NVIDIA/NVFlare and follow the
+instructions in :github_nvflare_link:`this document. `
+
+Job Configuration
+=================
+.. 
_secure_xgboost_controller: + +Controller +---------- + +On the server side, the following controller must be configured in workflows, + +``nvflare.app_opt.xgboost.histogram_based_v2.fed_controller.XGBFedController`` + +Even though the XGBoost training is performed on clients, the parameters are configured on the server so all clients share the same configuration. +XGBoost parameters are defined here, https://xgboost.readthedocs.io/en/stable/python/python_intro.html#setting-parameters + +- **num_rounds**: Number of training rounds. +- **data_split_mode**: Same as XGBoost data_split_mode parameter, 0 for row-split, 1 for column-split. +- **secure_training**: If true, XGBoost will train in secure mode using the plugin. +- **xgb_params**: The training parameters defined in this dict are passed to XGBoost as **params**, the boost paramter. +- **xgb_options**: This dict contains other optional parameters passed to XGBoost. Currently, only **early_stopping_rounds** is supported. +- **client_ranks**: A dict that maps client name to rank. + +Executor +-------- + +On the client side, the following executor must be configured in executors, + +``nvflare.app_opt.xgboost.histogram_based_v2.fed_executor.FedXGBHistogramExecutor`` + +Only one parameter is required for executor, + +- **data_loader_id**: The component ID of Data Loader + +Data Loader +----------- + +On the client side, a data loader must be configured in the components. The CSVDataLoader can be used if the data is pre-processed. For example, + +.. code-block:: json + + { + "id": "dataloader", + "path": "nvflare.app_opt.xgboost.histogram_based_v2.csv_data_loader.CSVDataLoader", + "args": { + "folder": "/opt/dataset/vertical_xgb_data" + } + } + + +If the data requires any special processing, a custom loader can be implemented. The loader must implement the XGBDataLoader interface. + + +Job Example +=========== + +Vertical Training +----------------- + +Here are the configuration files for a vertical secure training job. If encryption is not needed, just change the ``secure_training`` arg to false. + +.. code-block:: json + + :caption: config_fed_server.json + + { + "format_version": 2, + "num_rounds": 3, + "workflows": [ + { + "id": "xgb_controller", + "path": "nvflare.app_opt.xgboost.histogram_based_v2.fed_controller.XGBFedController", + "args": { + "num_rounds": "{num_rounds}", + "data_split_mode": 1, + "secure_training": true, + "xgb_options": { + "early_stopping_rounds": 2 + }, + "xgb_params": { + "max_depth": 3, + "eta": 0.1, + "objective": "binary:logistic", + "eval_metric": "auc", + "tree_method": "hist", + "nthread": 1 + }, + "client_ranks": { + "site-1": 0, + "site-2": 1 + } + } + } + ] + } + + + +.. code-block:: json + + :caption: config_fed_client.json + + { + "format_version": 2, + "executors": [ + { + "tasks": [ + "config", + "start" + ], + "executor": { + "id": "Executor", + "path": "nvflare.app_opt.xgboost.histogram_based_v2.fed_executor.FedXGBHistogramExecutor", + "args": { + "data_loader_id": "dataloader" + } + } + } + ], + "components": [ + { + "id": "dataloader", + "path": "nvflare.app_opt.xgboost.histogram_based_v2.csv_data_loader.CSVDataLoader", + "args": { + "folder": "/opt/dataset/vertical_xgb_data" + } + } + ] + } + + +Horizontal Training +------------------- + +The configuration for horizontal training is the same as vertical except ``data_split_mode`` is 0 and the data loader must point to horizontal split data. + +.. 
code-block:: json + :caption: config_fed_server.json + + { + "format_version": 2, + "num_rounds": 3, + "workflows": [ + { + "id": "xgb_controller", + "path": "nvflare.app_opt.xgboost.histogram_based_v2.fed_controller.XGBFedController", + "args": { + "num_rounds": "{num_rounds}", + "data_split_mode": 0, + "secure_training": true, + "xgb_options": { + "early_stopping_rounds": 2 + }, + "xgb_params": { + "max_depth": 3, + "eta": 0.1, + "objective": "binary:logistic", + "eval_metric": "auc", + "tree_method": "hist", + "nthread": 1 + }, + "client_ranks": { + "site-1": 0, + "site-2": 1 + }, + "in_process": true + } + } + ] + } + + + + +.. code-block:: json + :caption: config_fed_client.json + + { + "format_version": 2, + "executors": [ + { + "tasks": [ + "config", + "start" + ], + "executor": { + "id": "Executor", + "path": "nvflare.app_opt.xgboost.histogram_based_v2.fed_executor.FedXGBHistogramExecutor", + "args": { + "data_loader_id": "dataloader", + "in_process": true + } + } + } + ], + "components": [ + { + "id": "dataloader", + "path": "nvflare.app_opt.xgboost.histogram_based_v2.csv_data_loader.CSVDataLoader", + "args": { + "folder": "/data/xgboost_secure/dataset/horizontal_xgb_data" + } + } + ] + } + +Pre-Trained Models +================== +To continue training using a pre-trained model, the model can be placed in the job folder with the path and name +of ``custom/model.json``. + +Every site should share the same ``model.json``. The result of previous training with the same dataset can be used as the input model. + +When a pre-trained model is detected, NVFlare prints following line in the log: + +:: + + INFO - Pre-trained model is used: /tmp/nvflare/poc/example_project/prod_00/site-1/startup/../996ac44f-e784-4117-b365-24548f1c490d/app_site-1/custom/model.json + + +Performance Tuning +================== +Timeouts +-------- +For secure training, the HE operations are very slow. If a large dataset is used, several timeout values need +to be adjusted. + +The XGBoost messages are transferred between client and server using +Reliable Messages (:class:`ReliableMessage`). The following parameters +in executor arguments control the timeout behavior: + + - **per_msg_timeout**: Timeout in seconds for each message. + - **tx_timeout**: Timeout for the whole transaction in seconds. This is the total time to wait for a response, accounting for all retry attempts. + +.. code-block:: json + :caption: config_fed_client.json + + { + "format_version": 2, + "executors": [ + { + "tasks": [ + "config", + "start" + ], + "executor": { + "id": "Executor", + "path": "nvflare.app_opt.xgboost.histogram_based_v2.fed_executor.FedXGBHistogramExecutor", + "args": { + "data_loader_id": "dataloader", + "per_msg_timeout": 300.0, + "tx_timeout": 900.0, + "in_process": true + } + } + } + ], + ... + } + +Number of Clients +----------------- +The default configuration can only handle 20 clients. This parameter needs to be adjusted if more clients are involved in the training: + +.. code-block:: json + :caption: config_fed_client.json + + { + "format_version": 2, + "num_rounds": 3, + "rm_max_request_workers": 100, + ... 
+ } + diff --git a/docs/user_guide/flower_integration.rst b/docs/user_guide/flower_integration.rst new file mode 100644 index 0000000000..2fc5926fc3 --- /dev/null +++ b/docs/user_guide/flower_integration.rst @@ -0,0 +1,25 @@ +#################################################### +Integration of Flower Applications with NVIDIA FLARE +#################################################### + +`Flower `_ is an open-source project that implements a unified approach +to federated learning, analytics, and evaluation. Flower has developed a large set of +strategies and algorithms for FL application development and a healthy FL research community. + +FLARE, on the other hand, has been focusing on providing an enterprise-ready, robust runtime +environment for FL applications. + +With the integration of Flower and FLARE, applications developed with the Flower framework +will run easily in FLARE runtime without needing to make any changes. All the user needs to do +is configure the Flower application into a FLARE job and submit the job to the FLARE system. + + +.. toctree:: + :maxdepth: 1 + + flower_integration/flower_initial_integration + flower_integration/flower_job_structure + flower_integration/flower_run_as_flare_job + flower_integration/flare_multi_job_architecture + flower_integration/flower_detailed_design + flower_integration/flower_reliable_messaging diff --git a/docs/user_guide/flower_integration/flare_multi_job_architecture.rst b/docs/user_guide/flower_integration/flare_multi_job_architecture.rst new file mode 100644 index 0000000000..c8b255403b --- /dev/null +++ b/docs/user_guide/flower_integration/flare_multi_job_architecture.rst @@ -0,0 +1,23 @@ +**************************** +FLARE Multi-Job Architecture +**************************** + +To maximize the utilization of compute resources, FLARE supports multiple jobs running at the +same time, where each job is an independent FL experiment. + +.. image:: ../../resources/system_architecture.png + +As shown in the diagram above, there is the Server Control Process (SCP) on the Server host, and there is a +Client Control Process (CCP) on each client host. The SCP communicates with CCPs to manage jobs (schedule, +deploy, monitor, and abort jobs). When a job is scheduled by the SCP, the job is sent to the CCPs of all sites, +which creates separate processes for the job. These processes form a “Job Network” for the job. This network +goes away when the job is finished. + +The diagram shows 3 jobs (J1, J2, J3) in different colors on server and client(s). For example, all J1 processes +form the “job network” for Job 1. + +By default, processes of the same job network are not connected directly. Instead, they only connect to the SCP, +and all messages between job processes are relayed through the SCP. However, if network policy permits, direct +P2P connections could be established automatically between the job processes to obtain maximum communication +speed. The underlying communication path is transparent to applications and only requires config changes to +enable direct communication. diff --git a/docs/user_guide/flower_integration/flower_detailed_design.rst b/docs/user_guide/flower_integration/flower_detailed_design.rst new file mode 100644 index 0000000000..4d12208045 --- /dev/null +++ b/docs/user_guide/flower_integration/flower_detailed_design.rst @@ -0,0 +1,26 @@ +*************** +Detailed Design +*************** + +Flower uses gRPC as the communication protocol. To use FLARE as the communicator, we route Flower's gRPC +messages through FLARE. 
To do so, we change the server-endpoint of each Flower client to a local gRPC +server (LGS) within the FLARE client. + +.. image:: ../../resources/FLARE_as_flower_communicator.png + +As shown in this diagram, there is a Local GRPC server (LGS) for each site that serves as the +server-endpoint for the Flower client on the site. Similarly, there is a Local GRPC Client (LGC) on the +FLARE Server that interacts with the Flower Server. The message path between the Flower Client and the Flower +Server is as follows: + + - The Flower client generates a gRPC message and sends it to the LGS in the FLARE Client + - FLARE Client forwards the message to the FLARE Server. This is a reliable FLARE message. + - FLARE Server uses the LGC to send the message to the Flower Server. + - Flower Server sends the response back to the LGC in the FLARE Server. + - FLARE Server sends the response back to the FLARE Client. + - FLARE Client sends the response back to the Flower Client via the LGS. + +Please note that the Flower Client could be running as a separate process or within the same process as the FLARE Client. + +This will enable users to directly deploy Flower ServerApps and ClientsApps developed within the +NVFlare Runtime Environment. No code changes are necessary! diff --git a/docs/user_guide/flower_integration/flower_initial_integration.rst b/docs/user_guide/flower_integration/flower_initial_integration.rst new file mode 100644 index 0000000000..2fe8c13ba4 --- /dev/null +++ b/docs/user_guide/flower_integration/flower_initial_integration.rst @@ -0,0 +1,28 @@ +******************* +Initial Integration +******************* + +Architecturally, Flower uses client/server communication. Clients communicate with the server +via gRPC. FLARE uses the same architecture with the enhancement that multiple jobs can run at +the same time (each job requires one set of clients/server) without requiring multiple ports to +be open on the server host. + +Since both frameworks follow the same communication architecture, it is fairly easy to make a +Flower application a FLARE job by using FLARE as the communicator for the Flower app, as shown below. + +.. image:: ../../resources/FLARE_as_flower_communicator.png + +In this approach, Flower Clients no longer directly interact with the Flower Server, instead all +communications are through FLARE. + +The integration with FLARE-based communication has some unique benefits: + + - Provisioning of startup kits, including certificates + - Deployment of custom code (apps) + - User authentication and authorization + - :class:`ReliableMessage` mechanism to counter connection stability issues + - Multiple communication schemes (gRPC, HTTP, TCP, Redis, etc.) 
are available + - P2P communication: anyone can talk to anyone else without needing topology changes + - Support of P2P communication encryption (on top of SSL) + - Multi-job system that allows multiple Flower apps to run at the same time without needing extra ports on the server host + - Use additional NVFlare features like experiment tracking diff --git a/docs/user_guide/flower_integration/flower_job_structure.rst b/docs/user_guide/flower_integration/flower_job_structure.rst new file mode 100644 index 0000000000..bf10a7ddde --- /dev/null +++ b/docs/user_guide/flower_integration/flower_job_structure.rst @@ -0,0 +1,145 @@ +******************** +Flower Job Structure +******************** +Even though Flower Programming is out of the scope of FLARE/Flower integration, you need to have a good +understanding of the Flower Job Structure when submitting to FLARE. + +A Flower job is a regular FLARE job with special requirements for the ``custom`` directory, as shown below. + +.. code-block:: none + + ├── flwr_pt + │ ├── client.py # <-- contains `ClientApp` + │ ├── __init__.py # <-- to register the python module + │ ├── server.py # <-- contains `ServerApp` + │ └── task.py # <-- task-specific code (model, data) + └── pyproject.toml # <-- Flower project file + +Project Folder +============== +All Flower app code must be placed in a subfolder in the ``custom`` directory of the job. This subfolder is called +the project folder of the app. In this example, the project folder is named ``flwr_pt``. Typically, this folder +contains ``server.py``, ``client.py``, and the ``__init__.py``. Though you could organize them differently (see discussion +below), we recommend always including the ``__init__.py`` so that the project folder is guaranteed to be a valid Python +package, regardless of Python versions. + +Pyproject.toml +============== +The ``pyproject.toml`` file exists in the job's ``custom`` folder. It is an important file that contains server and +client app definition and configuration information. Such information is used by the Flower system to find the +server app and the client app, and to pass app-specific configuration to the apps. + +Here is an example of ``pyproject.toml``, taken from :github_nvflare_link:`this example `. + +.. code-block:: toml + + [build-system] + requires = ["hatchling"] + build-backend = "hatchling.build" + + [project] + name = "flwr_pt" + version = "1.0.0" + description = "" + license = "Apache-2.0" + dependencies = [ + "flwr[simulation]>=1.11.0,<2.0", + "nvflare~=2.5.0rc", + "torch==2.2.1", + "torchvision==0.17.1", + ] + + [tool.hatch.build.targets.wheel] + packages = ["."] + + [tool.flwr.app] + publisher = "nvidia" + + [tool.flwr.app.components] + serverapp = "flwr_pt.server:app" + clientapp = "flwr_pt.client:app" + + [tool.flwr.app.config] + num-server-rounds = 3 + + [tool.flwr.federations] + default = "local-simulation" + + [tool.flwr.federations.local-simulation] + options.num-supernodes = 2 + + +.. note:: Note that the information defined in pyproject.toml must match the code in the project folder! + +Project Name +------------ +The project name should match the name of the project folder, though not a requirement. In this example, it is ``flwr_pt``. +Serverapp Specification + +This value is specified following this format: + +.. code-block:: toml + + : + +where: + + - The is the module that contains the server app code. This module is usually defined as ``server.py`` in the project folder (flwr_pt in this example). 
+ - The is the name of the variable that holds the ServerApp object in the . This variable is usually defined as ``app``: + +.. code-block:: python + + app = ServerApp(server_fn=server_fn) + + +Clientapp Specification +------------------------ +This value is specified following this format: + +.. code-block:: toml + + : + +where: + + - The is the module that contains the client app code. This module is usually defined as ``client.py`` in the project folder (flwr_pt in this example). + - The is the name of the variable that holds the ClientApp object in the . This variable is usually defined as ``app``: + +.. code-block:: python + + app = ClientApp(client_fn=client_fn) + + +App Configuration +----------------- +The pyproject.toml file can contain app config information, in the ``[tool.flwr.app.config]`` section. In this example, +it defines the number of rounds: + +.. code-block:: toml + + [tool.flwr.app.config] + num-server-rounds = 3 + +The content of this section is specific to the server app code. The ``server.py`` in the example shows how this is used: + +.. code-block:: python + + def server_fn(context: Context): + # Read from config + num_rounds = context.run_config["num-server-rounds"] + + # Define config + config = ServerConfig(num_rounds=num_rounds) + + return ServerAppComponents(strategy=strategy, config=config) + +Supernode Count +--------------- +If you run the Flower job with its simulation (not as a FLARE job), you need to specify how many clients (supernodes) to use +for the simulation in the ``[tool.flwr.federations.local-simulation]`` section, like this: + +.. code-block:: toml + + options.num-supernodes = 2 + +But this does not apply when submitting it as a FLARE job. diff --git a/docs/user_guide/flower_integration/flower_reliable_messaging.rst b/docs/user_guide/flower_integration/flower_reliable_messaging.rst new file mode 100644 index 0000000000..eee8a7caa8 --- /dev/null +++ b/docs/user_guide/flower_integration/flower_reliable_messaging.rst @@ -0,0 +1,19 @@ +****************** +Reliable Messaging +****************** + +The interaction between the FLARE Clients and Server is through reliable messaging. +First, the requester tries to send the request to the peer. If it fails to send it, it will retry a moment later. +This process keeps repeating until the request is sent successfully or the amount of time has passed (which will +cause the job to abort). + +Secondly, once the request is sent, the requester waits for the response. Once the peer finishes processing, it +sends the result to the requester immediately (which could be successful or unsuccessful). At the same time, the +requester repeatedly sends queries to get the result from the peer, until the result is received or the max amount +of time has passed (which will cause the job to abort). The result could be received in one of the following ways: + + - The result is received from the response message sent by the peer when it finishes the processing + - The result is received from the response to the query message of the requester + +For details of :class:`ReliableMessage`, +see :ref:`ReliableMessage Timeout `. 
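+
+The following sketch illustrates the request/retry/query pattern described above. It is purely
+conceptual and is not the actual :class:`ReliableMessage` implementation; the function and parameter
+names (``send``, ``query``, ``per_msg_timeout``, ``tx_timeout``) are illustrative and only mirror the
+per-message and transaction timeouts discussed in the linked section.
+
+.. code-block:: python
+
+    import time
+
+    def reliable_request(send, query, per_msg_timeout: float, tx_timeout: float):
+        """Conceptual illustration only; not NVFlare's actual implementation."""
+        deadline = time.monotonic() + tx_timeout
+
+        # Phase 1: keep retrying until the request is delivered or tx_timeout expires.
+        while not send(timeout=per_msg_timeout):
+            if time.monotonic() >= deadline:
+                raise TimeoutError("request not delivered in time; the job will be aborted")
+            time.sleep(1.0)
+
+        # Phase 2: wait for the result, either pushed back by the peer or pulled by query.
+        while time.monotonic() < deadline:
+            result = query(timeout=per_msg_timeout)
+            if result is not None:
+                return result  # the peer's result, successful or not
+            time.sleep(1.0)
+
+        raise TimeoutError("no result received in time; the job will be aborted")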
diff --git a/docs/user_guide/flower_integration/flower_run_as_flare_job.rst b/docs/user_guide/flower_integration/flower_run_as_flare_job.rst new file mode 100644 index 0000000000..a0dda43c64 --- /dev/null +++ b/docs/user_guide/flower_integration/flower_run_as_flare_job.rst @@ -0,0 +1,133 @@ +*********************************** +Run Flower Application as FLARE Job +*********************************** + +Before running Flower applications with FLARE, you must have both FLARE and Flower frameworks +installed in your Python environment. As of this writing, the Flower version to be used is 1.11.0rc0. + +.. code-block:: shell + + pip install flwr==1.11.0rc0 + +To run a Flower application as a job in FLARE, follow these steps: + + - Copy all Flower application code (python code) into the job's "custom" folder. Note that all training functions are implemented in Flower, not in FLARE! + - Create the ``config_fed_server.json`` and ``config_fed_client.json`` + - Submit the created job to FLARE system for execution + +For a full example, see: +:github_nvflare_link:`Hello Flower ` + +Server Config: config_fed_server.json +===================================== +A typical server configuration looks like this: + +.. code-block:: json + + { + "format_version": 2, + "task_data_filters": [], + "task_result_filters": [], + "components": [ + ], + "workflows": [ + { + "id": "ctl", + "path": "nvflare.app_opt.flower.controller.FlowerController", + "args": {} + } + ] + } + +The :class:`FlowerController` has additional args that can be +set to finetune its behavior, as shown below: + +.. code-block:: python + + class FlowerController(TieController): + def __init__( + self, + num_rounds=1, + database: str = "", + server_app_args: list = None, + superlink_ready_timeout: float = 10.0, + configure_task_name=TieConstant.CONFIG_TASK_NAME, + configure_task_timeout=TieConstant.CONFIG_TASK_TIMEOUT, + start_task_name=TieConstant.START_TASK_NAME, + start_task_timeout=TieConstant.START_TASK_TIMEOUT, + job_status_check_interval: float = TieConstant.JOB_STATUS_CHECK_INTERVAL, + max_client_op_interval: float = TieConstant.MAX_CLIENT_OP_INTERVAL, + progress_timeout: float = TieConstant.WORKFLOW_PROGRESS_TIMEOUT, + int_client_grpc_options=None, + ): + """Constructor of FlowerController + + Args: + num_rounds: number of rounds. Not used in this version. + database: database name + server_app_args: additional server app CLI args + superlink_ready_timeout: how long to wait for the superlink to become ready before starting server app + configure_task_name: name of the config task + configure_task_timeout: max time allowed for config task to complete + start_task_name: name of the start task + start_task_timeout: max time allowed for start task to complete + job_status_check_interval: how often to check job status + max_client_op_interval: max time allowed for missing client requests + progress_timeout: max time allowed for missing overall progress + int_client_grpc_options: internal grpc client options + """ + +The args ``num_rounds``, ``database``, and ``server_app_args`` are not currently used. + +Default values for most args should be good enough. You may need to adjust the following args in some special cases. + +``Superlink_ready_timeout`` - superlink process is started first and must become ready before starting the server-app process. +It may take some time for the superlink to become ready (port is open and ready for the server-app). The default value is +10 seconds, which should be enough for most cases. 
If not, you may need to increase it. + + +Rest of the args are for job lifecycle management. Their meanings are the same as those used for +:ref:`XGBoost controller`. + + +Client Config: config_fed_client.json +===================================== +A typical client configuration looks like this: + +.. code-block:: json + + { + "format_version": 2, + "executors": [ + { + "tasks": ["*"], + "executor": { + "path": "nvflare.app_opt.flower.executor.FlowerExecutor", + "args": {} + } + } + ], + "task_result_filters": [], + "task_data_filters": [], + "components": [] + } + +The FlowerExecutor has additional args that can be set to finetune its behavior, as shown below: + +.. code-block:: python + + class FlowerExecutor(TieExecutor): + def __init__( + self, + start_task_name=Constant.START_TASK_NAME, + configure_task_name=Constant.CONFIG_TASK_NAME, + per_msg_timeout=10.0, + tx_timeout=100.0, + client_shutdown_timeout=5.0, + ): + +The ``per_msg_timeout`` and ``tx_timeout`` configure :class:`ReliableMessage`, +which is used to send requests to the server. + +The ``client_shutdown_timeout`` specifies how long to wait in seconds for graceful shutdown of the Flower's client-app process when +stopping the FL client. If the client-app process does not shut down within this time, it will be killed by Flare. diff --git a/docs/user_guide/nvflare_cli.rst b/docs/user_guide/nvflare_cli.rst index 360e41ed06..d806a2aedd 100644 --- a/docs/user_guide/nvflare_cli.rst +++ b/docs/user_guide/nvflare_cli.rst @@ -4,9 +4,10 @@ NVFlare CLI ########################### -The commands for NVIDIA FLARE have been consolidated to be under the ``nvflare`` command for -better ease of use. This includes the FL Simulator, the POC command, ``provision``, and preflight check, all of -which are explained in more detail in their own sections: +Various NVIDIA FLARE command line interfaces are available to enhance usability. +These include the FL Simulator, the POC command, the provision command, the job command, +the preflight check command, and the dashboard command. +Detailed explanations for each can be found in their respective sections, linked below. .. toctree:: :maxdepth: 1 diff --git a/docs/user_guide/nvflare_cli/fl_simulator.rst b/docs/user_guide/nvflare_cli/fl_simulator.rst index d1abaaea61..f52fd47078 100644 --- a/docs/user_guide/nvflare_cli/fl_simulator.rst +++ b/docs/user_guide/nvflare_cli/fl_simulator.rst @@ -49,11 +49,11 @@ Command examples Run a single NVFlare app ======================== -This command will run the same ``hello-numpy-sag`` app on the server and 8 clients using 1 thread. The client names will be site-1, site-2, ... , site-8: +This command will run the same ``hello-numpy-sag`` app on the server and 8 clients using 1 process. The client names will be site-1, site-2, ... , site-8: .. code-block:: python - nvflare simulator NVFlare/examples/hello-numpy-sag/app -w /tmp/nvflare/workspace_folder/ -n 8 -t 1 + nvflare simulator NVFlare/examples/hello-world/hello-numpy-sag/jobs/hello-numpy-sag -w /tmp/nvflare/workspace_folder/ -n 8 -t 1 .. raw:: html @@ -749,7 +749,7 @@ This command will run the job following the meta.json in the job. The executing .. 
code-block:: python + + nvflare simulator NVFlare/examples/hello-world/hello-numpy-sag/jobs/hello-numpy-sag -w /tmp/nvflare/workspace_folder/ -c client0,client1,client2,client3 -t 1 + + Note that the ``-n`` option is used to specify the number of clients like in the previous section above, but it is checked only if the ``-c`` option is not used. The with the ``-n`` option, clients are automatically created up to the number provided after ``-n``, and they are named site-1, site-2, site-3, etc. @@ -764,7 +764,7 @@ in meta.json to run. .. code-block:: python - nvflare simulator NVFlare/examples/hello-numpy-sag -w /tmp/nvflare/workspace_folder/ -t 1 + nvflare simulator NVFlare/examples/hello-world/hello-numpy-sag/jobs/hello-numpy-sag -w /tmp/nvflare/workspace_folder/ -t 1 .. note:: @@ -829,22 +829,29 @@ application run. status = run_simulator(args) sys.exit(status) -**************************** -Threads, Clients, and Events -**************************** +****************************** +Processes, Clients, and Events +****************************** -Specifying threads -================== -The simulator ``-t`` option provides the ability to specify how many threads to run the simulator with. +Specifying number of processes +============================== +The simulator ``-t`` option provides the ability to specify how many processes to run the simulator with. -When you run the simulator with ``-t 1``, there is only one client active and running at a time, and the clients will be running in -turn. This is to enable the simulation of large number of clients using a single machine with limited resources. +.. note:: + + The ``-t`` and ``--threads`` option name for the simulator was originally chosen because clients ran in separate threads. + However, each client now actually runs in a separate process. This distinction will not affect the user experience. + +- N = number of clients (``-n``) +- T = number of processes (``-t``) -Note that if you have fewer threads than the number of clients, ClientRunner/learner object will go thorugh setup and -teardown in every round. +When running the simulator with fewer processes than clients (T < N), +the simulator will need to swap the clients in and out of the processes, resulting in some of the clients running sequentially as processes become available. +This also will cause the ClientRunner/learner objects to go through setup and teardown in every round. +Using T < N is only needed when trying to simulate a large number of clients using a single machine with limited resources. -With ``-t=num_client``, the simulator will run the number of clients in separate threads at the same time. Each -client will always be running in memory with no swap_in / swap_out, but it will require more resources available. +In most cases, run the simulator with the same number of processes as clients (T = N). The simulator will run all the clients in separate processes at the same time. Each +client will always be running in memory with no swap-in/out, but it will require more resources to be available. For the dataset / tensorboard initialization, you could make use of EventType.SWAP_IN and EventType.SWAP_OUT in the application. 
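+
+As a minimal sketch of how an application component might react to these events (the component
+name and dataset-loading logic here are purely illustrative):
+
+.. code-block:: python
+
+    from nvflare.apis.event_type import EventType
+    from nvflare.apis.fl_component import FLComponent
+    from nvflare.apis.fl_context import FLContext
+
+
+    class DatasetManager(FLComponent):
+        """Illustrative component that loads/releases heavy resources on simulator swap events."""
+
+        def __init__(self):
+            super().__init__()
+            self.dataset = None
+
+        def handle_event(self, event_type: str, fl_ctx: FLContext):
+            if event_type == EventType.SWAP_IN:
+                # The client is being swapped into a simulator process: (re)load heavy resources.
+                self.dataset = self._load_dataset()
+            elif event_type == EventType.SWAP_OUT:
+                # The client is being swapped out: release memory so other clients can run.
+                self.dataset = None
+
+        def _load_dataset(self):
+            # Placeholder: load data from disk, initialize TensorBoard writers, etc.
+            return []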
diff --git a/docs/user_guide/nvflare_cli/job_cli.rst b/docs/user_guide/nvflare_cli/job_cli.rst index 78852af370..8dcccaf5a6 100644 --- a/docs/user_guide/nvflare_cli/job_cli.rst +++ b/docs/user_guide/nvflare_cli/job_cli.rst @@ -8,6 +8,11 @@ The NVIDIA FLARE :mod:`Job CLI` provides options to cr jobs from a command line interface. See the :github_nvflare_link:`NVFlare Job CLI Notebook ` for a tutorial on how to use the Job CLI. +.. note:: + + We have introduced a new Pythonic Job API experience, please + check :ref:`fed_job_api`. + *********************** Command Usage *********************** @@ -45,22 +50,42 @@ the job_templates. The output should be similar to the following: -.. code-block::shell +.. code-block:: none The following job templates are available: - ------------------------------------------------------------------------------------------------------------------------ - name Description Controller Type Client Category - ------------------------------------------------------------------------------------------------------------------------ - sag_cross_np scatter & gather and cross-site validation using numpy server client executor - sag_pt scatter & gather workflow using pytorch server client_api - sag_pt_ddp scatter & gather workflow using pytorch + ddp server client_api - sag_pt_deploy_map SAG workflow with pytorch, deploy_map, site-specific configs server client_api - sag_tf scatter & gather workflow using TensorFlow server client_api - stats_df FedStats: tabular data with pandas server stats executor - stats_image FedStats: image intensity histogram server stats executor - ------------------------------------------------------------------------------------------------------------------------ - + ---------------------------------------------------------------------------------------------------------------------- + name Description Controller Type Execution API Type + ---------------------------------------------------------------------------------------------------------------------- + cyclic_cc_pt client-controlled cyclic workflow with PyTorch ClientAPI tra client client_api + cyclic_pt server-controlled cyclic workflow with PyTorch ClientAPI tra server client_api + psi_csv private-set intersection for csv data server Executor + sag_cross_np scatter & gather and cross-site validation using numpy server client executor + sag_cse_pt scatter & gather workflow and cross-site evaluation with PyT server client_api + sag_gnn scatter & gather workflow for gnn learning server client_api + sag_nemo Scatter and Gather Workflow for NeMo server client_api + sag_np scatter & gather workflow using numpy server client_api + sag_np_cell_pipe scatter & gather workflow using numpy server client_api + sag_np_metrics scatter & gather workflow using numpy server client_api + sag_pt scatter & gather workflow using pytorch server client_api + sag_pt_deploy_map SAG workflow with pytorch, deploy_map, site-specific configs server client_api + sag_pt_executor scatter & gather workflow and cross-site evaluation with PyT server Executor + sag_pt_he scatter & gather workflow using pytorch and homomorphic encr server client_api + sag_pt_mlflow scatter & gather workflow using pytorch with MLflow tracking server client_api + sag_pt_model_learner scatter & gather workflow and cross-site evaluation with PyT server ModelLearner + sag_tf scatter & gather workflow using TensorFlow server client_api + sklearn_kmeans scikit-learn KMeans model server client_api + sklearn_linear scikit-learn linear model server 
client_api + sklearn_svm scikit-learn SVM model server client_api + stats_df FedStats: tabular data with pandas server stats executor + stats_image FedStats: image intensity histogram server stats executor + swarm_cse_pt Swarm Learning with Cross-Site Evaluation with PyTorch client client_api + swarm_cse_pt_model_l Swarm Learning with Cross-Site Evaluation with PyTorch Model client ModelLearner + vertical_xgb vertical federated xgboost server Executor + xgboost_tree xgboost horizontal tree-based collaboration model server client_api + ---------------------------------------------------------------------------------------------------------------------- + +View all the available templates at the :github_nvflare_link:`FLARE Job Template Registry `. Setting job_template path ------------------------- @@ -90,20 +115,18 @@ The options for usage are as follows: .. code-block:: - usage: nvflare job create [-h] [-j [JOB_FOLDER]] [-w [TEMPLATE]] [-s [SCRIPT]] [-sd [SCRIPT_DIR]] [-f [CONFIG_FILE ...]] [-debug] [-force] + usage: nvflare job create [-h] [-j [JOB_FOLDER]] [-w [TEMPLATE]] [-sd [SCRIPT_DIR]] [-f [CONFIG_FILE [CONFIG_FILE ...]]] [-debug] [-force] - options: + optional arguments: -h, --help show this help message and exit -j [JOB_FOLDER], --job_folder [JOB_FOLDER] job_folder path, default to ./current_job directory -w [TEMPLATE], --template [TEMPLATE] - template name or template folder. You can use list_templates to see available jobs from job templates, pick name such as 'sag_pt' as template name. Alternatively, you can use the path to the job template folder, such as - job_templates/sag_pt - -s [SCRIPT], --script [SCRIPT] - code script such as train.py + template name or template folder. You can use list_templates to see available jobs from job templates, pick name such as 'sag_pt' as template name. Alternatively, you can use the path to the job + template folder, such as job_templates/sag_pt -sd [SCRIPT_DIR], --script_dir [SCRIPT_DIR] script directory contains additional related files. All files or directories under this directory will be copied over to the custom directory. - -f [CONFIG_FILE ...], --config_file [CONFIG_FILE ...] + -f [CONFIG_FILE [CONFIG_FILE ...]], --config_file [CONFIG_FILE [CONFIG_FILE ...]] Training config file with corresponding optional key=value pairs. If key presents in the preceding config file, the value in the config file will be overwritten by the new value -debug, --debug debug is on -force, --force force create is on, if -force, overwrite existing configuration with newly created configurations diff --git a/docs/user_guide/nvflare_cli/poc_command.rst b/docs/user_guide/nvflare_cli/poc_command.rst index ae8cb707da..9f5d45e021 100644 --- a/docs/user_guide/nvflare_cli/poc_command.rst +++ b/docs/user_guide/nvflare_cli/poc_command.rst @@ -1,34 +1,33 @@ .. _poc_command: ***************************************** -Command for Proof Of Concept (POC) Mode +Proof Of Concept (POC) Command ***************************************** -Introduction to the POC Command -=============================== The POC command allows users to try out the features of NVFlare in a proof of concept deployment on a single machine. +Different processes represent the server, clients, and the admin console, making it a useful tool in preparation for a distributed deployment. + Syntax and Usage ================= -The POC command has been reorgaznied in version 2.4 to have the subcommands ``prepare``, ``start``, ``stop``, and ``clean`` (``prepare-examples`` -should happen in prepare now). 
+The POC command has been reorgaznied in version 2.4 to have the subcommands ``prepare``, ``prepare-jobs-dir``, ``start``, ``stop``, and ``clean``. .. code-block:: none nvflare poc -h - usage: nvflare poc [-h] {prepare,prepare-examples,start,stop,clean} ... + usage: nvflare poc [-h] {prepare,prepare-jobs-dir,start,stop,clean} ... options: -h, --help show this help message and exit poc: - {prepare,prepare-examples,start,stop,clean} + {prepare,prepare-jobs-dir,start,stop,clean} poc subcommand prepare prepare poc environment by provisioning local project - prepare-examples prepare examples + prepare-jobs-dir prepare jobs directory start start services in poc mode stop stop services in poc mode clean clean up poc workspace @@ -41,7 +40,7 @@ The detailed options for ``nvflare poc prepare``: nvflare poc prepare -h - usage: nvflare poc prepare [-h] [-n [NUMBER_OF_CLIENTS]] [-c [CLIENTS ...]] [-e [EXAMPLES]] [-he] [-i [PROJECT_INPUT]] [-d [DOCKER_IMAGE]] [-debug] + usage: nvflare poc prepare [-h] [-n [NUMBER_OF_CLIENTS]] [-c [CLIENTS ...]] [-he] [-i [PROJECT_INPUT]] [-d [DOCKER_IMAGE]] [-debug] options: -h, --help show this help message and exit @@ -49,8 +48,6 @@ The detailed options for ``nvflare poc prepare``: number of sites or clients, default to 2 -c [CLIENTS ...], --clients [CLIENTS ...] Space separated client names. If specified, number_of_clients argument will be ignored. - -e [EXAMPLES], --examples [EXAMPLES] - examples directory -he, --he enable homomorphic encryption. -i [PROJECT_INPUT], --project_input [PROJECT_INPUT] project.yaml file path, If specified, 'number_of_clients','clients' and 'docker' specific options will be ignored. @@ -58,16 +55,31 @@ The detailed options for ``nvflare poc prepare``: generate docker.sh based on the docker_image, used in '--prepare' command. and generate docker.sh 'start/stop' commands will start with docker.sh -debug, --debug debug is on +nvflare poc prepare-jobs-dir +---------------------------- +The detailed options for ``nvflare poc prepare-jobs-dir``: + +.. code-block:: none + + nvflare poc prepare-jobs-dir -h + + usage: nvflare poc prepare-jobs-dir [-h] [-j [JOBS_DIR]] [-debug] + + optional arguments: + -h, --help show this help message and exit + -j [JOBS_DIR], --jobs_dir [JOBS_DIR] + jobs directory + -debug, --debug debug is on .. note:: - The "-e" option is new in version 2.4 for linking to the examples in the code base. Previously, you could + The "-j" option is new in version 2.4 for linking to the job directory in the code base. Previously, you could optionally define an ``NVFLARE_HOME`` environment variable to point to a local NVFlare directory to create a symbolic link to point the transfer directory to the examples in the code base. For example, if the the NVFlare GitHub repository is cloned under ~/projects, then you could set ``NVFLARE_HOME=~/projects/NVFlare``. If the NVFLARE_HOME environment variable was not set, you could manually copy the examples to the transfer directory. - Now, the "-e" option takes precedence over the ``NVFLARE_HOME`` environment variable, but the ``NVFLARE_HOME`` environment + Now, the "-j" option takes precedence over the ``NVFLARE_HOME`` environment variable, but the ``NVFLARE_HOME`` environment variable can still be used. @@ -282,7 +294,7 @@ will start ALL clients (site-1, site-2) and server as well as FLARE Console (aka .. note:: - If you prefer to have the FLARE Console on a different terminal, you can start everything else with: ``nvflare poc start -ex admin``. 
+    If you prefer to have the FLARE Console on a different terminal, you can start everything else with: ``nvflare poc start -ex admin@nvidia.com``.

Start the server only
----------------------
@@ -346,6 +358,59 @@ If there is no GPU, then there will be no assignments. If there are GPUs, they w

    nvidia-smi --list-gpus

+Operating the System and Submitting a Job
+==========================================
+After preparing the POC workspace and starting the server, clients, and console (optional), there are several options for operating the whole system.
+
+First, link the desired job directory to the admin's transfer directory:
+
+.. code-block:: none
+
+    nvflare poc prepare-jobs-dir -j NVFlare/examples
+
+FLARE Console
+--------------
+After starting the FLARE console with:
+
+.. code-block:: none
+
+    nvflare poc start -p admin@nvidia.com
+
+Log in and submit the job:
+
+.. code-block:: none
+
+    submit_job hello-world/hello-numpy-sag/jobs/hello-numpy-sag
+
+Refer to :ref:`operating_nvflare` for more details.
+
+FLARE API
+---------
+To programmatically operate the system and submit a job, use the :ref:`flare_api`:
+
+.. code-block:: python
+
+    import os
+    from nvflare.fuel.flare_api.flare_api import new_secure_session
+
+    poc_workspace = "/tmp/nvflare/poc"
+    poc_prepared = os.path.join(poc_workspace, "example_project/prod_00")
+    admin_dir = os.path.join(poc_prepared, "admin@nvidia.com")
+    sess = new_secure_session("admin@nvidia.com", startup_kit_location=admin_dir)
+    job_id = sess.submit_job("hello-world/hello-numpy-sag/jobs/hello-numpy-sag")
+
+    print(f"Job is running with ID {job_id}")
+
+
+Job CLI
+-------
+The :ref:`job_cli` also provides a convenient command to submit a job:
+
+.. code-block:: none
+
+    nvflare job submit -j NVFlare/examples/hello-world/hello-numpy-sag/jobs/hello-numpy-sag
+
+
Stop Package(s)
===============

@@ -371,3 +436,9 @@ There is a command to clean up the POC workspace added in version 2.2 that will

.. code-block::

    nvflare poc clean
+
+Learn More
+===========
+
+To learn about the different options of the POC command in more detail, see the
+:github_nvflare_link:`Setup NVFLARE in POC Mode Tutorial `.
diff --git a/docs/user_guide/security/data_privacy_protection.rst b/docs/user_guide/security/data_privacy_protection.rst
index dd91a93f0d..e1593faacd 100644
--- a/docs/user_guide/security/data_privacy_protection.rst
+++ b/docs/user_guide/security/data_privacy_protection.rst
@@ -14,4 +14,4 @@ general-purpose data :ref:`filtering mechanism ` for processing task da
 This mechanism has been used for the purpose of data privacy protection on the client side. For example, differential privacy
 filters can be applied to model weights before sending to the server for aggregation.

-NVFLARE has implemented some commonly used privacy protection filters: https://github.com/NVIDIA/NVFlare/tree/main/nvflare/app_common/filters
+NVFLARE has implemented :github_nvflare_link:`some commonly used privacy protection filters `.
diff --git a/docs/user_guide/security/serialization.rst b/docs/user_guide/security/serialization.rst
index 997804161b..e601da79ed 100644
--- a/docs/user_guide/security/serialization.rst
+++ b/docs/user_guide/security/serialization.rst
@@ -5,4 +5,230 @@ Message Serialization
 NVFLARE uses a secure mechanism called FOBS (Flare OBject Serializer) for message serialization and deserialization when
 exchanging data between the server and clients.

-See ``_ for usage guidelines.
+
+Flare Object Serializer (FOBS)
+==============================
+
+
+Overview
+--------
+
+FOBS is a drop-in replacement for Pickle for security purposes. It uses **MessagePack** to
+serialize objects.
+
+FOBS sacrifices convenience for security. With Pickle, most objects are supported
+automatically using introspection. To serialize an object using FOBS, a **Decomposer**
+must be registered for the class. A few decomposers for commonly used classes are
+pre-registered with the module.
+
+FOBS supports enum types by registering decomposers automatically for all classes that
+are subclasses of :code:`Enum`.
+
+FOBS treats all other classes as dataclasses by registering a generic decomposer for dataclasses.
+A dataclass is a class whose constructor only changes the state of the object without side effects.
+Side effects include changing global variables, creating network connections, opening files, etc.
+
+FOBS throws a :code:`TypeError` exception when it encounters an object with no decomposer
+registered. For example,
+::
+    TypeError: can not serialize 'xxx' object
+
+Usage
+-----
+
+FOBS defines the following four functions, similar to Pickle:
+
+* :code:`dumps(obj)`: Serializes obj and returns bytes
+* :code:`dump(obj, stream)`: Serializes obj and writes the result to stream
+* :code:`loads(data)`: Deserializes the data and returns an object
+* :code:`load(stream)`: Reads data from stream and deserializes it into an object
+
+
+Examples:
+::
+
+    from nvflare.fuel.utils import fobs
+
+    data = fobs.dumps(dxo)
+    new_dxo = fobs.loads(data)
+
+    # Pickle/json compatible functions can be used also
+    data = fobs.dumps(shareable)
+    new_shareable = fobs.loads(data)
+
+Decomposers
+-----------
+
+Decomposers are classes that inherit the abstract base class :code:`fobs.Decomposer`. FOBS
+uses decomposers to break an object into **serializable objects** before serializing it
+using MessagePack.
+
+Decomposers are very similar to serializers, except that they don't have to convert the object
+into bytes directly; they can just break the object into other objects that are serializable.
+
+An object is serializable if its type is supported by MessagePack or a decomposer is
+registered for its class.
+
+FOBS recursively decomposes objects until all objects are of types supported by MessagePack.
+Decomposition loops must be avoided because they cause stack overflow. Decomposers form a loop
+when one class is decomposed into another class which is eventually decomposed into the
+original class. For example, this scenario forms the simplest loop: X decomposes into Y
+and Y decomposes back into X.
+
+MessagePack supports the following types natively:
+
+* None
+* bool
+* int
+* float
+* str
+* bytes
+* bytearray
+* memoryview
+* list
+* dict
+
+Decomposers for the following classes are included with the `fobs` module and auto-registered:
+
+* tuple
+* set
+* OrderedDict
+* datetime
+* Shareable
+* FLContext
+* DXO
+* Client
+* RunSnapshot
+* Workspace
+* Signal
+* AnalyticsDataType
+* argparse.Namespace
+* Learnable
+* _CtxPropReq
+* _EventReq
+* _EventStats
+* numpy.float32
+* numpy.float64
+* numpy.int32
+* numpy.int64
+* numpy.ndarray
+
+All classes defined in the :code:`fobs/decomposers` folder are automatically registered.
+Other decomposers must be registered manually like this:
+
+::
+
+    fobs.register(FooDecomposer)
+    fobs.register(BarDecomposer())
+
+
+:code:`fobs.register` takes either a class or an instance as the argument. A decomposer whose
+constructor takes arguments must be registered as an instance.
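As a hedged sketch of the instance-registration case just described (the ``Temperature`` class, its decomposer, and the ``precision`` argument are illustrative assumptions, not part of the ``fobs`` module):

.. code-block:: python

    from typing import Any, Type

    from nvflare.fuel.utils import fobs


    class Temperature:
        def __init__(self, celsius: float):
            self.celsius = celsius


    class TemperatureDecomposer(fobs.Decomposer):
        # the constructor takes an argument, so an instance must be registered
        def __init__(self, precision: int = 2):
            self.precision = precision

        def supported_type(self) -> Type[Any]:
            return Temperature

        def decompose(self, obj, manager) -> Any:
            # a float is natively supported by MessagePack
            return round(obj.celsius, self.precision)

        def recompose(self, data: Any, manager) -> Temperature:
            return Temperature(data)


    # registered as an instance because the constructor takes an argument
    fobs.register(TemperatureDecomposer(precision=1))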
+
+A decomposer can either serialize the class into bytes or decompose it into objects of
+serializable types. In most cases, it only involves saving members as a list and reconstructing
+the object from the list.
+
+MessagePack can't handle items larger than 4GB in a dict. To work around this issue, FOBS can externalize
+the large item and store only a reference in the buffer. :code:`DatumManager` is used to handle the
+externalized data. For most objects, which don't deal with dict items larger than 4GB, the DatumManager
+is not needed.
+
+Here is an example of a simple decomposer. Even though :code:`datetime` is not supported
+by MessagePack, a decomposer is included in the `fobs` module, so there is no need to further decompose it.
+
+::
+
+    from datetime import datetime
+    from typing import Any, Type
+
+    from nvflare.fuel.utils import fobs
+
+
+    class Simple:
+
+        def __init__(self, num: int, name: str, timestamp: datetime):
+            self.num = num
+            self.name = name
+            self.timestamp = timestamp
+
+
+    class SimpleDecomposer(fobs.Decomposer):
+
+        def supported_type(self) -> Type[Any]:
+            return Simple
+
+        def decompose(self, obj, manager) -> Any:
+            return [obj.num, obj.name, obj.timestamp]
+
+        def recompose(self, data: Any, manager) -> Simple:
+            return Simple(data[0], data[1], data[2])
+
+
+    fobs.register(SimpleDecomposer)
+    data = fobs.dumps(Simple(1, 'foo', datetime.now()))
+    obj = fobs.loads(data)
+    assert obj.num == 1
+    assert obj.name == 'foo'
+    assert isinstance(obj.timestamp, datetime)
+
+
+The same decomposer can be registered multiple times. Only the first one takes effect; the others
+are ignored with a warning message.
+
+Note that ``fobs_initialize()`` may need to be called if decomposers are not registered.
+
+Enum Types
+----------
+
+All classes derived from :code:`Enum` are automatically handled by the default enum decomposer,
+which is already registered for you.
+This means you don't need to manually configure anything for these enums;
+they come with built-in support for serialization and deserialization.
+
+In rare cases where a class derived from :code:`Enum`
+is too complex for the generic decomposer to handle,
+you can write and register a special decomposer.
+This will prevent FOBS from using the generic decomposer for that class.
+
+Dataclass Types
+---------------
+
+All dataclasses are automatically handled by the default dataclass decomposer,
+which is already registered for you.
+This means you don't need to manually configure anything for those classes.
+
+An example of a dataclass:
+
+.. code-block:: python
+
+    from dataclasses import dataclass
+
+    @dataclass
+    class Student:
+        name: str
+        height: int
+
+Custom Types
+------------
+
+To support custom types with FOBS, the decomposers for the types must be included
+with the custom code and registered.
+
+The decomposers must be registered in both server and client code before FOBS is used.
+A good place for registration is the constructors for controllers and executors. It
+can also be done in the ``START_RUN`` event handler.
+
+A custom object cannot be put in a ``shareable`` directly;
+it must be serialized using FOBS first. Assuming ``custom_data`` contains a custom type,
+this is how the data can be stored in a shareable:
+::
+    shareable[CUSTOM_DATA] = fobs.dumps(custom_data)
+On the receiving end:
+::
+    custom_data = fobs.loads(shareable[CUSTOM_DATA])
+
+This does not work:
+::
+    shareable[CUSTOM_DATA] = custom_data
+
+
+When using custom types with FOBS,
+please place each custom type, such as class ``CustomType``, in its own file within the custom folder of the app directory.
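To make the custom-type guidance above concrete, here is a minimal hedged sketch; the ``CustomType`` class, its decomposer, the executor, and the ``CUSTOM_DATA`` key are illustrative assumptions rather than NVFLARE-provided names:

.. code-block:: python

    from typing import Any, Type

    from nvflare.apis.executor import Executor
    from nvflare.apis.fl_context import FLContext
    from nvflare.apis.shareable import Shareable
    from nvflare.apis.signal import Signal
    from nvflare.fuel.utils import fobs

    CUSTOM_DATA = "custom_data"


    class CustomType:
        def __init__(self, scores: dict):
            self.scores = scores


    class CustomTypeDecomposer(fobs.Decomposer):
        def supported_type(self) -> Type[Any]:
            return CustomType

        def decompose(self, obj, manager) -> Any:
            return obj.scores

        def recompose(self, data: Any, manager) -> CustomType:
            return CustomType(data)


    class MyExecutor(Executor):
        def __init__(self):
            super().__init__()
            # register the decomposer before FOBS is used on this site;
            # the same registration must also happen in the server-side code
            fobs.register(CustomTypeDecomposer)

        def execute(self, task_name: str, shareable: Shareable, fl_ctx: FLContext, abort_signal: Signal) -> Shareable:
            result = Shareable()
            custom_data = CustomType({"site_score": 0.9})
            # a custom object must be serialized with FOBS before it goes into a Shareable
            result[CUSTOM_DATA] = fobs.dumps(custom_data)
            return result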
diff --git a/docs/whats_new.rst b/docs/whats_new.rst index 228cc25b8a..845d108182 100644 --- a/docs/whats_new.rst +++ b/docs/whats_new.rst @@ -1,6 +1,10 @@ .. _whats_new: -.. include:: release_notes/flare_240.rst +########## +What's New +########## + +.. include:: release_notes/flare_250.rst ************************** Previous Releases of FLARE @@ -9,6 +13,7 @@ Previous Releases of FLARE .. toctree:: :maxdepth: 1 + release_notes/flare_240 release_notes/flare_230 release_notes/flare_220 release_notes/flare_210 diff --git a/examples/README.md b/examples/README.md index 3f615a4460..d1827fb822 100644 --- a/examples/README.md +++ b/examples/README.md @@ -3,7 +3,7 @@ [NVIDIA FLARE](https://nvflare.readthedocs.io/en/main/index.html) provides several examples to help you get started using federated learning for your own applications. The provided examples cover different aspects of [NVIDIA FLARE](https://nvflare.readthedocs.io/en/main/index.html), -such as using the provided [Controllers](https://nvflare.readthedocs.io/en/main/programming_guide/controllers.html) +such as using the provided [Controllers](https://nvflare.readthedocs.io/en/main/programming_guide/workflows_and_controllers.html) for "scatter and gather" or "cyclic weight transfer" workflows and different [Executors](https://nvflare.readthedocs.io/en/main/apidocs/nvflare.apis.executor.html) to implement your own training and validation pipelines. @@ -76,26 +76,47 @@ When you open a notebook, select the kernel `nvflare_example` using the dropdown | Example | Framework | Summary | |----------------------------------------------------------------------------------------------------------------------------------------|--------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------| | [Notebook for Hello Examples](./hello-world/hello_world.ipynb) | - | Notebook for examples below. | -| [Hello Scatter and Gather](./hello-world/hello-numpy-sag/README.md) | Numpy | Example using [ScatterAndGather](https://nvflare.readthedocs.io/en/main/apidocs/nvflare.app_common.workflows.scatter_and_gather.html) controller workflow. | -| [Hello Cross-Site Validation](./hello-world/hello-numpy-cross-val/README.md) | Numpy | Example using [CrossSiteModelEval](https://nvflare.readthedocs.io/en/main/apidocs/nvflare.app_common.workflows.cross_site_model_eval.html) controller workflow. | +| [Hello FedAvg NumPy](./hello-world/hello-fedavg-numpy/README.md) | Numpy | Example using [FedAvg](https://nvflare.readthedocs.io/en/main/apidocs/nvflare.app_common.workflows.fedavg.html) controller workflow. | +| [Hello Cross-Site Validation](./hello-world/hello-cross-val/README.md) | Numpy | Example using [CrossSiteEval](https://nvflare.readthedocs.io/en/main/apidocs/nvflare.app_common.workflows.cross_site_eval.html) controller workflow, and example using previous results without training workflow. | | [Hello Cyclic Weight Transfer](./hello-world/hello-cyclic/README.md) | PyTorch | Example using [CyclicController](https://nvflare.readthedocs.io/en/main/apidocs/nvflare.app_common.workflows.cyclic_ctl.html) controller workflow to implement [Cyclic Weight Transfer](https://pubmed.ncbi.nlm.nih.gov/29617797/). | | [Hello PyTorch](./hello-world/hello-pt/README.md) | PyTorch | Example using an image classifier using [FedAvg](https://arxiv.org/abs/1602.05629) and [PyTorch](https://pytorch.org/) as the deep learning training framework. 
|
-| [Hello TensorFlow](./hello-world/hello-tf2/README.md) | TensorFlow2 | Example of using an image classifier using [FedAvg](https://arxiv.org/abs/1602.05629) and [TensorFlow](https://tensorflow.org/) as the deep learning training framework. |
-
-## 2. Tutorial notebooks
+| [Hello TensorFlow](./hello-world/hello-tf/README.md) | TensorFlow | Example of using an image classifier using [FedAvg](https://arxiv.org/abs/1602.05629) and [TensorFlow](https://tensorflow.org/) as the deep learning training framework. |
+
+## 2. Step-by-Step Examples
+| Example | Dataset | Controller-Type | Execution API Type | Framework | Summary |
+|---------|---------|-----------------|-----------------|-----------|---------|
+| [image_stats](./hello-world/step-by-step/cifar10/stats/image_stats.ipynb) | CIFAR10 | server | Executor | Pandas | Example for federated stats image histogram calculation. |
+| [sag](./hello-world/step-by-step/cifar10/sag/sag.ipynb) | CIFAR10 | server | Client API | PyTorch | Example for FedAvg with [ScatterAndGather](https://nvflare.readthedocs.io/en/main/apidocs/nvflare.app_common.workflows.scatter_and_gather.html) controller workflow using the Client API. |
+| [sag_deploy_map](./hello-world/step-by-step/cifar10/sag_deploy_map/sag_deploy_map.ipynb) | CIFAR10 | server | Client API | PyTorch | Example showcasing site-specific configurations and deploy_map. |
+| [sag_model_learner](./hello-world/step-by-step/cifar10/sag_model_learner/sag_model_learner.ipynb) | CIFAR10 | server | ModelLearner | PyTorch | Example with [ScatterAndGather](https://nvflare.readthedocs.io/en/main/apidocs/nvflare.app_common.workflows.scatter_and_gather.html) using a ModelLearner. |
+| [sag_executor](./hello-world/step-by-step/cifar10/sag_executor/sag_executor.ipynb) | CIFAR10 | server | Executor | PyTorch | Example with [ScatterAndGather](https://nvflare.readthedocs.io/en/main/apidocs/nvflare.app_common.workflows.scatter_and_gather.html) using an Executor. |
+| [sag_mlflow](./hello-world/step-by-step/cifar10/sag_mlflow/sag_mlflow.ipynb) | CIFAR10 | server | Client API | PyTorch | MLflow experiment tracking logs with [ScatterAndGather](https://nvflare.readthedocs.io/en/main/apidocs/nvflare.app_common.workflows.scatter_and_gather.html) using the Client API. |
+| [sag_he](./hello-world/step-by-step/cifar10/sag_he/sag_he.ipynb) | CIFAR10 | server | Client API | PyTorch | Example with homomorphic encryption using Client API and POC -he mode. |
+| [cse](./hello-world/step-by-step/cifar10/cse/cse.ipynb) | CIFAR10 | server | Client API | PyTorch | Example using [CrossSiteModelEval](https://nvflare.readthedocs.io/en/main/apidocs/nvflare.app_common.workflows.cross_site_model_eval.html) controller workflow. |
+| [cyclic](./hello-world/step-by-step/cifar10/cyclic/cyclic.ipynb) | CIFAR10 | server | Client API | PyTorch | Example for cyclic weight transfer using [CyclicController](https://nvflare.readthedocs.io/en/main/apidocs/nvflare.app_common.workflows.cyclic_ctl.html) controller workflow. |
+| [cyclic_ccwf](./hello-world/step-by-step/cifar10/cyclic_ccwf/cyclic_ccwf.ipynb) | CIFAR10 | client | Client API | PyTorch | Example for client-controlled cyclic weight transfer using [CyclicClientController](https://nvflare.readthedocs.io/en/main/apidocs/nvflare.app_common.ccwf.cyclic_client_ctl.html) controller workflow.
| +| [swarm](./hello-world/step-by-step/cifar10/swarm/swarm.ipynb) | CIFAR10 | client | Client API | PyTorch | Example for swarm learning and client-controlled cross-site evaluation using [SwarmClientController](https://nvflare.readthedocs.io/en/main/apidocs/nvflare.app_common.ccwf.swarm_client_ctl.html) and [CrossSiteEvalClientController](https://nvflare.readthedocs.io/en/main/apidocs/nvflare.app_common.ccwf.cse_client_ctl.html) controller workflows. | +| [tabular_stats](./hello-world/step-by-step/higgs/stats/tabular_stats.ipynb) | HIGGS | server | Executor |Pandas | Example for federated stats tabular histogram calculation. | +| [sklearn_linear](./hello-world/step-by-step/higgs/sklearn-linear/sklearn_linear.ipynb) | HIGGS | server | Client API |sklearn | Example for federated linear model (logistic regression on binary classification) learning on tabular data. | +| [sklearn_svm](./hello-world/step-by-step/higgs/sklearn-svm/sklearn_svm.ipynb) | HIGGS | server | Client API | sklearn | Example for federated SVM model learning on tabular data. | +| [sklearn_kmeans](./hello-world/step-by-step/higgs/sklearn-kmeans/sklearn_kmeans.ipynb) | HIGGS | server | Client API |sklearn | Example for federated k-Means clustering on tabular data. | +| [xgboost](./hello-world/step-by-step/higgs/xgboost/xgboost_horizontal.ipynb) | HIGGS | server | Client API |XGBoost | Example for federated horizontal xgboost learning on tabular data with bagging collaboration. | + +## 3. Tutorial notebooks | Example | Summary | |----------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------| | [Intro to the FL Simulator](./tutorials/flare_simulator.ipynb) | Shows how to use the FLARE Simulator to run a local simulation. | -| [Hello FLARE API](./tutorials/flare_api.ipynb) | Goes through the different commnads of the FLARE API. | -| [NVFLARE in POC Mode](./tutorials/setup_poc.ipynb) | Shows how to use POC mode. | +| [Hello FLARE API](./tutorials/flare_api.ipynb) | Goes through the different commands of the FLARE API. | +| [NVFLARE in POC Mode](./tutorials/setup_poc.ipynb) | Shows how to use POC mode. | +| [Job CLI](./tutorials/job_cli.ipynb) | Walks through the different commands of the Job CLI. | -## 3. FL algorithms +## 4. FL algorithms | Example | Summary | |------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | [Simulated Federated Learning with CIFAR-10](./advanced/cifar10/cifar10-sim/README.md) | This example includes instructions on running [FedAvg](https://arxiv.org/abs/1602.05629), [FedProx](https://arxiv.org/abs/1812.06127), [FedOpt](https://arxiv.org/abs/2003.00295), and [SCAFFOLD](https://arxiv.org/abs/1910.06378) algorithms using NVFlare's FL simulator. 
| | [Real-world Federated Learning with CIFAR-10](./advanced/cifar10/cifar10-real-world/README.md) | Includes instructions on running [FedAvg](https://arxiv.org/abs/1602.05629) with streaming of TensorBoard metrics to the server during training and [homomorphic encryption](https://developer.nvidia.com/blog/federated-learning-with-homomorphic-encryption/). | -## 4. Traditional ML examples +## 5. Traditional ML examples | Example | Framework | Notebooks | Summary | |----------------------------------------------------------------------------------------------------------------------------------------|-------------------|-----------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------| | [Federated Linear Model with Scikit-learn](./advanced/sklearn-linear/README.md) | scikit-learn | [FL Model with Scikit-learn on HIGGS Dataset](./advanced/sklearn-linear/sklearn_linear_higgs.ipynb) | Shows how to use the NVIDIA FLARE with [scikit-learn](https://scikit-learn.org/), a widely used open-source machine learning library. | @@ -104,39 +125,69 @@ When you open a notebook, select the kernel `nvflare_example` using the dropdown | [Histogram-based FL for XGBoost](./advanced/xgboost/histogram-based/README.md) | XGBoost | [Histogram-based FL for XGBoost on HIGGS Dataset](./advanced/xgboost/histogram-based/xgboost_histogram_higgs.ipynb) | Histogram-based algorithm for XGBoost | | [Tree-based Federated Learning for XGBoost](./advanced/xgboost/tree-based/README.md) | XGBoost | [Tree-based FL for XGBoost on HIGGS Dataset](./advanced/xgboost/tree-based/README.md) | Tree-based algorithms includes [bagging](./advanced/xgboost/tree-based/jobs/bagging_base) and [cyclic](./advanced/xgboost/tree-based/jobs/cyclic_base) approaches. | | [Federated Learning for Random Forest based on XGBoost](./advanced/random_forest/README.md) | XGBoost | [Federated Random Forest on HIGGS Dataset](./advanced/random_forest/random_forest.ipynb) | Example of using NVIDIA FLARE with [scikit-learn](https://scikit-learn.org/) and Random Forest. | +| [Federated Vertical XGBoost](./advanced/vertical_xgboost) | XGBoost | [Federated Vertical XGBoost](./advanced/vertical_xgboost/README.md) | Example using Private Set Intersection and XGBoost on vertically split HIGGS data. | -## 5. Medical Image Analysis +## 6. Medical Image Analysis | Example | Framework | Summary | |----------------------------------------------------------------------------------------------------------------------------------------|--------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------| | [NVFlare + MONAI integration](../integration/monai/README.md) | MONAI | For an example of using NVIDIA FLARE to train a 3D medical image analysis model using federated averaging (FedAvg) and MONAI Bundle, see [here](../integration/monai/examples/README.md). | | [Federated Learning with Differential Privacy for BraTS18 segmentation](./advanced/brats18/README.md) | MONAI | Illustrates the use of differential privacy for training brain tumor segmentation models using federated learning. 
| | [Federated Learning for Prostate Segmentation from Multi-source Data](./advanced/prostate/README.md) | MONAI | Example of training a multi-institutional prostate segmentation model using [FedAvg](https://arxiv.org/abs/1602.05629), [FedProx](https://arxiv.org/abs/1812.06127), and [Ditto](https://arxiv.org/abs/2012.04221). | -## 6. Federated Statistics +## 7. Federated Statistics [Federated Statistics Overview](./advanced/federated-statistics/README.md) discusses the overall federated statistics features. | Example | Notebooks | Summary | |----------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------| | [Federated Statistics for Images](./advanced/federated-statistics/image_stats/README.md) | [Image Histograms](./advanced/federated-statistics/image_stats.ipynb) | Example of gathering local image histogram to compute the global dataset histograms. | | [Federated Statistics for DataFrame](./advanced/federated-statistics/df_stats/README.md) | [Data Frame Federated Statistics](./advanced/federated-statistics/df_stats.ipynb), [Visualization](./advanced/federated-statistics/df_stats/demo/visualization.ipynb) | Example of gathering local statistics summary from Pandas DataFrame to compute the global dataset statistics. | +| [Federated Hierarchical Statistics for DataFrame](./advanced/federated-statistics/hierarchical_stats/README.md) | [Federated Hierarchical Statistics](./advanced/federated-statistics/hierarchical_stats/hierarchical_stats.ipynb), [Visualization](./advanced/federated-statistics/hierarchical_stats/demo/visualization.ipynb) | Example of generating federated hierarchical statistics for data that can be represented as Pandas DataFrame. | -## 7. Federated Policies +## 8. Federated Policies | Example | Summary | |----------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------| -| [Federated Policies](./advanced/federated-policies/README.rst) | Discuss the federated site policies for authorization, resource and data privacy management. | +| [Federated Policies](./advanced/federated-policies/README.rst) | Discuss the federated site policies for authorization, resource and data privacy management. +| [Custom Authentication](./advanced/custom_authentication/README.rst) | Demonstrate the custom authentication policy and secure mode. +| [Job-Level Authorization](./advanced/job-level-authorization/README.md) | Demonstrate the job-level authorization policy and secure mode. +| [KeyCloak Site Authentication Integration](./advanced/keycloak-site-authentication/README.md) | Demonstrate KeyCloak integration for supporting site-specific authentication. -## 8. Experiment tracking +## 9. 
Experiment tracking

| Example | Framework | Summary |
|----------------------------------------------------------------------------------------------------------------------------------------|--------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------|
| [Hello PyTorch with TensorBoard Streaming](./advanced/experiment-tracking/tensorboard/README.md) | PyTorch | Example building upon [Hello PyTorch](./hello-world/hello-pt/README.md) showcasing the [TensorBoard](https://tensorflow.org/tensorboard) streaming capability from the clients to the server. |
+| [FL Experiment Tracking with MLflow](./advanced/experiment-tracking/mlflow/README.md) | PyTorch | Example integrating [Hello PyTorch](./hello-world/hello-pt/README.md) with MLflow streaming capability from clients to the server. |
+| [FL Experiment Tracking with Weights and Biases](./advanced/experiment-tracking/wandb/README.md) | PyTorch | Example integrating [Hello PyTorch](./hello-world/hello-pt/README.md) with Weights and Biases streaming capability from clients to the server. |
+| [MONAI FLARE Integration Experiment Tracking](../integration/monai/examples/spleen_ct_segmentation_local/README.md#51-experiment-tracking-with-mlflow) | MONAI | Example using FLARE and MONAI integration with experiment tracking streaming from clients to server. |

-## 9. NLP
+## 10. NLP

| Example | Summary |
|---------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------|
| [NLP-NER](./advanced/nlp-ner/README.md) | Illustrates both [BERT](https://github.com/google-research/bert) and [GPT-2](https://github.com/openai/gpt-2) models from [Hugging Face](https://huggingface.co/) ([BERT-base-uncased](https://huggingface.co/bert-base-uncased), [GPT-2](https://huggingface.co/gpt2)) on a Named Entity Recognition (NER) task using the [NCBI disease dataset](https://pubmed.ncbi.nlm.nih.gov/24393765/). |

-## 10. Federated Learning Hub
+## 11. Federated Learning Hub

| Example | Framework | Summary |
|---------------------------------------|----------------|--------------------------------------------------------------------------------------------------------------------------|
| [FL Hub](./advanced/fl_hub/README.md) | PyTorch/MONAI | Allow hierarchical interaction between several levels of nvflare FL systems, e.g. Tier-1 (hub) and Tier-2 (sub-systems). |
+
+## 12. Federated Large Language Model (LLM)
+
+| Example | Framework | Summary |
+|---------------------------------------|----------------|--------------------------------------------------------------------------------------------------------------------------|
+| [Parameter Efficient Fine Tuning](../integration/nemo/examples/peft) | NeMo | Example utilizing NeMo's PEFT methods to adapt an LLM to a downstream task. |
+| [Prompt-Tuning Example](../integration/nemo/examples/prompt_learning) | NeMo | Example for using FLARE with NeMo for prompt learning. |
+| [Supervised Fine Tuning (SFT)](../integration/nemo/examples/supervised_fine_tuning) | NeMo | Example to fine-tune all parameters of an LLM on supervised data. |
+| [LLM Tuning via HuggingFace SFT Trainer](./advanced/llm_hf) | NeMo | Example for using FLARE with a HuggingFace trainer for LLM tuning tasks. |
+
+## 13. 
Graph Neural Network (GNN)
+
+| Example | Framework | Summary |
+|---------------------------------------|----------------|--------------------------------------------------------------------------------------------------------------------------|
+| [Protein Classification](./advanced/gnn#federated-gnn-on-graph-dataset-using-inductive-learning) | PyTorch Geometric | Example using GNNs for Protein Classification on the [PPI](http://snap.stanford.edu/graphsage/#code) dataset with GraphSAGE. |
+| [Financial Transaction Classification](./advanced/gnn#federated-gnn-on-graph-dataset-using-inductive-learning) | PyTorch Geometric | Example using GNNs for Financial Transaction Classification on the [Elliptic++](https://github.com/git-disl/EllipticPlusPlus) dataset with GraphSAGE. |
+
+## 14. Financial Applications
+
+| Example | Framework | Summary |
+|---------------------------------------|----------------|--------------------------------------------------------------------------------------------------------------------------|
+| [Financial Application with Federated XGBoost Methods](./advanced/finance) | XGBoost | Example using XGBoost in various ways to train a federated model to perform fraud detection with a finance dataset. |
diff --git a/examples/advanced/README.md b/examples/advanced/README.md
index fc4acbb7ef..ebe97ab801 100644
--- a/examples/advanced/README.md
+++ b/examples/advanced/README.md
@@ -57,6 +57,8 @@ Please also install "./requirements.txt" in each example folder.
   * Example of gathering local image histogram to compute the global dataset histograms.
 * [Federated Statistics for DataFrame](./federated-statistics/df_stats/README.md)
   * Example of gathering local statistics summary from Pandas DataFrame to compute the global dataset statistics.
+* [Federated Hierarchical Statistics](./federated-statistics/hierarchical_stats/README.md)
+  * Example of generating federated hierarchical statistics for data that can be represented as Pandas DataFrame.

 ## Federated Policies
 * [Federated Policies](./federated-policies/README.rst)
diff --git a/examples/advanced/bionemo/README.md b/examples/advanced/bionemo/README.md
new file mode 100644
index 0000000000..eb9366ff62
--- /dev/null
+++ b/examples/advanced/bionemo/README.md
@@ -0,0 +1,28 @@
+# BioNeMo
+
+[BioNeMo](https://www.nvidia.com/en-us/clara/bionemo/) is NVIDIA's generative AI platform for drug discovery.
+
+This directory contains examples of running BioNeMo in a federated learning environment using [NVFlare](https://github.com/NVIDIA/NVFlare).
+
+## Notebooks
+
+In this repo you will find two notebooks under the `task_fitting` and `downstream` folders respectively:
+1. The [task_fitting](./task_fitting/task_fitting.ipynb) notebook shows how to obtain protein learned representations in the form of embeddings using the ESM-1nv pre-trained model.
+The model is trained with NVIDIA's BioNeMo framework for Large Language Model training and inference.
+2. The [downstream](./downstream/downstream_nvflare.ipynb) notebook shows three different downstream tasks for fine-tuning a BioNeMo ESM-style model.
+
+## Requirements
+
+Download and run the [BioNeMo docker container](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/clara/containers/bionemo-framework).
+> **Note:** The examples here were tested with `nvcr.io/nvidia/clara/bionemo-framework:1.8` + +We recommend following the [Quickstart Guide](https://docs.nvidia.com/bionemo-framework/latest/access-startup.html?highlight=docker) +on how to get the BioNeMo container. + +Start the container and Jupyter Lab to run the NVFlare experiments with NVFlare using +```commandline +./start_bionemo.sh +``` +It will start Jupyter Lab at `http://hostname:8888`. + +For information about how to get started with BioNeMo refer to the [documentation](https://docs.nvidia.com/bionemo-framework/latest). \ No newline at end of file diff --git a/examples/advanced/bionemo/downstream/downstream_nvflare.ipynb b/examples/advanced/bionemo/downstream/downstream_nvflare.ipynb new file mode 100644 index 0000000000..efd6109bcc --- /dev/null +++ b/examples/advanced/bionemo/downstream/downstream_nvflare.ipynb @@ -0,0 +1,348 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Federated Protein Downstream Fine-tuning\n", + "\n", + "
NOTE This notebook was tested on a single A1000 GPU and is compatible with BioNeMo Framework v1.8. To leverage additional or higher-performance GPUs, you can modify the configuration files and simulation script to accommodate multiple devices and increase thread utilization respectively.
\n", + "\n", + "The example datasets used here are made available by [Therapeutics Data Commons](https://tdcommons.ai/) through PyTDC.\n", + "\n", + "This example shows three different downstream tasks for fine-tuning a BioNeMo ESM-style model on different datasets.\n", + "We separate the scripts and job configurations into three folders based on the dataset names:\n", + "\n", + "\n", + "1. `tap`: therapeutic antibody profiling\"\n", + "2. `sabdab`: SAbDab: the structural antibody database\"\n", + "3. `scl`: \"subcellular location prediction\"\n", + "\n", + "## Setup\n", + "\n", + "Ensure that you have read through the Getting Started section, can run the BioNeMo Framework docker container, and have configured the NGC Command Line Interface (CLI) within the container. It is assumed that this notebook is being executed from within the container.\n", + "\n", + "
NOTE Some of the cells below generate long text output. We're using
%%capture --no-display --no-stderr cell_output
to suppress this output. Comment or delete this line in the cells below to restore full output.
\n", + "\n", + "### Import and install all required packages" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "%%capture --no-display --no-stderr cell_output\n", + "! pip install PyTDC\n", + "! pip install nvflare~=2.5.1\n", + "! pip install biopython\n", + "! pip install scikit-learn\n", + "! pip install matplotlib\n", + "! pip install protobuf==3.20\n", + "! pip install huggingface-hub==0.22.0\n", + "\n", + "import os\n", + "import warnings\n", + "\n", + "\n", + "warnings.filterwarnings('ignore')\n", + "warnings.simplefilter('ignore')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Home Directory" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "bionemo_home = \"/workspace/bionemo\"\n", + "os.environ['BIONEMO_HOME'] = bionemo_home" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Download Model Checkpoints\n", + "\n", + "In order to download pretrained models from the NGC registry, **please ensure that you have installed and configured the NGC CLI**, check the [Quickstart Guide](https://docs.nvidia.com/bionemo-framework/latest) for more info. The following code will download the pretrained model `esm2nv_650M_converted.nemo` from the NGC registry." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Define the NGC CLI API KEY and ORG for the model download\n", + "# If these variables are not already set in the container, uncomment below\n", + "# to define and set with your API KEY and ORG\n", + "# api_key = \n", + "# ngc_cli_org = \n", + "# # Update the environment variable\n", + "# os.environ['NGC_CLI_API_KEY'] = api_key\n", + "# os.environ['NGC_CLI_ORG'] = ngc_cli_org\n", + "\n", + "# Set variables and paths for model and checkpoint\n", + "model_name = \"esm2nv_650m\" # \"esm1nv\" \n", + "actual_checkpoint_name = \"esm2nv_650M_converted.nemo\" # \"esm1nv.nemo\"\n", + "model_path = os.path.join(bionemo_home, 'models')\n", + "checkpoint_path = os.path.join(model_path, actual_checkpoint_name)\n", + "os.environ['MODEL_PATH'] = model_path" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%capture --no-display --no-stderr cell_output\n", + "if not os.path.exists(checkpoint_path):\n", + " !cd /workspace/bionemo && \\\n", + " python download_artifacts.py --model_dir models --models {model_name}\n", + "else:\n", + " print(f\"Model {model_name} already exists at {model_path}.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Again for esm1nv: " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model_name = \"esm1nv\"\n", + "actual_checkpoint_name = \"esm1nv.nemo\"\n", + "model_path = os.path.join(bionemo_home, 'models')\n", + "checkpoint_path = os.path.join(model_path, actual_checkpoint_name)\n", + "os.environ['MODEL_PATH'] = model_path" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%capture --no-display --no-stderr cell_output\n", + "if not os.path.exists(checkpoint_path):\n", + " !cd /workspace/bionemo && \\\n", + " python download_artifacts.py --model_dir models --models {model_name}\n", + "else:\n", + " print(f\"Model {model_name} already exists at {model_path}.\")" + ] + }, + { + "cell_type": 
"markdown", + "metadata": {}, + "source": [ + "### Task 1: Cross-endpoint multi-task fitting\n", + "\n", + "#### Data: Five computational developability guidelines for therapeutic antibody profiling\n", + "See https://tdcommons.ai/single_pred_tasks/develop/#tap\n", + "- 241 Antibodies (both chains)\n", + "\n", + "#### Task Description: *Regression*. \n", + "Given the antibody's heavy chain and light chain sequence, predict its developability. The input X is a list of two sequences where the first is the heavy chain and the second light chain.\n", + "\n", + "Includes five metrics measuring developability of an antibody: \n", + " - Complementarity-determining regions (CDR) length - Trivial (excluded)\n", + " - patches of surface hydrophobicity (PSH)\n", + " - patches of positive charge (PPC)\n", + " - patches of negative charge (PNC)\n", + " - structural Fv charge symmetry parameter (SFvCSP)\n", + "\n", + "In the data preparation script, one can choose between uniform sampling of the data among clients and\n", + "heterogeneous data splits using a Dirichlet sampling strategy. \n", + "Here, different values of alpha control the level of heterogeneity. Below, we show a Dirichlet sampling of `alpha=1`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "! cd /bionemo_nvflare_examples/downstream/tap && python prepare_tap_data.py" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "| Uniform sampling | Dirichlet sampling |\n", + "|:-------------------------------------------------------------------------------:|:-----------------------------------------------------------------------------------------:|\n", + "| \"Uniform | \"Dirichlet |\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Run training (central, local, & FL)**\n", + "\n", + "You can change the FL job that's going to be simulated inside the `run_sim_tap.py` script." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "! cd /bionemo_nvflare_examples/downstream/tap && python run_sim_tap.py" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Task 2: Cross-compound task fitting\n", + "\n", + "#### Data: Predicting Antibody Developability from Sequence using Machine Learning\n", + "See https://tdcommons.ai/single_pred_tasks/develop/#sabdab-chen-et-al\n", + "- 2,409 Antibodies (both chains)\n", + "\n", + "#### Task Description: *Binary classification*. \n", + "Given the antibody's heavy chain and light chain sequence, predict its developability. The input X is a list of two sequences where the first is the heavy chain and the second light chain." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + " # you may need to fix these paths to your own scripts\n", + "! 
cd /bionemo_nvflare_examples/downstream/sabdab && python prepare_sabdab_data.py" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Again, we are using the Dirichlet sampling strategy to generate heterogeneous data distributions among clients.\n", + "Lower values of `alpha` generate higher levels of heterogeneity.\n", + "\n", + "| Alpha 10.0 | Alpha 1.0 |\n", + "|:-------------------------------------------------------------------------------------------------:|:-----------------------------------------------------------------------------------------------:|\n", + "| \"Dirichlet | \"Dirichlet |\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [] + }, + "source": [ + "**Run training (central, local, & FL)**\n", + "\n", + "You can change the FL job that's going to be simulated inside the `run_sim_sabdab.py` script." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "! cd /bionemo_nvflare_examples/downstream/sabdab && python run_sim_sabdab.py" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Results with heterogeneous data sampling (alpha=10.0)\n", + "| Setting | Accuracy |\n", + "|:-------:|:---------:|\n", + "| Local | 0.821 |\n", + "| FL | **0.833** |\n", + "\n", + "#### Results with heterogeneous data sampling (alpha=1.0)\n", + "| Setting | Accuracy |\n", + "|:-------:|:---------:|\n", + "| Local | 0.813 |\n", + "| FL | **0.835** |\n", + "\n", + "### Task 3. Subcellular location prediction with ESM2nv 650M\n", + "Follow the data download and preparation in [task_fitting.ipynb](../task_fitting/task_fitting.ipynb).\n", + "\n", + "Here, we use a heterogeneous sampling with `alpha=1.0`.\n", + "\n", + "\"Dirichlet\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# for this to work run the task_fitting notebook first in ../nvflare_with_bionemo/task_fitting/task_fitting.ipynb\n", + "! 
cd /bionemo_nvflare_examples/downstream/scl && python run_sim_scl.py" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note, you can switch between local and FL jobs by modifying the `run_sim_scl.py` script.\n", + "\n", + "#### Results with heterogeneous data sampling (alpha=10.0)\n", + "| Setting | Accuracy |\n", + "|:-------:|:---------:|\n", + "| Local | 0.773 |\n", + "| FL | **0.776** |\n", + "\n", + "\n", + "\"Dirichlet" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/examples/advanced/bionemo/downstream/sabdab/figs/sabdab_alpha1.0.svg b/examples/advanced/bionemo/downstream/sabdab/figs/sabdab_alpha1.0.svg new file mode 100644 index 0000000000..27635d1bb6 --- /dev/null +++ b/examples/advanced/bionemo/downstream/sabdab/figs/sabdab_alpha1.0.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/examples/advanced/bionemo/downstream/sabdab/figs/sabdab_alpha10.0.svg b/examples/advanced/bionemo/downstream/sabdab/figs/sabdab_alpha10.0.svg new file mode 100644 index 0000000000..ff077f9222 --- /dev/null +++ b/examples/advanced/bionemo/downstream/sabdab/figs/sabdab_alpha10.0.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/examples/advanced/bionemo/downstream/sabdab/jobs/central_sabdab_esm1nv/app/config/config_fed_client.conf b/examples/advanced/bionemo/downstream/sabdab/jobs/central_sabdab_esm1nv/app/config/config_fed_client.conf new file mode 100644 index 0000000000..95a31d24c0 --- /dev/null +++ b/examples/advanced/bionemo/downstream/sabdab/jobs/central_sabdab_esm1nv/app/config/config_fed_client.conf @@ -0,0 +1,94 @@ +{ + # version of the configuration + format_version = 2 + + # This is the application script which will be invoked. Client can replace this script with user's own training script. + app_script = "downstream_flip.py" + + # Additional arguments needed by the training code. For example, in lightning, these can be --trainer.batch_size=xxx. + app_config = "" + + # Additional arguments needed by DDP. + #ddp_config = "--nnodes=1 --nproc_per_node=1 --master_port=7777" + + # Client Computing Executors. + executors = [ + { + # tasks the executors are defined to handle + tasks = ["train"] + + # This particular executor + executor { + + # This is an executor for pytorch + Client API. The underline data exchange is using Pipe. + path = "nvflare.app_opt.pt.client_api_launcher_executor.PTClientAPILauncherExecutor" + + args { + # launcher_id is used to locate the Launcher object in "components" + launcher_id = "launcher" + + # pipe_id is used to locate the Pipe object in "components" + pipe_id = "pipe" + + # Timeout in seconds for waiting for a heartbeat from the training script. Defaults to 30 seconds. 
+ # Please refer to the class docstring for all available arguments + heartbeat_timeout = 60 + + # format of the exchange parameters + params_exchange_format = "pytorch" + + # if the transfer_type is FULL, then it will be sent directly + # if the transfer_type is DIFF, then we will calculate the + # difference VS received parameters and send the difference + params_transfer_type = "FULL" + + # if train_with_evaluation is true, the executor will expect + # the custom code need to send back both the trained parameters and the evaluation metric + # otherwise only trained parameters are expected + train_with_evaluation = false + } + } + } + ], + + # this defined an array of task data filters. If provided, it will control the data from server controller to client executor + task_data_filters = [] + + # this defined an array of task result filters. If provided, it will control the result from client executor to server controller + task_result_filters = [] + + components = [ + { + # component id is "launcher" + id = "launcher" + + # the class path of this component + path = "nvflare.app_common.launchers.subprocess_launcher.SubprocessLauncher" + + args { + # the launcher will invoke the script + #script = "python3 -m torch.distributed.run {ddp_config} custom/{app_script} {app_config} " + script = "python3 custom/{app_script} {app_config} " + # if launch_once is true, the SubprocessLauncher will launch once for the whole job + # if launch_once is false, the SubprocessLauncher will launch a process for each task it receives from server + launch_once = true + } + } + { + id = "pipe" + + path = "nvflare.fuel.utils.pipe.file_pipe.FilePipe" + + args { + # Mode of the endpoint. A pipe has two endpoints. + # An endpoint can be either the one that initiates communication or the one listening. + # PASSIVE is the one listening. + mode = "PASSIVE" + + # root_path: is the directory location of the parameters exchange. + # You can also set it to an absolute path in your system. + root_path = "{WORKSPACE}/{JOB_ID}/{SITE_NAME}" + } + } + ] +} diff --git a/examples/advanced/bionemo/downstream/sabdab/jobs/central_sabdab_esm1nv/app/config/config_fed_server.conf b/examples/advanced/bionemo/downstream/sabdab/jobs/central_sabdab_esm1nv/app/config/config_fed_server.conf new file mode 100644 index 0000000000..d9d4d7fe61 --- /dev/null +++ b/examples/advanced/bionemo/downstream/sabdab/jobs/central_sabdab_esm1nv/app/config/config_fed_server.conf @@ -0,0 +1,110 @@ +{ + # version of the configuration + format_version = 2 + + # task data filter: if filters are provided, the filter will filter the data flow out of server to client. + task_data_filters =[] + + # task result filter: if filters are provided, the filter will filter the result flow out of client to server. + task_result_filters = [] + + # This assumes that there will be a "net.py" file with class name "Net". + # If your model code is not in "net.py" and class name is not "Net", please modify here + #model_class_path = "nemo_nvflare.peft_model.PEFTmodel" + + # Location of pre-trained NeMo model file. + #restore_from_path = "/models/megatron_gpt_345m.nemo" + + # Location of pre-trained peft model file. + #peft_restore_from_path = null + + # workflows: Array of workflows the control the Federated Learning workflow lifecycle. + # One can specify multiple workflows. The NVFLARE will run them in the order specified. + workflows = [ + { + # 1st workflow" + id = "scatter_and_gather" + + # name = ScatterAndGather, path is the class path of the ScatterAndGather controller. 
+ path = "nvflare.app_common.workflows.scatter_and_gather.ScatterAndGather" + args { + # argument of the ScatterAndGather class. + # min number of clients required for ScatterAndGather controller to move to the next round + # during the workflow cycle. The controller will wait until the min_clients returned from clients + # before move to the next step. + min_clients = 1 + + # number of global round of the training. + num_rounds = 1 + + # starting round is 0-based + start_round = 0 + + # after received min number of clients' result, + # how much time should we wait further before move to the next step + wait_time_after_min_received = 0 + + # For ScatterAndGather, the server will aggregate the weights based on the client's result. + # the aggregator component id is named here. One can use the this ID to find the corresponding + # aggregator component listed below + # + aggregator_id = "aggregator" + + # The Scatter and Gather controller use an persistor to load the model and save the model. + # The persistent component can be identified by component ID specified here. + #persistor_id = "persistor" + + # Shareable to a communication message, i.e. shared between clients and server. + # Shareable generator is a component that responsible to take the model convert to/from this communication message: sharable. + # The component can be identified via "shareable_generator_id" + shareable_generator_id = "shareable_generator" + + # train task name: Client will start training once received such task. + train_task_name = "train" + + # train timeout in second. If zero, meaning no timeout. + train_timeout = 0 + } + } + ] + + # List of components used in the server side workflow. + components = [ + #{ + # This is the persistence component used in above workflow. + # PTFileModelPersistor is a Pytorch persistor which save/read the model to/from file. + + # id = "persistor" + # path = "nvflare.app_opt.pt.file_model_persistor.PTFileModelPersistor" + + # the persistor class take model class as argument + # This imply that the model is initialized from the server-side. + # The initialized model will be broadcast to all the clients to start the training. + # args.model.path = "{model_class_path}" + # args.model.args.restore_from_path = "{restore_from_path}" + # args.model.args.peft_restore_from_path = "{peft_restore_from_path}" + #}, + { + # This is the generator that convert the model to shareable communication message structure used in workflow + id = "shareable_generator" + path = "nvflare.app_common.shareablegenerators.full_model_shareable_generator.FullModelShareableGenerator" + args = {} + }, + { + # This is the aggregator that perform the weighted average aggregation. + # the aggregation is "in-time", so it doesn't wait for client results, but aggregates as soon as it received the data. + id = "aggregator" + path = "nvflare.app_common.aggregators.intime_accumulate_model_aggregator.InTimeAccumulateWeightedAggregator" + args.expected_data_kind = "WEIGHTS" + }, + { + # This component is not directly used in Workflow. + # it select the best model based on the incoming global validation metrics. 
+ id = "model_selector" + path = "nvflare.app_common.widgets.intime_model_selector.IntimeModelSelector" + # need to make sure this "key_metric" match what server side received + args.key_metric = "validation_exact_string_match" + } + ] + +} diff --git a/examples/advanced/bionemo/downstream/sabdab/jobs/central_sabdab_esm1nv/app/custom/base_config.yaml b/examples/advanced/bionemo/downstream/sabdab/jobs/central_sabdab_esm1nv/app/custom/base_config.yaml new file mode 100644 index 0000000000..5c1d4686ad --- /dev/null +++ b/examples/advanced/bionemo/downstream/sabdab/jobs/central_sabdab_esm1nv/app/custom/base_config.yaml @@ -0,0 +1,181 @@ +name: esm1nv +do_training: True # set to false if data preprocessing steps must be completed +do_testing: False # set to true to run evaluation on test data after training, requires test_dataset section +restore_from_path: null # used when starting from a .nemo file + +trainer: + devices: 1 # number of GPUs or CPUs + num_nodes: 1 + accelerator: gpu #gpu or cpu + precision: 16-mixed #16 or 32 + logger: False # logger is provided by NeMo exp_manager + enable_checkpointing: False # checkpointing is done by NeMo exp_manager + max_epochs: null # # use max_steps instead with NeMo Megatron model + max_steps: 1000000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches + log_every_n_steps: 1 # number of iterations between logging + val_check_interval: 1500 + limit_val_batches: 50 # number of batches in validation step, use fraction for fraction of data, 0 to disable + limit_test_batches: 500 # number of batches in test step, use fraction for fraction of data, 0 to disable + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 + benchmark: False + +exp_manager: + name: ${name} + exp_dir: ${.name}/${.wandb_logger_kwargs.name} + explicit_log_dir: ${.exp_dir} + create_wandb_logger: False + create_tensorboard_logger: True + wandb_logger_kwargs: + project: ${name}_pretraining + name: ${name}_pretraining + group: ${name} + job_type: Localhost_nodes_${trainer.num_nodes}_gpus_${trainer.devices} + notes: "date: ${now:%y%m%d-%H%M%S}" + tags: + - ${name} + offline: False # set to True if there are issues uploading to WandB during training + resume_if_exists: True # automatically resume if checkpoint exists + resume_ignore_no_checkpoint: True # leave as True, will start new training if resume_if_exists is True but no checkpoint exists + create_checkpoint_callback: False # Setting this to False so to avoid overwriting the model sent and received to the server + checkpoint_callback_params: + monitor: val_TARGET_accuracy + save_top_k: 1 # number of checkpoints to save + mode: max # use min or max of monitored metric to select best checkpoints + always_save_nemo: False # saves nemo file during validation, not implemented for model parallel + filename: 'esm1nv--{val_TARGET_accuracy:.4f}-{step}-{consumed_samples}' + model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}} + + +model: + # model parallelism + micro_batch_size: 8 # NOTE: adjust to occupy ~ 90% of GPU memory + tensor_model_parallel_size: 1 # model parallelism + pipeline_model_parallel_size: 1 + + # model architecture + seq_length: 512 # FIXME: remove me (replaced by encoder_seq_length) + max_position_embeddings: ${.seq_length} + encoder_seq_length: ${.seq_length} + num_layers: 6 + hidden_size: 768 + ffn_hidden_size: 3072 # Transformer FFN hidden size. Usually 4 * hidden_size. 
+ num_attention_heads: 12 + init_method_std: 0.02 # Standard deviation of the zero mean normal distribution used for weight initialization.') + hidden_dropout: 0.1 # 0.1 # Dropout probability for hidden state transformer. + kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null + apply_query_key_layer_scaling: True # scale Q * K^T by 1 / layer-number. + layernorm_epsilon: 1e-5 + make_vocab_size_divisible_by: 128 # Pad the vocab size to be divisible by this value for computation efficiency. + pre_process: True # add embedding + post_process: True # add pooler + bert_binary_head: False # BERT binary head + resume_from_checkpoint: null # manually set the checkpoint file to load from + masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with it's mask. + + tokenizer: + library: 'megatron' + type: 'BertWordPieceLowerCase' + model: null + vocab_file: null + merge_file: null + + # precision + native_amp_init_scale: 4294967296 # 2 ** 32 + native_amp_growth_interval: 1000 + fp32_residual_connection: False # Move residual connections to fp32 + fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16 + + + # miscellaneous + seed: 4 + use_cpu_initialization: False # Init weights on the CPU (slow for large model) + onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter. + + # not implemented in NeMo yet + activations_checkpoint_method: null # 'uniform', 'block' + activations_checkpoint_num_layers: 1 + + data: + ngc_registry_target: uniref50_2022_05 + ngc_registry_version: v23.06 + data_prefix: "" # must be null or "" + num_workers: 2 + dataloader_type: single # cyclic + reset_position_ids: False # Reset position ids after end-of-document token + reset_attention_mask: False # Reset attention mask after end-of-document token + eod_mask_loss: False # Mask loss for the end of document tokens + masked_lm_prob: 0.15 # Probability of replacing a token with mask. + short_seq_prob: 0.1 # Probability of producing a short sequence. + skip_lines: 0 + drop_last: False + pin_memory: False + index_mapping_dir: null # path to store cached indexing files (if empty, will be stored in the same directory as dataset_path) + data_impl: "csv_mmap" + # Supported kwargs (with default values): + # text_mmap (newline_int=10, header_lines=0, workers=None, sort_dataset_paths=True) + # csv_mmap (newline_int=10, header_lines=0,workers=None, sort_dataset_paths=True, data_col=1, data_sep=",") + data_impl_kwargs: + csv_mmap: + header_lines: 1 + newline_int: 10 # byte-value of newline + workers: ${model.data.num_workers} # number of workers when creating missing index files (null defaults to cpu_num // 2) + sort_dataset_paths: True # if True datasets will be sorted by name + data_sep: ',' # string to split text into columns + # column number of csv to take values from + data_col: 3 + use_upsampling: True # if the data should be upsampled to max number of steps in the training + seed: ${model.seed} # Random seed + max_seq_length: ${model.seq_length} # Maximum input sequence length. Longer sequences are truncated + dynamic_padding: False # If True, each batch is padded to the maximum sequence length within that batch. + # Set it to False when model.pipeline_model_parallel_size > 1, as pipeline parallelism requires fixed-length padding. 
+ + optim: + name: fused_adam # fused optimizers used by Megatron model + lr: 2e-4 + weight_decay: 0.01 + betas: + - 0.9 + - 0.98 + sched: + name: CosineAnnealing + warmup_steps: 500 # use to set warmup_steps explicitly or leave as null to calculate + constant_steps: 50000 + min_lr: 2e-5 + + dwnstr_task_validation: + enabled: False + dataset: + class: bionemo.model.core.dwnstr_task_callbacks.PerTokenPredictionCallback + task_type: token-level-classification + infer_target: bionemo.model.protein.esm1nv.infer.ESM1nvInference + max_seq_length: ${model.seq_length} + emb_batch_size: 128 + batch_size: 128 + num_epochs: 10 + shuffle: True + num_workers: 2 + task_name: secondary_structure + dataset_path: /data/FLIP/${model.dwnstr_task_validation.dataset.task_name} + dataset: + train: x000 + test: x000 + sequence_column: "sequence" # name of column with protein sequence in csv file + target_column: [ "3state", "resolved" ] # names of label columns in csv file + target_sizes: [ 3, 2 ] # number of classes in each label + mask_column: [ "resolved", null ] # names of mask columns in csv file, masks must be 0 or 1 + random_seed: ${model.seed} + optim: + name: adam + lr: 0.0001 + betas: + - 0.9 + - 0.999 + eps: 1e-8 + weight_decay: 0.01 + sched: + name: WarmupAnnealing + min_lr: 0.00001 + last_epoch: -1 + warmup_ratio: 0.01 + max_steps: 1000 diff --git a/examples/advanced/bionemo/downstream/sabdab/jobs/central_sabdab_esm1nv/app/custom/downstream_flip.py b/examples/advanced/bionemo/downstream/sabdab/jobs/central_sabdab_esm1nv/app/custom/downstream_flip.py new file mode 100644 index 0000000000..067cf09552 --- /dev/null +++ b/examples/advanced/bionemo/downstream/sabdab/jobs/central_sabdab_esm1nv/app/custom/downstream_flip.py @@ -0,0 +1,113 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
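+
+# NOTE: stripped of the BioNeMo-specific setup, the script below follows the
+# standard NVFlare Lightning Client API pattern. A rough sketch (the trainer and
+# model objects stand in for the ones built from the Hydra config further down):
+#
+#   import nvflare.client.lightning as flare
+#   from nvflare.client.api import init
+#
+#   init()                        # register this process with the FL system
+#   flare.patch(trainer)          # let NVFlare exchange weights through fit/validate
+#   while flare.is_running():     # one iteration per task received from the server
+#       trainer.validate(model)   # report metrics for the received global model
+#       trainer.fit(model)        # train locally; updated weights are sent back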
+ +from bionemo.data import FLIPPreprocess +from bionemo.data.metrics import accuracy, mse, per_token_accuracy +from bionemo.model.protein.downstream import FineTuneProteinModel +from bionemo.model.utils import setup_trainer +from nemo.collections.nlp.parts.nlp_overrides import NLPSaveRestoreConnector +from nemo.core.config import hydra_runner +from nemo.utils import logging +from omegaconf.omegaconf import OmegaConf, open_dict + +# Import nvflare lightning API for federated learning +import nvflare.client.lightning as flare +from nvflare.client.api import init + +micro_batch_size = 32 + + +@hydra_runner(config_path=".", config_name="downstream_flip_sabdab") # ESM1 +def main(cfg) -> None: + logging.info("\n\n************* Finetune config ****************") + logging.info(f"\n{OmegaConf.to_yaml(cfg)}") + init() + # Get FL system info and set site-specific parameters + fl_sys_info = flare.system_info() + site_name = fl_sys_info["site_name"] + print(f"Running client {site_name} with train data: {cfg.model.data.dataset.train}") + print(f"Validation check interval: {cfg.trainer.val_check_interval}") + + # Do preprocessing if specified in config + if cfg.do_preprocessing: + logging.info("************** Starting Preprocessing ***********") + preprocessor = FLIPPreprocess() + preprocessor.prepare_all_datasets(output_dir=cfg.model.data.preprocessed_data_path) + + if not cfg.do_training and not cfg.do_testing: + return + + trainer = setup_trainer(cfg, builder=None, reset_accumulate_grad_batches=False) + + # Load model + with open_dict(cfg): + cfg.model.encoder_cfg = cfg + + if cfg.restore_from_path: + logging.info("\nRestoring model from .nemo file " + cfg.restore_from_path) + model = FineTuneProteinModel.restore_from( + cfg.restore_from_path, cfg.model, trainer=trainer, save_restore_connector=NLPSaveRestoreConnector() + ) + else: + model = FineTuneProteinModel(cfg.model, trainer) + + metrics = {} + metrics_args = {} + for idx, name in enumerate(cfg.model.data.target_column): + if cfg.model.data.task_type == "token-level-classification": + metrics[name + "_accuracy"] = per_token_accuracy + metrics_args[name + "_accuracy"] = {"label_id": idx} + elif cfg.model.data.task_type == "classification": + metrics[name + "_accuracy"] = accuracy + metrics_args[name + "_accuracy"] = {} + elif cfg.model.data.task_type == "regression": + metrics[name + "_MSE"] = mse + metrics_args[name + "_MSE"] = {} + + model.add_metrics(metrics=metrics, metrics_args=metrics_args) + + # Patch trainer for NVFlare federated learning + flare.patch(trainer) + + # Federated learning loop + while flare.is_running(): + fl_sys_info = flare.system_info() + print("--- fl_sys_info ---") + print(fl_sys_info) + + # Validate current global model + print("--- validate global model ---") + trainer.validate(model) + + # Perform local training with received global model + print("--- train new model ---") + trainer.fit(model) + logging.info("************** Finished Training ***********") + + if cfg.do_testing: + logging.info("************** Starting Testing ***********") + if "test" in cfg.model.data.dataset: + trainer.limit_train_batches = 0 + trainer.limit_val_batches = 0 + trainer.fit(model) + trainer.test(model, ckpt_path=None) + else: + raise UserWarning( + "Skipping testing, test dataset file was not provided. 
Please specify 'dataset.test' in yaml config" + ) + logging.info("************** Finished Testing ***********") + + +if __name__ == "__main__": + main() diff --git a/examples/advanced/bionemo/downstream/sabdab/jobs/central_sabdab_esm1nv/app/custom/downstream_flip_sabdab.yaml b/examples/advanced/bionemo/downstream/sabdab/jobs/central_sabdab_esm1nv/app/custom/downstream_flip_sabdab.yaml new file mode 100644 index 0000000000..fd9507cf80 --- /dev/null +++ b/examples/advanced/bionemo/downstream/sabdab/jobs/central_sabdab_esm1nv/app/custom/downstream_flip_sabdab.yaml @@ -0,0 +1,74 @@ +name: esm1nv_flip +defaults: + - pretrain_small + - _self_ +do_preprocessing: False +do_training: True # set to false if data preprocessing steps must be completed +do_testing: False # set to true to run evaluation on test data after training +restore_from_path: null # path to nemo checkpoint of the fine-tuned model (encoder + task head) to be used for further training, testing or inference +target: bionemo.model.protein.esm1nv.ESM1nvModel # target class for protein model +infer_target: bionemo.model.protein.esm1nv.infer.ESM1nvInference # target inference class for protein model +encoder_frozen: False + +trainer: + devices: 1 # number of GPUs or CPUs + num_nodes: 1 + max_epochs: 20 + val_check_interval: 0.0 + limit_val_batches: 0.0 # number of batches in validation step, use fraction for fraction of data, 0 to disable + limit_test_batches: 0.0 # number of batches in test step, use fraction for fraction of data, 0 to disable + use_distributed_sampler: False + +exp_manager: + wandb_logger_kwargs: + project: ${name}_${model.data.task_name}_finetuning + name: ${name}_${model.data.task_name}_finetuning_encoder_frozen_${model.encoder_frozen} + +model: + restore_encoder_path: ${oc.env:BIONEMO_HOME}/models/protein/esm1nv/esm1nv.nemo + encoder_frozen: False # encoder trainable or frozen + post_process: False # must be False for downstream task + micro_batch_size: 32 # NOTE: adjust to occupy ~ 90% of GPU memory + global_batch_size: null # if null will be computed automatically + tensor_model_parallel_size: 1 # model parallelism + loss_func: CrossEntropyLoss + hidden_layer_size: 256 + dropout_rate: 0.25 + + optim_param_groups: + encoder_model: + lr: 1e-5 + task_head: + lr: 5e-4 + + data: + task_name: tap # options: aav, bind, conservation, gb1, meltome, sav, scl, secondary_structure + task_type: classification #'token-level-classification' # alternative: classification, regression + preprocessed_data_path: /tmp/data # path where all preprocessed FLIP datasets are saved + dataset_path: ${model.data.preprocessed_data_path}/sabdab_chen # path to a training data + dataset: + train: sabdab_chen_full_train + val: sabdab_chen_valid + test: sabdab_chen_test + sequence_column: "Antibody" # name of column with protein sequence in csv file + target_column: ["Y"] #["3state"np.sum(test_df['Y']==0), "resolved"] # names of label columns in csv file + target_sizes: [2] # number of classes in each label for classifications or 1 for regression + num_classes: 2 + num_workers: 2 + shuffle: True # shuffle training dataset + max_seq_length: ${model.seq_length} + emb_batch_size: ${model.micro_batch_size} + + finetuning_optim: # optimizer parameters for downstream task model + name: adam + #lr: 0.0005 + betas: + - 0.9 + - 0.999 + eps: 1e-8 + weight_decay: 0.001 + #sched: + # name: WarmupAnnealing + # min_lr: 0.00001 #0.00001 + # last_epoch: -1 + # warmup_steps: 10 diff --git 
a/examples/advanced/bionemo/downstream/sabdab/jobs/central_sabdab_esm1nv/app/custom/infer.yaml b/examples/advanced/bionemo/downstream/sabdab/jobs/central_sabdab_esm1nv/app/custom/infer.yaml new file mode 100644 index 0000000000..3368831d85 --- /dev/null +++ b/examples/advanced/bionemo/downstream/sabdab/jobs/central_sabdab_esm1nv/app/custom/infer.yaml @@ -0,0 +1,25 @@ +defaults: + - base_infer_config + # allow this config to override defaults + - _self_ + +hydra: + searchpath: + - /workspace/bionemo/examples/conf/ + +name: ESM1nv_Inference +desc: Minimum configuration for initializing a ESM1nv model for inference. + +model: + post_process: False + tokenizer: + vocab_path: /tokenizers/protein/esm1nv/vocab/protein_sequence_sentencepiece.vocab + model_path: /tokenizers/protein/esm1nv/vocab/protein_sequence_sentencepiece.model + downstream_task: + restore_from_path: "/model/protein/esm1nv/esm1nv.nemo" + outputs: [embeddings, hiddens] # Which outputs to extract per sample (a value or list). Possible values: hiddens, embeddings. + data: + dataset_path: /data/FLIP/secondary_structure/test/x000 # full path to dataset (can include range or a list) + +target: bionemo.model.protein.esm1nv.esm1nv_model.ESM1nvModel # path to model class to load +infer_target: bionemo.model.protein.esm1nv.infer.ESM1nvInference # path to inferende class to load \ No newline at end of file diff --git a/examples/advanced/bionemo/downstream/sabdab/jobs/central_sabdab_esm1nv/app/custom/pretrain_small.yaml b/examples/advanced/bionemo/downstream/sabdab/jobs/central_sabdab_esm1nv/app/custom/pretrain_small.yaml new file mode 100644 index 0000000000..4599d92bcc --- /dev/null +++ b/examples/advanced/bionemo/downstream/sabdab/jobs/central_sabdab_esm1nv/app/custom/pretrain_small.yaml @@ -0,0 +1,33 @@ +defaults: + - base_config +restore_from_path: null # used when starting from a .nemo file + +model: + tokenizer: + library: 'sentencepiece' + type: null + model: ${oc.env:BIONEMO_HOME}/tokenizers/protein/esm1nv/vocab/protein_sequence_sentencepiece.model + vocab_file: ${oc.env:BIONEMO_HOME}/tokenizers/vocab/protein_sequence_sentencepiece.vocab + data: + dataset_path: ${oc.env:BIONEMO_HOME}/data/uniref2022_05 # parent directory for data, contains train / val / test folders. Needs to be writeable for index creation. + dataset: # inclusive range of data files to load x[000..049] or can a single file, e.g. x000 + train: x[000..049] + test: x[000..049] + val: x[000..049] + micro_batch_size: ${model.micro_batch_size} + num_workers: 2 + + # Supported kwargs (with default values): + # text_mmap (newline_int=10, header_lines=0, workers=None, sort_dataset_paths=True) + # csv_mmap (newline_int=10, header_lines=0,workers=None, sort_dataset_paths=True, data_col=1, data_sep=",") + data_impl_kwargs: + csv_mmap: + data_col: 3 # 0-based + + # These control the MLM token probabilities. The following settings are commonly used in literature. + modify_percent: 0.15 # Fraction of characters in a protein sequence to modify. (Modification means replacing with another amino acid or with a mask token) + perturb_percent: 0.1 # Of the modify_percent, what fraction of characters are to be replaced with another amino acid. + mask_percent: 0.8 # Of the modify_percent, what fraction of characters are to be replaced with a mask token. + identity_percent: 0.1 # Of the modify_percent, what fraction of characters are to be unchanged as the original amino acid. 
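+    # Worked example with the values above: a residue is selected for modification
+    # with probability modify_percent = 0.15; of the selected residues, 80% become
+    # the mask token (0.15 * 0.8 = 0.12 of all residues), 10% are replaced by a
+    # random amino acid (0.015), and 10% are kept as the original amino acid (0.015).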
+  dwnstr_task_validation:
+    enabled: True
\ No newline at end of file
diff --git a/examples/advanced/bionemo/downstream/sabdab/jobs/central_sabdab_esm1nv/meta.conf b/examples/advanced/bionemo/downstream/sabdab/jobs/central_sabdab_esm1nv/meta.conf
new file mode 100644
index 0000000000..a21a6ec425
--- /dev/null
+++ b/examples/advanced/bionemo/downstream/sabdab/jobs/central_sabdab_esm1nv/meta.conf
@@ -0,0 +1,10 @@
+{
+  name = "bionemo_local_finetune_esm1nv"
+  resource_spec = {}
+  deploy_map {
+    # change deploy map as needed.
+    app: ["@ALL"]
+  }
+  min_clients = 1
+  mandatory_clients = []
+}
diff --git a/examples/advanced/bionemo/downstream/sabdab/jobs/fedavg_sabdab_esm1nv/app/config/config_fed_client.conf b/examples/advanced/bionemo/downstream/sabdab/jobs/fedavg_sabdab_esm1nv/app/config/config_fed_client.conf
new file mode 100644
index 0000000000..95a31d24c0
--- /dev/null
+++ b/examples/advanced/bionemo/downstream/sabdab/jobs/fedavg_sabdab_esm1nv/app/config/config_fed_client.conf
@@ -0,0 +1,94 @@
+{
+  # version of the configuration
+  format_version = 2
+
+  # This is the application script which will be invoked. Clients can replace this script with their own training script.
+  app_script = "downstream_flip.py"
+
+  # Additional arguments needed by the training code. For example, in lightning, these can be --trainer.batch_size=xxx.
+  app_config = ""
+
+  # Additional arguments needed by DDP.
+  #ddp_config = "--nnodes=1 --nproc_per_node=1 --master_port=7777"
+
+  # Client Computing Executors.
+  executors = [
+    {
+      # tasks the executors are defined to handle
+      tasks = ["train"]
+
+      # This particular executor
+      executor {
+
+        # This is an executor for PyTorch + Client API. The underlying data exchange uses a Pipe.
+        path = "nvflare.app_opt.pt.client_api_launcher_executor.PTClientAPILauncherExecutor"
+
+        args {
+          # launcher_id is used to locate the Launcher object in "components"
+          launcher_id = "launcher"
+
+          # pipe_id is used to locate the Pipe object in "components"
+          pipe_id = "pipe"
+
+          # Timeout in seconds for waiting for a heartbeat from the training script. Defaults to 30 seconds.
+          # Please refer to the class docstring for all available arguments
+          heartbeat_timeout = 60
+
+          # format of the exchange parameters
+          params_exchange_format = "pytorch"
+
+          # if the transfer_type is FULL, the parameters are sent directly;
+          # if the transfer_type is DIFF, the difference vs. the received
+          # parameters is calculated and sent instead
+          params_transfer_type = "FULL"
+
+          # if train_with_evaluation is true, the executor expects the custom code
+          # to send back both the trained parameters and the evaluation metric;
+          # otherwise only trained parameters are expected
+          train_with_evaluation = false
+        }
+      }
+    }
+  ],
+
+  # this defines an array of task data filters. If provided, it will control the data from server controller to client executor
+  task_data_filters = []
+
+  # this defines an array of task result filters.
If provided, it will control the result from client executor to server controller + task_result_filters = [] + + components = [ + { + # component id is "launcher" + id = "launcher" + + # the class path of this component + path = "nvflare.app_common.launchers.subprocess_launcher.SubprocessLauncher" + + args { + # the launcher will invoke the script + #script = "python3 -m torch.distributed.run {ddp_config} custom/{app_script} {app_config} " + script = "python3 custom/{app_script} {app_config} " + # if launch_once is true, the SubprocessLauncher will launch once for the whole job + # if launch_once is false, the SubprocessLauncher will launch a process for each task it receives from server + launch_once = true + } + } + { + id = "pipe" + + path = "nvflare.fuel.utils.pipe.file_pipe.FilePipe" + + args { + # Mode of the endpoint. A pipe has two endpoints. + # An endpoint can be either the one that initiates communication or the one listening. + # PASSIVE is the one listening. + mode = "PASSIVE" + + # root_path: is the directory location of the parameters exchange. + # You can also set it to an absolute path in your system. + root_path = "{WORKSPACE}/{JOB_ID}/{SITE_NAME}" + } + } + ] +} diff --git a/examples/advanced/bionemo/downstream/sabdab/jobs/fedavg_sabdab_esm1nv/app/config/config_fed_server.conf b/examples/advanced/bionemo/downstream/sabdab/jobs/fedavg_sabdab_esm1nv/app/config/config_fed_server.conf new file mode 100644 index 0000000000..f13692a589 --- /dev/null +++ b/examples/advanced/bionemo/downstream/sabdab/jobs/fedavg_sabdab_esm1nv/app/config/config_fed_server.conf @@ -0,0 +1,110 @@ +{ + # version of the configuration + format_version = 2 + + # task data filter: if filters are provided, the filter will filter the data flow out of server to client. + task_data_filters =[] + + # task result filter: if filters are provided, the filter will filter the result flow out of client to server. + task_result_filters = [] + + # This assumes that there will be a "net.py" file with class name "Net". + # If your model code is not in "net.py" and class name is not "Net", please modify here + #model_class_path = "nemo_nvflare.peft_model.PEFTmodel" + + # Location of pre-trained NeMo model file. + #restore_from_path = "/models/megatron_gpt_345m.nemo" + + # Location of pre-trained peft model file. + #peft_restore_from_path = null + + # workflows: Array of workflows the control the Federated Learning workflow lifecycle. + # One can specify multiple workflows. The NVFLARE will run them in the order specified. + workflows = [ + { + # 1st workflow" + id = "scatter_and_gather" + + # name = ScatterAndGather, path is the class path of the ScatterAndGather controller. + path = "nvflare.app_common.workflows.scatter_and_gather.ScatterAndGather" + args { + # argument of the ScatterAndGather class. + # min number of clients required for ScatterAndGather controller to move to the next round + # during the workflow cycle. The controller will wait until the min_clients returned from clients + # before move to the next step. + min_clients = 6 + + # number of global round of the training. + num_rounds = 20 + + # starting round is 0-based + start_round = 0 + + # after received min number of clients' result, + # how much time should we wait further before move to the next step + wait_time_after_min_received = 0 + + # For ScatterAndGather, the server will aggregate the weights based on the client's result. + # the aggregator component id is named here. 
One can use the this ID to find the corresponding + # aggregator component listed below + # + aggregator_id = "aggregator" + + # The Scatter and Gather controller use an persistor to load the model and save the model. + # The persistent component can be identified by component ID specified here. + #persistor_id = "persistor" + + # Shareable to a communication message, i.e. shared between clients and server. + # Shareable generator is a component that responsible to take the model convert to/from this communication message: sharable. + # The component can be identified via "shareable_generator_id" + shareable_generator_id = "shareable_generator" + + # train task name: Client will start training once received such task. + train_task_name = "train" + + # train timeout in second. If zero, meaning no timeout. + train_timeout = 0 + } + } + ] + + # List of components used in the server side workflow. + components = [ + #{ + # This is the persistence component used in above workflow. + # PTFileModelPersistor is a Pytorch persistor which save/read the model to/from file. + + # id = "persistor" + # path = "nvflare.app_opt.pt.file_model_persistor.PTFileModelPersistor" + + # the persistor class take model class as argument + # This imply that the model is initialized from the server-side. + # The initialized model will be broadcast to all the clients to start the training. + # args.model.path = "{model_class_path}" + # args.model.args.restore_from_path = "{restore_from_path}" + # args.model.args.peft_restore_from_path = "{peft_restore_from_path}" + #}, + { + # This is the generator that convert the model to shareable communication message structure used in workflow + id = "shareable_generator" + path = "nvflare.app_common.shareablegenerators.full_model_shareable_generator.FullModelShareableGenerator" + args = {} + }, + { + # This is the aggregator that perform the weighted average aggregation. + # the aggregation is "in-time", so it doesn't wait for client results, but aggregates as soon as it received the data. + id = "aggregator" + path = "nvflare.app_common.aggregators.intime_accumulate_model_aggregator.InTimeAccumulateWeightedAggregator" + args.expected_data_kind = "WEIGHTS" + }, + { + # This component is not directly used in Workflow. + # it select the best model based on the incoming global validation metrics. 
+ id = "model_selector" + path = "nvflare.app_common.widgets.intime_model_selector.IntimeModelSelector" + # need to make sure this "key_metric" match what server side received + args.key_metric = "validation_exact_string_match" + } + ] + +} diff --git a/examples/advanced/bionemo/downstream/sabdab/jobs/fedavg_sabdab_esm1nv/app/custom/base_config.yaml b/examples/advanced/bionemo/downstream/sabdab/jobs/fedavg_sabdab_esm1nv/app/custom/base_config.yaml new file mode 100644 index 0000000000..5c1d4686ad --- /dev/null +++ b/examples/advanced/bionemo/downstream/sabdab/jobs/fedavg_sabdab_esm1nv/app/custom/base_config.yaml @@ -0,0 +1,181 @@ +name: esm1nv +do_training: True # set to false if data preprocessing steps must be completed +do_testing: False # set to true to run evaluation on test data after training, requires test_dataset section +restore_from_path: null # used when starting from a .nemo file + +trainer: + devices: 1 # number of GPUs or CPUs + num_nodes: 1 + accelerator: gpu #gpu or cpu + precision: 16-mixed #16 or 32 + logger: False # logger is provided by NeMo exp_manager + enable_checkpointing: False # checkpointing is done by NeMo exp_manager + max_epochs: null # # use max_steps instead with NeMo Megatron model + max_steps: 1000000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches + log_every_n_steps: 1 # number of iterations between logging + val_check_interval: 1500 + limit_val_batches: 50 # number of batches in validation step, use fraction for fraction of data, 0 to disable + limit_test_batches: 500 # number of batches in test step, use fraction for fraction of data, 0 to disable + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 + benchmark: False + +exp_manager: + name: ${name} + exp_dir: ${.name}/${.wandb_logger_kwargs.name} + explicit_log_dir: ${.exp_dir} + create_wandb_logger: False + create_tensorboard_logger: True + wandb_logger_kwargs: + project: ${name}_pretraining + name: ${name}_pretraining + group: ${name} + job_type: Localhost_nodes_${trainer.num_nodes}_gpus_${trainer.devices} + notes: "date: ${now:%y%m%d-%H%M%S}" + tags: + - ${name} + offline: False # set to True if there are issues uploading to WandB during training + resume_if_exists: True # automatically resume if checkpoint exists + resume_ignore_no_checkpoint: True # leave as True, will start new training if resume_if_exists is True but no checkpoint exists + create_checkpoint_callback: False # Setting this to False so to avoid overwriting the model sent and received to the server + checkpoint_callback_params: + monitor: val_TARGET_accuracy + save_top_k: 1 # number of checkpoints to save + mode: max # use min or max of monitored metric to select best checkpoints + always_save_nemo: False # saves nemo file during validation, not implemented for model parallel + filename: 'esm1nv--{val_TARGET_accuracy:.4f}-{step}-{consumed_samples}' + model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}} + + +model: + # model parallelism + micro_batch_size: 8 # NOTE: adjust to occupy ~ 90% of GPU memory + tensor_model_parallel_size: 1 # model parallelism + pipeline_model_parallel_size: 1 + + # model architecture + seq_length: 512 # FIXME: remove me (replaced by encoder_seq_length) + max_position_embeddings: ${.seq_length} + encoder_seq_length: ${.seq_length} + num_layers: 6 + hidden_size: 768 + ffn_hidden_size: 3072 # Transformer FFN hidden size. Usually 4 * hidden_size. 
+ num_attention_heads: 12 + init_method_std: 0.02 # Standard deviation of the zero mean normal distribution used for weight initialization.') + hidden_dropout: 0.1 # 0.1 # Dropout probability for hidden state transformer. + kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null + apply_query_key_layer_scaling: True # scale Q * K^T by 1 / layer-number. + layernorm_epsilon: 1e-5 + make_vocab_size_divisible_by: 128 # Pad the vocab size to be divisible by this value for computation efficiency. + pre_process: True # add embedding + post_process: True # add pooler + bert_binary_head: False # BERT binary head + resume_from_checkpoint: null # manually set the checkpoint file to load from + masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with it's mask. + + tokenizer: + library: 'megatron' + type: 'BertWordPieceLowerCase' + model: null + vocab_file: null + merge_file: null + + # precision + native_amp_init_scale: 4294967296 # 2 ** 32 + native_amp_growth_interval: 1000 + fp32_residual_connection: False # Move residual connections to fp32 + fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16 + + + # miscellaneous + seed: 4 + use_cpu_initialization: False # Init weights on the CPU (slow for large model) + onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter. + + # not implemented in NeMo yet + activations_checkpoint_method: null # 'uniform', 'block' + activations_checkpoint_num_layers: 1 + + data: + ngc_registry_target: uniref50_2022_05 + ngc_registry_version: v23.06 + data_prefix: "" # must be null or "" + num_workers: 2 + dataloader_type: single # cyclic + reset_position_ids: False # Reset position ids after end-of-document token + reset_attention_mask: False # Reset attention mask after end-of-document token + eod_mask_loss: False # Mask loss for the end of document tokens + masked_lm_prob: 0.15 # Probability of replacing a token with mask. + short_seq_prob: 0.1 # Probability of producing a short sequence. + skip_lines: 0 + drop_last: False + pin_memory: False + index_mapping_dir: null # path to store cached indexing files (if empty, will be stored in the same directory as dataset_path) + data_impl: "csv_mmap" + # Supported kwargs (with default values): + # text_mmap (newline_int=10, header_lines=0, workers=None, sort_dataset_paths=True) + # csv_mmap (newline_int=10, header_lines=0,workers=None, sort_dataset_paths=True, data_col=1, data_sep=",") + data_impl_kwargs: + csv_mmap: + header_lines: 1 + newline_int: 10 # byte-value of newline + workers: ${model.data.num_workers} # number of workers when creating missing index files (null defaults to cpu_num // 2) + sort_dataset_paths: True # if True datasets will be sorted by name + data_sep: ',' # string to split text into columns + # column number of csv to take values from + data_col: 3 + use_upsampling: True # if the data should be upsampled to max number of steps in the training + seed: ${model.seed} # Random seed + max_seq_length: ${model.seq_length} # Maximum input sequence length. Longer sequences are truncated + dynamic_padding: False # If True, each batch is padded to the maximum sequence length within that batch. + # Set it to False when model.pipeline_model_parallel_size > 1, as pipeline parallelism requires fixed-length padding. 
+ + optim: + name: fused_adam # fused optimizers used by Megatron model + lr: 2e-4 + weight_decay: 0.01 + betas: + - 0.9 + - 0.98 + sched: + name: CosineAnnealing + warmup_steps: 500 # use to set warmup_steps explicitly or leave as null to calculate + constant_steps: 50000 + min_lr: 2e-5 + + dwnstr_task_validation: + enabled: False + dataset: + class: bionemo.model.core.dwnstr_task_callbacks.PerTokenPredictionCallback + task_type: token-level-classification + infer_target: bionemo.model.protein.esm1nv.infer.ESM1nvInference + max_seq_length: ${model.seq_length} + emb_batch_size: 128 + batch_size: 128 + num_epochs: 10 + shuffle: True + num_workers: 2 + task_name: secondary_structure + dataset_path: /data/FLIP/${model.dwnstr_task_validation.dataset.task_name} + dataset: + train: x000 + test: x000 + sequence_column: "sequence" # name of column with protein sequence in csv file + target_column: [ "3state", "resolved" ] # names of label columns in csv file + target_sizes: [ 3, 2 ] # number of classes in each label + mask_column: [ "resolved", null ] # names of mask columns in csv file, masks must be 0 or 1 + random_seed: ${model.seed} + optim: + name: adam + lr: 0.0001 + betas: + - 0.9 + - 0.999 + eps: 1e-8 + weight_decay: 0.01 + sched: + name: WarmupAnnealing + min_lr: 0.00001 + last_epoch: -1 + warmup_ratio: 0.01 + max_steps: 1000 diff --git a/examples/advanced/bionemo/downstream/sabdab/jobs/fedavg_sabdab_esm1nv/app/custom/downstream_flip.py b/examples/advanced/bionemo/downstream/sabdab/jobs/fedavg_sabdab_esm1nv/app/custom/downstream_flip.py new file mode 100644 index 0000000000..9b254a2990 --- /dev/null +++ b/examples/advanced/bionemo/downstream/sabdab/jobs/fedavg_sabdab_esm1nv/app/custom/downstream_flip.py @@ -0,0 +1,123 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
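+
+# NOTE: unlike the central job, each client in this FedAvg job trains on its own
+# data split. The site name reported by flare.system_info() is used below to pick
+# the local training set (e.g. "sabdab_chen_site-1_train") and a matching Lightning
+# val_check_interval. The per-site numbers in val_check_intervals are presumably the
+# local training-set sizes in samples; dividing by micro_batch_size gives the number
+# of batches per epoch, and min(..., 3) caps the interval at 3 steps.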
+ +from bionemo.data import FLIPPreprocess +from bionemo.data.metrics import accuracy, mse, per_token_accuracy +from bionemo.model.protein.downstream import FineTuneProteinModel +from bionemo.model.utils import setup_trainer +from nemo.collections.nlp.parts.nlp_overrides import NLPSaveRestoreConnector +from nemo.core.config import hydra_runner +from nemo.utils import logging +from omegaconf.omegaconf import OmegaConf, open_dict + +# Import nvflare lightning API for federated learning +import nvflare.client.lightning as flare +from nvflare.client.api import init + +micro_batch_size = 32 +val_check_intervals = { + "site-1": min(int(416 / micro_batch_size), 3), # Use min to ensure it's <= 3 + "site-2": min(int(238 / micro_batch_size), 3), + "site-3": min(int(282 / micro_batch_size), 3), + "site-4": min(int(472 / micro_batch_size), 3), + "site-5": min(int(361 / micro_batch_size), 3), + "site-6": min(int(157 / micro_batch_size), 3), +} + + +@hydra_runner(config_path=".", config_name="downstream_flip_sabdab") # ESM1 +def main(cfg) -> None: + logging.info("\n\n************* Finetune config ****************") + logging.info(f"\n{OmegaConf.to_yaml(cfg)}") + init() + # Get FL system info and set site-specific parameters + fl_sys_info = flare.system_info() + site_name = fl_sys_info["site_name"] + cfg.model.data.dataset.train = f"sabdab_chen_{site_name}_train" + cfg.trainer.val_check_interval = val_check_intervals[site_name] + print(f"Running client {site_name} with train data: {cfg.model.data.dataset.train}") + print(f"Validation check interval: {cfg.trainer.val_check_interval}") + + # Do preprocessing if specified in config + if cfg.do_preprocessing: + logging.info("************** Starting Preprocessing ***********") + preprocessor = FLIPPreprocess() + preprocessor.prepare_all_datasets(output_dir=cfg.model.data.preprocessed_data_path) + + if not cfg.do_training and not cfg.do_testing: + return + + trainer = setup_trainer(cfg, builder=None, reset_accumulate_grad_batches=False) + + # Load model + with open_dict(cfg): + cfg.model.encoder_cfg = cfg + + if cfg.restore_from_path: + logging.info("\nRestoring model from .nemo file " + cfg.restore_from_path) + model = FineTuneProteinModel.restore_from( + cfg.restore_from_path, cfg.model, trainer=trainer, save_restore_connector=NLPSaveRestoreConnector() + ) + else: + model = FineTuneProteinModel(cfg.model, trainer) + + metrics = {} + metrics_args = {} + for idx, name in enumerate(cfg.model.data.target_column): + if cfg.model.data.task_type == "token-level-classification": + metrics[name + "_accuracy"] = per_token_accuracy + metrics_args[name + "_accuracy"] = {"label_id": idx} + elif cfg.model.data.task_type == "classification": + metrics[name + "_accuracy"] = accuracy + metrics_args[name + "_accuracy"] = {} + elif cfg.model.data.task_type == "regression": + metrics[name + "_MSE"] = mse + metrics_args[name + "_MSE"] = {} + + model.add_metrics(metrics=metrics, metrics_args=metrics_args) + + # Patch trainer for NVFlare federated learning + flare.patch(trainer) + + # Federated learning loop + while flare.is_running(): + fl_sys_info = flare.system_info() + print("--- fl_sys_info ---") + print(fl_sys_info) + + # Validate current global model + print("--- validate global model ---") + trainer.validate(model) + + # Perform local training with received global model + print("--- train new model ---") + trainer.fit(model) + logging.info("************** Finished Training ***********") + + if cfg.do_testing: + logging.info("************** Starting Testing ***********") + 
if "test" in cfg.model.data.dataset: + trainer.limit_train_batches = 0 + trainer.limit_val_batches = 0 + trainer.fit(model) + trainer.test(model, ckpt_path=None) + else: + raise UserWarning( + "Skipping testing, test dataset file was not provided. Please specify 'dataset.test' in yaml config" + ) + logging.info("************** Finished Testing ***********") + + +if __name__ == "__main__": + main() diff --git a/examples/advanced/bionemo/downstream/sabdab/jobs/fedavg_sabdab_esm1nv/app/custom/downstream_flip_sabdab.yaml b/examples/advanced/bionemo/downstream/sabdab/jobs/fedavg_sabdab_esm1nv/app/custom/downstream_flip_sabdab.yaml new file mode 100644 index 0000000000..6488ad5205 --- /dev/null +++ b/examples/advanced/bionemo/downstream/sabdab/jobs/fedavg_sabdab_esm1nv/app/custom/downstream_flip_sabdab.yaml @@ -0,0 +1,74 @@ +name: esm1nv_flip +defaults: + - pretrain_small + - _self_ +do_preprocessing: False +do_training: True # set to false if data preprocessing steps must be completed +do_testing: False # set to true to run evaluation on test data after training +restore_from_path: null # path to nemo checkpoint of the fine-tuned model (encoder + task head) to be used for further training, testing or inference +target: bionemo.model.protein.esm1nv.ESM1nvModel # target class for protein model +infer_target: bionemo.model.protein.esm1nv.infer.ESM1nvInference # target inference class for protein model +encoder_frozen: False + +trainer: + devices: 1 # number of GPUs or CPUs + num_nodes: 1 + max_epochs: 1 + val_check_interval: 0.0 + limit_val_batches: 0.0 # number of batches in validation step, use fraction for fraction of data, 0 to disable + limit_test_batches: 0.0 # number of batches in test step, use fraction for fraction of data, 0 to disable + use_distributed_sampler: False + +exp_manager: + wandb_logger_kwargs: + project: ${name}_${model.data.task_name}_finetuning + name: ${name}_${model.data.task_name}_finetuning_encoder_frozen_${model.encoder_frozen} + +model: + restore_encoder_path: ${oc.env:BIONEMO_HOME}/models/protein/esm1nv/esm1nv.nemo + encoder_frozen: False # encoder trainable or frozen + post_process: False # must be False for downstream task + micro_batch_size: 32 # NOTE: adjust to occupy ~ 90% of GPU memory + global_batch_size: null # if null will be computed automatically + tensor_model_parallel_size: 1 # model parallelism + loss_func: CrossEntropyLoss + hidden_layer_size: 256 + dropout_rate: 0.25 + + optim_param_groups: + encoder_model: + lr: 1e-5 + task_head: + lr: 5e-4 + + data: + task_name: tap # options: aav, bind, conservation, gb1, meltome, sav, scl, secondary_structure + task_type: classification #'token-level-classification' # alternative: classification, regression + preprocessed_data_path: /tmp/data # path where all preprocessed FLIP datasets are saved + dataset_path: ${model.data.preprocessed_data_path}/sabdab_chen # path to a training data + dataset: + train: ??? # train data will be set in `custom/downstream_flip.py`, e.g. 
to "sabdab_chen_site-1_train" + val: sabdab_chen_valid + test: sabdab_chen_test + sequence_column: "Antibody" # name of column with protein sequence in csv file + target_column: ["Y"] #["3state"np.sum(test_df['Y']==0), "resolved"] # names of label columns in csv file + target_sizes: [2] # number of classes in each label for classifications or 1 for regression + num_classes: 2 + num_workers: 2 + shuffle: True # shuffle training dataset + max_seq_length: ${model.seq_length} + emb_batch_size: ${model.micro_batch_size} + + finetuning_optim: # optimizer parameters for downstream task model + name: adam + #lr: 0.0005 + betas: + - 0.9 + - 0.999 + eps: 1e-8 + weight_decay: 0.001 + #sched: + # name: WarmupAnnealing + # min_lr: 0.00001 #0.00001 + # last_epoch: -1 + # warmup_steps: 10 diff --git a/examples/advanced/bionemo/downstream/sabdab/jobs/fedavg_sabdab_esm1nv/app/custom/infer.yaml b/examples/advanced/bionemo/downstream/sabdab/jobs/fedavg_sabdab_esm1nv/app/custom/infer.yaml new file mode 100644 index 0000000000..3368831d85 --- /dev/null +++ b/examples/advanced/bionemo/downstream/sabdab/jobs/fedavg_sabdab_esm1nv/app/custom/infer.yaml @@ -0,0 +1,25 @@ +defaults: + - base_infer_config + # allow this config to override defaults + - _self_ + +hydra: + searchpath: + - /workspace/bionemo/examples/conf/ + +name: ESM1nv_Inference +desc: Minimum configuration for initializing a ESM1nv model for inference. + +model: + post_process: False + tokenizer: + vocab_path: /tokenizers/protein/esm1nv/vocab/protein_sequence_sentencepiece.vocab + model_path: /tokenizers/protein/esm1nv/vocab/protein_sequence_sentencepiece.model + downstream_task: + restore_from_path: "/model/protein/esm1nv/esm1nv.nemo" + outputs: [embeddings, hiddens] # Which outputs to extract per sample (a value or list). Possible values: hiddens, embeddings. + data: + dataset_path: /data/FLIP/secondary_structure/test/x000 # full path to dataset (can include range or a list) + +target: bionemo.model.protein.esm1nv.esm1nv_model.ESM1nvModel # path to model class to load +infer_target: bionemo.model.protein.esm1nv.infer.ESM1nvInference # path to inferende class to load \ No newline at end of file diff --git a/examples/advanced/bionemo/downstream/sabdab/jobs/fedavg_sabdab_esm1nv/app/custom/pretrain_small.yaml b/examples/advanced/bionemo/downstream/sabdab/jobs/fedavg_sabdab_esm1nv/app/custom/pretrain_small.yaml new file mode 100644 index 0000000000..4599d92bcc --- /dev/null +++ b/examples/advanced/bionemo/downstream/sabdab/jobs/fedavg_sabdab_esm1nv/app/custom/pretrain_small.yaml @@ -0,0 +1,33 @@ +defaults: + - base_config +restore_from_path: null # used when starting from a .nemo file + +model: + tokenizer: + library: 'sentencepiece' + type: null + model: ${oc.env:BIONEMO_HOME}/tokenizers/protein/esm1nv/vocab/protein_sequence_sentencepiece.model + vocab_file: ${oc.env:BIONEMO_HOME}/tokenizers/vocab/protein_sequence_sentencepiece.vocab + data: + dataset_path: ${oc.env:BIONEMO_HOME}/data/uniref2022_05 # parent directory for data, contains train / val / test folders. Needs to be writeable for index creation. + dataset: # inclusive range of data files to load x[000..049] or can a single file, e.g. 
x000 + train: x[000..049] + test: x[000..049] + val: x[000..049] + micro_batch_size: ${model.micro_batch_size} + num_workers: 2 + + # Supported kwargs (with default values): + # text_mmap (newline_int=10, header_lines=0, workers=None, sort_dataset_paths=True) + # csv_mmap (newline_int=10, header_lines=0,workers=None, sort_dataset_paths=True, data_col=1, data_sep=",") + data_impl_kwargs: + csv_mmap: + data_col: 3 # 0-based + + # These control the MLM token probabilities. The following settings are commonly used in literature. + modify_percent: 0.15 # Fraction of characters in a protein sequence to modify. (Modification means replacing with another amino acid or with a mask token) + perturb_percent: 0.1 # Of the modify_percent, what fraction of characters are to be replaced with another amino acid. + mask_percent: 0.8 # Of the modify_percent, what fraction of characters are to be replaced with a mask token. + identity_percent: 0.1 # Of the modify_percent, what fraction of characters are to be unchanged as the original amino acid. + dwnst_task_validation: + enabled: True \ No newline at end of file diff --git a/examples/advanced/bionemo/downstream/sabdab/jobs/fedavg_sabdab_esm1nv/meta.conf b/examples/advanced/bionemo/downstream/sabdab/jobs/fedavg_sabdab_esm1nv/meta.conf new file mode 100644 index 0000000000..a21a6ec425 --- /dev/null +++ b/examples/advanced/bionemo/downstream/sabdab/jobs/fedavg_sabdab_esm1nv/meta.conf @@ -0,0 +1,10 @@ +{ + name = "bionemo_local_finetune_esm1nv" + resource_spec = {} + deploy_map { + # change deploy map as needed. + app: ["@ALL"] + } + min_clients = 1 + mandatory_clients = [] +} diff --git a/examples/advanced/bionemo/downstream/sabdab/jobs/local_sabdab_esm1nv/app/config/config_fed_client.conf b/examples/advanced/bionemo/downstream/sabdab/jobs/local_sabdab_esm1nv/app/config/config_fed_client.conf new file mode 100644 index 0000000000..95a31d24c0 --- /dev/null +++ b/examples/advanced/bionemo/downstream/sabdab/jobs/local_sabdab_esm1nv/app/config/config_fed_client.conf @@ -0,0 +1,94 @@ +{ + # version of the configuration + format_version = 2 + + # This is the application script which will be invoked. Client can replace this script with user's own training script. + app_script = "downstream_flip.py" + + # Additional arguments needed by the training code. For example, in lightning, these can be --trainer.batch_size=xxx. + app_config = "" + + # Additional arguments needed by DDP. + #ddp_config = "--nnodes=1 --nproc_per_node=1 --master_port=7777" + + # Client Computing Executors. + executors = [ + { + # tasks the executors are defined to handle + tasks = ["train"] + + # This particular executor + executor { + + # This is an executor for pytorch + Client API. The underline data exchange is using Pipe. + path = "nvflare.app_opt.pt.client_api_launcher_executor.PTClientAPILauncherExecutor" + + args { + # launcher_id is used to locate the Launcher object in "components" + launcher_id = "launcher" + + # pipe_id is used to locate the Pipe object in "components" + pipe_id = "pipe" + + # Timeout in seconds for waiting for a heartbeat from the training script. Defaults to 30 seconds. 
+ # Please refer to the class docstring for all available arguments + heartbeat_timeout = 60 + + # format of the exchange parameters + params_exchange_format = "pytorch" + + # if the transfer_type is FULL, then it will be sent directly + # if the transfer_type is DIFF, then we will calculate the + # difference VS received parameters and send the difference + params_transfer_type = "FULL" + + # if train_with_evaluation is true, the executor will expect + # the custom code need to send back both the trained parameters and the evaluation metric + # otherwise only trained parameters are expected + train_with_evaluation = false + } + } + } + ], + + # this defined an array of task data filters. If provided, it will control the data from server controller to client executor + task_data_filters = [] + + # this defined an array of task result filters. If provided, it will control the result from client executor to server controller + task_result_filters = [] + + components = [ + { + # component id is "launcher" + id = "launcher" + + # the class path of this component + path = "nvflare.app_common.launchers.subprocess_launcher.SubprocessLauncher" + + args { + # the launcher will invoke the script + #script = "python3 -m torch.distributed.run {ddp_config} custom/{app_script} {app_config} " + script = "python3 custom/{app_script} {app_config} " + # if launch_once is true, the SubprocessLauncher will launch once for the whole job + # if launch_once is false, the SubprocessLauncher will launch a process for each task it receives from server + launch_once = true + } + } + { + id = "pipe" + + path = "nvflare.fuel.utils.pipe.file_pipe.FilePipe" + + args { + # Mode of the endpoint. A pipe has two endpoints. + # An endpoint can be either the one that initiates communication or the one listening. + # PASSIVE is the one listening. + mode = "PASSIVE" + + # root_path: is the directory location of the parameters exchange. + # You can also set it to an absolute path in your system. + root_path = "{WORKSPACE}/{JOB_ID}/{SITE_NAME}" + } + } + ] +} diff --git a/examples/advanced/bionemo/downstream/sabdab/jobs/local_sabdab_esm1nv/app/config/config_fed_server.conf b/examples/advanced/bionemo/downstream/sabdab/jobs/local_sabdab_esm1nv/app/config/config_fed_server.conf new file mode 100644 index 0000000000..95e9235d89 --- /dev/null +++ b/examples/advanced/bionemo/downstream/sabdab/jobs/local_sabdab_esm1nv/app/config/config_fed_server.conf @@ -0,0 +1,110 @@ +{ + # version of the configuration + format_version = 2 + + # task data filter: if filters are provided, the filter will filter the data flow out of server to client. + task_data_filters =[] + + # task result filter: if filters are provided, the filter will filter the result flow out of client to server. + task_result_filters = [] + + # This assumes that there will be a "net.py" file with class name "Net". + # If your model code is not in "net.py" and class name is not "Net", please modify here + #model_class_path = "nemo_nvflare.peft_model.PEFTmodel" + + # Location of pre-trained NeMo model file. + #restore_from_path = "/models/megatron_gpt_345m.nemo" + + # Location of pre-trained peft model file. + #peft_restore_from_path = null + + # workflows: Array of workflows the control the Federated Learning workflow lifecycle. + # One can specify multiple workflows. The NVFLARE will run them in the order specified. + workflows = [ + { + # 1st workflow" + id = "scatter_and_gather" + + # name = ScatterAndGather, path is the class path of the ScatterAndGather controller. 
+ path = "nvflare.app_common.workflows.scatter_and_gather.ScatterAndGather" + args { + # argument of the ScatterAndGather class. + # min number of clients required for ScatterAndGather controller to move to the next round + # during the workflow cycle. The controller will wait until the min_clients returned from clients + # before move to the next step. + min_clients = 6 + + # number of global round of the training. + num_rounds = 1 + + # starting round is 0-based + start_round = 0 + + # after received min number of clients' result, + # how much time should we wait further before move to the next step + wait_time_after_min_received = 0 + + # For ScatterAndGather, the server will aggregate the weights based on the client's result. + # the aggregator component id is named here. One can use the this ID to find the corresponding + # aggregator component listed below + # + aggregator_id = "aggregator" + + # The Scatter and Gather controller use an persistor to load the model and save the model. + # The persistent component can be identified by component ID specified here. + #persistor_id = "persistor" + + # Shareable to a communication message, i.e. shared between clients and server. + # Shareable generator is a component that responsible to take the model convert to/from this communication message: sharable. + # The component can be identified via "shareable_generator_id" + shareable_generator_id = "shareable_generator" + + # train task name: Client will start training once received such task. + train_task_name = "train" + + # train timeout in second. If zero, meaning no timeout. + train_timeout = 0 + } + } + ] + + # List of components used in the server side workflow. + components = [ + #{ + # This is the persistence component used in above workflow. + # PTFileModelPersistor is a Pytorch persistor which save/read the model to/from file. + + # id = "persistor" + # path = "nvflare.app_opt.pt.file_model_persistor.PTFileModelPersistor" + + # the persistor class take model class as argument + # This imply that the model is initialized from the server-side. + # The initialized model will be broadcast to all the clients to start the training. + # args.model.path = "{model_class_path}" + # args.model.args.restore_from_path = "{restore_from_path}" + # args.model.args.peft_restore_from_path = "{peft_restore_from_path}" + #}, + { + # This is the generator that convert the model to shareable communication message structure used in workflow + id = "shareable_generator" + path = "nvflare.app_common.shareablegenerators.full_model_shareable_generator.FullModelShareableGenerator" + args = {} + }, + { + # This is the aggregator that perform the weighted average aggregation. + # the aggregation is "in-time", so it doesn't wait for client results, but aggregates as soon as it received the data. + id = "aggregator" + path = "nvflare.app_common.aggregators.intime_accumulate_model_aggregator.InTimeAccumulateWeightedAggregator" + args.expected_data_kind = "WEIGHTS" + }, + { + # This component is not directly used in Workflow. + # it select the best model based on the incoming global validation metrics. 
+ id = "model_selector" + path = "nvflare.app_common.widgets.intime_model_selector.IntimeModelSelector" + # need to make sure this "key_metric" match what server side received + args.key_metric = "validation_exact_string_match" + } + ] + +} diff --git a/examples/advanced/bionemo/downstream/sabdab/jobs/local_sabdab_esm1nv/app/custom/base_config.yaml b/examples/advanced/bionemo/downstream/sabdab/jobs/local_sabdab_esm1nv/app/custom/base_config.yaml new file mode 100644 index 0000000000..5c1d4686ad --- /dev/null +++ b/examples/advanced/bionemo/downstream/sabdab/jobs/local_sabdab_esm1nv/app/custom/base_config.yaml @@ -0,0 +1,181 @@ +name: esm1nv +do_training: True # set to false if data preprocessing steps must be completed +do_testing: False # set to true to run evaluation on test data after training, requires test_dataset section +restore_from_path: null # used when starting from a .nemo file + +trainer: + devices: 1 # number of GPUs or CPUs + num_nodes: 1 + accelerator: gpu #gpu or cpu + precision: 16-mixed #16 or 32 + logger: False # logger is provided by NeMo exp_manager + enable_checkpointing: False # checkpointing is done by NeMo exp_manager + max_epochs: null # # use max_steps instead with NeMo Megatron model + max_steps: 1000000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches + log_every_n_steps: 1 # number of iterations between logging + val_check_interval: 1500 + limit_val_batches: 50 # number of batches in validation step, use fraction for fraction of data, 0 to disable + limit_test_batches: 500 # number of batches in test step, use fraction for fraction of data, 0 to disable + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 + benchmark: False + +exp_manager: + name: ${name} + exp_dir: ${.name}/${.wandb_logger_kwargs.name} + explicit_log_dir: ${.exp_dir} + create_wandb_logger: False + create_tensorboard_logger: True + wandb_logger_kwargs: + project: ${name}_pretraining + name: ${name}_pretraining + group: ${name} + job_type: Localhost_nodes_${trainer.num_nodes}_gpus_${trainer.devices} + notes: "date: ${now:%y%m%d-%H%M%S}" + tags: + - ${name} + offline: False # set to True if there are issues uploading to WandB during training + resume_if_exists: True # automatically resume if checkpoint exists + resume_ignore_no_checkpoint: True # leave as True, will start new training if resume_if_exists is True but no checkpoint exists + create_checkpoint_callback: False # Setting this to False so to avoid overwriting the model sent and received to the server + checkpoint_callback_params: + monitor: val_TARGET_accuracy + save_top_k: 1 # number of checkpoints to save + mode: max # use min or max of monitored metric to select best checkpoints + always_save_nemo: False # saves nemo file during validation, not implemented for model parallel + filename: 'esm1nv--{val_TARGET_accuracy:.4f}-{step}-{consumed_samples}' + model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}} + + +model: + # model parallelism + micro_batch_size: 8 # NOTE: adjust to occupy ~ 90% of GPU memory + tensor_model_parallel_size: 1 # model parallelism + pipeline_model_parallel_size: 1 + + # model architecture + seq_length: 512 # FIXME: remove me (replaced by encoder_seq_length) + max_position_embeddings: ${.seq_length} + encoder_seq_length: ${.seq_length} + num_layers: 6 + hidden_size: 768 + ffn_hidden_size: 3072 # Transformer FFN hidden size. Usually 4 * hidden_size. 
+ num_attention_heads: 12 + init_method_std: 0.02 # Standard deviation of the zero mean normal distribution used for weight initialization.') + hidden_dropout: 0.1 # 0.1 # Dropout probability for hidden state transformer. + kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null + apply_query_key_layer_scaling: True # scale Q * K^T by 1 / layer-number. + layernorm_epsilon: 1e-5 + make_vocab_size_divisible_by: 128 # Pad the vocab size to be divisible by this value for computation efficiency. + pre_process: True # add embedding + post_process: True # add pooler + bert_binary_head: False # BERT binary head + resume_from_checkpoint: null # manually set the checkpoint file to load from + masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with it's mask. + + tokenizer: + library: 'megatron' + type: 'BertWordPieceLowerCase' + model: null + vocab_file: null + merge_file: null + + # precision + native_amp_init_scale: 4294967296 # 2 ** 32 + native_amp_growth_interval: 1000 + fp32_residual_connection: False # Move residual connections to fp32 + fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16 + + + # miscellaneous + seed: 4 + use_cpu_initialization: False # Init weights on the CPU (slow for large model) + onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter. + + # not implemented in NeMo yet + activations_checkpoint_method: null # 'uniform', 'block' + activations_checkpoint_num_layers: 1 + + data: + ngc_registry_target: uniref50_2022_05 + ngc_registry_version: v23.06 + data_prefix: "" # must be null or "" + num_workers: 2 + dataloader_type: single # cyclic + reset_position_ids: False # Reset position ids after end-of-document token + reset_attention_mask: False # Reset attention mask after end-of-document token + eod_mask_loss: False # Mask loss for the end of document tokens + masked_lm_prob: 0.15 # Probability of replacing a token with mask. + short_seq_prob: 0.1 # Probability of producing a short sequence. + skip_lines: 0 + drop_last: False + pin_memory: False + index_mapping_dir: null # path to store cached indexing files (if empty, will be stored in the same directory as dataset_path) + data_impl: "csv_mmap" + # Supported kwargs (with default values): + # text_mmap (newline_int=10, header_lines=0, workers=None, sort_dataset_paths=True) + # csv_mmap (newline_int=10, header_lines=0,workers=None, sort_dataset_paths=True, data_col=1, data_sep=",") + data_impl_kwargs: + csv_mmap: + header_lines: 1 + newline_int: 10 # byte-value of newline + workers: ${model.data.num_workers} # number of workers when creating missing index files (null defaults to cpu_num // 2) + sort_dataset_paths: True # if True datasets will be sorted by name + data_sep: ',' # string to split text into columns + # column number of csv to take values from + data_col: 3 + use_upsampling: True # if the data should be upsampled to max number of steps in the training + seed: ${model.seed} # Random seed + max_seq_length: ${model.seq_length} # Maximum input sequence length. Longer sequences are truncated + dynamic_padding: False # If True, each batch is padded to the maximum sequence length within that batch. + # Set it to False when model.pipeline_model_parallel_size > 1, as pipeline parallelism requires fixed-length padding. 
+ + optim: + name: fused_adam # fused optimizers used by Megatron model + lr: 2e-4 + weight_decay: 0.01 + betas: + - 0.9 + - 0.98 + sched: + name: CosineAnnealing + warmup_steps: 500 # use to set warmup_steps explicitly or leave as null to calculate + constant_steps: 50000 + min_lr: 2e-5 + + dwnstr_task_validation: + enabled: False + dataset: + class: bionemo.model.core.dwnstr_task_callbacks.PerTokenPredictionCallback + task_type: token-level-classification + infer_target: bionemo.model.protein.esm1nv.infer.ESM1nvInference + max_seq_length: ${model.seq_length} + emb_batch_size: 128 + batch_size: 128 + num_epochs: 10 + shuffle: True + num_workers: 2 + task_name: secondary_structure + dataset_path: /data/FLIP/${model.dwnstr_task_validation.dataset.task_name} + dataset: + train: x000 + test: x000 + sequence_column: "sequence" # name of column with protein sequence in csv file + target_column: [ "3state", "resolved" ] # names of label columns in csv file + target_sizes: [ 3, 2 ] # number of classes in each label + mask_column: [ "resolved", null ] # names of mask columns in csv file, masks must be 0 or 1 + random_seed: ${model.seed} + optim: + name: adam + lr: 0.0001 + betas: + - 0.9 + - 0.999 + eps: 1e-8 + weight_decay: 0.01 + sched: + name: WarmupAnnealing + min_lr: 0.00001 + last_epoch: -1 + warmup_ratio: 0.01 + max_steps: 1000 diff --git a/examples/advanced/bionemo/downstream/sabdab/jobs/local_sabdab_esm1nv/app/custom/downstream_flip.py b/examples/advanced/bionemo/downstream/sabdab/jobs/local_sabdab_esm1nv/app/custom/downstream_flip.py new file mode 100644 index 0000000000..9b254a2990 --- /dev/null +++ b/examples/advanced/bionemo/downstream/sabdab/jobs/local_sabdab_esm1nv/app/custom/downstream_flip.py @@ -0,0 +1,123 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
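+
+# This script fine-tunes a BioNeMo ESM-1nv model on the SAbDab antibody developability
+# (classification) task under NVFlare. The NVFlare Lightning Client API (flare.patch)
+# wraps the PyTorch Lightning trainer so that, in each federated round, the received
+# global model is first validated and then trained locally, with the updated weights
+# sent back to the server.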
+ +from bionemo.data import FLIPPreprocess +from bionemo.data.metrics import accuracy, mse, per_token_accuracy +from bionemo.model.protein.downstream import FineTuneProteinModel +from bionemo.model.utils import setup_trainer +from nemo.collections.nlp.parts.nlp_overrides import NLPSaveRestoreConnector +from nemo.core.config import hydra_runner +from nemo.utils import logging +from omegaconf.omegaconf import OmegaConf, open_dict + +# Import nvflare lightning API for federated learning +import nvflare.client.lightning as flare +from nvflare.client.api import init + +micro_batch_size = 32 +val_check_intervals = { + "site-1": min(int(416 / micro_batch_size), 3), # Use min to ensure it's <= 3 + "site-2": min(int(238 / micro_batch_size), 3), + "site-3": min(int(282 / micro_batch_size), 3), + "site-4": min(int(472 / micro_batch_size), 3), + "site-5": min(int(361 / micro_batch_size), 3), + "site-6": min(int(157 / micro_batch_size), 3), +} + + +@hydra_runner(config_path=".", config_name="downstream_flip_sabdab") # ESM1 +def main(cfg) -> None: + logging.info("\n\n************* Finetune config ****************") + logging.info(f"\n{OmegaConf.to_yaml(cfg)}") + init() + # Get FL system info and set site-specific parameters + fl_sys_info = flare.system_info() + site_name = fl_sys_info["site_name"] + cfg.model.data.dataset.train = f"sabdab_chen_{site_name}_train" + cfg.trainer.val_check_interval = val_check_intervals[site_name] + print(f"Running client {site_name} with train data: {cfg.model.data.dataset.train}") + print(f"Validation check interval: {cfg.trainer.val_check_interval}") + + # Do preprocessing if specified in config + if cfg.do_preprocessing: + logging.info("************** Starting Preprocessing ***********") + preprocessor = FLIPPreprocess() + preprocessor.prepare_all_datasets(output_dir=cfg.model.data.preprocessed_data_path) + + if not cfg.do_training and not cfg.do_testing: + return + + trainer = setup_trainer(cfg, builder=None, reset_accumulate_grad_batches=False) + + # Load model + with open_dict(cfg): + cfg.model.encoder_cfg = cfg + + if cfg.restore_from_path: + logging.info("\nRestoring model from .nemo file " + cfg.restore_from_path) + model = FineTuneProteinModel.restore_from( + cfg.restore_from_path, cfg.model, trainer=trainer, save_restore_connector=NLPSaveRestoreConnector() + ) + else: + model = FineTuneProteinModel(cfg.model, trainer) + + metrics = {} + metrics_args = {} + for idx, name in enumerate(cfg.model.data.target_column): + if cfg.model.data.task_type == "token-level-classification": + metrics[name + "_accuracy"] = per_token_accuracy + metrics_args[name + "_accuracy"] = {"label_id": idx} + elif cfg.model.data.task_type == "classification": + metrics[name + "_accuracy"] = accuracy + metrics_args[name + "_accuracy"] = {} + elif cfg.model.data.task_type == "regression": + metrics[name + "_MSE"] = mse + metrics_args[name + "_MSE"] = {} + + model.add_metrics(metrics=metrics, metrics_args=metrics_args) + + # Patch trainer for NVFlare federated learning + flare.patch(trainer) + + # Federated learning loop + while flare.is_running(): + fl_sys_info = flare.system_info() + print("--- fl_sys_info ---") + print(fl_sys_info) + + # Validate current global model + print("--- validate global model ---") + trainer.validate(model) + + # Perform local training with received global model + print("--- train new model ---") + trainer.fit(model) + logging.info("************** Finished Training ***********") + + if cfg.do_testing: + logging.info("************** Starting Testing ***********") + 
if "test" in cfg.model.data.dataset: + trainer.limit_train_batches = 0 + trainer.limit_val_batches = 0 + trainer.fit(model) + trainer.test(model, ckpt_path=None) + else: + raise UserWarning( + "Skipping testing, test dataset file was not provided. Please specify 'dataset.test' in yaml config" + ) + logging.info("************** Finished Testing ***********") + + +if __name__ == "__main__": + main() diff --git a/examples/advanced/bionemo/downstream/sabdab/jobs/local_sabdab_esm1nv/app/custom/downstream_flip_sabdab.yaml b/examples/advanced/bionemo/downstream/sabdab/jobs/local_sabdab_esm1nv/app/custom/downstream_flip_sabdab.yaml new file mode 100644 index 0000000000..b21438510e --- /dev/null +++ b/examples/advanced/bionemo/downstream/sabdab/jobs/local_sabdab_esm1nv/app/custom/downstream_flip_sabdab.yaml @@ -0,0 +1,74 @@ +name: esm1nv_flip +defaults: + - pretrain_small + - _self_ +do_preprocessing: False +do_training: True # set to false if data preprocessing steps must be completed +do_testing: False # set to true to run evaluation on test data after training +restore_from_path: null # path to nemo checkpoint of the fine-tuned model (encoder + task head) to be used for further training, testing or inference +target: bionemo.model.protein.esm1nv.ESM1nvModel # target class for protein model +infer_target: bionemo.model.protein.esm1nv.infer.ESM1nvInference # target inference class for protein model +encoder_frozen: False + +trainer: + devices: 1 # number of GPUs or CPUs + num_nodes: 1 + max_epochs: 20 + val_check_interval: 0.0 + limit_val_batches: 0.0 # number of batches in validation step, use fraction for fraction of data, 0 to disable + limit_test_batches: 0.0 # number of batches in test step, use fraction for fraction of data, 0 to disable + use_distributed_sampler: False + +exp_manager: + wandb_logger_kwargs: + project: ${name}_${model.data.task_name}_finetuning + name: ${name}_${model.data.task_name}_finetuning_encoder_frozen_${model.encoder_frozen} + +model: + restore_encoder_path: ${oc.env:BIONEMO_HOME}/models/protein/esm1nv/esm1nv.nemo + encoder_frozen: False # encoder trainable or frozen + post_process: False # must be False for downstream task + micro_batch_size: 32 # NOTE: adjust to occupy ~ 90% of GPU memory + global_batch_size: null # if null will be computed automatically + tensor_model_parallel_size: 1 # model parallelism + loss_func: CrossEntropyLoss + hidden_layer_size: 256 + dropout_rate: 0.25 + + optim_param_groups: + encoder_model: + lr: 1e-5 + task_head: + lr: 5e-4 + + data: + task_name: tap # options: aav, bind, conservation, gb1, meltome, sav, scl, secondary_structure + task_type: classification #'token-level-classification' # alternative: classification, regression + preprocessed_data_path: /tmp/data # path where all preprocessed FLIP datasets are saved + dataset_path: ${model.data.preprocessed_data_path}/sabdab_chen # path to a training data + dataset: + train: ??? # train data will be set in `custom/downstream_flip.py`, e.g. 
to "sabdab_chen_site-1_train" + val: sabdab_chen_valid + test: sabdab_chen_test + sequence_column: "Antibody" # name of column with protein sequence in csv file + target_column: ["Y"] #["3state"np.sum(test_df['Y']==0), "resolved"] # names of label columns in csv file + target_sizes: [2] # number of classes in each label for classifications or 1 for regression + num_classes: 2 + num_workers: 2 + shuffle: True # shuffle training dataset + max_seq_length: ${model.seq_length} + emb_batch_size: ${model.micro_batch_size} + + finetuning_optim: # optimizer parameters for downstream task model + name: adam + #lr: 0.0005 + betas: + - 0.9 + - 0.999 + eps: 1e-8 + weight_decay: 0.001 + #sched: + # name: WarmupAnnealing + # min_lr: 0.00001 #0.00001 + # last_epoch: -1 + # warmup_steps: 10 diff --git a/examples/advanced/bionemo/downstream/sabdab/jobs/local_sabdab_esm1nv/app/custom/infer.yaml b/examples/advanced/bionemo/downstream/sabdab/jobs/local_sabdab_esm1nv/app/custom/infer.yaml new file mode 100644 index 0000000000..3368831d85 --- /dev/null +++ b/examples/advanced/bionemo/downstream/sabdab/jobs/local_sabdab_esm1nv/app/custom/infer.yaml @@ -0,0 +1,25 @@ +defaults: + - base_infer_config + # allow this config to override defaults + - _self_ + +hydra: + searchpath: + - /workspace/bionemo/examples/conf/ + +name: ESM1nv_Inference +desc: Minimum configuration for initializing a ESM1nv model for inference. + +model: + post_process: False + tokenizer: + vocab_path: /tokenizers/protein/esm1nv/vocab/protein_sequence_sentencepiece.vocab + model_path: /tokenizers/protein/esm1nv/vocab/protein_sequence_sentencepiece.model + downstream_task: + restore_from_path: "/model/protein/esm1nv/esm1nv.nemo" + outputs: [embeddings, hiddens] # Which outputs to extract per sample (a value or list). Possible values: hiddens, embeddings. + data: + dataset_path: /data/FLIP/secondary_structure/test/x000 # full path to dataset (can include range or a list) + +target: bionemo.model.protein.esm1nv.esm1nv_model.ESM1nvModel # path to model class to load +infer_target: bionemo.model.protein.esm1nv.infer.ESM1nvInference # path to inferende class to load \ No newline at end of file diff --git a/examples/advanced/bionemo/downstream/sabdab/jobs/local_sabdab_esm1nv/app/custom/pretrain_small.yaml b/examples/advanced/bionemo/downstream/sabdab/jobs/local_sabdab_esm1nv/app/custom/pretrain_small.yaml new file mode 100644 index 0000000000..4599d92bcc --- /dev/null +++ b/examples/advanced/bionemo/downstream/sabdab/jobs/local_sabdab_esm1nv/app/custom/pretrain_small.yaml @@ -0,0 +1,33 @@ +defaults: + - base_config +restore_from_path: null # used when starting from a .nemo file + +model: + tokenizer: + library: 'sentencepiece' + type: null + model: ${oc.env:BIONEMO_HOME}/tokenizers/protein/esm1nv/vocab/protein_sequence_sentencepiece.model + vocab_file: ${oc.env:BIONEMO_HOME}/tokenizers/vocab/protein_sequence_sentencepiece.vocab + data: + dataset_path: ${oc.env:BIONEMO_HOME}/data/uniref2022_05 # parent directory for data, contains train / val / test folders. Needs to be writeable for index creation. + dataset: # inclusive range of data files to load x[000..049] or can a single file, e.g. 
x000 + train: x[000..049] + test: x[000..049] + val: x[000..049] + micro_batch_size: ${model.micro_batch_size} + num_workers: 2 + + # Supported kwargs (with default values): + # text_mmap (newline_int=10, header_lines=0, workers=None, sort_dataset_paths=True) + # csv_mmap (newline_int=10, header_lines=0,workers=None, sort_dataset_paths=True, data_col=1, data_sep=",") + data_impl_kwargs: + csv_mmap: + data_col: 3 # 0-based + + # These control the MLM token probabilities. The following settings are commonly used in literature. + modify_percent: 0.15 # Fraction of characters in a protein sequence to modify. (Modification means replacing with another amino acid or with a mask token) + perturb_percent: 0.1 # Of the modify_percent, what fraction of characters are to be replaced with another amino acid. + mask_percent: 0.8 # Of the modify_percent, what fraction of characters are to be replaced with a mask token. + identity_percent: 0.1 # Of the modify_percent, what fraction of characters are to be unchanged as the original amino acid. + dwnst_task_validation: + enabled: True \ No newline at end of file diff --git a/examples/advanced/bionemo/downstream/sabdab/jobs/local_sabdab_esm1nv/meta.conf b/examples/advanced/bionemo/downstream/sabdab/jobs/local_sabdab_esm1nv/meta.conf new file mode 100644 index 0000000000..a21a6ec425 --- /dev/null +++ b/examples/advanced/bionemo/downstream/sabdab/jobs/local_sabdab_esm1nv/meta.conf @@ -0,0 +1,10 @@ +{ + name = "bionemo_local_finetune_esm1nv" + resource_spec = {} + deploy_map { + # change deploy map as needed. + app: ["@ALL"] + } + min_clients = 1 + mandatory_clients = [] +} diff --git a/examples/advanced/bionemo/downstream/sabdab/prepare_sabdab_data.py b/examples/advanced/bionemo/downstream/sabdab/prepare_sabdab_data.py new file mode 100644 index 0000000000..8b8c73bb03 --- /dev/null +++ b/examples/advanced/bionemo/downstream/sabdab/prepare_sabdab_data.py @@ -0,0 +1,156 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
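+
+# Prepares the SAbDab_Chen antibody developability dataset for federated fine-tuning:
+# downloads the data via TDC, merges the train and valid splits, samples a heterogeneous
+# split across n_clients sites using a Dirichlet distribution (concentration alpha),
+# optionally cleans or breaks the antibody chain strings, and writes per-site train CSVs
+# plus shared val/test CSVs under /tmp/data/sabdab_chen. It also reports the class
+# balance and the average pairwise overlap between client training sets.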
+ +import os + +import numpy as np +import pandas as pd +from tdc.single_pred import Develop + +np.random.seed(1234) + +out_name = "sabdab_chen" +split_dir = f"/tmp/data/{out_name}" +n_clients = 6 +do_break_chains = False +do_clean_chains = True +do_normalize = False +alpha = 1.0 + + +def clean_chains(df): + a = df["Antibody"] + b = [] + for chains in a: + # split chains + chains = chains.replace("['", "").replace("']", "").replace("', '", " ") + assert "'" not in chains + assert "[" not in chains + assert "]" not in chains + assert "\\n" not in chains + assert "'" not in chains + b.append(chains) + df["Antibody"] = b + + return df + + +def break_chains(df): + out_df = {"Antibody": []} + for idx, row in df.iterrows(): + # split chains + chains = row["Antibody"] + chains = chains.replace("['", "").replace("']", "").split("'\\n '") + assert "'" not in chains + assert "[" not in chains + assert "]" not in chains + assert "\\n" not in chains + assert "'" not in chains + + for chain in chains: + out_df["Antibody"].append(chain) + for k in row.keys(): + if k == "Antibody": + continue + if k not in out_df: + out_df[k] = [row[k]] + else: + out_df[k].append(row[k]) + + return pd.DataFrame(out_df) + + +def main(): + seed = 0 + + data = Develop(name="SAbDab_Chen", path="/tmp/data") + split = data.get_split() + + train_df = pd.concat([split["train"], split["valid"]]) + test_df = split["test"] + + # split client train + client_train_dfs = [] + if alpha > 0: + print(f"Sampling with alpha={alpha}") + proportions = np.random.dirichlet(np.repeat(alpha, n_clients)) + else: + print("Uniform sampling") + proportions = n_clients * [1 / n_clients] + + for client_id in range(n_clients): + client_name = f"site-{client_id + 1}" + client_train_df = train_df.sample(frac=proportions[client_id], replace=False, random_state=seed + client_id) + + if do_break_chains: + client_train_df = break_chains(client_train_df) + if do_clean_chains: + client_train_df = clean_chains(client_train_df) + client_train_dfs.append(client_train_df) + + _split_dir = os.path.join(split_dir, "train") + if not os.path.isdir(_split_dir): + os.makedirs(_split_dir) + client_train_df.to_csv(os.path.join(_split_dir, f"{out_name}_{client_name}_train.csv"), index=False) + print(f"Save {len(client_train_df)} training proteins for {client_name} (frac={proportions[client_id]:0.3f})") + + # save full train, test, & valid + if do_break_chains: + train_df = break_chains(train_df) + test_df = break_chains(test_df) + if do_clean_chains: + train_df = clean_chains(train_df) + test_df = clean_chains(test_df) + + _split_dir = os.path.join(split_dir, "train") + if not os.path.isdir(_split_dir): + os.makedirs(_split_dir) + train_df.to_csv(os.path.join(_split_dir, f"{out_name}_full_train.csv"), index=False) + _split_dir = os.path.join(split_dir, "val") + if not os.path.isdir(_split_dir): + os.makedirs(_split_dir) + test_df.to_csv(os.path.join(_split_dir, f"{out_name}_valid.csv"), index=False) + _split_dir = os.path.join(split_dir, "test") + if not os.path.isdir(_split_dir): + os.makedirs(_split_dir) + test_df.to_csv(os.path.join(_split_dir, f"{out_name}_test.csv"), index=False) + + print(f"Saved {len(train_df)} training and {len(test_df)} testing proteins.") + + for _set, _df in zip(["TRAIN", "TEST"], [train_df, test_df]): + n_pos = np.sum(_df["Y"] == 0) + n_neg = np.sum(_df["Y"] == 1) + n = len(_df) + print(f" {_set} Pos/Neg ratio: neg={n_neg}, pos={n_pos}: {n_pos / n_neg:0.3f}") + print(f" {_set} Trivial accuracy: {n_pos / n:0.3f}") + + # measure overlap + d = 
np.nan * np.zeros((n_clients, n_clients)) + for i in range(n_clients): + for j in range(n_clients): + if j <= i: + continue + a = np.asarray(client_train_dfs[i]["Antibody_ID"]) + b = np.asarray(client_train_dfs[j]["Antibody_ID"]) + assert len(np.unique(a)) == len(a) + assert len(np.unique(b)) == len(b) + d[i][j] = len(np.intersect1d(a, b)) / len(b) + + print(d) + overlap = np.mean(d[~np.isnan(d)]) + print(f"Avg. overlap: {100 * overlap:0.2f}%") + + +if __name__ == "__main__": + main() diff --git a/examples/advanced/bionemo/downstream/sabdab/run_sim_sabdab.py b/examples/advanced/bionemo/downstream/sabdab/run_sim_sabdab.py new file mode 100644 index 0000000000..1c48035c36 --- /dev/null +++ b/examples/advanced/bionemo/downstream/sabdab/run_sim_sabdab.py @@ -0,0 +1,27 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from nvflare import SimulatorRunner + +# Choose from one of the available jobs +job_name = "central_sabdab_esm1nv" +n_clients = 1 +# job_name = "local_sabdab_esm1nv"; n_clients = 6 +# job_name = "fedavg_sabdab_esm1nv"; n_clients = 6 + +simulator = SimulatorRunner( + job_folder=f"jobs/{job_name}", workspace=f"/tmp/nvflare/results/{job_name}", n_clients=n_clients, threads=n_clients +) +run_status = simulator.run() +print("Simulator finished with run_status", run_status) diff --git a/examples/advanced/bionemo/downstream/scl/figs/scl_alpha1.0.svg b/examples/advanced/bionemo/downstream/scl/figs/scl_alpha1.0.svg new file mode 100644 index 0000000000..ff1a1cfbb6 --- /dev/null +++ b/examples/advanced/bionemo/downstream/scl/figs/scl_alpha1.0.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/examples/advanced/bionemo/downstream/scl/figs/scl_results.svg b/examples/advanced/bionemo/downstream/scl/figs/scl_results.svg new file mode 100644 index 0000000000..81a26ca50a --- /dev/null +++ b/examples/advanced/bionemo/downstream/scl/figs/scl_results.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/examples/advanced/bionemo/downstream/scl/jobs/fedavg_scl_finetune_esm2nv/app1/config/config_fed_client.conf b/examples/advanced/bionemo/downstream/scl/jobs/fedavg_scl_finetune_esm2nv/app1/config/config_fed_client.conf new file mode 100644 index 0000000000..95a31d24c0 --- /dev/null +++ b/examples/advanced/bionemo/downstream/scl/jobs/fedavg_scl_finetune_esm2nv/app1/config/config_fed_client.conf @@ -0,0 +1,94 @@ +{ + # version of the configuration + format_version = 2 + + # This is the application script which will be invoked. Client can replace this script with user's own training script. + app_script = "downstream_flip.py" + + # Additional arguments needed by the training code. For example, in lightning, these can be --trainer.batch_size=xxx. + app_config = "" + + # Additional arguments needed by DDP. + #ddp_config = "--nnodes=1 --nproc_per_node=1 --master_port=7777" + + # Client Computing Executors. 
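+ # Each executor below pairs the PTClientAPILauncherExecutor with a SubprocessLauncher
+ # (which runs the training script) and a FilePipe used to exchange model parameters and
+ # metrics between the NVFlare client and the script; both are defined under "components".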
+ executors = [
+ {
+ # tasks the executors are defined to handle
+ tasks = ["train"]
+
+ # This particular executor
+ executor {
+
+ # This is an executor for PyTorch + Client API. The underlying data exchange uses a Pipe.
+ path = "nvflare.app_opt.pt.client_api_launcher_executor.PTClientAPILauncherExecutor"
+
+ args {
+ # launcher_id is used to locate the Launcher object in "components"
+ launcher_id = "launcher"
+
+ # pipe_id is used to locate the Pipe object in "components"
+ pipe_id = "pipe"
+
+ # Timeout in seconds for waiting for a heartbeat from the training script. Defaults to 30 seconds.
+ # Please refer to the class docstring for all available arguments
+ heartbeat_timeout = 60
+
+ # format of the exchanged parameters
+ params_exchange_format = "pytorch"
+
+ # if the transfer_type is FULL, the parameters are sent directly
+ # if the transfer_type is DIFF, the difference versus the received
+ # parameters is calculated and sent
+ params_transfer_type = "FULL"
+
+ # if train_with_evaluation is true, the executor expects the custom code
+ # to send back both the trained parameters and the evaluation metric;
+ # otherwise only trained parameters are expected
+ train_with_evaluation = false
+ }
+ }
+ }
+ ],
+
+ # this defines an array of task data filters. If provided, they control the data sent from the server controller to the client executor
+ task_data_filters = []
+
+ # this defines an array of task result filters. If provided, they control the result sent from the client executor to the server controller
+ task_result_filters = []
+
+ components = [
+ {
+ # component id is "launcher"
+ id = "launcher"
+
+ # the class path of this component
+ path = "nvflare.app_common.launchers.subprocess_launcher.SubprocessLauncher"
+
+ args {
+ # the launcher will invoke the script
+ #script = "python3 -m torch.distributed.run {ddp_config} custom/{app_script} {app_config} "
+ script = "python3 custom/{app_script} {app_config} "
+ # if launch_once is true, the SubprocessLauncher launches once for the whole job
+ # if launch_once is false, the SubprocessLauncher launches a new process for each task it receives from the server
+ launch_once = true
+ }
+ }
+ {
+ id = "pipe"
+
+ path = "nvflare.fuel.utils.pipe.file_pipe.FilePipe"
+
+ args {
+ # Mode of the endpoint. A pipe has two endpoints.
+ # An endpoint can be either the one that initiates communication or the one listening.
+ # PASSIVE is the one listening.
+ mode = "PASSIVE"
+
+ # root_path is the directory location of the parameter exchange.
+ # You can also set it to an absolute path in your system.
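+ # {WORKSPACE}, {JOB_ID} and {SITE_NAME} are placeholders that NVFlare substitutes
+ # at runtime with the client workspace path, the running job ID, and the site name.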
+ root_path = "{WORKSPACE}/{JOB_ID}/{SITE_NAME}"
+ }
+ }
+ ]
+}
diff --git a/examples/advanced/bionemo/downstream/scl/jobs/fedavg_scl_finetune_esm2nv/app1/custom/base_config.yaml b/examples/advanced/bionemo/downstream/scl/jobs/fedavg_scl_finetune_esm2nv/app1/custom/base_config.yaml
new file mode 100644
index 0000000000..ae3fd5c67e
--- /dev/null
+++ b/examples/advanced/bionemo/downstream/scl/jobs/fedavg_scl_finetune_esm2nv/app1/custom/base_config.yaml
@@ -0,0 +1,266 @@
+defaults:
+ - _self_
+
+name: esm2nv
+do_preprocessing: False # set to true if data preprocessing is needed
+do_training: True # set to false if data preprocessing steps must be completed
+do_testing: False # set to true to run evaluation on test data after training, requires test_dataset section
+restore_from_path: null # used when starting from a .nemo file
+
+trainer:
+ devices: 1 # number of GPUs or CPUs
+ num_nodes: 1
+ accelerator: gpu #gpu or cpu
+ precision: 32 # 16-mixed, bf16-mixed or 32
+ logger: False # logger is provided by NeMo exp_manager
+ enable_checkpointing: False # checkpointing is done by NeMo exp_manager
+ use_distributed_sampler: False # use NeMo Megatron samplers
+ max_epochs: null # use max_steps instead with NeMo Megatron model
+ log_every_n_steps: 10 # number of iterations between logging
+ val_check_interval: 1500
+ limit_val_batches: 1.0 # Number of batches in the validation step. Use 0 for no batches, 1 for the full dataset; a value > 1 is taken as the number of batches.
+ limit_test_batches: 1.0 # Number of batches in the test set. Use 0 for no batches, or 0