From 0c97f58598e00e107225a14fa2527aa80d26a2d4 Mon Sep 17 00:00:00 2001 From: Holger Roth Date: Fri, 26 Apr 2024 16:03:05 -0400 Subject: [PATCH] swith back --- .../mednist/code/monai_mednist_train.py | 9 +- .../monai/examples/mednist/monai_101_fl.ipynb | 524 +++++++++--------- 2 files changed, 269 insertions(+), 264 deletions(-) diff --git a/integration/monai/examples/mednist/code/monai_mednist_train.py b/integration/monai/examples/mednist/code/monai_mednist_train.py index 45033c2aa2..f9a8cccf13 100644 --- a/integration/monai/examples/mednist/code/monai_mednist_train.py +++ b/integration/monai/examples/mednist/code/monai_mednist_train.py @@ -48,10 +48,12 @@ # (optional) metrics from nvflare.client.tracking import SummaryWriter -summary_writer = SummaryWriter() print_config() +# (2) initializes NVFlare client API +flare.init() + # Setup data directory directory = os.environ.get("MONAI_DATA_DIRECTORY") root_dir = tempfile.mkdtemp() if directory is None else directory @@ -94,12 +96,11 @@ train_handlers=StatsHandler(), ) -# (2) initializes NVFlare client API -flare.init() - # (optional) calculate total steps steps = max_epochs * len(train_loader) # Run the training + +summary_writer = SummaryWriter() while flare.is_running(): # (3) receives FLModel from NVFlare input_model = flare.receive() diff --git a/integration/monai/examples/mednist/monai_101_fl.ipynb b/integration/monai/examples/mednist/monai_101_fl.ipynb index e6089db307..d41f8fbc36 100644 --- a/integration/monai/examples/mednist/monai_101_fl.ipynb +++ b/integration/monai/examples/mednist/monai_101_fl.ipynb @@ -65,62 +65,62 @@ "text": [ "Requirement already satisfied: tensorboard in /home/hroth/.venv_monai/lib/python3.10/site-packages (from -r requirements.txt (line 1)) (2.16.2)\n", "Requirement already satisfied: mlflow in /home/hroth/.venv_monai/lib/python3.10/site-packages (from -r requirements.txt (line 2)) (2.12.1)\n", + "Requirement already satisfied: werkzeug>=1.0.1 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from tensorboard->-r requirements.txt (line 1)) (3.0.1)\n", + "Requirement already satisfied: markdown>=2.6.8 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from tensorboard->-r requirements.txt (line 1)) (3.6)\n", "Requirement already satisfied: absl-py>=0.4 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from tensorboard->-r requirements.txt (line 1)) (2.1.0)\n", "Requirement already satisfied: tensorboard-data-server<0.8.0,>=0.7.0 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from tensorboard->-r requirements.txt (line 1)) (0.7.2)\n", - "Requirement already satisfied: six>1.9 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from tensorboard->-r requirements.txt (line 1)) (1.16.0)\n", "Requirement already satisfied: setuptools>=41.0.0 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from tensorboard->-r requirements.txt (line 1)) (59.6.0)\n", + "Requirement already satisfied: protobuf!=4.24.0,>=3.19.6 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from tensorboard->-r requirements.txt (line 1)) (4.24.4)\n", "Requirement already satisfied: numpy>=1.12.0 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from tensorboard->-r requirements.txt (line 1)) (1.26.4)\n", - "Requirement already satisfied: werkzeug>=1.0.1 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from tensorboard->-r requirements.txt (line 1)) (3.0.1)\n", + "Requirement already satisfied: six>1.9 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from tensorboard->-r requirements.txt (line 1)) (1.16.0)\n", "Requirement already satisfied: grpcio>=1.48.2 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from tensorboard->-r requirements.txt (line 1)) (1.62.1)\n", - "Requirement already satisfied: markdown>=2.6.8 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from tensorboard->-r requirements.txt (line 1)) (3.6)\n", - "Requirement already satisfied: protobuf!=4.24.0,>=3.19.6 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from tensorboard->-r requirements.txt (line 1)) (4.24.4)\n", - "Requirement already satisfied: importlib-metadata!=4.7.0,<8,>=3.7.0 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from mlflow->-r requirements.txt (line 2)) (7.1.0)\n", "Requirement already satisfied: pyyaml<7,>=5.1 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from mlflow->-r requirements.txt (line 2)) (6.0.1)\n", - "Requirement already satisfied: matplotlib<4 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from mlflow->-r requirements.txt (line 2)) (3.8.4)\n", - "Requirement already satisfied: sqlparse<1,>=0.4.0 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from mlflow->-r requirements.txt (line 2)) (0.5.0)\n", + "Requirement already satisfied: gunicorn<22 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from mlflow->-r requirements.txt (line 2)) (21.2.0)\n", + "Requirement already satisfied: cloudpickle<4 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from mlflow->-r requirements.txt (line 2)) (3.0.0)\n", + "Requirement already satisfied: pandas<3 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from mlflow->-r requirements.txt (line 2)) (2.2.2)\n", "Requirement already satisfied: entrypoints<1 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from mlflow->-r requirements.txt (line 2)) (0.4)\n", + "Requirement already satisfied: querystring-parser<2 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from mlflow->-r requirements.txt (line 2)) (1.2.4)\n", + "Requirement already satisfied: docker<8,>=4.0.0 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from mlflow->-r requirements.txt (line 2)) (7.0.0)\n", + "Requirement already satisfied: pytz<2025 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from mlflow->-r requirements.txt (line 2)) (2024.1)\n", + "Requirement already satisfied: click<9,>=7.0 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from mlflow->-r requirements.txt (line 2)) (8.1.7)\n", + "Requirement already satisfied: pyarrow<16,>=4.0.0 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from mlflow->-r requirements.txt (line 2)) (15.0.2)\n", + "Requirement already satisfied: sqlparse<1,>=0.4.0 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from mlflow->-r requirements.txt (line 2)) (0.5.0)\n", "Requirement already satisfied: requests<3,>=2.17.3 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from mlflow->-r requirements.txt (line 2)) (2.31.0)\n", + "Requirement already satisfied: Flask<4 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from mlflow->-r requirements.txt (line 2)) (3.0.2)\n", "Requirement already satisfied: graphene<4 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from mlflow->-r requirements.txt (line 2)) (3.3)\n", - "Requirement already satisfied: scikit-learn<2 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from mlflow->-r requirements.txt (line 2)) (1.4.2)\n", - "Requirement already satisfied: cloudpickle<4 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from mlflow->-r requirements.txt (line 2)) (3.0.0)\n", "Requirement already satisfied: alembic!=1.10.0,<2 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from mlflow->-r requirements.txt (line 2)) (1.13.1)\n", - "Requirement already satisfied: Flask<4 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from mlflow->-r requirements.txt (line 2)) (3.0.2)\n", - "Requirement already satisfied: docker<8,>=4.0.0 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from mlflow->-r requirements.txt (line 2)) (7.0.0)\n", - "Requirement already satisfied: sqlalchemy<3,>=1.4.0 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from mlflow->-r requirements.txt (line 2)) (2.0.16)\n", - "Requirement already satisfied: scipy<2 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from mlflow->-r requirements.txt (line 2)) (1.13.0)\n", + "Requirement already satisfied: scikit-learn<2 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from mlflow->-r requirements.txt (line 2)) (1.4.2)\n", "Requirement already satisfied: packaging<25 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from mlflow->-r requirements.txt (line 2)) (24.0)\n", - "Requirement already satisfied: pytz<2025 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from mlflow->-r requirements.txt (line 2)) (2024.1)\n", - "Requirement already satisfied: pandas<3 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from mlflow->-r requirements.txt (line 2)) (2.2.2)\n", - "Requirement already satisfied: querystring-parser<2 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from mlflow->-r requirements.txt (line 2)) (1.2.4)\n", - "Requirement already satisfied: pyarrow<16,>=4.0.0 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from mlflow->-r requirements.txt (line 2)) (15.0.2)\n", - "Requirement already satisfied: gitpython<4,>=3.1.9 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from mlflow->-r requirements.txt (line 2)) (3.1.43)\n", "Requirement already satisfied: Jinja2<4,>=2.11 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from mlflow->-r requirements.txt (line 2)) (3.1.3)\n", - "Requirement already satisfied: click<9,>=7.0 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from mlflow->-r requirements.txt (line 2)) (8.1.7)\n", - "Requirement already satisfied: gunicorn<22 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from mlflow->-r requirements.txt (line 2)) (21.2.0)\n", - "Requirement already satisfied: typing-extensions>=4 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from alembic!=1.10.0,<2->mlflow->-r requirements.txt (line 2)) (4.11.0)\n", + "Requirement already satisfied: gitpython<4,>=3.1.9 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from mlflow->-r requirements.txt (line 2)) (3.1.43)\n", + "Requirement already satisfied: importlib-metadata!=4.7.0,<8,>=3.7.0 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from mlflow->-r requirements.txt (line 2)) (7.1.0)\n", + "Requirement already satisfied: scipy<2 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from mlflow->-r requirements.txt (line 2)) (1.13.0)\n", + "Requirement already satisfied: matplotlib<4 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from mlflow->-r requirements.txt (line 2)) (3.8.4)\n", + "Requirement already satisfied: sqlalchemy<3,>=1.4.0 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from mlflow->-r requirements.txt (line 2)) (2.0.16)\n", "Requirement already satisfied: Mako in /home/hroth/.venv_monai/lib/python3.10/site-packages (from alembic!=1.10.0,<2->mlflow->-r requirements.txt (line 2)) (1.3.3)\n", + "Requirement already satisfied: typing-extensions>=4 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from alembic!=1.10.0,<2->mlflow->-r requirements.txt (line 2)) (4.11.0)\n", "Requirement already satisfied: urllib3>=1.26.0 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from docker<8,>=4.0.0->mlflow->-r requirements.txt (line 2)) (2.2.1)\n", "Requirement already satisfied: itsdangerous>=2.1.2 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from Flask<4->mlflow->-r requirements.txt (line 2)) (2.2.0)\n", "Requirement already satisfied: blinker>=1.6.2 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from Flask<4->mlflow->-r requirements.txt (line 2)) (1.7.0)\n", "Requirement already satisfied: gitdb<5,>=4.0.1 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from gitpython<4,>=3.1.9->mlflow->-r requirements.txt (line 2)) (4.0.11)\n", - "Requirement already satisfied: aniso8601<10,>=8 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from graphene<4->mlflow->-r requirements.txt (line 2)) (9.0.1)\n", "Requirement already satisfied: graphql-relay<3.3,>=3.1 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from graphene<4->mlflow->-r requirements.txt (line 2)) (3.2.0)\n", + "Requirement already satisfied: aniso8601<10,>=8 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from graphene<4->mlflow->-r requirements.txt (line 2)) (9.0.1)\n", "Requirement already satisfied: graphql-core<3.3,>=3.1 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from graphene<4->mlflow->-r requirements.txt (line 2)) (3.2.3)\n", "Requirement already satisfied: zipp>=0.5 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from importlib-metadata!=4.7.0,<8,>=3.7.0->mlflow->-r requirements.txt (line 2)) (3.18.1)\n", "Requirement already satisfied: MarkupSafe>=2.0 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from Jinja2<4,>=2.11->mlflow->-r requirements.txt (line 2)) (2.1.5)\n", + "Requirement already satisfied: cycler>=0.10 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from matplotlib<4->mlflow->-r requirements.txt (line 2)) (0.12.1)\n", + "Requirement already satisfied: pyparsing>=2.3.1 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from matplotlib<4->mlflow->-r requirements.txt (line 2)) (3.1.2)\n", + "Requirement already satisfied: pillow>=8 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from matplotlib<4->mlflow->-r requirements.txt (line 2)) (10.3.0)\n", "Requirement already satisfied: python-dateutil>=2.7 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from matplotlib<4->mlflow->-r requirements.txt (line 2)) (2.9.0.post0)\n", "Requirement already satisfied: fonttools>=4.22.0 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from matplotlib<4->mlflow->-r requirements.txt (line 2)) (4.51.0)\n", "Requirement already satisfied: kiwisolver>=1.3.1 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from matplotlib<4->mlflow->-r requirements.txt (line 2)) (1.4.5)\n", - "Requirement already satisfied: cycler>=0.10 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from matplotlib<4->mlflow->-r requirements.txt (line 2)) (0.12.1)\n", "Requirement already satisfied: contourpy>=1.0.1 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from matplotlib<4->mlflow->-r requirements.txt (line 2)) (1.2.1)\n", - "Requirement already satisfied: pillow>=8 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from matplotlib<4->mlflow->-r requirements.txt (line 2)) (10.3.0)\n", - "Requirement already satisfied: pyparsing>=2.3.1 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from matplotlib<4->mlflow->-r requirements.txt (line 2)) (3.1.2)\n", "Requirement already satisfied: tzdata>=2022.7 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from pandas<3->mlflow->-r requirements.txt (line 2)) (2024.1)\n", "Requirement already satisfied: certifi>=2017.4.17 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from requests<3,>=2.17.3->mlflow->-r requirements.txt (line 2)) (2024.2.2)\n", "Requirement already satisfied: idna<4,>=2.5 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from requests<3,>=2.17.3->mlflow->-r requirements.txt (line 2)) (3.7)\n", "Requirement already satisfied: charset-normalizer<4,>=2 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from requests<3,>=2.17.3->mlflow->-r requirements.txt (line 2)) (3.3.2)\n", - "Requirement already satisfied: threadpoolctl>=2.0.0 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from scikit-learn<2->mlflow->-r requirements.txt (line 2)) (3.4.0)\n", "Requirement already satisfied: joblib>=1.2.0 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from scikit-learn<2->mlflow->-r requirements.txt (line 2)) (1.4.0)\n", + "Requirement already satisfied: threadpoolctl>=2.0.0 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from scikit-learn<2->mlflow->-r requirements.txt (line 2)) (3.4.0)\n", "Requirement already satisfied: greenlet!=0.4.17 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from sqlalchemy<3,>=1.4.0->mlflow->-r requirements.txt (line 2)) (3.0.3)\n", "Requirement already satisfied: smmap<6,>=3.0.1 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from gitdb<5,>=4.0.1->gitpython<4,>=3.1.9->mlflow->-r requirements.txt (line 2)) (5.0.1)\n", "Obtaining file:///media/hroth/NVIDIA/home_old/hroth/Code2/nvflare/monai_flywheel\n", @@ -128,47 +128,51 @@ "\u001b[?25h Checking if build backend supports build_editable ... \u001b[?25ldone\n", "\u001b[?25h Getting requirements to build editable ... \u001b[?25ldone\n", "\u001b[?25h Preparing editable metadata (pyproject.toml) ... \u001b[?25ldone\n", - "\u001b[?25hRequirement already satisfied: numpy in /home/hroth/.venv_monai/lib/python3.10/site-packages (from nvflare-nightly==2.4.0rc7+110.gdf34f183.dirty) (1.26.4)\n", - "Requirement already satisfied: gunicorn>=20.1.0 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from nvflare-nightly==2.4.0rc7+110.gdf34f183.dirty) (21.2.0)\n", - "Requirement already satisfied: grpcio==1.62.1 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from nvflare-nightly==2.4.0rc7+110.gdf34f183.dirty) (1.62.1)\n", - "Requirement already satisfied: Flask==3.0.2 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from nvflare-nightly==2.4.0rc7+110.gdf34f183.dirty) (3.0.2)\n", - "Requirement already satisfied: requests>=2.28.0 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from nvflare-nightly==2.4.0rc7+110.gdf34f183.dirty) (2.31.0)\n", - "Requirement already satisfied: docker>=6.0 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from nvflare-nightly==2.4.0rc7+110.gdf34f183.dirty) (7.0.0)\n", - "Requirement already satisfied: Flask-SQLAlchemy==3.1.1 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from nvflare-nightly==2.4.0rc7+110.gdf34f183.dirty) (3.1.1)\n", - "Requirement already satisfied: protobuf==4.24.4 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from nvflare-nightly==2.4.0rc7+110.gdf34f183.dirty) (4.24.4)\n", - "Requirement already satisfied: SQLAlchemy==2.0.16 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from nvflare-nightly==2.4.0rc7+110.gdf34f183.dirty) (2.0.16)\n", - "Requirement already satisfied: msgpack>=1.0.3 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from nvflare-nightly==2.4.0rc7+110.gdf34f183.dirty) (1.0.8)\n", - "Requirement already satisfied: psutil>=5.9.1 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from nvflare-nightly==2.4.0rc7+110.gdf34f183.dirty) (5.9.8)\n", - "Requirement already satisfied: PyYAML>=6.0 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from nvflare-nightly==2.4.0rc7+110.gdf34f183.dirty) (6.0.1)\n", - "Requirement already satisfied: pyhocon in /home/hroth/.venv_monai/lib/python3.10/site-packages (from nvflare-nightly==2.4.0rc7+110.gdf34f183.dirty) (0.3.60)\n", - "Requirement already satisfied: Flask-JWT-Extended==4.6.0 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from nvflare-nightly==2.4.0rc7+110.gdf34f183.dirty) (4.6.0)\n", - "Requirement already satisfied: Werkzeug==3.0.1 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from nvflare-nightly==2.4.0rc7+110.gdf34f183.dirty) (3.0.1)\n", - "Requirement already satisfied: cryptography>=36.0.0 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from nvflare-nightly==2.4.0rc7+110.gdf34f183.dirty) (42.0.5)\n", - "Requirement already satisfied: websockets>=10.4 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from nvflare-nightly==2.4.0rc7+110.gdf34f183.dirty) (12.0)\n", - "Requirement already satisfied: six>=1.15.0 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from nvflare-nightly==2.4.0rc7+110.gdf34f183.dirty) (1.16.0)\n", - "Requirement already satisfied: itsdangerous>=2.1.2 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from Flask==3.0.2->nvflare-nightly==2.4.0rc7+110.gdf34f183.dirty) (2.2.0)\n", - "Requirement already satisfied: Jinja2>=3.1.2 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from Flask==3.0.2->nvflare-nightly==2.4.0rc7+110.gdf34f183.dirty) (3.1.3)\n", - "Requirement already satisfied: blinker>=1.6.2 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from Flask==3.0.2->nvflare-nightly==2.4.0rc7+110.gdf34f183.dirty) (1.7.0)\n", - "Requirement already satisfied: click>=8.1.3 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from Flask==3.0.2->nvflare-nightly==2.4.0rc7+110.gdf34f183.dirty) (8.1.7)\n", - "Requirement already satisfied: PyJWT<3.0,>=2.0 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from Flask-JWT-Extended==4.6.0->nvflare-nightly==2.4.0rc7+110.gdf34f183.dirty) (2.8.0)\n", - "Requirement already satisfied: typing-extensions>=4.2.0 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from SQLAlchemy==2.0.16->nvflare-nightly==2.4.0rc7+110.gdf34f183.dirty) (4.11.0)\n", - "Requirement already satisfied: greenlet!=0.4.17 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from SQLAlchemy==2.0.16->nvflare-nightly==2.4.0rc7+110.gdf34f183.dirty) (3.0.3)\n", - "Requirement already satisfied: MarkupSafe>=2.1.1 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from Werkzeug==3.0.1->nvflare-nightly==2.4.0rc7+110.gdf34f183.dirty) (2.1.5)\n", - "Requirement already satisfied: cffi>=1.12 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from cryptography>=36.0.0->nvflare-nightly==2.4.0rc7+110.gdf34f183.dirty) (1.16.0)\n", - "Requirement already satisfied: packaging>=14.0 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from docker>=6.0->nvflare-nightly==2.4.0rc7+110.gdf34f183.dirty) (24.0)\n", - "Requirement already satisfied: urllib3>=1.26.0 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from docker>=6.0->nvflare-nightly==2.4.0rc7+110.gdf34f183.dirty) (2.2.1)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from requests>=2.28.0->nvflare-nightly==2.4.0rc7+110.gdf34f183.dirty) (2024.2.2)\n", - "Requirement already satisfied: charset-normalizer<4,>=2 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from requests>=2.28.0->nvflare-nightly==2.4.0rc7+110.gdf34f183.dirty) (3.3.2)\n", - "Requirement already satisfied: idna<4,>=2.5 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from requests>=2.28.0->nvflare-nightly==2.4.0rc7+110.gdf34f183.dirty) (3.7)\n", - "Requirement already satisfied: pyparsing<4,>=2 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from pyhocon->nvflare-nightly==2.4.0rc7+110.gdf34f183.dirty) (3.1.2)\n", - "Requirement already satisfied: pycparser in /home/hroth/.venv_monai/lib/python3.10/site-packages (from cffi>=1.12->cryptography>=36.0.0->nvflare-nightly==2.4.0rc7+110.gdf34f183.dirty) (2.22)\n", + "\u001b[?25hRequirement already satisfied: pyhocon in /home/hroth/.venv_monai/lib/python3.10/site-packages (from nvflare-nightly==2.4.0rc7+112.gd6f1f587.dirty) (0.3.60)\n", + "Requirement already satisfied: websockets>=10.4 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from nvflare-nightly==2.4.0rc7+112.gd6f1f587.dirty) (12.0)\n", + "Requirement already satisfied: cryptography>=36.0.0 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from nvflare-nightly==2.4.0rc7+112.gd6f1f587.dirty) (42.0.5)\n", + "Requirement already satisfied: PyYAML>=6.0 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from nvflare-nightly==2.4.0rc7+112.gd6f1f587.dirty) (6.0.1)\n", + "Requirement already satisfied: docker>=6.0 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from nvflare-nightly==2.4.0rc7+112.gd6f1f587.dirty) (7.0.0)\n", + "Requirement already satisfied: Werkzeug==3.0.1 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from nvflare-nightly==2.4.0rc7+112.gd6f1f587.dirty) (3.0.1)\n", + "Requirement already satisfied: Flask==3.0.2 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from nvflare-nightly==2.4.0rc7+112.gd6f1f587.dirty) (3.0.2)\n", + "Requirement already satisfied: msgpack>=1.0.3 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from nvflare-nightly==2.4.0rc7+112.gd6f1f587.dirty) (1.0.8)\n", + "Requirement already satisfied: six>=1.15.0 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from nvflare-nightly==2.4.0rc7+112.gd6f1f587.dirty) (1.16.0)\n", + "Requirement already satisfied: SQLAlchemy==2.0.16 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from nvflare-nightly==2.4.0rc7+112.gd6f1f587.dirty) (2.0.16)\n", + "Requirement already satisfied: gunicorn>=20.1.0 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from nvflare-nightly==2.4.0rc7+112.gd6f1f587.dirty) (21.2.0)\n", + "Requirement already satisfied: protobuf==4.24.4 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from nvflare-nightly==2.4.0rc7+112.gd6f1f587.dirty) (4.24.4)\n", + "Requirement already satisfied: psutil>=5.9.1 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from nvflare-nightly==2.4.0rc7+112.gd6f1f587.dirty) (5.9.8)\n", + "Requirement already satisfied: requests>=2.28.0 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from nvflare-nightly==2.4.0rc7+112.gd6f1f587.dirty) (2.31.0)\n", + "Requirement already satisfied: Flask-JWT-Extended==4.6.0 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from nvflare-nightly==2.4.0rc7+112.gd6f1f587.dirty) (4.6.0)\n", + "Requirement already satisfied: grpcio==1.62.1 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from nvflare-nightly==2.4.0rc7+112.gd6f1f587.dirty) (1.62.1)\n", + "Requirement already satisfied: numpy in /home/hroth/.venv_monai/lib/python3.10/site-packages (from nvflare-nightly==2.4.0rc7+112.gd6f1f587.dirty) (1.26.4)\n", + "Requirement already satisfied: Flask-SQLAlchemy==3.1.1 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from nvflare-nightly==2.4.0rc7+112.gd6f1f587.dirty) (3.1.1)\n", + "Requirement already satisfied: click>=8.1.3 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from Flask==3.0.2->nvflare-nightly==2.4.0rc7+112.gd6f1f587.dirty) (8.1.7)\n", + "Requirement already satisfied: itsdangerous>=2.1.2 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from Flask==3.0.2->nvflare-nightly==2.4.0rc7+112.gd6f1f587.dirty) (2.2.0)\n", + "Requirement already satisfied: blinker>=1.6.2 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from Flask==3.0.2->nvflare-nightly==2.4.0rc7+112.gd6f1f587.dirty) (1.7.0)\n", + "Requirement already satisfied: Jinja2>=3.1.2 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from Flask==3.0.2->nvflare-nightly==2.4.0rc7+112.gd6f1f587.dirty) (3.1.3)\n", + "Requirement already satisfied: PyJWT<3.0,>=2.0 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from Flask-JWT-Extended==4.6.0->nvflare-nightly==2.4.0rc7+112.gd6f1f587.dirty) (2.8.0)\n", + "Requirement already satisfied: greenlet!=0.4.17 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from SQLAlchemy==2.0.16->nvflare-nightly==2.4.0rc7+112.gd6f1f587.dirty) (3.0.3)\n", + "Requirement already satisfied: typing-extensions>=4.2.0 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from SQLAlchemy==2.0.16->nvflare-nightly==2.4.0rc7+112.gd6f1f587.dirty) (4.11.0)\n", + "Requirement already satisfied: MarkupSafe>=2.1.1 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from Werkzeug==3.0.1->nvflare-nightly==2.4.0rc7+112.gd6f1f587.dirty) (2.1.5)\n", + "Requirement already satisfied: cffi>=1.12 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from cryptography>=36.0.0->nvflare-nightly==2.4.0rc7+112.gd6f1f587.dirty) (1.16.0)\n", + "Requirement already satisfied: packaging>=14.0 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from docker>=6.0->nvflare-nightly==2.4.0rc7+112.gd6f1f587.dirty) (24.0)\n", + "Requirement already satisfied: urllib3>=1.26.0 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from docker>=6.0->nvflare-nightly==2.4.0rc7+112.gd6f1f587.dirty) (2.2.1)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from requests>=2.28.0->nvflare-nightly==2.4.0rc7+112.gd6f1f587.dirty) (3.3.2)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from requests>=2.28.0->nvflare-nightly==2.4.0rc7+112.gd6f1f587.dirty) (2024.2.2)\n", + "Requirement already satisfied: idna<4,>=2.5 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from requests>=2.28.0->nvflare-nightly==2.4.0rc7+112.gd6f1f587.dirty) (3.7)\n", + "Requirement already satisfied: pyparsing<4,>=2 in /home/hroth/.venv_monai/lib/python3.10/site-packages (from pyhocon->nvflare-nightly==2.4.0rc7+112.gd6f1f587.dirty) (3.1.2)\n", + "Requirement already satisfied: pycparser in /home/hroth/.venv_monai/lib/python3.10/site-packages (from cffi>=1.12->cryptography>=36.0.0->nvflare-nightly==2.4.0rc7+112.gd6f1f587.dirty) (2.22)\n", "Building wheels for collected packages: nvflare-nightly\n", " Building editable for nvflare-nightly (pyproject.toml) ... \u001b[?25ldone\n", - "\u001b[?25h Created wheel for nvflare-nightly: filename=nvflare_nightly-2.4.0rc7+110.gdf34f183.dirty-0.editable-py3-none-any.whl size=10615 sha256=7a7a2bdc4e10e28770281cf7e866f8959b668f07573494f87b8c29b1dfa8edc1\n", - " Stored in directory: /tmp/pip-ephem-wheel-cache-cpyzi11t/wheels/15/f0/59/04a976ff344afbdbbd7bb194f92e23f23577de3f84fd53b9d8\n", + "\u001b[?25h Created wheel for nvflare-nightly: filename=nvflare_nightly-2.4.0rc7+112.gd6f1f587.dirty-0.editable-py3-none-any.whl size=10619 sha256=f6e9c90525bd5ecf6a0fe27c43aa450c7f673a4771c0ee3bacafa3f557543aea\n", + " Stored in directory: /tmp/pip-ephem-wheel-cache-qgc0s0xz/wheels/15/f0/59/04a976ff344afbdbbd7bb194f92e23f23577de3f84fd53b9d8\n", "Successfully built nvflare-nightly\n", "Installing collected packages: nvflare-nightly\n", - "Successfully installed nvflare-nightly-2.4.0rc7+110.gdf34f183.dirty\n" + " Attempting uninstall: nvflare-nightly\n", + " Found existing installation: nvflare-nightly 2.4.0rc7+111.g01453824.dirty\n", + " Uninstalling nvflare-nightly-2.4.0rc7+111.g01453824.dirty:\n", + " Successfully uninstalled nvflare-nightly-2.4.0rc7+111.g01453824.dirty\n", + "Successfully installed nvflare-nightly-2.4.0rc7+112.gd6f1f587.dirty\n" ] } ], @@ -233,7 +237,7 @@ "\n", "---------------------------------------------------------------------------------------------------------------------------------------\n", " \n", - " job folder: ./jobs/client_api \n", + " job folder: ./jobs/fedavg_mednist \n", " \n", "---------------------------------------------------------------------------------------------------------------------------------------\n", " file_name var_name value component \n", @@ -283,7 +287,7 @@ } ], "source": [ - "!nvflare job create -force -j ./jobs/client_api -w fedavg_mednist -sd ./code/. \\\n", + "!nvflare job create -force -j ./jobs/fedavg_mednist -w fedavg_mednist -sd ./code/. \\\n", " -f config_fed_client.conf app_script=monai_mednist_train.py \\\n", " -f config_fed_server.conf model_class_path=monai.networks.nets.densenet121 \\\n", " -f config_fed_server.conf spatial_dims=2 \\\n", @@ -308,75 +312,67 @@ "name": "stdout", "output_type": "stream", "text": [ - "2024-04-26 15:46:07,766 - SimulatorRunner - INFO - Create the Simulator Server.\n", - "2024-04-26 15:46:07,767 - CoreCell - INFO - server: creating listener on tcp://0:35947\n", - "2024-04-26 15:46:07,793 - CoreCell - INFO - server: created backbone external listener for tcp://0:35947\n", - "2024-04-26 15:46:07,793 - ConnectorManager - INFO - 2567262: Try start_listener Listener resources: {'secure': False, 'host': 'localhost'}\n", - "2024-04-26 15:46:07,793 - nvflare.fuel.f3.sfm.conn_manager - INFO - Connector [CH00002 PASSIVE tcp://0:62461] is starting\n", - "2024-04-26 15:46:08,295 - CoreCell - INFO - server: created backbone internal listener for tcp://localhost:62461\n", - "2024-04-26 15:46:08,295 - nvflare.fuel.f3.sfm.conn_manager - INFO - Connector [CH00001 PASSIVE tcp://0:35947] is starting\n", - "2024-04-26 15:46:08,368 - nvflare.fuel.hci.server.hci - INFO - Starting Admin Server localhost on Port 49247\n", - "2024-04-26 15:46:08,368 - SimulatorRunner - INFO - Deploy the Apps.\n", - "2024-04-26 15:46:08,371 - SimulatorRunner - INFO - Create the simulate clients.\n", - "2024-04-26 15:46:08,375 - ClientManager - INFO - Client: New client site-1@192.168.1.203 joined. Sent token: fee2d56d-f850-4590-984c-abb9e88b6ad2. Total clients: 1\n", - "2024-04-26 15:46:08,375 - FederatedClient - INFO - Successfully registered client:site-1 for project simulator_server. Token:fee2d56d-f850-4590-984c-abb9e88b6ad2 SSID:\n", - "2024-04-26 15:46:08,376 - ClientManager - INFO - Client: New client site-2@192.168.1.203 joined. Sent token: 1cd76060-2599-4023-b0ab-cdf27357a0e9. Total clients: 2\n", - "2024-04-26 15:46:08,376 - FederatedClient - INFO - Successfully registered client:site-2 for project simulator_server. Token:1cd76060-2599-4023-b0ab-cdf27357a0e9 SSID:\n", - "2024-04-26 15:46:08,377 - SimulatorRunner - INFO - Set the client status ready.\n", - "2024-04-26 15:46:08,377 - SimulatorRunner - INFO - Deploy and start the Server App.\n", - "2024-04-26 15:46:08,378 - Cell - INFO - Register blob CB for channel='server_command', topic='*'\n", - "2024-04-26 15:46:08,379 - Cell - INFO - Register blob CB for channel='aux_communication', topic='*'\n", - "2024-04-26 15:46:08,379 - ServerCommandAgent - INFO - ServerCommandAgent cell register_request_cb: server.simulate_job\n", - "2024-04-26 15:46:12,056 - IntimeModelSelector - INFO - model selection weights control: None\n", - "2024-04-26 15:46:12,059 - ReliableMessage - INFO - enabled reliable message: max_request_workers=20 query_interval=2.0\n", - "2024-04-26 15:46:12,059 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job]: Server runner starting ...\n", - "2024-04-26 15:46:12,070 - MLflowReceiver - INFO - Experiment with name 'nvflare-fedavg-mednist-experiment' does not exist. Creating a new experiment.\n", - "2024-04-26 15:46:12,074 - MLflowReceiver - INFO - Experiment=\n", - "2024-04-26 15:46:12,080 - MLflowReceiver - INFO - Experiment=\n", - "2024-04-26 15:46:12,083 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job]: starting workflow scatter_and_gather () ...\n", - "2024-04-26 15:46:12,084 - FedAvg - INFO - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather]: Initializing ModelController workflow.\n", - "2024-04-26 15:46:12,084 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather]: Workflow scatter_and_gather () started\n", - "2024-04-26 15:46:12,084 - FedAvg - INFO - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather]: Beginning model controller run.\n", - "2024-04-26 15:46:12,084 - FedAvg - INFO - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather]: Start FedAvg.\n", - "2024-04-26 15:46:12,084 - FedAvg - INFO - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather]: loading initial model from persistor\n", - "2024-04-26 15:46:12,084 - PTFileModelPersistor - INFO - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather]: Both source_ckpt_file_full_name and ckpt_preload_path are not provided. Using the default model weights initialized on the persistor side.\n", - "2024-04-26 15:46:12,089 - FedAvg - INFO - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather]: Round 0 started.\n", - "2024-04-26 15:46:12,089 - FedAvg - INFO - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather]: Sending task train to ['site-2', 'site-1']\n", - "2024-04-26 15:46:12,089 - WFCommServer - INFO - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather]: scheduled task train\n", - "2024-04-26 15:46:12,395 - SimulatorClientRunner - INFO - Start the clients run simulation.\n", - "2024-04-26 15:46:13,397 - SimulatorClientRunner - INFO - Simulate Run client: site-1 on GPU group: None\n", - "2024-04-26 15:46:13,411 - SimulatorClientRunner - INFO - Simulate Run client: site-2 on GPU group: None\n", - "2024-04-26 15:46:14,432 - ClientTaskWorker - INFO - ClientTaskWorker started to run\n", - "2024-04-26 15:46:14,442 - ClientTaskWorker - INFO - ClientTaskWorker started to run\n", - "2024-04-26 15:46:14,499 - CoreCell - INFO - site-1.simulate_job: created backbone external connector to tcp://localhost:35947\n", - "2024-04-26 15:46:14,499 - nvflare.fuel.f3.sfm.conn_manager - INFO - Connector [CH00001 ACTIVE tcp://localhost:35947] is starting\n", - "2024-04-26 15:46:14,500 - nvflare.fuel.f3.sfm.conn_manager - INFO - Connection [CN00002 127.0.0.1:50092 => 127.0.0.1:35947] is created: PID: 2567355\n", - "2024-04-26 15:46:14,501 - nvflare.fuel.f3.sfm.conn_manager - INFO - Connection [CN00005 127.0.0.1:35947 <= 127.0.0.1:50092] is created: PID: 2567262\n", - "2024-04-26 15:46:14,507 - CoreCell - INFO - site-2.simulate_job: created backbone external connector to tcp://localhost:35947\n", - "2024-04-26 15:46:14,507 - nvflare.fuel.f3.sfm.conn_manager - INFO - Connector [CH00001 ACTIVE tcp://localhost:35947] is starting\n", - "2024-04-26 15:46:14,508 - nvflare.fuel.f3.sfm.conn_manager - INFO - Connection [CN00002 127.0.0.1:50106 => 127.0.0.1:35947] is created: PID: 2567356\n", - "2024-04-26 15:46:14,508 - nvflare.fuel.f3.sfm.conn_manager - INFO - Connection [CN00006 127.0.0.1:35947 <= 127.0.0.1:50106] is created: PID: 2567262\n", - "2024-04-26 15:46:16,590 - Cell - INFO - Register blob CB for channel='aux_communication', topic='*'\n", - "2024-04-26 15:46:16,590 - Cell - INFO - Register blob CB for channel='aux_communication', topic='*'\n", - "2024-04-26 15:46:17,095 - Cell - INFO - broadcast: channel='aux_communication', topic='__sync_runner__', targets=['server.simulate_job'], timeout=2.0\n", - "2024-04-26 15:46:17,096 - Cell - INFO - broadcast: channel='aux_communication', topic='__sync_runner__', targets=['server.simulate_job'], timeout=2.0\n", - "2024-04-26 15:46:17,105 - ClientRunner - INFO - [identity=site-2, run=simulate_job]: synced to Server Runner in 0.5097711086273193 seconds\n", - "2024-04-26 15:46:17,105 - ReliableMessage - INFO - enabled reliable message: max_request_workers=20 query_interval=2.0\n", - "2024-04-26 15:46:17,108 - ClientRunner - INFO - [identity=site-1, run=simulate_job]: synced to Server Runner in 0.5127370357513428 seconds\n", - "2024-04-26 15:46:17,108 - ReliableMessage - INFO - enabled reliable message: max_request_workers=20 query_interval=2.0\n", + "2024-04-26 16:01:16,820 - SimulatorRunner - INFO - Create the Simulator Server.\n", + "2024-04-26 16:01:16,822 - CoreCell - INFO - server: creating listener on tcp://0:43925\n", + "2024-04-26 16:01:16,839 - CoreCell - INFO - server: created backbone external listener for tcp://0:43925\n", + "2024-04-26 16:01:16,839 - ConnectorManager - INFO - 2598417: Try start_listener Listener resources: {'secure': False, 'host': 'localhost'}\n", + "2024-04-26 16:01:16,840 - nvflare.fuel.f3.sfm.conn_manager - INFO - Connector [CH00002 PASSIVE tcp://0:26178] is starting\n", + "2024-04-26 16:01:17,341 - CoreCell - INFO - server: created backbone internal listener for tcp://localhost:26178\n", + "2024-04-26 16:01:17,342 - nvflare.fuel.f3.sfm.conn_manager - INFO - Connector [CH00001 PASSIVE tcp://0:43925] is starting\n", + "2024-04-26 16:01:17,414 - nvflare.fuel.hci.server.hci - INFO - Starting Admin Server localhost on Port 53755\n", + "2024-04-26 16:01:17,414 - SimulatorRunner - INFO - Deploy the Apps.\n", + "2024-04-26 16:01:17,417 - SimulatorRunner - INFO - Create the simulate clients.\n", + "2024-04-26 16:01:17,420 - ClientManager - INFO - Client: New client site-1@192.168.1.203 joined. Sent token: 03bbadf2-a405-4d56-96c0-d36175100b04. Total clients: 1\n", + "2024-04-26 16:01:17,421 - FederatedClient - INFO - Successfully registered client:site-1 for project simulator_server. Token:03bbadf2-a405-4d56-96c0-d36175100b04 SSID:\n", + "2024-04-26 16:01:17,422 - ClientManager - INFO - Client: New client site-2@192.168.1.203 joined. Sent token: ed0662e7-4007-4632-a9b7-7f64c3b1276a. Total clients: 2\n", + "2024-04-26 16:01:17,422 - FederatedClient - INFO - Successfully registered client:site-2 for project simulator_server. Token:ed0662e7-4007-4632-a9b7-7f64c3b1276a SSID:\n", + "2024-04-26 16:01:17,422 - SimulatorRunner - INFO - Set the client status ready.\n", + "2024-04-26 16:01:17,422 - SimulatorRunner - INFO - Deploy and start the Server App.\n", + "2024-04-26 16:01:17,424 - Cell - INFO - Register blob CB for channel='server_command', topic='*'\n", + "2024-04-26 16:01:17,425 - Cell - INFO - Register blob CB for channel='aux_communication', topic='*'\n", + "2024-04-26 16:01:17,425 - ServerCommandAgent - INFO - ServerCommandAgent cell register_request_cb: server.simulate_job\n", + "2024-04-26 16:01:21,018 - IntimeModelSelector - INFO - model selection weights control: None\n", + "2024-04-26 16:01:21,019 - ReliableMessage - INFO - enabled reliable message: max_request_workers=20 query_interval=2.0\n", + "2024-04-26 16:01:21,020 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job]: Server runner starting ...\n", + "2024-04-26 16:01:21,027 - MLflowReceiver - INFO - Experiment with name 'nvflare-fedavg-mednist-experiment' does not exist. Creating a new experiment.\n", + "2024-04-26 16:01:21,031 - MLflowReceiver - INFO - Experiment=\n", + "2024-04-26 16:01:21,036 - MLflowReceiver - INFO - Experiment=\n", + "2024-04-26 16:01:21,040 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job]: starting workflow scatter_and_gather () ...\n", + "2024-04-26 16:01:21,041 - FedAvg - INFO - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather]: Initializing ModelController workflow.\n", + "2024-04-26 16:01:21,041 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather]: Workflow scatter_and_gather () started\n", + "2024-04-26 16:01:21,041 - FedAvg - INFO - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather]: Beginning model controller run.\n", + "2024-04-26 16:01:21,041 - FedAvg - INFO - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather]: Start FedAvg.\n", + "2024-04-26 16:01:21,041 - FedAvg - INFO - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather]: loading initial model from persistor\n", + "2024-04-26 16:01:21,041 - PTFileModelPersistor - INFO - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather]: Both source_ckpt_file_full_name and ckpt_preload_path are not provided. Using the default model weights initialized on the persistor side.\n", + "2024-04-26 16:01:21,046 - FedAvg - INFO - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather]: Round 0 started.\n", + "2024-04-26 16:01:21,047 - FedAvg - INFO - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather]: Sending task train to ['site-1', 'site-2']\n", + "2024-04-26 16:01:21,047 - WFCommServer - INFO - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather]: scheduled task train\n", + "2024-04-26 16:01:21,431 - SimulatorClientRunner - INFO - Start the clients run simulation.\n", + "2024-04-26 16:01:22,433 - SimulatorClientRunner - INFO - Simulate Run client: site-1 on GPU group: None\n", + "2024-04-26 16:01:22,434 - SimulatorClientRunner - INFO - Simulate Run client: site-2 on GPU group: None\n", + "2024-04-26 16:01:23,471 - ClientTaskWorker - INFO - ClientTaskWorker started to run\n", + "2024-04-26 16:01:23,480 - ClientTaskWorker - INFO - ClientTaskWorker started to run\n", + "2024-04-26 16:01:23,534 - CoreCell - INFO - site-1.simulate_job: created backbone external connector to tcp://localhost:43925\n", + "2024-04-26 16:01:23,534 - nvflare.fuel.f3.sfm.conn_manager - INFO - Connector [CH00001 ACTIVE tcp://localhost:43925] is starting\n", + "2024-04-26 16:01:23,535 - nvflare.fuel.f3.sfm.conn_manager - INFO - Connection [CN00002 127.0.0.1:37738 => 127.0.0.1:43925] is created: PID: 2598509\n", + "2024-04-26 16:01:23,536 - nvflare.fuel.f3.sfm.conn_manager - INFO - Connection [CN00005 127.0.0.1:43925 <= 127.0.0.1:37738] is created: PID: 2598417\n", + "2024-04-26 16:01:23,550 - CoreCell - INFO - site-2.simulate_job: created backbone external connector to tcp://localhost:43925\n", + "2024-04-26 16:01:23,550 - nvflare.fuel.f3.sfm.conn_manager - INFO - Connector [CH00001 ACTIVE tcp://localhost:43925] is starting\n", + "2024-04-26 16:01:23,551 - nvflare.fuel.f3.sfm.conn_manager - INFO - Connection [CN00002 127.0.0.1:37746 => 127.0.0.1:43925] is created: PID: 2598510\n", + "2024-04-26 16:01:23,551 - nvflare.fuel.f3.sfm.conn_manager - INFO - Connection [CN00006 127.0.0.1:43925 <= 127.0.0.1:37746] is created: PID: 2598417\n", + "2024-04-26 16:01:25,493 - Cell - INFO - Register blob CB for channel='aux_communication', topic='*'\n", + "2024-04-26 16:01:25,590 - Cell - INFO - Register blob CB for channel='aux_communication', topic='*'\n", + "2024-04-26 16:01:26,000 - Cell - INFO - broadcast: channel='aux_communication', topic='__sync_runner__', targets=['server.simulate_job'], timeout=2.0\n", + "2024-04-26 16:01:26,010 - ClientRunner - INFO - [identity=site-1, run=simulate_job]: synced to Server Runner in 0.5108146667480469 seconds\n", + "2024-04-26 16:01:26,010 - ReliableMessage - INFO - enabled reliable message: max_request_workers=20 query_interval=2.0\n", + "2024-04-26 16:01:26,096 - Cell - INFO - broadcast: channel='aux_communication', topic='__sync_runner__', targets=['server.simulate_job'], timeout=2.0\n", + "2024-04-26 16:01:26,105 - ClientRunner - INFO - [identity=site-2, run=simulate_job]: synced to Server Runner in 0.5092577934265137 seconds\n", + "2024-04-26 16:01:26,105 - ReliableMessage - INFO - enabled reliable message: max_request_workers=20 query_interval=2.0\n", "MONAI version: 1.4.dev2416\n", "Numpy version: 1.26.4\n", "Pytorch version: 2.3.0+cu121\n", "MONAI flags: HAS_EXT = False, USE_COMPILED = False, USE_META_DICT = False\n", "MONAI rev id: d487995c8f685729ee5e1259f364034dab4811da\n", - "MONAI version: 1.4.dev2416\n", - "Numpy version: 1.26.4\n", - "Pytorch version: 2.3.0+cu121\n", - "MONAI flags: HAS_EXT = False, USE_COMPILED = False, USE_META_DICT = False\n", - "MONAI rev id: d487995c8f685729ee5e1259f364034dab4811da\n", - "MONAI __file__: /home//.venv_monai/lib/python3.10/site-packages/monai/__init__.py\n", - "\n", - "Optional dependencies:\n", "MONAI __file__: /home//.venv_monai/lib/python3.10/site-packages/monai/__init__.py\n", "\n", "Optional dependencies:\n", @@ -402,6 +398,43 @@ "For details about installing the optional dependencies, please visit:\n", " https://docs.monai.io/en/latest/installation.html#installing-the-recommended-dependencies\n", "\n", + "2024-04-26 16:01:27,702 - PTInProcessClientAPIExecutor - ERROR - [identity=site-1, run=simulate_job]: Exception when handling event \"_start_run\": AttributeError: 'NoneType' object has no attribute 'init'\n", + "2024-04-26 16:01:27,702 - PTInProcessClientAPIExecutor - ERROR - Traceback (most recent call last):\n", + " File \"/media/hroth/NVIDIA/home_old/hroth/Code2/nvflare/monai_flywheel/nvflare/private/event.py\", line 61, in fire_event\n", + " h.handle_event(event, ctx)\n", + " File \"/media/hroth/NVIDIA/home_old/hroth/Code2/nvflare/monai_flywheel/nvflare/app_common/executors/in_process_client_api_executor.py\", line 108, in handle_event\n", + " self._task_fn_wrapper = ExecTaskFuncWrapper(\n", + " File \"/media/hroth/NVIDIA/home_old/hroth/Code2/nvflare/monai_flywheel/nvflare/app_common/executors/exec_task_fn_wrapper.py\", line 35, in __init__\n", + " self.task_fn = find_task_fn(task_fn_path)\n", + " File \"/media/hroth/NVIDIA/home_old/hroth/Code2/nvflare/monai_flywheel/nvflare/fuel/utils/function_utils.py\", line 36, in find_task_fn\n", + " module = importlib.import_module(module_name)\n", + " File \"/usr/lib/python3.10/importlib/__init__.py\", line 126, in import_module\n", + " return _bootstrap._gcd_import(name[level:], package, level)\n", + " File \"\", line 1050, in _gcd_import\n", + " File \"\", line 1027, in _find_and_load\n", + " File \"\", line 1006, in _find_and_load_unlocked\n", + " File \"\", line 688, in _load_unlocked\n", + " File \"\", line 883, in exec_module\n", + " File \"\", line 241, in _call_with_frames_removed\n", + " File \"/media/hroth/NVIDIA/home_old/hroth/Code2/nvflare/monai_flywheel/integration/monai/examples/mednist/fedavg_workspace/server/simulate_job/app_server/custom/monai_mednist_train.py\", line 55, in \n", + " flare.init()\n", + " File \"/media/hroth/NVIDIA/home_old/hroth/Code2/nvflare/monai_flywheel/nvflare/client/api.py\", line 53, in init\n", + " client_api.init(rank=rank)\n", + "AttributeError: 'NoneType' object has no attribute 'init'\n", + "\n", + "2024-04-26 16:01:27,702 - ClientRunner - INFO - [identity=site-1, run=simulate_job]: client runner started\n", + "2024-04-26 16:01:27,703 - ClientTaskWorker - INFO - Initialize ClientRunner for client: site-1\n", + "2024-04-26 16:01:27,706 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather, peer=site-1, peer_run=simulate_job, task_name=train, task_id=29d3de90-01ab-436e-8722-dbf3ff2a1411]: assigned task to client site-1: name=train, id=29d3de90-01ab-436e-8722-dbf3ff2a1411\n", + "2024-04-26 16:01:27,707 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather, peer=site-1, peer_run=simulate_job, task_name=train, task_id=29d3de90-01ab-436e-8722-dbf3ff2a1411]: sent task assignment to client. client_name:site-1 task_id:29d3de90-01ab-436e-8722-dbf3ff2a1411\n", + "2024-04-26 16:01:27,707 - GetTaskCommand - INFO - return task to client. client_name: site-1 task_name: train task_id: 29d3de90-01ab-436e-8722-dbf3ff2a1411 sharable_header_task_id: 29d3de90-01ab-436e-8722-dbf3ff2a1411\n", + "MONAI version: 1.4.dev2416\n", + "Numpy version: 1.26.4\n", + "Pytorch version: 2.3.0+cu121\n", + "MONAI flags: HAS_EXT = False, USE_COMPILED = False, USE_META_DICT = False\n", + "MONAI rev id: d487995c8f685729ee5e1259f364034dab4811da\n", + "MONAI __file__: /home//.venv_monai/lib/python3.10/site-packages/monai/__init__.py\n", + "\n", + "Optional dependencies:\n", "Pytorch Ignite version: 0.4.11\n", "ITK version: NOT INSTALLED or UNKNOWN VERSION.\n", "Nibabel version: NOT INSTALLED or UNKNOWN VERSION.\n", @@ -424,9 +457,8 @@ "For details about installing the optional dependencies, please visit:\n", " https://docs.monai.io/en/latest/installation.html#installing-the-recommended-dependencies\n", "\n", - "2024-04-26 15:46:18,720 - PTInProcessClientAPIExecutor - ERROR - [identity=site-1, run=simulate_job]: Exception when handling event \"_start_run\": AttributeError: 'NoneType' object has no attribute 'init'\n", - "2024-04-26 15:46:18,720 - PTInProcessClientAPIExecutor - ERROR - [identity=site-2, run=simulate_job]: Exception when handling event \"_start_run\": AttributeError: 'NoneType' object has no attribute 'init'\n", - "2024-04-26 15:46:18,721 - PTInProcessClientAPIExecutor - ERROR - Traceback (most recent call last):\n", + "2024-04-26 16:01:27,808 - PTInProcessClientAPIExecutor - ERROR - [identity=site-2, run=simulate_job]: Exception when handling event \"_start_run\": AttributeError: 'NoneType' object has no attribute 'init'\n", + "2024-04-26 16:01:27,809 - PTInProcessClientAPIExecutor - ERROR - Traceback (most recent call last):\n", " File \"/media/hroth/NVIDIA/home_old/hroth/Code2/nvflare/monai_flywheel/nvflare/private/event.py\", line 61, in fire_event\n", " h.handle_event(event, ctx)\n", " File \"/media/hroth/NVIDIA/home_old/hroth/Code2/nvflare/monai_flywheel/nvflare/app_common/executors/in_process_client_api_executor.py\", line 108, in handle_event\n", @@ -443,74 +475,36 @@ " File \"\", line 688, in _load_unlocked\n", " File \"\", line 883, in exec_module\n", " File \"\", line 241, in _call_with_frames_removed\n", - " File \"/media/hroth/NVIDIA/home_old/hroth/Code2/nvflare/monai_flywheel/integration/monai/examples/mednist/client_api_workspace/server/simulate_job/app_server/custom/monai_mednist_train.py\", line 56, in \n", + " File \"/media/hroth/NVIDIA/home_old/hroth/Code2/nvflare/monai_flywheel/integration/monai/examples/mednist/fedavg_workspace/server/simulate_job/app_server/custom/monai_mednist_train.py\", line 55, in \n", " flare.init()\n", " File \"/media/hroth/NVIDIA/home_old/hroth/Code2/nvflare/monai_flywheel/nvflare/client/api.py\", line 53, in init\n", " client_api.init(rank=rank)\n", "AttributeError: 'NoneType' object has no attribute 'init'\n", "\n", - "2024-04-26 15:46:18,721 - PTInProcessClientAPIExecutor - ERROR - Traceback (most recent call last):\n", - " File \"/media/hroth/NVIDIA/home_old/hroth/Code2/nvflare/monai_flywheel/nvflare/private/event.py\", line 61, in fire_event\n", - " h.handle_event(event, ctx)\n", - " File \"/media/hroth/NVIDIA/home_old/hroth/Code2/nvflare/monai_flywheel/nvflare/app_common/executors/in_process_client_api_executor.py\", line 108, in handle_event\n", - " self._task_fn_wrapper = ExecTaskFuncWrapper(\n", - " File \"/media/hroth/NVIDIA/home_old/hroth/Code2/nvflare/monai_flywheel/nvflare/app_common/executors/exec_task_fn_wrapper.py\", line 35, in __init__\n", - " self.task_fn = find_task_fn(task_fn_path)\n", - " File \"/media/hroth/NVIDIA/home_old/hroth/Code2/nvflare/monai_flywheel/nvflare/fuel/utils/function_utils.py\", line 36, in find_task_fn\n", - " module = importlib.import_module(module_name)\n", - " File \"/usr/lib/python3.10/importlib/__init__.py\", line 126, in import_module\n", - " return _bootstrap._gcd_import(name[level:], package, level)\n", - " File \"\", line 1050, in _gcd_import\n", - " File \"\", line 1027, in _find_and_load\n", - " File \"\", line 1006, in _find_and_load_unlocked\n", - " File \"\", line 688, in _load_unlocked\n", - " File \"\", line 883, in exec_module\n", - " File \"\", line 241, in _call_with_frames_removed\n", - " File \"/media/hroth/NVIDIA/home_old/hroth/Code2/nvflare/monai_flywheel/integration/monai/examples/mednist/client_api_workspace/server/simulate_job/app_server/custom/monai_mednist_train.py\", line 56, in \n", - " flare.init()\n", - " File \"/media/hroth/NVIDIA/home_old/hroth/Code2/nvflare/monai_flywheel/nvflare/client/api.py\", line 53, in init\n", - " client_api.init(rank=rank)\n", - "AttributeError: 'NoneType' object has no attribute 'init'\n", - "\n", - "2024-04-26 15:46:18,721 - ClientRunner - INFO - [identity=site-1, run=simulate_job]: client runner started\n", - "2024-04-26 15:46:18,721 - ClientRunner - INFO - [identity=site-2, run=simulate_job]: client runner started\n", - "2024-04-26 15:46:18,721 - ClientTaskWorker - INFO - Initialize ClientRunner for client: site-1\n", - "2024-04-26 15:46:18,721 - ClientTaskWorker - INFO - Initialize ClientRunner for client: site-2\n", - "2024-04-26 15:46:18,724 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather, peer=site-2, peer_run=simulate_job, task_name=train, task_id=fa31ef86-bcc4-4100-9cec-4f4f217bf2c9]: assigned task to client site-2: name=train, id=fa31ef86-bcc4-4100-9cec-4f4f217bf2c9\n", - "2024-04-26 15:46:18,725 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather, peer=site-2, peer_run=simulate_job, task_name=train, task_id=fa31ef86-bcc4-4100-9cec-4f4f217bf2c9]: sent task assignment to client. client_name:site-2 task_id:fa31ef86-bcc4-4100-9cec-4f4f217bf2c9\n", - "2024-04-26 15:46:18,726 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather, peer=site-1, peer_run=simulate_job, task_name=train, task_id=6a603103-770f-403f-91df-f479c6a56e2d]: assigned task to client site-1: name=train, id=6a603103-770f-403f-91df-f479c6a56e2d\n", - "2024-04-26 15:46:18,726 - GetTaskCommand - INFO - return task to client. client_name: site-2 task_name: train task_id: fa31ef86-bcc4-4100-9cec-4f4f217bf2c9 sharable_header_task_id: fa31ef86-bcc4-4100-9cec-4f4f217bf2c9\n", - "2024-04-26 15:46:18,727 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather, peer=site-1, peer_run=simulate_job, task_name=train, task_id=6a603103-770f-403f-91df-f479c6a56e2d]: sent task assignment to client. client_name:site-1 task_id:6a603103-770f-403f-91df-f479c6a56e2d\n", - "2024-04-26 15:46:18,748 - GetTaskCommand - INFO - return task to client. client_name: site-1 task_name: train task_id: 6a603103-770f-403f-91df-f479c6a56e2d sharable_header_task_id: 6a603103-770f-403f-91df-f479c6a56e2d\n", - "2024-04-26 15:46:18,920 - Communicator - INFO - Received from simulator_server server. getTask: train size: 28.3MB (28319188 Bytes) time: 0.199132 seconds\n", - "2024-04-26 15:46:18,921 - FederatedClient - INFO - pull_task completed. Task name:train Status:True \n", - "2024-04-26 15:46:18,921 - ClientRunner - INFO - [identity=site-2, run=simulate_job, peer=simulator_server, peer_run=simulate_job]: got task assignment: name=train, id=fa31ef86-bcc4-4100-9cec-4f4f217bf2c9\n", - "2024-04-26 15:46:18,921 - ClientRunner - INFO - [identity=site-2, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=train, task_id=fa31ef86-bcc4-4100-9cec-4f4f217bf2c9]: invoking task executor PTInProcessClientAPIExecutor\n", - "2024-04-26 15:46:18,921 - PTInProcessClientAPIExecutor - INFO - [identity=site-2, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=train, task_id=fa31ef86-bcc4-4100-9cec-4f4f217bf2c9]: execute for task (train)\n", - "2024-04-26 15:46:18,921 - PTInProcessClientAPIExecutor - ERROR - [identity=site-2, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=train, task_id=fa31ef86-bcc4-4100-9cec-4f4f217bf2c9]: Traceback (most recent call last):\n", - " File \"/media/hroth/NVIDIA/home_old/hroth/Code2/nvflare/monai_flywheel/nvflare/app_common/executors/in_process_client_api_executor.py\", line 131, in execute\n", - " self.client_api.set_meta(meta)\n", - "AttributeError: 'PTInProcessClientAPIExecutor' object has no attribute 'client_api'\n", - "\n", - "2024-04-26 15:46:18,922 - ClientRunner - INFO - [identity=site-2, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=train, task_id=fa31ef86-bcc4-4100-9cec-4f4f217bf2c9]: finished processing task\n", - "2024-04-26 15:46:18,922 - ClientRunner - INFO - [identity=site-2, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=train, task_id=fa31ef86-bcc4-4100-9cec-4f4f217bf2c9]: try #1: sending task result to server\n", - "2024-04-26 15:46:18,922 - ClientRunner - INFO - [identity=site-2, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=train, task_id=fa31ef86-bcc4-4100-9cec-4f4f217bf2c9]: checking task ...\n", - "2024-04-26 15:46:18,922 - Cell - INFO - broadcast: channel='aux_communication', topic='__task_check__', targets=['server.simulate_job'], timeout=5.0\n", - "2024-04-26 15:46:18,929 - ClientRunner - INFO - [identity=site-2, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=train, task_id=fa31ef86-bcc4-4100-9cec-4f4f217bf2c9]: start to send task result to server\n", - "2024-04-26 15:46:18,929 - FederatedClient - INFO - Starting to push execute result.\n", - "2024-04-26 15:46:18,931 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather, peer=site-2, peer_run=simulate_job]: got result from client site-2 for task: name=train, id=fa31ef86-bcc4-4100-9cec-4f4f217bf2c9\n", - "2024-04-26 15:46:18,932 - WFCommServer - ERROR - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather, peer=site-2, peer_run=simulate_job, peer_rc=EXECUTION_EXCEPTION, task_name=train, task_id=fa31ef86-bcc4-4100-9cec-4f4f217bf2c9]: processing error in result_received_cb on task train(fa31ef86-bcc4-4100-9cec-4f4f217bf2c9): ValueError: the shareable is not a valid DXO - expect content_type DXO but got None\n", - "2024-04-26 15:46:18,933 - Communicator - INFO - Received from simulator_server server. getTask: train size: 28.3MB (28319188 Bytes) time: 0.211613 seconds\n", - "2024-04-26 15:46:18,933 - FederatedClient - INFO - pull_task completed. Task name:train Status:True \n", - "2024-04-26 15:46:18,933 - ClientRunner - INFO - [identity=site-1, run=simulate_job, peer=simulator_server, peer_run=simulate_job]: got task assignment: name=train, id=6a603103-770f-403f-91df-f479c6a56e2d\n", - "2024-04-26 15:46:18,934 - ClientRunner - INFO - [identity=site-1, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=train, task_id=6a603103-770f-403f-91df-f479c6a56e2d]: invoking task executor PTInProcessClientAPIExecutor\n", - "2024-04-26 15:46:18,934 - PTInProcessClientAPIExecutor - INFO - [identity=site-1, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=train, task_id=6a603103-770f-403f-91df-f479c6a56e2d]: execute for task (train)\n", - "2024-04-26 15:46:18,934 - PTInProcessClientAPIExecutor - ERROR - [identity=site-1, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=train, task_id=6a603103-770f-403f-91df-f479c6a56e2d]: Traceback (most recent call last):\n", + "2024-04-26 16:01:27,809 - ClientRunner - INFO - [identity=site-2, run=simulate_job]: client runner started\n", + "2024-04-26 16:01:27,809 - ClientTaskWorker - INFO - Initialize ClientRunner for client: site-2\n", + "2024-04-26 16:01:27,813 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather, peer=site-2, peer_run=simulate_job, task_name=train, task_id=844620aa-9411-4582-837a-4551995e669f]: assigned task to client site-2: name=train, id=844620aa-9411-4582-837a-4551995e669f\n", + "2024-04-26 16:01:27,813 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather, peer=site-2, peer_run=simulate_job, task_name=train, task_id=844620aa-9411-4582-837a-4551995e669f]: sent task assignment to client. client_name:site-2 task_id:844620aa-9411-4582-837a-4551995e669f\n", + "2024-04-26 16:01:27,813 - GetTaskCommand - INFO - return task to client. client_name: site-2 task_name: train task_id: 844620aa-9411-4582-837a-4551995e669f sharable_header_task_id: 844620aa-9411-4582-837a-4551995e669f\n", + "2024-04-26 16:01:27,913 - Communicator - INFO - Received from simulator_server server. getTask: train size: 28.3MB (28319188 Bytes) time: 0.210103 seconds\n", + "2024-04-26 16:01:27,913 - FederatedClient - INFO - pull_task completed. Task name:train Status:True \n", + "2024-04-26 16:01:27,913 - ClientRunner - INFO - [identity=site-1, run=simulate_job, peer=simulator_server, peer_run=simulate_job]: got task assignment: name=train, id=29d3de90-01ab-436e-8722-dbf3ff2a1411\n", + "2024-04-26 16:01:27,914 - ClientRunner - INFO - [identity=site-1, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=train, task_id=29d3de90-01ab-436e-8722-dbf3ff2a1411]: invoking task executor PTInProcessClientAPIExecutor\n", + "2024-04-26 16:01:27,914 - PTInProcessClientAPIExecutor - INFO - [identity=site-1, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=train, task_id=29d3de90-01ab-436e-8722-dbf3ff2a1411]: execute for task (train)\n", + "2024-04-26 16:01:27,914 - PTInProcessClientAPIExecutor - ERROR - [identity=site-1, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=train, task_id=29d3de90-01ab-436e-8722-dbf3ff2a1411]: Traceback (most recent call last):\n", " File \"/media/hroth/NVIDIA/home_old/hroth/Code2/nvflare/monai_flywheel/nvflare/app_common/executors/in_process_client_api_executor.py\", line 131, in execute\n", " self.client_api.set_meta(meta)\n", "AttributeError: 'PTInProcessClientAPIExecutor' object has no attribute 'client_api'\n", "\n", - "2024-04-26 15:46:18,934 - WFCommServer - ERROR - Traceback (most recent call last):\n", + "2024-04-26 16:01:27,914 - ClientRunner - INFO - [identity=site-1, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=train, task_id=29d3de90-01ab-436e-8722-dbf3ff2a1411]: finished processing task\n", + "2024-04-26 16:01:27,914 - ClientRunner - INFO - [identity=site-1, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=train, task_id=29d3de90-01ab-436e-8722-dbf3ff2a1411]: try #1: sending task result to server\n", + "2024-04-26 16:01:27,914 - ClientRunner - INFO - [identity=site-1, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=train, task_id=29d3de90-01ab-436e-8722-dbf3ff2a1411]: checking task ...\n", + "2024-04-26 16:01:27,915 - Cell - INFO - broadcast: channel='aux_communication', topic='__task_check__', targets=['server.simulate_job'], timeout=5.0\n", + "2024-04-26 16:01:27,919 - ClientRunner - INFO - [identity=site-1, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=train, task_id=29d3de90-01ab-436e-8722-dbf3ff2a1411]: start to send task result to server\n", + "2024-04-26 16:01:27,919 - FederatedClient - INFO - Starting to push execute result.\n", + "2024-04-26 16:01:27,921 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather, peer=site-1, peer_run=simulate_job]: got result from client site-1 for task: name=train, id=29d3de90-01ab-436e-8722-dbf3ff2a1411\n", + "2024-04-26 16:01:27,921 - WFCommServer - ERROR - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather, peer=site-1, peer_run=simulate_job, peer_rc=EXECUTION_EXCEPTION, task_name=train, task_id=29d3de90-01ab-436e-8722-dbf3ff2a1411]: processing error in result_received_cb on task train(29d3de90-01ab-436e-8722-dbf3ff2a1411): ValueError: the shareable is not a valid DXO - expect content_type DXO but got None\n", + "2024-04-26 16:01:27,922 - WFCommServer - ERROR - Traceback (most recent call last):\n", " File \"/media/hroth/NVIDIA/home_old/hroth/Code2/nvflare/monai_flywheel/nvflare/apis/impl/wf_comm_server.py\", line 421, in _do_process_submission\n", " task.result_received_cb(client_task=client_task, fl_ctx=fl_ctx)\n", " File \"/media/hroth/NVIDIA/home_old/hroth/Code2/nvflare/monai_flywheel/nvflare/app_common/workflows/model_controller.py\", line 216, in _process_result\n", @@ -521,38 +515,48 @@ " raise ValueError(\"the shareable is not a valid DXO - expect content_type DXO but got {}\".format(content_type))\n", "ValueError: the shareable is not a valid DXO - expect content_type DXO but got None\n", "\n", - "2024-04-26 15:46:18,934 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather, peer=site-2, peer_run=simulate_job, peer_rc=EXECUTION_EXCEPTION, task_name=train, task_id=fa31ef86-bcc4-4100-9cec-4f4f217bf2c9]: finished processing client result by scatter_and_gather\n", - "2024-04-26 15:46:18,934 - ClientRunner - INFO - [identity=site-1, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=train, task_id=6a603103-770f-403f-91df-f479c6a56e2d]: finished processing task\n", - "2024-04-26 15:46:18,934 - ClientRunner - INFO - [identity=site-1, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=train, task_id=6a603103-770f-403f-91df-f479c6a56e2d]: try #1: sending task result to server\n", - "2024-04-26 15:46:18,934 - ClientRunner - INFO - [identity=site-1, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=train, task_id=6a603103-770f-403f-91df-f479c6a56e2d]: checking task ...\n", - "2024-04-26 15:46:18,935 - Cell - INFO - broadcast: channel='aux_communication', topic='__task_check__', targets=['server.simulate_job'], timeout=5.0\n", - "2024-04-26 15:46:18,935 - SubmitUpdateCommand - INFO - submit_update process. client_name:site-2 task_id:fa31ef86-bcc4-4100-9cec-4f4f217bf2c9\n", - "2024-04-26 15:46:18,937 - Communicator - INFO - SubmitUpdate size: 567B (567 Bytes). time: 0.007819 seconds\n", - "2024-04-26 15:46:18,937 - ClientRunner - INFO - [identity=site-2, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=train, task_id=fa31ef86-bcc4-4100-9cec-4f4f217bf2c9]: task result sent to server\n", - "2024-04-26 15:46:18,937 - ClientTaskWorker - INFO - Finished one task run for client: site-2 interval: 2 task_processed: True\n", - "2024-04-26 15:46:18,939 - ClientRunner - INFO - [identity=site-1, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=train, task_id=6a603103-770f-403f-91df-f479c6a56e2d]: start to send task result to server\n", - "2024-04-26 15:46:18,940 - FederatedClient - INFO - Starting to push execute result.\n", - "2024-04-26 15:46:18,941 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather, peer=site-1, peer_run=simulate_job]: got result from client site-1 for task: name=train, id=6a603103-770f-403f-91df-f479c6a56e2d\n", - "2024-04-26 15:46:18,942 - WFCommServer - INFO - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather, peer=site-1, peer_run=simulate_job, peer_rc=EXECUTION_EXCEPTION, task_name=train, task_id=6a603103-770f-403f-91df-f479c6a56e2d]: task is already finished - submission dropped\n", - "2024-04-26 15:46:18,942 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather, peer=site-1, peer_run=simulate_job, peer_rc=EXECUTION_EXCEPTION, task_name=train, task_id=6a603103-770f-403f-91df-f479c6a56e2d]: finished processing client result by scatter_and_gather\n", - "2024-04-26 15:46:18,942 - SubmitUpdateCommand - INFO - submit_update process. client_name:site-1 task_id:6a603103-770f-403f-91df-f479c6a56e2d\n", - "2024-04-26 15:46:18,943 - Communicator - INFO - SubmitUpdate size: 567B (567 Bytes). time: 0.003030 seconds\n", - "2024-04-26 15:46:18,943 - ClientRunner - INFO - [identity=site-1, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=train, task_id=6a603103-770f-403f-91df-f479c6a56e2d]: task result sent to server\n", - "2024-04-26 15:46:18,943 - ClientTaskWorker - INFO - Finished one task run for client: site-1 interval: 2 task_processed: True\n", - "2024-04-26 15:46:19,102 - FedAvg - WARNING - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather, peer=site-2, peer_run=simulate_job, peer_rc=EXECUTION_EXCEPTION, task_name=train, task_id=fa31ef86-bcc4-4100-9cec-4f4f217bf2c9]: Number of results (0) is different from number of targets (2).\n", - "2024-04-26 15:46:19,103 - FedAvg - INFO - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather, peer=site-2, peer_run=simulate_job, peer_rc=EXECUTION_EXCEPTION, task_name=train, task_id=fa31ef86-bcc4-4100-9cec-4f4f217bf2c9]: aggregating 0 update(s) at round 0\n", - "2024-04-26 15:46:19,104 - FedAvg - ERROR - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather, peer=site-2, peer_run=simulate_job, peer_rc=EXECUTION_EXCEPTION, task_name=train, task_id=fa31ef86-bcc4-4100-9cec-4f4f217bf2c9]: Exception in aggregate call: IndexError: list index out of range\n", - "2024-04-26 15:46:19,104 - FedAvg - ERROR - Traceback (most recent call last):\n", + "2024-04-26 16:01:27,922 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather, peer=site-1, peer_run=simulate_job, peer_rc=EXECUTION_EXCEPTION, task_name=train, task_id=29d3de90-01ab-436e-8722-dbf3ff2a1411]: finished processing client result by scatter_and_gather\n", + "2024-04-26 16:01:27,923 - SubmitUpdateCommand - INFO - submit_update process. client_name:site-1 task_id:29d3de90-01ab-436e-8722-dbf3ff2a1411\n", + "2024-04-26 16:01:27,924 - Communicator - INFO - SubmitUpdate size: 567B (567 Bytes). time: 0.004846 seconds\n", + "2024-04-26 16:01:27,924 - ClientRunner - INFO - [identity=site-1, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=train, task_id=29d3de90-01ab-436e-8722-dbf3ff2a1411]: task result sent to server\n", + "2024-04-26 16:01:27,924 - ClientTaskWorker - INFO - Finished one task run for client: site-1 interval: 2 task_processed: True\n", + "2024-04-26 16:01:27,974 - Communicator - INFO - Received from simulator_server server. getTask: train size: 28.3MB (28319188 Bytes) time: 0.165209 seconds\n", + "2024-04-26 16:01:27,975 - FederatedClient - INFO - pull_task completed. Task name:train Status:True \n", + "2024-04-26 16:01:27,975 - ClientRunner - INFO - [identity=site-2, run=simulate_job, peer=simulator_server, peer_run=simulate_job]: got task assignment: name=train, id=844620aa-9411-4582-837a-4551995e669f\n", + "2024-04-26 16:01:27,975 - ClientRunner - INFO - [identity=site-2, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=train, task_id=844620aa-9411-4582-837a-4551995e669f]: invoking task executor PTInProcessClientAPIExecutor\n", + "2024-04-26 16:01:27,975 - PTInProcessClientAPIExecutor - INFO - [identity=site-2, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=train, task_id=844620aa-9411-4582-837a-4551995e669f]: execute for task (train)\n", + "2024-04-26 16:01:27,975 - PTInProcessClientAPIExecutor - ERROR - [identity=site-2, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=train, task_id=844620aa-9411-4582-837a-4551995e669f]: Traceback (most recent call last):\n", + " File \"/media/hroth/NVIDIA/home_old/hroth/Code2/nvflare/monai_flywheel/nvflare/app_common/executors/in_process_client_api_executor.py\", line 131, in execute\n", + " self.client_api.set_meta(meta)\n", + "AttributeError: 'PTInProcessClientAPIExecutor' object has no attribute 'client_api'\n", + "\n", + "2024-04-26 16:01:27,976 - ClientRunner - INFO - [identity=site-2, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=train, task_id=844620aa-9411-4582-837a-4551995e669f]: finished processing task\n", + "2024-04-26 16:01:27,976 - ClientRunner - INFO - [identity=site-2, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=train, task_id=844620aa-9411-4582-837a-4551995e669f]: try #1: sending task result to server\n", + "2024-04-26 16:01:27,976 - ClientRunner - INFO - [identity=site-2, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=train, task_id=844620aa-9411-4582-837a-4551995e669f]: checking task ...\n", + "2024-04-26 16:01:27,976 - Cell - INFO - broadcast: channel='aux_communication', topic='__task_check__', targets=['server.simulate_job'], timeout=5.0\n", + "2024-04-26 16:01:27,981 - ClientRunner - INFO - [identity=site-2, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=train, task_id=844620aa-9411-4582-837a-4551995e669f]: start to send task result to server\n", + "2024-04-26 16:01:27,982 - FederatedClient - INFO - Starting to push execute result.\n", + "2024-04-26 16:01:27,983 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather, peer=site-2, peer_run=simulate_job]: got result from client site-2 for task: name=train, id=844620aa-9411-4582-837a-4551995e669f\n", + "2024-04-26 16:01:27,984 - WFCommServer - INFO - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather, peer=site-2, peer_run=simulate_job, peer_rc=EXECUTION_EXCEPTION, task_name=train, task_id=844620aa-9411-4582-837a-4551995e669f]: task is already finished - submission dropped\n", + "2024-04-26 16:01:27,984 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather, peer=site-2, peer_run=simulate_job, peer_rc=EXECUTION_EXCEPTION, task_name=train, task_id=844620aa-9411-4582-837a-4551995e669f]: finished processing client result by scatter_and_gather\n", + "2024-04-26 16:01:27,984 - SubmitUpdateCommand - INFO - submit_update process. client_name:site-2 task_id:844620aa-9411-4582-837a-4551995e669f\n", + "2024-04-26 16:01:27,985 - Communicator - INFO - SubmitUpdate size: 567B (567 Bytes). time: 0.003595 seconds\n", + "2024-04-26 16:01:27,985 - ClientRunner - INFO - [identity=site-2, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=train, task_id=844620aa-9411-4582-837a-4551995e669f]: task result sent to server\n", + "2024-04-26 16:01:27,986 - ClientTaskWorker - INFO - Finished one task run for client: site-2 interval: 2 task_processed: True\n", + "2024-04-26 16:01:28,061 - FedAvg - WARNING - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather, peer=site-1, peer_run=simulate_job, peer_rc=EXECUTION_EXCEPTION, task_name=train, task_id=29d3de90-01ab-436e-8722-dbf3ff2a1411]: Number of results (0) is different from number of targets (2).\n", + "2024-04-26 16:01:28,062 - FedAvg - INFO - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather, peer=site-1, peer_run=simulate_job, peer_rc=EXECUTION_EXCEPTION, task_name=train, task_id=29d3de90-01ab-436e-8722-dbf3ff2a1411]: aggregating 0 update(s) at round 0\n", + "2024-04-26 16:01:28,062 - FedAvg - ERROR - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather, peer=site-1, peer_run=simulate_job, peer_rc=EXECUTION_EXCEPTION, task_name=train, task_id=29d3de90-01ab-436e-8722-dbf3ff2a1411]: Exception in aggregate call: IndexError: list index out of range\n", + "2024-04-26 16:01:28,062 - FedAvg - ERROR - Traceback (most recent call last):\n", " File \"/media/hroth/NVIDIA/home_old/hroth/Code2/nvflare/monai_flywheel/nvflare/app_common/workflows/base_fedavg.py\", line 148, in aggregate\n", " aggr_result = aggregate_fn(results)\n", " File \"/media/hroth/NVIDIA/home_old/hroth/Code2/nvflare/monai_flywheel/nvflare/app_common/workflows/base_fedavg.py\", line 124, in _aggregate_fn\n", " params_type=results[0].params_type,\n", "IndexError: list index out of range\n", "\n", - "2024-04-26 15:46:19,104 - ServerRunner - ERROR - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather, peer=site-2, peer_run=simulate_job, peer_rc=EXECUTION_EXCEPTION, task_name=train, task_id=fa31ef86-bcc4-4100-9cec-4f4f217bf2c9]: Aborting current RUN due to FATAL_SYSTEM_ERROR received: Exception in aggregate call: IndexError: list index out of range\n", - "2024-04-26 15:46:19,105 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather, peer=site-2, peer_run=simulate_job, peer_rc=EXECUTION_EXCEPTION, task_name=train, task_id=fa31ef86-bcc4-4100-9cec-4f4f217bf2c9]: asked to abort - triggered abort_signal to stop the RUN\n", - "2024-04-26 15:46:19,105 - FedAvg - ERROR - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather, peer=site-2, peer_run=simulate_job, peer_rc=EXECUTION_EXCEPTION, task_name=train, task_id=fa31ef86-bcc4-4100-9cec-4f4f217bf2c9]: Exception in model controller run: RuntimeError: params_type None of `model_update` not supported!\n", - "2024-04-26 15:46:19,106 - FedAvg - ERROR - Traceback (most recent call last):\n", + "2024-04-26 16:01:28,063 - ServerRunner - ERROR - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather, peer=site-1, peer_run=simulate_job, peer_rc=EXECUTION_EXCEPTION, task_name=train, task_id=29d3de90-01ab-436e-8722-dbf3ff2a1411]: Aborting current RUN due to FATAL_SYSTEM_ERROR received: Exception in aggregate call: IndexError: list index out of range\n", + "2024-04-26 16:01:28,063 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather, peer=site-1, peer_run=simulate_job, peer_rc=EXECUTION_EXCEPTION, task_name=train, task_id=29d3de90-01ab-436e-8722-dbf3ff2a1411]: asked to abort - triggered abort_signal to stop the RUN\n", + "2024-04-26 16:01:28,063 - FedAvg - ERROR - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather, peer=site-1, peer_run=simulate_job, peer_rc=EXECUTION_EXCEPTION, task_name=train, task_id=29d3de90-01ab-436e-8722-dbf3ff2a1411]: Exception in model controller run: RuntimeError: params_type None of `model_update` not supported!\n", + "2024-04-26 16:01:28,064 - FedAvg - ERROR - Traceback (most recent call last):\n", " File \"/media/hroth/NVIDIA/home_old/hroth/Code2/nvflare/monai_flywheel/nvflare/app_common/workflows/model_controller.py\", line 286, in control_flow\n", " self.run()\n", " File \"/media/hroth/NVIDIA/home_old/hroth/Code2/nvflare/monai_flywheel/nvflare/app_common/workflows/fedavg.py\", line 63, in run\n", @@ -563,55 +567,55 @@ " raise RuntimeError(f\"params_type {model_update.params_type} of `model_update` not supported!\")\n", "RuntimeError: params_type None of `model_update` not supported!\n", "\n", - "2024-04-26 15:46:19,106 - ServerRunner - ERROR - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather, peer=site-2, peer_run=simulate_job, peer_rc=EXECUTION_EXCEPTION, task_name=train, task_id=fa31ef86-bcc4-4100-9cec-4f4f217bf2c9]: Aborting current RUN due to FATAL_SYSTEM_ERROR received: Exception in model controller run: RuntimeError: params_type None of `model_update` not supported!\n", - "2024-04-26 15:46:19,106 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather, peer=site-2, peer_run=simulate_job, peer_rc=EXECUTION_EXCEPTION, task_name=train, task_id=fa31ef86-bcc4-4100-9cec-4f4f217bf2c9]: asked to abort - triggered abort_signal to stop the RUN\n", - "2024-04-26 15:46:19,107 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather]: Workflow: scatter_and_gather finalizing ...\n", - "2024-04-26 15:46:19,107 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather]: ABOUT_TO_END_RUN fired\n", - "2024-04-26 15:46:19,107 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather]: Firing CHECK_END_RUN_READINESS ...\n", - "2024-04-26 15:46:19,112 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather]: END_RUN fired\n", - "2024-04-26 15:46:19,113 - ReliableMessage - INFO - ReliableMessage is shutdown\n", - "2024-04-26 15:46:19,113 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather]: Server runner finished.\n", - "2024-04-26 15:46:20,068 - ReliableMessage - INFO - shutdown reliable message monitor\n", - "2024-04-26 15:46:20,943 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather, peer=site-2, peer_run=simulate_job]: server runner is finalizing - asked client to end the run\n", - "2024-04-26 15:46:20,943 - GetTaskCommand - INFO - return task to client. client_name: site-2 task_name: __end_run__ task_id: sharable_header_task_id: \n", - "2024-04-26 15:46:20,945 - FederatedClient - INFO - pull_task completed. Task name:__end_run__ Status:True \n", - "2024-04-26 15:46:20,946 - ClientRunner - INFO - [identity=site-2, run=simulate_job, peer=simulator_server, peer_run=simulate_job]: server asked to end the run\n", - "2024-04-26 15:46:20,946 - ClientRunner - INFO - [identity=site-2, run=simulate_job]: started end-run events sequence\n", - "2024-04-26 15:46:20,946 - ClientRunner - INFO - [identity=site-2, run=simulate_job]: ABOUT_TO_END_RUN fired\n", - "2024-04-26 15:46:20,946 - ClientRunner - INFO - [identity=site-2, run=simulate_job]: Firing CHECK_END_RUN_READINESS ...\n", - "2024-04-26 15:46:20,946 - ClientRunner - INFO - [identity=site-2, run=simulate_job]: END_RUN fired\n", - "2024-04-26 15:46:20,946 - ClientTaskWorker - INFO - End the Simulator run.\n", - "2024-04-26 15:46:20,948 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather, peer=site-1, peer_run=simulate_job]: server runner is finalizing - asked client to end the run\n", - "2024-04-26 15:46:20,948 - GetTaskCommand - INFO - return task to client. client_name: site-1 task_name: __end_run__ task_id: sharable_header_task_id: \n", - "2024-04-26 15:46:20,950 - FederatedClient - INFO - pull_task completed. Task name:__end_run__ Status:True \n", - "2024-04-26 15:46:20,951 - ClientRunner - INFO - [identity=site-1, run=simulate_job, peer=simulator_server, peer_run=simulate_job]: server asked to end the run\n", - "2024-04-26 15:46:20,951 - ClientRunner - INFO - [identity=site-1, run=simulate_job]: started end-run events sequence\n", - "2024-04-26 15:46:20,951 - ClientRunner - INFO - [identity=site-1, run=simulate_job]: ABOUT_TO_END_RUN fired\n", - "2024-04-26 15:46:20,951 - ClientRunner - INFO - [identity=site-1, run=simulate_job]: Firing CHECK_END_RUN_READINESS ...\n", - "2024-04-26 15:46:20,951 - ClientRunner - INFO - [identity=site-1, run=simulate_job]: END_RUN fired\n", - "2024-04-26 15:46:20,951 - ClientTaskWorker - INFO - End the Simulator run.\n", - "2024-04-26 15:46:20,988 - ClientTaskWorker - INFO - Clean up ClientRunner for : site-2 \n", - "2024-04-26 15:46:20,990 - nvflare.fuel.f3.sfm.conn_manager - INFO - Connection [CN00002 Not Connected] is closed PID: 2567356\n", - "2024-04-26 15:46:20,990 - nvflare.fuel.f3.sfm.conn_manager - INFO - Connection [CN00006 Not Connected] is closed PID: 2567262\n", - "2024-04-26 15:46:20,995 - ClientTaskWorker - INFO - Clean up ClientRunner for : site-1 \n", - "2024-04-26 15:46:20,996 - FederatedClient - INFO - Shutting down client run: site-1\n", - "2024-04-26 15:46:20,996 - FederatedClient - INFO - Shutting down client run: site-2\n", - "2024-04-26 15:46:20,997 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather]: asked to abort - triggered abort_signal to stop the RUN\n", - "2024-04-26 15:46:20,998 - nvflare.fuel.f3.sfm.conn_manager - INFO - Connection [CN00002 Not Connected] is closed PID: 2567355\n", - "2024-04-26 15:46:20,998 - nvflare.fuel.f3.sfm.conn_manager - INFO - Connection [CN00005 Not Connected] is closed PID: 2567262\n", - "2024-04-26 15:46:21,066 - SimulatorServer - INFO - Server app stopped.\n", + "2024-04-26 16:01:28,064 - ServerRunner - ERROR - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather, peer=site-1, peer_run=simulate_job, peer_rc=EXECUTION_EXCEPTION, task_name=train, task_id=29d3de90-01ab-436e-8722-dbf3ff2a1411]: Aborting current RUN due to FATAL_SYSTEM_ERROR received: Exception in model controller run: RuntimeError: params_type None of `model_update` not supported!\n", + "2024-04-26 16:01:28,064 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather, peer=site-1, peer_run=simulate_job, peer_rc=EXECUTION_EXCEPTION, task_name=train, task_id=29d3de90-01ab-436e-8722-dbf3ff2a1411]: asked to abort - triggered abort_signal to stop the RUN\n", + "2024-04-26 16:01:28,064 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather]: Workflow: scatter_and_gather finalizing ...\n", + "2024-04-26 16:01:28,065 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather]: ABOUT_TO_END_RUN fired\n", + "2024-04-26 16:01:28,065 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather]: Firing CHECK_END_RUN_READINESS ...\n", + "2024-04-26 16:01:28,068 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather]: END_RUN fired\n", + "2024-04-26 16:01:28,068 - ReliableMessage - INFO - ReliableMessage is shutdown\n", + "2024-04-26 16:01:28,068 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather]: Server runner finished.\n", + "2024-04-26 16:01:29,027 - ReliableMessage - INFO - shutdown reliable message monitor\n", + "2024-04-26 16:01:29,929 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather, peer=site-1, peer_run=simulate_job]: server runner is finalizing - asked client to end the run\n", + "2024-04-26 16:01:29,930 - GetTaskCommand - INFO - return task to client. client_name: site-1 task_name: __end_run__ task_id: sharable_header_task_id: \n", + "2024-04-26 16:01:29,932 - FederatedClient - INFO - pull_task completed. Task name:__end_run__ Status:True \n", + "2024-04-26 16:01:29,932 - ClientRunner - INFO - [identity=site-1, run=simulate_job, peer=simulator_server, peer_run=simulate_job]: server asked to end the run\n", + "2024-04-26 16:01:29,932 - ClientRunner - INFO - [identity=site-1, run=simulate_job]: started end-run events sequence\n", + "2024-04-26 16:01:29,932 - ClientRunner - INFO - [identity=site-1, run=simulate_job]: ABOUT_TO_END_RUN fired\n", + "2024-04-26 16:01:29,932 - ClientRunner - INFO - [identity=site-1, run=simulate_job]: Firing CHECK_END_RUN_READINESS ...\n", + "2024-04-26 16:01:29,932 - ClientRunner - INFO - [identity=site-1, run=simulate_job]: END_RUN fired\n", + "2024-04-26 16:01:29,933 - ClientTaskWorker - INFO - End the Simulator run.\n", + "2024-04-26 16:01:29,976 - ClientTaskWorker - INFO - Clean up ClientRunner for : site-1 \n", + "2024-04-26 16:01:29,978 - nvflare.fuel.f3.sfm.conn_manager - INFO - Connection [CN00002 Not Connected] is closed PID: 2598509\n", + "2024-04-26 16:01:29,978 - nvflare.fuel.f3.sfm.conn_manager - INFO - Connection [CN00005 Not Connected] is closed PID: 2598417\n", + "2024-04-26 16:01:29,990 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather, peer=site-2, peer_run=simulate_job]: server runner is finalizing - asked client to end the run\n", + "2024-04-26 16:01:29,990 - GetTaskCommand - INFO - return task to client. client_name: site-2 task_name: __end_run__ task_id: sharable_header_task_id: \n", + "2024-04-26 16:01:29,992 - FederatedClient - INFO - pull_task completed. Task name:__end_run__ Status:True \n", + "2024-04-26 16:01:29,993 - ClientRunner - INFO - [identity=site-2, run=simulate_job, peer=simulator_server, peer_run=simulate_job]: server asked to end the run\n", + "2024-04-26 16:01:29,993 - ClientRunner - INFO - [identity=site-2, run=simulate_job]: started end-run events sequence\n", + "2024-04-26 16:01:29,993 - ClientRunner - INFO - [identity=site-2, run=simulate_job]: ABOUT_TO_END_RUN fired\n", + "2024-04-26 16:01:29,994 - ClientRunner - INFO - [identity=site-2, run=simulate_job]: Firing CHECK_END_RUN_READINESS ...\n", + "2024-04-26 16:01:29,994 - ClientRunner - INFO - [identity=site-2, run=simulate_job]: END_RUN fired\n", + "2024-04-26 16:01:29,995 - ClientTaskWorker - INFO - End the Simulator run.\n", + "2024-04-26 16:01:30,024 - SimulatorServer - INFO - Server app stopped.\n", "\n", "\n", - "2024-04-26 15:46:21,509 - nvflare.fuel.hci.server.hci - INFO - Admin Server localhost on Port 49247 shutdown!\n", - "2024-04-26 15:46:21,510 - SimulatorServer - INFO - shutting down server\n", - "2024-04-26 15:46:21,510 - SimulatorServer - INFO - canceling sync locks\n", - "2024-04-26 15:46:21,510 - SimulatorServer - INFO - server off\n", - "2024-04-26 15:46:25,014 - MPM - INFO - MPM: Good Bye!\n" + "2024-04-26 16:01:30,036 - ClientTaskWorker - INFO - Clean up ClientRunner for : site-2 \n", + "2024-04-26 16:01:30,036 - FederatedClient - INFO - Shutting down client run: site-1\n", + "2024-04-26 16:01:30,037 - FederatedClient - INFO - Shutting down client run: site-2\n", + "2024-04-26 16:01:30,037 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather]: asked to abort - triggered abort_signal to stop the RUN\n", + "2024-04-26 16:01:30,039 - nvflare.fuel.f3.sfm.conn_manager - INFO - Connection [CN00006 Not Connected] is closed PID: 2598417\n", + "2024-04-26 16:01:30,039 - nvflare.fuel.f3.sfm.conn_manager - INFO - Connection [CN00002 Not Connected] is closed PID: 2598510\n", + "2024-04-26 16:01:30,463 - nvflare.fuel.hci.server.hci - INFO - Admin Server localhost on Port 53755 shutdown!\n", + "2024-04-26 16:01:30,463 - SimulatorServer - INFO - shutting down server\n", + "2024-04-26 16:01:30,464 - SimulatorServer - INFO - canceling sync locks\n", + "2024-04-26 16:01:30,464 - SimulatorServer - INFO - server off\n", + "2024-04-26 16:01:33,969 - MPM - INFO - MPM: Good Bye!\n" ] } ], "source": [ - "!nvflare simulator -n 2 -t 2 ./jobs/client_api -w client_api_workspace" + "!nvflare simulator -n 2 -t 2 ./jobs/fedavg_mednist -w fedavg_workspace" ] } ],