From 23b82280ca6cab032cc826dd9116f9ac622ee90e Mon Sep 17 00:00:00 2001 From: Holger Roth Date: Wed, 1 May 2024 12:54:36 -0400 Subject: [PATCH] switch to 2.4.1.rc1 --- .../mednist/code/monai_mednist_train.py | 4 +- .../fedavg_mednist/config_fed_client.conf | 119 ++-- .../monai/examples/mednist/monai_101_fl.ipynb | 510 +++++++++++++++++- 3 files changed, 591 insertions(+), 42 deletions(-) diff --git a/integration/monai/examples/mednist/code/monai_mednist_train.py b/integration/monai/examples/mednist/code/monai_mednist_train.py index 0ae3271505..d59b45fc52 100644 --- a/integration/monai/examples/mednist/code/monai_mednist_train.py +++ b/integration/monai/examples/mednist/code/monai_mednist_train.py @@ -109,7 +109,7 @@ def main(): print(f"current_round={input_model.current_round}") # (4) loads model from NVFlare and sends it to GPU - trainer.network.load_state_dict(input_model.params) + trainer.network.load_state_dict(input_model.params, strict=False) # TODO: enable strict trainer.network.to(DEVICE) trainer.run() @@ -117,7 +117,7 @@ def main(): # (5) wraps evaluation logic into a method to re-use for # evaluation on both trained and received model def evaluate(input_weights): - model.load_state_dict(input_weights) + model.load_state_dict(input_weights, strict=False) # TODO: enable strict # Check the prediction on the test dataset dataset_dir = Path(root_dir, "MedNIST") diff --git a/integration/monai/examples/mednist/job_templates/fedavg_mednist/config_fed_client.conf b/integration/monai/examples/mednist/job_templates/fedavg_mednist/config_fed_client.conf index 47c9d32c80..4029e915f3 100644 --- a/integration/monai/examples/mednist/job_templates/fedavg_mednist/config_fed_client.conf +++ b/integration/monai/examples/mednist/job_templates/fedavg_mednist/config_fed_client.conf @@ -3,11 +3,10 @@ format_version = 2 # This is the application script which will be invoked. Client can replace this script with user's own training script. 
- app_script = "monai_mednist_train.py" + app_script = "cifar10.py" # Additional arguments needed by the training code. For example, in lightning, these can be --trainer.batch_size=xxx. - # Additional arguments needed by the training code. For example, in lightning, these can be --trainer.batch_size=xxx. - app_config = "" + app_config = "monai_mednist_train.py" # Client Computing Executors. executors = [ @@ -18,33 +17,32 @@ # This particular executor executor { - path = "nvflare.app_opt.pt.in_process_client_api_executor.PTInProcessClientAPIExecutor" + # This is an executor for Client API. The underlying data exchange is using Pipe. + path = "nvflare.app_opt.pt.client_api_launcher_executor.PTClientAPILauncherExecutor" + args { - task_script_path = "{app_script}" - task_script_args = "{app_config}" - - # if the transfer_type is FULL, then it will be sent directly - # if the transfer_type is DIFF, then we will calculate the - # difference VS received parameters and send the difference - params_transfer_type = "DIFF" - - # if train_with_evaluation is true, the executor will expect - # the custom code need to send back both the trained parameters and the evaluation metric - # otherwise only trained parameters are expected - train_with_evaluation = true - - # time interval in seconds. Time interval to wait before check if the local task has submitted the result - # if the local task takes long time, you can increase this interval to larger number - # uncomment to overwrite the default, default is 0.5 seconds - result_pull_interval = 0.5 - - # time interval in seconds. Time interval to wait before check if the trainig code has log metric (such as - # Tensorboard log, MLFlow log or Weights & Biases logs. The result will be streanmed to the server side - # then to the corresponding tracking system - # if the log is not needed, you can set this to a larger number - # uncomment to overwrite the default, default is None, which disable the log streaming feature. 
- log_pull_interval = 0.1 + # launcher_id is used to locate the Launcher object in "components" + launcher_id = "launcher" + + # pipe_id is used to locate the Pipe object in "components" + pipe_id = "pipe" + + # Timeout in seconds for waiting for a heartbeat from the training script. Defaults to 30 seconds. + # Please refer to the class docstring for all available arguments + heartbeat_timeout = 60 + + # format of the exchange parameters + params_exchange_format = "pytorch" + + # if the transfer_type is FULL, then it will be sent directly + # if the transfer_type is DIFF, then we will calculate the + # difference VS received parameters and send the difference + params_transfer_type = "DIFF" + # if train_with_evaluation is true, the executor will expect + # the custom code needs to send back both the trained parameters and the evaluation metric + # otherwise only trained parameters are expected + train_with_evaluation = true } } } @@ -56,12 +54,63 @@ # this defined an array of task result filters. If provided, it will control the result from client executor to server controller task_result_filters = [] - # define this component that will help relay local metrics log to FL server. 
components = [ - { - "id": "event_to_fed", - "name": "ConvertToFedEvent", - "args": {"events_to_convert": ["analytix_log_stats"], "fed_event_prefix": "fed."} + { + # component id is "launcher" + id = "launcher" + + # the class path of this component + path = "nvflare.app_common.launchers.subprocess_launcher.SubprocessLauncher" + + args { + # the launcher will invoke the script + script = "python3 custom/{app_script} {app_config} " + # if launch_once is true, the SubprocessLauncher will launch once for the whole job + # if launch_once is false, the SubprocessLauncher will launch a process for each task it receives from server + launch_once = true } - ] -} + } + { + id = "pipe" + path = "nvflare.fuel.utils.pipe.cell_pipe.CellPipe" + args { + mode = "PASSIVE" + site_name = "{SITE_NAME}" + token = "{JOB_ID}" + root_url = "{ROOT_URL}" + secure_mode = "{SECURE_MODE}" + workspace_dir = "{WORKSPACE}" + } + } + { + id = "metrics_pipe" + path = "nvflare.fuel.utils.pipe.cell_pipe.CellPipe" + args { + mode = "PASSIVE" + site_name = "{SITE_NAME}" + token = "{JOB_ID}" + root_url = "{ROOT_URL}" + secure_mode = "{SECURE_MODE}" + workspace_dir = "{WORKSPACE}" + } + }, + { + id = "metric_relay" + path = "nvflare.app_common.widgets.metric_relay.MetricRelay" + args { + pipe_id = "metrics_pipe" + event_type = "fed.analytix_log_stats" + # how fast should it read from the peer + read_interval = 0.1 + } + }, + { + # we use this component so the client api `flare.init()` can get required information + id = "config_preparer" + path = "nvflare.app_common.widgets.external_configurator.ExternalConfigurator" + args { + component_ids = ["metric_relay"] + } + } + ] +} \ No newline at end of file diff --git a/integration/monai/examples/mednist/monai_101_fl.ipynb b/integration/monai/examples/mednist/monai_101_fl.ipynb index beb53acbad..f26c31ed64 100644 --- a/integration/monai/examples/mednist/monai_101_fl.ipynb +++ b/integration/monai/examples/mednist/monai_101_fl.ipynb @@ -109,9 +109,24 @@ }, { 
"cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "The following job templates are available: \n", + "\n", + "------------------------------------------------------------------------------------------------------------------------\n", + " name Description Controller Type Execution API Type \n", + "------------------------------------------------------------------------------------------------------------------------\n", + " fedavg_mednist FedAvg using MONAI with in_process Client API server client_api \n", + "------------------------------------------------------------------------------------------------------------------------\n" + ] + } + ], "source": [ "!nvflare config -jt ./job_templates\n", "!nvflare job list_templates" @@ -129,9 +144,83 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "The following are the variables you can change in the template\n", + "\n", + "---------------------------------------------------------------------------------------------------------------------------------------\n", + " \n", + " job folder: ./jobs/fedavg_mednist \n", + " \n", + "---------------------------------------------------------------------------------------------------------------------------------------\n", + " file_name var_name value component \n", + "---------------------------------------------------------------------------------------------------------------------------------------\n", + " meta.conf app ['@ALL'] \n", + " meta.conf mandatory_clients [] \n", + " meta.conf min_clients 2 \n", + "\n", + " config_fed_client.conf app_config monai_mednist_train.py \n", + " config_fed_client.conf app_script monai_mednist_train.py \n", + " config_fed_client.conf component_ids 
['metric_relay'] ExternalConfigurator \n", + " config_fed_client.conf config_file_name client_api_config.json ExternalConfigurator \n", + " config_fed_client.conf evaluate_task_name evaluate \n", + " config_fed_client.conf event_type fed.analytix_log_stats MetricRelay \n", + " config_fed_client.conf external_execution_wait 5.0 \n", + " config_fed_client.conf fed_event True MetricRelay \n", + " config_fed_client.conf heartbeat_interval 5.0 MetricRelay \n", + " config_fed_client.conf heartbeat_timeout 30.0 MetricRelay \n", + " config_fed_client.conf last_result_transfer_timeout 300.0 \n", + " config_fed_client.conf launch_once True SubprocessLauncher \n", + " config_fed_client.conf mode PASSIVE CellPipe \n", + " config_fed_client.conf monitor_interval 0.01 \n", + " config_fed_client.conf params_exchange_format pytorch \n", + " config_fed_client.conf params_transfer_type DIFF \n", + " config_fed_client.conf pipe_channel_name metric MetricRelay \n", + " config_fed_client.conf read_interval 0.5 \n", + " config_fed_client.conf root_url {ROOT_URL} CellPipe \n", + " config_fed_client.conf script python3 custom/{app_script} {app_c SubprocessLauncher \n", + " config_fed_client.conf secure_mode {SECURE_MODE} CellPipe \n", + " config_fed_client.conf site_name {SITE_NAME} CellPipe \n", + " config_fed_client.conf token {JOB_ID} CellPipe \n", + " config_fed_client.conf train_with_evaluation True \n", + " config_fed_client.conf workers 4 \n", + " config_fed_client.conf workspace_dir {WORKSPACE} CellPipe \n", + "\n", + " config_fed_server.conf \"mlflow.note.content\" ## Federated Experiment tracking wi \n", + " config_fed_server.conf allow_empty_global_weights False FedAvg \n", + " config_fed_server.conf artifact_location artifacts MLflowReceiver \n", + " config_fed_server.conf best_global_model_file_name best_FL_global_model.pt PTFileModelPersistor \n", + " config_fed_server.conf buffer_flush_time 1 MLflowReceiver \n", + " config_fed_server.conf events ['fed.analytix_log_stats'] 
MLflowReceiver \n", + " config_fed_server.conf experiment_name nvflare-fedavg-mednist-experiment \n", + " config_fed_server.conf global_model_file_name FL_global_model.pt PTFileModelPersistor \n", + " config_fed_server.conf ignore_result_error False FedAvg \n", + " config_fed_server.conf in_channels {in_channels} \n", + " config_fed_server.conf key_metric accuracy IntimeModelSelector \n", + " config_fed_server.conf min_clients 2 FedAvg \n", + " config_fed_server.conf model_class_path monai.networks.nets.densenet121 \n", + " config_fed_server.conf negate_key_metric False IntimeModelSelector \n", + " config_fed_server.conf num_rounds 5 FedAvg \n", + " config_fed_server.conf out_channels {out_channels} \n", + " config_fed_server.conf persist_every_n_rounds 1 FedAvg \n", + " config_fed_server.conf run_name nvflare-fedavg-mednist-with-mlflow \n", + " config_fed_server.conf spatial_dims {spatial_dims} \n", + " config_fed_server.conf task_check_period 0.5 FedAvg \n", + " config_fed_server.conf tb_folder tb_events TBAnalyticsReceiver \n", + " config_fed_server.conf tracking_uri MLflowReceiver \n", + " config_fed_server.conf validation_metric_name initial_metrics IntimeModelSelector \n", + " config_fed_server.conf weigh_by_local_iter False IntimeModelSelector \n", + "\n", + "---------------------------------------------------------------------------------------------------------------------------------------\n" + ] + } + ], "source": [ "!nvflare job create -force -j ./jobs/fedavg_mednist -w fedavg_mednist -sd ./code/. 
\\\n", " -f config_fed_client.conf app_script=monai_mednist_train.py \\\n", @@ -154,7 +243,418 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2024-05-01 12:51:25,269 - SimulatorRunner - INFO - Create the Simulator Server.\n", + "2024-05-01 12:51:25,284 - CoreCell - INFO - server: creating listener on tcp://0:43515\n", + "2024-05-01 12:51:25,298 - CoreCell - INFO - server: created backbone external listener for tcp://0:43515\n", + "2024-05-01 12:51:25,298 - ConnectorManager - INFO - 2900298: Try start_listener Listener resources: {'secure': False, 'host': 'localhost'}\n", + "2024-05-01 12:51:25,299 - nvflare.fuel.f3.sfm.conn_manager - INFO - Connector [CH00002 PASSIVE tcp://0:43791] is starting\n", + "2024-05-01 12:51:25,800 - CoreCell - INFO - server: created backbone internal listener for tcp://localhost:43791\n", + "2024-05-01 12:51:25,801 - nvflare.fuel.f3.sfm.conn_manager - INFO - Connector [CH00001 PASSIVE tcp://0:43515] is starting\n", + "2024-05-01 12:51:25,880 - nvflare.fuel.hci.server.hci - INFO - Starting Admin Server localhost on Port 44941\n", + "2024-05-01 12:51:25,880 - SimulatorRunner - INFO - Deploy the Apps.\n", + "2024-05-01 12:51:25,883 - SimulatorRunner - INFO - Create the simulate clients.\n", + "2024-05-01 12:51:25,887 - ClientManager - INFO - Client: New client site-1@192.168.1.203 joined. Sent token: 38b14b95-7884-4787-ba31-428f4d38d1cf. Total clients: 1\n", + "2024-05-01 12:51:25,887 - FederatedClient - INFO - Successfully registered client:site-1 for project simulator_server. Token:38b14b95-7884-4787-ba31-428f4d38d1cf SSID:\n", + "2024-05-01 12:51:25,889 - ClientManager - INFO - Client: New client site-2@192.168.1.203 joined. Sent token: 22f09ba3-8d05-450c-be01-16e2f415a41a. Total clients: 2\n", + "2024-05-01 12:51:25,889 - FederatedClient - INFO - Successfully registered client:site-2 for project simulator_server. 
Token:22f09ba3-8d05-450c-be01-16e2f415a41a SSID:\n", + "2024-05-01 12:51:25,889 - SimulatorRunner - INFO - Set the client status ready.\n", + "2024-05-01 12:51:25,889 - SimulatorRunner - INFO - Deploy and start the Server App.\n", + "2024-05-01 12:51:25,891 - Cell - INFO - Register blob CB for channel='server_command', topic='*'\n", + "2024-05-01 12:51:25,891 - Cell - INFO - Register blob CB for channel='aux_communication', topic='*'\n", + "2024-05-01 12:51:25,891 - ServerCommandAgent - INFO - ServerCommandAgent cell register_request_cb: server.simulate_job\n", + "/home/hroth/.venv_monai/lib/python3.10/site-packages/nvflare/fuel/utils/class_utils.py:54: Warning: Use of experimental class ModelController ().\n", + " instance = c(**init_params)\n", + ":283: DeprecationWarning: the load_module() method is deprecated and slated for removal in Python 3.12; use exec_module() instead\n", + "/home/hroth/.venv_monai/lib/python3.10/site-packages/monai/utils/module.py:406: DeprecationWarning: Please import `shift` from the `scipy.ndimage` namespace; the `scipy.ndimage.interpolation` namespace is deprecated and will be removed in SciPy 2.0.0.\n", + " the_module = getattr(the_module, name)\n", + ":283: DeprecationWarning: the load_module() method is deprecated and slated for removal in Python 3.12; use exec_module() instead\n", + "/home/hroth/.venv_monai/lib/python3.10/site-packages/monai/utils/module.py:406: DeprecationWarning: Please import `binary_erosion` from the `scipy.ndimage` namespace; the `scipy.ndimage.morphology` namespace is deprecated and will be removed in SciPy 2.0.0.\n", + " the_module = getattr(the_module, name)\n", + "/home/hroth/.venv_monai/lib/python3.10/site-packages/monai/utils/module.py:406: DeprecationWarning: Please import `distance_transform_edt` from the `scipy.ndimage` namespace; the `scipy.ndimage.morphology` namespace is deprecated and will be removed in SciPy 2.0.0.\n", + " the_module = getattr(the_module, name)\n", + 
"/home/hroth/.venv_monai/lib/python3.10/site-packages/monai/utils/module.py:406: DeprecationWarning: Please import `distance_transform_cdt` from the `scipy.ndimage` namespace; the `scipy.ndimage.morphology` namespace is deprecated and will be removed in SciPy 2.0.0.\n", + " the_module = getattr(the_module, name)\n", + "/home/hroth/.venv_monai/lib/python3.10/site-packages/monai/utils/module.py:406: DeprecationWarning: Please import `label` from the `scipy.ndimage` namespace; the `scipy.ndimage.measurements` namespace is deprecated and will be removed in SciPy 2.0.0.\n", + " the_module = getattr(the_module, name)\n", + "2024-05-01 12:51:29,278 - IntimeModelSelector - INFO - model selection weights control: None\n", + "2024-05-01 12:51:29,280 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job]: Server runner starting ...\n", + "2024-05-01 12:51:29,293 - MLflowReceiver - INFO - Experiment=\n", + "2024-05-01 12:51:29,300 - MLflowReceiver - INFO - Experiment=\n", + "2024-05-01 12:51:29,304 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job]: starting workflow scatter_and_gather () ...\n", + "2024-05-01 12:51:29,304 - FedAvg - INFO - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather]: Initializing ModelController workflow.\n", + "2024-05-01 12:51:29,305 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather]: Workflow scatter_and_gather () started\n", + "2024-05-01 12:51:29,305 - FedAvg - INFO - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather]: Beginning model controller run.\n", + "2024-05-01 12:51:29,305 - FedAvg - INFO - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather]: Start FedAvg.\n", + "2024-05-01 12:51:29,305 - FedAvg - INFO - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather]: Round 0 started.\n", + "2024-05-01 12:51:29,306 - FedAvg - INFO - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather]: 
Sending task train to ['site-1', 'site-2']\n", + "2024-05-01 12:51:29,306 - FedAvg - INFO - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather]: scheduled task train\n", + "2024-05-01 12:51:30,129 - SimulatorClientRunner - INFO - Start the clients run simulation.\n", + "2024-05-01 12:51:31,132 - SimulatorClientRunner - INFO - Simulate Run client: site-1 on GPU group: None\n", + "2024-05-01 12:51:31,132 - SimulatorClientRunner - INFO - Simulate Run client: site-2 on GPU group: None\n", + "2024-05-01 12:51:32,176 - ClientTaskWorker - INFO - ClientTaskWorker started to run\n", + "2024-05-01 12:51:32,193 - ClientTaskWorker - INFO - ClientTaskWorker started to run\n", + "2024-05-01 12:51:32,237 - CoreCell - INFO - site-1.simulate_job: created backbone external connector to tcp://localhost:43515\n", + "2024-05-01 12:51:32,237 - nvflare.fuel.f3.sfm.conn_manager - INFO - Connector [CH00001 ACTIVE tcp://localhost:43515] is starting\n", + "2024-05-01 12:51:32,238 - nvflare.fuel.f3.sfm.conn_manager - INFO - Connection [CN00002 127.0.0.1:58490 => 127.0.0.1:43515] is created: PID: 2900386\n", + "2024-05-01 12:51:32,239 - nvflare.fuel.f3.sfm.conn_manager - INFO - Connection [CN00005 127.0.0.1:43515 <= 127.0.0.1:58490] is created: PID: 2900298\n", + "2024-05-01 12:51:32,257 - CoreCell - INFO - site-2.simulate_job: created backbone external connector to tcp://localhost:43515\n", + "2024-05-01 12:51:32,257 - nvflare.fuel.f3.sfm.conn_manager - INFO - Connector [CH00001 ACTIVE tcp://localhost:43515] is starting\n", + "2024-05-01 12:51:32,258 - nvflare.fuel.f3.sfm.conn_manager - INFO - Connection [CN00002 127.0.0.1:58502 => 127.0.0.1:43515] is created: PID: 2900387\n", + "2024-05-01 12:51:32,258 - nvflare.fuel.f3.sfm.conn_manager - INFO - Connection [CN00006 127.0.0.1:43515 <= 127.0.0.1:58502] is created: PID: 2900298\n", + "2024-05-01 12:51:34,230 - Cell - INFO - Register blob CB for channel='aux_communication', topic='*'\n", + "2024-05-01 12:51:34,261 - Cell - INFO 
- Register blob CB for channel='aux_communication', topic='*'\n", + "2024-05-01 12:51:34,736 - Cell - INFO - broadcast: channel='aux_communication', topic='__sync_runner__', targets=['server.simulate_job'], timeout=2.0\n", + "2024-05-01 12:51:34,747 - ClientRunner - INFO - [identity=site-2, run=simulate_job]: synced to Server Runner in 0.5117754936218262 seconds\n", + "2024-05-01 12:51:34,748 - CoreCell - INFO - site-2_simulate_job_passive: created backbone external connector to tcp://0:43515\n", + "2024-05-01 12:51:34,748 - nvflare.fuel.f3.sfm.conn_manager - INFO - Connector [CH00002 ACTIVE tcp://0:43515] is starting\n", + "2024-05-01 12:51:34,749 - nvflare.fuel.f3.sfm.conn_manager - INFO - Connection [CN00007 127.0.0.1:43515 <= 127.0.0.1:58508] is created: PID: 2900298\n", + "2024-05-01 12:51:34,750 - nvflare.fuel.f3.sfm.conn_manager - INFO - Connection [CN00004 127.0.0.1:58508 => 127.0.0.1:43515] is created: PID: 2900387\n", + "2024-05-01 12:51:34,751 - CellPipe - INFO - registered CellPipe request CB for cell_pipe.metric\n", + "2024-05-01 12:51:34,753 - CellPipe - INFO - registered CellPipe request CB for cell_pipe.task\n", + "2024-05-01 12:51:34,759 - ClientRunner - INFO - [identity=site-2, run=simulate_job]: client runner started\n", + "2024-05-01 12:51:34,759 - ClientTaskWorker - INFO - Initialize ClientRunner for client: site-2\n", + "2024-05-01 12:51:34,761 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather, peer=site-2, peer_run=simulate_job, task_name=train, task_id=2895f9f9-a9db-4c98-99d6-1547afabb45a]: assigned task to client site-2: name=train, id=2895f9f9-a9db-4c98-99d6-1547afabb45a\n", + "2024-05-01 12:51:34,761 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather, peer=site-2, peer_run=simulate_job, task_name=train, task_id=2895f9f9-a9db-4c98-99d6-1547afabb45a]: sent task assignment to client. 
client_name:site-2 task_id:2895f9f9-a9db-4c98-99d6-1547afabb45a\n", + "2024-05-01 12:51:34,762 - GetTaskCommand - INFO - return task to client. client_name: site-2 task_name: train task_id: 2895f9f9-a9db-4c98-99d6-1547afabb45a sharable_header_task_id: 2895f9f9-a9db-4c98-99d6-1547afabb45a\n", + "2024-05-01 12:51:34,764 - Communicator - INFO - Received from simulator_server server. getTask: train size: 622B (622 Bytes) time: 0.004549 seconds\n", + "2024-05-01 12:51:34,764 - FederatedClient - INFO - pull_task completed. Task name:train Status:True \n", + "2024-05-01 12:51:34,764 - ClientRunner - INFO - [identity=site-2, run=simulate_job, peer=simulator_server, peer_run=simulate_job]: got task assignment: name=train, id=2895f9f9-a9db-4c98-99d6-1547afabb45a\n", + "2024-05-01 12:51:34,764 - ClientRunner - INFO - [identity=site-2, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=train, task_id=2895f9f9-a9db-4c98-99d6-1547afabb45a]: invoking task executor PTClientAPILauncherExecutor\n", + "2024-05-01 12:51:34,764 - PTClientAPILauncherExecutor - INFO - [identity=site-2, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=train, task_id=2895f9f9-a9db-4c98-99d6-1547afabb45a]: execute for task (train)\n", + "2024-05-01 12:51:34,764 - PTClientAPILauncherExecutor - INFO - [identity=site-2, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=train, task_id=2895f9f9-a9db-4c98-99d6-1547afabb45a]: External execution for task (train) is launched.\n", + "2024-05-01 12:51:34,766 - Cell - INFO - broadcast: channel='aux_communication', topic='__sync_runner__', targets=['server.simulate_job'], timeout=2.0\n", + "2024-05-01 12:51:34,773 - ClientRunner - INFO - [identity=site-1, run=simulate_job]: synced to Server Runner in 0.5075929164886475 seconds\n", + "2024-05-01 12:51:34,775 - CoreCell - INFO - site-1_simulate_job_passive: created backbone external connector to tcp://0:43515\n", + "2024-05-01 12:51:34,775 - 
nvflare.fuel.f3.sfm.conn_manager - INFO - Connector [CH00002 ACTIVE tcp://0:43515] is starting\n", + "2024-05-01 12:51:34,777 - CellPipe - INFO - registered CellPipe request CB for cell_pipe.metric\n", + "2024-05-01 12:51:34,777 - nvflare.fuel.f3.sfm.conn_manager - INFO - Connection [CN00004 127.0.0.1:58514 => 127.0.0.1:43515] is created: PID: 2900386\n", + "2024-05-01 12:51:34,778 - nvflare.fuel.f3.sfm.conn_manager - INFO - Connection [CN00008 127.0.0.1:43515 <= 127.0.0.1:58514] is created: PID: 2900298\n", + "2024-05-01 12:51:34,779 - CellPipe - INFO - registered CellPipe request CB for cell_pipe.task\n", + "2024-05-01 12:51:34,782 - ClientRunner - INFO - [identity=site-1, run=simulate_job]: client runner started\n", + "2024-05-01 12:51:34,783 - ClientTaskWorker - INFO - Initialize ClientRunner for client: site-1\n", + "2024-05-01 12:51:34,784 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather, peer=site-1, peer_run=simulate_job, task_name=train, task_id=4c9a99f8-002b-44ac-8250-c8c928af1a7e]: assigned task to client site-1: name=train, id=4c9a99f8-002b-44ac-8250-c8c928af1a7e\n", + "2024-05-01 12:51:34,785 - ServerRunner - INFO - [identity=simulator_server, run=simulate_job, wf=scatter_and_gather, peer=site-1, peer_run=simulate_job, task_name=train, task_id=4c9a99f8-002b-44ac-8250-c8c928af1a7e]: sent task assignment to client. client_name:site-1 task_id:4c9a99f8-002b-44ac-8250-c8c928af1a7e\n", + "2024-05-01 12:51:34,785 - GetTaskCommand - INFO - return task to client. client_name: site-1 task_name: train task_id: 4c9a99f8-002b-44ac-8250-c8c928af1a7e sharable_header_task_id: 4c9a99f8-002b-44ac-8250-c8c928af1a7e\n", + "2024-05-01 12:51:34,786 - Communicator - INFO - Received from simulator_server server. getTask: train size: 622B (622 Bytes) time: 0.003079 seconds\n", + "2024-05-01 12:51:34,786 - FederatedClient - INFO - pull_task completed. 
Task name:train Status:True \n", + "2024-05-01 12:51:34,786 - ClientRunner - INFO - [identity=site-1, run=simulate_job, peer=simulator_server, peer_run=simulate_job]: got task assignment: name=train, id=4c9a99f8-002b-44ac-8250-c8c928af1a7e\n", + "2024-05-01 12:51:34,786 - ClientRunner - INFO - [identity=site-1, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=train, task_id=4c9a99f8-002b-44ac-8250-c8c928af1a7e]: invoking task executor PTClientAPILauncherExecutor\n", + "2024-05-01 12:51:34,787 - PTClientAPILauncherExecutor - INFO - [identity=site-1, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=train, task_id=4c9a99f8-002b-44ac-8250-c8c928af1a7e]: execute for task (train)\n", + "2024-05-01 12:51:34,787 - PTClientAPILauncherExecutor - INFO - [identity=site-1, run=simulate_job, peer=simulator_server, peer_run=simulate_job, task_name=train, task_id=4c9a99f8-002b-44ac-8250-c8c928af1a7e]: External execution for task (train) is launched.\n", + "2024-05-01 12:51:38,021 - SubprocessLauncher - INFO - MONAI version: 1.4.dev2416\n", + "2024-05-01 12:51:38,022 - SubprocessLauncher - INFO - Numpy version: 1.26.4\n", + "2024-05-01 12:51:38,024 - SubprocessLauncher - INFO - Pytorch version: 2.3.0+cu121\n", + "2024-05-01 12:51:38,024 - SubprocessLauncher - INFO - MONAI flags: HAS_EXT = False, USE_COMPILED = False, USE_META_DICT = False\n", + "2024-05-01 12:51:38,024 - SubprocessLauncher - INFO - MONAI rev id: d487995c8f685729ee5e1259f364034dab4811da\n", + "2024-05-01 12:51:38,024 - SubprocessLauncher - INFO - MONAI __file__: /home//.venv_monai/lib/python3.10/site-packages/monai/__init__.py\n", + "2024-05-01 12:51:38,024 - SubprocessLauncher - INFO - \n", + "2024-05-01 12:51:38,024 - SubprocessLauncher - INFO - Optional dependencies:\n", + "2024-05-01 12:51:38,024 - SubprocessLauncher - INFO - Pytorch Ignite version: 0.4.11\n", + "2024-05-01 12:51:38,024 - SubprocessLauncher - INFO - ITK version: NOT INSTALLED or UNKNOWN 
VERSION.\n", + "2024-05-01 12:51:38,024 - SubprocessLauncher - INFO - Nibabel version: NOT INSTALLED or UNKNOWN VERSION.\n", + "2024-05-01 12:51:38,024 - SubprocessLauncher - INFO - scikit-image version: NOT INSTALLED or UNKNOWN VERSION.\n", + "2024-05-01 12:51:38,025 - SubprocessLauncher - INFO - scipy version: 1.13.0\n", + "2024-05-01 12:51:38,025 - SubprocessLauncher - INFO - Pillow version: 10.3.0\n", + "2024-05-01 12:51:38,025 - SubprocessLauncher - INFO - Tensorboard version: 2.16.2\n", + "2024-05-01 12:51:38,025 - SubprocessLauncher - INFO - gdown version: NOT INSTALLED or UNKNOWN VERSION.\n", + "2024-05-01 12:51:38,025 - SubprocessLauncher - INFO - TorchVision version: NOT INSTALLED or UNKNOWN VERSION.\n", + "2024-05-01 12:51:38,025 - SubprocessLauncher - INFO - tqdm version: 4.66.2\n", + "2024-05-01 12:51:38,025 - SubprocessLauncher - INFO - lmdb version: NOT INSTALLED or UNKNOWN VERSION.\n", + "2024-05-01 12:51:38,025 - SubprocessLauncher - INFO - psutil version: 5.9.8\n", + "2024-05-01 12:51:38,025 - SubprocessLauncher - INFO - pandas version: 2.2.2\n", + "2024-05-01 12:51:38,025 - SubprocessLauncher - INFO - einops version: NOT INSTALLED or UNKNOWN VERSION.\n", + "2024-05-01 12:51:38,025 - SubprocessLauncher - INFO - transformers version: NOT INSTALLED or UNKNOWN VERSION.\n", + "2024-05-01 12:51:38,025 - SubprocessLauncher - INFO - mlflow version: 2.12.1\n", + "2024-05-01 12:51:38,025 - SubprocessLauncher - INFO - pynrrd version: NOT INSTALLED or UNKNOWN VERSION.\n", + "2024-05-01 12:51:38,026 - SubprocessLauncher - INFO - clearml version: NOT INSTALLED or UNKNOWN VERSION.\n", + "2024-05-01 12:51:38,026 - SubprocessLauncher - INFO - \n", + "2024-05-01 12:51:38,026 - SubprocessLauncher - INFO - For details about installing the optional dependencies, please visit:\n", + "2024-05-01 12:51:38,026 - SubprocessLauncher - INFO - https://docs.monai.io/en/latest/installation.html#installing-the-recommended-dependencies\n", + "2024-05-01 12:51:38,026 - 
SubprocessLauncher - INFO - \n", + "2024-05-01 12:51:38,030 - SubprocessLauncher - INFO - MONAI version: 1.4.dev2416\n", + "2024-05-01 12:51:38,030 - SubprocessLauncher - INFO - Numpy version: 1.26.4\n", + "2024-05-01 12:51:38,030 - SubprocessLauncher - INFO - Pytorch version: 2.3.0+cu121\n", + "2024-05-01 12:51:38,030 - SubprocessLauncher - INFO - MONAI flags: HAS_EXT = False, USE_COMPILED = False, USE_META_DICT = False\n", + "2024-05-01 12:51:38,030 - SubprocessLauncher - INFO - MONAI rev id: d487995c8f685729ee5e1259f364034dab4811da\n", + "2024-05-01 12:51:38,030 - SubprocessLauncher - INFO - MONAI __file__: /home//.venv_monai/lib/python3.10/site-packages/monai/__init__.py\n", + "2024-05-01 12:51:38,031 - SubprocessLauncher - INFO - \n", + "2024-05-01 12:51:38,031 - SubprocessLauncher - INFO - Optional dependencies:\n", + "2024-05-01 12:51:38,031 - SubprocessLauncher - INFO - Pytorch Ignite version: 0.4.11\n", + "2024-05-01 12:51:38,031 - SubprocessLauncher - INFO - ITK version: NOT INSTALLED or UNKNOWN VERSION.\n", + "2024-05-01 12:51:38,031 - SubprocessLauncher - INFO - Nibabel version: NOT INSTALLED or UNKNOWN VERSION.\n", + "2024-05-01 12:51:38,032 - SubprocessLauncher - INFO - scikit-image version: NOT INSTALLED or UNKNOWN VERSION.\n", + "2024-05-01 12:51:38,032 - SubprocessLauncher - INFO - scipy version: 1.13.0\n", + "2024-05-01 12:51:38,032 - SubprocessLauncher - INFO - Pillow version: 10.3.0\n", + "2024-05-01 12:51:38,032 - SubprocessLauncher - INFO - Tensorboard version: 2.16.2\n", + "2024-05-01 12:51:38,032 - SubprocessLauncher - INFO - gdown version: NOT INSTALLED or UNKNOWN VERSION.\n", + "2024-05-01 12:51:38,032 - SubprocessLauncher - INFO - TorchVision version: NOT INSTALLED or UNKNOWN VERSION.\n", + "2024-05-01 12:51:38,032 - SubprocessLauncher - INFO - tqdm version: 4.66.2\n", + "2024-05-01 12:51:38,032 - SubprocessLauncher - INFO - lmdb version: NOT INSTALLED or UNKNOWN VERSION.\n", + "2024-05-01 12:51:38,032 - SubprocessLauncher - INFO - psutil 
version: 5.9.8\n", + "2024-05-01 12:51:38,033 - SubprocessLauncher - INFO - pandas version: 2.2.2\n", + "2024-05-01 12:51:38,033 - SubprocessLauncher - INFO - einops version: NOT INSTALLED or UNKNOWN VERSION.\n", + "2024-05-01 12:51:38,033 - SubprocessLauncher - INFO - transformers version: NOT INSTALLED or UNKNOWN VERSION.\n", + "2024-05-01 12:51:38,033 - SubprocessLauncher - INFO - mlflow version: 2.12.1\n", + "2024-05-01 12:51:38,033 - SubprocessLauncher - INFO - pynrrd version: NOT INSTALLED or UNKNOWN VERSION.\n", + "2024-05-01 12:51:38,033 - SubprocessLauncher - INFO - clearml version: NOT INSTALLED or UNKNOWN VERSION.\n", + "2024-05-01 12:51:38,033 - SubprocessLauncher - INFO - \n", + "2024-05-01 12:51:38,033 - SubprocessLauncher - INFO - For details about installing the optional dependencies, please visit:\n", + "2024-05-01 12:51:38,033 - SubprocessLauncher - INFO - https://docs.monai.io/en/latest/installation.html#installing-the-recommended-dependencies\n", + "2024-05-01 12:51:38,034 - SubprocessLauncher - INFO - \n", + "2024-05-01 12:51:38,043 - nvflare.fuel.f3.sfm.conn_manager - INFO - Connection [CN00009 127.0.0.1:43515 <= 127.0.0.1:58520] is created: PID: 2900298\n", + "2024-05-01 12:51:38,051 - nvflare.fuel.f3.sfm.conn_manager - INFO - Connection [CN00010 127.0.0.1:43515 <= 127.0.0.1:58536] is created: PID: 2900298\n", + "2024-05-01 12:51:38,052 - SubprocessLauncher - INFO - /tmp/tmp3vxaowbg\n", + "2024-05-01 12:51:38,056 - SubprocessLauncher - INFO - /tmp/tmpk6z1ssep\n", + "MedNIST.tar.gz: 59.0MB [00:02, 25.2MB/s]9.0M [00:02<00:00, 28.4MB/s]\n", + "2024-05-01 12:51:40,509 - SubprocessLauncher - INFO - 2024-05-01 12:51:40,509 - INFO - Downloaded: /tmp/tmp3vxaowbg/MedNIST.tar.gz\n", + "2024-05-01 12:51:40,605 - SubprocessLauncher - INFO - 2024-05-01 12:51:40,605 - INFO - Verified 'MedNIST.tar.gz', md5: 0bc7306e7427e00ad1c5526a6677552d.\n", + "2024-05-01 12:51:40,606 - SubprocessLauncher - INFO - 2024-05-01 12:51:40,605 - INFO - Writing into directory: 
/tmp/tmp3vxaowbg.\n", + "MedNIST.tar.gz: 59.0MB [00:04, 13.1MB/s]9.0M [00:04<00:00, 21.9MB/s]\n", + "2024-05-01 12:51:42,772 - SubprocessLauncher - INFO - 2024-05-01 12:51:42,772 - INFO - Downloaded: /tmp/tmpk6z1ssep/MedNIST.tar.gz\n", + "2024-05-01 12:51:42,862 - SubprocessLauncher - INFO - 2024-05-01 12:51:42,862 - INFO - Verified 'MedNIST.tar.gz', md5: 0bc7306e7427e00ad1c5526a6677552d.\n", + "2024-05-01 12:51:42,863 - SubprocessLauncher - INFO - 2024-05-01 12:51:42,862 - INFO - Writing into directory: /tmp/tmpk6z1ssep.\n", + "Loading dataset: 100%|██████████| 47164/47164 [00:34<00:00, 1363.17it/s]\n", + "2024-05-01 12:52:20,541 - SubprocessLauncher - INFO - current_round=0\n", + "2024-05-01 12:52:20,541 - SubprocessLauncher - INFO - INFO:ignite.engine.engine.SupervisedTrainer:Engine run resuming from iteration 0, epoch 0 until 1 epochs\n", + "2024-05-01 12:52:21,668 - SubprocessLauncher - INFO - /home/hroth/.venv_monai/lib/python3.10/site-packages/monai/handlers/stats_handler.py:261: UserWarning: ignoring non-scalar output in StatsHandler, make sure `output_transform(engine.state.output)` returns a scalar or dictionary of key and scalar pairs to avoid this warning. image:\n", + "2024-05-01 12:52:21,668 - SubprocessLauncher - INFO - warnings.warn(\n", + "2024-05-01 12:52:21,669 - SubprocessLauncher - INFO - /home/hroth/.venv_monai/lib/python3.10/site-packages/monai/handlers/stats_handler.py:261: UserWarning: ignoring non-scalar output in StatsHandler, make sure `output_transform(engine.state.output)` returns a scalar or dictionary of key and scalar pairs to avoid this warning. 
pred:\n", + "2024-05-01 12:52:21,669 - SubprocessLauncher - INFO - warnings.warn(\n", + "2024-05-01 12:52:21,669 - SubprocessLauncher - INFO - 2024-05-01 12:52:21,668 - INFO - Epoch: 1/1, Iter: 1/93 -- label: 1.0000 loss: 1.7749\n", + "2024-05-01 12:52:21,795 - SubprocessLauncher - INFO - 2024-05-01 12:52:21,795 - INFO - Epoch: 1/1, Iter: 2/93 -- label: 1.0000 loss: 1.7655\n", + "2024-05-01 12:52:21,928 - SubprocessLauncher - INFO - 2024-05-01 12:52:21,927 - INFO - Epoch: 1/1, Iter: 3/93 -- label: 4.0000 loss: 1.7341\n", + "2024-05-01 12:52:22,050 - SubprocessLauncher - INFO - 2024-05-01 12:52:22,050 - INFO - Epoch: 1/1, Iter: 4/93 -- label: 1.0000 loss: 1.7060\n", + "2024-05-01 12:52:22,175 - SubprocessLauncher - INFO - 2024-05-01 12:52:22,175 - INFO - Epoch: 1/1, Iter: 5/93 -- label: 5.0000 loss: 1.6660\n", + "Loading dataset: 100%|██████████| 47164/47164 [00:34<00:00, 1350.85it/s]\n", + "2024-05-01 12:52:22,619 - SubprocessLauncher - INFO - 2024-05-01 12:52:22,618 - INFO - Epoch: 1/1, Iter: 6/93 -- label: 3.0000 loss: 1.6453\n", + "2024-05-01 12:52:22,751 - SubprocessLauncher - INFO - 2024-05-01 12:52:22,751 - INFO - Epoch: 1/1, Iter: 7/93 -- label: 0.0000 loss: 1.6157\n", + "2024-05-01 12:52:22,895 - SubprocessLauncher - INFO - 2024-05-01 12:52:22,895 - INFO - Epoch: 1/1, Iter: 8/93 -- label: 0.0000 loss: 1.6096\n", + "2024-05-01 12:52:22,941 - SubprocessLauncher - INFO - current_round=0\n", + "2024-05-01 12:52:22,941 - SubprocessLauncher - INFO - INFO:ignite.engine.engine.SupervisedTrainer:Engine run resuming from iteration 0, epoch 0 until 1 epochs\n", + "2024-05-01 12:52:23,024 - SubprocessLauncher - INFO - 2024-05-01 12:52:23,024 - INFO - Epoch: 1/1, Iter: 9/93 -- label: 1.0000 loss: 1.5664\n", + "2024-05-01 12:52:23,161 - SubprocessLauncher - INFO - 2024-05-01 12:52:23,161 - INFO - Epoch: 1/1, Iter: 10/93 -- label: 3.0000 loss: 1.5647\n", + "2024-05-01 12:52:23,296 - SubprocessLauncher - INFO - 2024-05-01 12:52:23,295 - INFO - Epoch: 1/1, Iter: 11/93 -- 
label: 5.0000 loss: 1.5287\n", + "2024-05-01 12:52:23,442 - SubprocessLauncher - INFO - 2024-05-01 12:52:23,441 - INFO - Epoch: 1/1, Iter: 12/93 -- label: 2.0000 loss: 1.5165\n", + "2024-05-01 12:52:23,589 - SubprocessLauncher - INFO - 2024-05-01 12:52:23,588 - INFO - Epoch: 1/1, Iter: 13/93 -- label: 3.0000 loss: 1.4941\n", + "2024-05-01 12:52:23,719 - SubprocessLauncher - INFO - 2024-05-01 12:52:23,718 - INFO - Epoch: 1/1, Iter: 14/93 -- label: 4.0000 loss: 1.4703\n", + "2024-05-01 12:52:23,870 - SubprocessLauncher - INFO - 2024-05-01 12:52:23,870 - INFO - Epoch: 1/1, Iter: 15/93 -- label: 3.0000 loss: 1.4267\n", + "2024-05-01 12:52:24,019 - SubprocessLauncher - INFO - 2024-05-01 12:52:24,019 - INFO - Epoch: 1/1, Iter: 16/93 -- label: 3.0000 loss: 1.4114\n", + "2024-05-01 12:52:24,213 - SubprocessLauncher - INFO - 2024-05-01 12:52:24,212 - INFO - Epoch: 1/1, Iter: 17/93 -- label: 0.0000 loss: 1.3923\n", + "2024-05-01 12:52:24,260 - SubprocessLauncher - INFO - /home/hroth/.venv_monai/lib/python3.10/site-packages/monai/handlers/stats_handler.py:261: UserWarning: ignoring non-scalar output in StatsHandler, make sure `output_transform(engine.state.output)` returns a scalar or dictionary of key and scalar pairs to avoid this warning. image:\n", + "2024-05-01 12:52:24,261 - SubprocessLauncher - INFO - warnings.warn(\n", + "2024-05-01 12:52:24,261 - SubprocessLauncher - INFO - /home/hroth/.venv_monai/lib/python3.10/site-packages/monai/handlers/stats_handler.py:261: UserWarning: ignoring non-scalar output in StatsHandler, make sure `output_transform(engine.state.output)` returns a scalar or dictionary of key and scalar pairs to avoid this warning. 
pred:\n", + "2024-05-01 12:52:24,261 - SubprocessLauncher - INFO - warnings.warn(\n", + "2024-05-01 12:52:24,261 - SubprocessLauncher - INFO - 2024-05-01 12:52:24,260 - INFO - Epoch: 1/1, Iter: 1/93 -- label: 1.0000 loss: 1.8233\n", + "2024-05-01 12:52:24,415 - SubprocessLauncher - INFO - 2024-05-01 12:52:24,414 - INFO - Epoch: 1/1, Iter: 18/93 -- label: 4.0000 loss: 1.3866\n", + "2024-05-01 12:52:24,448 - SubprocessLauncher - INFO - 2024-05-01 12:52:24,447 - INFO - Epoch: 1/1, Iter: 2/93 -- label: 0.0000 loss: 1.7886\n", + "2024-05-01 12:52:24,642 - SubprocessLauncher - INFO - 2024-05-01 12:52:24,641 - INFO - Epoch: 1/1, Iter: 19/93 -- label: 4.0000 loss: 1.3683\n", + "2024-05-01 12:52:24,649 - SubprocessLauncher - INFO - 2024-05-01 12:52:24,649 - INFO - Epoch: 1/1, Iter: 3/93 -- label: 1.0000 loss: 1.7609\n", + "2024-05-01 12:52:24,860 - SubprocessLauncher - INFO - 2024-05-01 12:52:24,860 - INFO - Epoch: 1/1, Iter: 20/93 -- label: 2.0000 loss: 1.3307\n", + "2024-05-01 12:52:24,866 - SubprocessLauncher - INFO - 2024-05-01 12:52:24,866 - INFO - Epoch: 1/1, Iter: 4/93 -- label: 3.0000 loss: 1.7345\n", + "2024-05-01 12:52:25,080 - SubprocessLauncher - INFO - 2024-05-01 12:52:25,079 - INFO - Epoch: 1/1, Iter: 21/93 -- label: 3.0000 loss: 1.3220\n", + "2024-05-01 12:52:25,083 - SubprocessLauncher - INFO - 2024-05-01 12:52:25,083 - INFO - Epoch: 1/1, Iter: 5/93 -- label: 4.0000 loss: 1.6863\n", + "2024-05-01 12:52:25,294 - SubprocessLauncher - INFO - 2024-05-01 12:52:25,293 - INFO - Epoch: 1/1, Iter: 22/93 -- label: 2.0000 loss: 1.3027\n", + "2024-05-01 12:52:25,299 - SubprocessLauncher - INFO - 2024-05-01 12:52:25,299 - INFO - Epoch: 1/1, Iter: 6/93 -- label: 3.0000 loss: 1.6616\n", + "2024-05-01 12:52:25,526 - SubprocessLauncher - INFO - 2024-05-01 12:52:25,526 - INFO - Epoch: 1/1, Iter: 7/93 -- label: 4.0000 loss: 1.6457\n", + "2024-05-01 12:52:25,527 - SubprocessLauncher - INFO - 2024-05-01 12:52:25,527 - INFO - Epoch: 1/1, Iter: 23/93 -- label: 2.0000 loss: 
1.2692\n", + "2024-05-01 12:52:25,748 - SubprocessLauncher - INFO - 2024-05-01 12:52:25,747 - INFO - Epoch: 1/1, Iter: 24/93 -- label: 5.0000 loss: 1.2578\n", + "2024-05-01 12:52:25,880 - SubprocessLauncher - INFO - 2024-05-01 12:52:25,879 - INFO - Epoch: 1/1, Iter: 25/93 -- label: 2.0000 loss: 1.2373\n", + "2024-05-01 12:52:26,007 - SubprocessLauncher - INFO - 2024-05-01 12:52:26,006 - INFO - Epoch: 1/1, Iter: 26/93 -- label: 3.0000 loss: 1.1903\n", + "2024-05-01 12:52:26,080 - SubprocessLauncher - INFO - 2024-05-01 12:52:26,079 - INFO - Epoch: 1/1, Iter: 8/93 -- label: 0.0000 loss: 1.6397\n", + "2024-05-01 12:52:26,194 - SubprocessLauncher - INFO - 2024-05-01 12:52:26,193 - INFO - Epoch: 1/1, Iter: 27/93 -- label: 3.0000 loss: 1.1886\n", + "2024-05-01 12:52:26,275 - SubprocessLauncher - INFO - 2024-05-01 12:52:26,274 - INFO - Epoch: 1/1, Iter: 9/93 -- label: 4.0000 loss: 1.5913\n", + "2024-05-01 12:52:26,384 - SubprocessLauncher - INFO - 2024-05-01 12:52:26,383 - INFO - Epoch: 1/1, Iter: 28/93 -- label: 5.0000 loss: 1.1557\n", + "2024-05-01 12:52:26,460 - SubprocessLauncher - INFO - 2024-05-01 12:52:26,460 - INFO - Epoch: 1/1, Iter: 10/93 -- label: 4.0000 loss: 1.5650\n", + "2024-05-01 12:52:26,586 - SubprocessLauncher - INFO - 2024-05-01 12:52:26,585 - INFO - Epoch: 1/1, Iter: 29/93 -- label: 3.0000 loss: 1.1665\n", + "2024-05-01 12:52:26,652 - SubprocessLauncher - INFO - 2024-05-01 12:52:26,651 - INFO - Epoch: 1/1, Iter: 11/93 -- label: 5.0000 loss: 1.5365\n", + "2024-05-01 12:52:26,787 - SubprocessLauncher - INFO - 2024-05-01 12:52:26,786 - INFO - Epoch: 1/1, Iter: 30/93 -- label: 3.0000 loss: 1.1500\n", + "2024-05-01 12:52:26,836 - SubprocessLauncher - INFO - 2024-05-01 12:52:26,835 - INFO - Epoch: 1/1, Iter: 12/93 -- label: 1.0000 loss: 1.5204\n", + "2024-05-01 12:52:26,996 - SubprocessLauncher - INFO - 2024-05-01 12:52:26,996 - INFO - Epoch: 1/1, Iter: 31/93 -- label: 5.0000 loss: 1.1323\n", + "2024-05-01 12:52:27,037 - SubprocessLauncher - INFO - 
2024-05-01 12:52:27,037 - INFO - Epoch: 1/1, Iter: 13/93 -- label: 2.0000 loss: 1.4807\n", + "2024-05-01 12:52:27,205 - SubprocessLauncher - INFO - 2024-05-01 12:52:27,204 - INFO - Epoch: 1/1, Iter: 32/93 -- label: 4.0000 loss: 1.1058\n", + "2024-05-01 12:52:27,228 - SubprocessLauncher - INFO - 2024-05-01 12:52:27,227 - INFO - Epoch: 1/1, Iter: 14/93 -- label: 0.0000 loss: 1.4553\n", + "2024-05-01 12:52:27,425 - SubprocessLauncher - INFO - 2024-05-01 12:52:27,425 - INFO - Epoch: 1/1, Iter: 33/93 -- label: 3.0000 loss: 1.0866\n", + "2024-05-01 12:52:27,435 - SubprocessLauncher - INFO - 2024-05-01 12:52:27,434 - INFO - Epoch: 1/1, Iter: 15/93 -- label: 5.0000 loss: 1.4429\n", + "2024-05-01 12:52:27,645 - SubprocessLauncher - INFO - 2024-05-01 12:52:27,644 - INFO - Epoch: 1/1, Iter: 34/93 -- label: 4.0000 loss: 1.0537\n", + "2024-05-01 12:52:27,648 - SubprocessLauncher - INFO - 2024-05-01 12:52:27,647 - INFO - Epoch: 1/1, Iter: 16/93 -- label: 1.0000 loss: 1.3957\n", + "2024-05-01 12:52:27,871 - SubprocessLauncher - INFO - 2024-05-01 12:52:27,871 - INFO - Epoch: 1/1, Iter: 17/93 -- label: 5.0000 loss: 1.3750\n", + "2024-05-01 12:52:28,001 - SubprocessLauncher - INFO - 2024-05-01 12:52:28,000 - INFO - Epoch: 1/1, Iter: 18/93 -- label: 2.0000 loss: 1.3368\n", + "2024-05-01 12:52:28,124 - SubprocessLauncher - INFO - 2024-05-01 12:52:28,123 - INFO - Epoch: 1/1, Iter: 19/93 -- label: 0.0000 loss: 1.3243\n", + "2024-05-01 12:52:28,167 - SubprocessLauncher - INFO - 2024-05-01 12:52:28,167 - INFO - Epoch: 1/1, Iter: 35/93 -- label: 1.0000 loss: 1.0751\n", + "2024-05-01 12:52:28,339 - SubprocessLauncher - INFO - 2024-05-01 12:52:28,338 - INFO - Epoch: 1/1, Iter: 20/93 -- label: 2.0000 loss: 1.3229\n", + "2024-05-01 12:52:28,361 - SubprocessLauncher - INFO - 2024-05-01 12:52:28,361 - INFO - Epoch: 1/1, Iter: 36/93 -- label: 0.0000 loss: 1.0380\n", + "2024-05-01 12:52:28,554 - SubprocessLauncher - INFO - 2024-05-01 12:52:28,554 - INFO - Epoch: 1/1, Iter: 21/93 -- label: 3.0000 
loss: 1.3307\n", + "2024-05-01 12:52:28,569 - SubprocessLauncher - INFO - 2024-05-01 12:52:28,569 - INFO - Epoch: 1/1, Iter: 37/93 -- label: 2.0000 loss: 1.0191\n", + "2024-05-01 12:52:28,772 - SubprocessLauncher - INFO - 2024-05-01 12:52:28,771 - INFO - Epoch: 1/1, Iter: 22/93 -- label: 2.0000 loss: 1.2583\n", + "2024-05-01 12:52:28,776 - SubprocessLauncher - INFO - 2024-05-01 12:52:28,776 - INFO - Epoch: 1/1, Iter: 38/93 -- label: 2.0000 loss: 0.9946\n", + "2024-05-01 12:52:28,980 - SubprocessLauncher - INFO - 2024-05-01 12:52:28,980 - INFO - Epoch: 1/1, Iter: 23/93 -- label: 1.0000 loss: 1.2429\n", + "2024-05-01 12:52:28,987 - SubprocessLauncher - INFO - 2024-05-01 12:52:28,986 - INFO - Epoch: 1/1, Iter: 39/93 -- label: 1.0000 loss: 0.9884\n", + "2024-05-01 12:52:29,194 - SubprocessLauncher - INFO - 2024-05-01 12:52:29,193 - INFO - Epoch: 1/1, Iter: 24/93 -- label: 0.0000 loss: 1.2285\n", + "2024-05-01 12:52:29,198 - SubprocessLauncher - INFO - 2024-05-01 12:52:29,198 - INFO - Epoch: 1/1, Iter: 40/93 -- label: 5.0000 loss: 0.9694\n", + "2024-05-01 12:52:29,419 - SubprocessLauncher - INFO - 2024-05-01 12:52:29,419 - INFO - Epoch: 1/1, Iter: 25/93 -- label: 1.0000 loss: 1.2235\n", + "2024-05-01 12:52:29,426 - SubprocessLauncher - INFO - 2024-05-01 12:52:29,426 - INFO - Epoch: 1/1, Iter: 41/93 -- label: 4.0000 loss: 0.9658\n", + "2024-05-01 12:52:29,657 - SubprocessLauncher - INFO - 2024-05-01 12:52:29,657 - INFO - Epoch: 1/1, Iter: 42/93 -- label: 5.0000 loss: 0.9119\n", + "2024-05-01 12:52:29,657 - SubprocessLauncher - INFO - 2024-05-01 12:52:29,657 - INFO - Epoch: 1/1, Iter: 26/93 -- label: 2.0000 loss: 1.1898\n", + "2024-05-01 12:52:29,888 - SubprocessLauncher - INFO - 2024-05-01 12:52:29,888 - INFO - Epoch: 1/1, Iter: 27/93 -- label: 0.0000 loss: 1.1464\n", + "2024-05-01 12:52:29,890 - SubprocessLauncher - INFO - 2024-05-01 12:52:29,889 - INFO - Epoch: 1/1, Iter: 43/93 -- label: 4.0000 loss: 0.9424\n", + "2024-05-01 12:52:30,110 - SubprocessLauncher - INFO - 
2024-05-01 12:52:30,110 - INFO - Epoch: 1/1, Iter: 28/93 -- label: 0.0000 loss: 1.1403\n", + "2024-05-01 12:52:30,115 - SubprocessLauncher - INFO - 2024-05-01 12:52:30,114 - INFO - Epoch: 1/1, Iter: 44/93 -- label: 1.0000 loss: 0.9186\n", + "2024-05-01 12:52:30,322 - SubprocessLauncher - INFO - 2024-05-01 12:52:30,322 - INFO - Epoch: 1/1, Iter: 29/93 -- label: 5.0000 loss: 1.1341\n", + "2024-05-01 12:52:30,327 - SubprocessLauncher - INFO - 2024-05-01 12:52:30,327 - INFO - Epoch: 1/1, Iter: 45/93 -- label: 0.0000 loss: 0.9051\n", + "2024-05-01 12:52:30,551 - SubprocessLauncher - INFO - 2024-05-01 12:52:30,551 - INFO - Epoch: 1/1, Iter: 46/93 -- label: 3.0000 loss: 0.8955\n", + "2024-05-01 12:52:30,553 - SubprocessLauncher - INFO - 2024-05-01 12:52:30,552 - INFO - Epoch: 1/1, Iter: 30/93 -- label: 1.0000 loss: 1.1021\n", + "2024-05-01 12:52:30,768 - SubprocessLauncher - INFO - 2024-05-01 12:52:30,767 - INFO - Epoch: 1/1, Iter: 31/93 -- label: 0.0000 loss: 1.0807\n", + "2024-05-01 12:52:30,775 - SubprocessLauncher - INFO - 2024-05-01 12:52:30,775 - INFO - Epoch: 1/1, Iter: 47/93 -- label: 2.0000 loss: 0.8520\n", + "2024-05-01 12:52:30,985 - SubprocessLauncher - INFO - 2024-05-01 12:52:30,985 - INFO - Epoch: 1/1, Iter: 32/93 -- label: 0.0000 loss: 1.0938\n", + "2024-05-01 12:52:30,990 - SubprocessLauncher - INFO - 2024-05-01 12:52:30,989 - INFO - Epoch: 1/1, Iter: 48/93 -- label: 5.0000 loss: 0.8551\n", + "2024-05-01 12:52:31,200 - SubprocessLauncher - INFO - 2024-05-01 12:52:31,200 - INFO - Epoch: 1/1, Iter: 33/93 -- label: 1.0000 loss: 1.0545\n", + "2024-05-01 12:52:31,210 - SubprocessLauncher - INFO - 2024-05-01 12:52:31,209 - INFO - Epoch: 1/1, Iter: 49/93 -- label: 3.0000 loss: 0.8366\n", + "2024-05-01 12:52:31,433 - SubprocessLauncher - INFO - 2024-05-01 12:52:31,433 - INFO - Epoch: 1/1, Iter: 34/93 -- label: 2.0000 loss: 1.0471\n", + "2024-05-01 12:52:31,433 - SubprocessLauncher - INFO - 2024-05-01 12:52:31,433 - INFO - Epoch: 1/1, Iter: 50/93 -- label: 3.0000 
loss: 0.8463\n", + "2024-05-01 12:52:31,643 - SubprocessLauncher - INFO - 2024-05-01 12:52:31,643 - INFO - Epoch: 1/1, Iter: 35/93 -- label: 3.0000 loss: 1.0585\n", + "2024-05-01 12:52:31,653 - SubprocessLauncher - INFO - 2024-05-01 12:52:31,652 - INFO - Epoch: 1/1, Iter: 51/93 -- label: 1.0000 loss: 0.7918\n", + "2024-05-01 12:52:31,852 - SubprocessLauncher - INFO - 2024-05-01 12:52:31,851 - INFO - Epoch: 1/1, Iter: 36/93 -- label: 1.0000 loss: 1.0007\n", + "2024-05-01 12:52:31,860 - SubprocessLauncher - INFO - 2024-05-01 12:52:31,859 - INFO - Epoch: 1/1, Iter: 52/93 -- label: 2.0000 loss: 0.7812\n", + "2024-05-01 12:52:32,067 - SubprocessLauncher - INFO - 2024-05-01 12:52:32,067 - INFO - Epoch: 1/1, Iter: 37/93 -- label: 5.0000 loss: 0.9865\n", + "2024-05-01 12:52:32,072 - SubprocessLauncher - INFO - 2024-05-01 12:52:32,072 - INFO - Epoch: 1/1, Iter: 53/93 -- label: 0.0000 loss: 0.7900\n", + "2024-05-01 12:52:32,293 - SubprocessLauncher - INFO - 2024-05-01 12:52:32,293 - INFO - Epoch: 1/1, Iter: 54/93 -- label: 3.0000 loss: 0.7572\n", + "2024-05-01 12:52:32,295 - SubprocessLauncher - INFO - 2024-05-01 12:52:32,295 - INFO - Epoch: 1/1, Iter: 38/93 -- label: 3.0000 loss: 0.9568\n", + "2024-05-01 12:52:32,519 - SubprocessLauncher - INFO - 2024-05-01 12:52:32,519 - INFO - Epoch: 1/1, Iter: 55/93 -- label: 5.0000 loss: 0.7697\n", + "2024-05-01 12:52:32,652 - SubprocessLauncher - INFO - 2024-05-01 12:52:32,652 - INFO - Epoch: 1/1, Iter: 56/93 -- label: 0.0000 loss: 0.7550\n", + "2024-05-01 12:52:32,780 - SubprocessLauncher - INFO - 2024-05-01 12:52:32,779 - INFO - Epoch: 1/1, Iter: 57/93 -- label: 4.0000 loss: 0.7242\n", + "2024-05-01 12:52:32,816 - SubprocessLauncher - INFO - 2024-05-01 12:52:32,815 - INFO - Epoch: 1/1, Iter: 39/93 -- label: 0.0000 loss: 0.9195\n", + "2024-05-01 12:52:32,993 - SubprocessLauncher - INFO - 2024-05-01 12:52:32,992 - INFO - Epoch: 1/1, Iter: 58/93 -- label: 3.0000 loss: 0.7224\n", + "2024-05-01 12:52:33,011 - SubprocessLauncher - INFO - 
2024-05-01 12:52:33,010 - INFO - Epoch: 1/1, Iter: 40/93 -- label: 4.0000 loss: 0.9114\n", + "2024-05-01 12:52:33,222 - SubprocessLauncher - INFO - 2024-05-01 12:52:33,222 - INFO - Epoch: 1/1, Iter: 59/93 -- label: 5.0000 loss: 0.7172\n", + "2024-05-01 12:52:33,224 - SubprocessLauncher - INFO - 2024-05-01 12:52:33,223 - INFO - Epoch: 1/1, Iter: 41/93 -- label: 5.0000 loss: 0.9557\n", + "2024-05-01 12:52:33,438 - SubprocessLauncher - INFO - 2024-05-01 12:52:33,437 - INFO - Epoch: 1/1, Iter: 60/93 -- label: 1.0000 loss: 0.6860\n", + "2024-05-01 12:52:33,441 - SubprocessLauncher - INFO - 2024-05-01 12:52:33,441 - INFO - Epoch: 1/1, Iter: 42/93 -- label: 3.0000 loss: 0.8983\n", + "2024-05-01 12:52:33,661 - SubprocessLauncher - INFO - 2024-05-01 12:52:33,661 - INFO - Epoch: 1/1, Iter: 43/93 -- label: 4.0000 loss: 0.8953\n", + "2024-05-01 12:52:33,665 - SubprocessLauncher - INFO - 2024-05-01 12:52:33,664 - INFO - Epoch: 1/1, Iter: 61/93 -- label: 5.0000 loss: 0.6980\n", + "2024-05-01 12:52:33,886 - SubprocessLauncher - INFO - 2024-05-01 12:52:33,885 - INFO - Epoch: 1/1, Iter: 44/93 -- label: 3.0000 loss: 0.8632\n", + "2024-05-01 12:52:33,887 - SubprocessLauncher - INFO - 2024-05-01 12:52:33,886 - INFO - Epoch: 1/1, Iter: 62/93 -- label: 2.0000 loss: 0.6809\n", + "2024-05-01 12:52:34,097 - SubprocessLauncher - INFO - 2024-05-01 12:52:34,097 - INFO - Epoch: 1/1, Iter: 45/93 -- label: 1.0000 loss: 0.8802\n", + "2024-05-01 12:52:34,104 - SubprocessLauncher - INFO - 2024-05-01 12:52:34,104 - INFO - Epoch: 1/1, Iter: 63/93 -- label: 3.0000 loss: 0.6548\n", + "2024-05-01 12:52:34,328 - SubprocessLauncher - INFO - 2024-05-01 12:52:34,327 - INFO - Epoch: 1/1, Iter: 46/93 -- label: 4.0000 loss: 0.8634\n", + "2024-05-01 12:52:34,460 - SubprocessLauncher - INFO - 2024-05-01 12:52:34,460 - INFO - Epoch: 1/1, Iter: 47/93 -- label: 4.0000 loss: 0.8405\n", + "2024-05-01 12:52:34,589 - SubprocessLauncher - INFO - 2024-05-01 12:52:34,588 - INFO - Epoch: 1/1, Iter: 48/93 -- label: 4.0000 
loss: 0.8317\n", + "2024-05-01 12:52:34,620 - SubprocessLauncher - INFO - 2024-05-01 12:52:34,619 - INFO - Epoch: 1/1, Iter: 64/93 -- label: 4.0000 loss: 0.6385\n", + "2024-05-01 12:52:34,808 - SubprocessLauncher - INFO - 2024-05-01 12:52:34,807 - INFO - Epoch: 1/1, Iter: 49/93 -- label: 4.0000 loss: 0.7892\n", + "2024-05-01 12:52:34,820 - SubprocessLauncher - INFO - 2024-05-01 12:52:34,819 - INFO - Epoch: 1/1, Iter: 65/93 -- label: 3.0000 loss: 0.6293\n", + "2024-05-01 12:52:35,036 - SubprocessLauncher - INFO - 2024-05-01 12:52:35,036 - INFO - Epoch: 1/1, Iter: 66/93 -- label: 0.0000 loss: 0.6353\n", + "2024-05-01 12:52:35,038 - SubprocessLauncher - INFO - 2024-05-01 12:52:35,038 - INFO - Epoch: 1/1, Iter: 50/93 -- label: 4.0000 loss: 0.8029\n", + "2024-05-01 12:52:35,262 - SubprocessLauncher - INFO - 2024-05-01 12:52:35,262 - INFO - Epoch: 1/1, Iter: 67/93 -- label: 0.0000 loss: 0.6465\n", + "2024-05-01 12:52:35,265 - SubprocessLauncher - INFO - 2024-05-01 12:52:35,264 - INFO - Epoch: 1/1, Iter: 51/93 -- label: 2.0000 loss: 0.7653\n", + "2024-05-01 12:52:35,480 - SubprocessLauncher - INFO - 2024-05-01 12:52:35,479 - INFO - Epoch: 1/1, Iter: 68/93 -- label: 0.0000 loss: 0.6575\n", + "2024-05-01 12:52:35,485 - SubprocessLauncher - INFO - 2024-05-01 12:52:35,484 - INFO - Epoch: 1/1, Iter: 52/93 -- label: 2.0000 loss: 0.7849\n", + "2024-05-01 12:52:35,706 - SubprocessLauncher - INFO - 2024-05-01 12:52:35,706 - INFO - Epoch: 1/1, Iter: 69/93 -- label: 4.0000 loss: 0.6169\n", + "2024-05-01 12:52:35,708 - SubprocessLauncher - INFO - 2024-05-01 12:52:35,707 - INFO - Epoch: 1/1, Iter: 53/93 -- label: 0.0000 loss: 0.7577\n", + "2024-05-01 12:52:35,916 - SubprocessLauncher - INFO - 2024-05-01 12:52:35,916 - INFO - Epoch: 1/1, Iter: 70/93 -- label: 0.0000 loss: 0.5976\n", + "2024-05-01 12:52:35,926 - SubprocessLauncher - INFO - 2024-05-01 12:52:35,925 - INFO - Epoch: 1/1, Iter: 54/93 -- label: 0.0000 loss: 0.7375\n", + "2024-05-01 12:52:36,141 - SubprocessLauncher - INFO - 
2024-05-01 12:52:36,141 - INFO - Epoch: 1/1, Iter: 71/93 -- label: 5.0000 loss: 0.5941\n", + "2024-05-01 12:52:36,144 - SubprocessLauncher - INFO - 2024-05-01 12:52:36,144 - INFO - Epoch: 1/1, Iter: 55/93 -- label: 4.0000 loss: 0.7330\n", + "2024-05-01 12:52:36,349 - SubprocessLauncher - INFO - 2024-05-01 12:52:36,349 - INFO - Epoch: 1/1, Iter: 72/93 -- label: 1.0000 loss: 0.5656\n", + "2024-05-01 12:52:36,357 - SubprocessLauncher - INFO - 2024-05-01 12:52:36,357 - INFO - Epoch: 1/1, Iter: 56/93 -- label: 2.0000 loss: 0.7278\n", + "2024-05-01 12:52:36,568 - SubprocessLauncher - INFO - 2024-05-01 12:52:36,568 - INFO - Epoch: 1/1, Iter: 73/93 -- label: 2.0000 loss: 0.5668\n", + "2024-05-01 12:52:36,571 - SubprocessLauncher - INFO - 2024-05-01 12:52:36,571 - INFO - Epoch: 1/1, Iter: 57/93 -- label: 0.0000 loss: 0.7026\n", + "2024-05-01 12:52:36,781 - SubprocessLauncher - INFO - 2024-05-01 12:52:36,780 - INFO - Epoch: 1/1, Iter: 74/93 -- label: 3.0000 loss: 0.5502\n", + "2024-05-01 12:52:36,788 - SubprocessLauncher - INFO - 2024-05-01 12:52:36,788 - INFO - Epoch: 1/1, Iter: 58/93 -- label: 5.0000 loss: 0.7233\n", + "2024-05-01 12:52:37,010 - SubprocessLauncher - INFO - 2024-05-01 12:52:37,009 - INFO - Epoch: 1/1, Iter: 75/93 -- label: 4.0000 loss: 0.5322\n", + "2024-05-01 12:52:37,011 - SubprocessLauncher - INFO - 2024-05-01 12:52:37,011 - INFO - Epoch: 1/1, Iter: 59/93 -- label: 0.0000 loss: 0.6777\n", + "2024-05-01 12:52:37,232 - SubprocessLauncher - INFO - 2024-05-01 12:52:37,232 - INFO - Epoch: 1/1, Iter: 76/93 -- label: 2.0000 loss: 0.5378\n", + "2024-05-01 12:52:37,236 - SubprocessLauncher - INFO - 2024-05-01 12:52:37,235 - INFO - Epoch: 1/1, Iter: 60/93 -- label: 4.0000 loss: 0.6620\n", + "2024-05-01 12:52:37,467 - SubprocessLauncher - INFO - 2024-05-01 12:52:37,467 - INFO - Epoch: 1/1, Iter: 61/93 -- label: 5.0000 loss: 0.6752\n", + "2024-05-01 12:52:37,468 - SubprocessLauncher - INFO - 2024-05-01 12:52:37,467 - INFO - Epoch: 1/1, Iter: 77/93 -- label: 0.0000 
loss: 0.5442\n", + "2024-05-01 12:52:37,696 - SubprocessLauncher - INFO - 2024-05-01 12:52:37,695 - INFO - Epoch: 1/1, Iter: 78/93 -- label: 0.0000 loss: 0.5225\n", + "2024-05-01 12:52:37,701 - SubprocessLauncher - INFO - 2024-05-01 12:52:37,700 - INFO - Epoch: 1/1, Iter: 62/93 -- label: 4.0000 loss: 0.6481\n", + "2024-05-01 12:52:37,924 - SubprocessLauncher - INFO - 2024-05-01 12:52:37,923 - INFO - Epoch: 1/1, Iter: 79/93 -- label: 1.0000 loss: 0.5166\n", + "2024-05-01 12:52:37,926 - SubprocessLauncher - INFO - 2024-05-01 12:52:37,926 - INFO - Epoch: 1/1, Iter: 63/93 -- label: 5.0000 loss: 0.6547\n", + "2024-05-01 12:52:38,142 - SubprocessLauncher - INFO - 2024-05-01 12:52:38,141 - INFO - Epoch: 1/1, Iter: 80/93 -- label: 1.0000 loss: 0.4993\n", + "2024-05-01 12:52:38,145 - SubprocessLauncher - INFO - 2024-05-01 12:52:38,145 - INFO - Epoch: 1/1, Iter: 64/93 -- label: 4.0000 loss: 0.6376\n", + "2024-05-01 12:52:38,383 - SubprocessLauncher - INFO - 2024-05-01 12:52:38,382 - INFO - Epoch: 1/1, Iter: 81/93 -- label: 0.0000 loss: 0.4842\n", + "2024-05-01 12:52:38,384 - SubprocessLauncher - INFO - 2024-05-01 12:52:38,383 - INFO - Epoch: 1/1, Iter: 65/93 -- label: 1.0000 loss: 0.6238\n", + "2024-05-01 12:52:38,610 - SubprocessLauncher - INFO - 2024-05-01 12:52:38,610 - INFO - Epoch: 1/1, Iter: 82/93 -- label: 5.0000 loss: 0.5015\n", + "2024-05-01 12:52:38,613 - SubprocessLauncher - INFO - 2024-05-01 12:52:38,612 - INFO - Epoch: 1/1, Iter: 66/93 -- label: 1.0000 loss: 0.6091\n", + "2024-05-01 12:52:38,848 - SubprocessLauncher - INFO - 2024-05-01 12:52:38,847 - INFO - Epoch: 1/1, Iter: 67/93 -- label: 5.0000 loss: 0.5667\n", + "2024-05-01 12:52:38,851 - SubprocessLauncher - INFO - 2024-05-01 12:52:38,851 - INFO - Epoch: 1/1, Iter: 83/93 -- label: 4.0000 loss: 0.4827\n", + "2024-05-01 12:52:39,086 - SubprocessLauncher - INFO - 2024-05-01 12:52:39,086 - INFO - Epoch: 1/1, Iter: 84/93 -- label: 3.0000 loss: 0.4672\n", + "2024-05-01 12:52:39,088 - SubprocessLauncher - INFO - 
2024-05-01 12:52:39,088 - INFO - Epoch: 1/1, Iter: 68/93 -- label: 0.0000 loss: 0.6109\n", + "2024-05-01 12:52:39,318 - SubprocessLauncher - INFO - 2024-05-01 12:52:39,318 - INFO - Epoch: 1/1, Iter: 85/93 -- label: 0.0000 loss: 0.4613\n", + "2024-05-01 12:52:39,455 - SubprocessLauncher - INFO - 2024-05-01 12:52:39,455 - INFO - Epoch: 1/1, Iter: 86/93 -- label: 0.0000 loss: 0.4599\n", + "2024-05-01 12:52:39,514 - SubprocessLauncher - INFO - 2024-05-01 12:52:39,514 - INFO - Epoch: 1/1, Iter: 69/93 -- label: 4.0000 loss: 0.5876\n", + "2024-05-01 12:52:39,663 - SubprocessLauncher - INFO - 2024-05-01 12:52:39,663 - INFO - Epoch: 1/1, Iter: 87/93 -- label: 0.0000 loss: 0.4616\n", + "2024-05-01 12:52:39,718 - SubprocessLauncher - INFO - 2024-05-01 12:52:39,718 - INFO - Epoch: 1/1, Iter: 70/93 -- label: 0.0000 loss: 0.5523\n", + "2024-05-01 12:52:39,877 - SubprocessLauncher - INFO - 2024-05-01 12:52:39,877 - INFO - Epoch: 1/1, Iter: 88/93 -- label: 0.0000 loss: 0.4505\n", + "2024-05-01 12:52:39,921 - SubprocessLauncher - INFO - 2024-05-01 12:52:39,920 - INFO - Epoch: 1/1, Iter: 71/93 -- label: 5.0000 loss: 0.6095\n", + "2024-05-01 12:52:40,107 - SubprocessLauncher - INFO - 2024-05-01 12:52:40,107 - INFO - Epoch: 1/1, Iter: 89/93 -- label: 5.0000 loss: 0.4453\n", + "2024-05-01 12:52:40,134 - SubprocessLauncher - INFO - 2024-05-01 12:52:40,133 - INFO - Epoch: 1/1, Iter: 72/93 -- label: 1.0000 loss: 0.5824\n", + "2024-05-01 12:52:40,329 - SubprocessLauncher - INFO - 2024-05-01 12:52:40,328 - INFO - Epoch: 1/1, Iter: 90/93 -- label: 3.0000 loss: 0.4254\n", + "2024-05-01 12:52:40,337 - SubprocessLauncher - INFO - 2024-05-01 12:52:40,337 - INFO - Epoch: 1/1, Iter: 73/93 -- label: 1.0000 loss: 0.5388\n", + "2024-05-01 12:52:40,558 - SubprocessLauncher - INFO - 2024-05-01 12:52:40,558 - INFO - Epoch: 1/1, Iter: 91/93 -- label: 0.0000 loss: 0.4215\n", + "2024-05-01 12:52:40,566 - SubprocessLauncher - INFO - 2024-05-01 12:52:40,565 - INFO - Epoch: 1/1, Iter: 74/93 -- label: 4.0000 
loss: 0.5428\n", + "2024-05-01 12:52:40,776 - SubprocessLauncher - INFO - 2024-05-01 12:52:40,776 - INFO - Epoch: 1/1, Iter: 92/93 -- label: 5.0000 loss: 0.3890\n", + "2024-05-01 12:52:40,783 - SubprocessLauncher - INFO - 2024-05-01 12:52:40,782 - INFO - Epoch: 1/1, Iter: 75/93 -- label: 0.0000 loss: 0.5056\n", + "2024-05-01 12:52:40,930 - SubprocessLauncher - INFO - 2024-05-01 12:52:40,929 - INFO - Epoch: 1/1, Iter: 76/93 -- label: 0.0000 loss: 0.5108\n", + "2024-05-01 12:52:41,074 - SubprocessLauncher - INFO - 2024-05-01 12:52:41,073 - INFO - Epoch: 1/1, Iter: 77/93 -- label: 4.0000 loss: 0.4951\n", + "2024-05-01 12:52:41,216 - SubprocessLauncher - INFO - 2024-05-01 12:52:41,216 - INFO - Epoch: 1/1, Iter: 93/93 -- label: 2.0000 loss: 0.4255\n", + "2024-05-01 12:52:41,217 - SubprocessLauncher - INFO - INFO:ignite.engine.engine.SupervisedTrainer:Epoch[1] Complete. Time taken: 00:00:20.492\n", + "2024-05-01 12:52:41,217 - SubprocessLauncher - INFO - INFO:ignite.engine.engine.SupervisedTrainer:Engine run complete. 
Time taken: 00:00:20.676\n", + "2024-05-01 12:52:41,229 - SubprocessLauncher - INFO - 2024-05-01 12:52:41,229 - INFO - Epoch: 1/1, Iter: 78/93 -- label: 2.0000 loss: 0.5023\n", + "2024-05-01 12:52:41,364 - SubprocessLauncher - INFO - 2024-05-01 12:52:41,364 - INFO - Epoch: 1/1, Iter: 79/93 -- label: 1.0000 loss: 0.5102\n", + "2024-05-01 12:52:41,503 - SubprocessLauncher - INFO - 2024-05-01 12:52:41,502 - INFO - Epoch: 1/1, Iter: 80/93 -- label: 3.0000 loss: 0.4766\n", + "2024-05-01 12:52:41,634 - SubprocessLauncher - INFO - 2024-05-01 12:52:41,634 - INFO - Epoch: 1/1, Iter: 81/93 -- label: 0.0000 loss: 0.4782\n", + "2024-05-01 12:52:41,773 - SubprocessLauncher - INFO - 2024-05-01 12:52:41,773 - INFO - Epoch: 1/1, Iter: 82/93 -- label: 1.0000 loss: 0.4899\n", + "2024-05-01 12:52:41,904 - SubprocessLauncher - INFO - 2024-05-01 12:52:41,903 - INFO - Epoch: 1/1, Iter: 83/93 -- label: 3.0000 loss: 0.4560\n", + "2024-05-01 12:52:42,034 - SubprocessLauncher - INFO - 2024-05-01 12:52:42,034 - INFO - Epoch: 1/1, Iter: 84/93 -- label: 5.0000 loss: 0.4419\n", + "2024-05-01 12:52:42,166 - SubprocessLauncher - INFO - 2024-05-01 12:52:42,166 - INFO - Epoch: 1/1, Iter: 85/93 -- label: 1.0000 loss: 0.4390\n", + "2024-05-01 12:52:42,302 - SubprocessLauncher - INFO - 2024-05-01 12:52:42,302 - INFO - Epoch: 1/1, Iter: 86/93 -- label: 2.0000 loss: 0.4208\n", + "2024-05-01 12:52:42,442 - SubprocessLauncher - INFO - 2024-05-01 12:52:42,441 - INFO - Epoch: 1/1, Iter: 87/93 -- label: 1.0000 loss: 0.4284\n", + "2024-05-01 12:52:42,601 - SubprocessLauncher - INFO - 2024-05-01 12:52:42,601 - INFO - Epoch: 1/1, Iter: 88/93 -- label: 5.0000 loss: 0.4012\n", + "2024-05-01 12:52:42,730 - SubprocessLauncher - INFO - 2024-05-01 12:52:42,730 - INFO - Epoch: 1/1, Iter: 89/93 -- label: 0.0000 loss: 0.3955\n", + "2024-05-01 12:52:42,864 - SubprocessLauncher - INFO - 2024-05-01 12:52:42,863 - INFO - Epoch: 1/1, Iter: 90/93 -- label: 3.0000 loss: 0.4008\n", + "2024-05-01 12:52:43,002 - 
SubprocessLauncher - INFO - 2024-05-01 12:52:43,002 - INFO - Epoch: 1/1, Iter: 91/93 -- label: 1.0000 loss: 0.4164\n", + "2024-05-01 12:52:43,142 - SubprocessLauncher - INFO - 2024-05-01 12:52:43,142 - INFO - Epoch: 1/1, Iter: 92/93 -- label: 0.0000 loss: 0.3996\n", + "2024-05-01 12:52:43,579 - SubprocessLauncher - INFO - 2024-05-01 12:52:43,579 - INFO - Epoch: 1/1, Iter: 93/93 -- label: 1.0000 loss: 0.3601\n", + "2024-05-01 12:52:43,580 - SubprocessLauncher - INFO - INFO:ignite.engine.engine.SupervisedTrainer:Epoch[1] Complete. Time taken: 00:00:20.453\n", + "2024-05-01 12:52:43,580 - SubprocessLauncher - INFO - INFO:ignite.engine.engine.SupervisedTrainer:Engine run complete. Time taken: 00:00:20.639\n", + "2024-05-01 12:52:46,615 - SubprocessLauncher - INFO - Class prediction is AbdomenCT. Ground-truth: AbdomenCT\n", + "2024-05-01 12:52:46,616 - SubprocessLauncher - INFO - Class prediction is BreastMRI. Ground-truth: BreastMRI\n", + "2024-05-01 12:52:46,616 - SubprocessLauncher - INFO - Class prediction is ChestCT. Ground-truth: ChestCT\n", + "2024-05-01 12:52:46,616 - SubprocessLauncher - INFO - Class prediction is CXR. Ground-truth: CXR\n", + "2024-05-01 12:52:46,616 - SubprocessLauncher - INFO - Class prediction is Hand. Ground-truth: Hand\n", + "2024-05-01 12:52:46,617 - SubprocessLauncher - INFO - Class prediction is HeadCT. Ground-truth: HeadCT\n", + "2024-05-01 12:52:46,617 - SubprocessLauncher - INFO - Class prediction is HeadCT. Ground-truth: HeadCT\n", + "2024-05-01 12:52:46,617 - SubprocessLauncher - INFO - Class prediction is CXR. Ground-truth: CXR\n", + "2024-05-01 12:52:46,617 - SubprocessLauncher - INFO - Class prediction is ChestCT. Ground-truth: ChestCT\n", + "2024-05-01 12:52:46,617 - SubprocessLauncher - INFO - Class prediction is BreastMRI. 
Ground-truth: BreastMRI\n", + "2024-05-01 12:52:46,617 - SubprocessLauncher - INFO - Accuracy of the network on the 5895 test images: 96.0 %\n", + "2024-05-01 12:52:46,617 - SubprocessLauncher - INFO - Traceback (most recent call last):\n", + "2024-05-01 12:52:46,617 - SubprocessLauncher - INFO - File \"/home/hroth/.venv_monai/lib/python3.10/site-packages/nvflare/client/model_registry.py\", line 82, in submit_model\n", + "2024-05-01 12:52:46,617 - SubprocessLauncher - INFO - model.params = diff_func(original=self.received_task.data.params, new=model.params)\n", + "2024-05-01 12:52:46,618 - SubprocessLauncher - INFO - File \"/home/hroth/.venv_monai/lib/python3.10/site-packages/nvflare/client/utils.py\", line 42, in numerical_params_diff\n", + "2024-05-01 12:52:46,618 - SubprocessLauncher - INFO - raise RuntimeError(\"no common keys between original and new dict, parameters difference are empty.\")\n", + "2024-05-01 12:52:46,618 - SubprocessLauncher - INFO - RuntimeError: no common keys between original and new dict, parameters difference are empty.\n", + "2024-05-01 12:52:46,618 - SubprocessLauncher - INFO - \n", + "2024-05-01 12:52:46,618 - SubprocessLauncher - INFO - During handling of the above exception, another exception occurred:\n", + "2024-05-01 12:52:46,618 - SubprocessLauncher - INFO - \n", + "2024-05-01 12:52:46,618 - SubprocessLauncher - INFO - Traceback (most recent call last):\n", + "2024-05-01 12:52:46,618 - SubprocessLauncher - INFO - File \"/media/hroth/NVIDIA/home_old/hroth/Code2/nvflare/monai_flywheel/integration/monai/examples/mednist/fedavg_workspace/simulate_job/app_site-2/custom/monai_mednist_train.py\", line 165, in \n", + "2024-05-01 12:52:46,619 - SubprocessLauncher - INFO - main()\n", + "2024-05-01 12:52:46,619 - SubprocessLauncher - INFO - File \"/media/hroth/NVIDIA/home_old/hroth/Code2/nvflare/monai_flywheel/integration/monai/examples/mednist/fedavg_workspace/simulate_job/app_site-2/custom/monai_mednist_train.py\", line 161, in main\n", 
+ "2024-05-01 12:52:46,619 - SubprocessLauncher - INFO - flare.send(output_model)\n", + "2024-05-01 12:52:46,619 - SubprocessLauncher - INFO - File \"/home/hroth/.venv_monai/lib/python3.10/site-packages/nvflare/client/api.py\", line 162, in send\n", + "2024-05-01 12:52:46,619 - SubprocessLauncher - INFO - model_registry.submit_model(model=fl_model)\n", + "2024-05-01 12:52:46,619 - SubprocessLauncher - INFO - File \"/home/hroth/.venv_monai/lib/python3.10/site-packages/nvflare/client/model_registry.py\", line 85, in submit_model\n", + "2024-05-01 12:52:46,619 - SubprocessLauncher - INFO - raise RuntimeError(f\"params diff function failed: {e}\")\n", + "2024-05-01 12:52:46,619 - SubprocessLauncher - INFO - RuntimeError: params diff function failed: no common keys between original and new dict, parameters difference are empty.\n" + ] + } + ], "source": [ "!nvflare simulator -n 2 -t 2 ./jobs/fedavg_mednist -w fedavg_workspace" ]