Add TODOs

YuanTingHsieh · Jul 18, 2023 · 698dedc · 698dedc
1 parent 2510207
commit 698dedc
Show file tree

Hide file tree

Showing 9 changed files with 228 additions and 54 deletions.
diff --git a/examples/advanced/ml-to-fl/cifar10_tutorial_clean.py b/examples/advanced/ml-to-fl/cifar10_tutorial_clean.py
@@ -0,0 +1,86 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torch.nn as nn
+import torch.optim as optim
+import torchvision
+import torchvision.transforms as transforms
+
+from net import Net
+
+# convert images to tensors, then normalize each of the three channels with mean 0.5 and std 0.5
+transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
+
+batch_size = 4
+
+# CIFAR10 training split rooted at ./data (download=True); batches are shuffled
+# NOTE(review): the loaders use num_workers=2 and are created at module level; on platforms
+# that spawn worker processes PyTorch recommends an `if __name__ == "__main__":` guard --
+# confirm the target platforms
+trainset = torchvision.datasets.CIFAR10(root="./data", train=True, download=True, transform=transform)
+trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=2)
+
+# CIFAR10 test split from the same root; batches keep dataset order (shuffle=False)
+testset = torchvision.datasets.CIFAR10(root="./data", train=False, download=True, transform=transform)
+testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size, shuffle=False, num_workers=2)
+
+
+# network architecture comes from the local net module
+net = Net()
+
+# cross-entropy loss, optimized with SGD (learning rate 0.001, momentum 0.9)
+criterion = nn.CrossEntropyLoss()
+optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
+
+
+for epoch in range(2):  # loop over the dataset multiple times
+
+    running_loss = 0.0
+    for i, data in enumerate(trainloader, 0):
+        # get the inputs; data is a list of [inputs, labels]
+        inputs, labels = data
+
+        # zero the parameter gradients
+        optimizer.zero_grad()
+
+        # forward + backward + optimize
+        outputs = net(inputs)
+        loss = criterion(outputs, labels)
+        loss.backward()
+        optimizer.step()
+
+        # print statistics
+        running_loss += loss.item()
+        if i % 2000 == 1999:  # print every 2000 mini-batches
+            print(f"[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}")
+            running_loss = 0.0
+
+print("Finished Training")
+
+
+# save the trained weights to disk
+PATH = "./cifar_net.pth"
+torch.save(net.state_dict(), PATH)
+
+
+# start from a fresh Net and load the weights that were just saved
+net = Net()
+net.load_state_dict(torch.load(PATH))
+
+
+# counters for correctly classified and total test images
+correct = 0
+total = 0
+# since we're not training, we don't need to calculate the gradients for our outputs
+with torch.no_grad():
+    for data in testloader:
+        images, labels = data
+        # calculate outputs by running images through the network
+        outputs = net(images)
+        # the class with the highest energy is what we choose as prediction
+        _, predicted = torch.max(outputs.data, 1)
+        total += labels.size(0)
+        correct += (predicted == labels).sum().item()
+
+# accuracy is reported as an integer percentage (floor division)
+print(f"Accuracy of the network on the 10000 test images: {100 * correct // total} %")
diff --git a/examples/advanced/ml-to-fl/jobs/decorator/app/custom/cifar10.py b/examples/advanced/ml-to-fl/jobs/decorator/app/custom/cifar10.py
@@ -17,33 +17,47 @@
 import torch.optim as optim
 import torchvision
 import torchvision.transforms as transforms
+
 from net import Net
 
-import nvflare
+# TODO: confirm whether the client API should be exposed as the nvflare.client package
+import nvflare.client as flare
 
 PATH = "./cifar_net.pth"
 
 transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
 
 batch_size = 4
 
-trainset = torchvision.datasets.CIFAR10(root="/tmp/cifar10/data", train=True, download=True, transform=transform)
+trainset = torchvision.datasets.CIFAR10(root="./data", train=True, download=True, transform=transform)
 trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=2)
 
-testset = torchvision.datasets.CIFAR10(root="/tmp/cifar10/data", train=False, download=True, transform=transform)
+testset = torchvision.datasets.CIFAR10(root="./data", train=False, download=True, transform=transform)
 testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size, shuffle=False, num_workers=2)
 
-net = Net()
+# TODO: use init for the decorator API as well
+flare.init(config="./config.json")
 
+net = Net()
 
-@nvflare.train
-def train(total_epochs, lr, weights=None, device="cuda:0"):
+# TODO: match arguments by position first, not by pre-defined keyword name,
+#     and let the user pass their own mapping
+# TODO: add return of output meta
+# TODO: support input meta? Yes, via string paths such as:
+#    - lr = "meta.lr"
+#    - rounds = "meta.total_rounds"
+# TODO: add flare.get_sys_meta()
+
+@flare.train(
+    lr="meta.lr"
+)
+def train(weights=None, lr=0.001, device="cuda:0"):
     if weights is not None:
         net.load_state_dict(weights)
     net.to(device)
     criterion = nn.CrossEntropyLoss()
     optimizer = optim.SGD(net.parameters(), lr=lr, momentum=0.9)
-    for epoch in range(total_epochs):  # loop over the dataset multiple times
+    for epoch in range(2):  # loop over the dataset multiple times
 
         running_loss = 0.0
         for i, data in enumerate(trainloader, 0):
@@ -64,12 +78,13 @@ def train(total_epochs, lr, weights=None, device="cuda:0"):
             if i % 2000 == 1999:  # print every 2000 mini-batches
                 print(f"[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}")
                 running_loss = 0.0
-                break
 
     print("Finished Training")
     torch.save(net.state_dict(), PATH)
 
-    return net.state_dict()
+    return net.state_dict(), meta
+
+# TODO: add return of meta
 
 
 @nvflare.evaluate
@@ -91,7 +106,7 @@ def evaluate(weights=None, device="cuda:0"):
             total += labels.size(0)
             correct += (predicted == labels).sum().item()
     print(f"Accuracy of the network on the 10000 test images: {100 * correct // total} %")
-    return 100 * correct // total
+    return 100 * correct // total, meta
 
 
 evaluate()

diff --git a/.../app_client/config/config_fed_client.json → ...terface/app/config/config_fed_client.json b/.../app_client/config/config_fed_client.json → ...terface/app/config/config_fed_client.json
@@ -21,7 +21,7 @@
       "id": "launcher",
       "name": "SubprocessLauncher",
       "args": {
-        "script": "python custom/cifar10.py --epochs 1"
+        "script": "python custom/cifar10_tutorial_clean.py --epochs 1"
       }
     }
   ]

diff --git a/.../app_server/config/config_fed_server.json → ...terface/app/config/config_fed_server.json b/.../app_server/config/config_fed_server.json → ...terface/app/config/config_fed_server.json
diff --git a/...bs/interface/app_client/custom/cifar10.py → ...o-fl/jobs/interface/app/custom/cifar10.py b/...bs/interface/app_client/custom/cifar10.py → ...o-fl/jobs/interface/app/custom/cifar10.py
diff --git a/examples/advanced/ml-to-fl/jobs/interface/app/custom/cifar10_tutorial_clean.py b/examples/advanced/ml-to-fl/jobs/interface/app/custom/cifar10_tutorial_clean.py
@@ -0,0 +1,114 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Any, Dict, Optional
+
+import torch
+import torch.nn as nn
+import torch.optim as optim
+import torchvision
+import torchvision.transforms as transforms
+
+import nvflare
+
+from net import Net
+
+# convert images to tensors, then normalize each of the three channels with mean 0.5 and std 0.5
+transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
+
+batch_size = 4
+
+# CIFAR10 training split rooted at ./data (download=True); batches are shuffled
+trainset = torchvision.datasets.CIFAR10(root="./data", train=True, download=True, transform=transform)
+trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=2)
+
+# CIFAR10 test split from the same root; batches keep dataset order (shuffle=False)
+testset = torchvision.datasets.CIFAR10(root="./data", train=False, download=True, transform=transform)
+testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size, shuffle=False, num_workers=2)
+
+
+# network architecture comes from the local net module
+net = Net()
+
+# initializes NVFlare interface
+# NOTE(review): the decorator example in this change calls flare.init(config=...) while this
+# call uses conf=... -- confirm which keyword the API accepts
+nvflare.init(conf="./config.json")
+# receive_model() is unpacked into the model weights and a meta object;
+# input_meta is not read anywhere else in this script
+input_model, input_meta = nvflare.receive_model()
+
+# TODO: input_meta should contain only items related to the user's training code
+# TODO: add other helper methods?
+#   - nvflare.get_total_rounds()
+#   - nvflare.get_job_id()
+# TODO: add something like "nvflare.get_sys_meta()"
+#    that returns "site-name", "job-id"?
+
+
+# get model from NVFlare
+net.load_state_dict(input_model)
+
+# cross-entropy loss, optimized with SGD (learning rate 0.001, momentum 0.9)
+criterion = nn.CrossEntropyLoss()
+optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
+
+
+for epoch in range(2):  # loop over the dataset multiple times
+
+    running_loss = 0.0
+    for i, data in enumerate(trainloader, 0):
+        # get the inputs; data is a list of [inputs, labels]
+        inputs, labels = data
+
+        # zero the parameter gradients
+        optimizer.zero_grad()
+
+        # forward + backward + optimize
+        outputs = net(inputs)
+        loss = criterion(outputs, labels)
+        loss.backward()
+        optimizer.step()
+
+        # print statistics
+        running_loss += loss.item()
+        if i % 2000 == 1999:  # print every 2000 mini-batches
+            print(f"[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}")
+            running_loss = 0.0
+
+print("Finished Training")
+
+
+# save the locally trained weights to disk
+PATH = "./cifar_net.pth"
+torch.save(net.state_dict(), PATH)
+
+
+# NOTE(review): net is rebound to a fresh Net loaded with input_model, so the evaluation
+# below measures the received weights, not the ones just trained (those remain only in
+# the file at PATH) -- confirm this is intended
+net = Net()
+net.load_state_dict(input_model)
+
+
+# counters for correctly classified and total test images
+correct = 0
+total = 0
+# since we're not training, we don't need to calculate the gradients for our outputs
+with torch.no_grad():
+    for data in testloader:
+        images, labels = data
+        # calculate outputs by running images through the network
+        outputs = net(images)
+        # the class with the highest energy is what we choose as prediction
+        _, predicted = torch.max(outputs.data, 1)
+        total += labels.size(0)
+        correct += (predicted == labels).sum().item()
+
+# accuracy is reported as an integer percentage (floor division)
+print(f"Accuracy of the network on the 10000 test images: {100 * correct // total} %")
+
+# Report the accuracy measured above, then send back the result of local training.
+# `net` was re-created and loaded with `input_model` before the evaluation, so its
+# state_dict is the received model; the locally trained weights are the ones that
+# were saved to PATH, so those are loaded and submitted instead.
+nvflare.submit_metrics(100 * correct // total)
+nvflare.submit_model(torch.load(PATH))
+
+
+def submit_metrics(metrics: Any, meta: Optional[Dict] = None):
+    """Placeholder sketch of a metrics-submission call; does nothing and returns None.
+
+    NOTE(review): presumably mirrors the intended signature of
+    nvflare.submit_metrics -- confirm against the client API.
+
+    Args:
+        metrics: value to report (the script above passes an integer accuracy percentage).
+        meta: optional dictionary of extra information to accompany the metrics.
+    """
+    pass
+
+
+def submit_model(model: Any, meta: Optional[Dict] = None):
+    """Placeholder sketch of a model-submission call; does nothing and returns None.
+
+    NOTE(review): presumably mirrors the intended signature of
+    nvflare.submit_model -- confirm against the client API.
+
+    Args:
+        model: model to send back (the script above passes a state_dict).
+        meta: optional dictionary of extra information to accompany the model.
+    """
+    pass
diff --git a/...l/jobs/interface/app_client/custom/net.py → ...ml-to-fl/jobs/interface/app/custom/net.py b/...l/jobs/interface/app_client/custom/net.py → ...ml-to-fl/jobs/interface/app/custom/net.py
diff --git a/examples/advanced/ml-to-fl/jobs/interface/app_server/custom/net.py b/examples/advanced/ml-to-fl/jobs/interface/app_server/custom/net.py
diff --git a/examples/advanced/ml-to-fl/jobs/interface/meta.json b/examples/advanced/ml-to-fl/jobs/interface/meta.json
@@ -3,12 +3,8 @@
   "resource_spec": {},
   "min_clients" : 2,
   "deploy_map": {
-    "app_server": [
-      "server"
-    ],
-    "app_client": [
-      "site-1",
-      "site-2"
+    "app": [
+      "@ALL"
     ]
   }
 }