Skip to content

Commit

Permalink
Add TODOs
Browse files Browse the repository at this point in the history
  • Loading branch information
YuanTingHsieh committed Jul 18, 2023
1 parent 2510207 commit 698dedc
Show file tree
Hide file tree
Showing 9 changed files with 228 additions and 54 deletions.
86 changes: 86 additions & 0 deletions examples/advanced/ml-to-fl/cifar10_tutorial_clean.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms

from net import Net

# Convert images to tensors, then normalize each of the three channels
# with mean 0.5 and standard deviation 0.5.
channel_mean = (0.5, 0.5, 0.5)
channel_std = (0.5, 0.5, 0.5)
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(channel_mean, channel_std),
    ]
)

batch_size = 4

# Training split of CIFAR-10 (downloaded into ./data if needed), shuffling enabled.
trainset = torchvision.datasets.CIFAR10(
    root="./data",
    train=True,
    download=True,
    transform=transform,
)
trainloader = torch.utils.data.DataLoader(
    trainset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=2,
)

# Test split of CIFAR-10, shuffling disabled.
testset = torchvision.datasets.CIFAR10(
    root="./data",
    train=False,
    download=True,
    transform=transform,
)
testloader = torch.utils.data.DataLoader(
    testset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=2,
)


# Model, loss function and optimizer used by the training loop below.
net = Net()

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)


# Run two full passes over the training data.
for epoch in range(2):
    running_loss = 0.0

    # Each item yielded by the loader unpacks into (inputs, labels).
    for i, (inputs, labels) in enumerate(trainloader):
        # Clear the gradients accumulated for the previous mini-batch.
        optimizer.zero_grad()

        # Forward pass, loss, backward pass, parameter update.
        loss = criterion(net(inputs), labels)
        loss.backward()
        optimizer.step()

        # Accumulate the loss and report its mean once per 2000 mini-batches.
        running_loss += loss.item()
        if (i + 1) % 2000 == 0:
            print(f"[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}")
            running_loss = 0.0

print("Finished Training")


# Persist the trained weights to disk.
PATH = "./cifar_net.pth"
trained_weights = net.state_dict()
torch.save(trained_weights, PATH)


# Rebuild the model from scratch and restore the weights that were just saved.
net = Net()
restored_weights = torch.load(PATH)
net.load_state_dict(restored_weights)


# Tally correct predictions over the whole test loader.
correct = 0
total = 0
# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    for images, labels in testloader:
        # Run the batch of images through the network.
        outputs = net(images)
        # The predicted class is the index of the largest output along dim 1.
        predicted = torch.max(outputs.data, 1).indices
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f"Accuracy of the network on the 10000 test images: {100 * correct // total} %")
35 changes: 25 additions & 10 deletions examples/advanced/ml-to-fl/jobs/decorator/app/custom/cifar10.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,33 +17,47 @@
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms

from net import Net

import nvflare
# TODO: should this live in an nvflare.client package?
import nvflare.client as flare

PATH = "./cifar_net.pth"

transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

batch_size = 4

trainset = torchvision.datasets.CIFAR10(root="/tmp/cifar10/data", train=True, download=True, transform=transform)
trainset = torchvision.datasets.CIFAR10(root="./data", train=True, download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=2)

testset = torchvision.datasets.CIFAR10(root="/tmp/cifar10/data", train=False, download=True, transform=transform)
testset = torchvision.datasets.CIFAR10(root="./data", train=False, download=True, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size, shuffle=False, num_workers=2)

net = Net()
# TODO: use init for the decorator as well
flare.init(config="./config.json")

net = Net()

@nvflare.train
def train(total_epochs, lr, weights=None, device="cuda:0"):
# TODO: map arguments by position first, not by pre-defined keyword name;
# users can also pass their own mapping
# TODO: add return of output meta
# TODO: support input meta? Yes -- specify it with strings, e.g.:
# - lr = "meta.lr"
# - rounds = "meta.total_rounds"
# TODO: flare.get_sys_meta()

@flare.train(
lr="meta.lr"
)
def train(weights=None, lr=0.001, device="cuda:0"):
if weights is not None:
net.load_state_dict(weights)
net.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=lr, momentum=0.9)
for epoch in range(total_epochs): # loop over the dataset multiple times
for epoch in range(2): # loop over the dataset multiple times

running_loss = 0.0
for i, data in enumerate(trainloader, 0):
Expand All @@ -64,12 +78,13 @@ def train(total_epochs, lr, weights=None, device="cuda:0"):
if i % 2000 == 1999: # print every 2000 mini-batches
print(f"[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}")
running_loss = 0.0
break

print("Finished Training")
torch.save(net.state_dict(), PATH)

return net.state_dict()
return net.state_dict(), meta

# TODO: add return of meta


@nvflare.evaluate
Expand All @@ -91,7 +106,7 @@ def evaluate(weights=None, device="cuda:0"):
total += labels.size(0)
correct += (predicted == labels).sum().item()
print(f"Accuracy of the network on the 10000 test images: {100 * correct // total} %")
return 100 * correct // total
return 100 * correct // total, meta


evaluate()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
"id": "launcher",
"name": "SubprocessLauncher",
"args": {
"script": "python custom/cifar10.py --epochs 1"
"script": "python custom/cifar10_tutorial_clean.py --epochs 1"
}
}
]
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Any, Dict, Optional

import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms

import nvflare

from net import Net

# Convert images to tensors, then normalize each of the three channels
# with mean 0.5 and standard deviation 0.5.
channel_mean = (0.5, 0.5, 0.5)
channel_std = (0.5, 0.5, 0.5)
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(channel_mean, channel_std),
    ]
)

batch_size = 4

# Training split of CIFAR-10 (downloaded into ./data if needed), shuffling enabled.
trainset = torchvision.datasets.CIFAR10(
    root="./data",
    train=True,
    download=True,
    transform=transform,
)
trainloader = torch.utils.data.DataLoader(
    trainset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=2,
)

# Test split of CIFAR-10, shuffling disabled.
testset = torchvision.datasets.CIFAR10(
    root="./data",
    train=False,
    download=True,
    transform=transform,
)
testloader = torch.utils.data.DataLoader(
    testset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=2,
)


# Model instance that will receive the weights and be trained locally.
net = Net()

# Set up the NVFlare interface from the local config file, then fetch the
# model and its accompanying meta from NVFlare.
nvflare.init(conf="./config.json")
received = nvflare.receive_model()
input_model, input_meta = received

# TODO: input_meta should relate ONLY to the user's training code
# TODO: other helper methods to consider:
#   - nvflare.get_total_rounds()
#   - nvflare.get_job_id()
# TODO: something like "nvflare.get_sys_meta()" that
#   returns "site-name", "job-id"?


# Start local training from the weights received from NVFlare.
net.load_state_dict(input_model)

# Loss function and optimizer used by the training loop below.
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)


# Run two full passes over the training data.
for epoch in range(2):
    running_loss = 0.0

    # Each item yielded by the loader unpacks into (inputs, labels).
    for i, (inputs, labels) in enumerate(trainloader):
        # Clear the gradients accumulated for the previous mini-batch.
        optimizer.zero_grad()

        # Forward pass, loss, backward pass, parameter update.
        loss = criterion(net(inputs), labels)
        loss.backward()
        optimizer.step()

        # Accumulate the loss and report its mean once per 2000 mini-batches.
        running_loss += loss.item()
        if (i + 1) % 2000 == 0:
            print(f"[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}")
            running_loss = 0.0

print("Finished Training")


# Persist the locally trained weights to disk.
PATH = "./cifar_net.pth"
torch.save(net.state_dict(), PATH)


# Evaluate the weights received from NVFlare on a separate model instance.
# ``net`` must keep the locally trained weights: rebinding it here (as the
# previous version did) discarded the training result and sent the unchanged
# input model back to NVFlare.
eval_net = Net()
eval_net.load_state_dict(input_model)


correct = 0
total = 0
# since we're not training, we don't need to calculate the gradients for our outputs
with torch.no_grad():
    for data in testloader:
        images, labels = data
        # calculate outputs by running images through the network
        outputs = eval_net(images)
        # the class with the highest energy is what we choose as prediction
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f"Accuracy of the network on the 10000 test images: {100 * correct // total} %")

# Report the accuracy measured on the received model, then send back the
# locally trained weights.
nvflare.submit_metrics(100 * correct // total)
nvflare.submit_model(net.state_dict())


def submit_metrics(metrics: Any, meta: Optional[Dict] = None) -> None:
    """Placeholder sketch of the metrics-submission API; currently a no-op.

    The ``Any``/``Optional``/``Dict`` annotations are evaluated when the
    function is defined, so they must be imported from ``typing`` at the top
    of the file.

    Args:
        metrics: The metrics value to submit (not used yet).
        meta: Optional dictionary accompanying the metrics (not used yet).
    """
    pass


def submit_model(model: Any, meta: Optional[Dict] = None) -> None:
    """Placeholder sketch of the model-submission API; currently a no-op.

    The ``Any``/``Optional``/``Dict`` annotations are evaluated when the
    function is defined, so they must be imported from ``typing`` at the top
    of the file.

    Args:
        model: The model object to submit (not used yet).
        meta: Optional dictionary accompanying the model (not used yet).
    """
    pass
37 changes: 0 additions & 37 deletions examples/advanced/ml-to-fl/jobs/interface/app_server/custom/net.py

This file was deleted.

8 changes: 2 additions & 6 deletions examples/advanced/ml-to-fl/jobs/interface/meta.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,8 @@
"resource_spec": {},
"min_clients" : 2,
"deploy_map": {
"app_server": [
"server"
],
"app_client": [
"site-1",
"site-2"
"app": [
"@ALL"
]
}
}

0 comments on commit 698dedc

Please sign in to comment.