class Config:
def __init__(self, **kwargs):
####################################
#### Basic Configurations ##########
####################################
        # job_name and nID are set from command-line arguments at launch
self.job_name = kwargs.get("job_name", "wk")
self.nID = kwargs.get("nID", 0)
        # IP address used when the DL model is trained on the local machine
        self.local_ip = "localhost"
        # IP addresses of the remote machines, used when the DL model
        # is trained on remote machines
self.remote_ip = ["111.222.333.444", "111.222.333.555", "111.222.333.666", "111.222.333.777", "111.222.333.888"]
        # Indices into remote_ip of the machines that will be used for training
        self.remote = [0, 2, 3]
        # For training on the local machine, use:
        # self.remote = None
# Number of machines you will use for training
self.num_workers = 2
# Workers' port number
self.worker_port = 2222
        # Each worker runs its own Redis server.
# Redis server's port number
self.redis_port = 6380
self.training_epochs = 10
self.batch_size = 104
# Number of batches per epoch
self.num_batches = 480
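        # (With batch_size = 104, the 480 batches cover 49,920 examples per
        # epoch, roughly one pass over a ~50k-example training set)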
####################################
#### Ways to train a model #########
####################################
# True : Synchronous DL
# False : Asynchronous DL
self.synchronous_training = False
        # True: train the model until it reaches a target accuracy
        # False: train the model for a fixed amount of time
        self.train_until_fixed_accuracy = False
        # True: testing mode, i.e. train for only a few iterations as a quick check
        # False: normal training
self.testing = False
if self.train_until_fixed_accuracy:
            # Train the model until it reaches the target accuracy
            self.target_accuracy = 0.4
            # Check the model's accuracy every this many iterations
            self.iteration_to_check_accuracy = 5
            # Give up after this many seconds even if the target accuracy has not been reached
self.stop_time = 3000
else:
            # Train the model for a fixed amount of time:
            # stop after this many seconds
self.stop_time = 300
if self.testing:
            # Testing mode: train for only this many epochs and iterations
self.training_epochs = 1
self.testing_iteration = 10
####################################
        #### Ako-Specific Configurations ###
####################################
        # P value for each worker
self.p = kwargs.get("p", [4, 4])
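        # With p = [4, 4], each of the two workers would use the 4-way
        # partition scheme defined in self.partitions below (keys 1, 2, 4)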
        # Staleness bound (in iterations) for SSP (Stale Synchronous Parallel)
        # 0 : the strictest synchronization
self.synch_max_diff = 0
        # How gradients are partitioned:
        # False: partition gradients layer by layer
        # True: allow even a single layer to be split into multiple partitions
self.fine_grained_partition = False
# Number of threads used for receiving gradients from other workers
self.num_dqthreads = 2
        # Trainable variables with their queue names and indices
        # Modify this to match your model
        # wid is an incremental ID
        # num_parts is the number of partitions of the layer
        # shape is the shape of the layer
        # range holds the partition boundaries along the layer's first dimension
self.weights = dict()
self.weights["W_conv1"] = {"wid": 0, "num_parts": 1, "shape": (5, 5, 3, 32), "range": [0, 5]}
self.weights["b_conv1"] = {"wid": 1, "num_parts": 1, "shape": (32), "range": [0, 32]}
self.weights["W_conv2"] = {"wid": 2, "num_parts": 1, "shape": (3, 3, 32, 64), "range": [0, 3]}
self.weights["b_conv2"] = {"wid": 3, "num_parts": 1, "shape": (64), "range": [0, 5]}
self.weights["W_conv3"] = {"wid": 4, "num_parts": 1, "shape": (3, 3, 64, 64), "range": [0, 3]}
self.weights["b_conv3"] = {"wid": 5, "num_parts": 1, "shape": (64), "range": [0, 64]}
self.weights["W_fc1"] = {"wid": 6, "num_parts": 4, "shape": (4096, 1024), "range": [0, 1024, 2048, 3072, 4096]}
self.weights["b_fc1"] = {"wid": 7, "num_parts": 1, "shape": (1024), "range": [0, 1024]}
self.weights["W_fc2"] = {"wid": 8, "num_parts": 1, "shape": (1024, 10), "range": [0, 1024]}
self.weights["b_fc2"] = {"wid": 9, "num_parts": 1, "shape": (10), "range": [0, 10]}
        # For fine-grained partitioning (self.fine_grained_partition = True):
        # every entry of self.weights whose num_parts is greater than 1 must
        # have its subweights listed here
        # wid continues the incremental IDs from self.weights
        # part is the 1-based index of the partition within the layer
        # shape is the shape of the subweight
self.subweights = dict()
self.subweights["1@W_fc1"] = {"wid": 10, "part": 1, "shape": (1024, 1024)}
self.subweights["2@W_fc1"] = {"wid": 11, "part": 2, "shape": (1024, 1024)}
self.subweights["3@W_fc1"] = {"wid": 12, "part": 3, "shape": (1024, 1024)}
self.subweights["4@W_fc1"] = {"wid": 13, "part": 4, "shape": (1024, 1024)}
        # Specify how gradients are grouped into partitions for each P value
if self.fine_grained_partition:
            # Fine-grained partitioning: a layer's subweights may be spread
            # across different partitions
self.partitions = dict()
self.partitions[1] = [["W_conv1", "b_conv1", "W_conv2", "b_conv2", "W_conv3", "b_conv3", "W_fc1", "b_fc1", "W_fc2", "b_fc2"]]
self.partitions[2] = [["W_conv1", "b_conv1", "W_conv2", "b_conv2", "1@W_fc1", "2@W_fc1", "b_fc1"], ["W_conv3", "b_conv3", "W_fc2", "b_fc2", "3@W_fc1", "4@W_fc1"]]
self.partitions[4] = [["W_conv1", "b_conv1", "1@W_fc1", "b_fc1"], ["W_conv2", "b_conv2", "2@W_fc1"], ["W_conv3", "b_conv3", "3@W_fc1"], ["W_fc2", "b_fc2", "4@W_fc1"]]
else:
            # Layer-by-layer partitioning (self.fine_grained_partition = False):
            # each layer stays whole within a single partition
self.partitions = dict()
self.partitions[1] = [["W_conv1", "b_conv1", "W_conv2", "b_conv2", "W_conv3", "b_conv3", "W_fc1", "b_fc1", "W_fc2", "b_fc2"]]
self.partitions[2] = [["W_conv1", "b_conv1", "W_conv2", "b_conv2", "W_conv3", "b_conv3", "W_fc2", "b_fc2"], ["W_fc1", "b_fc1"]]
self.partitions[4] = [["W_conv1", "b_conv1", "W_fc2", "b_fc2"], ["W_conv2", "b_conv2"], ["W_conv3", "b_conv3"], ["W_fc1", "b_fc1"]]
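
# Minimal usage sketch (illustrative): the launch command normally supplies
# job_name and nID for each worker process
if __name__ == "__main__":
    config = Config(job_name="wk", nID=0, p=[4, 4])
    print(config.job_name, config.nID, config.num_workers, config.redis_port)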