-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils.py
214 lines (160 loc) · 7.86 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
import argparse
import datetime
import functools
import logging
import pickle
from pathlib import Path
import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset, random_split
import os
import pandas as pd
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator
from collections import defaultdict
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import scipy.stats
# load dataset
def load_datasets(file: Path, n_train: int, n_valid: int, n_test: int):
    """Load the ``x``/``y`` arrays stored in *file* and split them three ways.

    Both arrays are converted to double-precision tensors, paired in a
    ``TensorDataset`` and partitioned with ``random_split`` using a generator
    seeded with 42, so the same split is produced on every call.

    Args:
        file: Path handed to ``np.load``; the loaded object must expose the
            keys ``"x"`` and ``"y"``.
        n_train: Number of examples in the training subset.
        n_valid: Number of examples in the validation subset.
        n_test: Number of examples in the test subset.

    Returns:
        The tuple ``(train_set, valid_set, test_set)``.

    Raises:
        AssertionError: If the three sizes do not add up to ``len(x)``.
    """
    archive = np.load(file)

    # Inputs first, then targets; each is reported on stdout once converted.
    x = torch.tensor(archive["x"]).type(torch.DoubleTensor)
    print("x shape - ", x.shape)

    y = torch.tensor(archive["y"]).type(torch.DoubleTensor)
    print("y shape - ", y.shape)

    logging.info(f"Loaded {len(x)} examples from '{file.resolve()}'")
    assert len(x) == n_train + n_valid + n_test

    paired = TensorDataset(x, y)
    # Fixed seed keeps the train/valid/test membership reproducible.
    seeded_generator = torch.Generator().manual_seed(42)
    train_set, valid_set, test_set = random_split(
        paired,
        [n_train, n_valid, n_test],
        generator=seeded_generator,
    )
    logging.debug(f"train/valid/test = {len(train_set)}/{len(valid_set)}/{len(test_set)}")
    return train_set, valid_set, test_set
# mean and confidence bounds (0.95 by default) computed over multiple runs
def mean_confidence_interval(data, confidence=0.95):
    """Return the mean of *data* together with its Student-t confidence bounds.

    Args:
        data: Sequence of numeric observations (e.g. one value per run).
        confidence: Confidence level used for both the half-width and the
            interval (default 0.95).

    Returns:
        A tuple ``(m, h, check)`` where ``m`` is the sample mean, ``h`` is the
        half-width of the confidence interval (so the bounds are ``m - h`` and
        ``m + h``) and ``check`` is the ``(low, high)`` interval reported by
        ``scipy.stats.t.interval``. With fewer than two observations no
        spread can be estimated, so ``h`` and ``check`` are both ``0``.
    """
    a = 1.0 * np.array(data)
    n = len(a)
    m = np.mean(a)
    if n > 1:
        se = scipy.stats.sem(a)
        h = se * scipy.stats.t.ppf((1 + confidence) / 2., n - 1)
        # The confidence level is passed positionally: the keyword was renamed
        # from `alpha` to `confidence` in newer SciPy releases, and the
        # positional form works with both. It also honours the caller's
        # `confidence` instead of a hard-coded 0.95.
        check = scipy.stats.t.interval(confidence, df=n - 1, loc=m, scale=se)
    else:
        h = 0
        check = 0
    return m, h, check
# returns tabular data (pandas DataFrames) built from TensorBoard event files
def tabulate_events(dpath, output_dir):
    """Convert the TensorBoard scalar logs under *dpath* into DataFrames.

    Every entry of ``dpath`` is treated as one run. The scalars of a run are
    gathered into a DataFrame (one block per tag, a single ``'value'`` row,
    one column per step) and cached as ``<output_dir>/runs_pkl_files/<run>.pkl``.
    Runs whose pickle already exists are read back from the cache instead of
    being converted again.

    Args:
        dpath: Directory whose entries are the TensorBoard run directories.
        output_dir: Directory under which the ``runs_pkl_files`` cache folder
            is created (if missing) and used.

    Returns:
        Dict mapping run name to its DataFrame. Runs that contain no scalar
        tags are reported on stdout and left out of the result.
    """
    Path(output_dir, "runs_pkl_files").mkdir(parents=True, exist_ok=True)
    save_directory = os.path.join(output_dir, "runs_pkl_files")
    # A set keeps the per-run "already converted?" lookup O(1).
    files_already_converted = set(os.listdir(save_directory))

    final_out = {}
    for dname in os.listdir(dpath):
        pkl_name = dname + ".pkl"
        pkl_path = os.path.join(save_directory, pkl_name)

        if pkl_name in files_already_converted:
            # NOTE(review): unpickling executes arbitrary code; this is only
            # safe because the cache directory holds files written by this
            # function. Do not point output_dir at untrusted data.
            final_out[dname] = pd.read_pickle(pkl_path)
            print(dname, " already converted")
            continue

        print(f"Converting run {dname}", end="")
        ea = EventAccumulator(os.path.join(dpath, dname)).Reload()
        tags = ea.Tags()['scalars']

        if len(tags) == 0:
            # Nothing to tabulate: skip the run rather than storing an
            # unbound or stale `df` (left over from a previous run) under
            # this run's name.
            print('- Not scalers to write')
            continue

        out = {}
        for tag in tags:
            tag_values = []
            steps = []
            for event in ea.Scalars(tag):
                tag_values.append(event.value)
                steps.append(event.step)
            out[tag] = pd.DataFrame(data=dict(zip(steps, tag_values)), columns=steps, index=['value'])

        df = pd.concat(out.values(), keys=out.keys())
        df.to_pickle(pkl_path)
        print("- Done")
        final_out[dname] = df

    return final_out
# collects loss values across multiple runs and arranges them in dictionaries. This function returns two dictionaries: one with the loss values of each model for every learning rate and run, the other with the names (addresses in the directory) of the runs those values came from
def collect_loss_values(row_heading, model_names, tab):
    """Group one scalar per run by model and learning rate.

    A run (a key of *tab*) belongs to the first model in *model_names* for
    which ``"<model>_lr"`` occurs in the run name, and to the first learning
    rate tag, tried in the order ``lr0.1``, ``lr0.01``, ``lr0.001``,
    ``lr1.0``, that occurs in the run name. Runs that match no model, or a
    model but none of the tags, are ignored.

    Args:
        row_heading: Top-level row label (scalar tag) to read from each run's
            DataFrame, e.g. ``"loss/test_best_model"``.
        model_names: Iterable of model names to look for in the run names.
        tab: Mapping of run name to DataFrame whose index entries are
            indexable with ``[0]`` to obtain the tag (as produced by
            concatenating per-tag frames with ``keys=``).

    Returns:
        A tuple ``(values, addresses)`` of dicts keyed by model name. Each
        value is a list with four slots (one per learning rate tag, in the
        order above). ``values[model][slot]`` holds, per run, the first cell
        of the ``row_heading`` row, or ``np.nan`` when the run has no such
        row; ``addresses[model][slot]`` holds the matching run names in the
        same order.
    """
    # Tuple order defines both the match priority and the slot index.
    lr_tags = ("lr0.1", "lr0.01", "lr0.001", "lr1.0")

    test_best_model_dict = {}
    test_best_model_address_dict = {}
    for name in model_names:
        test_best_model_dict[name] = [[] for _ in lr_tags]  # one slot per learning rate
        test_best_model_address_dict[name] = [[] for _ in lr_tags]

    for run_name in tab:
        frame = tab[run_name]
        row_headings = {k[0] for k in frame.index}

        for model_name in model_names:
            if model_name + "_lr" not in run_name:
                continue

            for slot, lr_tag in enumerate(lr_tags):
                if lr_tag not in run_name:
                    continue
                test_best_model_address_dict[model_name][slot].append(run_name)
                # Check for the heading that is actually read below, so a
                # missing row yields NaN instead of a KeyError.
                if row_heading in row_headings:
                    value = frame.loc[[row_heading]].to_numpy()[0][0]
                else:
                    value = np.nan
                test_best_model_dict[model_name][slot].append(value)
                break

            # A run is attributed to the first matching model only.
            break

    return test_best_model_dict, test_best_model_address_dict
# Takes the two dictionaries with loss values and addresses across models-learning rates-runs and produces dictionaries for best model addresses and loss values across learning rates, runs and also returns the overall best model file name.
def collect_addresses_and_loss_for_LeastLoss_BestModel(test_best_model_dict, test_best_model_address):
    """Find, for every model, the run with the smallest loss.

    Args:
        test_best_model_dict: Dict mapping model name to a list with one
            entry per learning rate, each entry being the list of loss
            values (possibly NaN) of the runs at that learning rate.
        test_best_model_address: Dict with the same layout holding the run
            names that correspond to those loss values.

    Returns:
        A 5-tuple of dicts keyed by model name:
        ``min_values_across_lr`` (smallest loss overall, NaN if none),
        ``min_addresses_across_lr`` (index of the learning rate achieving
        it, NaN if none), ``min_values_across_runs`` (per learning rate, the
        smallest loss across runs, NaN where there is none),
        ``min_addresses_across_runs`` (per learning rate, the index of that
        run, NaN where there is none) and ``file_name_of_best_model`` (name
        of the overall best run, or ``None``).

    All five dicts, plus the input losses, are also printed to stdout.
    """
    min_values_across_runs = {}     # model -> per learning rate, least loss over runs
    min_addresses_across_runs = {}  # model -> per learning rate, index of that run
    min_values_across_lr = {}       # model -> least loss over all learning rates
    min_addresses_across_lr = {}    # model -> index of the learning rate achieving it
    file_name_of_best_model = {}    # model -> run name with the least loss, or None

    for model, losses_per_lr in test_best_model_dict.items():
        # An entry is usable only when it has at least one non-NaN loss.
        usable = [len(runs) != 0 and not np.isnan(runs).all() for runs in losses_per_lr]
        best_values = [np.nanmin(runs) if ok else np.nan for runs, ok in zip(losses_per_lr, usable)]
        best_runs = [np.nanargmin(runs) if ok else np.nan for runs, ok in zip(losses_per_lr, usable)]
        min_values_across_runs[model] = best_values
        min_addresses_across_runs[model] = best_runs

        if np.isnan(best_values).all():
            min_values_across_lr[model] = np.nan
            min_addresses_across_lr[model] = np.nan
        else:
            min_values_across_lr[model] = np.nanmin(best_values)
            min_addresses_across_lr[model] = np.nanargmin(best_values)

    for model in test_best_model_dict:
        lr_index = min_addresses_across_lr[model]
        if np.isnan(lr_index):
            file_name_of_best_model[model] = None
            continue
        names_at_lr = test_best_model_address[model][lr_index]
        run_index = min_addresses_across_runs[model][lr_index]
        file_name_of_best_model[model] = names_at_lr[run_index]

    report = (
        ("all loss values - ", test_best_model_dict),
        ("min_values_across_runs - ", min_values_across_runs),
        ("min_addresses_across_runs - ", min_addresses_across_runs),
        ("min_values_across_lr - ", min_values_across_lr),
        ("min_addresses_across_lr - ", min_addresses_across_lr),
        ("file address - ", file_name_of_best_model),
    )
    for label, content in report:
        print(label, content)
        print("\n----------------------------------------\n")

    return min_values_across_lr, min_addresses_across_lr, min_values_across_runs, min_addresses_across_runs, file_name_of_best_model