-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdiagnostics.py
108 lines (86 loc) · 3.51 KB
/
diagnostics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import pandas as pd
#import numpy as np
import timeit
import os
# import json
import commons_proj as cproj
import subprocess
#%% Functions
# read the deployed model and a test dataset, calculate predictions
def model_predictions(test_df):
    """Predict on ``test_df`` with the deployed model.

    The model is fetched with
    ``cproj.load_object('prod_deployment_path', 'trainedmodel.pkl')`` and
    the dataframe is split into inputs and target by ``cproj.prepare_data``
    using ``cproj.input_features`` and ``cproj.output_feature``.

    Returns:
        A ``(predictions, y)`` tuple: the output of ``model.predict`` on the
        prepared inputs, and the target values returned by ``prepare_data``.
    """
    deployed_model = cproj.load_object('prod_deployment_path', 'trainedmodel.pkl')
    features, target = cproj.prepare_data(
        test_df, cproj.input_features, cproj.output_feature
    )
    return deployed_model.predict(features), target
# get NA-percents for all columns in a dataframe
def na_percent_summary():
    """Report the percentage of missing values per column of the final data.

    The dataframe is loaded with
    ``cproj.load_dataframe('output_folder_path', 'finaldata.csv')``.

    Returns:
        A DataFrame indexed by the loaded frame's column names with a single
        ``percent_missing`` column (null count * 100 / number of rows).
    """
    data = cproj.load_dataframe('output_folder_path', 'finaldata.csv')
    null_counts = data.isnull().sum()
    share_missing = null_counts * 100 / len(data)
    # The Series index already carries the column names, so the frame is
    # built directly from it.
    return pd.DataFrame({'percent_missing': share_missing})
# get numeric-inputs statistics
def numeric_inputs_stats(df, numeric_cols):
    """Compute min, max, mean, median and std for selected columns.

    Args:
        df: DataFrame holding the columns to summarise.
        numeric_cols: Iterable of column names to aggregate.

    Returns:
        The result of ``df.agg`` called with a mapping of every requested
        column to the five statistic names.
    """
    requested = {
        column: ['min', 'max', 'mean', 'median', 'std']
        for column in numeric_cols
    }
    return df.agg(requested)
# get summary statistics
def numeric_inputs_summary():
    """Summarise the model's input and output columns of the final data.

    Loads the dataframe with
    ``cproj.load_dataframe('output_folder_path', 'finaldata.csv')`` and
    passes it to ``numeric_inputs_stats`` together with
    ``cproj.input_features`` followed by ``cproj.output_feature``.

    Returns:
        The statistics frame produced by ``numeric_inputs_stats``.
    """
    dataset = cproj.load_dataframe('output_folder_path', 'finaldata.csv')
    # Build a new list so cproj.input_features itself is never mutated
    # (assumes it is a plain list -- TODO confirm).
    columns_of_interest = [*cproj.input_features, cproj.output_feature]
    return numeric_inputs_stats(dataset, columns_of_interest)
#%% Function to get timings
def execute_script(script_file):
    """Run a Python script with ``python3`` and time it.

    The script is started through ``subprocess.run`` with an argument list,
    so no shell parses the command: a path containing spaces or shell
    metacharacters reaches the interpreter as one argument.

    Args:
        script_file: Path of the script to execute.

    Returns:
        Elapsed wall-clock seconds as a float rounded to 7 decimals. The
        script's exit status is not inspected, so a failing script is still
        timed.

    Raises:
        OSError: If the ``python3`` executable cannot be launched.
    """
    started = timeit.default_timer()
    # check=False: a non-zero exit status is ignored and the run is still
    # timed rather than raising.
    subprocess.run(['python3', script_file], check=False)
    elapsed = timeit.default_timer() - started
    return round(elapsed, 7)
def execution_time():
    """Time the ingestion and training scripts.

    Returns:
        A dict mapping ``'ingestion.py'`` and ``'training.py'`` (in that
        order) to a one-element list holding the duration reported by
        ``execute_script`` for that script.
    """
    return {
        script: [execute_script(script)]
        for script in ('ingestion.py', 'training.py')
    }
#%% Function to check dependencies
def outdated_packages_list():
    """List installed packages that have a newer release available.

    Runs ``pip list --outdated --format=columns``, stores the raw report in
    ``outdated.txt`` in the current working directory, and parses that
    report into a DataFrame.

    Returns:
        A DataFrame built from pip's column report with the ``Type`` column
        removed. When pip reports nothing, an empty DataFrame with the
        columns ``Package``, ``Version`` and ``Latest`` is returned.

    Raises:
        subprocess.CalledProcessError: If pip exits with a non-zero status.
    """
    raw_report = subprocess.check_output(
        ['pip', 'list', '--outdated', '--format=columns']
    )
    # The report file is still written on every call, including when it is
    # empty.
    with open('outdated.txt', 'wb') as f:
        f.write(raw_report)

    # pip prints nothing at all when every package is up to date; read_csv
    # would raise EmptyDataError on such a file.
    if not raw_report.strip():
        return pd.DataFrame(columns=['Package', 'Version', 'Latest'])

    # skiprows=[1] drops the second line of the report (the dashed rule pip
    # prints under the header).
    df = pd.read_csv('outdated.txt', sep=r'\s+', skiprows=[1])
    # errors='ignore' keeps the call from failing if the report has no
    # 'Type' column.
    return df.drop(columns=['Type'], errors='ignore')
# def get_package_versions(package):
# import pkg_resources
# import requests
# installed_version = pkg_resources.get_distribution(package).version
# response = requests.get(f'https://pypi.org/pypi/{package}/json')
# latest_version = response.json()['info']['version']
# return package, installed_version, latest_version
#%%
if __name__ == '__main__':
    fname = 'diagnostics.py'
    print(f"- {fname}. -->")

    # Run every diagnostic first, in this exact order: execution_time()
    # launches ingestion.py and training.py, so it stays after the
    # summaries that read finaldata.csv.
    test_df = cproj.load_dataframe('test_data_path', 'testdata.csv')
    predicted, _ = model_predictions(test_df)
    numeric_inputs = numeric_inputs_summary()
    na_percent = na_percent_summary()
    exec_time = execution_time()
    outdated_df = outdated_packages_list()
    # Dict form of the outdated-packages table; not printed below.
    outdated_dict = outdated_df.to_dict()

    # Report all results once everything has been computed.
    print(f"- {fname}. model_predictions:\n{predicted}")
    print(f"- {fname}. numeric_inputs_stats:\n{numeric_inputs}")
    print(f"- {fname}. na_percent:\n{na_percent}")
    print(f"- {fname}. exec_time:\n{exec_time}")
    print(f"- {fname}. outdated_df:\n{outdated_df}")

    print(f"- {fname}. <--")