-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathclean_data.py
114 lines (95 loc) · 4.67 KB
/
clean_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
"""
clearn the data and construct the data for training
return csv files for each bus including the following columns:
Features (have been normalized for each bus):
'Weekday_sin', 'Weekday_cos', 'Hour_sin', 'Hour_cos', 'Temperature (k)', 'Shortwave Radiation (w/m2)',
'Longwave Radiation (w/m2)', 'Zonal Wind Speed (m/s)', 'Meridional Wind Speed (m/s)', 'Wind Speed (m/s)',
Target (is not normalized)):
'Load'
"""
import pandas as pd
from tqdm import trange
import numpy as np
import datetime
import os
import hydra
from omegaconf import DictConfig
def _select_least_correlated_buses(load_corr, no_bus, no_bus_total):
    """Greedily pick ``no_bus`` buses whose loads are weakly correlated.

    Every bus in ``range(no_bus_total)`` is tried as a seed. Starting from the
    seed, the not-yet-chosen bus with the smallest summed correlation to the
    buses chosen so far is added until ``no_bus`` buses are collected. The
    candidate set whose correlation sub-matrix has the smallest mean wins.

    Args:
        load_corr: square correlation matrix between bus loads.
        no_bus: number of buses to select.
        no_bus_total: number of seed buses to try (rows of ``load_corr``).

    Returns:
        List of selected bus indices (positions in ``load_corr``).
    """
    candidate_sets = []
    candidate_scores = []
    for seed in range(no_bus_total):
        chosen = [seed]
        for _ in range(1, no_bus):
            # sum of correlation with all the previously chosen buses
            summed_corr = np.sum(load_corr[chosen, :], axis=0)
            # Rank once per step, then take the lowest-ranked bus that has
            # not been picked yet.
            ranking = np.argsort(summed_corr)
            chosen.append(next(idx for idx in ranking if idx not in chosen))
        candidate_sets.append(chosen)
        candidate_scores.append(load_corr[chosen, :][:, chosen].mean())
    # argsort (rather than argmin) keeps NaN scores ranked last.
    return candidate_sets[np.argsort(candidate_scores)[0]]


@hydra.main(version_base=None, config_path="conf", config_name="config")
def main(cfg: DictConfig) -> None:
    """Build and save one training csv per selected bus.

    Reads the daily load files and the daily climate workbooks under
    ``data/Data_public``, selects ``cfg.case.no_bus`` weakly correlated buses,
    standardizes their climate columns, appends sin/cos encodings of hour and
    weekday plus the raw load, and writes ``bus_<index>.csv`` files into
    ``data/data_case<no_bus>/``.

    Args:
        cfg: hydra configuration; only ``cfg.case.no_bus`` is read here.

    Raises:
        ValueError: if more buses are requested than ``NO_BUS_TOTAL``.
    """
    SAVE_DIR = f'data/data_case{cfg.case.no_bus}/'
    NO_DAY = 365
    NO_BUS_TOTAL = 123
    NO_BUS = cfg.case.no_bus  # number of buses to be selected (should be larger than the targeted number of buses)
    NO_HOUR = 24

    print('Cleaning data for case: ', cfg.case.no_bus)

    # Fail early with a clear message: the greedy search below cannot pick
    # more distinct buses than exist.
    if NO_BUS > NO_BUS_TOTAL:
        raise ValueError(
            f'cfg.case.no_bus={NO_BUS} exceeds the number of available buses ({NO_BUS_TOTAL})'
        )

    # collect load by bus: one file per day, stacked along the time axis
    daily_loads = [
        pd.read_csv(f'data/Data_public/load_2019/load_annual_D{day}.txt', sep=" ", header=None)
        for day in trange(1, NO_DAY + 1, desc='Loading load data')
    ]
    load_all = pd.concat(daily_loads, axis=0, ignore_index=True)

    # find the buses that are most uncorrelated
    load_corr = np.corrcoef(load_all.values.T)
    BUS_INDEX = _select_least_correlated_buses(load_corr, NO_BUS, NO_BUS_TOTAL)  # the selected bus index

    # Collect the hourly climate row of every selected bus and concatenate
    # once per bus at the end, instead of growing a DataFrame inside the loop.
    sheet_names = [f'Hour {h}' for h in range(1, NO_HOUR + 1)]
    climate_rows = {bus: [] for bus in BUS_INDEX}
    for day in trange(1, NO_DAY + 1, desc='Loading climate data'):
        # The files carry a .csv suffix but are opened as Excel workbooks
        # with one sheet per hour; the context manager closes each workbook.
        with pd.ExcelFile(f"data/Data_public/Climate_2019/climate_2019_Day{day}.csv") as workbook:
            for sheet in sheet_names:
                hourly = workbook.parse(sheet)
                for bus in BUS_INDEX:
                    # NOTE(review): BUS_INDEX holds 0-based positions taken from
                    # the load correlation matrix, while `bus - 1` treats them as
                    # 1-based; for bus == 0 this slice is empty. Behaviour is left
                    # unchanged - confirm the load-column / climate-row mapping.
                    climate_rows[bus].append(hourly.iloc[bus - 1:bus])
    climate_dict = {
        bus: pd.concat(rows, ignore_index=True, axis=0)
        for bus, rows in climate_rows.items()
    }

    # remove bus index and normalize the climate data
    for bus in BUS_INDEX:
        features = climate_dict[bus].drop(columns=['Bus'])
        # standardize (zero mean, unit standard deviation per column)
        climate_dict[bus] = (features - features.mean()) / features.std()

    # add weekday information for each bus
    start_weekday = datetime.datetime(2019, 1, 1).weekday()
    # Weekday sequence of one week, rotated so it starts on the weekday of Jan 1st.
    one_week = np.concatenate([np.arange(start_weekday, 7), np.arange(0, start_weekday)])
    hour = np.tile(np.arange(1, NO_HOUR + 1), NO_DAY)
    weeks_needed = -(-NO_DAY // 7)  # ceil(NO_DAY / 7) so the tiling covers the whole year
    weekday = np.tile(np.repeat(one_week, NO_HOUR), weeks_needed)[:NO_DAY * NO_HOUR]

    # Cyclical encodings so the last and first hour / weekday end up adjacent.
    hour_sin = np.sin(2 * np.pi * (hour / NO_HOUR))
    hour_cos = np.cos(2 * np.pi * (hour / NO_HOUR))
    weekday_sin = np.sin(2 * np.pi * (weekday / 7))
    weekday_cos = np.cos(2 * np.pi * (weekday / 7))

    # change the order of the columns
    FEATURE_COLUMNS = ['Weekday_sin', 'Weekday_cos', 'Hour_sin', 'Hour_cos', 'Temperature (k)', 'Shortwave Radiation (w/m2)',
                       'Longwave Radiation (w/m2)', 'Zonal Wind Speed (m/s)',
                       'Meridional Wind Speed (m/s)', 'Wind Speed (m/s)']
    TARGET_COLUMN = ['Load']

    for bus in BUS_INDEX:
        climate_dict[bus]['Hour_sin'] = hour_sin
        climate_dict[bus]['Hour_cos'] = hour_cos
        climate_dict[bus]['Weekday_sin'] = weekday_sin
        climate_dict[bus]['Weekday_cos'] = weekday_cos
        # The target is the raw load column of this bus (not normalized).
        climate_dict[bus]['Load'] = load_all[bus]
        climate_dict[bus] = climate_dict[bus][FEATURE_COLUMNS + TARGET_COLUMN]
        climate_dict[bus].reset_index(drop=True, inplace=True)

    # exist_ok avoids the check-then-create race of os.path.exists + makedirs
    os.makedirs(SAVE_DIR, exist_ok=True)

    for bus in BUS_INDEX:
        climate_dict[bus].to_csv(SAVE_DIR + f'bus_{bus}.csv', index=False)


if __name__ == '__main__':
    main()