-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpre_process.py
119 lines (88 loc) · 3.55 KB
/
pre_process.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import os
import pandas as pd
import argparse
from src.datasets import load_orangepi_data
def get_args_parser():
    """Create the command-line parser for the CPU regression pre-processing.

    The parser is created with ``add_help=False`` and registers two options:
    ``--num`` (int, default 40000) and ``--data_path`` (str, default
    ``./data/cpu_data_custom_adapted.csv``).

    Returns:
        argparse.ArgumentParser: the configured, not-yet-parsed parser.
    """
    cli = argparse.ArgumentParser(prog='CPU regression', add_help=False)

    # (flag, keyword arguments) pairs, registered in declaration order.
    option_table = (
        ('--num', {
            'default': 40000,
            'type': int,
            'help': 'define the total number of samples to use from the dataset - by default its 40k for the CPU dataset',
        }),
        ('--data_path', {
            'default': './data/cpu_data_custom_adapted.csv',
            'type': str,
            'help': 'dataset path',
        }),
    )
    for flag, spec in option_table:
        cli.add_argument(flag, **spec)

    return cli
def main(args):
    """Turn the full dataset into 80/20 train/test CSV files under ``./data``.

    The frame returned by ``load_orangepi_data(args.data_path)`` is reduced to
    its ``Date`` and ``TARGET`` columns (renamed ``timestamp`` / ``target``),
    indexed by timestamp, cast to float32 where the column is not string-like,
    and re-stamped onto a regular one-minute grid that starts at the first
    timestamp. The first 80% of the rows are written to
    ``./data/train_data.csv`` and the rest to ``./data/test_data.csv``; a short
    summary of both splits is printed.

    Args:
        args: parsed command-line namespace; only ``args.data_path`` is read
            here (``args.num`` is not used by this function).

    Returns:
        None.
    """
    df = load_orangepi_data(args.data_path)
    print("full size dataset: ", len(df))

    # Dataset scale noted by the original author:
    #   ~1500 samples for 1 day
    #   ~10k samples for 1 week

    # .copy() detaches the column subset from the loader's frame so the
    # assignments below neither warn about chained assignment nor touch it.
    df = df[["Date", "TARGET"]].copy()
    df.columns = ['timestamp', 'target']
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df.set_index('timestamp', inplace=True)

    for col in df.columns:
        if df[col].dtype != 'object' and not pd.api.types.is_string_dtype(df[col]):
            df[col] = df[col].astype('float32')
    df.index.name = None

    # Replace the recorded timestamps with a gap-free one-minute grid. Asking
    # for exactly len(df) periods gives the same stamps as building the range
    # up to the last timestamp and truncating it, but cannot come up short
    # when the recorded span holds fewer minutes than rows.
    df.index = pd.date_range(df.index[0], periods=len(df), freq="min")

    # Chronological 80/20 split (no shuffling).
    size = int(len(df) * 0.8)
    data_train, data_test = df.iloc[:size], df.iloc[size:]

    data_folder = "./data"
    os.makedirs(data_folder, exist_ok=True)
    data_train.to_csv(os.path.join(data_folder, 'train_data.csv'))
    data_test.to_csv(os.path.join(data_folder, 'test_data.csv'))

    print("=" * 30)
    print("| NEW DATA Data Information |")
    print("=" * 30)
    print("| Training Data: |")
    print("=" * 30)
    print(f"Shape {data_train.shape} - value range: ({data_train['target'].min()} - {data_train['target'].max()})")
    print("-" * 30)
    print("| Testing Data: |")
    print("=" * 30)
    # Report the range of the test split itself (previously the training
    # split's range was printed a second time here).
    print(f"Shape {data_test.shape} - value range: ({data_test['target'].min()} - {data_test['target'].max()})")
    print("-" * 30)

    print("Done!")
def main_op(args):
    """Build 80/20 train/test splits from the last ``args.num`` rows.

    The frame returned by ``load_orangepi_data(args.data_path)`` is cut down
    to its final ``args.num`` rows and to the ``Date`` and ``TARGET`` columns
    (``Date`` renamed ``timestamp``), indexed by timestamp, cast to float32
    where the column is not string-like, and re-stamped onto a regular
    one-minute grid that starts at the first kept timestamp. The first 80% of
    the rows are written to ``./data/train_data.csv`` and the rest to
    ``./data/test_data.csv``; a short summary of both splits is printed.

    Args:
        args: parsed command-line namespace providing ``data_path`` and
            ``num``.

    Returns:
        tuple: ``(data_train, data_test)`` DataFrames with a single
        ``TARGET`` column.
    """
    df = load_orangepi_data(args.data_path)

    # Keep only the most recent rows and the two columns of interest; .copy()
    # detaches the result from the loader's frame so the assignments below
    # neither warn about chained assignment nor touch it.
    df = df.iloc[-args.num:][["Date", "TARGET"]].copy()
    df.columns = ['timestamp', 'TARGET']
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df.set_index('timestamp', inplace=True)

    for col in df.columns:
        if not pd.api.types.is_string_dtype(df[col]):
            df[col] = df[col].astype('float32')
    df.index.name = None

    # Replace the recorded timestamps with a gap-free one-minute grid. Asking
    # for exactly len(df) periods gives the same stamps as building the range
    # up to the last timestamp and truncating it, but cannot come up short
    # when the recorded span holds fewer minutes than rows.
    df.index = pd.date_range(df.index[0], periods=len(df), freq="min")

    # Chronological 80/20 split (no shuffling).
    size = int(len(df) * 0.8)
    data_train, data_test = df.iloc[:size], df.iloc[size:]

    data_folder = "./data"
    os.makedirs(data_folder, exist_ok=True)
    data_train.to_csv(os.path.join(data_folder, 'train_data.csv'))
    data_test.to_csv(os.path.join(data_folder, 'test_data.csv'))

    print("=" * 30)
    print("| NEW DATA Data Information |")
    print("=" * 30)
    print("| Training Data: |")
    print("=" * 30)
    print(f"Shape {data_train.shape} - value range: ({data_train['TARGET'].min()} - {data_train['TARGET'].max()})")
    print("-" * 30)
    print("| Testing Data: |")
    print("=" * 30)
    # Report the range of the test split itself (previously the training
    # split's range was printed a second time here).
    print(f"Shape {data_test.shape} - value range: ({data_test['TARGET'].min()} - {data_test['TARGET'].max()})")
    print("-" * 30)

    return data_train, data_test
if __name__ == '__main__':
    # get_args_parser() builds its parser with add_help=False, so parsing it
    # directly rejects -h/--help as an unknown argument. Using it as a parent
    # of a regular parser keeps the same options and restores the help output.
    cli = argparse.ArgumentParser('CPU regression', parents=[get_args_parser()])
    args = cli.parse_args()
    main(args)