-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathII_dicingerPro_TD.py
138 lines (126 loc) · 6.12 KB
/
II_dicingerPro_TD.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
"""
Compare with somethingElse.py, in dicingerPro_TD.py, deep learning algorithm is adopted. And, before the deep learning
process, dataset is generated for deep learning algorithm. Only trading data is used in this code.
Then, the prepared data, X_train, X_test, y_train, y_test and X_predict (which is used to predict one day ahead) is
transfer to the sthElseDeepLearning.py and for further process and deeplearning.
"""
# # Loading Dataset into a DataLoader ----------------------------------------------------------------------------------
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
import II_somethingElsepreprocessing as sepp
from sklearn.model_selection import train_test_split
from tqdm import tqdm
# Read dataset: the trading-data matrix. Its first column is an unnamed date
# column, which we normalise to 'DATE' so later column selections can rely on it.
TD_all_dataset = pd.read_csv('/home/crjLambda/PRO80/DEEPLEARN/TD_All.csv')
# Use rename() rather than writing through `columns.values`: a pandas Index is
# immutable, and mutating its backing array in place is undefined behaviour.
TD_all_dataset = TD_all_dataset.rename(columns={TD_all_dataset.columns[0]: 'DATE'})
# Also read the indexes (contract ticker reference tables)
TD_indexes = pd.read_csv('/home/crjLambda/PRO80/DailyTDs/ref_TD.csv')
TD_yields_indexes = pd.read_csv('/home/crjLambda/PRO80/DailyTDs/ref_yields.csv')
TD_Currency_indexes = pd.read_csv('/home/crjLambda/PRO80/DailyTDs/ref_Currency.csv')
# And generate wanted dataset: one row holding every ticker of interest.
indexesAll = TD_indexes.join(TD_Currency_indexes, rsuffix='_Currency')
indexesAll = indexesAll.join(TD_yields_indexes, rsuffix='_yields')
indexesAll_ind = indexesAll.iloc[0]
"""
Several newly opened contracts have to be eliminated as it contains too much NAs due to much shorter trading periods.
These includes: PK0, LH0, PG0, SA0, EB0, EG0 and SCM.
"""
# NOTE(review): the note above names SA0 and SCM, but the drop list below
# removes LUM instead — confirm which set is actually intended.
indexesAll_ind.index = indexesAll_ind.values
indexesAll_ind = indexesAll_ind.drop(['PK0', 'LH0', 'PG0', 'EB0', 'EG0', 'LUM'])
def _suffix_dataset(df, contracts, suffix):
    """Return the DATE column of `df` plus one `<contract><suffix>` column per contract."""
    return df[['DATE'] + [contract + suffix for contract in contracts]]

# Dataset of Close
datasetClose = _suffix_dataset(TD_all_dataset, indexesAll_ind, 'Close')
# Dataset of hodrick prescott filter's trend product
datasetTrend = _suffix_dataset(TD_all_dataset, indexesAll_ind, '_hpft')
# Dataset of hodrick prescott filter's cycle product
datasetCycle = _suffix_dataset(TD_all_dataset, indexesAll_ind, '_hpfc')
"""
After generating desired datasets, NAs have to be taken care of. There is one major problem with the data which
generated by the lagging update of the yields. And I am planing to using a 10 days window to impute the NAs within the
yields.
"""
# Close dataset: split off the last 10 rows, whose NAs stem from the lagging
# yield updates, and treat the two parts differently.
datasetClose_upperPart = datasetClose.iloc[:-10]
datasetClose_lowerPart = datasetClose.iloc[-10:]
# Drop NAs of the upperPart: older history simply discards incomplete rows.
datasetClose_upperPart_dropna = datasetClose_upperPart.dropna(axis=0)
# Impute the lowerPart NAs with a 2-neighbour KNN imputer. DATE is split off
# first because KNNImputer accepts only numeric input.
imputed_data_DATE = pd.DataFrame(datasetClose_lowerPart['DATE'])
imputed_data_noDATE = datasetClose_lowerPart.drop(columns=['DATE'])
imr = KNNImputer(n_neighbors=2, weights='uniform')
imputed_data = imr.fit_transform(imputed_data_noDATE.values)
imputed_data_noDATE = pd.DataFrame(imputed_data,
                                   columns=imputed_data_noDATE.columns,
                                   index=imputed_data_DATE.index)
datasetClose_lowerPart_imputedna = imputed_data_DATE.join(imputed_data_noDATE)
# DataFrame.append was removed in pandas 2.0 — concatenate the parts instead.
datasetClose = pd.concat([datasetClose_upperPart_dropna,
                          datasetClose_lowerPart_imputedna])
# Date of the most recent row; tags the one-day-ahead prediction downstream.
X_predict_DATE = pd.to_datetime(datasetClose_lowerPart['DATE'].tail(1).values).date
"""
Currently, I am taking only Close dataset to be the target features. Nevertheless, log return will be generated among
through the whole dataset. And I am also trying to generate more suitable dataset with more transform technic like one
hot.
"""
# Candidate target encodings derived from the imputed Close dataset.
# Only dataClose_onehot is consumed by the training loop below; the log-return
# and 4-class variants are kept for experimentation.
dataClose_logr = sepp.generate_logr(datasetClose)
dataClose_onehot = sepp.generate_onehot(datasetClose)
dataClose_multiclasses = sepp.generate_MultiClassesLabel(datasetClose, classes=4)
"""
Then it comes to the thinking that how can I predict the future? One way is using today's features for tomorrow's label.
Which means I have to use one day lagging data for today's label target.
"""
# Ticker baskets of interest: three overlapping groups keyed by their lead contract.
indexWanted_CU0 = ['CU0', 'P0', 'Y0', 'AG0', 'BU0', 'ZN0', 'C0', 'AL0', 'RM0', 'M0', 'CF0']
indexWanted_RB0 = ['RB0', 'HC0', 'I0', 'V0', 'BU0', 'JM0', 'UR0', 'FG0', 'MA0', 'SA0', 'SR0']
indexWanted_SCM = ['SCM', 'AU0', 'PG0', 'EB0', 'FU0', 'TA0', 'PP0', 'L0', 'V0', 'LUM', 'RU0']
# Deduplicate (np.unique also sorts) and drop the short-history contracts.
_excluded = {'PK0', 'LH0', 'PG0', 'EB0', 'EG0', 'LUM'}
indexList = [ind for ind in np.unique(indexWanted_CU0 + indexWanted_RB0 + indexWanted_SCM)
             if ind not in _excluded]
# Result sheet seed: one row per model type, each stamped with the prediction date.
outputs_df = pd.DataFrame({'DATE': [X_predict_DATE] * 3,
                           'MODULE': ['linear', 'CNN', 'RNN']})
print(X_predict_DATE)
from BK import sthElseDeepLearning  # NOTE(review): deferred import — consider moving to the top of the file
# Train one linear, one CNN and one RNN model per contract; each model
# contributes an (accuracy, prediction) column pair to outputs_df.
for contract in tqdm(indexList, ncols=100, desc="somethingElse", colour="blue"):
    ind = [contract]
    label_onehot_oneDay, feature_onehot_oneDay = sepp.oneDaylagging(dataClose_onehot, ind, suffix='Close')
    X, y = feature_onehot_oneDay, label_onehot_oneDay
    # Stratified split keeps the label balance; fixed seed for reproducibility.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=0)
    X_predict = sepp.oneDay_pred_feature(dataClose_onehot, ind, suffix='Close')
    sthedl = sthElseDeepLearning.somethingElseDeepLearning(X_train, X_test, y_train, y_test, X_predict,
                                                           contract, X_predict_DATE)
    output = [
        sthedl.linearModel_train(learning_rate=0.001, num_epochs=200),
        sthedl.CNN_model_train(learning_rate=0.001, num_epochs=50),
        sthedl.RNN_model_train(learning_rate=0.001, num_epochs=200),
    ]
    output_df = pd.DataFrame(output, columns=['_'.join([contract, j]) for j in ['acc.', 'y']])
    outputs_df = outputs_df.join(output_df)
"""
And Now saving the results for finalProba project.
"""
import os
from pathlib import Path
dataDirName = "finalProba/allProba"
outputs_filename = Path(Path(os.getcwd()).parents[1], dataDirName, "dicingerpro_TD.csv")
outputs_df.to_csv(outputs_filename)
print(outputs_df)
# I tried paddlepaddle as well
"""
import sthElseDeepLearning_paddle
sthedl_paddle = sthElseDeepLearning_paddle.somethingElseDeepLearning(
X_train, X_test, y_train, y_test, X_predict, indexWanted[0], X_predict_DATE
)
sthedl_paddle.linerModel_train()
print('\nDone!\n')
"""