-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathII_dicingerPro_TD.py
138 lines (126 loc) · 6.12 KB
/
II_dicingerPro_TD.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
"""
Compare with somethingElse.py, in dicingerPro_TD.py, deep learning algorithm is adopted. And, before the deep learning
process, dataset is generated for deep learning algorithm. Only trading data is used in this code.
Then, the prepared data, X_train, X_test, y_train, y_test and X_predict (which is used to predict one day ahead) is
transfer to the sthElseDeepLearning.py and for further process and deeplearning.
"""
# # Loading Dataset into a DataLoader ----------------------------------------------------------------------------------
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
import II_somethingElsepreprocessing as sepp
from sklearn.model_selection import train_test_split
from tqdm import tqdm
# Read dataset: the trading-data matrix. Its first column is an unnamed date
# column, which we normalise to 'DATE' so later column selections can rely on it.
TD_all_dataset = pd.read_csv('/home/crjLambda/PRO80/DEEPLEARN/TD_All.csv')
# Use rename() rather than writing through `columns.values`: a pandas Index is
# immutable, and mutating its backing array in place is undefined behaviour.
TD_all_dataset = TD_all_dataset.rename(columns={TD_all_dataset.columns[0]: 'DATE'})
# Also read the indexes (contract ticker reference tables)
TD_indexes = pd.read_csv('/home/crjLambda/PRO80/DailyTDs/ref_TD.csv')
TD_yields_indexes = pd.read_csv('/home/crjLambda/PRO80/DailyTDs/ref_yields.csv')
TD_Currency_indexes = pd.read_csv('/home/crjLambda/PRO80/DailyTDs/ref_Currency.csv')
# And generate wanted dataset: one row holding every ticker of interest.
indexesAll = TD_indexes.join(TD_Currency_indexes, rsuffix='_Currency')
indexesAll = indexesAll.join(TD_yields_indexes, rsuffix='_yields')
indexesAll_ind = indexesAll.iloc[0]
"""
Several newly opened contracts have to be eliminated as it contains too much NAs due to much shorter trading periods.
These includes: PK0, LH0, PG0, SA0, EB0, EG0 and SCM.
"""
# NOTE(review): the note above names SA0 and SCM, but the drop list below
# removes LUM instead — confirm which set is actually intended.
indexesAll_ind.index = indexesAll_ind.values
indexesAll_ind = indexesAll_ind.drop(['PK0', 'LH0', 'PG0', 'EB0', 'EG0', 'LUM'])
def _suffix_dataset(df, contracts, suffix):
    """Return the DATE column of `df` plus one `<contract><suffix>` column per contract."""
    return df[['DATE'] + [contract + suffix for contract in contracts]]

# Dataset of Close
datasetClose = _suffix_dataset(TD_all_dataset, indexesAll_ind, 'Close')
# Dataset of hodrick prescott filter's trend product
datasetTrend = _suffix_dataset(TD_all_dataset, indexesAll_ind, '_hpft')
# Dataset of hodrick prescott filter's cycle product
datasetCycle = _suffix_dataset(TD_all_dataset, indexesAll_ind, '_hpfc')
"""
After generating desired datasets, NAs have to be taken care of. There is one major problem with the data which
generated by the lagging update of the yields. And I am planing to using a 10 days window to impute the NAs within the
yields.
"""
# Close dataset: split off the last 10 rows, whose NAs stem from the lagging
# yield updates, and treat the two parts differently.
datasetClose_upperPart = datasetClose.iloc[:-10]
datasetClose_lowerPart = datasetClose.iloc[-10:]
# Drop NAs of the upperPart: older history simply discards incomplete rows.
datasetClose_upperPart_dropna = datasetClose_upperPart.dropna(axis=0)
# Impute the lowerPart NAs with a 2-neighbour KNN imputer. DATE is split off
# first because KNNImputer accepts only numeric input.
imputed_data_DATE = pd.DataFrame(datasetClose_lowerPart['DATE'])
imputed_data_noDATE = datasetClose_lowerPart.drop(columns=['DATE'])
imr = KNNImputer(n_neighbors=2, weights='uniform')
imputed_data = imr.fit_transform(imputed_data_noDATE.values)
imputed_data_noDATE = pd.DataFrame(imputed_data,
                                   columns=imputed_data_noDATE.columns,
                                   index=imputed_data_DATE.index)
datasetClose_lowerPart_imputedna = imputed_data_DATE.join(imputed_data_noDATE)
# DataFrame.append was removed in pandas 2.0 — concatenate the parts instead.
datasetClose = pd.concat([datasetClose_upperPart_dropna,
                          datasetClose_lowerPart_imputedna])
# Date of the most recent row; tags the one-day-ahead prediction downstream.
X_predict_DATE = pd.to_datetime(datasetClose_lowerPart['DATE'].tail(1).values).date
"""
Currently, I am taking only Close dataset to be the target features. Nevertheless, log return will be generated among
through the whole dataset. And I am also trying to generate more suitable dataset with more transform technic like one
hot.
"""
# Candidate target encodings derived from the imputed Close dataset.
# Only dataClose_onehot is consumed by the training loop below; the log-return
# and 4-class variants are kept for experimentation.
dataClose_logr = sepp.generate_logr(datasetClose)
dataClose_onehot = sepp.generate_onehot(datasetClose)
dataClose_multiclasses = sepp.generate_MultiClassesLabel(datasetClose, classes=4)
"""
Then it comes to the thinking that how can I predict the future? One way is using today's features for tomorrow's label.
Which means I have to use one day lagging data for today's label target.
"""
# Ticker baskets of interest: three overlapping groups keyed by their lead contract.
indexWanted_CU0 = ['CU0', 'P0', 'Y0', 'AG0', 'BU0', 'ZN0', 'C0', 'AL0', 'RM0', 'M0', 'CF0']
indexWanted_RB0 = ['RB0', 'HC0', 'I0', 'V0', 'BU0', 'JM0', 'UR0', 'FG0', 'MA0', 'SA0', 'SR0']
indexWanted_SCM = ['SCM', 'AU0', 'PG0', 'EB0', 'FU0', 'TA0', 'PP0', 'L0', 'V0', 'LUM', 'RU0']
# Deduplicate (np.unique also sorts) and drop the short-history contracts.
_excluded = {'PK0', 'LH0', 'PG0', 'EB0', 'EG0', 'LUM'}
indexList = [ind for ind in np.unique(indexWanted_CU0 + indexWanted_RB0 + indexWanted_SCM)
             if ind not in _excluded]
# Result sheet seed: one row per model type, each stamped with the prediction date.
outputs_df = pd.DataFrame({'DATE': [X_predict_DATE] * 3,
                           'MODULE': ['linear', 'CNN', 'RNN']})
print(X_predict_DATE)
from BK import sthElseDeepLearning  # NOTE(review): deferred import — consider moving to the top of the file
# Train one linear, one CNN and one RNN model per contract; each model
# contributes an (accuracy, prediction) column pair to outputs_df.
for contract in tqdm(indexList, ncols=100, desc="somethingElse", colour="blue"):
    ind = [contract]
    label_onehot_oneDay, feature_onehot_oneDay = sepp.oneDaylagging(dataClose_onehot, ind, suffix='Close')
    X, y = feature_onehot_oneDay, label_onehot_oneDay
    # Stratified split keeps the label balance; fixed seed for reproducibility.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=0)
    X_predict = sepp.oneDay_pred_feature(dataClose_onehot, ind, suffix='Close')
    sthedl = sthElseDeepLearning.somethingElseDeepLearning(X_train, X_test, y_train, y_test, X_predict,
                                                           contract, X_predict_DATE)
    output = [
        sthedl.linearModel_train(learning_rate=0.001, num_epochs=200),
        sthedl.CNN_model_train(learning_rate=0.001, num_epochs=50),
        sthedl.RNN_model_train(learning_rate=0.001, num_epochs=200),
    ]
    output_df = pd.DataFrame(output, columns=['_'.join([contract, j]) for j in ['acc.', 'y']])
    outputs_df = outputs_df.join(output_df)
"""
And Now saving the results for finalProba project.
"""
import os
from pathlib import Path
dataDirName = "finalProba/allProba"
outputs_filename = Path(Path(os.getcwd()).parents[1], dataDirName, "dicingerpro_TD.csv")
outputs_df.to_csv(outputs_filename)
print(outputs_df)
# I tried paddlepaddle as well
"""
import sthElseDeepLearning_paddle
sthedl_paddle = sthElseDeepLearning_paddle.somethingElseDeepLearning(
X_train, X_test, y_train, y_test, X_predict, indexWanted[0], X_predict_DATE
)
sthedl_paddle.linerModel_train()
print('\nDone!\n')
"""