diff --git a/OneShotFeatureGenerator.py b/OneShotFeatureGenerator.py
index 5a4effb..6707d96 100644
--- a/OneShotFeatureGenerator.py
+++ b/OneShotFeatureGenerator.py
@@ -56,10 +56,10 @@ def _autoencode(features):
     # decoder_layer = autoencoder.layers[-1]
     # decoder = Model(encoded_input, decoder_layer(encoded_input))
 
-    autoencoder.compile(optimizer='adam', loss='MSE')
+    autoencoder.compile(optimizer='adadelta', loss='MSE')
 
     autoencoder.fit(features, features,
-                    epochs=20,
+                    epochs=20,  # tried 20
                     batch_size=256,
                     shuffle=True,
                     verbose=False)
@@ -269,19 +269,19 @@ def _count_action_for_voter(self, action, voter_df):
 
         return action_counter
 
-    def _generate_A_ratios(self, df, X_train, y_train ,voter):
+    def _generate_A_ratios(self, df, X_train, y_train, voter_index):
         """Generate A ratios - That is TRT-ratio, CMP-ratio, WLB-ratio, SLB-ratio, DOM-ratio
         Action is in {TRT,DLB,SLB,WLB,CMP,DOM}
         Scenario is in {A,B,C,D,E,F}
         """
-        voter_df = pd.concat([X_train.loc[X_train['VoterID'] == voter.VoterID,] , y_train], axis=1, join='inner')
+        voter_df = pd.concat([X_train.loc[X_train.index & voter_index], y_train], axis=1, join='inner')
 
         for action in self._get_actions():
            availability_counter = np.count_nonzero([x[1].Scenario in self._get_scenarios_by_action(action) for x in voter_df.iterrows()])
            action_counter = self._count_action_for_voter(action, voter_df)
 
-            df.loc[df['VoterID'] == voter.VoterID, action + '-ratio'] = float(action_counter/availability_counter if availability_counter > 0 else 0)
-            df.loc[df['VoterID'] == voter.VoterID, action + '-counter'] = float(action_counter)
+            df.loc[voter_index, action + '-ratio'] = float(action_counter/availability_counter if availability_counter > 0 else 0)
+            df.loc[voter_index, action + '-counter'] = float(action_counter)
 
         return df
@@ -302,22 +302,29 @@ def _generate_voter_type(self, df):
 
-    def _generate_feature_aggregation_class_dependant(self, df, X_train, y_train, scenarios, voter, feature_name, aggregation_func):
+    def _generate_feature_aggregation_class_dependant(self, df, X_train, y_train, scenarios, voter_index, feature_name, aggregation_func):
         X = df
-        X_train, y_train = X_train.loc[X_train['Scenario'].isin(scenarios)], y_train.loc[X_train['Scenario'].isin(scenarios)]
+        #X_train, y_train = X_train.loc[X_train['Scenario'].isin(scenarios)], y_train.loc[X_train['Scenario'].isin(scenarios)]
         #X_train, y_train = X_train, y_train #X.drop([self.target_index], axis=1),X[self.target_index]
 
-        for action in range(1, self.n_candidates + 1):
-            actioni_list = [float(x[1][feature_name]) for x in
-                            X_train.loc[(X_train['VoterID'] == voter.VoterID) & (y_train == action)].iterrows()]
-            if len(actioni_list) > 0:
-                X.loc[X['VoterID'] == voter.VoterID, feature_name + '_action'+ str(action) + '_' + aggregation_func.__name__] = aggregation_func(
-                    actioni_list)
+        voter_train = X_train.loc[X_train.index & voter_index]
+        voter_train = voter_train.loc[voter_train["Scenario"].isin(scenarios)]
+        voter_targets = y_train.loc[voter_train.index]
+        if len(voter_train) > 0:
+            for action in range(1, self.n_candidates + 1):
+                actioni_list = [float(x[1][feature_name]) for x in
+                                voter_train.loc[voter_targets == action,:].iterrows()]
+                if len(actioni_list) > 0:
+                    X.loc[voter_index, feature_name + '_action'+ str(action) + '_' + aggregation_func.__name__] = aggregation_func(
+                        actioni_list)
 
         return X
 
-    def _generate_action_aggregation_features(self, df, X_train, y_train, voter):
+    def _generate_action_aggregation_features(self, df, X_train, y_train, voter_index):
         X = df
         aggregators = [np.average,
                        np.std, np.median]
@@ -325,17 +332,17 @@ def _generate_action_aggregation_features(self, df, X_train, y_train, voter):
 
         scenarios = self._get_scenarios_by_actions(self._get_strategic_actions())
-        X_train, y_train = X_train.loc[X_train['Scenario'].isin(scenarios)], y_train.loc[X_train['Scenario'].isin(scenarios)]
-
-        voter_train = X_train.loc[(X_train['VoterID'] == voter.VoterID)]
+        voter_train = X_train.loc[X_train.index & voter_index]
+        voter_train = voter_train.loc[voter_train["Scenario"].isin(scenarios)]
+        voter_targets = y_train.loc[voter_train.index]
 
         for aggregation_func in aggregators:
-            X.loc[X['VoterID'] == voter.VoterID, feature_name + "_" + aggregation_func.__name__] = aggregation_func(
-                [float(y_train[x[0]]) for x in voter_train.iterrows()])
+            X.loc[voter_index, feature_name + "_" + aggregation_func.__name__] = aggregation_func(
+                [float(voter_targets[x[0]]) for x in voter_train.iterrows()])
 
         return X
 
-    def _generate_gaps_features(self, df, X_train, y_train, voter):
+    def _generate_gaps_features(self, df, X_train, y_train, voter_index):
         X = df
 
         features = self._get_gap_pref_features()
@@ -344,7 +351,7 @@ def _generate_gaps_features(self, df, X_train, y_train, voter):
 
         for aggregator in aggregators:
             for feature in features:
-                X = self._generate_feature_aggregation_class_dependant(X, X_train, y_train, scenarios, voter, feature, aggregator)
+                X = self._generate_feature_aggregation_class_dependant(X, X_train, y_train, scenarios, voter_index, feature, aggregator)
 
         return X
 
@@ -369,18 +376,19 @@ def _dynamic_feature_generation(self, df, X_train, y_train):
         a_ratio_columns, gaps_columns = [], []
         all_voters = pd.DataFrame(X["VoterID"].drop_duplicates())
         for voter in all_voters.iterrows():
+            voter_index = X.loc[X['VoterID'] == voter[1].VoterID,].index
             before_columns = len(X.columns)
-            X = self._generate_A_ratios(X, X_train, y_train, voter[1])
+            X = self._generate_A_ratios(X, X_train, y_train, voter_index)
             if len(a_ratio_columns) == 0:
                 a_ratio_columns = list(range(before_columns, len(X.columns)))
 
             before_columns = len(X.columns)
-            X = self._generate_gaps_features(X, X_train, y_train, voter[1])
+            X = self._generate_gaps_features(X, X_train, y_train, voter_index)
             if len(gaps_columns) == 0:
                 gaps_columns = list(range(before_columns, len(X.columns)))
 
-            X = self._generate_action_aggregation_features(X, X_train, y_train, voter[1])
+            X = self._generate_action_aggregation_features(X, X_train, y_train, voter_index)
 
         # Gaps features encoding
         X = X.fillna(
@@ -402,31 +410,31 @@ def _dynamic_feature_generation(self, df, X_train, y_train):
 
         normalized_gap_fs = pd.DataFrame(preprocessing.normalize(OneShotDataPreparation._prepare_dataset(X.iloc[:, total_gaps_columns])))
 
-        #Try auto encode each voter separately
-        # encoded_gap_fs = pd.DataFrame()
-        #
-        # for voter in all_voters.iterrows():
-        #     voter_index = X.loc[X['VoterID'] == voter[1].VoterID].index
-        #     voter_encoded_gap_fs = pd.DataFrame(_autoencode(normalized_gap_fs.iloc[voter_index.tolist(),:]))
-        #     voter_encoded_gap_fs.index = voter_index
-        #
-        #     # aggregate results
-        #     if len(encoded_gap_fs) == 0:
-        #         encoded_gap_fs = pd.DataFrame(voter_encoded_gap_fs)
-        #     else:
-        #         encoded_gap_fs = pd.concat([encoded_gap_fs, pd.DataFrame(voter_encoded_gap_fs)])
-        #
-        # encoded_gap_fs = pd.DataFrame(encoded_gap_fs)
-        #
-        # X = pd.concat([X, encoded_gap_fs], axis=1, join='inner')
-
-
         encoded_gap_fs = pd.DataFrame(_autoencode(normalized_gap_fs))
+        encoded_gap_fs.index = X.index
+        X = pd.concat([X, encoded_gap_fs], axis=1, join='inner')
 
-        X = pd.concat([X, encoded_gap_fs], axis=1, join='inner')
+        # #Try auto encode each voter separately
+        # # encoded_gap_fs = pd.DataFrame()
+        # #
+        # # for voter in all_voters.iterrows():
+        # #     voter_index = X.loc[X['VoterID'] == voter[1].VoterID].index
+        # #     voter_encoded_gap_fs = pd.DataFrame(_autoencode(normalized_gap_fs.iloc[voter_index.tolist(),:]))
+        # #     voter_encoded_gap_fs.index = voter_index
+        # #
+        # #     # aggregate results
+        # #     if len(encoded_gap_fs) == 0:
+        # #         encoded_gap_fs = pd.DataFrame(voter_encoded_gap_fs)
+        # #     else:
+        # #         encoded_gap_fs = pd.concat([encoded_gap_fs, pd.DataFrame(voter_encoded_gap_fs)])
+        # #
+        # # encoded_gap_fs = pd.DataFrame(encoded_gap_fs)
+        # #
+        # # X = pd.concat([X, encoded_gap_fs], axis=1, join='inner')
+        #
 
         #X = X.drop(X.columns[gaps_columns + gaps_dif_columns], axis=1)
@@ -439,29 +447,15 @@ def _dynamic_feature_generation(self, df, X_train, y_train):
         # plt.show()
 
         # Correlation with output variable
-        cor_target = abs(pd.concat([X.loc[X_train.index].drop(["Action"],axis=1), y_train], axis=1, join='inner').corr()["Action"])
-        # Selecting highly correlated features
-        relevant_features = cor_target[cor_target > 0.4]
-        print(relevant_features)
-
-        cols = list(X.columns)
-        model = RandomForestRegressor(random_state=1)
-        # Initializing RFE model
-        rfe = RFE(model, 20)
-        # Transforming data using RFE
-        #data_trans = X.loc[X_train.index].fillna( X.loc[X_train.index].mean())
-        #OneShotDataPreparation._prepare_dataset(X["VoterType"])
-        #OneShotDataPreparation._prepare_dataset(X["Scenario_type"])
-        X_rfe = rfe.fit_transform(OneShotDataPreparation._prepare_dataset(X.loc[[x in X_train.index for x in X.index.tolist()]]), y_train)
-        # Fitting the data to model
-        model.fit(X_rfe, y_train)
-        temp = pd.Series(rfe.support_, index=cols)
-        selected_features_rfe = temp[temp == True].index
-        X = X.drop(X.columns[[not (x in selected_features_rfe) for x in X.columns]].tolist(), axis=1)
-        print(selected_features_rfe)
+        # cor_target = abs(pd.concat([X.loc[X_train.index], y_train], axis=1, join='inner').corr()["Action"])
+        # # Selecting highly correlated features
+        # relevant_features = cor_target[cor_target > 0.4]
+        # print(relevant_features)
+        #
 
         return X
-RandomForestRegressor
+
diff --git a/OneShot_NewAnalysis_N4.py b/OneShot_NewAnalysis_N4.py
index 767c626..3fbd7ae 100644
--- a/OneShot_NewAnalysis_N4.py
+++ b/OneShot_NewAnalysis_N4.py
@@ -32,6 +32,7 @@ from ExpertModels import DecisionTreeBaseline
 from sklearn.ensemble import ExtraTreesClassifier
 from sklearn.ensemble import GradientBoostingClassifier
+from sklearn.linear_model import LinearRegression
 from sklearn.model_selection import train_test_split
@@ -73,12 +74,13 @@ def _get_k_folds(X,k):
         folds.append(X.iloc[test_indices].RoundIndex)
     return folds
 
-def _select_features(features_train, targets_train, features_ext_df):
+def _features_importance(features_ext_df, features_train, targets_train):
     #feature importance
     feature_importance = pd.DataFrame()
     rf_for_fs = RandomForestClassifier(n_estimators=100)
-    rf_for_fs.fit(X=features_train.values, y=targets_train)
+    transformed_features_train = OneShotDataPreparation._prepare_dataset(features_ext_df.loc[features_train.index, :])
+    rf_for_fs.fit(X=transformed_features_train.values, y=targets_train)
     current_feature_importances = pd.DataFrame(rf_for_fs.feature_importances_,
                                                index=features_ext_df.columns,
                                                columns=['importance']).sort_values('importance',
@@ -89,11 +91,30 @@
     feature_importance['importance'] = feature_importance['importance'] + current_feature_importances['importance']
     feature_importance['importance_percentage'] = feature_importance['importance']/np.max(feature_importance['importance'])
-    selected_comlumns = feature_importance.iloc[[feature_importance['importance_percentage']>0.2],].index.tolist()
-    return selected_comlumns
+    return feature_importance
 
-def _evaluation(raw_data, clfs, target, folds, scenario_filter, action_table_df, scenarios_df,n_candidates = 3):
+
+def _select_features(X, X_train, y_train, top_k=25):
+    cols = list(X.columns)
+    model = RandomForestClassifier(n_estimators=100, random_state=1)
+    # Initializing RFE model
+    rfe = RFE(model, top_k)
+    # Transforming data using RFE
+    # data_trans = X.loc[X_train.index].fillna( X.loc[X_train.index].mean())
+    # OneShotDataPreparation._prepare_dataset(X["VoterType"])
+    # OneShotDataPreparation._prepare_dataset(X["Scenario_type"])
+    X_rfe = rfe.fit_transform(OneShotDataPreparation._prepare_dataset(X.loc[X_train.index, :]), y_train)
+    # Fitting the data to model
+    model.fit(X_rfe, y_train)
+    temp = pd.Series(rfe.support_, index=cols)
+    selected_features_rfe = temp[temp == True].index
+
+    print(selected_features_rfe)
+
+    return selected_features_rfe
+
+def _evaluation(raw_data, clfs, target, folds, scenario_filter, action_table_df, scenarios_df, n_candidates = 3):
     data = raw_data.copy()
     data = data.drop(["Vote"], axis=1)
@@ -109,6 +130,9 @@
 
     prediction = pd.DataFrame(np.matrix([]))
 
+    features_importance = pd.DataFrame(np.matrix([]))
+    selected_features = pd.DataFrame(np.matrix([]))
+
     features_train = pd.DataFrame()
     # 10 fold cross validation
     for i in range(0,len(folds)):
@@ -121,21 +145,51 @@
             test_indices = data.index.tolist()
             train_indices = data.index.tolist()
         else:
-            test_indices = data.index[[x[1].RoundIndex in folds[i].tolist() for x in data.iterrows()]].tolist()
+            test_indices = data.index[[(x[1].RoundIndex in folds[i].tolist()) for x in data.iterrows()]].tolist()
             train_indices = data.index[[not (x[1].RoundIndex in folds[i].tolist()) for x in data.iterrows()]].tolist()
 
-        # Feature Generation
-        features_train = features_df.loc[[ii for ii in train_indices],]
-        targets_train = target_df[[ii for ii in train_indices]]
+        # Feature Generation
+        features_train = features_df.loc[train_indices]
+        targets_train = target_df[train_indices]
         features_ext_df = oneshot_dyn_fg._dynamic_feature_generation(features_df, features_train, targets_train)
         # features_ext_df = features_ext_df.drop(["Vote"], axis=1)
+
+        # Feature Selection
+        selected_features_rfe = _select_features(features_ext_df, features_train, targets_train)
+        current_selected_features = pd.DataFrame(selected_features_rfe)
+        current_selected_features.loc[:, "FOLD"] = str(i+1)
+        if len(selected_features) == 0:
+            selected_features = current_selected_features
+        else:
+            selected_features = pd.concat([selected_features, current_selected_features])
+
+        baseline_set = features_ext_df.loc[:, ["Scenario", "VoterType"]]
+
+        features_ext_df = features_ext_df.drop(
+            features_ext_df.columns[[not (x in selected_features_rfe) for x in
+                                     features_ext_df.columns]].tolist(),
+            axis=1)
+
+        #Feature Importance
+        current_feature_importance = _features_importance(features_ext_df, features_train, targets_train)
+        current_feature_importance.loc[:, "FOLD"] = str(i+1)
+        if len(features_importance) == 0:
+            features_importance = current_feature_importance
+        else:
+            features_importance = pd.concat([features_importance, current_feature_importance])
+
         # encoding the dataframes
         features_encoded_df = OneShotDataPreparation._prepare_dataset(features_ext_df.copy())
+        features_encoded_df.index = data.index
+
         target_encoded_df = target_df
 
         # make training and testing datasets
-        features_train = features_encoded_df.loc[[ii for ii in train_indices],]
-        features_test = features_encoded_df.loc[[ii for ii in test_indices],]
-        targets_train = target_encoded_df[[ii for ii in train_indices]]
-        targets_test = target_encoded_df[[ii for ii in test_indices]]
+        features_train = features_encoded_df.loc[train_indices]
+        features_test = features_encoded_df.loc[test_indices]
+        targets_train = target_encoded_df[train_indices]
+        targets_test = target_encoded_df[test_indices]
+
         # select features
         #selected_columns = _select_features(features_train, targets_train, features_ext_df)
@@ -154,7 +208,7 @@ def _evaluation(raw_data, clfs, target, folds, scenario_filter, action_table_df,
             if "DecisionTreeBaseline" in clf_name:
                 features_ext_df.to_csv("datasets/oneshot/test_features.csv")
                 targets_test.to_csv("datasets/oneshot/test_target.csv")
-                predicated = clf.predict(features_ext_df.loc[[ii for ii in test_indices],])
+                predicated = clf.predict(baseline_set.loc[[ii for ii in test_indices],])
             else:
                 # Test
                 predicated = clf.predict(features_test.values)
@@ -166,7 +220,7 @@ def _evaluation(raw_data, clfs, target, folds, scenario_filter, action_table_df,
             else:
                 prediction = pd.concat([prediction, pd.DataFrame(predicated)])
 
-            raw_data.loc[[ii for ii in test_indices],"Prediction" + "_" + clf_name] = predicated
+            raw_data.loc[test_indices,"Prediction" + "_" + clf_name] = predicated
 
             raw_data = _convert_prediction(raw_data, "Prediction" + "_" + clf_name, n_candidates)
@@ -183,7 +237,7 @@ def _evaluation(raw_data, clfs, target, folds, scenario_filter, action_table_df,
 
     #results_df.Result = results_df.Result.apply(lambda x: x / n_folds)
 
-    return results_df, raw_data#, feature_importances
+    return results_df, raw_data, features_importance, selected_features
 
 def _build_data_by_folds(data, folds):
     transformed_data = pd.DataFrame()
@@ -248,7 +302,7 @@ def _get_classifiers(df, n_candidates):
     # maxlikelihood_clf = MLHClassifier()
     if n_candidates == 3:
         baseline_clf = DecisionTreeBaseline()
-        classifiers = [rf_clf3]#[baseline_clf, extra_tree_clf, gb_clf, rfi1_clf, rfi2_clf, rfi3_clf, rfi4_clf, ordinal_clf ,personal_nn_clf,neural_net_cf,nn_cf_2, nn_cf_3, two_layer_nn_cf, three_layer_nn_cf, rf_clf1,rf_clf2, rf_clf3,rf_clf4,rf_clf5, rf_clf6, dt_clf,adaboost_clf,adaboost_clf2, adaboost_clf3,adaboost_clf4, svm_clf, svm_clf2, svm_clf3,logistics_clf]
+        classifiers = [baseline_clf, extra_tree_clf, gb_clf, rfi1_clf, rfi2_clf, rfi3_clf, rfi4_clf, ordinal_clf ,personal_nn_clf,neural_net_cf,nn_cf_2, nn_cf_3, two_layer_nn_cf, three_layer_nn_cf, rf_clf1,rf_clf2, rf_clf3,rf_clf4,rf_clf5, rf_clf6, dt_clf,adaboost_clf,adaboost_clf2, adaboost_clf3,adaboost_clf4, svm_clf, svm_clf2, svm_clf3,logistics_clf]
     else:
         classifiers = [extra_tree_clf, gb_clf, rfi1_clf, rfi2_clf, rfi3_clf, rfi4_clf, ordinal_clf,
                        personal_nn_clf, neural_net_cf, nn_cf_2, nn_cf_3, two_layer_nn_cf, three_layer_nn_cf, rf_clf1,
@@ -267,8 +321,7 @@ def _load_and_run(datasets, load_folds, scenarios = ['NONE'], is_loo = False, fo
         data = pd.read_excel(file_path, sheet_name=sheet)
 
         #Take sample from data
-        data = data.sample(frac=0.05,replace=False, random_state=1)
-
+        #data = data.loc[data["VoterID"].isin(data["VoterID"].sample(frac=0.001, replace=False, random_state=1))]
         d_df = data.fillna(data.mean())
 
         n_candidates = d_df.iloc[0]["NumberOfCandidates"]
@@ -288,9 +341,11 @@
         for scenario in scenarios: # ['A','B','C','D','E','F','NONE']:
             raw_data = d_df.copy()
-            d_performance_df, d_pred = _evaluation(raw_data, classifiers, 'Action', folds, scenario, actions_table, scenarios_table, n_candidates)
+            d_performance_df, d_pred, d_feature_importance, d_selected_features = _evaluation(raw_data, classifiers, 'Action', folds, scenario, actions_table, scenarios_table, n_candidates)
             d_performance_df.to_csv("Results\\" + dataset + "_" + sheet + "_performance_df_" + scenario + "_" + str(n_folds) + ".csv")
             d_pred.to_csv("Results\\" + dataset + "_" + sheet + "_pred_" + scenario + "_" + str(n_folds) + ".csv")
+            d_feature_importance.to_csv("Results\\" + dataset + "_" + sheet + "_feature_importance_" + scenario + "_" + str(n_folds) + ".csv")
+            d_selected_features.to_csv("Results\\" + dataset + "_" + sheet + "_selected_features_" + scenario + "_" + str(n_folds) + ".csv")
 
         pass
@@ -305,9 +360,9 @@
 # #     _load_and_run(datasets=datasets, load_folds=True, classifiers=classifiers, n_candidates=n_candidates)
 #
-datasets = ["d36_updated_train"]#["schram_train","tal_train","d36_updated_train","d32_updated_train","N4_first_90_train"] #["N4_first_90", "d32_updated", "d36_updated", "tal", "schram"]#["N4_first_90_sample", "d32_updated_sample", "d36_updated_sample", "tal_sample", "schram_sample"]#["N4_first_90", "d32_updated", "d36_updated", "tal", "schram"]
+datasets = ['voter_sample_for_test']#["d36_updated_train","tal_train","d36_updated_train","schram_train","N4_first_90"]#["schram_train","tal_train","d36_updated_train","d32_updated_train","N4_first_90_train"] #["N4_first_90", "d32_updated", "d36_updated", "tal", "schram"]#["N4_first_90_sample", "d32_updated_sample", "d36_updated_sample", "tal_sample", "schram_sample"]#["N4_first_90", "d32_updated", "d36_updated", "tal", "schram"]
 fold_set = [10]#, 10]
-_load_and_run(datasets=datasets, load_folds=False, fold_set=fold_set)
+_load_and_run(datasets=datasets, load_folds=False,fold_set=fold_set)
 #
 # datasets = ["N4_first_90", "d32_updated", "d36_updated", "tal", "schram", "N4_first_90_train", "d32_updated_train", "d36_updated_train", "tal_train", "schram_train"]
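
Note on the new _select_features helper: it wraps scikit-learn's recursive feature elimination (RFE), which repeatedly fits the estimator and drops the weakest features until top_k remain. The sketch below shows the same pattern in isolation on synthetic data; the names (X, y, f0...f9) are illustrative and not from this repo, and recent scikit-learn versions require n_features_to_select as a keyword argument, so `RFE(model, top_k)` as written in the patch only works on older releases.

    import numpy as np
    import pandas as pd
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.feature_selection import RFE

    # Synthetic data: 200 rows, 10 numeric features; only f0 and f1 drive the target.
    rng = np.random.default_rng(1)
    X = pd.DataFrame(rng.normal(size=(200, 10)),
                     columns=[f"f{i}" for i in range(10)])
    y = (X["f0"] + X["f1"] > 0).astype(int)

    model = RandomForestClassifier(n_estimators=100, random_state=1)
    rfe = RFE(model, n_features_to_select=3)  # keyword form for newer scikit-learn
    X_rfe = rfe.fit_transform(X, y)           # (200, 3) array of the surviving columns

    # rfe.support_ is a boolean mask over the original columns,
    # the same mask the patch turns into selected_features_rfe.
    mask = pd.Series(rfe.support_, index=X.columns)
    print(mask[mask].index.tolist())          # names of the kept features, e.g. ['f0', 'f1', ...]

As in the patched _evaluation loop, the selected column names can then be used to drop every other column from the feature frame before encoding and training.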