diff --git a/DataVisualization.py b/DataVisualization.py new file mode 100644 index 0000000..e69de29 diff --git a/Evaluation.py b/Evaluation.py new file mode 100644 index 0000000..09e5ca2 --- /dev/null +++ b/Evaluation.py @@ -0,0 +1,238 @@ +import numpy as np +import pandas as pd +import sklearn +from datetime import datetime +# Classification models +from sklearn.tree import DecisionTreeClassifier +from sklearn.linear_model import LogisticRegression +from sklearn.ensemble.forest import RandomForestClassifier +from sklearn.svm import SVC +from OrdinalClassifier import OrdinalClassifier +from OneShotFeatureGenerator import OneShotFeatureGenerator + +# Model and feature selection +from sklearn.feature_selection import SelectKBest +from sklearn.model_selection import KFold +from sklearn.feature_selection import chi2 + +# Classification metrics +from sklearn.metrics import f1_score +from sklearn.metrics import precision_score +from sklearn.metrics import recall_score +from sklearn.metrics import average_precision_score +from sklearn.metrics import accuracy_score +from sklearn.metrics import roc_auc_score + +# SVC classifier generator +def SVC_factory(C=1.0,kernel='rbf', degree=3, gamma='auto', + coef0=0.0, shrinking=True, probability=True, + tol=1e-3, cache_size=200, class_weight=None, + verbose=False, max_iter=-1, decision_function_shape=None, + random_state=None): + + return SVC(C=C,kernel=kernel, degree=degree, gamma=gamma, + coef0=coef0, shrinking=shrinking, probability=probability, + tol=tol, cache_size=cache_size, class_weight=class_weight, + verbose=verbose, max_iter=max_iter, decision_function_shape=decision_function_shape, + random_state=random_state) + +def RandomForestClassifier_factory(n_estimators=50,criterion="gini",max_depth=None,min_samples_split=2,min_samples_leaf=1,min_weight_fraction_leaf=0., + max_features="auto",max_leaf_nodes=None,min_impurity_split=1e-7,bootstrap=True,oob_score=False, n_jobs=1,random_state=None, + verbose=0, warm_start=False, class_weight=None): + return RandomForestClassifier(n_estimators=n_estimators,criterion=criterion,max_depth=max_depth,min_samples_split=min_samples_split,min_samples_leaf=min_samples_leaf,min_weight_fraction_leaf=min_weight_fraction_leaf, + max_features=max_features,max_leaf_nodes=max_leaf_nodes,min_impurity_split=min_impurity_split,bootstrap=bootstrap,oob_score=oob_score, n_jobs=n_jobs,random_state=random_state, + verbose=verbose, warm_start=warm_start, class_weight=class_weight) + + +# DecisionTree classifier generator +def DecisionTreeClassifier_factory(criterion="gini", splitter="best", max_depth=None, + min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0., + max_features=None, random_state=None, max_leaf_nodes=None, + min_impurity_split=1e-7, class_weight=None, presort=False): + + return DecisionTreeClassifier(criterion=criterion, + splitter=splitter, + max_depth=max_depth, + min_samples_split=min_samples_split, + min_samples_leaf=min_samples_leaf, + min_weight_fraction_leaf=min_weight_fraction_leaf, + max_features=max_features, + random_state=random_state, + max_leaf_nodes=max_leaf_nodes, + min_impurity_split=min_impurity_split, + class_weight=class_weight, + presort=presort) + +# Logistic Regression classifier generator +def LogisticRegressionClassifier_factory(penalty='l2', dual=False, tol=1e-4, C=1.0, + fit_intercept=True, intercept_scaling=1, class_weight=None, + random_state=None, solver='liblinear', max_iter=100, + multi_class='ovr', verbose=0, warm_start=False, n_jobs=1): + + return LogisticRegression(penalty=penalty, dual=dual, tol=tol, C=C, + fit_intercept=fit_intercept, intercept_scaling=intercept_scaling, class_weight=class_weight, + random_state=random_state, solver=solver, max_iter=max_iter, + multi_class=multi_class, verbose=verbose, warm_start=warm_start, n_jobs=n_jobs) + + +def data_cleaning(data_df,target_feature,le, target_le): + for c in data_df.columns: + if data_df[c].dtype in (float, int, np.int64, str, np.object): + data_df[c] = data_df[c].replace(to_replace=np.nan, value=0, regex=True) + if(c == target_feature): + data_df[c] = target_le.fit_transform(data_df[c]) + else: + data_df[c] = le.fit_transform(data_df[c]) + if data_df[c].dtype in (object, str): + for row in data_df[c]: + if isinstance(row, float): + data_df[c] = data_df[c].replace(to_replace=np.nan, value="Null", regex=True) + data_df[c] = data_df[c].astype("category") + if (c == target_feature): + data_df[c] = target_le.fit_transform(data_df[c]) + else: + data_df[c] = le.fit_transform(data_df[c]) + + return data_df.drop([target_feature], axis=1),data_df[target_feature] + +def feature_selection(features_df, target_df, score_func, n): #Return top n features + kbest = SelectKBest (score_func=score_func, k=n) + kf = KFold(10, shuffle = True) # 10 fold cross validation + for train_indices, test_indices in kf.split(features_df, target_df): + # make training and testing datasets + features_train = features_df.loc[[ii for ii in train_indices],] + targets_train = target_df[[ii for ii in train_indices]] + # Feature select phase HERE + kbest.fit(features_train, targets_train) + + scores = pd.DataFrame(kbest.scores_) + features = pd.DataFrame(np.asarray(features_df.columns), columns=["Feature"]) + features['Score'] = scores + top_features = features.sort_values(by=["Score"], ascending=False) + top_features = (top_features[0:n]).Feature + + return top_features + +def build_model(features_df, target_df, clf): + # features_df = train_features + # target_df = train_target + # Use a 10-cross validation to build a model + kf = KFold(10, shuffle = True) # 10 fold cross validation + for train_indices, test_indices in kf.split(features_df, target_df): + # make training and testing datasets + features_train = features_df.loc[[ii for ii in train_indices],] + targets_train = target_df[[ii for ii in train_indices]] + # Train + clf.fit(features_train.as_matrix(), targets_train.as_matrix()) + return clf + +def performance(features_df, target_df, clf): + n_folds = 10 + results_df = pd.DataFrame(columns=['Measure', 'Result']) + # Initialize metrics: + results_df.loc[0] = ['ACCURACY', 0] + results_df.loc[1] = ['PRECISION', 0] + results_df.loc[2] = ['RECALL', 0] + results_df.loc[3] = ['F_MEASURE', 0] + results_df.loc[4] = ['TRAIN_TIME', 0] + # 10 fold cross validation + kf = KFold(n_folds,shuffle=True,random_state=1) # 10 fold cross validation + for train_indices, test_indices in kf.split(features_df, target_df): + # make training and testing datasets + features_train = features_df.loc[[ii for ii in train_indices],] + features_test = features_df.loc[[ii for ii in test_indices],] + targets_train = target_df[[ii for ii in train_indices]] + targets_test = target_df[[ii for ii in test_indices]] + # Train + train_time = datetime.now() + clf.fit(features_train.as_matrix(), targets_train.as_matrix()) + train_time = datetime.now() - train_time + # Test + predicated = clf.predict(features_test.as_matrix()) + + # Measures + results_df.iloc[0, 1] = results_df.iloc[0, 1] + accuracy_score(targets_test, predicated) + results_df.iloc[1, 1] = results_df.iloc[1, 1] + precision_score(targets_test, predicated, average='weighted') + results_df.iloc[2, 1] = results_df.iloc[2, 1] + recall_score(targets_test, predicated, average='weighted') + results_df.iloc[3, 1] = results_df.iloc[3, 1] + f1_score(targets_test, predicated, average='weighted') + results_df.iloc[4, 1] = results_df.iloc[4, 1] + train_time.microseconds + + results_df.Result = results_df.Result.apply(lambda x: x / n_folds) + return results_df + +def evaluation(data_df, target_feature,le, target_le,dataset,n_feature_selection=-1,ordered_class=None): + # Data cleaning + train_features, train_target = data_cleaning(data_df,target_feature, le, target_le) + + # feature selection - TOP 15 + if n_feature_selection > 0: + train_features = train_features[feature_selection(train_features, train_target, chi2, n_feature_selection)] + print("done running feature selection") + + + # feature generation - Aggregated features. + train_features = OneShotFeatureGenerator.generate_A_ratios(train_features) + train_features = OneShotFeatureGenerator.generate_voter_type(train_features) + train_features = OneShotFeatureGenerator.generate_is_random_voter(train_features) + print("done running feature generation") + + + + + #Performance + + filename = "%s_result.csv" % dataset + f = open(filename,"w") + f.write("id,dataset,alogrithm,ACCURACY,PRECISION,RECALL,F_MEASURE,TRAIN_TIME\n") + + id = 0 + atom_classifiers = [RandomForestClassifier_factory, DecisionTreeClassifier_factory, LogisticRegressionClassifier_factory] + for classifier in atom_classifiers: + clf = classifier() + atom_results = performance(train_features,train_target,clf) + + result_list = list(list(atom_results.to_dict().values())[0].values()) + for i in range(len(result_list)): + result_list[i] = str(result_list[i]) + algorithm = "regular classifier = %s" % (str(classifier)) + f.write("%d,%s,%s,%s\n" % (id, dataset, algorithm, str(','.join(result_list)))) + + id += 1 + + ordinal_clf = OrdinalClassifier(base_classifier=classifier, ordered_class=ordered_class) + ordinal_results = performance(train_features,train_target,ordinal_clf) + + result_list = list(list(ordinal_results.to_dict().values())[0].values()) + for i in range(len(result_list)): + result_list[i] = str(result_list[i]) + algorithm = "ordinal classifier = %s" % (str(classifier)) + f.write("%d,%s,%s,%s\n" % (id, dataset, algorithm, str(','.join(result_list)))) + + id += 1 + + f.close() + +def runExp(path,target_feature,dataset,delimiter_data=';',ordered_class=None): + # Initialize + print("starting running on %s" %path) + data_df = pd.DataFrame() + le = sklearn.preprocessing.LabelEncoder() + target_le = sklearn.preprocessing.LabelEncoder() + ordered_class = target_le.fit_transform(ordered_class) + # End Initialize + print("reading the data") + # Load dataset from file + data_df = pd.read_csv(path,delimiter=delimiter_data) + # Evaluate + print("running experiment") + if len(data_df.columns) > 15: + evaluation(data_df, target_feature,le,target_le,dataset,15,ordered_class) + else: + evaluation(data_df, target_feature,le,target_le,dataset,ordered_class=ordered_class) + print("done") + +#Data sets +ordered_class = ["Q","Q'","Q''"] +runExp('datasets/oneshot/oneshot.csv',"Action_pref","oneshot",',',ordered_class) + + diff --git a/ExpertModels.py b/ExpertModels.py new file mode 100644 index 0000000..bd73553 --- /dev/null +++ b/ExpertModels.py @@ -0,0 +1,303 @@ +import numbers +import numpy as np +import sklearn +import pandas as pd + +from sklearn.utils import check_X_y, check_array, column_or_1d +from sklearn.utils.multiclass import check_classification_targets + +from sklearn.externals.joblib import Parallel, delayed #For parallel computing TODO: check if we need to be parallel or not +from sklearn.utils.validation import has_fit_parameter, check_is_fitted + +def _extract_num_votes(V_i): + return V_i.NumVotes[0] + +def _get_max_threshold(V_i): + numVotes = _extract_num_votes(V_i) + return int(numVotes*0.7) + + +def _determine_split_preferences(v_ij, split_feature): + below = 0 + above = 0 + if split_feature == "GAP12_poll": + below = 1 + above = 2 + if split_feature == "GAP13_poll": + below = 1 + above = 3 + return below, above + +def _threshold_range_accuracy(r, V_i, split_feature): + for v_ij in V_i: + preference_below, preference_above = _determine_split_preferences(v_ij, split_feature) + if v_ij.Action == preference_above & v_ij.loc[0, split_feature] <= r.range[0]: + r.errors_below = r.errors_below + 1 + elif v_ij.Action == preference_below & v_ij.loc[0, split_feature] >= r.range[1]: + r.errors_above = r.errors_above + 1 + elif v_ij.Action == preference_below & v_ij.loc[0, split_feature] <= r.range[0]: + r.correct_below = r.correct_below + 1 + elif v_ij.Action == preference_above & v_ij.loc[0, split_feature] >= r.range[1]: + r.correct_above = r.correct_above + 1 + return r + + +def _total_ranges_accuracy(R): + #R is list\array of ranges r + r_total = ThresholdRange() + for r in R: + r_total.errors_below = r_total.errors_below + r.errors_below + r_total.errors_above = r_total.errors_above + r.errors_above + r_total.correct_below = r_total.correct_below + r.correct_below + r_total.correct_above = r_total.correct_above + r.correct_above + return r_total + + +def _most_likely_threshold_range_for_voter_i(V_i, split_feature): + numVotes = _extract_num_votes(V_i) + gaps = list({0, V_i.split_feature, _get_max_threshold(V_i)}) + gaps.sort() + min_error = np.inf + r_best = None + for gapIndex in range(0, len(gaps) - 1): + cur_range = range(gaps[gapIndex], gaps[gapIndex + 1]) + r = ThresholdRange(range = cur_range) + r.numVotes = numVotes + r = _threshold_range_accuracy(r, V_i, split_feature) + total_error = r.errors_below + r.errors_above + if total_error < min_error: + min_error = total_error + r_best = r + return r_best + + +def _most_likely_threshold_ranges(X, y, split_feature): + R = set() + voters = pd.DataFrame(X[["VoterID", "SessionIDX"]].drop_duplicates()) + for voter in voters: + V_i = pd.concat([X.loc[X['VoterID'] == voter.VoterID,] , y], axis=1, join='inner') + r_best = _most_likely_threshold_range_for_voter_i(V_i, split_feature) + r_best.voter = voter + R.add(r_best) + + return R + + +def _threshold_probability_estimation(t, R): + prob = (1/len(R))*(np.sum([(1/(np.max(r.range)-np.min(r.range))) for r in R])) + return prob + +def _sample_probability_estimation(V_i, Z_floor, Z_ceiling, t, split_feature): + sample_prob = 1 + for v_ij in V_i: + gap = v_ij.split_feature + below, above = _determine_split_preferences(v_ij, split_feature) + if gap > t: + if v_ij.Action == above: + sample_prob = sample_prob*Z_ceiling + else: + sample_prob = sample_prob*(1 - Z_ceiling) + else: + if v_ij.Action == above: + sample_prob = sample_prob*Z_floor + else: + sample_prob = sample_prob*(1 - Z_floor) + + return sample_prob + +def _threshold_likelihood_estimation(V_i, R_without_i, Z_floor, Z_ceiling, n_votes, split_feature): + L = list() + for t in range(0, _get_max_threshold(n_votes)): + threshold_prob = _threshold_probability_estimation(t, R_without_i) + sample_prob = _sample_probability_estimation(V_i, Z_floor, Z_ceiling, t, split_feature) + L.append(threshold_prob*sample_prob) + return L + +def _voters_threshold_likelihoods_estimation(X, X_train, y_train, S, split_feature): + V_train = pd.concat([X_train.loc[X_train['Scenario'] == S & X_train['Is_Random'] == False & X_train["VoterType"] != "TRT",], y_train], axis=1, join='inner') + R = _most_likely_threshold_ranges(X_train, y_train, split_feature) + voters = pd.DataFrame(X[["VoterID", "SessionIDX"]].drop_duplicates()) + for voter in voters: + V_i = V_train.loc[V_train["VoterID"] == voter.VoterID] + R_without_i = R[[r.voter.VoterID != voter.VoterID for r in R]] + R_accuracy = _total_ranges_accuracy(R_without_i) + Z_floor = (R_accuracy.errors_below)/(R_accuracy.errors_below + R_accuracy.correct_below) + Z_ceiling = (R_accuracy.errors_above)/(R_accuracy.errors_above + R_accuracy.correct_above) + numVotes = _extract_num_votes(V_i) + L = _threshold_likelihood_estimation(V_i, R_without_i, Z_floor, Z_ceiling, numVotes, split_feature) + X = pd.concat([X.loc[X["VoterID"] == voter.VoterID], L, Z_floor, Z_ceiling], axis=1, join='inner') + + return X + + +def _voters_action_likelihoods_estimation(X,S, split_feature): + V = _voters_threshold_likelihoods_estimation(X, S, split_feature) + V_s = V.loc[V.Scenario == S] + + for v in V_s: + below, above = _determine_split_preferences(v, split_feature) + L_prob_below = 0 + L_prob_above = 0 + Z_floor = v.Z_floor + Z_celing = v.Z_celing + for t in range(0, _get_max_threshold(v.NumVotes)): + pass + + +class ThresholdRange(): + def __init__(self, + voter=None, range=None, numVotes=None): + self.voter = voter + self.range = range + self.numVotes = numVotes + self.errors_below = 0 + self.errors_above = 0 + self.correct_below = 0 + self.correct_above = 0 + + + +class DecisionTreeBaseline(): + """Base class for ordinal meta-classifier. + + """ + + def __init__(self): + pass + + def fit(self, X, y, sample_weight=None): + return self + + def _fit(self, X, y, max_samples=None, max_depth=None, sample_weight=None): + return self + + def predict(self, X): + #X = check_array(X, accept_sparse=['csr', 'csc']) + + # ---------------------------------------------Our CODE + n_samples = X.shape[0] + prediction = np.zeros((n_samples, 1)) + + for i in range(0, n_samples): + if X.iloc[i].Scenario == 3: + if X.iloc[i].VoterType == "LB": + prediction[i] = 2 #Q' vote + else: + prediction[i] = 1 #Q vote + else: + if X.iloc[i].Scenario in [5,6]: + if X.iloc[i].VoterType == "TRT": + prediction[i] = 1 #Q vote + else: + prediction[i] = 2 #Q' vote + + else: + prediction[i] = 1 #Q vote + + + return prediction + +class BayesRuleClassifier(DecisionTreeBaseline): + """Base class for ordinal meta-classifier. + + """ + + def __init__(self): + return self + + def fit(self, X, y, sample_weight=None): + return self + + def _fit(self, X, y, max_samples=None, max_depth=None, sample_weight=None): + return self + + def predict(self, X): + X = check_array(X, accept_sparse=['csr', 'csc']) + + # ---------------------------------------------Our CODE + n_samples = X.shape[0] + prediction = np.zeros((n_samples, 1)) + + for i in range(0, n_samples): + if X[i, "Scenario"] == "C": + if X[i, "VoterType"] == "LB": + prediction[i] = 2 # Q' vote + else: + prediction[i] = 1 # Q vote + else: + if X[i, "Scenario"] in ["E", "F"]: + if X[i, "VoterType"] == "TRT": + prediction[i] = 1 # Q vote + else: + prediction[i] = 2 # Q' vote + + else: + prediction[i] = 1 # Q vote + + return prediction + +class LHClassifier(DecisionTreeBaseline): + """Base class for ordinal meta-classifier. + + """ + + def __init__(self): + return self + + def fit(self, X, y, sample_weight=None): + return self + + def _fit(self, X, y, max_samples=None, max_depth=None, sample_weight=None): + return self + + def predict(self, X): + X = check_array(X, accept_sparse=['csr', 'csc']) + + # ---------------------------------------------Our CODE + n_samples = X.shape[0] + prediction = np.zeros((n_samples, 1)) + + for i in range(0, n_samples): + if X[i, "Scenario"] == "C": + if X[i, "VoterType"] == "LB": + prediction[i] = 2 # Q' vote + else: + prediction[i] = 1 # Q vote + else: + if X[i, "Scenario"] in ["E", "F"]: + if X[i, "VoterType"] == "TRT": + prediction[i] = 1 # Q vote + else: + prediction[i] = 2 # Q' vote + + else: + prediction[i] = 1 # Q vote + + return prediction + +class MLHClassifier(DecisionTreeBaseline): + """Base class for ordinal meta-classifier. + + """ + + def __init__(self): + return self + + def fit(self, X, y, sample_weight=None): + return self + + def _fit(self, X, y, max_samples=None, max_depth=None, sample_weight=None): + return self + + def predict(self, X): + X = check_array(X, accept_sparse=['csr', 'csc']) + + # ---------------------------------------------Our CODE + n_samples = X.shape[0] + prediction = super().predict(X) #Baseline prediciton + + #TODO: compelete this + _voters_action_likelihoods_estimation(X, "C", "GAP12_poll") + + return prediction + diff --git a/OneShotDataPreperation.py b/OneShotDataPreperation.py index 407613c..f524779 100644 --- a/OneShotDataPreperation.py +++ b/OneShotDataPreperation.py @@ -38,7 +38,7 @@ def _data_conversion(data_df, is_target, le): for c in data_df.columns: if data_df[c].dtype in (object, str, np.object, bool): if not (data_df[c].dtype in (int, float)): - data_df[c] = le.fit_transform(data_df[c]) + data_df.loc[:,c] = le.fit_transform(data_df.loc[:,c]) return data_df class OneShotDataPreparation(): @@ -49,7 +49,7 @@ class OneShotDataPreparation(): def _prepare_dataset(features_df): le = sklearn.preprocessing.LabelEncoder() features_encoded_df = pd.DataFrame( - preprocessing.normalize(preprocessing.scale(_data_conversion(features_df, False, le).as_matrix()), axis=0, + preprocessing.normalize(preprocessing.scale(_data_conversion(features_df, False, le).values), axis=0, norm='max')) # target_le = sklearn.preprocessing.LabelEncoder() diff --git a/OneShotFeatureGenerator.py b/OneShotFeatureGenerator.py index f7788f0..5a4effb 100644 --- a/OneShotFeatureGenerator.py +++ b/OneShotFeatureGenerator.py @@ -1,9 +1,14 @@ import numbers +from OneShotDataPreperation import OneShotDataPreparation import numpy as np import pandas as pd import sklearn from scipy.stats import kurtosis from scipy.stats import skew +from sklearn.feature_selection import RFE +import matplotlib +import matplotlib.pyplot as plt +import seaborn as sns from sklearn.ensemble.forest import RandomForestClassifier from sklearn.tree import DecisionTreeClassifier @@ -11,6 +16,12 @@ from keras.layers import Input, Dense from keras.models import Model +from numpy.random import seed +seed(1) +from tensorflow import set_random_seed +set_random_seed(2) + + from datetime import datetime # Model and feature selection from sklearn.feature_selection import SelectKBest @@ -28,6 +39,7 @@ from sklearn.ensemble import VotingClassifier from sklearn.svm import SVC from sklearn.linear_model import logistic +from sklearn.ensemble import RandomForestRegressor def _autoencode(features): @@ -44,10 +56,10 @@ def _autoencode(features): # decoder_layer = autoencoder.layers[-1] # decoder = Model(encoded_input, decoder_layer(encoded_input)) - autoencoder.compile(optimizer='adadelta', loss='MSE') + autoencoder.compile(optimizer='adam', loss='MSE') autoencoder.fit(features, features, - epochs=150, + epochs=20, batch_size=256, shuffle=True, verbose=False) @@ -110,12 +122,12 @@ def _get_scenarios_by_action(self, action): return scenarios - def _generate_action_name(self, df): - # Generate action name - df['Action_name'] = [self._get_action_name(df, x[0]) for x in - df.iterrows()] - - return df + # def _generate_action_name(self, df): + # # Generate action name + # df['Action_name'] = [self._get_action_name(df, x[0]) for x in + # df.iterrows()] + # + # return df def _get_action_name(self, vote_row): action_name = (self.actions_df.loc[(self.actions_df.scenario == vote_row['Scenario']) & ( @@ -184,7 +196,7 @@ def _generate_pref_positions(self, df): for index in range(0, len(combined)): column_name = "Pref" + str(combined["pref"][index]) + "_pos" column_value = index + 1 - df.loc[df['VoterID'] == vote[1].VoterID,column_name] = int(column_value) + df.loc[vote[0],column_name] = int(column_value) return df @@ -355,7 +367,7 @@ def _generate_gap_dif_features(self, df): def _dynamic_feature_generation(self, df, X_train, y_train): X = df a_ratio_columns, gaps_columns = [], [] - all_voters = pd.DataFrame(X[["VoterID", "SessionIDX"]].drop_duplicates()) + all_voters = pd.DataFrame(X["VoterID"].drop_duplicates()) for voter in all_voters.iterrows(): before_columns = len(X.columns) X = self._generate_A_ratios(X, X_train, y_train, voter[1]) @@ -384,18 +396,72 @@ def _dynamic_feature_generation(self, df, X_train, y_train): for gap_pref_feature in gap_pref_features: total_gaps_columns.append(X.columns.get_loc(gap_pref_feature)) - normalized_gap_fs = pd.DataFrame(preprocessing.normalize(X.iloc[:, total_gaps_columns])) + total_gaps_columns.append(X.columns.get_loc("Scenario")) + total_gaps_columns.append(X.columns.get_loc("Scenario_type")) + total_gaps_columns.append(X.columns.get_loc("VoterID")) + + normalized_gap_fs = pd.DataFrame(preprocessing.normalize(OneShotDataPreparation._prepare_dataset(X.iloc[:, total_gaps_columns]))) + + #Try auto encode each voter separately + # encoded_gap_fs = pd.DataFrame() + # + # for voter in all_voters.iterrows(): + # voter_index = X.loc[X['VoterID'] == voter[1].VoterID].index + # voter_encoded_gap_fs = pd.DataFrame(_autoencode(normalized_gap_fs.iloc[voter_index.tolist(),:])) + # voter_encoded_gap_fs.index = voter_index + # + # # aggregate results + # if len(encoded_gap_fs) == 0: + # encoded_gap_fs = pd.DataFrame(voter_encoded_gap_fs) + # else: + # encoded_gap_fs = pd.concat([encoded_gap_fs, pd.DataFrame(voter_encoded_gap_fs)]) + # + # encoded_gap_fs = pd.DataFrame(encoded_gap_fs) + # + # X = pd.concat([X, encoded_gap_fs], axis=1, join='inner') + + + encoded_gap_fs = pd.DataFrame(_autoencode(normalized_gap_fs)) + + X = pd.concat([X, encoded_gap_fs], axis=1, join='inner') - X = X.drop(X.columns[gaps_columns + gaps_dif_columns], axis=1) + #X = X.drop(X.columns[gaps_columns + gaps_dif_columns], axis=1) X = self._generate_is_random_voter(X) X = self._generate_voter_type(X) - return X + # plt.figure(figsize=(12, 10)) + # cor = df.corr() + # sns.heatmap(cor, annot=True, cmap=plt.cm.Reds) + # plt.show() + + # Correlation with output variable + cor_target = abs(pd.concat([X.loc[X_train.index].drop(["Action"],axis=1), y_train], axis=1, join='inner').corr()["Action"]) + # Selecting highly correlated features + relevant_features = cor_target[cor_target > 0.4] + print(relevant_features) + + cols = list(X.columns) + model = RandomForestRegressor(random_state=1) + # Initializing RFE model + rfe = RFE(model, 20) + # Transforming data using RFE + #data_trans = X.loc[X_train.index].fillna( X.loc[X_train.index].mean()) + #OneShotDataPreparation._prepare_dataset(X["VoterType"]) + #OneShotDataPreparation._prepare_dataset(X["Scenario_type"]) + X_rfe = rfe.fit_transform(OneShotDataPreparation._prepare_dataset(X.loc[[x in X_train.index for x in X.index.tolist()]]), y_train) + # Fitting the data to model + model.fit(X_rfe, y_train) + temp = pd.Series(rfe.support_, index=cols) + selected_features_rfe = temp[temp == True].index + X = X.drop(X.columns[[not (x in selected_features_rfe) for x in X.columns]].tolist(), axis=1) + print(selected_features_rfe) + return X +RandomForestRegressor diff --git a/OneShot_NewAnalysis_N4.py b/OneShot_NewAnalysis_N4.py index 2cdea1c..767c626 100644 --- a/OneShot_NewAnalysis_N4.py +++ b/OneShot_NewAnalysis_N4.py @@ -4,44 +4,36 @@ @author: Adam """ -import numbers + import numpy as np import pandas as pd -import sklearn -from scipy.stats import kurtosis -from scipy.stats import skew -from keras.layers import Input, Dense -from keras.models import Model - from sklearn.ensemble.forest import RandomForestClassifier from sklearn.tree import DecisionTreeClassifier from sklearn.neural_network import MLPClassifier +from sklearn.feature_selection import RFE from datetime import datetime # Model and feature selection -from sklearn.feature_selection import SelectKBest from sklearn.model_selection import KFold -from sklearn.feature_selection import chi2 # Classification metrics from sklearn.metrics import f1_score from sklearn.metrics import precision_score from sklearn.metrics import recall_score from sklearn.metrics import accuracy_score -from sklearn import preprocessing from PersonalClassifier import PersonalClassifier from sklearn.ensemble import AdaBoostClassifier -from sklearn.ensemble import VotingClassifier from sklearn.svm import SVC from sklearn.linear_model import logistic from OneShotFeatureGenerator import OneShotStaticFeatureGenerator from OneShotFeatureGenerator import OneShotDynamicFeatureGenerator from OneShotDataPreperation import OneShotDataPreparation from OrdinalClassifier import OrdinalClassifier -from BaselineModel import DecisionTreeBaseline -from BayesRuleModel import BayesRuleClassifier -from LikelihoodModel import LHClassifier -from MaximumLikelihoodModel import MLHClassifier +from ExpertModels import DecisionTreeBaseline +from sklearn.ensemble import ExtraTreesClassifier +from sklearn.ensemble import GradientBoostingClassifier + +from sklearn.model_selection import train_test_split def _convert_prediction(X, column_name, n_candidates): X.loc[X[column_name]==1,"Vote_"+column_name] = X.loc[X[column_name]==1,"Pref1"] @@ -73,13 +65,37 @@ def _get_loo_folds(X): def _get_k_folds(X,k): folds = list() - kf = KFold(k, shuffle=True, random_state=1) # 10 fold cross validation - for train_indices, test_indices in kf.split(X): - folds.append(test_indices) + if k == 1: + folds.append(X.index.tolist()) + else: + kf = KFold(k, shuffle=True, random_state=1) # 10 fold cross validation + for train_indices, test_indices in kf.split(X): + folds.append(X.iloc[test_indices].RoundIndex) return folds -def _evaluation(raw_data, clfs, target, folds, scenario_filter, action_table_df, scenarios_df, n_candidates = 3): +def _select_features(features_train, targets_train, features_ext_df): + #feature importance + feature_importance = pd.DataFrame() + + rf_for_fs = RandomForestClassifier(n_estimators=100) + rf_for_fs.fit(X=features_train.values, y=targets_train) + current_feature_importances = pd.DataFrame(rf_for_fs.feature_importances_, + index=features_ext_df.columns, + columns=['importance']).sort_values('importance', + ascending=False) + if len(feature_importance) == 0: + feature_importance = current_feature_importances + else: + feature_importance['importance'] = feature_importance['importance'] + current_feature_importances['importance'] + + feature_importance['importance_percentage'] = feature_importance['importance']/np.max(feature_importance['importance']) + selected_comlumns = feature_importance.iloc[[feature_importance['importance_percentage']>0.2],].index.tolist() + return selected_comlumns + + +def _evaluation(raw_data, clfs, target, folds, scenario_filter, action_table_df, scenarios_df,n_candidates = 3): data = raw_data.copy() + data = data.drop(["Vote"], axis=1) oneshot_static_fg = OneShotStaticFeatureGenerator(action_table_df, scenarios_df, n_candidates) oneshot_dyn_fg = OneShotDynamicFeatureGenerator(action_table_df, scenarios_df, n_candidates) @@ -89,11 +105,10 @@ def _evaluation(raw_data, clfs, target, folds, scenario_filter, action_table_df, n_folds = len(folds) - results_df = pd.DataFrame(columns=['Classifier','FOLD','PRECISION','RECALL','F_MEASURE','ACCURACY']) prediction = pd.DataFrame(np.matrix([])) - feature_importances = pd.DataFrame() + features_train = pd.DataFrame() # 10 fold cross validation for i in range(0,len(folds)): @@ -102,15 +117,19 @@ def _evaluation(raw_data, clfs, target, folds, scenario_filter, action_table_df, # Split into features and target features_df, target_df = data.drop([target], axis=1),data[target] - test_indices = data.index[[x[1].RoundIndex in folds[i] for x in data.iterrows()]].tolist() - train_indices = data.index[[not (x[1].RoundIndex in folds[i] or x[1].Scenario == scenario_filter) for x in data.iterrows()]].tolist() + if n_folds == 1: #Upperbound case + test_indices = data.index.tolist() + train_indices = data.index.tolist() + else: + test_indices = data.index[[x[1].RoundIndex in folds[i].tolist() for x in data.iterrows()]].tolist() + train_indices = data.index[[not (x[1].RoundIndex in folds[i].tolist()) for x in data.iterrows()]].tolist() # Feature Generation features_train = features_df.loc[[ii for ii in train_indices],] targets_train = target_df[[ii for ii in train_indices]] - features_ext_df = oneshot_dyn_fg._dynamic_feature_generation(features_df, features_train , targets_train) - features_ext_df = features_ext_df.drop(["Vote"], axis=1) + features_ext_df = oneshot_dyn_fg._dynamic_feature_generation(features_df, features_train, targets_train) +# features_ext_df = features_ext_df.drop(["Vote"], axis=1) # encoding the dataframes - features_encoded_df = OneShotDataPreparation._prepare_dataset(features_ext_df) + features_encoded_df = OneShotDataPreparation._prepare_dataset(features_ext_df.copy()) target_encoded_df = target_df # make training and testing datasets features_train = features_encoded_df.loc[[ii for ii in train_indices],] @@ -118,29 +137,28 @@ def _evaluation(raw_data, clfs, target, folds, scenario_filter, action_table_df, targets_train = target_encoded_df[[ii for ii in train_indices]] targets_test = target_encoded_df[[ii for ii in test_indices]] + # select features + #selected_columns = _select_features(features_train, targets_train, features_ext_df) + for j in range(0,len(clfs)): clf = clfs[j] clf_name = str(clf).split("(")[0] - if i == 0: - #Initialize metrics - results_df.loc[j] = [str(clf), i + 1,0, 0, 0, 0] + # if i == 0: + # #Initialize metrics + # results_df.loc[j] = [str(clf), i + 1,0, 0, 0, 0] + # Train - clf.fit(X = features_train.as_matrix(), y = targets_train) - # Test - predicated = clf.predict(features_test.as_matrix()) - - # #feature importance - # current_feature_importances = pd.DataFrame(clf.feature_importances_, - # index=features_ext_df.columns, - # columns=['importance']).sort_values('importance', - # ascending=False) - # if len(feature_importances) == 0: - # feature_importances = current_feature_importances - # else: - # feature_importances['importance'] = feature_importances['importance'] + current_feature_importances['importance'] - # - # print(feature_importances) + clf.fit(X=features_train.values, y=targets_train) + + if "DecisionTreeBaseline" in clf_name: + features_ext_df.to_csv("datasets/oneshot/test_features.csv") + targets_test.to_csv("datasets/oneshot/test_target.csv") + predicated = clf.predict(features_ext_df.loc[[ii for ii in test_indices],]) + else: + # Test + predicated = clf.predict(features_test.values) + #aggregate results if len(prediction) == 0: @@ -154,14 +172,12 @@ def _evaluation(raw_data, clfs, target, folds, scenario_filter, action_table_df, print(str(clf) +": F_score = " + str(f1_score(targets_test, predicated, average='weighted'))) # Measures - results_df.iloc[j + i, 1] = results_df.iloc[j + i, 1] + precision_score(targets_test, predicated, average='weighted') - results_df.iloc[j + i, 2] = results_df.iloc[j + i, 2] + recall_score(targets_test, predicated, average='weighted') - results_df.iloc[j + i, 3] = results_df.iloc[j + i, 3] + f1_score(targets_test, predicated, average='weighted') - results_df.iloc[j + i, 4] = results_df.iloc[j + i, 4] + accuracy_score(targets_test, predicated) + + results_df.loc[i*len(clfs) + j] = [str(clf), i + 1, precision_score(targets_test, predicated, average='weighted'), recall_score(targets_test, predicated, average='weighted'), f1_score(targets_test, predicated, average='weighted'), accuracy_score(targets_test, predicated)] # if i == n_folds - 1: # results_df.iloc[j, 1] = results_df.iloc[j, 1]/n_folds - # results_df.iloc[j, 2] = results_df.iloc[j, 2]/n_folds + # results_df.iloc[j, 2] = results_df.iloc[j, 2]/n_folds # results_df.iloc[j, 3] = results_df.iloc[j, 3]/n_folds # results_df.iloc[j, 4] = results_df.iloc[j, 4]/n_folds @@ -182,9 +198,66 @@ def _build_data_by_folds(data, folds): transformed_data = pd.concat([transformed_data, fold_df]) return transformed_data -def _load_and_run(datasets, load_folds, classifiers, n_candidates, scenarios = ['NONE'], is_loo = False, n_folds = 10): - actions_table = pd.read_csv("datasets/oneshot/action_table_N"+str(n_candidates)+".csv") - scenarios_table = pd.read_csv("datasets/oneshot/scenario_table_N"+str(n_candidates)+".csv") +def _get_classifiers(df, n_candidates): + neural_net_cf = MLPClassifier(hidden_layer_sizes = (50), max_iter = 500, random_state=1) + two_layer_nn_cf = MLPClassifier(hidden_layer_sizes = (50,30), max_iter = 500, random_state=1) + three_layer_nn_cf = MLPClassifier(hidden_layer_sizes = (50,30,20), max_iter = 500, random_state=1) + nn_cf_2 = MLPClassifier(hidden_layer_sizes=(90), max_iter=500, random_state=1) + nn_cf_3 = MLPClassifier(hidden_layer_sizes=(20), max_iter=500, random_state=1) + rf_clf1 = RandomForestClassifier(n_estimators=20, random_state=1) + rf_clf2 = RandomForestClassifier(n_estimators=40, random_state=1) + rf_clf3 = RandomForestClassifier(n_estimators=60, random_state=1) + rf_clf4 = RandomForestClassifier(n_estimators=100, random_state=1) + rf_clf5 = RandomForestClassifier(n_estimators=300, random_state=1) + rf_clf6 = RandomForestClassifier(n_estimators=400, random_state=1) + dt_clf = DecisionTreeClassifier() + adaboost_clf = AdaBoostClassifier(n_estimators=30, random_state=1) + adaboost_clf2 = AdaBoostClassifier(n_estimators=50, random_state=1) + adaboost_clf3 = AdaBoostClassifier(n_estimators=80, random_state=1) + adaboost_clf4 = AdaBoostClassifier(n_estimators=300, random_state=1) + svm_clf = SVC(kernel="poly", degree=4, random_state=1) + svm_clf2 = SVC(kernel="sigmoid", degree=4, random_state=1) + svm_clf3 = SVC(kernel="rbf", degree=4, random_state=1) + logistics_clf = logistic.LogisticRegression(random_state=1) + extra_tree_clf = ExtraTreesClassifier(random_state=1) + gb_clf = GradientBoostingClassifier(random_state=1) + if n_candidates == 3: + ordered_class = [1,2,3] + else: + ordered_class = [1,2,3,4] + + rfi1_clf = PersonalClassifier(id_index=df.columns.get_loc("VoterID"), classes=ordered_class, + n_upsample=10, base_classifier=RandomForestClassifier(n_estimators=20, random_state=1)) + rfi2_clf = PersonalClassifier(id_index=df.columns.get_loc("VoterID"), classes=ordered_class, + n_upsample=10, base_classifier=RandomForestClassifier(n_estimators=40, random_state=1)) + rfi3_clf = PersonalClassifier(id_index=df.columns.get_loc("VoterID"), classes=ordered_class, + n_upsample=10, base_classifier=RandomForestClassifier(n_estimators=60, random_state=1)) + rfi4_clf = PersonalClassifier(id_index=df.columns.get_loc("VoterID"), classes=ordered_class, + n_upsample=10, base_classifier=RandomForestClassifier(n_estimators=80, random_state=1)) + + personal_nn_clf = PersonalClassifier(id_index=df.columns.get_loc("VoterID"), classes=ordered_class, + base_classifier=MLPClassifier(hidden_layer_sizes=(50), max_iter=500, random_state=1), + n_upsample=10, + general_base_classifier=True) # RandomForestClassifier(n_estimators=100) # MLPClassifier(hidden_layer_sizes = (92), max_iter = 500) + + ordinal_clf = OrdinalClassifier(base_classifier = RandomForestClassifier, ordered_class=ordered_class) + + #naive_bayes_clf = sklearn.naive_bayes() + # bayesrule_clf = BayesRuleClassifier() + # likelihood_clf = LHClassifier() + # maxlikelihood_clf = MLHClassifier() + if n_candidates == 3: + baseline_clf = DecisionTreeBaseline() + classifiers = [rf_clf3]#[baseline_clf, extra_tree_clf, gb_clf, rfi1_clf, rfi2_clf, rfi3_clf, rfi4_clf, ordinal_clf ,personal_nn_clf,neural_net_cf,nn_cf_2, nn_cf_3, two_layer_nn_cf, three_layer_nn_cf, rf_clf1,rf_clf2, rf_clf3,rf_clf4,rf_clf5, rf_clf6, dt_clf,adaboost_clf,adaboost_clf2, adaboost_clf3,adaboost_clf4, svm_clf, svm_clf2, svm_clf3,logistics_clf] + else: + classifiers = [extra_tree_clf, gb_clf, rfi1_clf, rfi2_clf, rfi3_clf, rfi4_clf, ordinal_clf, + personal_nn_clf, neural_net_cf, nn_cf_2, nn_cf_3, two_layer_nn_cf, three_layer_nn_cf, rf_clf1, + rf_clf2, rf_clf3, rf_clf4, rf_clf5, rf_clf6, dt_clf, adaboost_clf, adaboost_clf2, adaboost_clf3, + adaboost_clf4, svm_clf, svm_clf2, svm_clf3, logistics_clf] + + return classifiers + +def _load_and_run(datasets, load_folds, scenarios = ['NONE'], is_loo = False, fold_set = [10]): for dataset in datasets: file_path = "datasets/oneshot/" + dataset + ".xlsx" @@ -192,47 +265,58 @@ def _load_and_run(datasets, load_folds, classifiers, n_candidates, scenarios = [ for sheet in xls.sheet_names: #Get sheet from xlsx data = pd.read_excel(file_path, sheet_name=sheet) + + #Take sample from data + data = data.sample(frac=0.05,replace=False, random_state=1) + d_df = data.fillna(data.mean()) + n_candidates = d_df.iloc[0]["NumberOfCandidates"] + actions_table = pd.read_csv("datasets/oneshot/action_table_N" + str(n_candidates) + ".csv") + scenarios_table = pd.read_csv("datasets/oneshot/scenario_table_N" + str(n_candidates) + ".csv") + classifiers = _get_classifiers(d_df, n_candidates) + #Prepare folds - if load_folds == True: - folds = _read_roy_folds(open("datasets/oneshot/"+dataset+"_folds.txt", "r")) - else: - if is_loo == True: - folds = _get_loo_folds(d_df) + for n_folds in fold_set: + if load_folds == True: + folds = _read_roy_folds(open("datasets/oneshot/"+dataset+"_folds.txt", "r")) else: - folds = _get_k_folds(d_df, n_folds) + if is_loo == True: + folds = _get_loo_folds(d_df) + else: + folds = _get_k_folds(d_df, n_folds) - for scenario in scenarios: # ['A','B','C','D','E','F','NONE']: - raw_data = d_df.copy() + for scenario in scenarios: # ['A','B','C','D','E','F','NONE']: + raw_data = d_df.copy() + d_performance_df, d_pred = _evaluation(raw_data, classifiers, 'Action', folds, scenario, actions_table, scenarios_table, n_candidates) + d_performance_df.to_csv("Results\\" + dataset + "_" + sheet + "_performance_df_" + scenario + "_" + str(n_folds) + ".csv") + d_pred.to_csv("Results\\" + dataset + "_" + sheet + "_pred_" + scenario + "_" + str(n_folds) + ".csv") - d_performance_df, d_pred = _evaluation(raw_data, classifiers, 'Action', folds, scenario, actions_table, scenarios_table, n_candidates) - d_performance_df.to_csv("Results\\" + dataset + "_" + sheet + "_performance_df_" + scenario + ".csv") - d_pred.to_csv("Results\\" + dataset + "_" + sheet + "_pred_" + scenario + ".csv") pass #---------------------------------- Classifiers Definition ------------------------------------# -# personal_rf_clf = PersonalClassifier(id_index=raw_data.columns.get_loc("VoterID"), n_upsample=3)#RandomForestClassifier(n_estimators=100) # MLPClassifier(hidden_layer_sizes = (92), max_iter = 500) -# personal_nn_clf = PersonalClassifier(id_index=raw_data.columns.get_loc("VoterID"), base_classifier=MLPClassifier(hidden_layer_sizes = (92), max_iter = 500), n_upsample=10, general_base_classifier=True)#RandomForestClassifier(n_estimators=100) # MLPClassifier(hidden_layer_sizes = (92), max_iter = 500) -# neural_net_cf = MLPClassifier(hidden_layer_sizes = (92), max_iter = 500) -rf_clf = RandomForestClassifier(n_estimators=100) -# dt_clf = DecisionTreeClassifier() -# adaboost_clf = AdaBoostClassifier(n_estimators=200) -# svm_clf = SVC() -# logistics_clf = logistic.LogisticRegression() -#ordinal_clf = OrdinalClassifier(base_classifier = RandomForestClassifier(n_estimators=100)) -#baseline_clf = DecisionTreeBaseline() -# bayesrule_clf = BayesRuleClassifier() -# likelihood_clf = LHClassifier() -# maxlikelihood_clf = MLHClassifier() - -classifiers = [rf_clf] # ,personal_nn_clf,neural_net_cf, rf_clf,dt_clf,adaboost_clf, svm_clf,logistics_clf] + #---------------------------------- Classifiers Definition ------------------------------------# #----------------------------------- Dataset definition ---------------------------------------# # datasets: ["schram"]#["d36_2_folds","d36_4_folds","d36_6_folds","d32_2_folds","d32_4_folds","d32_6_folds"] -datasets = ["schram"] -n_candidates = 3 - -_load_and_run(datasets=datasets, load_folds=True, classifiers=classifiers, n_candidates=n_candidates) - +# datasets = ["schram"] +# n_candidates = 3 +# +# _load_and_run(datasets=datasets, load_folds=True, classifiers=classifiers, n_candidates=n_candidates) +# +datasets = ["d36_updated_train"]#["schram_train","tal_train","d36_updated_train","d32_updated_train","N4_first_90_train"] #["N4_first_90", "d32_updated", "d36_updated", "tal", "schram"]#["N4_first_90_sample", "d32_updated_sample", "d36_updated_sample", "tal_sample", "schram_sample"]#["N4_first_90", "d32_updated", "d36_updated", "tal", "schram"] +fold_set = [10]#, 10] +_load_and_run(datasets=datasets, load_folds=False, fold_set=fold_set) +# + +# datasets = ["N4_first_90", "d32_updated", "d36_updated", "tal", "schram", "N4_first_90_train", "d32_updated_train", "d36_updated_train", "tal_train", "schram_train"] +# for dataset in datasets: +# file_path = "datasets/oneshot/PartionedDatasets/Original/" + dataset + ".xlsx" +# xls = pd.ExcelFile(file_path) +# for sheet in xls.sheet_names: +# #Get sheet from xlsx +# data = pd.read_excel(file_path, sheet_name=sheet) +# data_train, data_test = train_test_split(data, random_state=1, test_size=0.2) +# data_train.to_excel("datasets\\oneshot\\PartionedDatasets\\" + dataset + "_train.xlsx") +# data_test.to_excel("datasets\\oneshot\\PartionedDatasets\\" + dataset + "_test.xlsx") \ No newline at end of file diff --git a/OrdinalClassifier.py b/OrdinalClassifier.py index b9247bc..9b66b0e 100644 --- a/OrdinalClassifier.py +++ b/OrdinalClassifier.py @@ -16,7 +16,7 @@ def _transform_data(ordered_class, class_value, y): """" private function used to transform the data into len(ordered_classes)-1 derived datasets of binary classification problems returns a pair of (class_value, derived_y) """ - ordered_class = ordered_class.tolist() + ordered_class = ordered_class#.tolist() y_derived = [int(ordered_class.index(i) > ordered_class.index(class_value)) for i in y] return y_derived diff --git a/PersonalClassifier.py b/PersonalClassifier.py index 3dd9a1f..e9717db 100644 --- a/PersonalClassifier.py +++ b/PersonalClassifier.py @@ -19,6 +19,7 @@ class PersonalClassifier(with_metaclass(ABCMeta, BaseEnsemble, ClassifierMixin)) def __init__(self, id_index, + classes, base_classifier = RandomForestClassifier(n_estimators=40), n_upsample = 1, general_base_classifier = False): @@ -26,6 +27,7 @@ def __init__(self, self.id_index = id_index self.personal_classifiers = dict() self.n_upsample = n_upsample + self.classes = classes self.general_base_classifier = general_base_classifier @@ -45,14 +47,17 @@ def fit(self, X, y, sample_weight=None): X_v = X[X[:,self.id_index] == voter[1][0]] y_v = y[X[:,self.id_index] == voter[1][0]] combined = np.c_[X_v,y_v] - combined_upsample = resample(combined, replace=True, n_samples=self.n_upsample*X_v.shape[0], random_state=0) - X_v = combined_upsample[:,0:X_v.shape[1]] - y_v = combined_upsample[:,-1] - if self.general_base_classifier == True: - voter_classifier.partial_fit(X_v, y_v, [1,2,3]) + if self.n_upsample*(X_v.shape[0]) <= (X_v.shape[0]): + print("something's wrong!") else: - voter_classifier.fit(X_v, y_v) - self.personal_classifiers[voter[1][0]] = voter_classifier + combined_upsample = resample(combined, replace=True, n_samples=self.n_upsample*(X_v.shape[0]), random_state=0) + X_v = combined_upsample[:,0:X_v.shape[1]] + y_v = combined_upsample[:,-1] + if self.general_base_classifier == True: + voter_classifier.partial_fit(X_v, y_v, self.classes) + else: + voter_classifier.fit(X_v, y_v) + self.personal_classifiers[voter[1][0]] = voter_classifier return self def predict(self, X): diff --git a/tests.py b/tests.py new file mode 100644 index 0000000..94a5ac7 --- /dev/null +++ b/tests.py @@ -0,0 +1,128 @@ +import sklearn +from pandas import DataFrame +from sklearn import preprocessing +from sklearn.tree import DecisionTreeClassifier +from sklearn.svm import SVC +from OrdinalClassifier import OrdinalClassifier +import numpy as np + +def SVC_factory(): + return SVC(probability=True) + +def data_cleaning(data_df, le): + for c in data_df.columns: + if data_df[c].dtype in (float, int, np.int64, str, np.object): + data_df[c] = data_df[c].replace(to_replace=np.nan, value=0, regex=True) + data_df[c] = le.fit_transform(data_df[c]) + if data_df[c].dtype in (object, str): + for row in data_df[c]: + if isinstance(row, float): + data_df[c] = data_df[c].replace(to_replace=np.nan, value="Null", regex=True) + data_df[c] = data_df[c].astype("category") + data_df[c] = le.fit_transform(data_df[c]) + + return data_df + +def clean_data(X,le=sklearn.preprocessing.LabelEncoder()): + df = DataFrame(X, columns=None) + return data_cleaning(df, le).as_matrix() + +ordered_classes = ["NONE", "ARAD", "SILVER", "GOLD"] +# class_value = "milt" +# y = ["cold","cold","milt","cold","hot","milt","hot","hot"] +# X = [[1,1,1],[2,2,1],[2,1,2],[2,3,1],[3,2,3],[2,4,2],[3,3,3],[3,3,2]] + + +# headers = ["COUNTRY","SPORT","IS STUDENT"] +X = [ + [0, 0, 0], + [0, 1, 1], + [0, 1, 0], + [0, 2, 1], + [0, 2, 0], + [1, 0, 1], + [1, 0, 0], + [1, 1, 1], + [1, 2, 1], + [1, 2, 0], + [2, 0, 1], + [2, 0, 0], + [2, 1, 0], + [2, 2, 1], + [2, 2, 0] +] + +x_test = [ + [0, 0, 1], + [1, 1, 0], + [2, 1, 1], +] + +# X = [ +# ["ISR", "FOOTBALL", "NO"], +# ["ISR", "BASKETBALL", "YES"], +# ["ISR", "BASKETBALL", "NO"], +# ["ISR", "CHESS", "YES"], +# ["ISR", "CHESS", "NO"], +# ["USD", "FOOTBALL", "YES"], +# ["USD", "FOOTBALL", "NO"], +# ["USD", "BASKETBALL", "YES"], +# ["USD", "CHESS", "YES"], +# ["USD", "CHESS", "NO"], +# ["FRA", "FOOTBALL", "YES"], +# ["FRA", "FOOTBALL", "NO"], +# ["FRA", "BASKETBALL", "NO"], +# ["FRA", "CHESS", "YES"], +# ["FRA", "CHESS", "NO"] +# ] +# +y = ["NONE", + "NONE", + "ARAD", + "ARAD", + "GOLD", + "NONE", + "NONE", + "SILVER", + "ARAD", + "SILVER", + "ARAD", + "GOLD", + "ARAD", + "NONE", + "NONE"] + +# x_test = [ +# ["ISR", "FOOTBALL", "YES"], +# ["USD", "BASKETBALL", "NO"], +# ["FRA", "BASKETBALL", "YES"], +# ] +# clean_training = clean_data(X) +clean_training = X +# clean_test = clean_data(x_test) +clean_test = x_test + +classifier_tree = OrdinalClassifier(base_classifier=DecisionTreeClassifier, ordered_classes=ordered_classes) +classifier_svc = OrdinalClassifier(base_classifier=SVC_factory, ordered_classes=ordered_classes) + +classifier_tree.fit(clean_training, y) +classifier_svc.fit(clean_training, y) + +# x_test = [[2,2,1],[2,1,2],[3,3,3],[3,2,2],[1,2,1],[1,3,2]] + + +tree_prediciton_results = classifier_tree.predict(clean_test) +svc_prediciton_results = classifier_svc.predict(clean_test) + +print("TREE: %d", tree_prediciton_results) +print("SVC: %d", svc_prediciton_results) +print("Goal: None,Gold,None") + +print("TEST END") + +for i in range(0,len(ordered_classes)-1): + classifier_tree.print_tree(index=i,out_file="tree%d.dot"%i) + + + +