diff --git a/multi_participants/mind_reading_package/mind_reading.py b/multi_participants/mind_reading_package/mind_reading.py
index 6125b04..82663bb 100644
--- a/multi_participants/mind_reading_package/mind_reading.py
+++ b/multi_participants/mind_reading_package/mind_reading.py
@@ -11,6 +11,10 @@ from keras import backend as K
 from keras import Sequential
 from keras.layers import Dense
+from keras.layers import Dropout
+from sklearn.preprocessing import LabelEncoder
+from keras.utils import np_utils
+from keras.callbacks import EarlyStopping
 
 
 def load_data(file):
@@ -39,11 +43,11 @@ def concatenate_data(df1, df2):
 
 def find_trials(data):
     """This function locates every index which indicates the start of a new trial
-    Args:
-        data (DataFrame): concatenated data
+    Args:
+        data (DataFrame): concatenated data
 
-    Returns:
-        List of all indices that indicate a new trial
+    Returns:
+        List of all indices that indicate a new trial
     """
     # find every beginning point of a trial
     trial_idx = data.loc[data[73] == 1.0]
@@ -54,24 +58,49 @@ def find_trials(data):
 
 def find_markers(data):
     """This function locates every index which indicates the markers needed to create labels
-    Args:
-        data (DataFrame): concatenated data
+    Args:
+        data (DataFrame): concatenated data
 
-    Returns:
-        DataFrame of all indices that indicate a marker
+    Returns:
+        DataFrame of all indices that indicate a marker
     """
     # find every marker (label)
     markers_idx = data.loc[data[74] != 0.0]
 
     return markers_idx
 
 
+def create_ic_labels(data):
+    """This function creates a binary label column to append to DataFrame for classification
+    (dropping the left/right attribute, keeping only congruent vs incongruent)
+    Args:
+        data (DataFrame): concatenated data
+
+    Returns:
+        Pandas Series (column) of labels for supervised classification
+    """
+    markers_idx = data.loc[data[74] != 0.0]
+    labels = pd.Series(markers_idx[74], name='Labels').reset_index().drop(
+        'index', axis=1)
+    for i in labels.index:
+        # if congruent label, assign 0
+        if int(labels.iloc[i]) == 11 or int(labels.iloc[i]) == 22:
+            labels.iloc[i] = 0
+        else:
+            labels.iloc[i] = 1
+    return labels
+# 11 = left congruent
+# 22 = right congruent
+# 31 = left incongruent (left denotes direction of center arrow)
+# 42 = right incongruent
+
+
 def create_binary_labels(data):
     """This function creates a binary label column to append to DataFrame for classification
-    (removing congruent/incongruent attribute only left/right)
-    Args:
-        data (DataFrame): concatenated data
+    (dropping the congruent/incongruent attribute, keeping only left vs right)
+    Args:
+        data (DataFrame): concatenated data
 
-    Returns:
+    Returns:
         Pandas Series (column) of labels for supervised classification
     """
     markers_idx = data.loc[data[74] != 0.0]
@@ -85,14 +114,33 @@ def create_binary_labels(data):
     return labels
 
 
+def create_multi_labels(data):
+    """This function creates an integer-encoded label column to append to DataFrame
+    for multiclass classification
+    Args:
+        data (DataFrame): concatenated data
+
+    Returns:
+        Pandas Series (column) of labels for supervised classification
+    """
+    markers_idx = data.loc[data[74] != 0.0]
+    labels = pd.Series(markers_idx[74], name='Labels').reset_index().drop(
+        'index', axis=1)
+    encoder = LabelEncoder()
+    labels = encoder.fit_transform(labels.values.ravel())
+    labels = pd.DataFrame(labels, columns=['Labels'])
+
+    return labels
+
+
 def separate_trials(data, trials_index):
     """This function separates the data into the different trials.
-    Args:
-        data (DataFrame): concatenated data
-        trials_index (List): list of all indices that indicate a new trial
+    Args:
+        data (DataFrame): concatenated data
+        trials_index (List): list of all indices that indicate a new trial
 
-    Returns:
-        List of each trial stored as DataFrames
+    Returns:
+        List of each trial stored as DataFrames
     """
     # trials list to store every trial
     trials = []
@@ -112,17 +160,20 @@ def separate_trials(data, trials_index):
     return trials
 
 
-def process_trials(trials):
+def process_trials(trials, window_1, window_2):
     """This function goes through each trial, resets the columns to show sample rate,
-    gets data in the time window between 308th - 513th sample, and removes all channels from 64 on.
+    gets data in the time window between the window_1-th and window_2-th samples, removes all
+    channels from 64 on, then scales the data per channel for each trial.
     Args:
         trials (List): list of all trials separated previously
+        window_1 (Integer): lower sample-index bound of the window
+        window_2 (Integer): upper sample-index bound of the window
 
     Returns:
         List of each processed trial stored as DataFrames
     """
     # Go through each trial, reset the columns, we split from 100-300ms ((308th sample to 513th sample))
-
+    scaler = MinMaxScaler()
     # Processed trials: trials which have been processed to split between 100-300ms
     pro_trials = []
 
@@ -132,9 +183,16 @@ def process_trials(trials):
         # Resets the column numbers to allow easier slicing of samples
         tr_df.columns = range(tr_df.shape[1])
         # Slice each trial
-        tr_df = tr_df.loc[:, 308:513]
+        tr_df = tr_df.loc[:, window_1: window_2]
         # Remove all channels(rows) from 64 and up
        tr_df = tr_df.drop(tr_df.index[64:])
+        # Turn trial frame around to scale across columns
+        tr_df = tr_df.T
+        # Scale per column/channel
+        for column in list(tr_df.columns):
+            tr_df[column] = scaler.fit_transform(pd.DataFrame(tr_df[column]))
+        # Flip trial frame back to output with channels on axis=0
+        tr_df = tr_df.T
         # Append new/processed trials in list
         pro_trials.append(tr_df)
 
@@ -155,10 +213,13 @@ def process_trials(trials):
 
     for split_trial in range(len(pro_trials)):
         avg_trial = pro_trials[split_trial].mean(axis=1)
+        # Scale every average trial
         avg_trials.append(avg_trial)
 
     return avg_trials
 
+# Concatenate the label column with the avg_trials_df
+
 
 def create_ml_df(avg_trials, labels):
     """This function concatenates the average trials dataframe with labels to structure
@@ -179,6 +240,8 @@ def create_ml_df(avg_trials, labels):
     return ml_df
 
 
+# Splits data into train and test, scales data depending on parameter
+
 
 def prepare_ml_df(ml_df, scale=True):
     """This function preprocesses the machine learning dataframe by giving
@@ -190,8 +253,8 @@ def prepare_ml_df(ml_df, scale=True):
     Returns:
         DataFrame with machine learning structure
     """
-    # Separating the independent variables from the label
+
     if (scale == True):
         scaler = MinMaxScaler()
         X = ml_df.drop('Labels', axis=1)
@@ -210,7 +273,18 @@ def prepare_ml_df(ml_df, scale=True):
 
 def train_svc(X_train, X_test, y_train, y_test):
+    """This function trains an SVC classifier using grid search for hyperparameter tuning
+    in order to return the accuracy and precision.
+
+    Args:
+        X_train : training independent data
+        X_test : test independent data
+        y_train : training label/dependent data
+        y_test : test label/dependent data
+
+    Returns:
+        Accuracy and precision rates for the SVC
+    """
     # parameter grid
     param_grid = [{'C': [1, 10, 100, 1000], 'kernel': ['linear']}]
 
@@ -231,8 +305,52 @@ def train_svc(X_train, X_test, y_train, y_test):
     return accuracy, precision
 
 
+def train_svc_multi(X_train, X_test, y_train, y_test):
+    """This function trains an SVC classifier using grid search for hyperparameter tuning
+    in order to return the accuracy and precision.
+
+    Args:
+        X_train : training independent data
+        X_test : test independent data
+        y_train : training label/dependent data
+        y_test : test label/dependent data
+
+    Returns:
+        Accuracy and precision rates for the SVC
+    """
+    # parameter grid
+    param_grid = [{'C': [1, 10, 100, 1000], 'kernel': ['linear']}]
+
+    # Initializing the SVC Classifier
+    clf = SVC()
+
+    # Initialize grid search for hyperparameter tuning
+    gs_SVC = GridSearchCV(clf, param_grid, cv=5)
+    gs_SVC.fit(X_train, y_train)
+
+    # Predict using the fitted model
+    y_pred = gs_SVC.predict(X_test)
+
+    # return accuracy and precision
+    accuracy = accuracy_score(y_test, y_pred)
+    precision = precision_score(y_test, y_pred, average='weighted')
+
+    return accuracy, precision
+
+
 def train_dtc(X_train, X_test, y_train, y_test):
+    """This function trains a Decision Tree classifier using grid search for hyperparameter tuning
+    in order to return the accuracy and precision.
+
+    Args:
+        X_train : training independent data
+        X_test : test independent data
+        y_train : training label/dependent data
+        y_test : test label/dependent data
+    Returns:
+        Accuracy and precision rates for the Decision Tree classifier
+    """
     # parameter grid
     params = {'max_leaf_nodes': list(
         range(2, 100)), 'min_samples_split': [2, 3, 4]}
@@ -254,7 +372,52 @@ def train_dtc(X_train, X_test, y_train, y_test):
     return accuracy, precision
 
 
+def train_dtc_multi(X_train, X_test, y_train, y_test):
+    """This function trains a Decision Tree classifier using grid search for hyperparameter tuning
+    in order to return the accuracy and precision.
+
+    Args:
+        X_train : training independent data
+        X_test : test independent data
+        y_train : training label/dependent data
+        y_test : test label/dependent data
+
+    Returns:
+        Accuracy and precision rates for the Decision Tree classifier
+    """
+    # parameter grid
+    params = {'max_leaf_nodes': list(
+        range(2, 100)), 'min_samples_split': [2, 3, 4]}
+
+    # Initializing classifier
+    dtc = DecisionTreeClassifier(random_state=42)
+
+    # Initialize grid search for hyperparameter tuning
+    gs_DTC = GridSearchCV(dtc, params, verbose=1, cv=5)
+    gs_DTC.fit(X_train, y_train)
+
+    # Predict using the fitted model
+    y_pred = gs_DTC.predict(X_test)
+
+    # return accuracy and precision
+    accuracy = accuracy_score(y_test, y_pred)
+    precision = precision_score(y_test, y_pred, average='weighted')
+
+    return accuracy, precision
+
+
 def train_nb(X_train, X_test, y_train, y_test):
+    """This function trains a Naive Bayes classifier in order to return the accuracy and precision.
+
+    Args:
+        X_train : training independent data
+        X_test : test independent data
+        y_train : training label/dependent data
+        y_test : test label/dependent data
+
+    Returns:
+        Accuracy and precision rates for the Naive Bayes classifier
+    """
     # Initialize classifier
     nb = GaussianNB()
 
@@ -270,7 +433,42 @@ def train_nb(X_train, X_test, y_train, y_test):
     return accuracy, precision
 
 
+def train_nb_multi(X_train, X_test, y_train, y_test):
+    """This function trains a Naive Bayes classifier in order to return the accuracy and precision.
+
+    Args:
+        X_train : training independent data
+        X_test : test independent data
+        y_train : training label/dependent data
+        y_test : test label/dependent data
+
+    Returns:
+        Accuracy and precision rates for the Naive Bayes classifier
+    """
+    # Initialize classifier
+    nb = GaussianNB()
+
+    nb.fit(X_train, y_train)
+
+    # Predict using the fitted model
+    y_pred = nb.predict(X_test)
+
+    # return accuracy and precision
+    accuracy = accuracy_score(y_test, y_pred)
+    precision = precision_score(y_test, y_pred, average='weighted')
+
+    return accuracy, precision
+
+
 def precision_m(y_true, y_pred):
+    """This function computes the precision of the keras classifier.
+    Args:
+        y_true : true numeric labels
+        y_pred : predicted numeric labels
+
+    Returns:
+        Precision rate of the keras classifier
+    """
     true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
     predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
     precision = true_positives / (predicted_positives + K.epsilon())
@@ -278,7 +476,18 @@ def precision_m(y_true, y_pred):
 
 def train_nn(n_inputs, X_train, X_test, y_train, y_test):
+    """This function uses a deep neural network for classification.
+
+    Args:
+        n_inputs : number of inputs
+        X_train : training independent data
+        X_test : test independent data
+        y_train : training label/dependent data
+        y_test : test label/dependent data
+    Returns:
+        Accuracy and precision rates for the deep neural network classifier
+    """
     classifier = Sequential()
     # First Hidden Layer
     classifier.add(Dense(4, activation='relu',
@@ -295,21 +504,52 @@ def train_nn(n_inputs, X_train, X_test, y_train, y_test):
                    'acc', precision_m])
 
     # Fitting the data to the training dataset
-    classifier.fit(X_train, y_train, batch_size=10, epochs=1000)
+    classifier.fit(X_train, y_train, batch_size=2, epochs=700)
 
     _, accuracy, precision = classifier.evaluate(X_test, y_test, verbose=0)
 
     return accuracy, precision
 
 
-def create_metric_df(acc_list, prec_list, model_list):
+def train_nn_multi(n_inputs, X_train, X_test, y_train, y_test):
+    """This function uses a deep neural network for multiclass classification.
+
+    Args:
+        n_inputs : number of inputs
+        X_train : training independent data
+        X_test : test independent data
+        y_train : training label/dependent data
+        y_test : test label/dependent data
+
+    Returns:
+        Accuracy and precision rates for the deep neural network classifier
+    """
+
+    model = Sequential()
+    # Rectified Linear Unit Activation Function
+    model.add(Dense(15, input_dim=n_inputs, activation='relu'))
+    model.add(Dropout(0.4))
+    model.add(Dense(15, activation='relu'))
+    model.add(Dropout(0.4))
+    # Softmax for multi-class classification
+    model.add(Dense(4, activation='softmax'))
+
+    # Change the labels to categorical for multiclass
+    y_train = np_utils.to_categorical(y_train)
+    y_test = np_utils.to_categorical(y_test)
 
-    metrics = [acc_list, prec_list]
-    metric_df = pd.DataFrame(metrics).T
-    metric_df.index = model_list
-    metric_df.columns = ['acc', 'prec']
+    # Compile model here
+    model.compile(loss='categorical_crossentropy',
+                  optimizer='adam', metrics=['acc', precision_m])
+
+    callback = EarlyStopping(monitor='val_loss', patience=50)
+    model.fit(X_train, y_train, validation_data=(X_test, y_test),
+              batch_size=2, epochs=500, callbacks=[callback])
+
+    _, accuracy, precision = model.evaluate(X_test, y_test, verbose=0)
+
+    return accuracy, precision
 
-    return metric_df
 
 def res_df(df, column, participant):
     '''
@@ -324,4 +564,4 @@ def res_df(df, column, participant):
     data = pd.DataFrame({f"Participant {participant}": column})
     df[f"Participant {participant}"] = data[f"Participant {participant}"].values
 
-    return df
\ No newline at end of file
+    return df
diff --git a/multi_participants/mind_reading_package/mind_reading_v2.py b/multi_participants/mind_reading_package/mind_reading_v2.py
index 989e8ca..82663bb 100644
--- a/multi_participants/mind_reading_package/mind_reading_v2.py
+++ b/multi_participants/mind_reading_package/mind_reading_v2.py
@@ -69,6 +69,31 @@ def find_markers(data):
     return markers_idx
 
 
+def create_ic_labels(data):
+    """This function creates a binary label column to append to DataFrame for classification
+    (dropping the left/right attribute, keeping only congruent vs incongruent)
+    Args:
+        data (DataFrame): concatenated data
+
+    Returns:
+        Pandas Series (column) of labels for supervised classification
+    """
+    markers_idx = data.loc[data[74] != 0.0]
+    labels = pd.Series(markers_idx[74], name='Labels').reset_index().drop(
+        'index', axis=1)
+    for i in labels.index:
+        # if congruent label, assign 0
+        if int(labels.iloc[i]) == 11 or int(labels.iloc[i]) == 22:
+            labels.iloc[i] = 0
+        else:
+            labels.iloc[i] = 1
+    return labels
+# 11 = left congruent
+# 22 = right congruent
+# 31 = left incongruent (left denotes direction of center arrow)
+# 42 = right incongruent
+
+
 def create_binary_labels(data):
     """This function creates a binary label column to append to DataFrame for classification
     (dropping the congruent/incongruent attribute, keeping only left vs right)
@@ -135,17 +160,20 @@ def separate_trials(data, trials_index):
     return trials
 
 
-def process_trials(trials):
+def process_trials(trials, window_1, window_2):
     """This function goes through each trial, resets the columns to show sample rate,
-    gets data in the time window between 308th - 513th sample, and removes all channels from 64 on.
+    gets data in the time window between the window_1-th and window_2-th samples, removes all
+    channels from 64 on, then scales the data per channel for each trial.
     Args:
         trials (List): list of all trials separated previously
+        window_1 (Integer): lower sample-index bound of the window
+        window_2 (Integer): upper sample-index bound of the window
 
     Returns:
         List of each processed trial stored as DataFrames
     """
     # Go through each trial, reset the columns, we split from 100-300ms ((308th sample to 513th sample))
-
+    scaler = MinMaxScaler()
     # Processed trials: trials which have been processed to split between 100-300ms
     pro_trials = []
 
@@ -155,9 +183,16 @@ def process_trials(trials):
         # Resets the column numbers to allow easier slicing of samples
         tr_df.columns = range(tr_df.shape[1])
         # Slice each trial
-        tr_df = tr_df.loc[:, 308:513]
+        tr_df = tr_df.loc[:, window_1: window_2]
         # Remove all channels(rows) from 64 and up
         tr_df = tr_df.drop(tr_df.index[64:])
+        # Turn trial frame around to scale across columns
+        tr_df = tr_df.T
+        # Scale per column/channel
+        for column in list(tr_df.columns):
+            tr_df[column] = scaler.fit_transform(pd.DataFrame(tr_df[column]))
+        # Flip trial frame back to output with channels on axis=0
+        tr_df = tr_df.T
         # Append new/processed trials in list
         pro_trials.append(tr_df)
 
@@ -178,12 +213,14 @@ def process_trials(trials):
 
     for split_trial in range(len(pro_trials)):
         avg_trial = pro_trials[split_trial].mean(axis=1)
-        #Scale every average trial
+        # Scale every average trial
         avg_trials.append(avg_trial)
 
     return avg_trials
 
 # Concatenate the label column with the avg_trials_df
+
+
 def create_ml_df(avg_trials, labels):
     """This function concatenates the average trials dataframe with labels to structure
     dataframe in format to allow machine learning classification.
@@ -238,7 +275,7 @@ def prepare_ml_df(ml_df, scale=True):
 
 def train_svc(X_train, X_test, y_train, y_test):
     """This function trains an SVC classifier using grid search for hyperparameter tuning
     in order to return the accuracy and precision.
-    
+
     Args:
         X_train : training independent data
         X_test : test independent data
@@ -271,7 +308,7 @@ def train_svc(X_train, X_test, y_train, y_test):
 
 def train_svc_multi(X_train, X_test, y_train, y_test):
     """This function trains an SVC classifier using grid search for hyperparameter tuning
     in order to return the accuracy and precision.
-    
+
     Args:
         X_train : training independent data
         X_test : test independent data
@@ -304,7 +341,7 @@ def train_svc_multi(X_train, X_test, y_train, y_test):
 
 def train_dtc(X_train, X_test, y_train, y_test):
     """This function trains a Decision Tree classifier using grid search for hyperparameter tuning
     in order to return the accuracy and precision.
-    
+
     Args:
         X_train : training independent data
         X_test : test independent data
@@ -338,7 +375,7 @@ def train_dtc(X_train, X_test, y_train, y_test):
 
 def train_dtc_multi(X_train, X_test, y_train, y_test):
     """This function trains a Decision Tree classifier using grid search for hyperparameter tuning
     in order to return the accuracy and precision.
-    
+
     Args:
         X_train : training independent data
         X_test : test independent data
@@ -370,17 +407,17 @@ def train_dtc_multi(X_train, X_test, y_train, y_test):
 
 def train_nb(X_train, X_test, y_train, y_test):
-    """This function trains a Naive Bayes classifier in order to return the accuracy and precision.
-
-    Args:
-        X_train : trained independent data
-        X_test : test independent data
-        y_train : trained label/dependent data
-        y_test : test label/dependent data
+    """This function trains a Naive Bayes classifier in order to return the accuracy and precision.
-    Returns:
-        Accuracy and precision rates for the Naive Bayes classifier
-    """
+
+    Args:
+        X_train : training independent data
+        X_test : test independent data
+        y_train : training label/dependent data
+        y_test : test label/dependent data
+
+    Returns:
+        Accuracy and precision rates for the Naive Bayes classifier
+    """
     # Initialize classifier
     nb = GaussianNB()
 
@@ -398,7 +435,7 @@ def train_nb(X_train, X_test, y_train, y_test):
 
 def train_nb_multi(X_train, X_test, y_train, y_test):
     """This function trains a Naive Bayes classifier in order to return the accuracy and precision.
-    
+
     Args:
         X_train : training independent data
         X_test : test independent data
@@ -487,7 +524,7 @@ def train_nn_multi(n_inputs, X_train, X_test, y_train, y_test):
     Returns:
         Accuracy and precision rates for the deep neural network classifier
     """
-    
+
     model = Sequential()
     # Rectified Linear Unit Activation Function
     model.add(Dense(15, input_dim=n_inputs, activation='relu'))
@@ -513,6 +550,7 @@ def train_nn_multi(n_inputs, X_train, X_test, y_train, y_test):
 
     return accuracy, precision
 
+
 def res_df(df, column, participant):
     '''
     Add precision/accuracy for every participant to the whole results
@@ -526,4 +564,4 @@ def res_df(df, column, participant):
     data = pd.DataFrame({f"Participant {participant}": column})
     df[f"Participant {participant}"] = data[f"Participant {participant}"].values
 
-    return df
\ No newline at end of file
+    return df
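For context, below is a minimal usage sketch of the pipeline these diffs assemble, for one participant and the binary left/right task. It is a sketch under stated assumptions, not part of the patch: the input file names are placeholders, the module is assumed importable as mind_reading, and prepare_ml_df is assumed to return an sklearn-style X_train, X_test, y_train, y_test split (which matches how the train_* helpers are called). The 308/513 window bounds are the module's own 100-300 ms defaults.

# Hypothetical end-to-end run of the mind_reading pipeline.
import mind_reading as mr

# Load and join two recordings for one participant (placeholder file names)
data = mr.concatenate_data(mr.load_data('p1_run1.csv'), mr.load_data('p1_run2.csv'))

# Trial onsets come from column 73; labels come from the marker column 74
trials_index = mr.find_trials(data)
labels = mr.create_binary_labels(data)   # left vs right
# labels = mr.create_ic_labels(data)     # congruent vs incongruent
# labels = mr.create_multi_labels(data)  # all four marker classes

# Cut the stream into trials, keep samples 308-513 (~100-300 ms post-stimulus),
# drop channels 64 and up, and min-max scale each channel within each trial
trials = mr.separate_trials(data, trials_index)
pro_trials = mr.process_trials(trials, 308, 513)

# Average each trial over time, attach the labels, and split/scale for the classifiers
ml_df = mr.create_ml_df(mr.average_trials(pro_trials), labels)
X_train, X_test, y_train, y_test = mr.prepare_ml_df(ml_df, scale=True)

# Every train_* helper returns (accuracy, precision)
accuracy, precision = mr.train_svc(X_train, X_test, y_train, y_test)
print(f"SVC: accuracy={accuracy:.3f}, precision={precision:.3f}")

For the four-class task, swap in create_multi_labels and the *_multi trainers; note that train_nn_multi one-hot encodes the labels itself via np_utils.to_categorical, so it takes the integer-encoded column directly.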