diff --git a/.gitignore b/.gitignore
index 4fe1c3f..3c3aa94 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
 .DS_Store
 config.yaml
+figures/
\ No newline at end of file
diff --git a/README.md b/README.md
index b7df1b0..6d9f611 100644
--- a/README.md
+++ b/README.md
@@ -6,7 +6,7 @@ NLP tools to analyse twitter for different user groups and perform topic modelli
 ## JH:
 - Data preprocessing:
   - cleaning hashtag lists
-  - write script to format csv's that Nisha has generated into format that the preprocessing code can use for wordcloud generation
+  - ~~write script to format csv's that Nisha has generated into format that the preprocessing code can use for wordcloud generation~~
   - find a list of English words to use for tweet filtering
 - Data analysis:
   - wordcloud related stuff for hashtags and tweets --> specifically do a wordcloud of the text of the tweets
diff --git a/ScrapingTwitterRealTime/datasets/All_Headers/brexitday/scrape_data_2020-01-30-2020-01-31.tar.gz b/ScrapingTwitterRealTime/datasets/All_Headers/brexitday/scrape_data_2020-01-30-2020-01-31.tar.gz
new file mode 100644
index 0000000..5f7092e
Binary files /dev/null and b/ScrapingTwitterRealTime/datasets/All_Headers/brexitday/scrape_data_2020-01-30-2020-01-31.tar.gz differ
diff --git a/ScrapingTwitterRealTime/datasets/All_Headers/brexitday/scrape_data_2020-01-31-2020-02-01.tar.gz b/ScrapingTwitterRealTime/datasets/All_Headers/brexitday/scrape_data_2020-01-31-2020-02-01.tar.gz
new file mode 100644
index 0000000..af3a067
Binary files /dev/null and b/ScrapingTwitterRealTime/datasets/All_Headers/brexitday/scrape_data_2020-01-31-2020-02-01.tar.gz differ
diff --git a/ScrapingTwitterRealTime/datasets/All_Headers/brexitday/scrape_data_2020-02-01-2020-02-02.tar.gz b/ScrapingTwitterRealTime/datasets/All_Headers/brexitday/scrape_data_2020-02-01-2020-02-02.tar.gz
new file mode 100644
index 0000000..b65020a
Binary files /dev/null and b/ScrapingTwitterRealTime/datasets/All_Headers/brexitday/scrape_data_2020-02-01-2020-02-02.tar.gz differ
diff --git a/ScrapingTwitterRealTime/datasets/All_Headers/brexitday/scrape_data_2020-02-02-2020-02-03.tar.gz b/ScrapingTwitterRealTime/datasets/All_Headers/brexitday/scrape_data_2020-02-02-2020-02-03.tar.gz
new file mode 100644
index 0000000..f9a4251
Binary files /dev/null and b/ScrapingTwitterRealTime/datasets/All_Headers/brexitday/scrape_data_2020-02-02-2020-02-03.tar.gz differ
diff --git a/ScrapingTwitterRealTime/datasets/All_Headers/brexitday/scrape_data_2020-02-03-2020-02-04.tar.gz b/ScrapingTwitterRealTime/datasets/All_Headers/brexitday/scrape_data_2020-02-03-2020-02-04.tar.gz
new file mode 100644
index 0000000..701c38b
Binary files /dev/null and b/ScrapingTwitterRealTime/datasets/All_Headers/brexitday/scrape_data_2020-02-03-2020-02-04.tar.gz differ
diff --git a/ScrapingTwitterRealTime/datasets/All_Headers/brexitday/scrape_data_2020-02-04-2020-02-05.tar.gz b/ScrapingTwitterRealTime/datasets/All_Headers/brexitday/scrape_data_2020-02-04-2020-02-05.tar.gz
new file mode 100644
index 0000000..a33d683
Binary files /dev/null and b/ScrapingTwitterRealTime/datasets/All_Headers/brexitday/scrape_data_2020-02-04-2020-02-05.tar.gz differ
diff --git a/classifier/Generate_training_data_for_user_classification.py b/classifier/Generate_training_data_for_user_classification.py
new file mode 100644
index 0000000..27a4de6
--- /dev/null
+++ b/classifier/Generate_training_data_for_user_classification.py
@@ -0,0 +1,196 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Feb 10 01:46:24 2020
+
+@author: I Kit Cheng
+"""
+
+# In[]:
+
+# Generate features from training data
+from ScrapeTwitterTimeline_FeatureExtraction import main
+import pandas as pd
+import numpy as np
+import seaborn as sns
+import matplotlib.pyplot as plt
+
+
+def relabel_dataset(df):
+    """
+
+    Parameters
+    ----------
+    df : pandas.core.frame.DataFrame
+        Dataframe.
+
+    Returns
+    -------
+    df_labels : pandas.core.frame.DataFrame
+        Extracted labels.
+
+    """
+
+    # Label Distribution
+    print('\nLabel Distribution:')
+    print(df.gender.value_counts())
+
+    # Drop rows with gender = nan
+    df = df.dropna(subset=['gender'])
+
+    # Remove individuals with unknown label
+    df = df[df.gender != 'unknown']
+
+    print(f'\nClean df length: {len(df)}')
+
+    # Label Distribution (clean)
+    print('\nLabel Distribution:')
+    print(df.gender.value_counts())
+
+    # Combine 'male' and 'female' labels to 0, and relabel 'brand' to 1
+    df_labels = pd.DataFrame([0 if (x == 'female' or x == 'male')
+                              else 1 for x in df.gender], columns=['labels'],
+                             index=df.name)
+
+    return df_labels
+
+# In[]:
+
+
+def bool2int(df, columns):
+    """
+
+    Parameters
+    ----------
+    df : pandas.core.frame.DataFrame
+        Dataframe with boolean columns.
+    columns : list
+        Column names with boolean data.
+
+    Returns
+    -------
+    df : pandas.core.frame.DataFrame
+        Dataframe without boolean data (converted to binary 0 or 1).
+
+    """
+    print('\nChanging boolean data to 0 or 1.')
+    for col in columns:
+        df[col] = df[col].astype(int)
+    return df
+
+
+def matching_labels_to_new_features(df):
+    """
+
+    Parameters
+    ----------
+    df : pandas.core.frame.DataFrame
+        Dataframe without labels.
+
+    Returns
+    -------
+    df : pandas.core.frame.DataFrame
+        Dataframe with labels.
+
+    """
+    print('Matching labels to new features dataframe.')
+    # Adding the corresponding label to the feature dataset
+    # (relies on the module-level df_labels created in __main__)
+    labels_for_sample = []
+    for i, v in enumerate(df.index.to_list()):
+        if len(df_labels.loc[v]) > 1:  # duplicate usernames: take the first label
+            labels_for_sample.append(df_labels.loc[v].iloc[0][0])
+        else:
+            labels_for_sample.append(df_labels.loc[v].iloc[0])
+
+    df.index.names = ['username']  # name the index column
+    df['labels'] = labels_for_sample
+    df.to_csv('user_features_labels.csv')
+    return df
+
+
+# In[]:
+
+########################### plot distribution of each variable ###########################
+def plotDist(save=False):
+    """
+
+    Parameters
+    ----------
+    save : bool, optional
+        Save plot option. The default is False.
+
+    Returns
+    -------
+    None.
+
+    """
+    for i, col in enumerate(df.columns[1:]):
+        print(col)
+        plt.figure()
+        try:
+            ax = sns.kdeplot(df[col])
+            ax.get_legend().remove()
+        except RuntimeError:
+            df[col].hist()
+        plt.title(col)
+        if save:  # save before closing, otherwise an empty figure is written
+            plt.savefig('dist_' + col + '.png')
+        plt.close()
+
+# plotDist()
+# In[]:
+########################### Dealing with missing data ###########################
+
+from sklearn.impute import SimpleImputer
+
+# Replace numerical nans with median (the median is less sensitive to outliers)
+def replaceNans(df, strategy='median'):
+    """
+
+    Parameters
+    ----------
+    df : pandas.core.frame.DataFrame
+        A dataframe (rows are examples and columns are features).
+    strategy : string, optional
+        Replace nans with specified strategy. The default is 'median'.
+        Options are 'mean', 'median', 'most_frequent', 'constant'.
+
+    Returns
+    -------
+    df : pandas.core.frame.DataFrame
+        A dataframe without numerical nans.
+
+    """
+    print(f'Replacing Nans with {strategy}.')
+    imputer = SimpleImputer(missing_values=np.nan, strategy=strategy)
+    for i, col in enumerate(df.columns[1:-1]):
+        if len(df[col].unique()) == 2:  # categorical (binary) data
+            continue
+        else:
+            df[[col]] = imputer.fit_transform(df[[col]])
+    return df
+
+# In[]:
+if __name__ == '__main__':
+    # Set random seed to ensure reproducible runs
+    RSEED = 50
+
+    print('\n################# Begin Scraping User Timeline: #####################')
+
+    # Optionally subsample the data to speed up training (sampling disabled here).
+    df = pd.read_csv('../Datasets/gender-classifier.csv', encoding="ISO-8859-1")  # .sample(10, random_state=RSEED)
+    users = df.name.to_list()
+    df_labels = relabel_dataset(df)
+
+    scrape = False
+    if scrape:
+        df = main(users, N=200)  # saves features in user_features.csv
+
+    df = pd.read_csv('user_features.csv', index_col=0)
+
+    df = bool2int(df, ['geo', 'location', 'url', 'description', 'verified'])
+    matching_labels_to_new_features(df)
+
+    df = pd.read_csv('user_features_labels.csv', index_col=0)  # training data (unclean)
+    df.index.name = 'username'
+    df = replaceNans(df)
+    df.to_csv('user_features_labels_noNan.csv')
+    print('___________________Done cleaning!_________________')
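Note on the cleaning helpers above: `bool2int` plus `replaceNans` is the whole numeric-cleaning story, and the median imputation can be sanity-checked on a toy frame. A minimal sketch (the toy values and column names are illustrative only, not project data):

    import numpy as np
    import pandas as pd
    from sklearn.impute import SimpleImputer

    toy = pd.DataFrame({'nFollowers': [10, np.nan, 300], 'verified': [True, False, True]})
    toy['verified'] = toy['verified'].astype(int)          # what bool2int does per column
    imputer = SimpleImputer(missing_values=np.nan, strategy='median')
    toy[['nFollowers']] = imputer.fit_transform(toy[['nFollowers']])
    print(toy)  # the NaN becomes 155.0, the median of 10 and 300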
+ + """ + print(f'Replacing Nans with {strategy}.') + imputer = SimpleImputer(missing_values=np.nan, strategy=strategy) + for i, col in enumerate(df.columns[1:-1]): + if len(df[col].unique()) == 2: # categorical (binary) data + continue + else: + df[[col]] = imputer.fit_transform(df[[col]]) + return df + +# In[]: +if __name__ == '__main__': + # Set random seed to ensure reproducible runs + RSEED = 50 + + print('\n################# Begin Scraping User Timeline: #####################') + + # We'll limit the data to 1000 individuals to speed up training. + df = pd.read_csv('../Datasets/gender-classifier.csv', encoding = "ISO-8859-1")#.sample(10, random_state = RSEED) + users = df.name.to_list() + df_labels = relabel_dataset(df) + + scrape = False + if scrape: + df = main(users, N=200) # saves features in users_features.csv + + df = pd.read_csv('users_features.csv', index_col=0) + + df = bool2int(df, ['geo', 'location', 'url', 'description', 'verified']) + matching_labels_to_new_features(df) + + df = pd.read_csv('user_features_labels.csv', index_col=0) # training data (unclean) + df.index.name = 'username' + df = replaceNans(df) + df.to_csv('user_features_labels_noNan.csv') + print('___________________Done cleaning!_________________') diff --git a/classifier/Random_Forest_Classifier.py b/classifier/Random_Forest_Classifier.py new file mode 100644 index 0000000..8ff9294 --- /dev/null +++ b/classifier/Random_Forest_Classifier.py @@ -0,0 +1,204 @@ +# -*- coding: utf-8 -*- +""" +Created on Tue Feb 11 10:19:29 2020 + +@author: I Kit Cheng +""" + +# In[]: + +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt +from sklearn.model_selection import train_test_split +from sklearn.metrics import confusion_matrix +from sklearn.tree import DecisionTreeClassifier +from sklearn.ensemble import RandomForestClassifier +from sklearn.metrics import precision_score, recall_score, roc_auc_score, roc_curve +from sklearn.model_selection import RandomizedSearchCV +from sklearn.metrics import f1_score +import itertools +from collections import Counter +from imblearn.over_sampling import SMOTE + +# In[]: + +# Set random seed to ensure reproducible runs +RSEED = 50 + +# Dataset +df = pd.read_csv('../Datasets/user_classification/ind_vs_bot/dataset3/user_features_labels_noNan_dropped_fav_retweet_cols_combined_d1d2.csv', index_col=0) + + + +print('\n _______________________ Split Data into Training and Testing Set __________________________') +# Extract the labels +labels = np.array(df.pop('labels')) + +# 30% examples in test data +train, test, train_labels, test_labels = train_test_split(df, labels, + stratify = labels, + test_size = 0.3, + random_state = RSEED) + +# In[]: +# Features for feature importances +features = list(train.columns) + +print(f'Train data shape: {train.shape}') +print(f'Test data shape: {test.shape}') + +# In[]: + +print('\n _______________________ Evaluate the Decision Tree _______________________') +def evaluate_model(predictions, probs, train_predictions, train_probs, smote=False): + """Compare machine learning model to baseline performance. 
+
+# In[]:
+
+print('\n _______________________ Confusion matrix _______________________')
+
+
+def plot_confusion_matrix(cm, classes,
+                          normalize=False,
+                          title='Confusion matrix',
+                          cmap=plt.cm.Oranges):
+    """
+    This function prints and plots the confusion matrix.
+    Normalization can be applied by setting `normalize=True`.
+    Source: http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
+    """
+    if normalize:
+        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
+        print("Normalized confusion matrix")
+    else:
+        print('Confusion matrix, without normalization')
+
+    print(cm)
+
+    plt.figure(figsize=(10, 10))
+    plt.imshow(cm, interpolation='nearest', cmap=cmap)
+    plt.title(title, size=24)
+    plt.colorbar(aspect=4)
+    tick_marks = np.arange(len(classes))
+    plt.xticks(tick_marks, classes, rotation=45, size=14)
+    plt.yticks(tick_marks, classes, size=14)
+
+    fmt = '.2f' if normalize else 'd'
+    thresh = cm.max() / 2.
+
+    # Labeling the plot
+    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
+        plt.text(j, i, format(cm[i, j], fmt), fontsize=20,
+                 horizontalalignment="center",
+                 color="white" if cm[i, j] > thresh else "black")
+
+    plt.grid(None)
+    plt.tight_layout()
+    plt.ylabel('True label', size=18)
+    plt.xlabel('Predicted label', size=18)
+
+# In[]:
+print('\n######################## Random Forest ########################')
+
+print('\n _______________________ Training 100 Trees _______________________')
+
+# Create the model with 100 trees
+model = RandomForestClassifier(n_estimators=100,
+                               random_state=RSEED,
+                               max_features='sqrt',
+                               n_jobs=-1, verbose=1)
+
+# Fit on training data
+model.fit(train, train_labels)
+
+# In[]:
+n_nodes = []
+max_depths = []
+
+for ind_tree in model.estimators_:
+    n_nodes.append(ind_tree.tree_.node_count)
+    max_depths.append(ind_tree.tree_.max_depth)
+
+print(f'Average number of nodes {int(np.mean(n_nodes))}')
+print(f'Average maximum depth {int(np.mean(max_depths))}')
+
+# In[]:
+print('\n _______________________ Assess Random Forest Performance _______________________')
+train_rf_predictions = model.predict(train)
+train_rf_probs = model.predict_proba(train)[:, 1]
+
+rf_predictions = model.predict(test)
+rf_probs = model.predict_proba(test)[:, 1]
+
+evaluate_model(rf_predictions, rf_probs, train_rf_predictions, train_rf_probs)
+plt.savefig('ROC_curve_rf.png')
+plt.close()
+
+print('\n _______________________ Confusion matrix _______________________')
+cm = confusion_matrix(test_labels, rf_predictions)
+plot_confusion_matrix(cm, classes=['Individual', 'Bot'],
+                      title='User Confusion Matrix')
+plt.savefig('Confusion_matrix_rf.png')
+plt.close()
+
+print(f"f1score = {f1_score(test_labels, rf_predictions, average='binary')}")  # labels binary
+print(f1_score(test_labels, rf_predictions, average='micro'))  # global metric
+print(f'TPR = {cm[1][1]/(sum(cm[1])):.2f} (Predicting correctly a user is bot)')
+print(f'TNR = {cm[0][0]/(sum(cm[0])):.2f} (Predicting correctly a user is individual)')
+# In[]:
+print('\n _______________________ Feature Importances _______________________')
+fi_model = pd.DataFrame({'feature': features,
+                         'importance': model.feature_importances_}).\
+    sort_values('importance', ascending=False)
+print(fi_model.head(10))
+
+# In[]:
+print('\n _______________________ Testing on another dataset _______________________')
+df1 = pd.read_csv('../Datasets/user_classification/ind_vs_bot/dataset1/user_features_labels_noNan_dropped_fav_retweet_cols.csv', index_col=0)
+labels1 = np.array(df1.pop('labels'))
+rf_predictions1 = model.predict(df1)
+cm = confusion_matrix(labels1, rf_predictions1)
+plot_confusion_matrix(cm, classes=['Individual', 'Bot'],
+                      title='User Confusion Matrix')
+plt.savefig('Confusion_matrix_rf1.png')
+plt.close()
\ No newline at end of file
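Note: `RandomizedSearchCV` is imported above but never used. If tuning were wanted, a minimal sketch of how it could wrap the same forest (the search space here is an assumption, not part of the script):

    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import RandomizedSearchCV

    param_dist = {'n_estimators': [100, 200, 400],   # assumed grid
                  'max_depth': [None, 10, 20],
                  'max_features': ['sqrt', 'log2']}
    search = RandomizedSearchCV(RandomForestClassifier(random_state=RSEED),
                                param_dist, n_iter=10, scoring='roc_auc',
                                cv=3, random_state=RSEED)
    search.fit(train, train_labels)  # train/train_labels as defined above
    print(search.best_params_)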
diff --git a/classifier/Random_Forest_Classifier_SMOTE.py b/classifier/Random_Forest_Classifier_SMOTE.py
new file mode 100644
index 0000000..773ac2d
--- /dev/null
+++ b/classifier/Random_Forest_Classifier_SMOTE.py
@@ -0,0 +1,262 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Feb 11 10:19:29 2020
+
+@author: I Kit Cheng
+"""
+
+# In[]:
+
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import confusion_matrix
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.metrics import precision_score, recall_score, roc_auc_score, roc_curve
+from sklearn.model_selection import RandomizedSearchCV
+from sklearn.metrics import f1_score
+import itertools
+from collections import Counter
+from imblearn.over_sampling import SMOTE
+import pickle
+
+# In[]:
+
+# Set random seed to ensure reproducible runs
+RSEED = 50
+
+# Dataset
+df = pd.read_csv('../../Datasets/user_classification/ind_vs_org/dataset2/user_features_labels_noNan_dropped_fav_retweet_cols.csv', index_col=0)
+
+print('\n _______________________ Split Data into Training and Testing Set __________________________')
+# Extract the labels
+labels = np.array(df.pop('labels'))
+
+# 30% examples in test data
+train, test, train_labels, test_labels = train_test_split(df, labels,
+                                                          stratify=labels,
+                                                          test_size=0.3,
+                                                          random_state=RSEED)
+
+# In[]:
+# Features for feature importances
+features = list(train.columns)
+
+print(f'Train data shape: {train.shape}')
+print(f'Test data shape: {test.shape}')
+
+# In[]:
+
+print('\n _______________________ Define Model Evaluation Function _______________________')
+def evaluate_model(predictions, probs, train_predictions, train_probs, smote=False):
+    """Compare machine learning model to baseline performance.
+    Computes statistics and shows ROC curve."""
+
+    baseline = {}
+    baseline['recall'] = recall_score(test_labels, [1 for _ in range(len(test_labels))])
+    baseline['precision'] = precision_score(test_labels, [1 for _ in range(len(test_labels))])
+    baseline['roc'] = 0.5
+
+    results = {}
+    results['recall'] = recall_score(test_labels, predictions)
+    results['precision'] = precision_score(test_labels, predictions)
+    results['roc'] = roc_auc_score(test_labels, probs)
+
+    train_results = {}
+    if smote:
+        train_results['recall'] = recall_score(train_labels_res, train_predictions)
+        train_results['precision'] = precision_score(train_labels_res, train_predictions)
+        train_results['roc'] = roc_auc_score(train_labels_res, train_probs)
+    else:
+        train_results['recall'] = recall_score(train_labels, train_predictions)
+        train_results['precision'] = precision_score(train_labels, train_predictions)
+        train_results['roc'] = roc_auc_score(train_labels, train_probs)
+
+    for metric in ['recall', 'precision', 'roc']:
+        print(f'{metric.capitalize()} Baseline: {round(baseline[metric], 2)} Test: {round(results[metric], 2)} Train: {round(train_results[metric], 2)}')
+
+    # Calculate false positive rates and true positive rates
+    base_fpr, base_tpr, _ = roc_curve(test_labels, [1 for _ in range(len(test_labels))])
+    model_fpr, model_tpr, _ = roc_curve(test_labels, probs)
+
+    plt.figure(figsize=(8, 6))
+    plt.rcParams['font.size'] = 16
+
+    # Plot both curves
+    plt.plot(base_fpr, base_tpr, 'b', label='baseline')
+    plt.plot(model_fpr, model_tpr, 'r', label='model')
+    plt.legend()
+    plt.xlabel('False Positive Rate'); plt.ylabel('True Positive Rate'); plt.title('ROC Curves')
+
+# In[]:
+
+print('\n _______________________ Confusion matrix _______________________')
+
+
+def plot_confusion_matrix(cm, classes,
+                          normalize=False,
+                          title='Confusion matrix',
+                          cmap=plt.cm.Oranges):
+    """
+    This function prints and plots the confusion matrix.
+    Normalization can be applied by setting `normalize=True`.
+    Source: http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
+    """
+    if normalize:
+        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
+        print("Normalized confusion matrix")
+    else:
+        print('Confusion matrix, without normalization')
+
+    print(cm)
+
+    plt.figure(figsize=(10, 10))
+    plt.imshow(cm, interpolation='nearest', cmap=cmap)
+    plt.title(title, size=24)
+    plt.colorbar(aspect=4)
+    tick_marks = np.arange(len(classes))
+    plt.xticks(tick_marks, classes, rotation=45, size=14)
+    plt.yticks(tick_marks, classes, size=14)
+
+    fmt = '.2f' if normalize else 'd'
+    thresh = cm.max() / 2.
+
+    # Labeling the plot
+    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
+        plt.text(j, i, format(cm[i, j], fmt), fontsize=20,
+                 horizontalalignment="center",
+                 color="white" if cm[i, j] > thresh else "black")
+
+    plt.grid(None)
+    plt.tight_layout()
+    plt.ylabel('True label', size=18)
+    plt.xlabel('Predicted label', size=18)
+# In[]:
+print('\n _______________________ Random Forest + SMOTE _______________________')
+
+
+sm = SMOTE(random_state=42)
+train_res, train_labels_res = sm.fit_resample(train, train_labels)
+print('Resampled dataset shape %s' % Counter(train_labels_res))
+
+# In[]:
+print('\n _______________________ Training 100 Trees _______________________')
+
+# Create the model with 100 trees
+model = RandomForestClassifier(n_estimators=100,
+                               random_state=RSEED,
+                               max_features='sqrt',
+                               n_jobs=-1, verbose=1)
+
+# Fit on training data
+model.fit(train_res, train_labels_res)
+
+# In[]:
+n_nodes = []
+max_depths = []
+
+for ind_tree in model.estimators_:
+    n_nodes.append(ind_tree.tree_.node_count)
+    max_depths.append(ind_tree.tree_.max_depth)
+
+print(f'Average number of nodes {int(np.mean(n_nodes))}')
+print(f'Average maximum depth {int(np.mean(max_depths))}')
+
+# In[]:
+print('\n _______________________ Assess Random Forest Performance _______________________')
+train_rf_predictions = model.predict(train_res)
+train_rf_probs = model.predict_proba(train_res)[:, 1]
+
+rf_predictions = model.predict(test)
+rf_probs = model.predict_proba(test)[:, 1]
+
+evaluate_model(rf_predictions, rf_probs, train_rf_predictions, train_rf_probs, smote=True)
+plt.savefig('ROC_curve_rf_smote.png')
+plt.close()
+
+print('\n _______________________ Confusion matrix _______________________')
+cm = confusion_matrix(test_labels, rf_predictions)
+plot_confusion_matrix(cm, classes=['Individual', 'Organisation'],
+                      title='User Confusion Matrix')
+plt.savefig('Confusion_matrix_rf_smote.png')
+plt.close()
+
+print(f"f1score = {f1_score(test_labels, rf_predictions, average='binary')}")  # labels binary
+print(f1_score(test_labels, rf_predictions, average='micro'))  # global metric
+print(f'TPR = {cm[1][1]/(sum(cm[1])):.2f} (Predicting correctly a user is organisation)')
+print(f'TNR = {cm[0][0]/(sum(cm[0])):.2f} (Predicting correctly a user is individual)')
+# In[]:
+print('\n _______________________ Feature Importances _______________________')
+fi_model = pd.DataFrame({'feature': features,
+                         'importance': model.feature_importances_}).\
+    sort_values('importance', ascending=False)
+print(fi_model.head(10))
+
+def plot_feature_importances(fi_model):
+    # Reset style
+    plt.style.use('default')
+
+    # list of x locations for plotting
+    importances = fi_model.importance
+    x_values = list(range(len(importances)))
+
+    # Make a bar chart
+    plt.bar(x_values, importances, orientation='vertical', color='r', edgecolor='k', linewidth=1.2)
+
+    # Tick labels for x axis
+    plt.xticks(x_values, fi_model.feature, rotation='vertical')
+
+    # Axis labels and title
+    plt.ylabel('Importance'); plt.xlabel('Variable'); plt.title('Variable Importances')
+
+    # New figure for the cumulative plot (otherwise it draws over the bar chart)
+    plt.figure()
+
+    # List of features sorted from most to least important
+    sorted_importances = fi_model.importance
+    sorted_features = fi_model.feature
+
+    # Cumulative importances
+    cumulative_importances = np.cumsum(sorted_importances)
+
+    # Make a line graph
+    plt.plot(x_values, cumulative_importances, 'g-')
+
+    # Draw line at 95% of importance retained
+    plt.hlines(y=0.95, xmin=0, xmax=len(sorted_importances), color='r', linestyles='dashed')
+
+    # Format x ticks and labels
+    plt.xticks(x_values, sorted_features, rotation='vertical')
+
+    # Axis labels and title
+    plt.xlabel('Variable'); plt.ylabel('Cumulative Importance'); plt.title('Cumulative Importances')
+    plt.tight_layout()
+    plt.grid(True)
+
+plot_feature_importances(fi_model)
+# In[]:
+print('\n _______________________ Testing on another dataset _______________________')
+df1 = pd.read_csv('../Datasets/user_classification/ind_vs_bot/dataset1/user_features_labels_noNan_dropped_fav_retweet_cols.csv', index_col=0)
+labels1 = np.array(df1.pop('labels'))
+rf_predictions1 = model.predict(df1)
+cm = confusion_matrix(labels1, rf_predictions1)
+plot_confusion_matrix(cm, classes=['Individual', 'Organisation'],
+                      title='User Confusion Matrix')
+plt.savefig('Confusion_matrix_rf1_smote.png')
+plt.close()
+
+# In[]:
+print('\n _______________________ Pickle the model _______________________')
+
+rf_org_classifier_pkl = open('rf_smote_org_classifier.pkl', 'wb')
+pickle.dump(model, rf_org_classifier_pkl)
+rf_org_classifier_pkl.close()
+print('Pickling complete.')
+
+# In[]:
+print('\n _______________________ Unpickle the model _______________________')
+# Note: this loads the ind_vs_bot classifier, not the organisation model saved above.
+model_pkl = open('rf_smote_bot_classifier.pkl', 'rb')
+model = pickle.load(model_pkl)
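Caveat on the SMOTE usage above: resampling is done once on the training split, and `evaluate_model(..., smote=True)` reports train metrics on the resampled data rather than the original split. If cross-validated scores were wanted, imblearn's Pipeline keeps the resampling inside each fold; a sketch under those assumptions (not what the script does):

    from imblearn.pipeline import Pipeline
    from imblearn.over_sampling import SMOTE
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import cross_val_score

    pipe = Pipeline([('smote', SMOTE(random_state=42)),
                     ('rf', RandomForestClassifier(n_estimators=100, random_state=50))])
    # SMOTE is re-fit on the training portion of every fold; test folds stay untouched
    print(cross_val_score(pipe, train, train_labels, scoring='roc_auc', cv=5).mean())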
diff --git a/ScrapeTwitterTimeline_FeatureExtraction.py b/classifier/ScrapeTwitterTimeline_FeatureExtraction.py
similarity index 86%
rename from ScrapeTwitterTimeline_FeatureExtraction.py
rename to classifier/ScrapeTwitterTimeline_FeatureExtraction.py
index 83cffd7..bf2c7d3 100644
--- a/ScrapeTwitterTimeline_FeatureExtraction.py
+++ b/classifier/ScrapeTwitterTimeline_FeatureExtraction.py
@@ -17,8 +17,8 @@ def scrape_user_timeline(user, N):
 
     Parameters
     ----------
-    user : string
-        Twitter username.
+    user : string or int
+        Twitter screen_name or Twitter user_id.
     N : int
         Number of most recent posts of each user.
@@ -32,10 +32,13 @@ def scrape_user_timeline(user, N):
 
     # twitter api endpoint
     url = 'https://api.twitter.com/1.1/statuses/user_timeline.json'
-
-    params = dict(
-        screen_name=user,
-        count=N)
+
+    if isinstance(user, str):
+        params = dict(screen_name=user,
+                      count=N)
+    elif isinstance(user, int):
+        params = dict(user_id=user,
+                      count=N)
 
     with open('config.yaml', 'r') as ymlfile:
         cfg = yaml.load(ymlfile, Loader=yaml.FullLoader)
@@ -154,7 +157,7 @@ def fav(data, P):
 
         if (isinstance(data[i]['in_reply_to_status_id'], int)):  # a reply
             fav_count_replies.append(data[i]['favorite_count'])
-        elif ('retweeted_status' in data[i].keys() or data[i]['is_quote_status']):  # a retweet
+        elif ('retweeted_status' in data[i].keys()):  # or data[i]['is_quote_status']): # a retweet (quote check disabled)
             fav_count_retweets.append(data[i]['favorite_count'])
         else:
             # an original tweet
@@ -195,7 +198,7 @@ def ret(data, P):
 
         if (isinstance(data[i]['in_reply_to_status_id'], int)):  # a reply
             ret_count_replies.append(data[i]['retweet_count'])
-        elif ('retweeted_status' in data[i].keys() or data[i]['is_quote_status']):  # post is a retweet
+        elif ('retweeted_status' in data[i].keys()):  # or data[i]['is_quote_status']): # post is a retweet
             ret_count_retweets.append(data[i]['retweet_count'])
         else:
             # post is an original tweet
@@ -345,7 +348,7 @@ def main(users, N):
 
             counter += 1
             bar.update(counter)
-            print(f'\nScraping: {user}')
+            # print(f'\nScraping: {user}')
             data = scrape_user_timeline(user, N)
             if len(data) == 0:
                 # print('No posts found.')
@@ -412,41 +415,45 @@ def main(users, N):
                           Tavg]
 
     df = pd.DataFrame.from_dict(users_data_dict, orient='index')
-    df.columns = ['nFollowers',
-                  'nFollowings',
-                  'FollowersToFollowing',
-                  'nLists',
-                  'nFavs',
-                  'nPosts',
-                  'geo',
-                  'location',
-                  'url',
-                  'description',
-                  'verified',
-                  'fav_tweets',
-                  'fav_retweets',
-                  'fav_replies',
-                  'ret_tweets',
-                  'ret_retweets',
-                  'ret_replies',
-                  'pop_fav_tweets',
-                  'pop_fav_retweets',
-                  'pop_fav_replies',
-                  'pop_ret_tweets',
-                  'pop_ret_retweets',
-                  'pop_ret_replies',
-                  'nPostMention',
-                  'nPostQuote',
-                  'nPostPlace',
-                  'Tavg']
-
-    df.to_csv('users_features.csv')
+
+    try:
+        df.columns = ['nFollowers',
+                      'nFollowings',
+                      'FollowersToFollowing',
+                      'nLists',
+                      'nFavs',
+                      'nPosts',
+                      'geo',
+                      'location',
+                      'url',
+                      'description',
+                      'verified',
+                      'fav_tweets',
+                      'fav_retweets',
+                      'fav_replies',
+                      'ret_tweets',
+                      'ret_retweets',
+                      'ret_replies',
+                      'pop_fav_tweets',
+                      'pop_fav_retweets',
+                      'pop_fav_replies',
+                      'pop_ret_tweets',
+                      'pop_ret_retweets',
+                      'pop_ret_replies',
+                      'nPostMention',
+                      'nPostQuote',
+                      'nPostPlace',
+                      'Tavg']
+    except ValueError:
+        df.columns = []
+    df.index.name = 'username'
+    df.to_csv('user_features.csv')
     return df
 
 # In[]:
 if __name__ == '__main__':
-    # with open('users.csv', "r") as f:
-    #     users = f.readlines()
+    from get_usernames import get_usernames
     N = 200  # number of posts to scrape from user timeline
-    users = ['Miss_Asabe','mkayla_bayla','naijama']
-    df = main(users, N)
\ No newline at end of file
+    users = get_usernames('../Datasets/user_classification/ind_vs_bot/brexitday/brexitday.csv')
+    #users = [461277906]
+    #df = main(users, N)
\ No newline at end of file
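Note on the `isinstance` branching introduced above: if `user` is neither `str` nor `int` (e.g. a numpy integer coming out of `pd.read_csv`), `params` is never bound and the request fails with a NameError. A defensive variant, sketched (not in the patch):

    import numpy as np

    if isinstance(user, str):
        params = dict(screen_name=user, count=N)
    elif isinstance(user, (int, np.integer)):  # np.integer covers ids read via pandas
        params = dict(user_id=user, count=N)
    else:
        raise TypeError(f'user must be a screen name (str) or user id (int), got {type(user)}')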
diff --git a/classifier/classify_new_bots_orgs.py b/classifier/classify_new_bots_orgs.py
new file mode 100644
index 0000000..c1f72bd
--- /dev/null
+++ b/classifier/classify_new_bots_orgs.py
@@ -0,0 +1,32 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Feb 18 21:32:26 2020
+
+@author: I Kit Cheng
+"""
+from cleaning_feature_df import main_cleaning
+import pickle
+import pandas as pd
+
+def load_trained_model(model_pkl_file):
+    model_pkl = open(model_pkl_file, 'rb')
+    model = pickle.load(model_pkl)
+    return model
+
+if __name__ == '__main__':
+    csv_file = '../../Datasets/user_classification/ind_vs_bot/brexitday/user_features.csv'
+    df = main_cleaning(csv_file)
+    model_org = load_trained_model('rf_smote_org_classifier.pkl')
+    model_bot = load_trained_model('rf_smote_bot_classifier.pkl')
+    pred_org = model_org.predict(df)
+    pred_bot = model_bot.predict(df)
+    classifications = {'labels_bot': pred_bot, 'labels_org': pred_org}
+    df_classify = pd.DataFrame(classifications, columns=['labels_bot', 'labels_org'])
+    df_classify.index = df.index
+    df_classify.to_csv('bot_org_labels.csv')
+
+    # show accounts that are classified as both bot and organisation
+    bot = df_classify[df_classify.labels_bot == 1]
+    bot_and_org = bot[bot.labels_org == 1]
+    print(f'Number of bots and orgs: {len(bot_and_org)}')
\ No newline at end of file
diff --git a/classifier/cleaning_feature_df.py b/classifier/cleaning_feature_df.py
new file mode 100644
index 0000000..c514090
--- /dev/null
+++ b/classifier/cleaning_feature_df.py
@@ -0,0 +1,73 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Feb 18 20:59:17 2020
+
+@author: I Kit Cheng
+"""
+from sklearn.impute import SimpleImputer
+import pandas as pd
+import numpy as np
+
+# Replace numerical nans with median (the median is less sensitive to outliers)
+def replaceNans(df, strategy='median'):
+    """
+
+    Parameters
+    ----------
+    df : pandas.core.frame.DataFrame
+        A dataframe (rows are examples and columns are features).
+    strategy : string, optional
+        Replace nans with specified strategy. The default is 'median'.
+        Options are 'mean', 'median', 'most_frequent', 'constant'.
+
+    Returns
+    -------
+    df : pandas.core.frame.DataFrame
+        A dataframe without numerical nans.
+
+    """
+    print(f'Replacing Nans with {strategy}.')
+    imputer = SimpleImputer(missing_values=np.nan, strategy=strategy)
+    for i, col in enumerate(df.columns):
+        if len(df[col].unique()) == 2:  # categorical (binary) data
+            continue
+        else:
+            df[[col]] = imputer.fit_transform(df[[col]])
+    return df
+
+def bool2int(df, columns):
+    """
+
+    Parameters
+    ----------
+    df : pandas.core.frame.DataFrame
+        Dataframe with boolean columns.
+    columns : list
+        Column names with boolean data.
+
+    Returns
+    -------
+    df : pandas.core.frame.DataFrame
+        Dataframe without boolean data (converted to binary 0 or 1).
+
+    """
+    print('\nChanging boolean data to 0 or 1.')
+    for col in columns:
+        df[col] = df[col].astype(int)
+    return df
+
+def main_cleaning(csv_file):
+    df = pd.read_csv(csv_file, index_col=0)
+    df.index.name = 'username'
+    df = bool2int(df, ['geo', 'location', 'url', 'description', 'verified'])
+    df = replaceNans(df)
+    df.to_csv('user_features_noNan.csv')
+    print('___________________Cleaning Complete!_________________')
+    return df
+
+if __name__ == '__main__':
+    csv_file = '../Datasets/user_classification/ind_vs_bot/brexitday/user_features.csv'
+    df = main_cleaning(csv_file)
diff --git a/classifier/get_usernames.py b/classifier/get_usernames.py
new file mode 100644
index 0000000..291ae86
--- /dev/null
+++ b/classifier/get_usernames.py
@@ -0,0 +1,31 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Feb 18 15:45:03 2020
+
+@author: I Kit Cheng
+"""
+# Getting user names from 'All_Headers' csv files
+
+import pandas as pd
+
+def get_usernames(csv_file):
+    """
+
+    Parameters
+    ----------
+    csv_file : str
+        csv filename.
+
+    Returns
+    -------
+    users : list
+        List of usernames.
+
+    """
+    df = pd.read_csv(csv_file, header=None)
+    users = df[7].to_list()  # column 7 of the header csv holds the username
+    return users
+
+if __name__ == '__main__':
+    csv_file = 'brexitday.csv'
+    users = get_usernames(csv_file)
\ No newline at end of file
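Taken together, the new modules give an end-to-end labelling flow for a fresh scrape. Roughly, using the filenames the scripts themselves read and write (sketch only; `main` is commented out in the scraper's `__main__` and must actually be run for user_features.csv to exist):

    from get_usernames import get_usernames
    from ScrapeTwitterTimeline_FeatureExtraction import main
    from cleaning_feature_df import main_cleaning
    from classify_new_bots_orgs import load_trained_model

    users = get_usernames('brexitday.csv')       # usernames from the scrape headers
    main(users, N=200)                           # writes user_features.csv
    df = main_cleaning('user_features.csv')      # bool -> int, impute NaNs
    model = load_trained_model('rf_smote_bot_classifier.pkl')
    print(model.predict(df))                     # 1 = bot, 0 = individual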
+ + """ + df = pd.read_csv(csv_file, header=None) + users = df[7].to_list() + return users + +if __name__ == '__main__': + csv_file = 'brexitday.csv' + users = get_usernames(csv_file) \ No newline at end of file diff --git a/classifier/parse_json.py b/classifier/parse_json.py new file mode 100644 index 0000000..4145a7f --- /dev/null +++ b/classifier/parse_json.py @@ -0,0 +1,37 @@ +# -*- coding: utf-8 -*- +""" +Created on Wed Feb 12 17:05:55 2020 + +@author: I Kit Cheng +""" + +import json + +def parse_json(filename): + """ + + Parameters + ---------- + filename : string + Full path to the json file. + + Returns + ------- + data : dict or list + Returns a dictionary or a list of dictionaries. + + """ + + with open(filename, 'r') as f: + data = json.load(f) + return data + +# In[]: +if __name__ == "__main__": + filename = '../Datasets/user_classification/ind_vs_org/organization_training_unbalanced_lower.json' + data = parse_json(filename) + + usernames_labels = [] + for i, uid in enumerate(data['users'].keys()): + user_data = data['users'][uid] + usernames_labels.append([user_data['username'],user_data['label']]) diff --git a/classifier/rf_smote_bot_classifier.pkl b/classifier/rf_smote_bot_classifier.pkl new file mode 100644 index 0000000..dce087e Binary files /dev/null and b/classifier/rf_smote_bot_classifier.pkl differ diff --git a/classifier/rf_smote_org_classifier.pkl b/classifier/rf_smote_org_classifier.pkl new file mode 100644 index 0000000..6e384c1 Binary files /dev/null and b/classifier/rf_smote_org_classifier.pkl differ