diff --git a/.gitignore b/.gitignore
index 4fe1c3f..3c3aa94 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
 .DS_Store
 config.yaml
+figures/
\ No newline at end of file
diff --git a/README.md b/README.md
index b7df1b0..6d9f611 100644
--- a/README.md
+++ b/README.md
@@ -6,7 +6,7 @@ NLP tools to analyse twitter for different user groups and perform topic modelli
 ## JH:
 - Data preprocessing:
   - cleaning hashtag lists
-  - write script to format csv's that Nisha has generated into format that the preprocessing code can use for wordcloud generation
+  - ~~write script to format csv's that Nisha has generated into format that the preprocessing code can use for wordcloud generation~~
   - find a list of English words to use for tweet filtering
 - Data analysis:
   - wordcloud related stuff for hashtags and tweets --> specifically do a wordcloud of the text of the tweets
diff --git a/ScrapingTwitterRealTime/datasets/All_Headers/brexitday/scrape_data_2020-01-30-2020-01-31.tar.gz b/ScrapingTwitterRealTime/datasets/All_Headers/brexitday/scrape_data_2020-01-30-2020-01-31.tar.gz
new file mode 100644
index 0000000..5f7092e
Binary files /dev/null and b/ScrapingTwitterRealTime/datasets/All_Headers/brexitday/scrape_data_2020-01-30-2020-01-31.tar.gz differ
diff --git a/ScrapingTwitterRealTime/datasets/All_Headers/brexitday/scrape_data_2020-01-31-2020-02-01.tar.gz b/ScrapingTwitterRealTime/datasets/All_Headers/brexitday/scrape_data_2020-01-31-2020-02-01.tar.gz
new file mode 100644
index 0000000..af3a067
Binary files /dev/null and b/ScrapingTwitterRealTime/datasets/All_Headers/brexitday/scrape_data_2020-01-31-2020-02-01.tar.gz differ
diff --git a/ScrapingTwitterRealTime/datasets/All_Headers/brexitday/scrape_data_2020-02-01-2020-02-02.tar.gz b/ScrapingTwitterRealTime/datasets/All_Headers/brexitday/scrape_data_2020-02-01-2020-02-02.tar.gz
new file mode 100644
index 0000000..b65020a
Binary files /dev/null and b/ScrapingTwitterRealTime/datasets/All_Headers/brexitday/scrape_data_2020-02-01-2020-02-02.tar.gz differ
diff --git a/ScrapingTwitterRealTime/datasets/All_Headers/brexitday/scrape_data_2020-02-02-2020-02-03.tar.gz b/ScrapingTwitterRealTime/datasets/All_Headers/brexitday/scrape_data_2020-02-02-2020-02-03.tar.gz
new file mode 100644
index 0000000..f9a4251
Binary files /dev/null and b/ScrapingTwitterRealTime/datasets/All_Headers/brexitday/scrape_data_2020-02-02-2020-02-03.tar.gz differ
diff --git a/ScrapingTwitterRealTime/datasets/All_Headers/brexitday/scrape_data_2020-02-03-2020-02-04.tar.gz b/ScrapingTwitterRealTime/datasets/All_Headers/brexitday/scrape_data_2020-02-03-2020-02-04.tar.gz
new file mode 100644
index 0000000..701c38b
Binary files /dev/null and b/ScrapingTwitterRealTime/datasets/All_Headers/brexitday/scrape_data_2020-02-03-2020-02-04.tar.gz differ
diff --git a/ScrapingTwitterRealTime/datasets/All_Headers/brexitday/scrape_data_2020-02-04-2020-02-05.tar.gz b/ScrapingTwitterRealTime/datasets/All_Headers/brexitday/scrape_data_2020-02-04-2020-02-05.tar.gz
new file mode 100644
index 0000000..a33d683
Binary files /dev/null and b/ScrapingTwitterRealTime/datasets/All_Headers/brexitday/scrape_data_2020-02-04-2020-02-05.tar.gz differ
diff --git a/classifier/Generate_training_data_for_user_classification.py b/classifier/Generate_training_data_for_user_classification.py
new file mode 100644
index 0000000..27a4de6
--- /dev/null
+++ b/classifier/Generate_training_data_for_user_classification.py
@@ -0,0 +1,196 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Feb 10 01:46:24 2020
+
+@author: I Kit Cheng
+"""
+
+# In[]:
+
+# Generate features from training data
+from ScrapeTwitterTimeline_FeatureExtraction import main
+import pandas as pd
+import numpy as np
+import seaborn as sns
+import matplotlib.pyplot as plt
+
+
+def relabel_dataset(df):
+    """
+
+    Parameters
+    ----------
+    df : pandas.core.frame.DataFrame
+        Dataframe.
+
+    Returns
+    -------
+    df_labels : pandas.core.frame.DataFrame
+        Extracted labels.
+
+    """
+
+    # Label Distribution
+    print('\nLabel Distribution:')
+    print(df.gender.value_counts())
+
+    # Drop rows with gender = nan
+    df = df.dropna(subset=['gender'])
+
+    # Remove individuals with unknown label
+    df = df[df.gender != 'unknown']
+
+    print(f'\nClean df length: {len(df)}')
+
+    # Label Distribution (clean)
+    print('\nLabel Distribution:')
+    print(df.gender.value_counts())
+
+    # Combine 'male' and 'female' labels to 0, and relabel 'brand' to 1
+    df_labels = pd.DataFrame([0 if (x == 'female' or x == 'male')
+                              else 1 for x in df.gender], columns=['labels'],
+                             index=df.name)
+
+    return df_labels
+
+# In[]:
+
+
+def bool2int(df, columns):
+    """
+
+    Parameters
+    ----------
+    df : pandas.core.frame.DataFrame
+        Dataframe with boolean columns.
+    columns : list
+        Column names with boolean data.
+
+    Returns
+    -------
+    df : pandas.core.frame.DataFrame
+        Dataframe without boolean data (converted to binary 0 or 1).
+
+    """
+    print('\nChanging boolean data to 0 or 1.')
+    for col in columns:
+        df[col] = df[col].astype(int)
+    return df
+
+
+def matching_labels_to_new_features(df):
+    """
+
+    Parameters
+    ----------
+    df : pandas.core.frame.DataFrame
+        Dataframe without labels.
+
+    Returns
+    -------
+    df : pandas.core.frame.DataFrame
+        Dataframe with labels.
+
+    """
+    print('Matching labels to new features dataframe.')
+    # Adding the corresponding label to the feature dataset
+    # (relies on the module-level df_labels created in __main__)
+    labels_for_sample = []
+    for i, v in enumerate(df.index.to_list()):
+        if len(df_labels.loc[v]) > 1:  # duplicate usernames: take the first label
+            labels_for_sample.append(df_labels.loc[v].iloc[0][0])
+        else:
+            labels_for_sample.append(df_labels.loc[v].iloc[0])
+
+    df.index.names = ['username']  # name the index column
+    df['labels'] = labels_for_sample
+    df.to_csv('user_features_labels.csv')
+    return df
+
+
+# In[]:
+
+########################### plot distribution of each variable ###########################
+def plotDist(save=False):
+    """
+
+    Parameters
+    ----------
+    save : bool, optional
+        Save plot option. The default is False.
+
+    Returns
+    -------
+    None.
+
+    """
+    for i, col in enumerate(df.columns[1:]):
+        print(col)
+        plt.figure()
+        try:
+            ax = sns.kdeplot(df[col])
+            ax.get_legend().remove()
+        except RuntimeError:
+            df[col].hist()
+        plt.title(col)
+        if save:  # save before closing, otherwise an empty figure is written
+            plt.savefig('dist_' + col + '.png')
+        plt.close()
+
+# plotDist()
+# In[]:
+########################### Dealing with missing data ###########################
+
+from sklearn.impute import SimpleImputer
+
+# Replace numerical nans with median (the median is less sensitive to outliers)
+def replaceNans(df, strategy='median'):
+    """
+
+    Parameters
+    ----------
+    df : pandas.core.frame.DataFrame
+        A dataframe (rows are examples and columns are features).
+    strategy : string, optional
+        Replace nans with specified strategy. The default is 'median'.
+        Options are 'mean', 'median', 'most_frequent', 'constant'.
+
+    Returns
+    -------
+    df : pandas.core.frame.DataFrame
+        A dataframe without numerical nans.
+
+    """
+    print(f'Replacing Nans with {strategy}.')
+    imputer = SimpleImputer(missing_values=np.nan, strategy=strategy)
+    for i, col in enumerate(df.columns[1:-1]):
+        if len(df[col].unique()) == 2:  # categorical (binary) data
+            continue
+        else:
+            df[[col]] = imputer.fit_transform(df[[col]])
+    return df
+
+# In[]:
+if __name__ == '__main__':
+    # Set random seed to ensure reproducible runs
+    RSEED = 50
+
+    print('\n################# Begin Scraping User Timeline: #####################')
+
+    # Optionally subsample the data to speed up training (sampling disabled here).
+    df = pd.read_csv('../Datasets/gender-classifier.csv', encoding="ISO-8859-1")  # .sample(10, random_state=RSEED)
+    users = df.name.to_list()
+    df_labels = relabel_dataset(df)
+
+    scrape = False
+    if scrape:
+        df = main(users, N=200)  # saves features in user_features.csv
+
+    df = pd.read_csv('user_features.csv', index_col=0)
+
+    df = bool2int(df, ['geo', 'location', 'url', 'description', 'verified'])
+    matching_labels_to_new_features(df)
+
+    df = pd.read_csv('user_features_labels.csv', index_col=0)  # training data (unclean)
+    df.index.name = 'username'
+    df = replaceNans(df)
+    df.to_csv('user_features_labels_noNan.csv')
+    print('___________________Done cleaning!_________________')
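Note on the cleaning helpers above: `bool2int` plus `replaceNans` is the whole numeric-cleaning story, and the median imputation can be sanity-checked on a toy frame. A minimal sketch (the toy values and column names are illustrative only, not project data):

    import numpy as np
    import pandas as pd
    from sklearn.impute import SimpleImputer

    toy = pd.DataFrame({'nFollowers': [10, np.nan, 300], 'verified': [True, False, True]})
    toy['verified'] = toy['verified'].astype(int)          # what bool2int does per column
    imputer = SimpleImputer(missing_values=np.nan, strategy='median')
    toy[['nFollowers']] = imputer.fit_transform(toy[['nFollowers']])
    print(toy)  # the NaN becomes 155.0, the median of 10 and 300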
+ + """ + print(f'Replacing Nans with {strategy}.') + imputer = SimpleImputer(missing_values=np.nan, strategy=strategy) + for i, col in enumerate(df.columns[1:-1]): + if len(df[col].unique()) == 2: # categorical (binary) data + continue + else: + df[[col]] = imputer.fit_transform(df[[col]]) + return df + +# In[]: +if __name__ == '__main__': + # Set random seed to ensure reproducible runs + RSEED = 50 + + print('\n################# Begin Scraping User Timeline: #####################') + + # We'll limit the data to 1000 individuals to speed up training. + df = pd.read_csv('../Datasets/gender-classifier.csv', encoding = "ISO-8859-1")#.sample(10, random_state = RSEED) + users = df.name.to_list() + df_labels = relabel_dataset(df) + + scrape = False + if scrape: + df = main(users, N=200) # saves features in users_features.csv + + df = pd.read_csv('users_features.csv', index_col=0) + + df = bool2int(df, ['geo', 'location', 'url', 'description', 'verified']) + matching_labels_to_new_features(df) + + df = pd.read_csv('user_features_labels.csv', index_col=0) # training data (unclean) + df.index.name = 'username' + df = replaceNans(df) + df.to_csv('user_features_labels_noNan.csv') + print('___________________Done cleaning!_________________') diff --git a/classifier/Random_Forest_Classifier.py b/classifier/Random_Forest_Classifier.py new file mode 100644 index 0000000..8ff9294 --- /dev/null +++ b/classifier/Random_Forest_Classifier.py @@ -0,0 +1,204 @@ +# -*- coding: utf-8 -*- +""" +Created on Tue Feb 11 10:19:29 2020 + +@author: I Kit Cheng +""" + +# In[]: + +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt +from sklearn.model_selection import train_test_split +from sklearn.metrics import confusion_matrix +from sklearn.tree import DecisionTreeClassifier +from sklearn.ensemble import RandomForestClassifier +from sklearn.metrics import precision_score, recall_score, roc_auc_score, roc_curve +from sklearn.model_selection import RandomizedSearchCV +from sklearn.metrics import f1_score +import itertools +from collections import Counter +from imblearn.over_sampling import SMOTE + +# In[]: + +# Set random seed to ensure reproducible runs +RSEED = 50 + +# Dataset +df = pd.read_csv('../Datasets/user_classification/ind_vs_bot/dataset3/user_features_labels_noNan_dropped_fav_retweet_cols_combined_d1d2.csv', index_col=0) + + + +print('\n _______________________ Split Data into Training and Testing Set __________________________') +# Extract the labels +labels = np.array(df.pop('labels')) + +# 30% examples in test data +train, test, train_labels, test_labels = train_test_split(df, labels, + stratify = labels, + test_size = 0.3, + random_state = RSEED) + +# In[]: +# Features for feature importances +features = list(train.columns) + +print(f'Train data shape: {train.shape}') +print(f'Test data shape: {test.shape}') + +# In[]: + +print('\n _______________________ Evaluate the Decision Tree _______________________') +def evaluate_model(predictions, probs, train_predictions, train_probs, smote=False): + """Compare machine learning model to baseline performance. 
+
+# In[]:
+
+print('\n _______________________ Confusion matrix _______________________')
+
+
+def plot_confusion_matrix(cm, classes,
+                          normalize=False,
+                          title='Confusion matrix',
+                          cmap=plt.cm.Oranges):
+    """
+    This function prints and plots the confusion matrix.
+    Normalization can be applied by setting `normalize=True`.
+    Source: http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
+    """
+    if normalize:
+        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
+        print("Normalized confusion matrix")
+    else:
+        print('Confusion matrix, without normalization')
+
+    print(cm)
+
+    plt.figure(figsize=(10, 10))
+    plt.imshow(cm, interpolation='nearest', cmap=cmap)
+    plt.title(title, size=24)
+    plt.colorbar(aspect=4)
+    tick_marks = np.arange(len(classes))
+    plt.xticks(tick_marks, classes, rotation=45, size=14)
+    plt.yticks(tick_marks, classes, size=14)
+
+    fmt = '.2f' if normalize else 'd'
+    thresh = cm.max() / 2.
+
+    # Labeling the plot
+    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
+        plt.text(j, i, format(cm[i, j], fmt), fontsize=20,
+                 horizontalalignment="center",
+                 color="white" if cm[i, j] > thresh else "black")
+
+    plt.grid(None)
+    plt.tight_layout()
+    plt.ylabel('True label', size=18)
+    plt.xlabel('Predicted label', size=18)
+
+# In[]:
+print('\n######################## Random Forest ########################')
+
+print('\n _______________________ Training 100 Trees _______________________')
+
+# Create the model with 100 trees
+model = RandomForestClassifier(n_estimators=100,
+                               random_state=RSEED,
+                               max_features='sqrt',
+                               n_jobs=-1, verbose=1)
+
+# Fit on training data
+model.fit(train, train_labels)
+
+# In[]:
+n_nodes = []
+max_depths = []
+
+for ind_tree in model.estimators_:
+    n_nodes.append(ind_tree.tree_.node_count)
+    max_depths.append(ind_tree.tree_.max_depth)
+
+print(f'Average number of nodes {int(np.mean(n_nodes))}')
+print(f'Average maximum depth {int(np.mean(max_depths))}')
+
+# In[]:
+print('\n _______________________ Assess Random Forest Performance _______________________')
+train_rf_predictions = model.predict(train)
+train_rf_probs = model.predict_proba(train)[:, 1]
+
+rf_predictions = model.predict(test)
+rf_probs = model.predict_proba(test)[:, 1]
+
+evaluate_model(rf_predictions, rf_probs, train_rf_predictions, train_rf_probs)
+plt.savefig('ROC_curve_rf.png')
+plt.close()
+
+print('\n _______________________ Confusion matrix _______________________')
+cm = confusion_matrix(test_labels, rf_predictions)
+plot_confusion_matrix(cm, classes=['Individual', 'Bot'],
+                      title='User Confusion Matrix')
+plt.savefig('Confusion_matrix_rf.png')
+plt.close()
+
+print(f"f1score = {f1_score(test_labels, rf_predictions, average='binary')}")  # labels binary
+print(f1_score(test_labels, rf_predictions, average='micro'))  # global metric
+print(f'TPR = {cm[1][1]/(sum(cm[1])):.2f} (Predicting correctly a user is bot)')
+print(f'TNR = {cm[0][0]/(sum(cm[0])):.2f} (Predicting correctly a user is individual)')
+# In[]:
+print('\n _______________________ Feature Importances _______________________')
+fi_model = pd.DataFrame({'feature': features,
+                         'importance': model.feature_importances_}).\
+    sort_values('importance', ascending=False)
+print(fi_model.head(10))
+
+# In[]:
+print('\n _______________________ Testing on another dataset _______________________')
+df1 = pd.read_csv('../Datasets/user_classification/ind_vs_bot/dataset1/user_features_labels_noNan_dropped_fav_retweet_cols.csv', index_col=0)
+labels1 = np.array(df1.pop('labels'))
+rf_predictions1 = model.predict(df1)
+cm = confusion_matrix(labels1, rf_predictions1)
+plot_confusion_matrix(cm, classes=['Individual', 'Bot'],
+                      title='User Confusion Matrix')
+plt.savefig('Confusion_matrix_rf1.png')
+plt.close()
\ No newline at end of file
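Note: `RandomizedSearchCV` is imported above but never used. If tuning were wanted, a minimal sketch of how it could wrap the same forest (the search space here is an assumption, not part of the script):

    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import RandomizedSearchCV

    param_dist = {'n_estimators': [100, 200, 400],   # assumed grid
                  'max_depth': [None, 10, 20],
                  'max_features': ['sqrt', 'log2']}
    search = RandomizedSearchCV(RandomForestClassifier(random_state=RSEED),
                                param_dist, n_iter=10, scoring='roc_auc',
                                cv=3, random_state=RSEED)
    search.fit(train, train_labels)  # train/train_labels as defined above
    print(search.best_params_)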
diff --git a/classifier/Random_Forest_Classifier_SMOTE.py b/classifier/Random_Forest_Classifier_SMOTE.py
new file mode 100644
index 0000000..773ac2d
--- /dev/null
+++ b/classifier/Random_Forest_Classifier_SMOTE.py
@@ -0,0 +1,262 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Feb 11 10:19:29 2020
+
+@author: I Kit Cheng
+"""
+
+# In[]:
+
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import confusion_matrix
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.metrics import precision_score, recall_score, roc_auc_score, roc_curve
+from sklearn.model_selection import RandomizedSearchCV
+from sklearn.metrics import f1_score
+import itertools
+from collections import Counter
+from imblearn.over_sampling import SMOTE
+import pickle
+
+# In[]:
+
+# Set random seed to ensure reproducible runs
+RSEED = 50
+
+# Dataset
+df = pd.read_csv('../../Datasets/user_classification/ind_vs_org/dataset2/user_features_labels_noNan_dropped_fav_retweet_cols.csv', index_col=0)
+
+print('\n _______________________ Split Data into Training and Testing Set __________________________')
+# Extract the labels
+labels = np.array(df.pop('labels'))
+
+# 30% examples in test data
+train, test, train_labels, test_labels = train_test_split(df, labels,
+                                                          stratify=labels,
+                                                          test_size=0.3,
+                                                          random_state=RSEED)
+
+# In[]:
+# Features for feature importances
+features = list(train.columns)
+
+print(f'Train data shape: {train.shape}')
+print(f'Test data shape: {test.shape}')
+
+# In[]:
+
+print('\n _______________________ Define Model Evaluation Function _______________________')
+def evaluate_model(predictions, probs, train_predictions, train_probs, smote=False):
+    """Compare machine learning model to baseline performance.
+    Computes statistics and shows ROC curve."""
+
+    baseline = {}
+    baseline['recall'] = recall_score(test_labels, [1 for _ in range(len(test_labels))])
+    baseline['precision'] = precision_score(test_labels, [1 for _ in range(len(test_labels))])
+    baseline['roc'] = 0.5
+
+    results = {}
+    results['recall'] = recall_score(test_labels, predictions)
+    results['precision'] = precision_score(test_labels, predictions)
+    results['roc'] = roc_auc_score(test_labels, probs)
+
+    train_results = {}
+    if smote:
+        train_results['recall'] = recall_score(train_labels_res, train_predictions)
+        train_results['precision'] = precision_score(train_labels_res, train_predictions)
+        train_results['roc'] = roc_auc_score(train_labels_res, train_probs)
+    else:
+        train_results['recall'] = recall_score(train_labels, train_predictions)
+        train_results['precision'] = precision_score(train_labels, train_predictions)
+        train_results['roc'] = roc_auc_score(train_labels, train_probs)
+
+    for metric in ['recall', 'precision', 'roc']:
+        print(f'{metric.capitalize()} Baseline: {round(baseline[metric], 2)} Test: {round(results[metric], 2)} Train: {round(train_results[metric], 2)}')
+
+    # Calculate false positive rates and true positive rates
+    base_fpr, base_tpr, _ = roc_curve(test_labels, [1 for _ in range(len(test_labels))])
+    model_fpr, model_tpr, _ = roc_curve(test_labels, probs)
+
+    plt.figure(figsize=(8, 6))
+    plt.rcParams['font.size'] = 16
+
+    # Plot both curves
+    plt.plot(base_fpr, base_tpr, 'b', label='baseline')
+    plt.plot(model_fpr, model_tpr, 'r', label='model')
+    plt.legend()
+    plt.xlabel('False Positive Rate'); plt.ylabel('True Positive Rate'); plt.title('ROC Curves')
+
+# In[]:
+
+print('\n _______________________ Confusion matrix _______________________')
+
+
+def plot_confusion_matrix(cm, classes,
+                          normalize=False,
+                          title='Confusion matrix',
+                          cmap=plt.cm.Oranges):
+    """
+    This function prints and plots the confusion matrix.
+    Normalization can be applied by setting `normalize=True`.
+    Source: http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
+    """
+    if normalize:
+        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
+        print("Normalized confusion matrix")
+    else:
+        print('Confusion matrix, without normalization')
+
+    print(cm)
+
+    plt.figure(figsize=(10, 10))
+    plt.imshow(cm, interpolation='nearest', cmap=cmap)
+    plt.title(title, size=24)
+    plt.colorbar(aspect=4)
+    tick_marks = np.arange(len(classes))
+    plt.xticks(tick_marks, classes, rotation=45, size=14)
+    plt.yticks(tick_marks, classes, size=14)
+
+    fmt = '.2f' if normalize else 'd'
+    thresh = cm.max() / 2.
+
+    # Labeling the plot
+    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
+        plt.text(j, i, format(cm[i, j], fmt), fontsize=20,
+                 horizontalalignment="center",
+                 color="white" if cm[i, j] > thresh else "black")
+
+    plt.grid(None)
+    plt.tight_layout()
+    plt.ylabel('True label', size=18)
+    plt.xlabel('Predicted label', size=18)
+# In[]:
+print('\n _______________________ Random Forest + SMOTE _______________________')
+
+
+sm = SMOTE(random_state=42)
+train_res, train_labels_res = sm.fit_resample(train, train_labels)
+print('Resampled dataset shape %s' % Counter(train_labels_res))
+
+# In[]:
+print('\n _______________________ Training 100 Trees _______________________')
+
+# Create the model with 100 trees
+model = RandomForestClassifier(n_estimators=100,
+                               random_state=RSEED,
+                               max_features='sqrt',
+                               n_jobs=-1, verbose=1)
+
+# Fit on training data
+model.fit(train_res, train_labels_res)
+
+# In[]:
+n_nodes = []
+max_depths = []
+
+for ind_tree in model.estimators_:
+    n_nodes.append(ind_tree.tree_.node_count)
+    max_depths.append(ind_tree.tree_.max_depth)
+
+print(f'Average number of nodes {int(np.mean(n_nodes))}')
+print(f'Average maximum depth {int(np.mean(max_depths))}')
+
+# In[]:
+print('\n _______________________ Assess Random Forest Performance _______________________')
+train_rf_predictions = model.predict(train_res)
+train_rf_probs = model.predict_proba(train_res)[:, 1]
+
+rf_predictions = model.predict(test)
+rf_probs = model.predict_proba(test)[:, 1]
+
+evaluate_model(rf_predictions, rf_probs, train_rf_predictions, train_rf_probs, smote=True)
+plt.savefig('ROC_curve_rf_smote.png')
+plt.close()
+
+print('\n _______________________ Confusion matrix _______________________')
+cm = confusion_matrix(test_labels, rf_predictions)
+plot_confusion_matrix(cm, classes=['Individual', 'Organisation'],
+                      title='User Confusion Matrix')
+plt.savefig('Confusion_matrix_rf_smote.png')
+plt.close()
+
+print(f"f1score = {f1_score(test_labels, rf_predictions, average='binary')}")  # labels binary
+print(f1_score(test_labels, rf_predictions, average='micro'))  # global metric
+print(f'TPR = {cm[1][1]/(sum(cm[1])):.2f} (Predicting correctly a user is organisation)')
+print(f'TNR = {cm[0][0]/(sum(cm[0])):.2f} (Predicting correctly a user is individual)')
+# In[]:
+print('\n _______________________ Feature Importances _______________________')
+fi_model = pd.DataFrame({'feature': features,
+                         'importance': model.feature_importances_}).\
+    sort_values('importance', ascending=False)
+print(fi_model.head(10))
+
+def plot_feature_importances(fi_model):
+    # Reset style
+    plt.style.use('default')
+
+    # list of x locations for plotting
+    importances = fi_model.importance
+    x_values = list(range(len(importances)))
+
+    # Make a bar chart
+    plt.bar(x_values, importances, orientation='vertical', color='r', edgecolor='k', linewidth=1.2)
+
+    # Tick labels for x axis
+    plt.xticks(x_values, fi_model.feature, rotation='vertical')
+
+    # Axis labels and title
+    plt.ylabel('Importance'); plt.xlabel('Variable'); plt.title('Variable Importances')
+
+    # New figure for the cumulative plot (otherwise it draws over the bar chart)
+    plt.figure()
+
+    # List of features sorted from most to least important
+    sorted_importances = fi_model.importance
+    sorted_features = fi_model.feature
+
+    # Cumulative importances
+    cumulative_importances = np.cumsum(sorted_importances)
+
+    # Make a line graph
+    plt.plot(x_values, cumulative_importances, 'g-')
+
+    # Draw line at 95% of importance retained
+    plt.hlines(y=0.95, xmin=0, xmax=len(sorted_importances), color='r', linestyles='dashed')
+
+    # Format x ticks and labels
+    plt.xticks(x_values, sorted_features, rotation='vertical')
+
+    # Axis labels and title
+    plt.xlabel('Variable'); plt.ylabel('Cumulative Importance'); plt.title('Cumulative Importances')
+    plt.tight_layout()
+    plt.grid(True)
+
+plot_feature_importances(fi_model)
+# In[]:
+print('\n _______________________ Testing on another dataset _______________________')
+df1 = pd.read_csv('../Datasets/user_classification/ind_vs_bot/dataset1/user_features_labels_noNan_dropped_fav_retweet_cols.csv', index_col=0)
+labels1 = np.array(df1.pop('labels'))
+rf_predictions1 = model.predict(df1)
+cm = confusion_matrix(labels1, rf_predictions1)
+plot_confusion_matrix(cm, classes=['Individual', 'Organisation'],
+                      title='User Confusion Matrix')
+plt.savefig('Confusion_matrix_rf1_smote.png')
+plt.close()
+
+# In[]:
+print('\n _______________________ Pickle the model _______________________')
+
+rf_org_classifier_pkl = open('rf_smote_org_classifier.pkl', 'wb')
+pickle.dump(model, rf_org_classifier_pkl)
+rf_org_classifier_pkl.close()
+print('Pickling complete.')
+
+# In[]:
+print('\n _______________________ Unpickle the model _______________________')
+# Note: this loads the ind_vs_bot classifier, not the organisation model saved above.
+model_pkl = open('rf_smote_bot_classifier.pkl', 'rb')
+model = pickle.load(model_pkl)
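Caveat on the SMOTE usage above: resampling is done once on the training split, and `evaluate_model(..., smote=True)` reports train metrics on the resampled data rather than the original split. If cross-validated scores were wanted, imblearn's Pipeline keeps the resampling inside each fold; a sketch under those assumptions (not what the script does):

    from imblearn.pipeline import Pipeline
    from imblearn.over_sampling import SMOTE
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import cross_val_score

    pipe = Pipeline([('smote', SMOTE(random_state=42)),
                     ('rf', RandomForestClassifier(n_estimators=100, random_state=50))])
    # SMOTE is re-fit on the training portion of every fold; test folds stay untouched
    print(cross_val_score(pipe, train, train_labels, scoring='roc_auc', cv=5).mean())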
diff --git a/ScrapeTwitterTimeline_FeatureExtraction.py b/classifier/ScrapeTwitterTimeline_FeatureExtraction.py
similarity index 86%
rename from ScrapeTwitterTimeline_FeatureExtraction.py
rename to classifier/ScrapeTwitterTimeline_FeatureExtraction.py
index 83cffd7..bf2c7d3 100644
--- a/ScrapeTwitterTimeline_FeatureExtraction.py
+++ b/classifier/ScrapeTwitterTimeline_FeatureExtraction.py
@@ -17,8 +17,8 @@ def scrape_user_timeline(user, N):
 
     Parameters
     ----------
-    user : string
-        Twitter username.
+    user : string or int
+        Twitter screen_name or Twitter user_id.
     N : int
         Number of most recent posts of each user.
@@ -32,10 +32,13 @@ def scrape_user_timeline(user, N):
 
     # twitter api endpoint
     url = 'https://api.twitter.com/1.1/statuses/user_timeline.json'
-
-    params = dict(
-        screen_name=user,
-        count=N)
+
+    if isinstance(user, str):
+        params = dict(screen_name=user,
+                      count=N)
+    elif isinstance(user, int):
+        params = dict(user_id=user,
+                      count=N)
 
     with open('config.yaml', 'r') as ymlfile:
         cfg = yaml.load(ymlfile, Loader=yaml.FullLoader)
@@ -154,7 +157,7 @@ def fav(data, P):
 
         if (isinstance(data[i]['in_reply_to_status_id'], int)):  # a reply
             fav_count_replies.append(data[i]['favorite_count'])
-        elif ('retweeted_status' in data[i].keys() or data[i]['is_quote_status']):  # a retweet
+        elif ('retweeted_status' in data[i].keys()):  # or data[i]['is_quote_status']): # a retweet (quote check disabled)
             fav_count_retweets.append(data[i]['favorite_count'])
         else:
             # an original tweet
@@ -195,7 +198,7 @@ def ret(data, P):
 
         if (isinstance(data[i]['in_reply_to_status_id'], int)):  # a reply
             ret_count_replies.append(data[i]['retweet_count'])
-        elif ('retweeted_status' in data[i].keys() or data[i]['is_quote_status']):  # post is a retweet
+        elif ('retweeted_status' in data[i].keys()):  # or data[i]['is_quote_status']): # post is a retweet
             ret_count_retweets.append(data[i]['retweet_count'])
         else:
             # post is an original tweet
@@ -345,7 +348,7 @@ def main(users, N):
 
             counter += 1
             bar.update(counter)
-            print(f'\nScraping: {user}')
+            # print(f'\nScraping: {user}')
             data = scrape_user_timeline(user, N)
             if len(data) == 0:
                 # print('No posts found.')
@@ -412,41 +415,45 @@ def main(users, N):
                           Tavg]
 
     df = pd.DataFrame.from_dict(users_data_dict, orient='index')
-    df.columns = ['nFollowers',
-                  'nFollowings',
-                  'FollowersToFollowing',
-                  'nLists',
-                  'nFavs',
-                  'nPosts',
-                  'geo',
-                  'location',
-                  'url',
-                  'description',
-                  'verified',
-                  'fav_tweets',
-                  'fav_retweets',
-                  'fav_replies',
-                  'ret_tweets',
-                  'ret_retweets',
-                  'ret_replies',
-                  'pop_fav_tweets',
-                  'pop_fav_retweets',
-                  'pop_fav_replies',
-                  'pop_ret_tweets',
-                  'pop_ret_retweets',
-                  'pop_ret_replies',
-                  'nPostMention',
-                  'nPostQuote',
-                  'nPostPlace',
-                  'Tavg']
-
-    df.to_csv('users_features.csv')
+
+    try:
+        df.columns = ['nFollowers',
+                      'nFollowings',
+                      'FollowersToFollowing',
+                      'nLists',
+                      'nFavs',
+                      'nPosts',
+                      'geo',
+                      'location',
+                      'url',
+                      'description',
+                      'verified',
+                      'fav_tweets',
+                      'fav_retweets',
+                      'fav_replies',
+                      'ret_tweets',
+                      'ret_retweets',
+                      'ret_replies',
+                      'pop_fav_tweets',
+                      'pop_fav_retweets',
+                      'pop_fav_replies',
+                      'pop_ret_tweets',
+                      'pop_ret_retweets',
+                      'pop_ret_replies',
+                      'nPostMention',
+                      'nPostQuote',
+                      'nPostPlace',
+                      'Tavg']
+    except ValueError:
+        df.columns = []
+    df.index.name = 'username'
+    df.to_csv('user_features.csv')
     return df
 
 # In[]:
 if __name__ == '__main__':
-    # with open('users.csv', "r") as f:
-    #     users = f.readlines()
+    from get_usernames import get_usernames
     N = 200  # number of posts to scrape from user timeline
-    users = ['Miss_Asabe','mkayla_bayla','naijama']
-    df = main(users, N)
\ No newline at end of file
+    users = get_usernames('../Datasets/user_classification/ind_vs_bot/brexitday/brexitday.csv')
+    #users = [461277906]
+    #df = main(users, N)
\ No newline at end of file
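Note on the `isinstance` branching introduced above: if `user` is neither `str` nor `int` (e.g. a numpy integer coming out of `pd.read_csv`), `params` is never bound and the request fails with a NameError. A defensive variant, sketched (not in the patch):

    import numpy as np

    if isinstance(user, str):
        params = dict(screen_name=user, count=N)
    elif isinstance(user, (int, np.integer)):  # np.integer covers ids read via pandas
        params = dict(user_id=user, count=N)
    else:
        raise TypeError(f'user must be a screen name (str) or user id (int), got {type(user)}')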
diff --git a/classifier/classify_new_bots_orgs.py b/classifier/classify_new_bots_orgs.py
new file mode 100644
index 0000000..c1f72bd
--- /dev/null
+++ b/classifier/classify_new_bots_orgs.py
@@ -0,0 +1,32 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Feb 18 21:32:26 2020
+
+@author: I Kit Cheng
+"""
+from cleaning_feature_df import main_cleaning
+import pickle
+import pandas as pd
+
+def load_trained_model(model_pkl_file):
+    model_pkl = open(model_pkl_file, 'rb')
+    model = pickle.load(model_pkl)
+    return model
+
+if __name__ == '__main__':
+    csv_file = '../../Datasets/user_classification/ind_vs_bot/brexitday/user_features.csv'
+    df = main_cleaning(csv_file)
+    model_org = load_trained_model('rf_smote_org_classifier.pkl')
+    model_bot = load_trained_model('rf_smote_bot_classifier.pkl')
+    pred_org = model_org.predict(df)
+    pred_bot = model_bot.predict(df)
+    classifications = {'labels_bot': pred_bot, 'labels_org': pred_org}
+    df_classify = pd.DataFrame(classifications, columns=['labels_bot', 'labels_org'])
+    df_classify.index = df.index
+    df_classify.to_csv('bot_org_labels.csv')
+
+    # show accounts that are classified as both bot and organisation
+    bot = df_classify[df_classify.labels_bot == 1]
+    bot_and_org = bot[bot.labels_org == 1]
+    print(f'Number of bots and orgs: {len(bot_and_org)}')
\ No newline at end of file
diff --git a/classifier/cleaning_feature_df.py b/classifier/cleaning_feature_df.py
new file mode 100644
index 0000000..c514090
--- /dev/null
+++ b/classifier/cleaning_feature_df.py
@@ -0,0 +1,73 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Feb 18 20:59:17 2020
+
+@author: I Kit Cheng
+"""
+from sklearn.impute import SimpleImputer
+import pandas as pd
+import numpy as np
+
+# Replace numerical nans with median (the median is less sensitive to outliers)
+def replaceNans(df, strategy='median'):
+    """
+
+    Parameters
+    ----------
+    df : pandas.core.frame.DataFrame
+        A dataframe (rows are examples and columns are features).
+    strategy : string, optional
+        Replace nans with specified strategy. The default is 'median'.
+        Options are 'mean', 'median', 'most_frequent', 'constant'.
+
+    Returns
+    -------
+    df : pandas.core.frame.DataFrame
+        A dataframe without numerical nans.
+
+    """
+    print(f'Replacing Nans with {strategy}.')
+    imputer = SimpleImputer(missing_values=np.nan, strategy=strategy)
+    for i, col in enumerate(df.columns):
+        if len(df[col].unique()) == 2:  # categorical (binary) data
+            continue
+        else:
+            df[[col]] = imputer.fit_transform(df[[col]])
+    return df
+
+def bool2int(df, columns):
+    """
+
+    Parameters
+    ----------
+    df : pandas.core.frame.DataFrame
+        Dataframe with boolean columns.
+    columns : list
+        Column names with boolean data.
+
+    Returns
+    -------
+    df : pandas.core.frame.DataFrame
+        Dataframe without boolean data (converted to binary 0 or 1).
+
+    """
+    print('\nChanging boolean data to 0 or 1.')
+    for col in columns:
+        df[col] = df[col].astype(int)
+    return df
+
+def main_cleaning(csv_file):
+    df = pd.read_csv(csv_file, index_col=0)
+    df.index.name = 'username'
+    df = bool2int(df, ['geo', 'location', 'url', 'description', 'verified'])
+    df = replaceNans(df)
+    df.to_csv('user_features_noNan.csv')
+    print('___________________Cleaning Complete!_________________')
+    return df
+
+if __name__ == '__main__':
+    csv_file = '../Datasets/user_classification/ind_vs_bot/brexitday/user_features.csv'
+    df = main_cleaning(csv_file)
diff --git a/classifier/get_usernames.py b/classifier/get_usernames.py
new file mode 100644
index 0000000..291ae86
--- /dev/null
+++ b/classifier/get_usernames.py
@@ -0,0 +1,31 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Feb 18 15:45:03 2020
+
+@author: I Kit Cheng
+"""
+# Getting user names from 'All_Headers' csv files
+
+import pandas as pd
+
+def get_usernames(csv_file):
+    """
+
+    Parameters
+    ----------
+    csv_file : str
+        csv filename.
+
+    Returns
+    -------
+    users : list
+        List of usernames.
+
+    """
+    df = pd.read_csv(csv_file, header=None)
+    users = df[7].to_list()  # column 7 of the header csv holds the username
+    return users
+
+if __name__ == '__main__':
+    csv_file = 'brexitday.csv'
+    users = get_usernames(csv_file)
\ No newline at end of file
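Taken together, the new modules give an end-to-end labelling flow for a fresh scrape. Roughly, using the filenames the scripts themselves read and write (sketch only; `main` is commented out in the scraper's `__main__` and must actually be run for user_features.csv to exist):

    from get_usernames import get_usernames
    from ScrapeTwitterTimeline_FeatureExtraction import main
    from cleaning_feature_df import main_cleaning
    from classify_new_bots_orgs import load_trained_model

    users = get_usernames('brexitday.csv')       # usernames from the scrape headers
    main(users, N=200)                           # writes user_features.csv
    df = main_cleaning('user_features.csv')      # bool -> int, impute NaNs
    model = load_trained_model('rf_smote_bot_classifier.pkl')
    print(model.predict(df))                     # 1 = bot, 0 = individual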
+ + """ + df = pd.read_csv(csv_file, header=None) + users = df[7].to_list() + return users + +if __name__ == '__main__': + csv_file = 'brexitday.csv' + users = get_usernames(csv_file) \ No newline at end of file diff --git a/classifier/parse_json.py b/classifier/parse_json.py new file mode 100644 index 0000000..4145a7f --- /dev/null +++ b/classifier/parse_json.py @@ -0,0 +1,37 @@ +# -*- coding: utf-8 -*- +""" +Created on Wed Feb 12 17:05:55 2020 + +@author: I Kit Cheng +""" + +import json + +def parse_json(filename): + """ + + Parameters + ---------- + filename : string + Full path to the json file. + + Returns + ------- + data : dict or list + Returns a dictionary or a list of dictionaries. + + """ + + with open(filename, 'r') as f: + data = json.load(f) + return data + +# In[]: +if __name__ == "__main__": + filename = '../Datasets/user_classification/ind_vs_org/organization_training_unbalanced_lower.json' + data = parse_json(filename) + + usernames_labels = [] + for i, uid in enumerate(data['users'].keys()): + user_data = data['users'][uid] + usernames_labels.append([user_data['username'],user_data['label']]) diff --git a/classifier/rf_smote_bot_classifier.pkl b/classifier/rf_smote_bot_classifier.pkl new file mode 100644 index 0000000..dce087e Binary files /dev/null and b/classifier/rf_smote_bot_classifier.pkl differ diff --git a/classifier/rf_smote_org_classifier.pkl b/classifier/rf_smote_org_classifier.pkl new file mode 100644 index 0000000..6e384c1 Binary files /dev/null and b/classifier/rf_smote_org_classifier.pkl differ