-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Saved organisation and bot random forest classifiers in a pkl file; created
the script classify_new_bots_orgs.py, which returns a csv with user classification labels for each username.
- Loading branch information
Showing
18 changed files
with
886 additions
and
43 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,3 @@ | ||
.DS_Store | ||
config.yaml | ||
figures/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Binary file added
BIN
+4.9 MB
...ngTwitterRealTime/datasets/All_Headers/brexitday/scrape_data_2020-01-30-2020-01-31.tar.gz
Binary file not shown.
Binary file added
BIN
+13.1 MB
...ngTwitterRealTime/datasets/All_Headers/brexitday/scrape_data_2020-01-31-2020-02-01.tar.gz
Binary file not shown.
Binary file added
BIN
+17.9 MB
...ngTwitterRealTime/datasets/All_Headers/brexitday/scrape_data_2020-02-01-2020-02-02.tar.gz
Binary file not shown.
Binary file added
BIN
+5.03 MB
...ngTwitterRealTime/datasets/All_Headers/brexitday/scrape_data_2020-02-02-2020-02-03.tar.gz
Binary file not shown.
Binary file added
BIN
+1.77 MB
...ngTwitterRealTime/datasets/All_Headers/brexitday/scrape_data_2020-02-03-2020-02-04.tar.gz
Binary file not shown.
Binary file added
BIN
+829 KB
...ngTwitterRealTime/datasets/All_Headers/brexitday/scrape_data_2020-02-04-2020-02-05.tar.gz
Binary file not shown.
196 changes: 196 additions & 0 deletions
196
classifier/Generate_training_data_for_user_classification.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,196 @@ | ||
# -*- coding: utf-8 -*- | ||
""" | ||
Created on Mon Feb 10 01:46:24 2020 | ||
@author: I Kit Cheng | ||
""" | ||
|
||
# In[]: | ||
|
||
# Generate features from training data | ||
from ScrapeTwitterTimeline_FeatureExtraction import main | ||
import pandas as pd | ||
import numpy as np | ||
import seaborn as sns | ||
import matplotlib.pyplot as plt | ||
|
||
|
||
def relabel_dataset(df):
    """
    Build binary user-type labels from the raw gender-classifier data.

    Rows whose ``gender`` is missing or equal to ``'unknown'`` are discarded.
    The remaining rows are labelled 0 when ``gender`` is ``'male'`` or
    ``'female'`` and 1 otherwise (e.g. ``'brand'``). The label distribution
    is printed before and after cleaning. The input dataframe itself is not
    modified.

    Parameters
    ----------
    df : pandas.core.frame.DataFrame
        Dataframe with at least a ``gender`` and a ``name`` column.

    Returns
    -------
    df_labels : pandas.core.frame.DataFrame
        Single-column (``labels``) dataframe indexed by the ``name`` values
        of the rows that were kept.
    """
    # Label Distribution
    print('\nLabel Distribution:')
    print(df['gender'].value_counts())

    # Keep only rows with a usable label: drop missing and 'unknown' gender.
    gender = df['gender']
    df = df[gender.notna() & (gender != 'unknown')]

    print(f'\nClean df length: {len(df)}')

    # Label Distribution (clean)
    print('\nLabel Distribution:')
    print(df['gender'].value_counts())

    # Combine 'male' and 'female' into 0; every other remaining value
    # (e.g. 'brand') becomes 1. Vectorised instead of a per-row Python loop.
    is_individual = df['gender'].isin(['female', 'male'])
    label_values = (~is_individual).astype(int).tolist()
    df_labels = pd.DataFrame(label_values, columns=['labels'],
                             index=df['name'])

    return df_labels
|
||
# In[]: | ||
|
||
|
||
def bool2int(df, columns):
    """
    Convert the given boolean columns to integers (0 or 1) in place.

    All requested columns are checked up front so that a misspelled name
    cannot leave the dataframe partially converted.

    Parameters
    ----------
    df : pandas.core.frame.DataFrame
        Dataframe with boolean columns. It is modified in place.
    columns : list
        Column names with boolean data.

    Returns
    -------
    df : pandas.core.frame.DataFrame
        The same dataframe object, with the listed columns cast to int.

    Raises
    ------
    KeyError
        If any of the requested columns is not present in ``df``.
    """
    columns = list(columns)
    missing = [col for col in columns if col not in df.columns]
    if missing:
        raise KeyError(f'Columns not found in dataframe: {missing}')

    print('\nChanging boolean data to 0 or 1.')
    for col in columns:
        df[col] = df[col].astype(int)
    return df
|
||
def matching_labels_to_new_features(df, labels=None,
                                    output_path='user_features_labels.csv'):
    """
    Attach the label of each user to the feature dataframe and save it.

    Each index value of ``df`` is looked up in the index of ``labels``.
    When a user appears more than once in ``labels``, the first occurrence
    is used. The result is written to ``output_path`` as CSV.

    Parameters
    ----------
    df : pandas.core.frame.DataFrame
        Dataframe without labels, indexed by username. It is modified in
        place (index renamed to ``username``, ``labels`` column added).
    labels : pandas.core.frame.DataFrame, optional
        Dataframe whose first column holds the labels and whose index holds
        the usernames. The default is None, in which case the module-level
        ``df_labels`` is used (the original behaviour).
    output_path : str, optional
        Destination of the CSV file. The default is
        'user_features_labels.csv'.

    Returns
    -------
    df : pandas.core.frame.DataFrame
        Dataframe with labels.

    Raises
    ------
    KeyError
        If a username in ``df`` has no entry in ``labels``.
    """
    if labels is None:
        # Backward-compatible fallback to the frame built by the script
        # entry point of this module.
        labels = df_labels

    print('Matching labels to new features dataframe.')

    # First column holds the label; keep only the first row per username so
    # that the lookup below returns exactly one value per user.
    label_column = labels.iloc[:, 0]
    label_column = label_column[~label_column.index.duplicated(keep='first')]

    unmatched = df.index[~df.index.isin(label_column.index)]
    if len(unmatched) > 0:
        raise KeyError(f'No label found for users: {list(unmatched[:10])}')

    # Positional values (not an aligned Series) so that the assignment below
    # behaves like assigning a plain list, one label per row of df.
    labels_for_sample = label_column.reindex(df.index).to_numpy()

    df.index.names = ['username']  # name the index column
    df['labels'] = labels_for_sample
    df.to_csv(output_path)
    return df
|
||
|
||
# In[]: | ||
|
||
########################################### plot distribution of each variable ###################################### | ||
def plotDist(save=False, data=None):
    """
    Plot the distribution of every column except the first one.

    A kernel density estimate is drawn for each column; when seaborn raises
    a RuntimeError a histogram is drawn instead. Each figure is closed after
    it has been (optionally) saved.

    Parameters
    ----------
    save : bool, optional
        Save plot option. The default is False. Files are written as
        'dist_<column>.png' in the current working directory.
    data : pandas.core.frame.DataFrame, optional
        Dataframe to plot. The default is None, in which case the
        module-level ``df`` is used (the original behaviour).

    Returns
    -------
    None.
    """
    if data is None:
        # Backward-compatible fallback to the module-level dataframe.
        data = df

    # NOTE(review): the first column is skipped, as in the original code;
    # confirm that it is not a feature that should be plotted as well.
    for col in data.columns[1:]:
        print(col)
        plt.figure()
        try:
            ax = sns.kdeplot(data[col])
            # A legend is not always created; only remove it when present.
            legend = ax.get_legend()
            if legend is not None:
                legend.remove()
        except RuntimeError:
            data[col].hist()
        plt.title(col)
        # Save BEFORE closing: once the figure is closed, savefig would
        # write a new, empty figure instead of the plot.
        if save:
            plt.savefig('dist_'+col+'.png')
        plt.close()
|
||
#plotDist() | ||
# In[]: | ||
######################################### Dealing with missing data ############################################## | ||
|
||
from sklearn.impute import SimpleImputer | ||
|
||
# Replace numerical nans with median (the median is less sensitive to outliers) | ||
def replaceNans(df, strategy='median'):
    """
    Impute missing values column by column with scikit-learn's SimpleImputer.

    The first and the last column are left untouched. Complete binary
    columns (exactly two distinct values and no missing entries) are treated
    as categorical and skipped. Every other column is passed through the
    imputer.

    Parameters
    ----------
    df : pandas.core.frame.DataFrame
        A dataframe (rows are examples and columns are features). It is
        modified in place.
    strategy: string, optional
        Replace nans with specified strategy. The default is 'median'.
        Options are 'mean', 'median', 'most_frequent', 'constant'.

    Returns
    -------
    df : pandas.core.frame.DataFrame
        The same dataframe, with missing values replaced in the processed
        columns.
    """
    print(f'Replacing Nans with {strategy}.')
    imputer = SimpleImputer(missing_values=np.nan, strategy=strategy)
    # NOTE(review): columns[1:-1] skips the first and last column; the last
    # one is presumably the label column - confirm that the first column is
    # meant to be excluded too.
    for col in df.columns[1:-1]:
        column = df[col]
        # NaN must not count as one of the two "binary" values: a column
        # holding a single value plus NaN still needs imputing.
        is_complete_binary = (not column.isna().any()
                              and column.nunique() == 2)
        if is_complete_binary:  # categorical (binary) data
            continue
        df[[col]] = imputer.fit_transform(df[[col]])
    return df
|
||
# In[]: | ||
if __name__ == '__main__':
    # Seed for reproducible runs. It is only needed when the input data is
    # sub-sampled (see below), which is currently disabled.
    RSEED = 50

    print('\n################# Begin Scraping User Timeline: #####################')

    # The full dataset is loaded. To speed up experiments, sub-sample it by
    # appending e.g. .sample(1000, random_state=RSEED) to the read_csv call.
    df = pd.read_csv('../Datasets/gender-classifier.csv', encoding="ISO-8859-1")
    users = df.name.to_list()
    # Module-level name: matching_labels_to_new_features() reads it.
    df_labels = relabel_dataset(df)

    # Set to True to scrape the user timelines again; otherwise the features
    # from a previous run are read from disk.
    scrape = False
    if scrape:
        main(users, N=200)  # saves features in users_features.csv

    df = pd.read_csv('users_features.csv', index_col=0)

    df = bool2int(df, ['geo', 'location', 'url', 'description', 'verified'])
    # Adds the labels and writes user_features_labels.csv
    matching_labels_to_new_features(df)

    df = pd.read_csv('user_features_labels.csv', index_col=0)  # training data (unclean)
    df.index.name = 'username'
    df = replaceNans(df)
    df.to_csv('user_features_labels_noNan.csv')
    print('___________________Done cleaning!_________________')
Oops, something went wrong.