-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Gun Control search instead of vaccines
- Loading branch information
Showing
41 changed files
with
22,240 additions
and
1 deletion.
There are no files selected for viewing
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,167 @@ | ||
# Session setup: set the working directory, attach data.table and read the gun-control video CSV into `df`. | ||
# NOTE(review): the absolute Windows path below ties this session to one machine. | ||
setwd("C:/Users/atryf/Desktop/Andreas/ESADE/Capstone/GunControl") | ||
library(data.table) | ||
#load dataset | ||
#unique video dataset or all videos dataset | ||
#df = fread("~/Documents/GitHub/esade_fake_news/4_Politics/python3_script/data/videos/20200504-193926_joe_biden_recommendations.csv") | ||
# NOTE(review): drops column V1 from `df`, but the only fread() above is commented out, | ||
# so `df` presumably already existed from earlier in the session -- confirm. | ||
df[, 'V1':=NULL] | ||
# NOTE(review): same file name without the ".csv" extension; the next line repeats the read | ||
# with the extension, so this looks like a failed first attempt. | ||
df = fread("20200625-155513_gun_control") | ||
df = fread("20200625-155513_gun_control.csv") | ||
# Per-genre summary: number of videos and number of distinct channels per genre, | ||
# merged into `genre_channel` and sorted by video count (descending). | ||
#groupby genre and count number of videos | ||
genre = df[, .N, by='genre'] | ||
View(genre) | ||
#list the distinct channels within each genre, then count them per genre | ||
channel = df[, unique(channel), by='genre'] | ||
channel2 = channel[, .N, by='genre'] | ||
View(channel) | ||
View(channel2) | ||
View(channel2) | ||
View(channel) | ||
#merge genre and channel2; both carry a column N, so the suffixes give N.video and N.channel | ||
genre_channel = merge(genre, | ||
channel2, | ||
by='genre', | ||
suffixes= c('.video', '.channel'))[order(-N.video)] | ||
View(genre_channel) | ||
#groupby channel and count number of channels | ||
# NOTE(review): 'channe;' is a typo for 'channel'; the command is retyped correctly just below. | ||
channel = df[, unique(channel), by='channe;'] | ||
channel2 = channel[, .N, by='channel'] | ||
#groupby channel and count number of channels | ||
# NOTE(review): grouping by channel and taking unique(channel) yields one row per channel, | ||
# so the .N count that follows is 1 for every channel. | ||
channel = df[, unique(channel), by='channel'] | ||
channel2 = channel[, .N, by='channel'] | ||
View(channel) | ||
View(channel2) | ||
#back to the per-genre version: distinct channels within each genre, counted per genre | ||
channel = df[, unique(channel), by='genre'] | ||
channel2 = channel[, .N, by='genre'] | ||
View(genre) | ||
View(genre_channel) | ||
View(channel) | ||
View(channel2) | ||
View(genre_channel) | ||
# Flag "relevant" genres, keep only videos in those genres, and count videos per channel. | ||
#import genre_channel with added relevant column | ||
# NOTE(review): this reads a joe_biden file and `genre_relevant` is not referenced again | ||
# in the lines visible here -- possibly a leftover from another analysis; confirm. | ||
genre_relevant = fread("~/Documents/GitHub/esade_fake_news/4_Politics/python3_script/data/genre/20200504-193926_joe_biden_genre_channel2.csv") | ||
#create relevant column: 1 if the genre matches one of the listed genre names (case-insensitive), else 0 | ||
# NOTE(review): the original note here read "29 videos contain keywords in title or description columns", | ||
# but the grepl() below tests only the genre column -- the note looks carried over; confirm. | ||
genre_channel$relevant = ifelse(grepl('Entertainment|Education|News & Politics|People & Blogs|Nonprofits & Activism', paste(genre_channel$genre), ignore.case=TRUE), 1, 0) | ||
View(genre_channel) | ||
#number of genres flagged as relevant | ||
sum(genre_channel$relevant) | ||
#merge datasets to obtain relevant videos. | ||
df2 = merge(df, | ||
genre_channel, | ||
by='genre') | ||
# 75% of videos are relevant | ||
mean(df2$relevant) | ||
#create new dataset with only relevant videos | ||
df_relevant = df2[relevant==1] | ||
#create dataset with only channel and count, ordered by channel name | ||
df_relevant_channel = df_relevant[, .N, by='channel'][order(channel)] | ||
# Attach media-bias ratings to channels. This stretch is a run of repeated attempts; the last | ||
# assignments win: df_updated is read with drop=c('Column1'), media_bias is read without its | ||
# 'Website' column, and df_merged is df_updated merged with media_bias on media_bias / 'News Source'. | ||
#load media bias dataset | ||
media_bias = fread("media_bias_data.csv", drop='Website') | ||
#merge cleaned channel dataset with media bias | ||
# NOTE(review): df_updated is first read further down in the visible lines, so this call | ||
# relies on it already existing in the session -- confirm. | ||
df_merged = merge(df_updated, | ||
media_bias, | ||
by.x='media_bias', | ||
by.y='News Source') | ||
#merge per-channel counts with media bias | ||
# NOTE(review): df_relevant_channel is built above from .N by channel, so it has no | ||
# 'media_bias' column to use as by.x; this attempt presumably failed. | ||
df_merged = merge(df_relevant_channel, | ||
media_bias, | ||
by.x='media_bias', | ||
by.y='News Source') | ||
View(df_relevant_channel) | ||
View(media_bias) | ||
#merge per-channel counts with media bias, matching the channel name against 'News Source' | ||
df_merged = merge(df_relevant_channel, | ||
media_bias, | ||
by.x='channel', | ||
by.y='News Source') | ||
View(df_merged) | ||
#merge per-channel counts with media bias (by.x='media_bias' again; see the note above) | ||
df_merged = merge(df_relevant_channel, | ||
media_bias, | ||
by.x='media_bias', | ||
by.y='News Source') | ||
#export df_relevant. Match channels with media bias rating in Excel. | ||
#the channel names are not exact matches. thus, manually prepare in Excel. | ||
# NOTE(review): no file argument here, so write.csv prints to the console instead of | ||
# writing a file; the call is repeated with a file name below. | ||
write.csv(df_relevant_channel) | ||
#export df_relevant. Match channels with media bias rating in Excel. | ||
#the channel names are not exact matches. thus, manually prepare in Excel. | ||
write.csv(df_relevant_channel,"20200625-155513_gun_control_channels.csv") | ||
View(df2) | ||
#load cleaned channel dataset (empty strings and "NA" read as NA; 'Column1' dropped) | ||
df_updated = fread("20200625-155513_gun_control_channels_clean.csv", na.strings=c("","NA"), drop=c('Column1')) | ||
#load cleaned channel dataset (this variant keeps all columns) | ||
df_updated = fread("20200625-155513_gun_control_channels_clean.csv", na.strings=c("","NA")) | ||
View(df_updated) | ||
#load cleaned channel dataset | ||
df_updated = fread("20200625-155513_gun_control_channels_clean.csv", na.strings=c("","NA"), drop=c('Column1')) | ||
View(df_updated) | ||
#load cleaned channel dataset | ||
df_updated = fread("20200625-155513_gun_control_channels_clean.csv", na.strings=c("","NA")) | ||
#load cleaned channel dataset | ||
df_updated = fread("20200625-155513_gun_control_channels_clean.csv", na.strings=c("","NA")) | ||
View(df_updated) | ||
#load cleaned channel dataset | ||
df_updated = fread("20200625-155513_gun_control_channels_clean.csv", na.strings=c("","NA")) | ||
#load cleaned channel dataset (final read: 'Column1' dropped) | ||
df_updated = fread("20200625-155513_gun_control_channels_clean.csv", na.strings=c("","NA"), drop=c('Column1')) | ||
#load media bias dataset | ||
media_bias = fread("media_bias_data.csv", drop='Website') | ||
#merge per-channel counts with media bias (by.x='media_bias' again; overwritten by the next merge) | ||
df_merged = merge(df_relevant_channel, | ||
media_bias, | ||
by.x='media_bias', | ||
by.y='News Source') | ||
#merge cleaned channel dataset with media bias | ||
# assumes the cleaned CSV carries a 'media_bias' column added during the Excel step -- TODO confirm | ||
df_merged = merge(df_updated, | ||
media_bias, | ||
by.x='media_bias', | ||
by.y='News Source') | ||
# Build the two video-level datasets: df_merged2 (relevant videos whose channel appears in | ||
# df_merged) and df_all (every video in df, with the df_merged columns where the channel matches). | ||
#remove N column (by reference) | ||
df_merged[, 'N':=NULL] | ||
#complete dataset: merge() default keeps only channels present in both tables | ||
df_merged2 = merge(df_relevant, | ||
df_merged, | ||
by='channel') | ||
View(df_merged) | ||
View(df_merged2) | ||
#complete dataset with all videos; all.x=TRUE keeps videos whose channel has no match in df_merged | ||
df_all =merge(df, | ||
df_merged, | ||
by='channel', | ||
all.x=TRUE) | ||
#keep only rows where views is not NA | ||
df_all = df_all[complete.cases(views), ] | ||
# share of raw videos that remain in df_merged2 (rows of df_merged2 over rows of df) | ||
nrow(df_merged2) / nrow(df) | ||
#table with count and percentage | ||
# tblFun(x): tabulates the values of x and returns a two-column matrix with one row per | ||
# distinct value: 'Count' (frequency) and 'Percentage' (share of the tabulated total, | ||
# times 100, rounded to 2 decimals). table() leaves NA values out by default, so the | ||
# percentages are relative to the non-missing values. | ||
tblFun <- function(x){ | ||
tbl <- table(x) | ||
res <- cbind(tbl,round(prop.table(tbl)*100,2)) | ||
colnames(res) <- c('Count','Percentage') | ||
res | ||
} | ||
# Summarise the Bias column of df_all with tblFun, export the result tables, and build the NLP dataset. | ||
# df_all[, 'Bias'] is a one-column data.table, so lapply() applies tblFun to that single column. | ||
group_by_bias = do.call(rbind,lapply(df_all[, 'Bias'],tblFun)) | ||
group_by_bias | ||
write.csv(group_by_bias, "20200625-155513_gun_control_channels_bias.csv") | ||
#export dataset with channel bias | ||
write.csv(df_merged2, "20200625-155513_gun_control_bias.csv") | ||
#convert Bias into binary | ||
#right = 1 | ||
#left = 0 | ||
# NOTE(review): "Least Biased" is coded 1 together with the Right categories -- confirm this is intended. | ||
mapping <- c("Left" = 0, "Left-Center" = 0, | ||
"Least Biased" = 1, "Right" = 1, "Right-Center" = 1) | ||
# look up each Bias label by name; a label that is not in `mapping` yields NA | ||
df_merged2$Bias_num <- mapping[df_merged2$Bias] | ||
#keep columns title, description, channel, id and Bias_num for NLP dataset | ||
df_nlp = df_merged2[, c('Bias_num' ,'title', 'description', 'channel','id')] | ||
#export NLP dataset | ||
write.csv(df_nlp, "20200625-155513_gun_control_nlp.csv") | ||
#export all videos dataset with media bias | ||
write.csv(df_all, "20200625-155513_gun_control_all.csv") | ||
# 32% Bias_num is 1, 68% is 0 | ||
# NOTE(review): sum() has no na.rm here, so any NA in Bias_num would make this ratio NA. | ||
sum(df_nlp$Bias_num) / nrow(df_nlp) | ||
# ad hoc size checks: distinct titles, then length(df) (number of columns) and length(df$title) (number of rows) | ||
uniqueN(df$title) | ||
# NOTE(review): `len` and `lenght` look like mistyped attempts at length(); each is followed by the corrected call. | ||
len(df) | ||
length(df) | ||
lenght(df$title) | ||
length(df$title) |
Oops, something went wrong.