Skip to content

Commit

Permalink
Gun Control search instead of vaccines
Browse files Browse the repository at this point in the history
  • Loading branch information
atryfonos committed Jun 28, 2020
1 parent 12cc2a5 commit b24c8a9
Show file tree
Hide file tree
Showing 41 changed files with 22,240 additions and 1 deletion.
Binary file added 2_Gun Control/.RData
Binary file not shown.
167 changes: 167 additions & 0 deletions 2_Gun Control/.Rhistory
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
# Session setup, data load, and per-genre video/channel counts.
# This is an interactive command history: several commands are retries of the
# one before them, so the sequence is kept exactly as recorded.
# NOTE(review): the working directory below is an absolute, machine-specific
# path; the relative file names used later depend on it.
setwd("C:/Users/atryf/Desktop/Andreas/ESADE/Capstone/GunControl")
library(data.table)
# Load the dataset (either the unique-video dataset or the all-videos dataset).
# The commented-out call reads a different (Joe Biden recommendations) CSV.
#df = fread("~/Documents/GitHub/esade_fake_news/4_Politics/python3_script/data/videos/20200504-193926_joe_biden_recommendations.csv")
# Drop column V1 by reference.
# NOTE(review): no `df` is assigned earlier in the visible history (the only
# earlier fread is commented out), so this relies on a `df` already present in
# the session -- confirm.
df[, 'V1':=NULL]
# NOTE(review): the first call has no ".csv" extension and is immediately
# retried with it; presumably the first attempt failed -- confirm.
df = fread("20200625-155513_gun_control")
df = fread("20200625-155513_gun_control.csv")
# Count the number of rows (videos) per genre.
genre = df[, .N, by='genre']
View(genre)
# List the distinct channels within each genre, then count them per genre.
channel = df[, unique(channel), by='genre']
channel2 = channel[, .N, by='genre']
# Open the intermediate tables in the data viewer for inspection.
View(channel)
View(channel2)
View(channel2)
View(channel)
# Join video counts and channel counts on genre. Both inputs have a column
# named N, so the suffixes rename them to N.video and N.channel; the result is
# sorted by N.video in descending order.
genre_channel = merge(genre,
channel2,
by='genre',
suffixes= c('.video', '.channel'))[order(-N.video)]
View(genre_channel)
# Attempt to group by channel instead of genre.
# NOTE(review): 'channe;' looks like a typo for 'channel'; the command is
# retried with the corrected name right below.
channel = df[, unique(channel), by='channe;']
channel2 = channel[, .N, by='channel']
# Corrected retry: group by channel. Each group holds a single channel value.
channel = df[, unique(channel), by='channel']
channel2 = channel[, .N, by='channel']
View(channel)
View(channel2)
# Re-run the per-genre version, so `channel` and `channel2` are per-genre
# tables again from here on.
channel = df[, unique(channel), by='genre']
channel2 = channel[, .N, by='genre']
View(genre)
View(genre_channel)
View(channel)
View(channel2)
View(genre_channel)
# Flag relevant genres and keep only the videos that belong to them.
# Read a genre/channel table from the Joe Biden analysis.
# NOTE(review): `genre_relevant` is not referenced again in the visible
# history -- confirm whether this load is still needed.
genre_relevant = fread("~/Documents/GitHub/esade_fake_news/4_Politics/python3_script/data/genre/20200504-193926_joe_biden_genre_channel2.csv")
# Add a `relevant` column: 1 when the genre matches (case-insensitively) any of
# the listed genre names, 0 otherwise. grepl() does a pattern match, so a genre
# only needs to contain one of the names.
# NOTE(review): the previous comment here spoke of 29 videos matching keywords
# in title/description; the code only inspects `genre` -- confirm intent.
genre_channel$relevant = ifelse(grepl('Entertainment|Education|News & Politics|People & Blogs|Nonprofits & Activism', paste(genre_channel$genre), ignore.case=TRUE), 1, 0)
View(genre_channel)
# Number of genres flagged as relevant.
sum(genre_channel$relevant)
# Attach the per-genre columns of genre_channel (including `relevant`) to every
# video row by joining on genre.
df2 = merge(df,
genre_channel,
by='genre')
# Share of video rows whose genre is relevant (the author recorded 75%).
mean(df2$relevant)
# Keep only the rows flagged as relevant.
df_relevant = df2[relevant==1]
# Count relevant videos per channel, sorted by channel name. The result has
# only the columns `channel` and `N`.
df_relevant_channel = df_relevant[, .N, by='channel'][order(channel)]
# Match channels to media-bias ratings. This stretch of the history contains
# several retried attempts; only the last assignment of each object
# (df_updated, media_bias, df_merged) is the one later commands see.
# Load the media bias ratings, skipping the 'Website' column.
media_bias = fread("media_bias_data.csv", drop='Website')
# First merge attempt.
# NOTE(review): `df_updated` is first assigned further down in this history,
# so at this point it is presumably undefined -- confirm.
df_merged = merge(df_updated,
media_bias,
by.x='media_bias',
by.y='News Source')
# Second attempt, using the per-channel counts as the left table.
# NOTE(review): df_relevant_channel is built with only `channel` and `N`
# columns, so it has no 'media_bias' key column -- this presumably fails.
df_merged = merge(df_relevant_channel,
media_bias,
by.x='media_bias',
by.y='News Source')
View(df_relevant_channel)
View(media_bias)
# Third attempt: join directly on the raw channel name.
df_merged = merge(df_relevant_channel,
media_bias,
by.x='channel',
by.y='News Source')
View(df_merged)
# Repeat of the 'media_bias'-keyed merge on df_relevant_channel (same caveat
# as the second attempt above).
df_merged = merge(df_relevant_channel,
media_bias,
by.x='media_bias',
by.y='News Source')
# Export the per-channel counts so channels can be matched to media-bias
# ratings by hand in Excel; the channel names are not exact matches.
# NOTE(review): this first call passes no file argument, so the CSV text goes
# to the console rather than to a file; the next call supplies the file name.
write.csv(df_relevant_channel)
# Same export, this time written to a CSV file in the working directory.
write.csv(df_relevant_channel,"20200625-155513_gun_control_channels.csv")
View(df2)
# Load the manually cleaned channel file. Empty strings and "NA" are read as
# missing values. The calls below alternate between dropping and keeping the
# 'Column1' column; the last call in the sequence drops it.
df_updated = fread("20200625-155513_gun_control_channels_clean.csv", na.strings=c("","NA"), drop=c('Column1'))
# Reload, keeping 'Column1'.
df_updated = fread("20200625-155513_gun_control_channels_clean.csv", na.strings=c("","NA"))
View(df_updated)
# Reload, dropping 'Column1'.
df_updated = fread("20200625-155513_gun_control_channels_clean.csv", na.strings=c("","NA"), drop=c('Column1'))
View(df_updated)
# Reload, keeping 'Column1'.
df_updated = fread("20200625-155513_gun_control_channels_clean.csv", na.strings=c("","NA"))
# Reload, keeping 'Column1' (repeat).
df_updated = fread("20200625-155513_gun_control_channels_clean.csv", na.strings=c("","NA"))
View(df_updated)
# Reload, keeping 'Column1' (repeat).
df_updated = fread("20200625-155513_gun_control_channels_clean.csv", na.strings=c("","NA"))
# Final reload, dropping 'Column1'; this is the df_updated used below.
df_updated = fread("20200625-155513_gun_control_channels_clean.csv", na.strings=c("","NA"), drop=c('Column1'))
# Reload the media bias ratings, skipping the 'Website' column.
media_bias = fread("media_bias_data.csv", drop='Website')
# Another 'media_bias'-keyed merge on df_relevant_channel (same caveat as
# above: that table has no 'media_bias' column).
df_merged = merge(df_relevant_channel,
media_bias,
by.x='media_bias',
by.y='News Source')
# Final merge: join the cleaned channel table to the ratings, matching its
# 'media_bias' column against 'News Source'.
# NOTE(review): presumably 'media_bias' is the column added by hand in Excel
# with the rating-source name for each channel -- confirm against the CSV.
df_merged = merge(df_updated,
media_bias,
by.x='media_bias',
by.y='News Source')
# Build the final video-level datasets from the channel/bias table.
# Drop the per-channel count column N from df_merged by reference.
df_merged[, 'N':=NULL]
# Relevant videos with a bias rating: merge() keeps only rows whose channel
# appears in both tables.
df_merged2 = merge(df_relevant,
df_merged,
by='channel')
View(df_merged)
View(df_merged2)
# All videos: all.x=TRUE keeps every row of df, leaving the df_merged columns
# missing for channels that have no match.
df_all =merge(df,
df_merged,
by='channel',
all.x=TRUE)
# Keep only the rows where `views` is not missing.
df_all = df_all[complete.cases(views), ]
# Share of the raw dataset's rows that survive into df_merged2.
nrow(df_merged2) / nrow(df)
# Frequency table of `x` with absolute counts and percentage shares.
# For a vector `x` the result is a matrix with one row per distinct value
# (missing values are left out by table()'s default) and two columns:
# 'Count' is the number of occurrences, 'Percentage' is the share of the
# total, rounded to two decimals.
tblFun <- function(x) {
  counts <- table(x)
  shares <- round(100 * prop.table(counts), 2)
  out <- cbind(counts, shares)
  colnames(out) <- c('Count', 'Percentage')
  out
}
# Summarise bias labels, export the datasets, and run a few ad-hoc size checks.
# df_all[, 'Bias'] is a one-column table, so lapply() calls tblFun (defined
# earlier in this file) once on the Bias column; do.call(rbind, ...) stacks the
# result into a single count/percentage table.
group_by_bias = do.call(rbind,lapply(df_all[, 'Bias'],tblFun))
group_by_bias
write.csv(group_by_bias, "20200625-155513_gun_control_channels_bias.csv")
# Export the relevant videos together with their channel bias.
write.csv(df_merged2, "20200625-155513_gun_control_bias.csv")
# Convert the Bias label into a binary code: right = 1, left = 0.
# NOTE(review): "Least Biased" is mapped to 1, i.e. grouped with the right-
# leaning labels -- confirm that this is intended.
mapping <- c("Left" = 0, "Left-Center" = 0,
"Least Biased" = 1, "Right" = 1, "Right-Center" = 1)
# Look each Bias label up by name; labels not listed in `mapping` become NA.
df_merged2$Bias_num <- mapping[df_merged2$Bias]
# Keep only the columns needed for the NLP dataset.
df_nlp = df_merged2[, c('Bias_num' ,'title', 'description', 'channel','id')]
# Export the NLP dataset.
write.csv(df_nlp, "20200625-155513_gun_control_nlp.csv")
# Export the all-videos dataset with media bias attached.
write.csv(df_all, "20200625-155513_gun_control_all.csv")
# Share of rows with Bias_num equal to 1 (the author recorded 32% ones and
# 68% zeros). The result is NA if any Bias_num is missing.
sum(df_nlp$Bias_num) / nrow(df_nlp)
# Number of distinct video titles in the raw dataset.
uniqueN(df$title)
# NOTE(review): `len` is not defined anywhere in the visible history and is
# presumably a slip for length(), which is tried next.
len(df)
# length() of a data.table is its number of columns, not its number of rows.
length(df)
# NOTE(review): `lenght` is a misspelling of length(); corrected on the next line.
lenght(df$title)
# Number of entries in the title column, i.e. the number of rows in df.
length(df$title)
Loading

0 comments on commit b24c8a9

Please sign in to comment.