Gun Control search instead of vaccines

Rolfstx · Jun 28, 2020 · b24c8a9 · b24c8a9
1 parent 12cc2a5
commit b24c8a9
Show file tree

Hide file tree

Showing 41 changed files with 22,240 additions and 1 deletion.
diff --git a/2_Gun Control/.RData b/2_Gun Control/.RData
diff --git a/2_Gun Control/.Rhistory b/2_Gun Control/.Rhistory
@@ -0,0 +1,167 @@
+setwd("C:/Users/atryf/Desktop/Andreas/ESADE/Capstone/GunControl")
+library(data.table)
+#load dataset
+#unique video dataset or all videos dataset
+#df = fread("~/Documents/GitHub/esade_fake_news/4_Politics/python3_script/data/videos/20200504-193926_joe_biden_recommendations.csv")
+df[, 'V1':=NULL]
+df = fread("20200625-155513_gun_control")
+df = fread("20200625-155513_gun_control.csv")
+#groupby genre and count number of videos
+genre = df[, .N, by='genre']
+View(genre)
+#groupby channel and count number of channels
+channel = df[, unique(channel), by='genre']
+channel2 = channel[, .N, by='genre']
+View(channel)
+View(channel2)
+View(channel2)
+View(channel)
+#merge genre and channel2
+genre_channel = merge(genre,
+channel2,
+by='genre',
+suffixes= c('.video', '.channel'))[order(-N.video)]
+View(genre_channel)
+#groupby channel and count number of channels
+channel = df[, unique(channel), by='channe;']
+channel2 = channel[, .N, by='channel']
+#groupby channel and count number of channels
+channel = df[, unique(channel), by='channel']
+channel2 = channel[, .N, by='channel']
+View(channel)
+View(channel2)
+#groupby channel and count number of channels
+channel = df[, unique(channel), by='genre']
+channel2 = channel[, .N, by='genre']
+View(genre)
+View(genre_channel)
+View(channel)
+View(channel2)
+View(genre_channel)
+#import genre_channel with added relevant column
+genre_relevant = fread("~/Documents/GitHub/esade_fake_news/4_Politics/python3_script/data/genre/20200504-193926_joe_biden_genre_channel2.csv")
+#create relevant column: equals 1 if the genre matches one of the listed categories, 0 otherwise
+#29 videos contain keywords in title or description columns
+genre_channel$relevant = ifelse(grepl('Entertainment|Education|News & Politics|People & Blogs|Nonprofits & Activism', paste(genre_channel$genre), ignore.case=TRUE), 1, 0)
+View(genre_channel)
+sum(genre_channel$relevant)
+#merge datasets to obtain relevant videos.
+df2 = merge(df,
+genre_channel,
+by='genre')
+# 75% of videos are relevant
+mean(df2$relevant)
+#create new dataset with only relevant videos
+df_relevant = df2[relevant==1]
+#create dataset with only channel and count.
+df_relevant_channel = df_relevant[, .N, by='channel'][order(channel)]
+#load media bias dataset
+media_bias = fread("media_bias_data.csv", drop='Website')
+#merge cleaned channel dataset with media bias
+df_merged = merge(df_updated,
+media_bias,
+by.x='media_bias',
+by.y='News Source')
+#merge cleaned channel dataset with media bias
+df_merged = merge(df_relevant_channel,
+media_bias,
+by.x='media_bias',
+by.y='News Source')
+View(df_relevant_channel)
+View(media_bias)
+#merge cleaned channel dataset with media bias
+df_merged = merge(df_relevant_channel,
+media_bias,
+by.x='channel',
+by.y='News Source')
+View(df_merged)
+#merge cleaned channel dataset with media bias
+df_merged = merge(df_relevant_channel,
+media_bias,
+by.x='media_bias',
+by.y='News Source')
+#export df_relevant. Match channels with media bias rating in Excel.
+#The channel names are not exact matches; thus, manually prepare in Excel.
+write.csv(df_relevant_channel)
+#export df_relevant. Match channels with media bias rating in Excel.
+#The channel names are not exact matches; thus, manually prepare in Excel.
+write.csv(df_relevant_channel,"20200625-155513_gun_control_channels.csv")
+View(df2)
+#load cleaned channel dataset
+df_updated = fread("20200625-155513_gun_control_channels_clean.csv", na.strings=c("","NA"), drop=c('Column1'))
+#load cleaned channel dataset
+df_updated = fread("20200625-155513_gun_control_channels_clean.csv", na.strings=c("","NA"))
+View(df_updated)
+#load cleaned channel dataset
+df_updated = fread("20200625-155513_gun_control_channels_clean.csv", na.strings=c("","NA"), drop=c('Column1'))
+View(df_updated)
+#load cleaned channel dataset
+df_updated = fread("20200625-155513_gun_control_channels_clean.csv", na.strings=c("","NA"))
+#load cleaned channel dataset
+df_updated = fread("20200625-155513_gun_control_channels_clean.csv", na.strings=c("","NA"))
+View(df_updated)
+#load cleaned channel dataset
+df_updated = fread("20200625-155513_gun_control_channels_clean.csv", na.strings=c("","NA"))
+#load cleaned channel dataset
+df_updated = fread("20200625-155513_gun_control_channels_clean.csv", na.strings=c("","NA"), drop=c('Column1'))
+#load media bias dataset
+media_bias = fread("media_bias_data.csv", drop='Website')
+#merge cleaned channel dataset with media bias
+df_merged = merge(df_relevant_channel,
+media_bias,
+by.x='media_bias',
+by.y='News Source')
+#merge cleaned channel dataset with media bias
+df_merged = merge(df_updated,
+media_bias,
+by.x='media_bias',
+by.y='News Source')
+#remove N column
+df_merged[, 'N':=NULL]
+#complete dataset
+df_merged2 = merge(df_relevant,
+df_merged,
+by='channel')
+View(df_merged)
+View(df_merged2)
+#complete dataset with all videos
+df_all =merge(df,
+df_merged,
+by='channel',
+all.x=TRUE)
+#ensure complete cases in views (drop rows where views is missing)
+df_all = df_all[complete.cases(views), ]
+# % of videos still present in raw dataset.
+nrow(df_merged2) / nrow(df)
+#table with count and percentage
+tblFun <- function(x){ # x: vector of values to tabulate; result is a matrix with one row per distinct value
+tbl <- table(x) # frequency of each distinct value (table() excludes NA by default)
+res <- cbind(tbl,round(prop.table(tbl)*100,2)) # column 1: raw counts, column 2: share of the total in percent, rounded to 2 decimals
+colnames(res) <- c('Count','Percentage') # label the two columns built above
+res # last expression is the return value
+}
+#apply tblFun to the Bias column of df_all and stack the result into one count/percentage table
+group_by_bias = do.call(rbind,lapply(df_all[, 'Bias'],tblFun))
+group_by_bias
+write.csv(group_by_bias, "20200625-155513_gun_control_channels_bias.csv")
+#export dataset with channel bias
+write.csv(df_merged2, "20200625-155513_gun_control_bias.csv")
+#convert Bias into binary
+#right = 1
+#left = 0
+mapping <- c("Left" = 0, "Left-Center" = 0,
+"Least Biased" = 1, "Right" = 1, "Right-Center" = 1)
+df_merged2$Bias_num <- mapping[df_merged2$Bias]
+#keep columns title, description and Bias_num for NLP dataset
+df_nlp = df_merged2[, c('Bias_num' ,'title', 'description', 'channel','id')]
+#export NLP dataset
+write.csv(df_nlp, "20200625-155513_gun_control_nlp.csv")
+#export all videos dataset with media bias
+write.csv(df_all, "20200625-155513_gun_control_all.csv")
+# 32% Bias_num is 1, 68% is 0
+sum(df_nlp$Bias_num) / nrow(df_nlp)
+uniqueN(df$title)
+len(df)
+length(df)
+lenght(df$title)
+length(df$title)