
Commit

New Scraper
Added a new YouTube Scraper
mikkelduif committed May 1, 2020
1 parent ced1cf8 commit a5a4794
Showing 54 changed files with 63,169 additions and 3,538 deletions.
5 changes: 5 additions & 0 deletions .gitignore
@@ -0,0 +1,5 @@

4_Politics/youtube_recommendation_scrapper/subtitles/subtitles_20200319_Joe_Biden-2020-03-19/.DS_Store
4_Politics/youtube_recommendation_scrapper/.DS_Store
4_Politics/youtube_api_request_info/.Rhistory
Empty file modified 4_Politics/youtube_api_request_info/data/test.csv
100644 → 100755
Empty file.
Empty file.
@@ -884,7 +884,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
"version": "3.7.4"
}
},
"nbformat": 4,
Binary file added 5_5G/youtube_api_request_info/.DS_Store
Binary file not shown.
139 changes: 139 additions & 0 deletions 5_5G/youtube_api_request_info/.Rhistory
@@ -0,0 +1,139 @@
getwd() #print the current working directory
#Install libraries
#install.packages("RCurl")
#install.packages("jsonlite")
#install.packages("tuber")
#Load libraries
library(RCurl)
library(jsonlite)
library(tuber)
#Clean the list: remove the datestamps and keep unique IDs, as some are repeated
allids <- read.delim("data/video-infos-the-earth-is-flat-the earth is flat-20200430.csv", header=TRUE, sep=",", dec=".", fill=TRUE)
View(allids)
allids <- unique(allids)
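#Hedged sketch (assumes the CSV layout, not from the original history): if
#the ID sits in the first column next to a datestamp column, keeping only
#that column before de-duplicating matches the comment's stated intent.
#allids <- unique(allids[, 1, drop=FALSE])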
#Build a URL to call the API
URL_base='https://www.googleapis.com/youtube/v3/videos?id=' #this is the base URL
URL_details='&part=contentDetails&key=' #getting contentDetails for technical metadata
URL_key='YOUR_API_KEY' #key redacted; substitute your own YouTube Data API v3 key
#cred <- yt_oauth(app_id = "USE YOUR APP_ID", app_secret = "USE YOUR APP_SECRET", scope = "ssl", token = ".httr-oauth")
allids2 <- base::as.list(allids)
#Loop through URLS to retrieve basic info (duration, format)
alldata = data.frame()
ptm <- proc.time() #Time responses to the server
# General info - Youtube API part: contentDetails
for(i in 1:nrow(allids)){
cat('Iteration', i, '/', nrow(allids), '\n')
url = paste(URL_base, allids[i, ], URL_details, URL_key, sep = "")
dd <- getURL(url)
result <- fromJSON(dd)
id = result$items$id[[1]]
duration = result$items$contentDetails$duration
caption = result$items$contentDetails$caption
definition = result$items$contentDetails$definition
alldata = rbind(alldata, data.frame(id, duration, caption, definition))
}
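proc.time() - ptm #elapsed time for the loop above (closes the timer started with ptm)
#Hedged sketch (helper not in the original history): contentDetails$duration
#arrives as an ISO 8601 string such as "PT4M13S"; assuming the usual
#PT#H#M#S pattern, this converts it to seconds for analysis.
iso8601_to_seconds <- function(d) {
  part <- function(pattern) {
    m <- regmatches(d, regexpr(pattern, d))
    if (length(m) == 0) 0 else as.numeric(sub("[A-Z]", "", m))
  }
  part("\\d+H") * 3600 + part("\\d+M") * 60 + part("\\d+S")
}
#iso8601_to_seconds("PT4M13S") gives 253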
# Video info (title, description, etc.) - Youtube API part: snippet
alldata2 = data.frame()
URL_details2='&part=snippet&key=' #getting snippet for general metadata
for(i in 1:nrow(allids)){
cat('Iteration', i, '/', nrow(allids), '\n')
url2 = paste(URL_base, allids[i, ], URL_details2, URL_key, sep = "")
dd2 <- getURL(url2)
result2 <- fromJSON(dd2)
id2 = result2$items$id[[1]]
publishedAt = result2$items$snippet$publishedAt
channelid = result2$items$snippet$channelId
channeltitle = result2$items$snippet$channelTitle
title = result2$items$snippet$title
description = result2$items$snippet$description
tag = result2$items$snippet$tags
category = result2$items$snippet$categoryId
alldata2 = rbind(alldata2, data.frame(id2, title, description, publishedAt, channelid, channeltitle, category))
}
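#Hedged sketch (an assumption about the API, not in the original history):
#the videos endpoint accepts up to 50 comma-separated IDs per request, so
#batching would cut the number of calls (and quota use) roughly 50-fold.
ids <- as.character(allids[, 1]) #assumes the video ID is the first column
for (b in split(ids, ceiling(seq_along(ids) / 50))) {
  url <- paste0(URL_base, paste(b, collapse=","), URL_details2, URL_key)
  result <- fromJSON(getURL(url))
  #result$items holds one row per requested ID
}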
# Video statistics (likes, views, etc.) - Youtube API part: statistics
alldata3 = data.frame()
URL_details3='&part=statistics&key=' #getting statistics for engagement metadata (views, likes, comments)
for(i in 1:nrow(allids)){
cat('Iteration', i, '/', nrow(allids), '\n')
url3 = paste(URL_base, allids[i, ], URL_details3, URL_key, sep = "")
dd3 <- getURL(url3)
result3 <- fromJSON(dd3)
id3 = result3$items$id[[1]]
views = result3$items$statistics$viewCount
likes = result3$items$statistics$likeCount
dislikes = result3$items$statistics$dislikeCount
favorite = result3$items$statistics$favoriteCount
comments = result3$items$statistics$commentCount
alldata3 = rbind(alldata3, data.frame(id3, views, likes, dislikes, favorite, comments))
}
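#Hedged sketch (not in the original history): join the three frames on the
#video ID into a single table; id/id2/id3 are the columns built above.
allinfo <- merge(merge(alldata, alldata2, by.x="id", by.y="id2"),
                 alldata3, by.x="id", by.y="id3")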
View(allids2)
#Swap in a second API key (redacted) and rerun the contentDetails loop above
URL_key='YOUR_SECOND_API_KEY'
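#Hedged sketch (filename is hypothetical, not from the original history):
#persist the collected metadata so later steps can read it from data/.
#write.csv(allinfo, "data/video-metadata-20200430.csv", row.names=FALSE)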