
Commit

New Scraper
Added a new YouTube Scraper
mikkelduif committed May 1, 2020
1 parent ced1cf8 commit a5a4794
Showing 54 changed files with 63,169 additions and 3,538 deletions.
5 changes: 5 additions & 0 deletions .gitignore
@@ -0,0 +1,5 @@

4_Politics/youtube_recommendation_scrapper/subtitles/subtitles_20200319_Joe_Biden-2020-03-19/.DS_Store
4_Politics/youtube_recommendation_scrapper/.DS_Store
4_Politics/youtube_api_request_info/.Rhistory
Empty file modified 4_Politics/youtube_api_request_info/data/test.csv
100644 → 100755
Empty file.
Empty file.
@@ -884,7 +884,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
"version": "3.7.4"
}
},
"nbformat": 4,
Binary file added 5_5G/youtube_api_request_info/.DS_Store
Binary file not shown.
139 changes: 139 additions & 0 deletions 5_5G/youtube_api_request_info/.Rhistory
@@ -0,0 +1,139 @@
getwd() #print the current working directory
#Install libraries
#install.packages("RCurl")
#install.packages("jsonlite")
#install.packages("tuber")
#Load libraries
library(RCurl)
library(jsonlite)
library(tuber)
#Clean the list: remove the datestamps and keep unique IDs, as some are repeated
allids <- read.delim("data/video-infos-the-earth-is-flat-the earth is flat-20200430.csv", header=TRUE, sep=",", dec=".", fill=TRUE)
View(allids)
allids <- unique(allids)
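#Hedged sketch (assumes the CSV layout, not from the original history): if
#the ID sits in the first column next to a datestamp column, keeping only
#that column before de-duplicating matches the comment's stated intent.
#allids <- unique(allids[, 1, drop=FALSE])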
#Build a URL to call the API
URL_base='https://www.googleapis.com/youtube/v3/videos?id=' #this is the base URL
URL_details='&part=contentDetails&key=' #getting contentDetails for technical metadata
URL_key='YOUR_API_KEY' #key redacted; substitute your own YouTube Data API v3 key
#cred <- yt_oauth(app_id = "USE YOUR APP_ID", app_secret = "USE YOUR APP_SECRET", scope = "ssl", token = ".httr-oauth")
allids2 <- base::as.list(allids)
#Loop through URLS to retrieve basic info (duration, format)
alldata = data.frame()
ptm <- proc.time() #Time responses to the server
# General info - Youtube API part: contentDetails
for(i in 1:nrow(allids)){
cat('Iteration', i, '/', nrow(allids), '\n')
url = paste(URL_base, allids[i, ], URL_details, URL_key, sep = "")
dd <- getURL(url)
result <- fromJSON(dd)
id = result$items$id[[1]]
duration = result$items$contentDetails$duration
caption = result$items$contentDetails$caption
definition = result$items$contentDetails$definition
alldata = rbind(alldata, data.frame(id, duration, caption, definition))
}
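proc.time() - ptm #elapsed time for the loop above (closes the timer started with ptm)
#Hedged sketch (helper not in the original history): contentDetails$duration
#arrives as an ISO 8601 string such as "PT4M13S"; assuming the usual
#PT#H#M#S pattern, this converts it to seconds for analysis.
iso8601_to_seconds <- function(d) {
  part <- function(pattern) {
    m <- regmatches(d, regexpr(pattern, d))
    if (length(m) == 0) 0 else as.numeric(sub("[A-Z]", "", m))
  }
  part("\\d+H") * 3600 + part("\\d+M") * 60 + part("\\d+S")
}
#iso8601_to_seconds("PT4M13S") gives 253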
# Video info (title, description, etc.) - Youtube API part: snippet
alldata2 = data.frame()
URL_details2='&part=snippet&key=' #getting snippet for general metadata
for(i in 1:nrow(allids)){
cat('Iteration', i, '/', nrow(allids), '\n')
url2 = paste(URL_base, allids[i, ], URL_details2, URL_key, sep = "")
dd2 <- getURL(url2)
result2 <- fromJSON(dd2)
id2 = result2$items$id[[1]]
publishedAt = result2$items$snippet$publishedAt
channelid = result2$items$snippet$channelId
channeltitle = result2$items$snippet$channelTitle
title = result2$items$snippet$title
description = result2$items$snippet$description
tag = result2$items$snippet$tags
category = result2$items$snippet$categoryId
alldata2 = rbind(alldata2, data.frame(id2, title, description, publishedAt, channelid, channeltitle, category))
}
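#Hedged sketch (an assumption about the API, not in the original history):
#the videos endpoint accepts up to 50 comma-separated IDs per request, so
#batching would cut the number of calls (and quota use) roughly 50-fold.
ids <- as.character(allids[, 1]) #assumes the video ID is the first column
for (b in split(ids, ceiling(seq_along(ids) / 50))) {
  url <- paste0(URL_base, paste(b, collapse=","), URL_details2, URL_key)
  result <- fromJSON(getURL(url))
  #result$items holds one row per requested ID
}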
# Video statistics (likes, views, etc.) - Youtube API part: statistics
alldata3 = data.frame()
URL_details3='&part=statistics&key=' #getting statistics for engagement metadata (views, likes, comments)
for(i in 1:nrow(allids)){
cat('Iteration', i, '/', nrow(allids), '\n')
url3 = paste(URL_base, allids[i, ], URL_details3, URL_key, sep = "")
dd3 <- getURL(url3)
result3 <- fromJSON(dd3)
id3 = result3$items$id[[1]]
views = result3$items$statistics$viewCount
likes = result3$items$statistics$likeCount
dislikes = result3$items$statistics$dislikeCount
favorite = result3$items$statistics$favoriteCount
comments = result3$items$statistics$commentCount
alldata3 = rbind(alldata3, data.frame(id3, views, likes, dislikes, favorite, comments))
}
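#Hedged sketch (not in the original history): join the three frames on the
#video ID into a single table; id/id2/id3 are the columns built above.
allinfo <- merge(merge(alldata, alldata2, by.x="id", by.y="id2"),
                 alldata3, by.x="id", by.y="id3")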
View(allids2)
#Swap in a second API key (redacted) and rerun the contentDetails loop above
URL_key='YOUR_SECOND_API_KEY'
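#Hedged sketch (filename is hypothetical, not from the original history):
#persist the collected metadata so later steps can read it from data/.
#write.csv(allinfo, "data/video-metadata-20200430.csv", row.names=FALSE)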