From adab0cd5013f6585bca60bd0a43a1187faff2b78 Mon Sep 17 00:00:00 2001
From: petermeissner
Date: Fri, 20 Oct 2017 23:59:39 +0200
Subject: [PATCH] major : future (parallel download), spiderbar (check speedup)

---
 .Rbuildignore                         |   1 +
 DESCRIPTION                           |   3 +-
 NAMESPACE                             |   1 +
 NEWS.md                               |  12 +-
 R/get_robotstxt.R                     |   2 +-
 R/get_robotstxt_http_get.R            |   2 +-
 R/get_robotstxts.R                    |  19 ++-
 R/paths_allowed.R                     | 203 +++++++-------------------
 R/paths_allowed_worker_robotstxt.R    |  40 +++++
 R/paths_allowed_worker_spiderbar.R    |  49 +++++++
 benchmarks/spiderbar_and_futures.r    | 112 ++++++++++++++
 dev.r                                 |  20 ++-
 man/paths_allowed.Rd                  |  19 ++-
 man/paths_allowed_worker_robotstxt.Rd |   7 +-
 man/paths_allowed_worker_spiderbar.Rd |  10 +-
 15 files changed, 321 insertions(+), 179 deletions(-)
 create mode 100644 R/paths_allowed_worker_robotstxt.R
 create mode 100644 R/paths_allowed_worker_spiderbar.R
 create mode 100644 benchmarks/spiderbar_and_futures.r

diff --git a/.Rbuildignore b/.Rbuildignore
index 118a80b..e6b55c2 100644
--- a/.Rbuildignore
+++ b/.Rbuildignore
@@ -13,3 +13,4 @@ _old_.travis.yml
 ^cran-comments\.md$
 ^\.travis\.yml$
 ^codecov\.yml$
+benchmarks
\ No newline at end of file
diff --git a/DESCRIPTION b/DESCRIPTION
index 381e742..6e0e510 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,5 +1,5 @@
 Package: robotstxt
-Date: 2017-10-17
+Date: 2017-10-20
 Type: Package
 Title: A 'robots.txt' Parser and 'Webbot'/'Spider'/'Crawler' Permissions Checker
 Version: 0.5.0
@@ -23,6 +23,7 @@ Imports:
     stringr (>= 1.0.0),
    httr (>= 1.0.0),
    spiderbar (>= 0.2.0),
+   future (>= 1.6.2),
    magrittr
 Suggests:
    knitr,
diff --git a/NAMESPACE b/NAMESPACE
index 927c652..94e6845 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -4,6 +4,7 @@ S3method(print,robotstxt)
 S3method(print,robotstxt_text)
 export("%>%")
 export(get_robotstxt)
+export(get_robotstxts)
 export(is_valid_robotstxt)
 export(parse_robotstxt)
 export(paths_allowed)
diff --git a/NEWS.md b/NEWS.md
index 1e7a00b..683a074 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -4,9 +4,13 @@ NEWS

 robotstxt 0.5.0 | 2017-10-07
 --------------------------------------------------------------------------
-- include fast spiderbar functions
-- robotstxt now assumes it knows how to parse --> if it cannot parse it assumes that it got no valid robots.txt file meaning that there are no restrictions
-
+- **feature** : use futures (from package future) to speed up retrieval and parsing
+- **feature** : there is now a `get_robotstxts()` function which is a 'vectorized' version of `get_robotstxt()`
+- **feature** : `paths_allowed()` now allows checking either via the package's own parsing of robots.txt files or via functionality provided by the spiderbar package (the latter should be faster by approximately a factor of 10)
+- **feature** : various functions now have an ssl_verifypeer option (analogous to the libcurl option https://curl.haxx.se/libcurl/c/CURLOPT_SSL_VERIFYPEER.html) which might help with robots.txt file retrieval in some cases
+- **change** : user_agent for robots.txt file retrieval will now default to: `sessionInfo()$R.version$version.string`
+- **change** : robotstxt now assumes it knows how to parse --> if it cannot parse a file, it assumes it got no valid robots.txt file, meaning that there are no restrictions
+- **fix** : valid_robotstxt would not accept some actually valid robots.txt files



@@ -21,7 +25,7 @@ NEWS

 robotstxt 0.4.0 | 2017-07-14
 --------------------------------------------------------------------------
-- **user_agent** parameter **added** to tobotstxt() and paths_allowed to allow for user defined HTTP user-agent send when retrieving robots.txt file from domain
+- **user_agent** parameter **added** to robotstxt() and paths_allowed() to allow for a user-defined HTTP user-agent to be sent when retrieving the robots.txt file from a domain


diff --git a/R/get_robotstxt.R b/R/get_robotstxt.R
index 6cdea10..a1bb7fd 100644
--- a/R/get_robotstxt.R
+++ b/R/get_robotstxt.R
@@ -15,7 +15,7 @@ get_robotstxt <-
     domain,
     warn = TRUE,
     force = FALSE,
-    user_agent = NULL,
+    user_agent = sessionInfo()$R.version$version.string,
     ssl_verifypeer = c(1,0)
   ){

diff --git a/R/get_robotstxt_http_get.R b/R/get_robotstxt_http_get.R
index 322d9d3..cc08a70 100644
--- a/R/get_robotstxt_http_get.R
+++ b/R/get_robotstxt_http_get.R
@@ -6,7 +6,7 @@
 get_robotstxt_http_get <-
   function(
     domain,
-    user_agent = NULL,
+    user_agent = sessionInfo()$R.version$version.string,
     ssl_verifypeer = 1
   ){
     if ( !is.null(user_agent) ) {
diff --git a/R/get_robotstxts.R b/R/get_robotstxts.R
index 4f2a5bf..217d688 100644
--- a/R/get_robotstxts.R
+++ b/R/get_robotstxts.R
@@ -15,7 +15,7 @@ get_robotstxts <-
     domain,
     warn = TRUE,
     force = FALSE,
-    user_agent = NULL,
+    user_agent = sessionInfo()$R.version$version.string,
     ssl_verifypeer = c(1,0),
     use_futures = FALSE
   ){
@@ -54,9 +54,21 @@ get_robotstxts <-


   # prepare execution of get_robotstxt()
-  apply_fun <- if ( isTRUE(use_futures) ) { future::future_lapply }else{ lapply }
+  apply_fun <-
+    if ( isTRUE(use_futures) ) {
+      future::future_lapply
+    } else {
+      lapply
+    }
+

   to_be_applied_fun <- function(x){
+
+    message(
+      paste("\r", x$domain, " "),
+      appendLF = FALSE
+    )
+
     get_robotstxt(
       domain = x$domain,
       warn = x$warn,
@@ -64,6 +76,7 @@ get_robotstxts <-
       user_agent = x$user_agent,
       ssl_verifypeer = x$ssl_verifypeer
     )
+
   }

   # execute get_robotstxt to parameter grid
@@ -72,6 +85,8 @@ get_robotstxts <-
       parameter_list,
       FUN = to_be_applied_fun
     )
+  names(rtxt_list) <- domain
+  message("\n")

   # return
   return(rtxt_list)
diff --git a/R/paths_allowed.R b/R/paths_allowed.R
index bdce96d..f00bccf 100644
--- a/R/paths_allowed.R
+++ b/R/paths_allowed.R
@@ -7,23 +7,30 @@
 #' fail. To be on the save side, provide appropriate domains manually.
 #' @param bot name of the bot, defaults to "*"
 #' @param paths paths for which to check bot's permission, defaults to "/"
-#' @param check_method which method to use for checking -- either robotstxt for
-#'   the package's own method or spiderbar for using spiderbar::can_fetch
+#' @param check_method which method to use for checking -- either
+#'   "robotstxt" for the package's own method or "spiderbar"
+#'   for using spiderbar::can_fetch
+#' @param robotstxt_list either NULL -- the default -- or a list of character
+#'   vectors with one vector per path to check
 #'
 #' @inheritParams get_robotstxt
+#' @inheritParams get_robotstxts
 #'
 #' @seealso \link{path_allowed}
 #'
 #' @export
 paths_allowed <-
   function(
-    paths = "/",
-    domain = "auto",
-    bot = "*",
-    user_agent = NULL,
-    check_method = c("spiderbar", "robotstxt"),
-    warn = TRUE,
-    force = FALSE
+    paths          = "/",
+    domain         = "auto",
+    bot            = "*",
+    user_agent     = sessionInfo()$R.version$version.string,
+    check_method   = c("spiderbar", "robotstxt"),
+    warn           = TRUE,
+    force          = FALSE,
+    ssl_verifypeer = c(1,0),
+    use_futures    = TRUE,
+    robotstxt_list = NULL
   ){

   # process inputs
@@ -32,29 +39,54 @@ paths_allowed <-
     paths <- remove_domain(paths)
   }

-  if( length(unique(domain))==1 ){
-    domain <- domain[1]
+  # get robots.txt files
+  if( is.null(robotstxt_list) ){
+    robotstxt_list <-
+      get_robotstxts(
+        domain,
+        warn           = warn,
+        force          = force,
+        user_agent     = user_agent,
+        ssl_verifypeer = ssl_verifypeer,
+        use_futures    = use_futures
+      )
+    names(robotstxt_list) <- domain
   }


-  # check paths
   res <-
     if ( check_method[1] == "spiderbar"){

       paths_allowed_worker_spiderbar(
-        user_agent = user_agent,
-        domain     = domain,
-        bot        = bot,
-        paths      = paths
+        domain         = domain,
+        bot            = bot,
+        paths          = paths,
+        robotstxt_list = robotstxt_list
       )

     } else {

+      if( use_futures ){
+        permissions_list <-
+          future::future_lapply(
+            robotstxt_list,
+            function(x){robotstxt(text=x)$permissions}
+          )
+
+      }else{
+        permissions_list <-
+          lapply(
+            robotstxt_list,
+            function(x){robotstxt(text=x)$permissions}
+          )
+
+      }
+
       paths_allowed_worker_robotstxt(
-        user_agent = user_agent,
-        domain     = domain,
-        bot        = bot,
-        paths      = paths
+        domain           = domain,
+        bot              = bot,
+        paths            = paths,
+        permissions_list = permissions_list
       )

     }
@@ -65,157 +97,26 @@ paths_allowed <-


 }

-#' paths_allowed_worker for robotstxt flavor
-#'
-#' @inheritParams paths_allowed
-#'
-paths_allowed_worker_robotstxt <-
-  function(
-    user_agent,
-    domain,
-    bot,
-    paths
-  ){
-    # get permissions
-    permissions <-
-      if ( length(user_agent) == 0 ) {
-        mapply(
-          FUN =
-            function(domain, user_agent){
-              robotstxt(
-                domain = domain,
-                warn   = TRUE,
-                force  = FALSE
-              )$permissions
-            },
-          domain = domain,
-          SIMPLIFY = FALSE
-        )
-      }else{
-        mapply(
-          FUN =
-            function(domain, user_agent){
-              robotstxt(
-                domain     = domain,
-                user_agent = user_agent,
-                warn       = TRUE,
-                force      = FALSE
-              )$permissions
-            },
-          domain     = domain,
-          user_agent = user_agent,
-          SIMPLIFY   = FALSE
-        )
-      }
-    # apply permission checker to permission data
-    worker <-
-      function(path, permissions, bot, domain){
-        if( is.na(domain) ){
-          return(NA)
-        }else{
-          path_allowed(
-            permissions = permissions,
-            path        = path,
-            bot         = bot
-          )
-        }
-      }
-    tmp <-
-      mapply(
-        worker,
-        path        = paths,
-        permissions = permissions,
-        bot         = bot,
-        domain      = domain
-      )
-    names(tmp) <- NULL
-    # return
-    return(tmp)
-  }


-#' paths_allowed_worker spiderbar flavor
-#'
-#' @inheritParams paths_allowed
-#'
-paths_allowed_worker_spiderbar <-
-  function(
-    user_agent,
-    domain,
-    bot,
-    paths
-  ){
-
-    browser()
-
-    permissions <-
-      if ( length(user_agent) == 0 ) {
-
-        mapply(
-
-          FUN =
-            function(domain, user_agent){
-              robotstxt(
-                domain = domain,
-                warn   = TRUE,
-                force  = FALSE
-              )$permissions
-            },
-
-          domain = domain,
-          SIMPLIFY = FALSE
-        )
-
-      }else{
-
-        mapply(
-          FUN =
-            function(domain, user_agent){
-              robotstxt(
-                domain     = domain,
-                user_agent = user_agent,
-                warn       = TRUE,
-                force      = FALSE
-              )$permissions
-            },
-          domain     = domain,
-          user_agent = user_agent,
-
-          SIMPLIFY = FALSE
-        )
-
-      }
-
-    rbt_text <-
-      get_robotstxt(
-        domain     = domain[1],
-        user_agent = user_agent
-      )
-
-    spiderbar::can_fetch(
-      obj        = spiderbar::robxp(rbt_text),
-      path       = paths[1],
-      user_agent = bot
-    )
-  }

diff --git a/R/paths_allowed_worker_robotstxt.R b/R/paths_allowed_worker_robotstxt.R
new file mode 100644
index 0000000..2a10aa0
--- /dev/null
+++ b/R/paths_allowed_worker_robotstxt.R
@@ -0,0 +1,40 @@
+#' paths_allowed_worker for robotstxt flavor
+#'
+#' @inheritParams paths_allowed
+#'
+
+paths_allowed_worker_robotstxt <-
+  function(
+    domain,
+    bot,
+    paths,
+    permissions_list
+  ){
+    # apply permission checker to permission data
+    worker <-
+      function(path, permissions, bot, domain){
+        if( is.na(domain) ){
+          return(NA)
+        }else{
+          path_allowed(
+            permissions = permissions,
+            path        = path,
+            bot         = bot
+          )
+        }
+      }
+
+    tmp <-
+      mapply(
+        worker,
+        path        = paths,
+        permissions = permissions_list,
+        bot         = bot,
+        domain      = domain
+      )
+    names(tmp) <- NULL
+
+    # return
+    return(tmp)
+  }
+
diff --git a/R/paths_allowed_worker_spiderbar.R b/R/paths_allowed_worker_spiderbar.R
new file mode 100644
index 0000000..9b099d6
--- /dev/null
+++ b/R/paths_allowed_worker_spiderbar.R
@@ -0,0 +1,49 @@
+
+
+#' paths_allowed_worker spiderbar flavor
+#'
+#' @inheritParams paths_allowed
+#'
+paths_allowed_worker_spiderbar <-
+  function(
+    domain,
+    bot,
+    paths,
+    robotstxt_list
+  ){
+
+    # process inputs
+    robotstxts <-
+      unlist(lapply(robotstxt_list, paste, collapse="\n"))
+
+
+    # apply permission checker to permission data
+    worker <-
+      function(path, robotstxt, domain, bot){
+        if( is.na(domain) ){
+          return(NA)
+        }else{
+          spiderbar::can_fetch(
+            obj        = spiderbar::robxp(robotstxt),
+            path       = path,
+            user_agent = bot
+          )
+        }
+      }
+
+    tmp <-
+      mapply(
+        worker,
+        path      = paths,
+        robotstxt = robotstxts,
+        bot       = bot,
+        domain    = domain
+      )
+    names(tmp) <- NULL
+
+    # return
+    return(tmp)
+
+  }
+
+
diff --git a/benchmarks/spiderbar_and_futures.r b/benchmarks/spiderbar_and_futures.r
new file mode 100644
index 0000000..fb47333
--- /dev/null
+++ b/benchmarks/spiderbar_and_futures.r
@@ -0,0 +1,112 @@
+library(spiderbar)
+library(robotstxt)
+library(future)
+
+# get file with urls
+urls_fname <- system.file("urls.txt", package="robotstxt")
+readLines(urls_fname)[1:3]
+urls <- readLines(urls_fname)[-c(1:5)][1:100]
+
+paths  <- urls
+domain <- robotstxt:::guess_domain(paths)
+
+# tests for sequential
+plan("sequential")
+
+with_fetch_seq <-
+  system.time(
+    paths_allowed(
+      urls,
+      warn         = FALSE,
+      force        = TRUE,
+      use_futures  = FALSE,
+      check_method = "robotstxt"
+    )
+  )
+
+wo_fetch_seq_robotstxt <-
+  system.time(
+    paths_allowed(
+      urls,
+      warn         = FALSE,
+      force        = FALSE,
+      use_futures  = FALSE,
+      check_method = "robotstxt"
+    )
+  )
+
+wo_fetch_seq_spiderbar <-
+  system.time(
+    paths_allowed(
+      urls,
+      warn         = FALSE,
+      force        = FALSE,
+      use_futures  = FALSE,
+      check_method = "spiderbar"
+    )
+  )
+
+
+# tests for parallel
+plan("multisession")
+
+with_fetch_parallel <-
+  system.time(
+    paths_allowed(
+      urls,
+      warn         = FALSE,
+      force        = TRUE,
+      use_futures  = TRUE,
+      check_method = "robotstxt"
+    )
+  )
+
+wo_fetch_parallel_robotstxt <-
+  system.time(
+    paths_allowed(
+      urls,
+      warn         = FALSE,
+      force        = FALSE,
+      use_futures  = TRUE,
+      check_method = "robotstxt"
+    )
+  )
+
+wo_fetch_parallel_spiderbar <-
+  system.time(
+    paths_allowed(
+      urls,
+      warn         = FALSE,
+      force        = FALSE,
+      use_futures  = TRUE,
+      check_method = "spiderbar"
+    )
+  )
+
+
+
+
+# results
+
+with_fetch_seq
+wo_fetch_seq_robotstxt
+wo_fetch_seq_spiderbar
+
+
+with_fetch_parallel
+wo_fetch_parallel_robotstxt
+wo_fetch_parallel_spiderbar
+
+
+with_fetch_seq
+with_fetch_parallel
+
+
+wo_fetch_seq_robotstxt
+wo_fetch_parallel_robotstxt
+
+
+wo_fetch_seq_spiderbar
+wo_fetch_parallel_spiderbar
+
+
diff --git a/dev.r b/dev.r
index 74baa97..aa1fe83 100644
--- a/dev.r
+++ b/dev.r
@@ -1,16 +1,26 @@
 library(spiderbar)
 library(robotstxt)
-
+library(future)
+#plan("multisession")

 # get file with urls
 urls_fname <- system.file("urls.txt", package="robotstxt")
 readLines(urls_fname)[1:3]
-urls <- readLines(urls_fname)[-c(1:5)][1:20]
+urls <- readLines(urls_fname)[-c(1:5)][1:100]

+paths  <- urls
+domain <- robotstxt:::guess_domain(paths)

 #
+time1 <-
+  system.time(paths_allowed(urls, warn = FALSE))
+time2 <-
+  system.time(paths_allowed(urls, warn = FALSE, check_method = "robotstxt"))
+time3 <-
+  system.time(paths_allowed(urls, warn = FALSE, check_method = "spiderbar"))
+
-time1 <- system.time(paths_allowed(urls))
-time2 <- system.time(paths_allowed(urls))
-time3 <- system.time(paths_allowed(urls, check_method = "spiderbar"))
+time1
+time2
+time3
diff --git a/man/paths_allowed.Rd b/man/paths_allowed.Rd
index 153c960..cd9f7a3 100644
--- a/man/paths_allowed.Rd
+++ b/man/paths_allowed.Rd
@@ -5,8 +5,9 @@
 \title{check if a bot has permissions to access page(s)}
 \usage{
 paths_allowed(paths = "/", domain = "auto", bot = "*",
-  user_agent = NULL, check_method = c("spiderbar", "robotstxt"),
-  warn = TRUE, force = FALSE)
+  user_agent = sessionInfo()$R.version$version.string,
+  check_method = c("spiderbar", "robotstxt"), warn = TRUE, force = FALSE,
+  ssl_verifypeer = c(1, 0), use_futures = TRUE, robotstxt_list = NULL)
 }
 \arguments{
 \item{paths}{paths for which to check bot's permission, defaults to "/"}
@@ -21,14 +22,24 @@ fail. To be on the save side, provide appropriate domains manually.}

 \item{user_agent}{HTTP user-agent string to be used to retrieve robots.txt file
 from domain}

-\item{check_method}{which method to use for checking -- either robotstxt for
-the package's own method or spiderbar for using spiderbar::can_fetch}
+\item{check_method}{which method to use for checking -- either
+"robotstxt" for the package's own method or "spiderbar"
+for using spiderbar::can_fetch}

 \item{warn}{warn about being unable to download domain/robots.txt because of}

 \item{force}{if TRUE instead of using possible cached results the function
 will re-download the robotstxt file HTTP response status 404. If this happens,}
+
+\item{use_futures}{Should future::future_lapply be used for possible
+parallel/async retrieval or not. Note: check out help
+pages and vignettes of package future on how to set up
+plans for future execution because the robotstxt package
+does not do it on its own.}
+
+\item{robotstxt_list}{either NULL -- the default -- or a list of character
+vectors with one vector per path to check}
 }
 \description{
 wrapper to \code{\link{path_allowed}}
diff --git a/man/paths_allowed_worker_robotstxt.Rd b/man/paths_allowed_worker_robotstxt.Rd
index 3648cc7..b085c01 100644
--- a/man/paths_allowed_worker_robotstxt.Rd
+++ b/man/paths_allowed_worker_robotstxt.Rd
@@ -1,15 +1,12 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/paths_allowed.R
+% Please edit documentation in R/paths_allowed_worker_robotstxt.R
 \name{paths_allowed_worker_robotstxt}
 \alias{paths_allowed_worker_robotstxt}
 \title{paths_allowed_worker for robotstxt flavor}
 \usage{
-paths_allowed_worker_robotstxt(user_agent, domain, bot, paths)
+paths_allowed_worker_robotstxt(domain, bot, paths, permissions_list)
 }
 \arguments{
-\item{user_agent}{HTTP user-agent string to be used to retrieve robots.txt file
-from domain}
-
 \item{domain}{Domain for which paths should be checked. Defaults to "auto".
 If set to "auto" function will try to guess the domain by parsing the paths
 argument. Note however, that these are educated guesses which might utterly
diff --git a/man/paths_allowed_worker_spiderbar.Rd b/man/paths_allowed_worker_spiderbar.Rd
index 15d389c..b33c2de 100644
--- a/man/paths_allowed_worker_spiderbar.Rd
+++ b/man/paths_allowed_worker_spiderbar.Rd
@@ -1,15 +1,12 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/paths_allowed.R
+% Please edit documentation in R/paths_allowed_worker_spiderbar.R
 \name{paths_allowed_worker_spiderbar}
 \alias{paths_allowed_worker_spiderbar}
 \title{paths_allowed_worker spiderbar flavor}
 \usage{
-paths_allowed_worker_spiderbar(user_agent, domain, bot, paths)
+paths_allowed_worker_spiderbar(domain, bot, paths, robotstxt_list)
 }
 \arguments{
-\item{user_agent}{HTTP user-agent string to be used to retrieve robots.txt file
-from domain}
-
 \item{domain}{Domain for which paths should be checked. Defaults to "auto".
 If set to "auto" function will try to guess the domain by parsing the paths
 argument. Note however, that these are educated guesses which might utterly
@@ -18,6 +15,9 @@ fail. To be on the save side, provide appropriate domains manually.}
 \item{bot}{name of the bot, defaults to "*"}

 \item{paths}{paths for which to check bot's permission, defaults to "/"}
+
+\item{robotstxt_list}{either NULL -- the default -- or a list of character
+vectors with one vector per path to check}
 }
 \description{
 paths_allowed_worker spiderbar flavor
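
Usage sketch (reviewer's note, not part of the changeset itself): the short R example below strings together the API this patch introduces -- the vectorized get_robotstxts(), the new use_futures, check_method and robotstxt_list arguments of paths_allowed(), and a future plan chosen by the caller (the package only uses future::future_lapply() internally and, per the documentation above, does not set up a plan on its own). It assumes the development version built from this patch is installed and that the urls.txt file shipped with the package (the same file the included benchmark script reads) is available; the rough "factor 10" spiderbar speed-up mentioned in NEWS.md will vary with machine and network.

library(robotstxt)
library(future)

# parallel/async retrieval is opt-in: the caller picks the future plan
plan("multisession")

# a handful of example URLs, taken from the file the benchmark script also uses
urls_fname <- system.file("urls.txt", package = "robotstxt")
urls       <- readLines(urls_fname)[-c(1:5)][1:10]

# check permissions via the spiderbar backend (reported in NEWS.md as ~10x faster) ...
allowed_spiderbar <- paths_allowed(
  paths        = urls,
  warn         = FALSE,
  use_futures  = TRUE,
  check_method = "spiderbar"
)

# ... or via the package's own robots.txt parser
allowed_robotstxt <- paths_allowed(
  paths        = urls,
  warn         = FALSE,
  use_futures  = TRUE,
  check_method = "robotstxt"
)

# fetch the robots.txt files once with the vectorized getter and
# reuse them for further checks via the new robotstxt_list argument
domains <- robotstxt:::guess_domain(urls)
txts    <- get_robotstxts(domains, warn = FALSE, use_futures = TRUE)

allowed_again <- paths_allowed(
  paths          = urls,
  robotstxt_list = txts,
  warn           = FALSE,
  check_method   = "spiderbar"
)

The two backends should usually agree on the resulting logical vector; the script added under benchmarks/ compares their run times with and without futures and with and without re-downloading the robots.txt files.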