major : future (parallel download), spiderbar (check speedup)
petermeissner committed Oct 20, 2017
1 parent 4e756fc commit adab0cd
Showing 15 changed files with 321 additions and 179 deletions.
1 change: 1 addition & 0 deletions .Rbuildignore
@@ -13,3 +13,4 @@ _old_.travis.yml
^cran-comments\.md$
^\.travis\.yml$
^codecov\.yml$
benchmarks
3 changes: 2 additions & 1 deletion DESCRIPTION
@@ -1,5 +1,5 @@
Package: robotstxt
Date: 2017-10-17
Date: 2017-10-20
Type: Package
Title: A 'robots.txt' Parser and 'Webbot'/'Spider'/'Crawler' Permissions Checker
Version: 0.5.0
@@ -23,6 +23,7 @@ Imports:
stringr (>= 1.0.0),
httr (>= 1.0.0),
spiderbar (>= 0.2.0),
future (>= 1.6.2),
magrittr
Suggests:
knitr,
1 change: 1 addition & 0 deletions NAMESPACE
@@ -4,6 +4,7 @@ S3method(print,robotstxt)
S3method(print,robotstxt_text)
export("%>%")
export(get_robotstxt)
export(get_robotstxts)
export(is_valid_robotstxt)
export(parse_robotstxt)
export(paths_allowed)
12 changes: 8 additions & 4 deletions NEWS.md
@@ -4,9 +4,13 @@ NEWS robotstxt
0.5.0 | 2017-10-07
--------------------------------------------------------------------------

- include fast spiderbar functions
- robotstxt now assumes it knows how to parse --> if it cannot parse it assumes that it got no valid robots.txt file meaning that there are no restrictions

- **feature** : use futures (from the package future) to speed up retrieval and parsing
- **feature** : there is now a `get_robotstxts()` function which is a 'vectorized' version of `get_robotstxt()`
- **feature** : `paths_allowed()` now allows checking either via the package's own parsing of robots.txt files or via the functionality provided by the spiderbar package -- the latter should be faster by approximately a factor of 10 (a short usage sketch follows this list)
- **feature** : various functions now have an ssl_verifypeer option (analogous to the libcurl option https://curl.haxx.se/libcurl/c/CURLOPT_SSL_VERIFYPEER.html) which might help with robots.txt file retrieval in some cases
- **change** : user_agent for robots.txt file retrieval now defaults to `sessionInfo()$R.version$version.string`
- **change** : robotstxt now assumes it knows how to parse --> if it cannot parse a file it assumes that it got no valid robots.txt file, meaning that there are no restrictions
- **fix** : valid_robotstxt would not accept some actually valid robots.txt files
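
The entries above can be exercised together. The sketch below is illustrative only: example.com and the checked paths are placeholders, and the speed difference between the two check methods will vary with the robots.txt files involved.

```r
library(robotstxt)

domain <- "example.com"                      # placeholder domain
paths  <- c("/", "/images/", "/private/")    # placeholder paths

# spiderbar-backed check (per the entry above, roughly 10x faster)
paths_allowed(paths = paths, domain = domain, check_method = "spiderbar")

# the package's own parser, for comparison
paths_allowed(paths = paths, domain = domain, check_method = "robotstxt")

# ssl_verifypeer can be relaxed when certificate issues block retrieval
paths_allowed(paths = paths, domain = domain, ssl_verifypeer = 0)
```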



@@ -21,7 +25,7 @@ NEWS robotstxt
0.4.0 | 2017-07-14
--------------------------------------------------------------------------

- **user_agent** parameter **added** to tobotstxt() and paths_allowed to allow for user defined HTTP user-agent send when retrieving robots.txt file from domain
- **user_agent** parameter **added** to robotstxt() and paths_allowed to allow for user defined HTTP user-agent send when retrieving robots.txt file from domain



2 changes: 1 addition & 1 deletion R/get_robotstxt.R
@@ -15,7 +15,7 @@ get_robotstxt <-
domain,
warn = TRUE,
force = FALSE,
user_agent = NULL,
user_agent = sessionInfo()$R.version$version.string,
ssl_verifypeer = c(1,0)
){

2 changes: 1 addition & 1 deletion R/get_robotstxt_http_get.R
@@ -6,7 +6,7 @@
get_robotstxt_http_get <-
function(
domain,
user_agent = NULL,
user_agent = sessionInfo()$R.version$version.string,
ssl_verifypeer = 1
){
if ( !is.null(user_agent) ) {
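
Both retrieval functions now default user_agent to the version string of the running R session rather than NULL. A quick way to see what will be sent (the exact value depends on the local R installation):

```r
# the new default user agent, e.g. "R version 3.4.2 (2017-09-28)"
sessionInfo()$R.version$version.string
```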
19 changes: 17 additions & 2 deletions R/get_robotstxts.R
@@ -15,7 +15,7 @@ get_robotstxts <-
domain,
warn = TRUE,
force = FALSE,
user_agent = NULL,
user_agent = sessionInfo()$R.version$version.string,
ssl_verifypeer = c(1,0),
use_futures = FALSE
){
@@ -54,16 +54,29 @@ get_robotstxts <-


# prepare execution of get_robotstxt()
apply_fun <- if ( isTRUE(use_futures) ) { future::future_lapply }else{ lapply }
apply_fun <-
if ( isTRUE(use_futures) ) {
future::future_lapply
} else {
lapply
}

to_be_applied_fun <-
function(x){

message(
paste("\r", x$domain, " "),
appendLF = FALSE
)

get_robotstxt(
domain = x$domain,
warn = x$warn,
force = x$force,
user_agent = x$user_agent,
ssl_verifypeer = x$ssl_verifypeer
)

}

  # apply get_robotstxt() to the parameter grid
@@ -72,6 +85,8 @@ get_robotstxts <-
parameter_list,
FUN = to_be_applied_fun
)
names(rtxt_list) <- domain
message("\n")

# return
return(rtxt_list)
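
With use_futures = TRUE the downloads are dispatched through future::future_lapply(), so a parallel backend set up via future::plan() is picked up automatically. A minimal sketch, assuming placeholder domains and the multisession backend:

```r
library(robotstxt)
library(future)

plan(multisession)   # parallel backend; runs downloads in background R sessions

domains <- c("example.com", "example.org", "example.net")   # placeholder domains

# vectorized, parallel retrieval -- one robots.txt download per domain
rtxt_list <- get_robotstxts(domains, use_futures = TRUE)

# a named list with one robots.txt per domain
str(rtxt_list, max.level = 1)
```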
203 changes: 52 additions & 151 deletions R/paths_allowed.R
@@ -7,23 +7,30 @@
#' fail. To be on the safe side, provide appropriate domains manually.
#' @param bot name of the bot, defaults to "*"
#' @param paths paths for which to check bot's permission, defaults to "/"
#' @param check_method which method to use for checking -- either robotstxt for
#' the package's own method or spiderbar for using spiderbar::can_fetch
#' @param check_method which method to use for checking -- either
#' "robotstxt" for the package's own method or "spiderbar"
#' for using spiderbar::can_fetch
#' @param robotstxt_list either NULL -- the default -- or a list of character
#' vectors with one vector per path to check
#'
#' @inheritParams get_robotstxt
#' @inheritParams get_robotstxts
#'
#' @seealso \link{path_allowed}
#'
#' @export
paths_allowed <-
function(
paths = "/",
domain = "auto",
bot = "*",
user_agent = NULL,
check_method = c("spiderbar", "robotstxt"),
warn = TRUE,
force = FALSE
paths = "/",
domain = "auto",
bot = "*",
user_agent = sessionInfo()$R.version$version.string,
check_method = c("spiderbar", "robotstxt"),
warn = TRUE,
force = FALSE,
ssl_verifypeer = c(1,0),
use_futures = TRUE,
robotstxt_list = NULL
){

# process inputs
@@ -32,29 +32,39 @@ paths_allowed <-
paths <- remove_domain(paths)
}

if( length(unique(domain))==1 ){
domain <- domain[1]
# get robots.txt files
if( is.null(robotstxt_list) ){
robotstxt_list <-
get_robotstxts(
domain,
warn = warn,
force = force,
user_agent = user_agent,
ssl_verifypeer = ssl_verifypeer,
use_futures = use_futures
)
names(robotstxt_list) <- domain
}


# check paths
res <-
if ( check_method[1] == "spiderbar"){

paths_allowed_worker_spiderbar(
user_agent = user_agent,
domain = domain,
bot = bot,
paths = paths
domain = domain,
bot = bot,
paths = paths,
robotstxt_list = robotstxt_list
)

} else {

if( use_futures ){
permissions_list <-
future::future_lapply(
robotstxt_list,
function(x){robotstxt(text=x)$permissions}
)

}else{
permissions_list <-
lapply(
robotstxt_list,
function(x){robotstxt(text=x)$permissions}
)

}

paths_allowed_worker_robotstxt(
user_agent = user_agent,
domain = domain,
bot = bot,
paths = paths
domain = domain,
bot = bot,
paths = paths,
permissions_list = permissions_list
)

}
@@ -65,157 +97,26 @@
}
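
The reworked paths_allowed() only downloads robots.txt files when robotstxt_list is NULL, so pre-fetched files can be reused across calls. A rough sketch based on the parameter documentation above -- the domain is a placeholder, and the single fetched file is replicated here because robotstxt_list is documented as one character vector per path to check:

```r
library(robotstxt)

domain <- "example.com"                      # placeholder domain
paths  <- c("/", "/images/", "/private/")

# fetch the robots.txt file once ...
rtxt <- get_robotstxts(domain)

# ... and reuse it, skipping any further HTTP requests
paths_allowed(
  paths          = paths,
  domain         = rep(domain, length(paths)),
  check_method   = "robotstxt",
  robotstxt_list = rep(rtxt, length(paths))   # one vector per path, per the docs
)
```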


#' paths_allowed_worker for robotstxt flavor
#'
#' @inheritParams paths_allowed
#'

paths_allowed_worker_robotstxt <-
function(
user_agent,
domain,
bot,
paths
){
# get permissions
permissions <-
if ( length(user_agent) == 0 ) {

mapply(

FUN =
function(domain, user_agent){
robotstxt(
domain = domain,
warn = TRUE,
force = FALSE
)$permissions
},

domain = domain,

SIMPLIFY = FALSE
)

}else{

mapply(

FUN =
function(domain, user_agent){
robotstxt(
domain = domain,
user_agent = user_agent,
warn = TRUE,
force = FALSE
)$permissions
},

domain = domain,
user_agent = user_agent,

SIMPLIFY = FALSE
)

}


# apply permission checker to permission data
worker <-
function(path, permissions, bot, domain){
if( is.na(domain) ){
return(NA)
}else{
path_allowed(
permissions = permissions,
path = path,
bot = bot
)
}
}

tmp <-
mapply(
worker,
path = paths,
permissions = permissions,
bot = bot,
domain = domain
)
names(tmp) <- NULL

# return
return(tmp)
}



#' paths_allowed_worker spiderbar flavor
#'
#' @inheritParams paths_allowed
#'
paths_allowed_worker_spiderbar <-
function(
user_agent,
domain,
bot,
paths
){

browser()

permissions <-
if ( length(user_agent) == 0 ) {

mapply(

FUN =
function(domain, user_agent){
robotstxt(
domain = domain,
warn = TRUE,
force = FALSE
)$permissions
},

domain = domain,

SIMPLIFY = FALSE
)

}else{

mapply(

FUN =
function(domain, user_agent){
robotstxt(
domain = domain,
user_agent = user_agent,
warn = TRUE,
force = FALSE
)$permissions
},

domain = domain,
user_agent = user_agent,

SIMPLIFY = FALSE
)

}

rbt_text <-
get_robotstxt(
domain = domain[1],
user_agent = user_agent
)

spiderbar::can_fetch(
obj = spiderbar::robxp(rbt_text),
path = paths[1],
user_agent = bot
)
}

