
Add support for a customizable delay between every page crawled (#189)
Add support for a customizable delay between every page crawled
mattiasgeniar authored and freekmurze committed Oct 29, 2018
1 parent cc9dbee commit 7325eb5
Showing 5 changed files with 50 additions and 0 deletions.
9 changes: 9 additions & 0 deletions README.md
@@ -211,6 +211,15 @@ Crawler::create()
->setMaximumResponseSize(1024 * 1024 * 3)
```

## Add a delay between requests

In some cases you might get rate-limited when crawling too aggressively. To avoid this, you can use the `setDelayBetweenRequests()` method to add a pause between every request. This value is expressed in milliseconds.

```php
Crawler::create()
->setDelayBetweenRequests(150) // After every page crawled, the crawler will wait for 150ms
```
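For a fuller picture, a hypothetical call chain might look like the sketch below; `$yourObserver` stands in for whichever `CrawlObserver` implementation you already use, and the URL is a placeholder:

```php
Crawler::create()
    ->setCrawlObserver($yourObserver)       // any CrawlObserver implementation (placeholder variable)
    ->setDelayBetweenRequests(500)          // pause 500ms after every crawled page
    ->startCrawling('https://example.com'); // placeholder URL
```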

## Using a custom crawl queue

When crawling a site, the crawler will put URLs to be crawled in a queue. By default, this queue is stored in memory using the built-in `CollectionCrawlQueue`.
15 changes: 15 additions & 0 deletions src/Crawler.php
@@ -71,6 +71,9 @@ class Crawler
/** @var string */
protected $crawlRequestFailedClass;

/** @var float */
protected $delayBetweenRequests = 0;

/** @var */
protected static $defaultClientOptions = [
RequestOptions::COOKIES => true,
@@ -155,6 +158,18 @@ public function getMaximumDepth(): ?int
return $this->maximumDepth;
}

public function setDelayBetweenRequests(int $delay): Crawler
{
$this->delayBetweenRequests = ($delay * 1000);

return $this;
}

public function getDelayBetweenRequests(): float
{
return $this->delayBetweenRequests;
}

public function ignoreRobots(): Crawler
{
$this->respectRobots = false;
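Worth noting: `setDelayBetweenRequests()` accepts milliseconds but stores microseconds (hence the `* 1000`), because the handlers below pass `getDelayBetweenRequests()` straight to PHP's `usleep()`, which expects microseconds. A minimal sketch of that round trip:

```php
$crawler = Crawler::create()->setDelayBetweenRequests(150); // 150 ms

$crawler->getDelayBetweenRequests(); // 150000 (microseconds, ready to pass to usleep())
```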
2 changes: 2 additions & 0 deletions src/Handlers/CrawlRequestFailed.php
@@ -20,5 +20,7 @@ public function __invoke(RequestException $exception, $index)
$crawlUrl = $this->crawler->getCrawlQueue()->getUrlById($index);

$this->crawler->getCrawlObservers()->crawlFailed($crawlUrl, $exception);

usleep($this->crawler->getDelayBetweenRequests());
}
}
2 changes: 2 additions & 0 deletions src/Handlers/CrawlRequestFulfilled.php
@@ -58,6 +58,8 @@ public function __invoke(ResponseInterface $response, $index)
$body = $this->convertBodyToString($response->getBody(), $this->crawler->getMaximumResponseSize());

$this->linkAdder->addFromHtml($body, $crawlUrl->url);

usleep($this->crawler->getDelayBetweenRequests());
}

protected function handleCrawled(ResponseInterface $response, CrawlUrl $crawlUrl)
22 changes: 22 additions & 0 deletions tests/CrawlerTest.php
@@ -353,6 +353,28 @@ public function it_should_not_follow_nofollow_links()
$this->assertNotCrawled([['url' => 'http://localhost:8080/nofollow', 'foundOn' => 'http://localhost:8080/']]);
}

/** @test */
public function it_respects_the_requested_delay_between_requests()
{
$baseUrl = 'http://localhost:8080';

$start = time();

Crawler::create()
->setCrawlObserver(new CrawlLogger())
->setMaximumDepth(2)
->setDelayBetweenRequests(500) // 500ms
->setCrawlProfile(new CrawlSubdomains($baseUrl))
->startCrawling($baseUrl);

$end = time();

$diff = $end - $start;

// At 500ms delay per URL, crawling 8 URLs should take at least 4 seconds.
$this->assertGreaterThan(4, $diff);
}

/** @test */
public function custom_crawl_request_handlers_must_extend_abstracts()
{
