Skip to content

Commit

Permalink
Merge pull request #212 from Boardfy/fix/noindex-follow-incorrect-Handling
Browse files Browse the repository at this point in the history

noindex,follow URLs are being handled incorrectly
  • Loading branch information
brendt authored Apr 3, 2019
2 parents de2c095 + f1c5b97 commit 9d0b16a
Show file tree
Hide file tree
Showing 3 changed files with 44 additions and 13 deletions.
8 changes: 3 additions & 5 deletions src/Handlers/CrawlRequestFulfilled.php
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,6 @@ public function __invoke(ResponseInterface $response, $index)
{
$robots = new CrawlerRobots($response, $this->crawler->mustRespectRobots());

if (! $robots->mayIndex()) {
return;
}

$crawlUrl = $this->crawler->getCrawlQueue()->getUrlById($index);

if ($this->crawler->mayExecuteJavaScript()) {
Expand All @@ -43,7 +39,9 @@ public function __invoke(ResponseInterface $response, $index)
$response = $response->withBody(stream_for($html));
}

$this->handleCrawled($response, $crawlUrl);
if ($robots->mayIndex()) {
$this->handleCrawled($response, $crawlUrl);
}

if (! $this->crawler->getCrawlProfile() instanceof CrawlSubdomains) {
if ($crawlUrl->url->getHost() !== $this->crawler->getBaseUrl()->getHost()) {
Expand Down
35 changes: 30 additions & 5 deletions tests/CrawlerRobotsTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -44,22 +44,47 @@ public function it_should_follow_robots_txt_disallowed_links_when_robots_are_ign
}

/** @test */
public function it_should_not_follow_robots_meta_disallowed_links()
public function it_should_follow_robots_meta_follow_links()
{
$this->createCrawler()
->startCrawling('http://localhost:8080');

$this->assertNotCrawled([['url' => 'http://localhost:8080/meta-disallow', 'foundOn' => 'http://localhost:8080/']]);
$this->assertCrawledOnce([['url' => 'http://localhost:8080/meta-nofollow', 'foundOn' => 'http://localhost:8080/meta-follow']]);
}

/** @test */
public function it_should_follow_robots_meta_disallowed_links_when_robots_are_ignored()
public function it_should_follow_robots_meta_nofollow_links_when_robots_are_ignored()
{
$this->createCrawler()
->ignoreRobots()
->startCrawling('http://localhost:8080');

$this->assertCrawledOnce([['url' => 'http://localhost:8080/meta-disallow', 'foundOn' => 'http://localhost:8080/']]);
$this->assertCrawledOnce([['url' => 'http://localhost:8080/meta-nofollow-target', 'foundOn' => 'http://localhost:8080/meta-nofollow']]);
}

/** @test */
public function it_should_not_index_robots_meta_noindex()
{
// Crawl the local test server with robots rules respected (ignoreRobots() is not called).
$this->createCrawler()
->startCrawling('http://localhost:8080');

// /meta-follow is served with "noindex, follow", so the link it contains
// (/meta-nofollow) is still expected to be discovered and crawled exactly once.
$this->assertCrawledOnce([['url' => 'http://localhost:8080/meta-nofollow', 'foundOn' => 'http://localhost:8080/meta-follow']]);

// The noindex page itself must not show up as crawled.
$this->assertNotCrawled([
['url' => 'http://localhost:8080/meta-follow'],
]);
}

/** @test */
public function it_should_index_robots_meta_noindex_when_robots_are_ignored()
{
// Same crawl as the noindex test, but with robots handling switched off via ignoreRobots().
$this->createCrawler()
->ignoreRobots()
->startCrawling('http://localhost:8080');

// With robots ignored, the "noindex, follow" page /meta-follow (linked from the
// start page) is expected to be crawled exactly once.
$this->assertCrawledOnce([
['url' => 'http://localhost:8080/meta-follow', 'foundOn' => 'http://localhost:8080/'],
]);
}

/** @test */
Expand All @@ -84,7 +109,7 @@ public function it_should_follow_robots_header_disallowed_links_when_robots_are_
private function createCrawler(): Crawler
{
return Crawler::create()
->setMaximumDepth(1)
->setMaximumDepth(3)
->setCrawlObserver(new CrawlLogger());
}
}
14 changes: 11 additions & 3 deletions tests/server/server.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
let app = require('express')();

app.get('/', function (request, response) {
response.end('<a href="/txt-disallow">txt disallowed</a><a href="/meta-disallow">meta disallowed</a><a href="/header-disallow">header disallowed</a><a href="/link1">Link1</a><a href="/link2">Link2</a><a href="dir/link4">Link4</a><a href="mailto:[email protected]">Email</a><a href="tel:123">Telephone</a><a href="/nofollow" rel="nofollow">No follow</a>');
response.end('<a href="/txt-disallow">txt disallowed</a><a href="/meta-follow">meta disallowed</a><a href="/header-disallow">header disallowed</a><a href="/link1">Link1</a><a href="/link2">Link2</a><a href="dir/link4">Link4</a><a href="mailto:[email protected]">Email</a><a href="tel:123">Telephone</a><a href="/nofollow" rel="nofollow">No follow</a>');
});

app.get('/link1', function (request, response) {
Expand Down Expand Up @@ -54,8 +54,16 @@ app.get('/txt-disallow', function (request, response) {
response.end('Not allowed');
});

app.get('/meta-disallow', function (request, response) {
response.end('<meta name="robots" content="noindex, follow">');
app.get('/meta-follow', function (request, response) {
response.end('<html><head>\n<meta name="robots" content="noindex, follow">\n</head><body><a href="/meta-nofollow">No follow</a></body></html>');
});

// Test fixture route: responds with an HTML page whose robots meta tag is
// "index, nofollow" and whose body contains a single link to /meta-nofollow-target.
app.get('/meta-nofollow', function (request, response) {
response.end('<html><head>\n<meta name="robots" content="index, nofollow">\n</head><body><a href="/meta-nofollow-target">no follow it</a></body></html>');
});

// Test fixture route: the page linked from /meta-nofollow; responds with a
// plain text body and no links.
app.get('/meta-nofollow-target', function (request, response) {
response.end('No followable');
});

app.get('/header-disallow', function (request, response) {
Expand Down

0 comments on commit 9d0b16a

Please sign in to comment.