From aeb92d6bd5ef9aa0f0f415ebbbb4f94d80700cb9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Monroy=20Fern=C3=A1ndez?= Date: Mon, 1 Apr 2019 15:33:18 +0200 Subject: [PATCH 1/3] Changed incorrect tests. Added needed urls to test. --- tests/CrawlerRobotsTest.php | 8 ++++---- tests/server/server.js | 14 +++++++++++--- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/tests/CrawlerRobotsTest.php b/tests/CrawlerRobotsTest.php index 210dec6..42cb13b 100644 --- a/tests/CrawlerRobotsTest.php +++ b/tests/CrawlerRobotsTest.php @@ -44,22 +44,22 @@ public function it_should_follow_robots_txt_disallowed_links_when_robots_are_ign } /** @test */ - public function it_should_not_follow_robots_meta_disallowed_links() + public function it_should_follow_robots_meta_follow_links() { $this->createCrawler() ->startCrawling('http://localhost:8080'); - $this->assertNotCrawled([['url' => 'http://localhost:8080/meta-disallow', 'foundOn' => 'http://localhost:8080/']]); + $this->assertCrawledOnce([['url' => 'http://localhost:8080/meta-nofollow', 'foundOn' => 'http://localhost:8080/']]); } /** @test */ - public function it_should_follow_robots_meta_disallowed_links_when_robots_are_ignored() + public function it_should_follow_robots_meta_nofollow_links_when_robots_are_ignored() { $this->createCrawler() ->ignoreRobots() ->startCrawling('http://localhost:8080'); - $this->assertCrawledOnce([['url' => 'http://localhost:8080/meta-disallow', 'foundOn' => 'http://localhost:8080/']]); + $this->assertCrawledOnce([['url' => 'http://localhost:8080/meta-nofollow-target', 'foundOn' => 'http://localhost:8080/']]); } /** @test */ diff --git a/tests/server/server.js b/tests/server/server.js index aaaded5..930748b 100644 --- a/tests/server/server.js +++ b/tests/server/server.js @@ -3,7 +3,7 @@ let app = require('express')(); app.get('/', function (request, response) { - response.end('txt disallowedmeta disallowedheader disallowedLink1Link2Link4EmailTelephoneNo follow'); + response.end('txt 
disallowedmeta disallowedheader disallowedLink1Link2Link4EmailTelephoneNo follow'); }); app.get('/link1', function (request, response) { @@ -54,8 +54,16 @@ app.get('/txt-disallow', function (request, response) { response.end('Not allowed'); }); -app.get('/meta-disallow', function (request, response) { - response.end(''); +app.get('/meta-follow', function (request, response) { + response.end('No follow'); +}); + +app.get('/meta-nofollow', function (request, response) { + response.end('no follow it'); +}); + +app.get('/meta-nofollow-target', function (request, response) { + response.end('No followable'); }); app.get('/header-disallow', function (request, response) { From dbee3410385b0a0171da395112c7ba41a7e73577 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Monroy=20Fern=C3=A1ndez?= Date: Mon, 1 Apr 2019 15:42:55 +0200 Subject: [PATCH 2/3] Improved tests. Added test for noindex. --- tests/CrawlerRobotsTest.php | 31 ++++++++++++++++++++++++++++--- tests/server/server.js | 4 ++-- 2 files changed, 30 insertions(+), 5 deletions(-) diff --git a/tests/CrawlerRobotsTest.php b/tests/CrawlerRobotsTest.php index 42cb13b..84b3a9b 100644 --- a/tests/CrawlerRobotsTest.php +++ b/tests/CrawlerRobotsTest.php @@ -49,7 +49,7 @@ public function it_should_follow_robots_meta_follow_links() $this->createCrawler() ->startCrawling('http://localhost:8080'); - $this->assertCrawledOnce([['url' => 'http://localhost:8080/meta-nofollow', 'foundOn' => 'http://localhost:8080/']]); + $this->assertCrawledOnce([['url' => 'http://localhost:8080/meta-nofollow', 'foundOn' => 'http://localhost:8080/meta-follow']]); } /** @test */ @@ -59,7 +59,32 @@ public function it_should_follow_robots_meta_nofollow_links_when_robots_are_igno ->ignoreRobots() ->startCrawling('http://localhost:8080'); - $this->assertCrawledOnce([['url' => 'http://localhost:8080/meta-nofollow-target', 'foundOn' => 'http://localhost:8080/']]); + $this->assertCrawledOnce([['url' => 'http://localhost:8080/meta-nofollow-target', 'foundOn' 
=> 'http://localhost:8080/meta-nofollow']]); + } + + /** @test */ + public function it_should_not_index_robots_meta_noindex() + { + $this->createCrawler() + ->startCrawling('http://localhost:8080'); + + $this->assertCrawledOnce([['url' => 'http://localhost:8080/meta-nofollow', 'foundOn' => 'http://localhost:8080/meta-follow']]); + + $this->assertNotCrawled([ + ['url' => 'http://localhost:8080/meta-follow'] + ]); + } + + /** @test */ + public function it_should_index_robots_meta_noindex_when_robots_are_ignored() + { + $this->createCrawler() + ->ignoreRobots() + ->startCrawling('http://localhost:8080'); + + $this->assertCrawledOnce([ + ['url' => 'http://localhost:8080/meta-follow', 'foundOn' => 'http://localhost:8080/'] + ]); } /** @test */ @@ -84,7 +109,7 @@ public function it_should_follow_robots_header_disallowed_links_when_robots_are_ private function createCrawler(): Crawler { return Crawler::create() - ->setMaximumDepth(1) + ->setMaximumDepth(3) ->setCrawlObserver(new CrawlLogger()); } } diff --git a/tests/server/server.js b/tests/server/server.js index 930748b..73138fa 100644 --- a/tests/server/server.js +++ b/tests/server/server.js @@ -55,11 +55,11 @@ app.get('/txt-disallow', function (request, response) { }); app.get('/meta-follow', function (request, response) { - response.end('No follow'); + response.end('\n\nNo follow'); }); app.get('/meta-nofollow', function (request, response) { - response.end('no follow it'); + response.end('\n\nno follow it'); }); app.get('/meta-nofollow-target', function (request, response) { From f1c5b97716af137a8ef55a82965adbf06b73c734 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Monroy=20Fern=C3=A1ndez?= Date: Mon, 1 Apr 2019 16:51:03 +0200 Subject: [PATCH 3/3] Fixed incorrect handling of meta robots tag. 
--- src/Handlers/CrawlRequestFulfilled.php | 8 +++----- tests/CrawlerRobotsTest.php | 4 ++-- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/src/Handlers/CrawlRequestFulfilled.php b/src/Handlers/CrawlRequestFulfilled.php index 82315b4..cf33674 100644 --- a/src/Handlers/CrawlRequestFulfilled.php +++ b/src/Handlers/CrawlRequestFulfilled.php @@ -31,10 +31,6 @@ public function __invoke(ResponseInterface $response, $index) { $robots = new CrawlerRobots($response, $this->crawler->mustRespectRobots()); - if (! $robots->mayIndex()) { - return; - } - $crawlUrl = $this->crawler->getCrawlQueue()->getUrlById($index); if ($this->crawler->mayExecuteJavaScript()) { @@ -43,7 +39,9 @@ public function __invoke(ResponseInterface $response, $index) $response = $response->withBody(stream_for($html)); } - $this->handleCrawled($response, $crawlUrl); + if ($robots->mayIndex()) { + $this->handleCrawled($response, $crawlUrl); + } if (! $this->crawler->getCrawlProfile() instanceof CrawlSubdomains) { if ($crawlUrl->url->getHost() !== $this->crawler->getBaseUrl()->getHost()) { diff --git a/tests/CrawlerRobotsTest.php b/tests/CrawlerRobotsTest.php index 84b3a9b..d898bac 100644 --- a/tests/CrawlerRobotsTest.php +++ b/tests/CrawlerRobotsTest.php @@ -71,7 +71,7 @@ public function it_should_not_index_robots_meta_noindex() $this->assertCrawledOnce([['url' => 'http://localhost:8080/meta-nofollow', 'foundOn' => 'http://localhost:8080/meta-follow']]); $this->assertNotCrawled([ - ['url' => 'http://localhost:8080/meta-follow'] + ['url' => 'http://localhost:8080/meta-follow'], ]); } @@ -83,7 +83,7 @@ public function it_should_index_robots_meta_noindex_when_robots_are_ignored() ->startCrawling('http://localhost:8080'); $this->assertCrawledOnce([ - ['url' => 'http://localhost:8080/meta-follow', 'foundOn' => 'http://localhost:8080/'] + ['url' => 'http://localhost:8080/meta-follow', 'foundOn' => 'http://localhost:8080/'], ]); }