From 62009d9ef0f6bcbfa295ff9468f818910301d864 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Monroy=20Fern=C3=A1ndez?= Date: Tue, 21 May 2019 11:10:46 +0200 Subject: [PATCH 1/8] Added test for redirects with relative paths. --- tests/CrawlerTest.php | 89 +++++++++++++++++++++++++----------------- tests/server/server.js | 12 ++++++ 2 files changed, 65 insertions(+), 36 deletions(-) diff --git a/tests/CrawlerTest.php b/tests/CrawlerTest.php index dab959f..3508635 100644 --- a/tests/CrawlerTest.php +++ b/tests/CrawlerTest.php @@ -2,18 +2,24 @@ namespace Spatie\Crawler\Test; -use stdClass; use GuzzleHttp\Psr7\Uri; -use Spatie\Crawler\Crawler; -use Spatie\Crawler\CrawlProfile; +use GuzzleHttp\RequestOptions; use Psr\Http\Message\UriInterface; use Spatie\Browsershot\Browsershot; -use Spatie\Crawler\CrawlSubdomains; +use Spatie\Crawler\Crawler; use Spatie\Crawler\CrawlInternalUrls; +use Spatie\Crawler\CrawlProfile; +use Spatie\Crawler\CrawlSubdomains; use Spatie\Crawler\Exception\InvalidCrawlRequestHandler; +use stdClass; class CrawlerTest extends TestCase { + public static function log(string $text) + { + file_put_contents(static::$logPath, $text . 
PHP_EOL, FILE_APPEND); + } + public function setUp() { parent::setUp(); @@ -35,6 +41,30 @@ public function it_will_crawl_all_found_urls() $this->assertNotCrawled($this->javascriptInjectedUrls()); } + protected function regularUrls(): array + { + return [ + ['url' => 'http://localhost:8080/'], + ['url' => 'http://localhost:8080/link1', 'foundOn' => 'http://localhost:8080/'], + ['url' => 'http://localhost:8080/link1-prev', 'foundOn' => 'http://localhost:8080/link1'], + ['url' => 'http://localhost:8080/link1-next', 'foundOn' => 'http://localhost:8080/link1'], + ['url' => 'http://localhost:8080/link2', 'foundOn' => 'http://localhost:8080/'], + ['url' => 'http://localhost:8080/link3', 'foundOn' => 'http://localhost:8080/link2'], + ['url' => 'http://localhost:8080/notExists', 'foundOn' => 'http://localhost:8080/link3'], + ['url' => 'http://example.com/', 'foundOn' => 'http://localhost:8080/link1'], + ['url' => 'http://localhost:8080/dir/link4', 'foundOn' => 'http://localhost:8080/'], + ['url' => 'http://localhost:8080/dir/link5', 'foundOn' => 'http://localhost:8080/dir/link4'], + ['url' => 'http://localhost:8080/dir/subdir/link6', 'foundOn' => 'http://localhost:8080/dir/link5'], + ]; + } + + protected function javascriptInjectedUrls(): array + { + return [ + ['url' => 'http://localhost:8080/javascript', 'foundOn' => 'http://localhost:8080/link1'], + ]; + } + /** @test */ public function it_will_not_crawl_tel_links() { @@ -143,7 +173,8 @@ public function it_has_a_method_to_disable_executing_javascript() /** @test */ public function it_uses_a_crawl_profile_to_determine_what_should_be_crawled() { - $crawlProfile = new class extends CrawlProfile { + $crawlProfile = new class extends CrawlProfile + { public function shouldCrawl(UriInterface $url): bool { return $url->getPath() !== '/link3'; @@ -186,7 +217,8 @@ public function it_uses_crawl_profile_for_internal_urls() /** @test */ public function it_can_handle_pages_with_invalid_urls() { - $crawlProfile = new class extends 
CrawlProfile { + $crawlProfile = new class extends CrawlProfile + { public function shouldCrawl(UriInterface $url): bool { return true; @@ -286,11 +318,6 @@ public function it_will_crawl_to_specified_depth() ]); } - public static function log(string $text) - { - file_put_contents(static::$logPath, $text.PHP_EOL, FILE_APPEND); - } - /** @test */ public function profile_crawls_a_domain_and_its_subdomains() { @@ -353,6 +380,20 @@ public function it_should_not_follow_nofollow_links() $this->assertNotCrawled([['url' => 'http://localhost:8080/nofollow', 'foundOn' => 'http://localhost:8080/']]); } + /** @test */ + public function it_should_handle_redirects_correctly_when_tracking_is_active() + { + Crawler::create([ + RequestOptions::ALLOW_REDIRECTS => [ + 'track_redirects' => true, + ], + ]) + ->setCrawlObserver(new CrawlLogger()) + ->startCrawling('http://localhost:8080/dir1/infernal-redirect-entry/'); + + $this->assertCrawledUrlCount(3); + } + /** @test */ public function it_respects_the_requested_delay_between_requests() { @@ -363,7 +404,7 @@ public function it_respects_the_requested_delay_between_requests() Crawler::create() ->setCrawlObserver(new CrawlLogger()) ->setMaximumDepth(2) - ->setDelayBetweenRequests(500) // 500ms + ->setDelayBetweenRequests(500)// 500ms ->setCrawlProfile(new CrawlSubdomains($baseUrl)) ->startCrawling($baseUrl); @@ -386,28 +427,4 @@ public function custom_crawl_request_handlers_must_extend_abstracts() Crawler::create()->setCrawlFailedHandlerClass(stdClass::class); } - - protected function regularUrls(): array - { - return [ - ['url' => 'http://localhost:8080/'], - ['url' => 'http://localhost:8080/link1', 'foundOn' => 'http://localhost:8080/'], - ['url' => 'http://localhost:8080/link1-prev', 'foundOn' => 'http://localhost:8080/link1'], - ['url' => 'http://localhost:8080/link1-next', 'foundOn' => 'http://localhost:8080/link1'], - ['url' => 'http://localhost:8080/link2', 'foundOn' => 'http://localhost:8080/'], - ['url' => 
'http://localhost:8080/link3', 'foundOn' => 'http://localhost:8080/link2'], - ['url' => 'http://localhost:8080/notExists', 'foundOn' => 'http://localhost:8080/link3'], - ['url' => 'http://example.com/', 'foundOn' => 'http://localhost:8080/link1'], - ['url' => 'http://localhost:8080/dir/link4', 'foundOn' => 'http://localhost:8080/'], - ['url' => 'http://localhost:8080/dir/link5', 'foundOn' => 'http://localhost:8080/dir/link4'], - ['url' => 'http://localhost:8080/dir/subdir/link6', 'foundOn' => 'http://localhost:8080/dir/link5'], - ]; - } - - protected function javascriptInjectedUrls(): array - { - return [ - ['url' => 'http://localhost:8080/javascript', 'foundOn' => 'http://localhost:8080/link1'], - ]; - } } diff --git a/tests/server/server.js b/tests/server/server.js index 73138fa..77b9103 100644 --- a/tests/server/server.js +++ b/tests/server/server.js @@ -62,6 +62,18 @@ app.get('/meta-nofollow', function (request, response) { response.end('\n\nno follow it'); }); +app.get('/dir1/infernal-redirect-entry/', function (request, response) { + response.end('trapped trap-start'); +}); + +app.get('/dir1/infernal-redirect/trap/', function (request, response) { + response.redirect(301, '/dir1/infernal-redirect-entry/'); +}); + +app.get('/dir1/loop-generator/infernal-redirect/trapped/', function (request, response) { + response.end('It should be crawled once'); +}); + app.get('/meta-nofollow-target', function (request, response) { response.end('No followable'); }); From e111232c01a6f4ed723c4c762e6a9af7ae9d6cfd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Monroy=20Fern=C3=A1ndez?= Date: Tue, 21 May 2019 12:08:42 +0200 Subject: [PATCH 2/8] Fixed incorrect crawling of relative URLs after redirects. 
--- src/Handlers/CrawlRequestFulfilled.php | 12 +++++- tests/CrawlerTest.php | 60 +++++++++++++------------- 2 files changed, 40 insertions(+), 32 deletions(-) diff --git a/src/Handlers/CrawlRequestFulfilled.php b/src/Handlers/CrawlRequestFulfilled.php index cf33674..6ba94eb 100644 --- a/src/Handlers/CrawlRequestFulfilled.php +++ b/src/Handlers/CrawlRequestFulfilled.php @@ -2,6 +2,8 @@ namespace Spatie\Crawler\Handlers; +use GuzzleHttp\Psr7\Uri; +use GuzzleHttp\RedirectMiddleware; use Spatie\Crawler\Crawler; use Spatie\Crawler\CrawlUrl; use Spatie\Crawler\LinkAdder; @@ -55,7 +57,15 @@ public function __invoke(ResponseInterface $response, $index) $body = $this->convertBodyToString($response->getBody(), $this->crawler->getMaximumResponseSize()); - $this->linkAdder->addFromHtml($body, $crawlUrl->url); + $historyHeader = $response->getHeader(RedirectMiddleware::HISTORY_HEADER); + if (count($historyHeader) > 0) { + $lastRedirectUrl = $historyHeader[count($historyHeader)-1]; + $baseUrl = new Uri($lastRedirectUrl); + } else { + $baseUrl = $crawlUrl->url; + } + + $this->linkAdder->addFromHtml($body, $baseUrl); usleep($this->crawler->getDelayBetweenRequests()); } diff --git a/tests/CrawlerTest.php b/tests/CrawlerTest.php index 3508635..1ac1faf 100644 --- a/tests/CrawlerTest.php +++ b/tests/CrawlerTest.php @@ -2,24 +2,19 @@ namespace Spatie\Crawler\Test; +use stdClass; use GuzzleHttp\Psr7\Uri; +use Spatie\Crawler\Crawler; use GuzzleHttp\RequestOptions; +use Spatie\Crawler\CrawlProfile; use Psr\Http\Message\UriInterface; use Spatie\Browsershot\Browsershot; -use Spatie\Crawler\Crawler; -use Spatie\Crawler\CrawlInternalUrls; -use Spatie\Crawler\CrawlProfile; use Spatie\Crawler\CrawlSubdomains; +use Spatie\Crawler\CrawlInternalUrls; use Spatie\Crawler\Exception\InvalidCrawlRequestHandler; -use stdClass; class CrawlerTest extends TestCase { - public static function log(string $text) - { - file_put_contents(static::$logPath, $text . 
PHP_EOL, FILE_APPEND); - } - public function setUp() { parent::setUp(); @@ -41,23 +36,6 @@ public function it_will_crawl_all_found_urls() $this->assertNotCrawled($this->javascriptInjectedUrls()); } - protected function regularUrls(): array - { - return [ - ['url' => 'http://localhost:8080/'], - ['url' => 'http://localhost:8080/link1', 'foundOn' => 'http://localhost:8080/'], - ['url' => 'http://localhost:8080/link1-prev', 'foundOn' => 'http://localhost:8080/link1'], - ['url' => 'http://localhost:8080/link1-next', 'foundOn' => 'http://localhost:8080/link1'], - ['url' => 'http://localhost:8080/link2', 'foundOn' => 'http://localhost:8080/'], - ['url' => 'http://localhost:8080/link3', 'foundOn' => 'http://localhost:8080/link2'], - ['url' => 'http://localhost:8080/notExists', 'foundOn' => 'http://localhost:8080/link3'], - ['url' => 'http://example.com/', 'foundOn' => 'http://localhost:8080/link1'], - ['url' => 'http://localhost:8080/dir/link4', 'foundOn' => 'http://localhost:8080/'], - ['url' => 'http://localhost:8080/dir/link5', 'foundOn' => 'http://localhost:8080/dir/link4'], - ['url' => 'http://localhost:8080/dir/subdir/link6', 'foundOn' => 'http://localhost:8080/dir/link5'], - ]; - } - protected function javascriptInjectedUrls(): array { return [ @@ -173,8 +151,7 @@ public function it_has_a_method_to_disable_executing_javascript() /** @test */ public function it_uses_a_crawl_profile_to_determine_what_should_be_crawled() { - $crawlProfile = new class extends CrawlProfile - { + $crawlProfile = new class extends CrawlProfile { public function shouldCrawl(UriInterface $url): bool { return $url->getPath() !== '/link3'; @@ -217,8 +194,7 @@ public function it_uses_crawl_profile_for_internal_urls() /** @test */ public function it_can_handle_pages_with_invalid_urls() { - $crawlProfile = new class extends CrawlProfile - { + $crawlProfile = new class extends CrawlProfile { public function shouldCrawl(UriInterface $url): bool { return true; @@ -318,6 +294,11 @@ public function 
it_will_crawl_to_specified_depth() ]); } + public static function log(string $text) + { + file_put_contents(static::$logPath, $text . PHP_EOL, FILE_APPEND); + } + /** @test */ public function profile_crawls_a_domain_and_its_subdomains() { @@ -394,6 +375,23 @@ public function it_should_handle_redirects_correctly_when_tracking_is_active() $this->assertCrawledUrlCount(3); } + protected function regularUrls(): array + { + return [ + ['url' => 'http://localhost:8080/'], + ['url' => 'http://localhost:8080/link1', 'foundOn' => 'http://localhost:8080/'], + ['url' => 'http://localhost:8080/link1-prev', 'foundOn' => 'http://localhost:8080/link1'], + ['url' => 'http://localhost:8080/link1-next', 'foundOn' => 'http://localhost:8080/link1'], + ['url' => 'http://localhost:8080/link2', 'foundOn' => 'http://localhost:8080/'], + ['url' => 'http://localhost:8080/link3', 'foundOn' => 'http://localhost:8080/link2'], + ['url' => 'http://localhost:8080/notExists', 'foundOn' => 'http://localhost:8080/link3'], + ['url' => 'http://example.com/', 'foundOn' => 'http://localhost:8080/link1'], + ['url' => 'http://localhost:8080/dir/link4', 'foundOn' => 'http://localhost:8080/'], + ['url' => 'http://localhost:8080/dir/link5', 'foundOn' => 'http://localhost:8080/dir/link4'], + ['url' => 'http://localhost:8080/dir/subdir/link6', 'foundOn' => 'http://localhost:8080/dir/link5'], + ]; + } + /** @test */ public function it_respects_the_requested_delay_between_requests() { @@ -404,7 +402,7 @@ public function it_respects_the_requested_delay_between_requests() Crawler::create() ->setCrawlObserver(new CrawlLogger()) ->setMaximumDepth(2) - ->setDelayBetweenRequests(500)// 500ms + ->setDelayBetweenRequests(500) // 500ms ->setCrawlProfile(new CrawlSubdomains($baseUrl)) ->startCrawling($baseUrl); From 6a1036119b72ae549fad77312c4f302d78e669aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Monroy=20Fern=C3=A1ndez?= Date: Tue, 21 May 2019 12:32:46 +0200 Subject: [PATCH 3/8] Fixed Style --- 
src/Handlers/CrawlRequestFulfilled.php | 4 ++-- tests/CrawlerTest.php | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Handlers/CrawlRequestFulfilled.php b/src/Handlers/CrawlRequestFulfilled.php index 6ba94eb..42aec48 100644 --- a/src/Handlers/CrawlRequestFulfilled.php +++ b/src/Handlers/CrawlRequestFulfilled.php @@ -3,11 +3,11 @@ namespace Spatie\Crawler\Handlers; use GuzzleHttp\Psr7\Uri; -use GuzzleHttp\RedirectMiddleware; use Spatie\Crawler\Crawler; use Spatie\Crawler\CrawlUrl; use Spatie\Crawler\LinkAdder; use Spatie\Crawler\CrawlerRobots; +use GuzzleHttp\RedirectMiddleware; use Psr\Http\Message\UriInterface; use Spatie\Crawler\CrawlSubdomains; use Psr\Http\Message\StreamInterface; @@ -59,7 +59,7 @@ public function __invoke(ResponseInterface $response, $index) $historyHeader = $response->getHeader(RedirectMiddleware::HISTORY_HEADER); if (count($historyHeader) > 0) { - $lastRedirectUrl = $historyHeader[count($historyHeader)-1]; + $lastRedirectUrl = $historyHeader[count($historyHeader) - 1]; $baseUrl = new Uri($lastRedirectUrl); } else { $baseUrl = $crawlUrl->url; diff --git a/tests/CrawlerTest.php b/tests/CrawlerTest.php index 1ac1faf..a6c511f 100644 --- a/tests/CrawlerTest.php +++ b/tests/CrawlerTest.php @@ -296,7 +296,7 @@ public function it_will_crawl_to_specified_depth() public static function log(string $text) { - file_put_contents(static::$logPath, $text . PHP_EOL, FILE_APPEND); + file_put_contents(static::$logPath, $text.PHP_EOL, FILE_APPEND); } /** @test */ From 7a89cdce940e5a0c09eaecf953281b1d88b15a78 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Monroy=20Fern=C3=A1ndez?= Date: Tue, 21 May 2019 13:01:13 +0200 Subject: [PATCH 4/8] Improved code cleanliness. 
--- src/Handlers/CrawlRequestFulfilled.php | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/Handlers/CrawlRequestFulfilled.php b/src/Handlers/CrawlRequestFulfilled.php index 42aec48..9c364f9 100644 --- a/src/Handlers/CrawlRequestFulfilled.php +++ b/src/Handlers/CrawlRequestFulfilled.php @@ -57,10 +57,9 @@ public function __invoke(ResponseInterface $response, $index) $body = $this->convertBodyToString($response->getBody(), $this->crawler->getMaximumResponseSize()); - $historyHeader = $response->getHeader(RedirectMiddleware::HISTORY_HEADER); - if (count($historyHeader) > 0) { - $lastRedirectUrl = $historyHeader[count($historyHeader) - 1]; - $baseUrl = new Uri($lastRedirectUrl); + $redirectHistory = $response->getHeader(RedirectMiddleware::HISTORY_HEADER); + if (! empty($redirectHistory)) { + $baseUrl = new Uri(end($redirectHistory)); } else { $baseUrl = $crawlUrl->url; } From 932db4318d5ef38a7573a99fe9769bdc18d01cf5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Monroy=20Fern=C3=A1ndez?= Date: Thu, 23 May 2019 09:45:57 +0200 Subject: [PATCH 5/8] Fixed imports to comply with PSR --- src/Handlers/CrawlRequestFulfilled.php | 12 ++++++------ tests/CrawlerTest.php | 8 ++++---- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/Handlers/CrawlRequestFulfilled.php b/src/Handlers/CrawlRequestFulfilled.php index 9c364f9..1331bf3 100644 --- a/src/Handlers/CrawlRequestFulfilled.php +++ b/src/Handlers/CrawlRequestFulfilled.php @@ -3,15 +3,15 @@ namespace Spatie\Crawler\Handlers; use GuzzleHttp\Psr7\Uri; -use Spatie\Crawler\Crawler; -use Spatie\Crawler\CrawlUrl; -use Spatie\Crawler\LinkAdder; -use Spatie\Crawler\CrawlerRobots; use GuzzleHttp\RedirectMiddleware; +use Psr\Http\Message\ResponseInterface; +use Psr\Http\Message\StreamInterface; use Psr\Http\Message\UriInterface; +use Spatie\Crawler\Crawler; +use Spatie\Crawler\CrawlerRobots; use Spatie\Crawler\CrawlSubdomains; -use Psr\Http\Message\StreamInterface; -use 
Psr\Http\Message\ResponseInterface; +use Spatie\Crawler\CrawlUrl; +use Spatie\Crawler\LinkAdder; use function GuzzleHttp\Psr7\stream_for; class CrawlRequestFulfilled diff --git a/tests/CrawlerTest.php b/tests/CrawlerTest.php index a6c511f..105ee83 100644 --- a/tests/CrawlerTest.php +++ b/tests/CrawlerTest.php @@ -2,16 +2,16 @@ namespace Spatie\Crawler\Test; -use stdClass; use GuzzleHttp\Psr7\Uri; -use Spatie\Crawler\Crawler; use GuzzleHttp\RequestOptions; -use Spatie\Crawler\CrawlProfile; use Psr\Http\Message\UriInterface; use Spatie\Browsershot\Browsershot; -use Spatie\Crawler\CrawlSubdomains; +use Spatie\Crawler\Crawler; use Spatie\Crawler\CrawlInternalUrls; +use Spatie\Crawler\CrawlProfile; +use Spatie\Crawler\CrawlSubdomains; use Spatie\Crawler\Exception\InvalidCrawlRequestHandler; +use stdClass; class CrawlerTest extends TestCase { From f073be154e7d7bd3fa1f66535b92a1a0bbe05f51 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Monroy=20Fern=C3=A1ndez?= Date: Thu, 23 May 2019 09:52:35 +0200 Subject: [PATCH 6/8] Making StyleCi happy again. 
--- src/Handlers/CrawlRequestFulfilled.php | 12 ++++++------ tests/CrawlerTest.php | 8 ++++---- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/Handlers/CrawlRequestFulfilled.php b/src/Handlers/CrawlRequestFulfilled.php index 1331bf3..9c364f9 100644 --- a/src/Handlers/CrawlRequestFulfilled.php +++ b/src/Handlers/CrawlRequestFulfilled.php @@ -3,15 +3,15 @@ namespace Spatie\Crawler\Handlers; use GuzzleHttp\Psr7\Uri; -use GuzzleHttp\RedirectMiddleware; -use Psr\Http\Message\ResponseInterface; -use Psr\Http\Message\StreamInterface; -use Psr\Http\Message\UriInterface; use Spatie\Crawler\Crawler; -use Spatie\Crawler\CrawlerRobots; -use Spatie\Crawler\CrawlSubdomains; use Spatie\Crawler\CrawlUrl; use Spatie\Crawler\LinkAdder; +use Spatie\Crawler\CrawlerRobots; +use GuzzleHttp\RedirectMiddleware; +use Psr\Http\Message\UriInterface; +use Spatie\Crawler\CrawlSubdomains; +use Psr\Http\Message\StreamInterface; +use Psr\Http\Message\ResponseInterface; use function GuzzleHttp\Psr7\stream_for; class CrawlRequestFulfilled diff --git a/tests/CrawlerTest.php b/tests/CrawlerTest.php index 105ee83..a6c511f 100644 --- a/tests/CrawlerTest.php +++ b/tests/CrawlerTest.php @@ -2,16 +2,16 @@ namespace Spatie\Crawler\Test; +use stdClass; use GuzzleHttp\Psr7\Uri; +use Spatie\Crawler\Crawler; use GuzzleHttp\RequestOptions; +use Spatie\Crawler\CrawlProfile; use Psr\Http\Message\UriInterface; use Spatie\Browsershot\Browsershot; -use Spatie\Crawler\Crawler; -use Spatie\Crawler\CrawlInternalUrls; -use Spatie\Crawler\CrawlProfile; use Spatie\Crawler\CrawlSubdomains; +use Spatie\Crawler\CrawlInternalUrls; use Spatie\Crawler\Exception\InvalidCrawlRequestHandler; -use stdClass; class CrawlerTest extends TestCase { From aaf2938d598c489caf3a9be261e5fffcdcb55aca Mon Sep 17 00:00:00 2001 From: Rias Date: Thu, 6 Jun 2019 09:56:19 +0200 Subject: [PATCH 7/8] Refactor to early return --- src/Handlers/CrawlRequestFulfilled.php | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 
deletions(-) diff --git a/src/Handlers/CrawlRequestFulfilled.php b/src/Handlers/CrawlRequestFulfilled.php index 9c364f9..85e065a 100644 --- a/src/Handlers/CrawlRequestFulfilled.php +++ b/src/Handlers/CrawlRequestFulfilled.php @@ -56,19 +56,24 @@ public function __invoke(ResponseInterface $response, $index) } $body = $this->convertBodyToString($response->getBody(), $this->crawler->getMaximumResponseSize()); - - $redirectHistory = $response->getHeader(RedirectMiddleware::HISTORY_HEADER); - if (! empty($redirectHistory)) { - $baseUrl = new Uri(end($redirectHistory)); - } else { - $baseUrl = $crawlUrl->url; - } + $baseUrl = $this->getBaseUrl($response, $crawlUrl); $this->linkAdder->addFromHtml($body, $baseUrl); usleep($this->crawler->getDelayBetweenRequests()); } + protected function getBaseUrl(ResponseInterface $response, CrawlUrl $crawlUrl) + { + $redirectHistory = $response->getHeader(RedirectMiddleware::HISTORY_HEADER); + + if (empty($redirectHistory)) { + return $crawlUrl->url; + } + + return new Uri(end($redirectHistory)); + } + protected function handleCrawled(ResponseInterface $response, CrawlUrl $crawlUrl) { $this->crawler->getCrawlObservers()->crawled($crawlUrl, $response); From dc5fc87316710f2eb82fd73121d76a6d46941c7c Mon Sep 17 00:00:00 2001 From: Rias Date: Thu, 6 Jun 2019 09:56:27 +0200 Subject: [PATCH 8/8] Fix typo in tests --- tests/CrawlerTest.php | 2 +- tests/server/server.js | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/CrawlerTest.php b/tests/CrawlerTest.php index a6c511f..4624bc4 100644 --- a/tests/CrawlerTest.php +++ b/tests/CrawlerTest.php @@ -370,7 +370,7 @@ public function it_should_handle_redirects_correctly_when_tracking_is_active() ], ]) ->setCrawlObserver(new CrawlLogger()) - ->startCrawling('http://localhost:8080/dir1/infernal-redirect-entry/'); + ->startCrawling('http://localhost:8080/dir1/internal-redirect-entry/'); $this->assertCrawledUrlCount(3); } diff --git a/tests/server/server.js 
b/tests/server/server.js index 77b9103..0ad1281 100644 --- a/tests/server/server.js +++ b/tests/server/server.js @@ -62,15 +62,15 @@ app.get('/meta-nofollow', function (request, response) { response.end('\n\nno follow it'); }); -app.get('/dir1/infernal-redirect-entry/', function (request, response) { - response.end('trapped trap-start'); +app.get('/dir1/internal-redirect-entry/', function (request, response) { + response.end('trapped trap-start'); }); -app.get('/dir1/infernal-redirect/trap/', function (request, response) { - response.redirect(301, '/dir1/infernal-redirect-entry/'); +app.get('/dir1/internal-redirect/trap/', function (request, response) { + response.redirect(301, '/dir1/internal-redirect-entry/'); }); -app.get('/dir1/loop-generator/infernal-redirect/trapped/', function (request, response) { +app.get('/dir1/loop-generator/internal-redirect/trapped/', function (request, response) { response.end('It should be crawled once'); });