diff --git a/src/Handlers/CrawlRequestFulfilled.php b/src/Handlers/CrawlRequestFulfilled.php index cf33674..85e065a 100644 --- a/src/Handlers/CrawlRequestFulfilled.php +++ b/src/Handlers/CrawlRequestFulfilled.php @@ -2,10 +2,12 @@ namespace Spatie\Crawler\Handlers; +use GuzzleHttp\Psr7\Uri; use Spatie\Crawler\Crawler; use Spatie\Crawler\CrawlUrl; use Spatie\Crawler\LinkAdder; use Spatie\Crawler\CrawlerRobots; +use GuzzleHttp\RedirectMiddleware; use Psr\Http\Message\UriInterface; use Spatie\Crawler\CrawlSubdomains; use Psr\Http\Message\StreamInterface; @@ -54,12 +56,24 @@ public function __invoke(ResponseInterface $response, $index) } $body = $this->convertBodyToString($response->getBody(), $this->crawler->getMaximumResponseSize()); + $baseUrl = $this->getBaseUrl($response, $crawlUrl); - $this->linkAdder->addFromHtml($body, $crawlUrl->url); + $this->linkAdder->addFromHtml($body, $baseUrl); usleep($this->crawler->getDelayBetweenRequests()); } + protected function getBaseUrl(ResponseInterface $response, CrawlUrl $crawlUrl) + { + $redirectHistory = $response->getHeader(RedirectMiddleware::HISTORY_HEADER); + + if (empty($redirectHistory)) { + return $crawlUrl->url; + } + + return new Uri(end($redirectHistory)); + } + protected function handleCrawled(ResponseInterface $response, CrawlUrl $crawlUrl) { $this->crawler->getCrawlObservers()->crawled($crawlUrl, $response); diff --git a/tests/CrawlerTest.php b/tests/CrawlerTest.php index dab959f..4624bc4 100644 --- a/tests/CrawlerTest.php +++ b/tests/CrawlerTest.php @@ -5,6 +5,7 @@ use stdClass; use GuzzleHttp\Psr7\Uri; use Spatie\Crawler\Crawler; +use GuzzleHttp\RequestOptions; use Spatie\Crawler\CrawlProfile; use Psr\Http\Message\UriInterface; use Spatie\Browsershot\Browsershot; @@ -35,6 +36,13 @@ public function it_will_crawl_all_found_urls() $this->assertNotCrawled($this->javascriptInjectedUrls()); } + protected function javascriptInjectedUrls(): array + { + return [ + ['url' => 'http://localhost:8080/javascript', 'foundOn' => 'http://localhost:8080/link1'], + ]; + } + /** @test */ public function it_will_not_crawl_tel_links() { @@ -353,6 +361,37 @@ public function it_should_not_follow_nofollow_links() $this->assertNotCrawled([['url' => 'http://localhost:8080/nofollow', 'foundOn' => 'http://localhost:8080/']]); } + /** @test */ + public function it_should_handle_redirects_correctly_when_tracking_is_active() + { + Crawler::create([ + RequestOptions::ALLOW_REDIRECTS => [ + 'track_redirects' => true, + ], + ]) + ->setCrawlObserver(new CrawlLogger()) + ->startCrawling('http://localhost:8080/dir1/internal-redirect-entry/'); + + $this->assertCrawledUrlCount(3); + } + + protected function regularUrls(): array + { + return [ + ['url' => 'http://localhost:8080/'], + ['url' => 'http://localhost:8080/link1', 'foundOn' => 'http://localhost:8080/'], + ['url' => 'http://localhost:8080/link1-prev', 'foundOn' => 'http://localhost:8080/link1'], + ['url' => 'http://localhost:8080/link1-next', 'foundOn' => 'http://localhost:8080/link1'], + ['url' => 'http://localhost:8080/link2', 'foundOn' => 'http://localhost:8080/'], + ['url' => 'http://localhost:8080/link3', 'foundOn' => 'http://localhost:8080/link2'], + ['url' => 'http://localhost:8080/notExists', 'foundOn' => 'http://localhost:8080/link3'], + ['url' => 'http://example.com/', 'foundOn' => 'http://localhost:8080/link1'], + ['url' => 'http://localhost:8080/dir/link4', 'foundOn' => 'http://localhost:8080/'], + ['url' => 'http://localhost:8080/dir/link5', 'foundOn' => 'http://localhost:8080/dir/link4'], + ['url' => 'http://localhost:8080/dir/subdir/link6', 'foundOn' => 'http://localhost:8080/dir/link5'], + ]; + } + /** @test */ public function it_respects_the_requested_delay_between_requests() { @@ -386,28 +425,4 @@ public function custom_crawl_request_handlers_must_extend_abstracts() Crawler::create()->setCrawlFailedHandlerClass(stdClass::class); } - - protected function regularUrls(): array - { - return [ - ['url' => 'http://localhost:8080/'], - ['url' => 'http://localhost:8080/link1', 'foundOn' => 'http://localhost:8080/'], - ['url' => 'http://localhost:8080/link1-prev', 'foundOn' => 'http://localhost:8080/link1'], - ['url' => 'http://localhost:8080/link1-next', 'foundOn' => 'http://localhost:8080/link1'], - ['url' => 'http://localhost:8080/link2', 'foundOn' => 'http://localhost:8080/'], - ['url' => 'http://localhost:8080/link3', 'foundOn' => 'http://localhost:8080/link2'], - ['url' => 'http://localhost:8080/notExists', 'foundOn' => 'http://localhost:8080/link3'], - ['url' => 'http://example.com/', 'foundOn' => 'http://localhost:8080/link1'], - ['url' => 'http://localhost:8080/dir/link4', 'foundOn' => 'http://localhost:8080/'], - ['url' => 'http://localhost:8080/dir/link5', 'foundOn' => 'http://localhost:8080/dir/link4'], - ['url' => 'http://localhost:8080/dir/subdir/link6', 'foundOn' => 'http://localhost:8080/dir/link5'], - ]; - } - - protected function javascriptInjectedUrls(): array - { - return [ - ['url' => 'http://localhost:8080/javascript', 'foundOn' => 'http://localhost:8080/link1'], - ]; - } } diff --git a/tests/server/server.js b/tests/server/server.js index 73138fa..0ad1281 100644 --- a/tests/server/server.js +++ b/tests/server/server.js @@ -62,6 +62,18 @@ app.get('/meta-nofollow', function (request, response) { response.end('\n\nno follow it'); }); +app.get('/dir1/internal-redirect-entry/', function (request, response) { + response.end('trapped trap-start'); +}); + +app.get('/dir1/internal-redirect/trap/', function (request, response) { + response.redirect(301, '/dir1/internal-redirect-entry/'); +}); + +app.get('/dir1/loop-generator/internal-redirect/trapped/', function (request, response) { + response.end('It should be crawled once'); +}); + app.get('/meta-nofollow-target', function (request, response) { response.end('No followable'); });