Skip to content

Commit

Permalink
Merge branch 'Boardfy-fix/incorrect-relative-redirect-handling'
Browse files Browse the repository at this point in the history
  • Loading branch information
Rias committed Jun 6, 2019
2 parents e991498 + dc5fc87 commit b761cd5
Show file tree
Hide file tree
Showing 3 changed files with 66 additions and 25 deletions.
16 changes: 15 additions & 1 deletion src/Handlers/CrawlRequestFulfilled.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,12 @@

namespace Spatie\Crawler\Handlers;

use GuzzleHttp\Psr7\Uri;
use Spatie\Crawler\Crawler;
use Spatie\Crawler\CrawlUrl;
use Spatie\Crawler\LinkAdder;
use Spatie\Crawler\CrawlerRobots;
use GuzzleHttp\RedirectMiddleware;
use Psr\Http\Message\UriInterface;
use Spatie\Crawler\CrawlSubdomains;
use Psr\Http\Message\StreamInterface;
Expand Down Expand Up @@ -54,12 +56,24 @@ public function __invoke(ResponseInterface $response, $index)
}

$body = $this->convertBodyToString($response->getBody(), $this->crawler->getMaximumResponseSize());
$baseUrl = $this->getBaseUrl($response, $crawlUrl);

$this->linkAdder->addFromHtml($body, $crawlUrl->url);
$this->linkAdder->addFromHtml($body, $baseUrl);

usleep($this->crawler->getDelayBetweenRequests());
}

/**
 * Pick the URL that is handed to the link adder as the base for the response body.
 *
 * The header named by RedirectMiddleware::HISTORY_HEADER is read from the
 * response. If it has at least one value, the last value is wrapped in a Uri
 * and returned; otherwise the crawl item's own URL is returned untouched.
 *
 * NOTE(review): the last history entry is presumably the final redirect target
 * recorded by Guzzle when `track_redirects` is enabled — confirm against the
 * Guzzle documentation.
 *
 * @param ResponseInterface $response Response whose redirect history is inspected.
 * @param CrawlUrl          $crawlUrl Crawl item supplying the fallback URL.
 *
 * @return mixed Either `$crawlUrl->url` as-is, or a new Uri built from the last history value.
 */
protected function getBaseUrl(ResponseInterface $response, CrawlUrl $crawlUrl)
{
    $visitedUrls = $response->getHeader(RedirectMiddleware::HISTORY_HEADER);

    if ($visitedUrls) {
        $lastVisitedUrl = end($visitedUrls);

        return new Uri($lastVisitedUrl);
    }

    // No redirect history on the response: fall back to the crawl item's URL.
    return $crawlUrl->url;
}

protected function handleCrawled(ResponseInterface $response, CrawlUrl $crawlUrl)
{
$this->crawler->getCrawlObservers()->crawled($crawlUrl, $response);
Expand Down
63 changes: 39 additions & 24 deletions tests/CrawlerTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
use stdClass;
use GuzzleHttp\Psr7\Uri;
use Spatie\Crawler\Crawler;
use GuzzleHttp\RequestOptions;
use Spatie\Crawler\CrawlProfile;
use Psr\Http\Message\UriInterface;
use Spatie\Browsershot\Browsershot;
Expand Down Expand Up @@ -35,6 +36,13 @@ public function it_will_crawl_all_found_urls()
$this->assertNotCrawled($this->javascriptInjectedUrls());
}

/**
 * URL records for links that only exist after JavaScript has run.
 *
 * Each record carries a 'url' key and the 'foundOn' page it was discovered on.
 *
 * @return array
 */
protected function javascriptInjectedUrls(): array
{
    $injectedUrls = [
        ['url' => 'http://localhost:8080/javascript', 'foundOn' => 'http://localhost:8080/link1'],
    ];

    return $injectedUrls;
}

/** @test */
public function it_will_not_crawl_tel_links()
{
Expand Down Expand Up @@ -353,6 +361,37 @@ public function it_should_not_follow_nofollow_links()
$this->assertNotCrawled([['url' => 'http://localhost:8080/nofollow', 'foundOn' => 'http://localhost:8080/']]);
}

/**
 * Starts a crawl at the redirect entry page with Guzzle's `track_redirects`
 * option enabled and expects exactly three crawled URLs to be recorded.
 *
 * @test
 */
public function it_should_handle_redirects_correctly_when_tracking_is_active()
{
    $clientOptions = [
        RequestOptions::ALLOW_REDIRECTS => [
            'track_redirects' => true,
        ],
    ];

    $crawler = Crawler::create($clientOptions);
    $crawler = $crawler->setCrawlObserver(new CrawlLogger());

    $crawler->startCrawling('http://localhost:8080/dir1/internal-redirect-entry/');

    $this->assertCrawledUrlCount(3);
}

/**
 * URL records for the plain HTML links served by the local test server.
 *
 * Every record has a 'url' key; all but the first also name the page the
 * link was found on under 'foundOn'.
 *
 * @return array
 */
protected function regularUrls(): array
{
    $expectedUrls = [
        ['url' => 'http://localhost:8080/'],
        ['url' => 'http://localhost:8080/link1', 'foundOn' => 'http://localhost:8080/'],
        ['url' => 'http://localhost:8080/link1-prev', 'foundOn' => 'http://localhost:8080/link1'],
        ['url' => 'http://localhost:8080/link1-next', 'foundOn' => 'http://localhost:8080/link1'],
        ['url' => 'http://localhost:8080/link2', 'foundOn' => 'http://localhost:8080/'],
        ['url' => 'http://localhost:8080/link3', 'foundOn' => 'http://localhost:8080/link2'],
        ['url' => 'http://localhost:8080/notExists', 'foundOn' => 'http://localhost:8080/link3'],
        ['url' => 'http://example.com/', 'foundOn' => 'http://localhost:8080/link1'],
        ['url' => 'http://localhost:8080/dir/link4', 'foundOn' => 'http://localhost:8080/'],
        ['url' => 'http://localhost:8080/dir/link5', 'foundOn' => 'http://localhost:8080/dir/link4'],
        ['url' => 'http://localhost:8080/dir/subdir/link6', 'foundOn' => 'http://localhost:8080/dir/link5'],
    ];

    return $expectedUrls;
}

/** @test */
public function it_respects_the_requested_delay_between_requests()
{
Expand Down Expand Up @@ -386,28 +425,4 @@ public function custom_crawl_request_handlers_must_extend_abstracts()

Crawler::create()->setCrawlFailedHandlerClass(stdClass::class);
}

/**
 * URL records for the plain HTML links served by the local test server.
 *
 * Every record has a 'url' key; all but the first also name the page the
 * link was found on under 'foundOn'.
 *
 * @return array
 */
protected function regularUrls(): array
{
    $expectedUrls = [
        ['url' => 'http://localhost:8080/'],
        ['url' => 'http://localhost:8080/link1', 'foundOn' => 'http://localhost:8080/'],
        ['url' => 'http://localhost:8080/link1-prev', 'foundOn' => 'http://localhost:8080/link1'],
        ['url' => 'http://localhost:8080/link1-next', 'foundOn' => 'http://localhost:8080/link1'],
        ['url' => 'http://localhost:8080/link2', 'foundOn' => 'http://localhost:8080/'],
        ['url' => 'http://localhost:8080/link3', 'foundOn' => 'http://localhost:8080/link2'],
        ['url' => 'http://localhost:8080/notExists', 'foundOn' => 'http://localhost:8080/link3'],
        ['url' => 'http://example.com/', 'foundOn' => 'http://localhost:8080/link1'],
        ['url' => 'http://localhost:8080/dir/link4', 'foundOn' => 'http://localhost:8080/'],
        ['url' => 'http://localhost:8080/dir/link5', 'foundOn' => 'http://localhost:8080/dir/link4'],
        ['url' => 'http://localhost:8080/dir/subdir/link6', 'foundOn' => 'http://localhost:8080/dir/link5'],
    ];

    return $expectedUrls;
}

/**
 * URL records for links that only exist after JavaScript has run.
 *
 * Each record carries a 'url' key and the 'foundOn' page it was discovered on.
 *
 * @return array
 */
protected function javascriptInjectedUrls(): array
{
    $injectedUrls = [
        ['url' => 'http://localhost:8080/javascript', 'foundOn' => 'http://localhost:8080/link1'],
    ];

    return $injectedUrls;
}
}
12 changes: 12 additions & 0 deletions tests/server/server.js
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,18 @@ app.get('/meta-nofollow', function (request, response) {
response.end('<html><head>\n<meta name="robots" content="index, nofollow">\n</head><body><a href="/meta-nofollow-target">no follow it</a></body></html>');
});

// Entry page of the redirect scenario: responds with two relative links,
// labelled "trapped" and "trap-start".
app.get('/dir1/internal-redirect-entry/', function (req, res) {
    var entryHtml = '<a href="../loop-generator/internal-redirect/trapped/">trapped</a> <a href="../../dir1/internal-redirect/trap/">trap-start</a>';

    res.end(entryHtml);
});

// Answers every GET with a 301 redirect to /dir1/internal-redirect-entry/.
app.get('/dir1/internal-redirect/trap/', function (req, res) {
    var redirectTarget = '/dir1/internal-redirect-entry/';

    res.redirect(301, redirectTarget);
});

// Plain-text leaf page with no further links.
app.get('/dir1/loop-generator/internal-redirect/trapped/', function (req, res) {
    var pageText = 'It should be crawled once';

    res.end(pageText);
});

// Plain-text page served at /meta-nofollow-target.
app.get('/meta-nofollow-target', function (req, res) {
    var pageText = 'No followable';

    res.end(pageText);
});
Expand Down

0 comments on commit b761cd5

Please sign in to comment.