Skip to content

Commit

Permalink
fix for exception being thrown when encountering a malformatted url
Browse files Browse the repository at this point in the history
  • Loading branch information
freekmurze committed Jan 2, 2018
1 parent cc66278 commit 66151fe
Show file tree
Hide file tree
Showing 4 changed files with 35 additions and 2 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@

All notable changes to `spatie/crawler` will be documented in this file.

## 3.0.1 - 2018-01-02
- fix for exception being thrown when encountering a malformed url

## 3.0.0 - 2017-12-22
- use `\Psr\Http\Message\UriInterface` for all urls
- use Puppeteer
Expand Down
10 changes: 8 additions & 2 deletions src/Crawler.php
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
use GuzzleHttp\Exception\RequestException;
use Spatie\Crawler\CrawlQueue\CollectionCrawlQueue;
use Symfony\Component\DomCrawler\Crawler as DomCrawler;
use InvalidArgumentException;

class Crawler
{
Expand Down Expand Up @@ -327,8 +328,13 @@ protected function extractAllLinks(string $html, UriInterface $foundOnUrl): Coll

return collect($domCrawler->filterXpath('//a')->links())
->map(function (Link $link) {
return new Uri($link->getUri());
});
try {
return new Uri($link->getUri());
} catch (InvalidArgumentException $exception) {
return null;
}
})
->filter();
}

protected function normalizeUrl(UriInterface $url): UriInterface
Expand Down
20 changes: 20 additions & 0 deletions tests/CrawlerTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,26 @@ public function it_uses_crawl_profile_for_internal_urls()
]);
}

/**
 * Crawls the `/invalid-url` fixture page, whose only link has a malformed
 * href, and expects the page itself to be reported as crawled once.
 *
 * @test
 */
public function it_can_handle_pages_with_invalid_urls()
{
    // Profile that approves every url it is asked about.
    $acceptAllProfile = new class implements CrawlProfile {
        public function shouldCrawl(UriInterface $url): bool
        {
            return true;
        }
    };

    // The start url is given without a scheme; the logged url is expected
    // to carry the `http://` prefix.
    $expectedCrawls = [
        ['url' => 'http://localhost:8080/invalid-url'],
    ];

    Crawler::create()
        ->setCrawlObserver(new CrawlLogger())
        ->setCrawlProfile($acceptAllProfile)
        ->startCrawling('localhost:8080/invalid-url');

    $this->assertCrawledOnce($expectedCrawls);
}

/** @test */
public function it_respects_the_maximum_amount_of_urls_to_be_crawled()
{
Expand Down
4 changes: 4 additions & 0 deletions tests/server/server.js
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,10 @@ app.get('/dir/subdir/link6', function (request, response) {
response.end('You are on /dir/subdir/link6<a href="/link1">link 1</a>');
});

// Fixture route: serves a body containing a single anchor whose href
// ("https:///AfyaVzw") has a scheme but an empty host, so that clients
// parsing the link encounter a malformed url.
app.get('/invalid-url', (req, res) => {
    const body = 'There is an <a href="https:///AfyaVzw">invalid</a> url';

    res.end(body);
});

let server = app.listen(8080, function () {
const host = 'localhost';
const port = server.address().port;
Expand Down

0 comments on commit 66151fe

Please sign in to comment.