Skip to content

Commit

Permalink
Added capability of crawling links with rel="next" or rel="prev". (#207)
Browse files Browse the repository at this point in the history
* Added capability of crawling links with rel="next" or rel="prev".

* Removed extra closing head tag in test server.
  • Loading branch information
Redominus authored and freekmurze committed Mar 11, 2019
1 parent 2ae1901 commit 64303c3
Show file tree
Hide file tree
Showing 4 changed files with 54 additions and 44 deletions.
2 changes: 1 addition & 1 deletion src/LinkAdder.php
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ protected function extractLinksFromHtml(string $html, UriInterface $foundOnUrl)
{
$domCrawler = new DomCrawler($html, $foundOnUrl);

return collect($domCrawler->filterXpath('//a')->links())
return collect($domCrawler->filterXpath('//a | //link[@rel="next" or @rel="prev"]')->links())
->reject(function (Link $link) {
return $link->getNode()->getAttribute('rel') === 'nofollow';
})
Expand Down
2 changes: 2 additions & 0 deletions tests/CrawlerTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -392,6 +392,8 @@ protected function regularUrls(): array
return [
['url' => 'http://localhost:8080/'],
['url' => 'http://localhost:8080/link1', 'foundOn' => 'http://localhost:8080/'],
['url' => 'http://localhost:8080/link1-prev', 'foundOn' => 'http://localhost:8080/link1'],
['url' => 'http://localhost:8080/link1-next', 'foundOn' => 'http://localhost:8080/link1'],
['url' => 'http://localhost:8080/link2', 'foundOn' => 'http://localhost:8080/'],
['url' => 'http://localhost:8080/link3', 'foundOn' => 'http://localhost:8080/link2'],
['url' => 'http://localhost:8080/notExists', 'foundOn' => 'http://localhost:8080/link3'],
Expand Down
84 changes: 42 additions & 42 deletions tests/server/package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

10 changes: 9 additions & 1 deletion tests/server/server.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,21 @@ app.get('/', function (request, response) {
});

// Serves /link1, the page the crawler tests expect to lead to /link1-prev and /link1-next.
// NOTE(review): this is a rendered diff (9 additions & 1 deletion for this file), so both
// the removed and the added response.end line appear below. Presumably only the second one,
// which adds a <head> containing <link rel="next"> and <link rel="prev">, exists after the
// commit -- confirm against the actual server.js.
app.get('/link1', function (request, response) {
response.end('<html><body><script>var url = \'/javascript\';document.body.innerHTML = document.body.innerHTML + "<a href=\'" + url + "\'>Javascript Link</a>"</script>You are on link1<a href="http://example.com/">External Link</a></body></html>');
response.end('<html><head><link rel="next" href="/link1-next"><link rel="prev" href="/link1-prev"></head><body><script>var url = \'/javascript\';document.body.innerHTML = document.body.innerHTML + "<a href=\'" + url + "\'>Javascript Link</a>"</script>You are on link1<a href="http://example.com/">External Link</a></body></html>');
});

// Serves /javascript with a fixed plain-text body.
// The /link1 page only links here through an anchor that its inline <script> appends to
// the document, so a crawler finds this URL only when it executes JavaScript.
app.get('/javascript', function (request, response) {
response.end('This page can only be reached if JavaScript is being executed');
});

// Serves /link1-next with a fixed plain-text body.
// This is the href of the <link rel="next"> element in the <head> of the /link1 page.
app.get('/link1-next', function (request, response) {
response.end('You are on link1-next. Next page of link1');
});

// Serves /link1-prev with a fixed plain-text body.
// This is the href of the <link rel="prev"> element in the <head> of the /link1 page.
app.get('/link1-prev', function (request, response) {
response.end('You are on link1-prev. Previous page of link1');
});

// Serves /nofollow with a fixed plain-text body.
// NOTE(review): presumably another test page links here with rel="nofollow" so the tests
// can check that the crawler never requests it -- the linking page is not visible in this
// diff, so confirm.
app.get('/nofollow', function (request, response) {
response.end('This page should not be crawled');
});
Expand Down

0 comments on commit 64303c3

Please sign in to comment.