Skip to content

Commit

Permalink
Merge pull request #169 from spatie/robots-txt
Browse files Browse the repository at this point in the history
Robots txt
  • Loading branch information
brendt authored May 8, 2018
2 parents 4bb5697 + 472dd6f commit 381ee25
Show file tree
Hide file tree
Showing 6 changed files with 64 additions and 2 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@

All notable changes to `laravel-sitemap` will be documented in this file

## 5.2.0 - 2018-05-08

- Support robots checks.

## 5.1.0 - 2018-04-30

- add support for a maximum amount of tags in one sitemap
Expand Down
14 changes: 14 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -267,6 +267,20 @@ SitemapGenerator::create('https://example.com')
->writeToFile($sitemapPath);
```

#### Configuring the crawler

The crawler itself can be [configured](https://github.com/spatie/crawler#usage) to do a few different things.

You can configure the crawler used by the sitemap generator, for example to ignore robots checks, like so:

```php
SitemapGenerator::create('http://localhost:4020')
->configureCrawler(function (Crawler $crawler) {
$crawler->ignoreRobots();
})
->writeToFile($file);
```

#### Limiting the amount of pages crawled

You can limit the amount of pages crawled by calling `setMaximumCrawlCount`
Expand Down
2 changes: 1 addition & 1 deletion composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
"php": "^7.1",
"illuminate/support": "~5.5.0|~5.6.0",
"nesbot/carbon": "^1.21",
"spatie/crawler": "^4.0.3",
"spatie/crawler": "^4.1.0",
"spatie/temporary-directory": "^1.1"
},
"require-dev": {
Expand Down
8 changes: 8 additions & 0 deletions src/SitemapGenerator.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

namespace Spatie\Sitemap;

use Closure;
use GuzzleHttp\Psr7\Uri;
use Spatie\Crawler\Crawler;
use Spatie\Sitemap\Tags\Url;
Expand Down Expand Up @@ -59,6 +60,13 @@ public function __construct(Crawler $crawler)
};
}

public function configureCrawler(Closure $closure): self
{
call_user_func_array($closure, [$this->crawler]);

return $this;
}

public function setConcurrency(int $concurrency)
{
$this->concurrency = $concurrency;
Expand Down
26 changes: 26 additions & 0 deletions tests/SitemapGeneratorTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
namespace Spatie\Sitemap\Test;

use Throwable;
use Spatie\Crawler\Crawler;
use Spatie\Sitemap\Tags\Url;
use Psr\Http\Message\UriInterface;
use Spatie\Sitemap\SitemapGenerator;
Expand Down Expand Up @@ -103,6 +104,31 @@ public function it_will_not_crawl_an_url_if_should_crawl_returns_false()
$this->assertMatchesXmlSnapshot(file_get_contents($sitemapPath));
}

/** @test */
public function it_will_not_crawl_an_url_if_listed_in_robots_txt()
{
$sitemapPath = $this->temporaryDirectory->path('test.xml');

SitemapGenerator::create('http://localhost:4020')
->writeToFile($sitemapPath);

$this->assertNotContains('/not-allowed', file_get_contents($sitemapPath));
}

/** @test */
public function it_will_crawl_an_url_if_robots_txt_check_is_disabled()
{
$sitemapPath = $this->temporaryDirectory->path('test.xml');

SitemapGenerator::create('http://localhost:4020')
->configureCrawler(function (Crawler $crawler) {
$crawler->ignoreRobots();
})
->writeToFile($sitemapPath);

$this->assertContains('/not-allowed', file_get_contents($sitemapPath));
}

/** @test */
public function it_can_use_a_custom_profile()
{
Expand Down
12 changes: 11 additions & 1 deletion tests/server/server.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
var app = require('express')();

app.get('/', function (req, res) {
var html = ['page1', 'page2', 'page3'].map(function (pageName) {
var html = ['page1', 'page2', 'page3', 'not-allowed'].map(function (pageName) {
return '<a href="' + pageName + '">' + pageName + '</a><br />';
}).join('');

Expand All @@ -15,6 +15,16 @@ app.get('/', function (req, res) {
res.end(html);
});

app.get('/robots.txt', function (req, res) {
var html = 'User-agent: *\n' +
'Disallow: /not-allowed';

console.log('Visited robots.txt and saw\n' + html);

res.writeHead(200, { 'Content-Type': 'text/html' });
res.end(html);
});

app.get('/:page', function (req, res) {
var page = req.params.page;

Expand Down

0 comments on commit 381ee25

Please sign in to comment.