diff --git a/src/Handlers/CrawlRequestFulfilled.php b/src/Handlers/CrawlRequestFulfilled.php index 7df172a..1f9a0e7 100644 --- a/src/Handlers/CrawlRequestFulfilled.php +++ b/src/Handlers/CrawlRequestFulfilled.php @@ -15,6 +15,9 @@ use Spatie\Crawler\CrawlUrl; use Spatie\Crawler\ResponseWithCachedBody; use Spatie\Crawler\UrlParsers\UrlParser; +use Symfony\Component\Process\Exception\ProcessFailedException; +use GuzzleHttp\Psr7\Request; +use GuzzleHttp\Exception\RequestException; class CrawlRequestFulfilled { @@ -39,7 +42,18 @@ public function __invoke(ResponseInterface $response, $index) $crawlUrl = $this->crawler->getCrawlQueue()->getUrlById($index); if ($this->crawler->mayExecuteJavaScript()) { - $body = $this->getBodyAfterExecutingJavaScript($crawlUrl->url); + try { + $body = $this->getBodyAfterExecutingJavaScript($crawlUrl->url); + } catch (ProcessFailedException $exception) { + $request = new Request("GET", $crawlUrl->url); + $exception = new RequestException($exception->getMessage(), $request); + $crawlUrl = $this->crawler->getCrawlQueue()->getUrlById($index); + + $this->crawler->getCrawlObservers()->crawlFailed($crawlUrl, $exception); + + usleep($this->crawler->getDelayBetweenRequests()); + return; + } $response = $response->withBody(Utils::streamFor($body)); } diff --git a/tests/CrawlerTest.php b/tests/CrawlerTest.php index ccfd953..e0ea670 100644 --- a/tests/CrawlerTest.php +++ b/tests/CrawlerTest.php @@ -14,6 +14,7 @@ use Spatie\Crawler\Test\TestClasses\CrawlLogger; use Spatie\Crawler\Test\TestClasses\Log; use stdClass; +use Symfony\Component\Process\Exception\ProcessFailedException; beforeEach(function () { skipIfTestServerIsNotRunning(); @@ -117,6 +118,24 @@ expect(javascriptInjectedUrls())->each->notToBeCrawled(); }); +it('fails gracefully when browsershot fails', function () { + expect(function () { + $browsershot = (new Browsershot)->waitUntilNetworkIdle(); + + Crawler::create([ + RequestOptions::CONNECT_TIMEOUT => 60, + RequestOptions::TIMEOUT => 60, 
+ RequestOptions::READ_TIMEOUT => 60, + ]) + ->setBrowsershot($browsershot) + ->executeJavaScript() + ->setCrawlObserver(new CrawlLogger()) + ->startCrawling('http://localhost:8080/simulate-activity'); + })->not->toThrow(ProcessFailedException::class); + + expect(['url' => 'http://localhost:8080/simulate-activity'])->toBeCrawledOnce(); +}); + it('uses a crawl profile to determine what should be crawled', function () { $crawlProfile = new class() extends CrawlProfile { diff --git a/tests/server/server.js b/tests/server/server.js index bfc8e3e..74489b8 100644 --- a/tests/server/server.js +++ b/tests/server/server.js @@ -211,6 +211,38 @@ app.get('/sitemap2.xml', function (req, res) { res.end(sitemap2); }); +// Route that initiates but never completes the response +app.get('/never-complete', (req, res) => { + req.socket.setTimeout(0); // Disable automatic socket timeout + res.writeHead(200, { 'Content-Type': 'text/plain' }); + res.write('Starting but never completing...\n'); + // Intentionally do not call res.end() or send more data, leaving the response hanging +}); + +app.get('/simulate-activity', (req, res) => { + res.send(` + + + + + + Simulated Network Activity + + +
+                <h1>This page simulates a never-ending network request</h1>
+                <!-- NOTE(review): tags in this span were destroyed by extraction; reconstructed.
+                     The page must generate unending network activity so Browsershot's
+                     waitUntilNetworkIdle() never resolves — the /never-complete route
+                     defined above in this same hunk is the mechanism the patch provides. -->
+                <script>
+                    fetch('/never-complete');
+                </script>
+ + + + `); +}); + let server = app.listen(8080, function () { const host = 'localhost';