Skip to content

Commit

Permalink
Execute javascript before observers are run (#166)
Browse files Browse the repository at this point in the history
* Execute javascript before observers are run

* Apply fixes from StyleCI
  • Loading branch information
brendt authored and freekmurze committed Jun 25, 2018
1 parent f55f259 commit 8f63cde
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 13 deletions.
17 changes: 17 additions & 0 deletions src/Handlers/CrawlRequestFulfilled.php
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,11 @@
use Spatie\Crawler\CrawlUrl;
use Spatie\Crawler\LinkAdder;
use Spatie\Crawler\CrawlerRobots;
use Psr\Http\Message\UriInterface;
use Spatie\Crawler\CrawlSubdomains;
use Psr\Http\Message\StreamInterface;
use Psr\Http\Message\ResponseInterface;
use function GuzzleHttp\Psr7\stream_for;

class CrawlRequestFulfilled
{
Expand All @@ -35,6 +37,12 @@ public function __invoke(ResponseInterface $response, $index)

$crawlUrl = $this->crawler->getCrawlQueue()->getUrlById($index);

if ($this->crawler->mayExecuteJavaScript()) {
$html = $this->getBodyAfterExecutingJavaScript($crawlUrl->url);

$response = $response->withBody(stream_for($html));
}

$this->handleCrawled($response, $crawlUrl);

if (! $this->crawler->getCrawlProfile() instanceof CrawlSubdomains) {
Expand Down Expand Up @@ -65,4 +73,13 @@ protected function convertBodyToString(StreamInterface $bodyStream, $readMaximum

return $body;
}

/**
 * Fetch the body HTML for a URL through the crawler's Browsershot instance.
 *
 * The URL is cast to a string, handed to Browsershot via setUrl(), and the
 * value returned by bodyHtml() is passed through html_entity_decode().
 * NOTE(review): bodyHtml() is presumably the DOM after scripts have run,
 * as the method name suggests; confirm against Browsershot.
 *
 * @param \Psr\Http\Message\UriInterface $url
 *
 * @return string
 */
protected function getBodyAfterExecutingJavaScript(UriInterface $url): string
{
    $renderedBody = $this->crawler
        ->getBrowsershot()
        ->setUrl((string) $url)
        ->bodyHtml();

    return html_entity_decode($renderedBody);
}
}
13 changes: 0 additions & 13 deletions src/LinkAdder.php
Original file line number Diff line number Diff line change
Expand Up @@ -57,10 +57,6 @@ public function addFromHtml(string $html, UriInterface $foundOnUrl)
*/
protected function extractLinksFromHtml(string $html, UriInterface $foundOnUrl)
{
if ($this->crawler->mayExecuteJavaScript()) {
$html = $this->getBodyAfterExecutingJavaScript($foundOnUrl);
}

$domCrawler = new DomCrawler($html, $foundOnUrl);

return collect($domCrawler->filterXpath('//a')->links())
Expand Down Expand Up @@ -101,13 +97,4 @@ protected function shouldCrawl(Node $node): bool

return $node->getDepth() <= $maximumDepth;
}

/**
 * Obtain the body HTML of the page a link was found on, using the
 * Browsershot instance exposed by the crawler.
 *
 * The result of bodyHtml() is run through html_entity_decode() before it
 * is returned.
 * NOTE(review): presumably bodyHtml() reflects the page after JavaScript
 * execution; verify against Browsershot.
 *
 * @param \Psr\Http\Message\UriInterface $foundOnUrl
 *
 * @return string
 */
protected function getBodyAfterExecutingJavaScript(UriInterface $foundOnUrl): string
{
    $browser = $this->crawler->getBrowsershot();
    $browser = $browser->setUrl((string) $foundOnUrl);

    return html_entity_decode($browser->bodyHtml());
}
}

0 comments on commit 8f63cde

Please sign in to comment.