From a91167714f1cf69e78e71c897bb0d6fafa3b6215 Mon Sep 17 00:00:00 2001 From: Dimitris Spachos Date: Wed, 4 Nov 2020 17:38:33 +0200 Subject: [PATCH] SRE-62 Broken links sitemap check --- src/Audit/BrokenLinksSitemap.php | 143 ++++++++++++++++++++++++------- 1 file changed, 113 insertions(+), 30 deletions(-) diff --git a/src/Audit/BrokenLinksSitemap.php b/src/Audit/BrokenLinksSitemap.php index 9390fc0..092e888 100644 --- a/src/Audit/BrokenLinksSitemap.php +++ b/src/Audit/BrokenLinksSitemap.php @@ -5,7 +5,6 @@ use Drutiny\Audit; use Drutiny\Sandbox\Sandbox; use Drutiny\Annotation\Token; -use Drutiny\Annotation\Param; use Drutiny\Target\DrushTarget; /** @@ -25,7 +24,7 @@ class BrokenLinksSitemap extends Audit { /** - * Check that Curl exists. + * Check that curl exists. * * @return boolean */ @@ -37,71 +36,155 @@ protected function requireCurl(){ } } + /** + * Check that target is actually a DrushTarget + * + * @param Sandbox $sandbox + * @return boolean TRUE when the sandbox target is an instance of DrushTarget + */ + protected function requireDrushTarget(Sandbox $sandbox){ + return $sandbox->getTarget() instanceof DrushTarget; + } + /** * Finds if status is between range. * * @param integer $int * @param integer $min * @param integer $max - * @return void + * @return boolean TRUE when $int lies between $min and $max, both inclusive */ protected function statusRange($int,$min,$max){ return ($min<=$int && $int<=$max); } + /** + * Runs find for sitemap.xml files under $dir and appends one URL per hit, built from $uri plus the relative path + * + * @param Sandbox $sandbox + * @param array $files_to_check List of sitemap URLs, extended by reference + * @return void + */ + private function searchDirectoriesForSitemap(Sandbox $sandbox, &$files_to_check, $dir, $uri) { + // NOTE(review): $dir is interpolated into the shell command unescaped; confirm it is always trusted + $command = "cd $dir && find . -type f -name 'sitemap.xml'"; + $output = $sandbox->exec($command); + $lines = array_filter(explode(PHP_EOL, $output)); + foreach ($lines as $line) { + $files_to_check[] = $uri . '/' . 
str_replace('./', '',$line); + } + } + + /** + * Requests $url with curl, following redirects, and returns the http_code reported by curl_getinfo + * + * @param string $url + * @return integer HTTP status code; an \Exception carrying the curl error message is thrown when curl reports an error. NOTE(review): the return NULL after the throw is unreachable, and curl_close is skipped on that path + */ private function checkUrl($url) { $ch = curl_init ($url); // TODO Revise this values curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10); - curl_setopt($ch, CURLOPT_TIMEOUT, 10); + curl_setopt($ch, CURLOPT_TIMEOUT, 15); curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false); curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1); curl_setopt($ch, CURLOPT_HEADER, 1); - $output = curl_exec ($ch); + curl_exec ($ch); + + if(curl_errno($ch)){ + throw new \Exception(curl_error($ch)); + return NULL; + } + $status = curl_getinfo($ch)['http_code']; curl_close($ch); return $status; } /** - * @inheritdoc + * Requests every loc entry of the sitemap at $uri and appends those whose status is not 200 to $broken_links + * + * @param string $uri Sitemap URL; $broken_links, $total_links_count and $total_execution_time are accumulators updated by reference + * @return void */ - public function audit(Sandbox $sandbox) { - $depth = $sandbox->getParameter('depth'); - - $sdata = $sandbox->drush(['format' => 'json'])->status(); - $uri = $sdata['uri']; - - $msg = ''; - $checkForFiles = array('sitemap.xml'); - $broken_links = []; - foreach($checkForFiles as $file){ - $url = $uri . '/' . $file; - $status = $this->checkUrl($url); - if ( $status !== 200 ){ - // Do something + private function checkSitemap($uri, &$broken_links, &$total_links_count, &$total_execution_time) { + // Checking this as sometimes the main sitemap.xml + // is not a file in the system but a dynamically generated one + $status = $this->checkUrl($uri); + if ( $status !== 200 || $status === NULL){ + return; } else { - $results = []; - $links = json_decode(json_encode(simplexml_load_file($url) ), TRUE); + $links = json_decode(json_encode(simplexml_load_file($uri) ), TRUE); $items = array_shift($links); - $msg = 'Total number of links checked:' . count($items) . 
PHP_EOL; - $count =0 ; + $total_links_count += count($items); $time_start = microtime(true); foreach ($items as $item) { - $s = $this->checkUrl($item['loc']); - if ($s !== 200) { - $broken_links[] = [ 'uri' => $item['loc'], 'status' => $s]; + $link_status = $this->checkUrl($item['loc']); + if ($link_status !== 200) { + $broken_links[] = [ 'uri' => $item['loc'], 'status' => $link_status]; } } //.end for $time_end = microtime(true); $execution_time = ($time_end - $time_start)/1; - $msg .= 'Execution time: ' . round($execution_time,1) . ' seconds' . PHP_EOL; + $total_execution_time += round($execution_time,1); } } - // TODO: Search other directories for sitemap.xml files (?) + /** + * Returns $uri unchanged when it already starts with http, otherwise prefixed with https:// + * + * @param string $uri + * @return string + */ + private function getFullUri($uri) { + if ( strpos($uri, 'http') !== 0) { + $uri = 'https://' . $uri; + } + return $uri; + } + + /** + * @inheritdoc + */ + public function audit(Sandbox $sandbox) { + $directory = $sandbox->getParameter('directory', '%root'); + $stat = $sandbox->drush(['format' => 'json'])->status(); + + $uri = $this->getFullUri($stat['uri']); + + $directory = strtr($directory, $stat['%paths']); + + $files_to_check = []; + $total_links_count = 0; + $total_execution_time = 0; + + $status = $this->checkUrl($uri . '/sitemap.xml'); + if ($status === 200) { + $files_to_check[] = $uri . '/sitemap.xml'; + $links = json_decode(json_encode(simplexml_load_file($uri . 
'/sitemap.xml') ), TRUE); + if (isset($links['sitemap'])) { + foreach($links['sitemap'] as $link) { + $files_to_check[] = $link['loc']; + } + } + } else { + // The root sitemap.xml did not answer with HTTP 200 + $msg = 'A sitemap.xml file cannot be found.'; + $sandbox->setParameter('warning_message', $msg); + return Audit::WARNING; + } + $this->searchDirectoriesForSitemap($sandbox, $files_to_check, $directory, $uri); + + $msg = ''; + $broken_links = []; + foreach($files_to_check as $file){ + $this->checkSitemap($file, $broken_links, $total_links_count, $total_execution_time); + } + + $msg .= PHP_EOL . 'Total links checked: ' . $total_links_count . PHP_EOL; + $msg .= PHP_EOL . 'Total execution time: ' . $total_execution_time . PHP_EOL; if (count($broken_links)) { $msg .= PHP_EOL . 'There ' . count($broken_links) .' broken links in the sitemap.xml' . PHP_EOL; @@ -113,7 +196,7 @@ public function audit(Sandbox $sandbox) { return Audit::FAILURE; } - $sandbox->setParameter('status', $msg . 'All links are valid!'); + $sandbox->setParameter('status', 'All links are valid!' . $msg); return Audit::SUCCESS; }