From 13f8332cb521a409c7646ea256f14b9d204079ab Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Tue, 31 Oct 2023 22:47:43 -0300 Subject: [PATCH] Various trims. Last minute stuff --- .../OcrPostProcessor.php | 33 ++++++++++--------- .../SystemBinaryPostProcessor.php | 2 +- .../TextPostProcessor.php | 6 ++-- .../WebPageTextPostProcessor.php | 2 +- ...rawberryRunnersPostProcessorPluginBase.php | 2 +- strawberry_runners.install | 2 +- 6 files changed, 24 insertions(+), 23 deletions(-) diff --git a/src/Plugin/StrawberryRunnersPostProcessor/OcrPostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/OcrPostProcessor.php index b9a2ecb..cda3298 100644 --- a/src/Plugin/StrawberryRunnersPostProcessor/OcrPostProcessor.php +++ b/src/Plugin/StrawberryRunnersPostProcessor/OcrPostProcessor.php @@ -304,7 +304,7 @@ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug $config = $this->getConfiguration(); $timeout = $config['timeout']; // in seconds - $file_languages = isset($io->input->lang) ? (array) $io->input->lang : [$config['language_default'] ? trim($config['language_default']) : 'eng']; + $file_languages = isset($io->input->lang) ? (array) $io->input->lang : [$config['language_default'] ? trim($config['language_default'] ?? '') : 'eng']; if (isset($io->input->{$input_property}) && $file_uuid && $node_uuid) { $output = new \stdClass(); // To be used by miniOCR as id in the form of {nodeuuid}/canvas/{fileuuid}/p{pagenumber} @@ -353,7 +353,7 @@ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug $width = $io->input->metadata['flv:exif']['ImageWidth'] ?? NULL; $height = $io->input->metadata['flv:exif']['ImageHeight'] ?? NULL; } - + if ($width && $height) { // Cast them to INT to make sure we are matching exactly $width = (int)$width; @@ -506,7 +506,7 @@ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug PHP_EOL . 
" ", $output->searchapi['fulltext'])) : ''; $output->searchapi['metadata'] = []; // Check if NPL processing is enabled and if so do it. - if ($config['nlp'] && !empty($config['nlp_url']) && strlen(trim($page_text)) > 0) { + if ($config['nlp'] && !empty($config['nlp_url']) && strlen(trim($page_text ?? '')) > 0) { $nlp = new NlpClient($config['nlp_url']); if ($nlp) { $capabilities = $nlp->get_call('/status', NULL); @@ -674,7 +674,7 @@ public function buildExecutableCommand(\stdClass $io) { if (!empty($tesseract_input_filename)) { $this->instanceFiles[] = $tesseract_input_filename; - if (strlen(trim($datafolder_tesseract))>0) { + if (strlen(trim($datafolder_tesseract ?? '')) >0) { $arguments_tesseract = ' --tessdata-dir ' . $datafolder_tesseract . ' ' . $arguments_tesseract; } $arguments_tesseract = str_replace('%s', '', $arguments_tesseract); @@ -696,6 +696,7 @@ public function buildExecutableCommand(\stdClass $io) { // Only return $command if it contains the original filepath somewhere if (strpos($command, $file_path) !== FALSE) { return $command; + error_log($command); } return NULL; } @@ -765,7 +766,7 @@ protected function hOCRtoMiniOCR($output, $pageid) { $titleparts = explode(';', $page['title']); $pagetitle = NULL; foreach ($titleparts as $titlepart) { - $titlepart = trim($titlepart); + $titlepart = trim($titlepart ?? ''); $title_pos = strpos($titlepart, 'bbox'); // External/old HOCR might have more data before the bbox. if ($title_pos !== FALSE) { @@ -785,7 +786,7 @@ protected function hOCRtoMiniOCR($output, $pageid) { $miniocr->startElement("p"); $miniocr->writeAttribute("xml:id", 'sequence_' . $pageid); $miniocr->writeAttribute("wh", - ltrim($pwidth, 0) . " " . ltrim($pheight, 0)); + ltrim($pwidth ?? '', 0) . " " . ltrim($pheight ?? 
'', 0)); $miniocr->startElement("b"); $page->registerXPathNamespace('ns', 'http://www.w3.org/1999/xhtml'); foreach ($page->xpath('.//ns:span[@class="ocr_line"]') as $line) { @@ -798,17 +799,17 @@ protected function hOCRtoMiniOCR($output, $pageid) { $y0 = (float) $wcoos[2]; $x1 = (float) $wcoos[3]; $y1 = (float) $wcoos[4]; - $l = ltrim(sprintf('%.3f', ($x0 / $pwidth)), 0); - $t = ltrim(sprintf('%.3f', ($y0 / $pheight)), 0); - $w = ltrim(sprintf('%.3f', (($x1 - $x0) / $pwidth)), 0); - $h = ltrim(sprintf('%.3f', (($y1 - $y0) / $pheight)), 0); + $l = ltrim(sprintf('%.3f', ($x0 / $pwidth)) ?? '', 0); + $t = ltrim(sprintf('%.3f', ($y0 / $pheight)) ?? '', 0); + $w = ltrim(sprintf('%.3f', (($x1 - $x0) / $pwidth)) ?? '', 0); + $h = ltrim(sprintf('%.3f', (($y1 - $y0) / $pheight)) ?? '', 0); $text = (string) $word; if ($notFirstWord) { $miniocr->text(' '); } $notFirstWord = TRUE; // New OCR Highlight does not like empty tags at all - if (strlen(trim($text)) > 0) { + if (strlen(trim($text ?? '')) > 0) { $miniocr->startElement("w"); $miniocr->writeAttribute("x", $l . ' ' . $t . ' ' . $w . ' ' . $h); @@ -881,10 +882,10 @@ protected function ALTOtoMiniOCR($output, $pageid) { $width_rel = (float) $child_node['WIDTH'] / $pageWidthPts; $height_rel = (float) $child_node['HEIGHT'] / $pageHeightPts; - $l = ltrim(sprintf('%.3f', $hpos_rel), 0); - $t = ltrim(sprintf('%.3f', $vpos_rel), 0); - $w = ltrim(sprintf('%.3f', $width_rel), 0); - $h = ltrim(sprintf('%.3f', $height_rel), 0); + $l = ltrim(sprintf('%.3f', $hpos_rel) ?? '', 0); + $t = ltrim(sprintf('%.3f', $vpos_rel) ?? '', 0); + $w = ltrim(sprintf('%.3f', $width_rel) ?? '', 0); + $h = ltrim(sprintf('%.3f', $height_rel) ?? '', 0); // New OCR Highlight > 0.71 does not like empty tags at all if (strlen(trim($child_node['CONTENT'] ?? 
"")) > 0) { @@ -948,7 +949,7 @@ protected function areTesseractLanguages($execpath_tesseract, $datafolder_tesser ->verifyCommand($execpath_tesseract)) { // --tessdata-dir /usr/share/tessdata - if ($datafolder_tesseract && strlen(trim($datafolder_tesseract)) >0 ) { + if ($datafolder_tesseract && strlen(trim($datafolder_tesseract ?? '')) > 0) { $execpath_tesseract = $execpath_tesseract . ' --tessdata-dir '. escapeshellarg( $datafolder_tesseract); } $execpath_tesseract = $execpath_tesseract .' --list-langs'; diff --git a/src/Plugin/StrawberryRunnersPostProcessor/SystemBinaryPostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/SystemBinaryPostProcessor.php index e5812f6..94bfc90 100644 --- a/src/Plugin/StrawberryRunnersPostProcessor/SystemBinaryPostProcessor.php +++ b/src/Plugin/StrawberryRunnersPostProcessor/SystemBinaryPostProcessor.php @@ -266,7 +266,7 @@ public function buildExecutableCommand(\stdClass $io) { //Ok, let's try to get the an extension if there is one // will be 4 characters after $extension = substr($arguments, (int) $pos + 8, 5); - $extension = trim($extension); + $extension = trim($extension ?? ''); $extension = (strpos($extension, '.') === 0) && strlen($extension) >= 4 ? $extension : ''; $out_file_path = $this->temporary_directory . '/' . pathinfo($file_path, PATHINFO_FILENAME); diff --git a/src/Plugin/StrawberryRunnersPostProcessor/TextPostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/TextPostProcessor.php index e5c83e2..043681a 100644 --- a/src/Plugin/StrawberryRunnersPostProcessor/TextPostProcessor.php +++ b/src/Plugin/StrawberryRunnersPostProcessor/TextPostProcessor.php @@ -238,7 +238,7 @@ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug $node_uuid = isset($io->input->nuuid) ? $io->input->nuuid : NULL; $file_path = isset($io->input->{$input_property}) ? $io->input->{$input_property} : NULL; $config = $this->getConfiguration(); - $file_languages = isset($io->input->lang) ? 
(array) $io->input->lang : [$config['language_default'] ? trim($config['language_default']) : 'eng']; + $file_languages = isset($io->input->lang) ? (array) $io->input->lang : [$config['language_default'] ? trim($config['language_default'] ?? '') : 'eng']; if ($file_path && $file_uuid && $node_uuid) { $output = new \stdClass(); // Let's see if we need an output path or not @@ -281,7 +281,7 @@ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug if (json_last_error() == JSON_ERROR_NONE) { $page_text = ''; array_walk_recursive($page_array, function ($item, $key) use (&$page_text){$page_text .= $key.' '. $item .' ';}); - $page_text = trim($page_text); + $page_text = trim($page_text ?? ''); } } $output->searchapi['fulltext'] @@ -302,7 +302,7 @@ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug // Check if NPL processing is enabled and if so do it. if ($config['nlp'] && !empty($config['nlp_url']) && strlen( - trim($page_text) + trim($page_text ?? '') ) > 0 ) { $nlp = new NlpClient($config['nlp_url']); diff --git a/src/Plugin/StrawberryRunnersPostProcessor/WebPageTextPostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/WebPageTextPostProcessor.php index 4fa5507..aebd46a 100644 --- a/src/Plugin/StrawberryRunnersPostProcessor/WebPageTextPostProcessor.php +++ b/src/Plugin/StrawberryRunnersPostProcessor/WebPageTextPostProcessor.php @@ -188,7 +188,7 @@ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug $page_text = preg_replace('/[\x0D]/', '', $page_text); $page_ts = $page_info['ts'] ?? date("c"); // Check if NPL processing is enabled and if so do it. - if ($config['nlp'] && !empty($config['nlp_url']) && strlen(trim($page_text)) > 0 ) { + if ($config['nlp'] && !empty($config['nlp_url']) && strlen(trim($page_text ?? 
'')) > 0 ) { $nlp = new NlpClient($config['nlp_url']); if ($nlp) { $languages_enabled = []; diff --git a/src/Plugin/StrawberryRunnersPostProcessorPluginBase.php b/src/Plugin/StrawberryRunnersPostProcessorPluginBase.php index a50e4ab..462414d 100644 --- a/src/Plugin/StrawberryRunnersPostProcessorPluginBase.php +++ b/src/Plugin/StrawberryRunnersPostProcessorPluginBase.php @@ -218,7 +218,7 @@ protected function proc_execute($command, $timeout = 5) { $ppid = $status['pid']; exec('ps -o pid,ppid', $ps_out); for($i = 1; $i <= count($ps_out) - 1; $i++) { - $pid_row = preg_split('/\s+/', trim($ps_out[$i])); + $pid_row = preg_split('/\s+/', trim($ps_out[$i] ?? '')); if (((int)$pid_row[1] ?? '') == $ppid && is_numeric(($pid_row[0] ?? ''))) { $pid_to_kill = (int) $pid_row[0]; $this->kill($pid_to_kill); diff --git a/strawberry_runners.install b/strawberry_runners.install index 92883d8..d3367dc 100644 --- a/strawberry_runners.install +++ b/strawberry_runners.install @@ -75,7 +75,7 @@ function strawberry_runners_update_8904() { $pdfalto_executable = `which pdfalto`; if ($pdfalto_executable) { // Remove trailing newline. - $pdfalto_executable = trim($pdfalto_executable); + $pdfalto_executable = trim($pdfalto_executable ?? ''); $plugin_config_defaults = [ 'path_pdfalto' => $pdfalto_executable, 'arguments_pdfalto' => '%file',