Skip to content

Commit

Permalink
Various trims. Last minute stuff
Browse files Browse the repository at this point in the history
  • Loading branch information
DiegoPino committed Nov 1, 2023
1 parent 0644410 commit 13f8332
Show file tree
Hide file tree
Showing 6 changed files with 24 additions and 23 deletions.
33 changes: 17 additions & 16 deletions src/Plugin/StrawberryRunnersPostProcessor/OcrPostProcessor.php
Original file line number Diff line number Diff line change
Expand Up @@ -304,7 +304,7 @@ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug

$config = $this->getConfiguration();
$timeout = $config['timeout']; // in seconds
$file_languages = isset($io->input->lang) ? (array) $io->input->lang : [$config['language_default'] ? trim($config['language_default']) : 'eng'];
$file_languages = isset($io->input->lang) ? (array) $io->input->lang : [$config['language_default'] ? trim($config['language_default'] ?? '') : 'eng'];
if (isset($io->input->{$input_property}) && $file_uuid && $node_uuid) {
$output = new \stdClass();
// To be used by miniOCR as id in the form of {nodeuuid}/canvas/{fileuuid}/p{pagenumber}
Expand Down Expand Up @@ -353,7 +353,7 @@ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug
$width = $io->input->metadata['flv:exif']['ImageWidth'] ?? NULL;
$height = $io->input->metadata['flv:exif']['ImageHeight'] ?? NULL;
}

if ($width && $height) {
// Cast them to INT to make sure we are matching exactly
$width = (int)$width;
Expand Down Expand Up @@ -506,7 +506,7 @@ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug
PHP_EOL . "<l> ", $output->searchapi['fulltext'])) : '';
$output->searchapi['metadata'] = [];
// Check if NLP processing is enabled and if so do it.
if ($config['nlp'] && !empty($config['nlp_url']) && strlen(trim($page_text)) > 0) {
if ($config['nlp'] && !empty($config['nlp_url']) && strlen(trim($page_text ?? '')) > 0) {
$nlp = new NlpClient($config['nlp_url']);
if ($nlp) {
$capabilities = $nlp->get_call('/status', NULL);
Expand Down Expand Up @@ -674,7 +674,7 @@ public function buildExecutableCommand(\stdClass $io) {

if (!empty($tesseract_input_filename)) {
$this->instanceFiles[] = $tesseract_input_filename;
if (strlen(trim($datafolder_tesseract))>0) {
if (strlen(trim($datafolder_tesseract ?? '')) >0) {
$arguments_tesseract = ' --tessdata-dir ' . $datafolder_tesseract . ' ' . $arguments_tesseract;
}
$arguments_tesseract = str_replace('%s', '', $arguments_tesseract);
Expand All @@ -696,6 +696,7 @@ public function buildExecutableCommand(\stdClass $io) {
// Only return $command if it contains the original filepath somewhere
if (strpos($command, $file_path) !== FALSE) {
return $command;
error_log($command);
}
return NULL;
}
Expand Down Expand Up @@ -765,7 +766,7 @@ protected function hOCRtoMiniOCR($output, $pageid) {
$titleparts = explode(';', $page['title']);
$pagetitle = NULL;
foreach ($titleparts as $titlepart) {
$titlepart = trim($titlepart);
$titlepart = trim($titlepart ?? '');
$title_pos = strpos($titlepart, 'bbox');
// External/old HOCR might have more data before the bbox.
if ($title_pos !== FALSE) {
Expand All @@ -785,7 +786,7 @@ protected function hOCRtoMiniOCR($output, $pageid) {
$miniocr->startElement("p");
$miniocr->writeAttribute("xml:id", 'sequence_' . $pageid);
$miniocr->writeAttribute("wh",
ltrim($pwidth, 0) . " " . ltrim($pheight, 0));
ltrim($pwidth ?? '', 0) . " " . ltrim($pheight ?? '', 0));
$miniocr->startElement("b");
$page->registerXPathNamespace('ns', 'http://www.w3.org/1999/xhtml');
foreach ($page->xpath('.//ns:span[@class="ocr_line"]') as $line) {
Expand All @@ -798,17 +799,17 @@ protected function hOCRtoMiniOCR($output, $pageid) {
$y0 = (float) $wcoos[2];
$x1 = (float) $wcoos[3];
$y1 = (float) $wcoos[4];
$l = ltrim(sprintf('%.3f', ($x0 / $pwidth)), 0);
$t = ltrim(sprintf('%.3f', ($y0 / $pheight)), 0);
$w = ltrim(sprintf('%.3f', (($x1 - $x0) / $pwidth)), 0);
$h = ltrim(sprintf('%.3f', (($y1 - $y0) / $pheight)), 0);
$l = ltrim(sprintf('%.3f', ($x0 / $pwidth)) ?? '', 0);
$t = ltrim(sprintf('%.3f', ($y0 / $pheight)) ?? '', 0);
$w = ltrim(sprintf('%.3f', (($x1 - $x0) / $pwidth)) ?? '', 0);
$h = ltrim(sprintf('%.3f', (($y1 - $y0) / $pheight)) ?? '', 0);
$text = (string) $word;
if ($notFirstWord) {
$miniocr->text(' ');
}
$notFirstWord = TRUE;
// New OCR Highlight does not like empty <w> tags at all
if (strlen(trim($text)) > 0) {
if (strlen(trim($text ?? '')) > 0) {
$miniocr->startElement("w");
$miniocr->writeAttribute("x",
$l . ' ' . $t . ' ' . $w . ' ' . $h);
Expand Down Expand Up @@ -881,10 +882,10 @@ protected function ALTOtoMiniOCR($output, $pageid) {
$width_rel = (float) $child_node['WIDTH'] / $pageWidthPts;
$height_rel = (float) $child_node['HEIGHT'] / $pageHeightPts;

$l = ltrim(sprintf('%.3f', $hpos_rel), 0);
$t = ltrim(sprintf('%.3f', $vpos_rel), 0);
$w = ltrim(sprintf('%.3f', $width_rel), 0);
$h = ltrim(sprintf('%.3f', $height_rel), 0);
$l = ltrim(sprintf('%.3f', $hpos_rel) ?? '', 0);
$t = ltrim(sprintf('%.3f', $vpos_rel) ?? '', 0);
$w = ltrim(sprintf('%.3f', $width_rel) ?? '', 0);
$h = ltrim(sprintf('%.3f', $height_rel) ?? '', 0);

// New OCR Highlight > 0.71 does not like empty <w> tags at all
if (strlen(trim($child_node['CONTENT'] ?? "")) > 0) {
Expand Down Expand Up @@ -948,7 +949,7 @@ protected function areTesseractLanguages($execpath_tesseract, $datafolder_tesser
->verifyCommand($execpath_tesseract)) {
// --tessdata-dir /usr/share/tessdata

if ($datafolder_tesseract && strlen(trim($datafolder_tesseract)) >0 ) {
if ($datafolder_tesseract && strlen(trim($datafolder_tesseract ?? '') >0 )) {
$execpath_tesseract = $execpath_tesseract . ' --tessdata-dir '. escapeshellarg( $datafolder_tesseract);
}
$execpath_tesseract = $execpath_tesseract .' --list-langs';
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -266,7 +266,7 @@ public function buildExecutableCommand(\stdClass $io) {
// Ok, let's try to get an extension if there is one
// will be 4 characters after
$extension = substr($arguments, (int) $pos + 8, 5);
$extension = trim($extension);
$extension = trim($extension ?? '');
$extension = (strpos($extension, '.') === 0) && strlen($extension) >= 4 ? $extension : '';

$out_file_path = $this->temporary_directory . '/' . pathinfo($file_path, PATHINFO_FILENAME);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -238,7 +238,7 @@ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug
$node_uuid = isset($io->input->nuuid) ? $io->input->nuuid : NULL;
$file_path = isset($io->input->{$input_property}) ? $io->input->{$input_property} : NULL;
$config = $this->getConfiguration();
$file_languages = isset($io->input->lang) ? (array) $io->input->lang : [$config['language_default'] ? trim($config['language_default']) : 'eng'];
$file_languages = isset($io->input->lang) ? (array) $io->input->lang : [$config['language_default'] ? trim($config['language_default'] ?? '') : 'eng'];
if ($file_path && $file_uuid && $node_uuid) {
$output = new \stdClass();
// Let's see if we need an output path or not
Expand Down Expand Up @@ -281,7 +281,7 @@ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug
if (json_last_error() == JSON_ERROR_NONE) {
$page_text = '';
array_walk_recursive($page_array, function ($item, $key) use (&$page_text){$page_text .= $key.' '. $item .' ';});
$page_text = trim($page_text);
$page_text = trim($page_text ?? '');
}
}
$output->searchapi['fulltext']
Expand All @@ -302,7 +302,7 @@ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug
// Check if NLP processing is enabled and if so do it.
if ($config['nlp'] && !empty($config['nlp_url'])
&& strlen(
trim($page_text)
trim($page_text ?? '')
) > 0
) {
$nlp = new NlpClient($config['nlp_url']);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,7 @@ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug
$page_text = preg_replace('/[\x0D]/', '', $page_text);
$page_ts = $page_info['ts'] ?? date("c");
// Check if NLP processing is enabled and if so do it.
if ($config['nlp'] && !empty($config['nlp_url']) && strlen(trim($page_text)) > 0 ) {
if ($config['nlp'] && !empty($config['nlp_url']) && strlen(trim($page_text ?? '')) > 0 ) {
$nlp = new NlpClient($config['nlp_url']);
if ($nlp) {
$languages_enabled = [];
Expand Down
2 changes: 1 addition & 1 deletion src/Plugin/StrawberryRunnersPostProcessorPluginBase.php
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,7 @@ protected function proc_execute($command, $timeout = 5) {
$ppid = $status['pid'];
exec('ps -o pid,ppid', $ps_out);
for($i = 1; $i <= count($ps_out) - 1; $i++) {
$pid_row = preg_split('/\s+/', trim($ps_out[$i]));
$pid_row = preg_split('/\s+/', trim($ps_out[$i] ?? ''));
if (((int)$pid_row[1] ?? '') == $ppid && is_numeric(($pid_row[0] ?? ''))) {
$pid_to_kill = (int) $pid_row[0];
$this->kill($pid_to_kill);
Expand Down
2 changes: 1 addition & 1 deletion strawberry_runners.install
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ function strawberry_runners_update_8904() {
$pdfalto_executable = `which pdfalto`;
if ($pdfalto_executable) {
// Remove trailing newline.
$pdfalto_executable = trim($pdfalto_executable);
$pdfalto_executable = trim($pdfalto_executable ?? '');
$plugin_config_defaults = [
'path_pdfalto' => $pdfalto_executable,
'arguments_pdfalto' => '%file',
Expand Down

0 comments on commit 13f8332

Please sign in to comment.