From e403cf852e0eb0524989b31016d52402e8542cac Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Fri, 21 May 2021 14:22:17 -0400 Subject: [PATCH 01/12] First pass. Just testing for now --- .../FrictionlessDataPackagePostProcessor.php | 225 ++++++++++++++++++ 1 file changed, 225 insertions(+) create mode 100644 src/Plugin/StrawberryRunnersPostProcessor/FrictionlessDataPackagePostProcessor.php diff --git a/src/Plugin/StrawberryRunnersPostProcessor/FrictionlessDataPackagePostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/FrictionlessDataPackagePostProcessor.php new file mode 100644 index 0000000..92d087f --- /dev/null +++ b/src/Plugin/StrawberryRunnersPostProcessor/FrictionlessDataPackagePostProcessor.php @@ -0,0 +1,225 @@ + 'asstructure', + 'mime_type' => ['application/zip, application\/vnd.datapackage+zip'], + 'path' => '', + 'arguments' => '', + 'output_type' => 'json', + 'output_destination' => 'searchapi', + 'processor_queue_type' => 'background', + 'time_out' => '300', + ] + parent::defaultConfiguration(); + } + + + public function calculateDependencies() { + // Since Processors could be chained we need to check if any other + // processor instance is using an instance of this one + // @TODO: Implement calculateDependencies() method. + } + + public function settingsForm(array $parents, FormStateInterface $form_state) { + + $element['source_type'] = [ + '#type' => 'select', + '#title' => $this->t('The type of source data this processor works on'), + '#options' => [ + 'asstructure' => 'File entities referenced in the as:filetype JSON structure', + 'filepath' => 'Full file paths passed by another processor', + ], + '#default_value' => $this->getConfiguration()['source_type'], + '#description' => $this->t('Select from where the source file this processor needs is fetched'), + '#required' => TRUE, + ]; + + $element['ado_type'] = [ + '#type' => 'textfield', + '#title' => $this->t('ADO type(s) to limit this processor to.'), + '#default_value' => $this->getConfiguration()['ado_type'], + '#description' => $this->t('A single ADO type or a coma delimited list of ado types that qualify to be Processed. Leave empty to apply to all ADOs.'), + ]; + + $element['jsonkey'] = [ + '#type' => 'checkboxes', + '#title' => $this->t('The JSON key that contains the desired source files.'), + '#options' => [ + 'as:document' => 'as:document', + 'as:application' => 'as:application', + ], + '#default_value' => (!empty($this->getConfiguration()['jsonkey']) && is_array($this->getConfiguration()['jsonkey'])) ? $this->getConfiguration()['jsonkey'] : [], + '#states' => [ + 'visible' => [ + ':input[name="pluginconfig[source_type]"]' => ['value' => 'asstructure'], + ], + ], + '#required' => TRUE, + ]; + + $element['mime_type'] = [ + '#type' => 'textfield', + '#title' => $this->t('Mimetypes(s) to limit this Processor to.'), + '#default_value' => $this->getConfiguration()['mime_type'], + '#description' => $this->t('A single Mimetype type or a coma separed list of mimetypes that qualify to be Processed. Leave empty to apply any file'), + ]; + + + $element['resource_path'] = [ + '#type' => 'textfield', + '#title' => $this->t('A relative path of the resource to index'), + '#default_value' => !empty($this->getConfiguration()['arguments']) ? $this->getConfiguration()['arguments'] : 'pages/pages.jsonl', + '#description' => t('Which non binary resource (text, json, csv, jsonl, xml contained in manifest.json of the Data packaged will be extracted and processed. We will do our best to extract the most relevant information based on the format.'), + '#required' => TRUE, + ]; + + $element['output_type'] = [ + '#type' => 'select', + '#title' => $this->t('The expected and desired output of this processor.'), + '#options' => [ + 'entity:file' => 'One or more Files', + 'json' => 'Data/Values that can be serialized to JSON', + ], + '#default_value' => $this->getConfiguration()['output_type'], + '#description' => $this->t('If the output is just data and "One or more Files" is selected all data will be dumped into a file and handled as such.'), + ]; + + $element['output_destination'] = [ + '#type' => 'checkboxes', + '#title' => $this->t("Where and how the output will be used."), + '#options' => [ + 'plugin' => 'As Input for another processor Plugin', + 'searchapi' => 'In a Search API Document using the Strawberryfield Flavor Data Source (e.g used for HOCR highlight)', + ], + '#default_value' => (!empty($this->getConfiguration()['output_destination']) && is_array($this->getConfiguration()['output_destination'])) ? $this->getConfiguration()['output_destination'] : [], + '#description' => t('As Input for another processor Plugin will only have an effect if another Processor is setup to consume this ouput.'), + '#required' => TRUE, + ]; + + $element['processor_queue_type'] = [ + '#type' => 'select', + '#title' => $this->t('The queue to use for this processor.'), + '#options' => [ + 'background' => 'Secondary queue in background', + 'realtime' => 'Primary queue in realtime', + ], + '#default_value' => $this->getConfiguration()['processor_queue_type'], + '#description' => $this->t('The primary queue will be execute in realtime while the Secondary will be execute in background'), + ]; + + $element['timeout'] = [ + '#type' => 'number', + '#title' => $this->t('Timeout in seconds for this process.'), + '#default_value' => $this->getConfiguration()['timeout'], + '#description' => $this->t('If the process runs out of time it can still be processed again.'), + '#size' => 3, + '#maxlength' => 3, + '#min' => 1, + ]; + $element['weight'] = [ + '#type' => 'number', + '#title' => $this->t('Order or execution in the global chain.'), + '#default_value' => $this->getConfiguration()['weight'], + ]; + + return $element; + } + + + public function onDependencyRemoval(array $dependencies) { + // Since Processors could be chained we need to check if any other + // processor instance is using an instance of this one + return parent::onDependencyRemoval( + $dependencies + ); // TODO: Change the autogenerated stub + } + + /** + * Executes the logic of this plugin given a file path and a context. + * + * @param \stdClass $io + * $io->input needs to contain + * \Drupal\strawberry_runners\Annotation\StrawberryRunnersPostProcessor::$input_property + * \Drupal\strawberry_runners\Annotation\StrawberryRunnersPostProcessor::$input_arguments + * $io->output will contain the result of the processor + * @param string $context + */ + public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPluginInterface::PROCESS) { + // Specific input key as defined in the annotation + // In this case it will contain an absolute Path to a File. + // Needed since this executes locally on the server via SHELL. + + $input_property = $this->pluginDefinition['input_property']; + $input_argument = $this->pluginDefinition['input_argument']; + $file_uuid = isset($io->input->metadata['dr:uuid']) ? $io->input->metadata['dr:uuid'] : NULL; + $node_uuid = isset($io->input->nuuid) ? $io->input->nuuid : NULL; + $config = $this->getConfiguration(); + $timeout = $config['timeout']; // in seconds + + if (isset($io->input->{$input_property}) && $file_uuid && $node_uuid) { + // To be used by miniOCR as id in the form of {nodeuuid}/canvas/{fileuuid}/p{pagenumber} + $sequence_number = isset($io->input->{$input_argument}) ? (int) $io->input->{$input_argument} : 1; + $file_path = isset($io->input->{$input_property}) ? $io->input->{$input_property} : NULL; + $package = Package::load($file_path); + foreach ($package as $resource) { + echo $resource->name(); + foreach ($resource as $row) { + echo $row; + } + } + $output = new \stdClass(); + $output->searchapi['fulltext'] = NULL; + $output->plugin = NULL; + $io->output = $output; + } + // Lastly plain text version of the XML. + $io->output->searchapi['plaintext'] = isset($output->searchapi['fulltext']) ? strip_tags(str_replace("", PHP_EOL . " ", $output->searchapi['fulltext'])) : ''; + } + + /** + * Builds a clean Command string using a File path. + * + * @param \stdClass $io + * $io->input needs to contain + * \Drupal\strawberry_runners\Annotation\StrawberryRunnersPostProcessor::$input_property + * \Drupal\strawberry_runners\Annotation\StrawberryRunnersPostProcessor::$input_arguments + * $io->output will contain the result of the processor + * + * @return null|string + */ + public function buildExecutableCommand(\stdClass $io) { + + return NULL; + } + +} From 82aa47ead4e67ccea62989fa2886134277201fd7 Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Sun, 23 May 2021 22:32:32 -0400 Subject: [PATCH 02/12] WACZ Search Index working (First pass) Figured out that WACZ is not 100% frictionlessdata package so had to go for an alternative here. --- .../AbstractPostProcessorQueueWorker.php | 56 +++-- .../WaczPagesSequencePostProcessor.php | 213 ++++++++++++++++++ .../WebPageTextPostProcessor.php | 166 ++++++++++++++ ...rawberryRunnersPostProcessorPluginBase.php | 1 - 4 files changed, 422 insertions(+), 14 deletions(-) create mode 100644 src/Plugin/StrawberryRunnersPostProcessor/WaczPagesSequencePostProcessor.php create mode 100644 src/Plugin/StrawberryRunnersPostProcessor/WebPageTextPostProcessor.php diff --git a/src/Plugin/QueueWorker/AbstractPostProcessorQueueWorker.php b/src/Plugin/QueueWorker/AbstractPostProcessorQueueWorker.php index 1a7556f..eefeeca 100644 --- a/src/Plugin/QueueWorker/AbstractPostProcessorQueueWorker.php +++ b/src/Plugin/QueueWorker/AbstractPostProcessorQueueWorker.php @@ -213,11 +213,22 @@ public function processItem($data) { // If argument is not there we will assume there is a mistake and its // a single one. $data->{$input_argument} = isset($data->{$input_argument}) ? $data->{$input_argument} : 1; + + // In case $data->{$input_argument} is an array/data we will use the key as "sequence" + // Each processor needs to be sure it passes a single item and with a unique key + + if (is_array($data->{$input_argument})) { + $sequence_key = array_key_first($data->{$input_argument}); + } + else { + $sequence_key = (int) $data->{$input_argument}; + } + if (is_a($entity, TranslatableInterface::class)) { $translations = $entity->getTranslationLanguages(); foreach ($translations as $translation_id => $translation) { //@TODO here, the number 1 needs to come from the sequence. - $item_id = $entity->id() . ':' . $data->{$input_argument} . ':' . $translation_id . ':' . $file->uuid() . ':' . $data->plugin_config_entity_id; + $item_id = $entity->id() . ':' . $sequence_key . ':' . $translation_id . ':' . $file->uuid() . ':' . $data->plugin_config_entity_id; // a single 0 as return will force us to reindex. $inindex = $inindex * $this->flavorInSolrIndex($item_id, $data->metadata['checksum'], $indexes); $item_ids[] = $item_id; @@ -239,7 +250,7 @@ public function processItem($data) { $inkeystore = $inkeystore && FALSE; } } - //@TODO allow a force in case of corrupted key value? Partial output + // Allows a force in case of corrupted key value? Partial output // Extragenoxus weird data? if (($inindex === 0 || $inkeystore === FALSE) || $data->force == TRUE) { @@ -248,8 +259,11 @@ public function processItem($data) { // Check if $io->output exists? $toindex = new stdClass(); - $toindex->fulltext = $io->output->searchapi['fulltext']; - $toindex->plaintext = $io->output->searchapi['plaintext']; + $toindex->fulltext = $io->output->searchapi['fulltext'] ?? ''; + $toindex->plaintext = $io->output->searchapi['plaintext'] ?? ''; + $toindex->metadata = $io->output->searchapi['metadata'] ?? []; + $toindex->label = $io->output->searchapi['label'] ?? NULL; + // $siblings will be the amount of total children processors that were // enqueued for a single Processor chain. $toindex->sequence_total = !empty($data->siblings) ? $data->siblings : 1; @@ -316,23 +330,29 @@ public function processItem($data) { // Possible input properties: // - Can come from the original Data (most likely) // - May be overriden by the $io->output, e.g when a processor generates a file that is not part of any node + $input_property_value_from_plugin = TRUE; $input_property_value = isset($io->output->plugin) && isset($io->output->plugin[$input_property]) ? $io->output->plugin[$input_property] : NULL; + // If was not defined by the previous processor try from the main data. if ($input_property_value == NULL) { + $input_property_value_from_plugin = FALSE; $input_property_value = isset($data->{$input_property}) ? $data->{$input_property} : NULL; } + // If still null means the child is incompatible with the parent. We abort. if ($input_property_value == NULL) { - $this->logger->log(LogLevel::WARNING, 'Sorry @childplugin is incompatible with @parentplugin, skipping.', [ - '@parentplugin' => $data->plugin_config_entity_id, - '@childplugin' => $childdata->plugin_config_entity_id, - - ]); + $this->logger->log(LogLevel::WARNING, + 'Sorry @childplugin is incompatible with @parentplugin, skipping.', + [ + '@parentplugin' => $data->plugin_config_entity_id, + '@childplugin' => $childdata->plugin_config_entity_id, + ]); continue; } // Warning Diego. This may lead to a null $childdata->{$input_property} = $input_property_value; $childdata->plugin_config_entity_id = $postprocessor_config_entity->id(); - $input_argument_value = isset($io->output->plugin) && isset($io->output->plugin[$input_argument]) ? $io->output->plugin[$input_argument] : $data->{$input_argument}; + $input_argument_value = isset($io->output->plugin) && isset($io->output->plugin[$input_argument]) ? + $io->output->plugin[$input_argument] : $data->{$input_argument}; // This is a must: Solr indexing requires a list of sequences. A single one // will not be enqueued. if (is_array($input_argument_value)) { @@ -345,6 +365,14 @@ public function processItem($data) { // The count will always be relative to this call // Means count of how many children are being called. $childdata->siblings = count($input_argument_value); + // In case the $input_property_value is an array coming from a plugin we may want to if has the same amount of values of $input_argument_value + // If so its many to one and we only need the corresponding entry to this sequence + if ($input_property_value_from_plugin && + is_array($input_property_value) && + count($input_property_value) == $childdata->siblings && + isset($input_property_value[$value])) { + $childdata->{$input_property} = $input_property_value[$value]; + } Drupal::queue('strawberryrunners_process_background', TRUE) ->createItem($childdata); } @@ -479,6 +507,10 @@ public function flavorInSolrIndex(string $key, string $checksum, array $indexes) $parse_mode = $this->parseModeManager->createInstance('terms'); $query->setParseMode($parse_mode); $query->sort('search_api_relevance', 'DESC'); + $query->setOption('search_api_retrieved_field_values', ['id']); + // Query breaks if not because standard hl is enabled for all fields. + // and normal hl offsets on OCR HL specific ones. + $query->setOption('no_highlight', 'on'); $query->addCondition('search_api_id', 'strawberryfield_flavor_datasource/' . $key) ->addCondition('search_api_datasource', 'strawberryfield_flavor_datasource') @@ -535,9 +567,7 @@ private function invokeProcessor(StrawberryRunnersPostProcessorPluginInterface $ $io = new stdClass(); $input = new stdClass(); - // @NOTE: this is the only place where we just pass filelocation fixed instead of the - // actual property named $input_property. Which may be weird? - $input->{$input_property} = $data->filepath; + $input->{$input_property} = $data->{$input_property}; $input->{$input_argument} = isset($data->{$input_argument}) ? $data->{$input_argument} : 1; // The Node UUID $input->nuuid = $data->nuuid; diff --git a/src/Plugin/StrawberryRunnersPostProcessor/WaczPagesSequencePostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/WaczPagesSequencePostProcessor.php new file mode 100644 index 0000000..79dddca --- /dev/null +++ b/src/Plugin/StrawberryRunnersPostProcessor/WaczPagesSequencePostProcessor.php @@ -0,0 +1,213 @@ + 'asstructure', + 'mime_type' => ['application/vnd.datapackage+zip'], + 'output_type' => 'json', + 'output_destination' => ['plugin' => 'plugin'], + 'processor_queue_type' => ['realtime' => 'realtime'], + ] + parent::defaultConfiguration(); + } + + + public function calculateDependencies() { + // Since Processors could be chained we need to check if any other + // processor instance is using an instance of this one + // @TODO: Implement calculateDependencies() method. + } + + public function settingsForm(array $parents, FormStateInterface $form_state) { + + $element['source_type'] = [ + '#type' => 'hidden', + '#title' => $this->t('The type of source data this processor works on'), + '#default_value' => $this->getConfiguration()['source_type'], + ]; + + $element['jsonkey'] = [ + '#type' => 'checkboxes', + '#title' => $this->t('The JSON key that contains the desired source files.'), + '#options' => [ + 'as:image' => 'as:image', + 'as:document' => 'as:document', + 'as:audio' => 'as:audio', + 'as:video' => 'as:video', + 'as:text' => 'as:text', + 'as:application' => 'as:application', + ], + '#default_value' => (!empty($this->getConfiguration()['jsonkey']) && is_array($this->getConfiguration()['jsonkey'])) ? $this->getConfiguration()['jsonkey'] : [], + '#required' => TRUE, + ]; + + // Because we are using the default entity Form, we want to ensure the + // Settings for contains all the values + $element['output_destination'] = [ + '#type' => 'value', + '#default_value' => $this->defaultConfiguration()['output_destination'], + ]; + + $element['mime_type'] = [ + '#type' => 'textfield', + '#title' => $this->t('Mimetypes(s) to limit this Processor to.'), + '#default_value' => $this->getConfiguration()['mime_type'], + '#description' => $this->t('A single Mimetype type or a coma separed list of mimetypes that qualify to be Processed. Leave empty to apply any file'), + ]; + + + $element['ado_type'] = [ + '#type' => 'textfield', + '#title' => $this->t('ADO type(s) to limit this processor to.'), + '#default_value' => $this->getConfiguration()['ado_type'], + '#description' => $this->t('A single ADO type or a coma delimited list of ado types that qualify to be Processed. Leave empty to apply to all ADOs.'), + ]; + + $element['output_type'] = [ + '#type' => 'select', + '#title' => $this->t('The expected and desired output of this processor.'), + '#options' => [ + 'json' => 'Data/Values that can be serialized to JSON', + ], + '#default_value' => $this->getConfiguration()['output_type'], + '#description' => $this->t('Only option for this Processor is JSON output'), + ]; + + + $element['processor_queue_type'] = [ + '#type' => 'select', + '#title' => $this->t('The queue to use for this processor.'), + '#options' => [ + 'background' => 'Secondary queue in background', + 'realtime' => 'Primary queue in realtime', + ], + '#default_value' => $this->getConfiguration()['processor_queue_type'], + '#description' => $this->t('The primary queue will be execute in realtime while the Secondary will be execute in background'), + ]; + + $element['timeout'] = [ + '#type' => 'number', + '#title' => $this->t('Timeout in seconds for this process.'), + '#default_value' => $this->getConfiguration()['timeout'], + '#description' => $this->t('If the process runs out of time it can still be processed again.'), + '#size' => 3, + '#maxlength' => 3, + '#min' => 1, + ]; + $element['weight'] = [ + '#type' => 'number', + '#title' => $this->t('Order or execution in the global chain.'), + '#default_value' => $this->getConfiguration()['weight'], + ]; + + return $element; + } + + + public function onDependencyRemoval(array $dependencies) { + // Since Processors could be chained we need to check if any other + // processor instance is using an instance of this one + return parent::onDependencyRemoval( + $dependencies + ); // TODO: Change the autogenerated stub + } + + /** + * Executes the logic of this plugin given a file path and a context. + * + * @param \stdClass $io + * $io->input needs to contain + * \Drupal\strawberry_runners\Annotation\StrawberryRunnersPostProcessor::$input_property + * \Drupal\strawberry_runners\Annotation\StrawberryRunnersPostProcessor::$input_arguments + * $io->output will contain the result of the processor + * @param string $context + */ + public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPluginInterface::PROCESS) { + + $input_property = $this->pluginDefinition['input_property']; + $input_argument = $this->pluginDefinition['input_argument']; + $file_uuid = isset($io->input->metadata['dr:uuid']) ? $io->input->metadata['dr:uuid'] : NULL; + $node_uuid = isset($io->input->nuuid) ? $io->input->nuuid : NULL; + $config = $this->getConfiguration(); + $output = new \stdClass(); + $io->output = $io->input; + $output->searchapi['fulltext'] = ''; + if (isset($io->input->{$input_property}) && $file_uuid && $node_uuid) { + // To be used by miniOCR as id in the form of {nodeuuid}/canvas/{fileuuid}/p{pagenumber} + $file_path = isset($io->input->{$input_property}) ? $io->input->{$input_property} : NULL; + // File path may not be .zip? + // We may want to check + $info = pathinfo($file_path); + $newname = $info['dirname'].'/'.$info['filename'] . '.' . 'zip'; + $sequence_data = []; + $sequence_number = []; + $this->fileSystem->move($file_path, $newname, FileSystemInterface::EXISTS_REPLACE); + $z = new \ZipArchive(); + $contents = NULL; + if ($z->open($newname)) { + $fp = $z->getStream('pages/pages.jsonl'); + if ($fp) { + $i = 0; + while (($buffer = fgets($fp, 4096)) !== FALSE) { + // First row in a jsonl will be the headers, we do not need this one. + if ($i == 0) { + $i++; + continue; + } + $sequence_data[$i] = $buffer; + $sequence_number[] = $i; + $i++; + } + if (!feof($fp)) { + error_log('ups!'); + } + fclose($fp); + } + else { + // Opening the ZIP file failed. + error_log('NO Pages found to extract'); + } + } + + $output = new \stdClass(); + $output->plugin = [ + 'sequence_number' => $sequence_number, + 'plugin_metadata' => $sequence_data, + ]; + $io->output = $output; + error_log(var_export($io, true)); + } + else { + throw new \InvalidArgumentException(\sprintf("Invalid arguments passed to %s", $this->getPluginId())); + } + } + +} + + + + diff --git a/src/Plugin/StrawberryRunnersPostProcessor/WebPageTextPostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/WebPageTextPostProcessor.php new file mode 100644 index 0000000..88193ef --- /dev/null +++ b/src/Plugin/StrawberryRunnersPostProcessor/WebPageTextPostProcessor.php @@ -0,0 +1,166 @@ + 'json', + 'output_type' => 'json', + 'output_destination' => 'searchapi', + 'processor_queue_type' => 'background', + 'time_out' => '300', + ] + parent::defaultConfiguration(); + } + + + public function calculateDependencies() { + // Since Processors could be chained we need to check if any other + // processor instance is using an instance of this one + // @TODO: Implement calculateDependencies() method. + } + + public function settingsForm(array $parents, FormStateInterface $form_state) { + + $element['source_type'] = [ + '#type' => 'select', + '#title' => $this->t('The type of source data this processor works on'), + '#options' => [ + 'json' => 'JSON passed by a parent Processor', + ], + '#default_value' => $this->getConfiguration()['source_type'], + '#description' => $this->t('Select from where the source data for this processor is fetched'), + '#required' => TRUE, + ]; + + $element['ado_type'] = [ + '#type' => 'textfield', + '#title' => $this->t('ADO type(s) to limit this processor to.'), + '#default_value' => $this->getConfiguration()['ado_type'], + '#description' => $this->t('A single ADO type or a coma delimited list of ado types that qualify to be Processed. Leave empty to apply to all ADOs.'), + ]; + + $element['output_type'] = [ + '#type' => 'select', + '#title' => $this->t('The expected and desired output of this processor.'), + '#options' => [ + 'json' => 'Data/Values that can be serialized to JSON', + ], + '#default_value' => $this->getConfiguration()['output_type'], + '#description' => $this->t('If the output is just data and "One or more Files" is selected all data will be dumped into a file and handled as such.'), + ]; + + $element['output_destination'] = [ + '#type' => 'checkboxes', + '#title' => $this->t("Where and how the output will be used."), + '#options' => [ + 'plugin' => 'As Input for another processor Plugin', + 'searchapi' => 'In a Search API Document using the Strawberryfield Flavor Data Source (e.g used for HOCR highlight)', + ], + '#default_value' => (!empty($this->getConfiguration()['output_destination']) && is_array($this->getConfiguration()['output_destination'])) ? $this->getConfiguration()['output_destination'] : [], + '#description' => t('As Input for another processor Plugin will only have an effect if another Processor is setup to consume this ouput.'), + '#required' => TRUE, + ]; + + $element['processor_queue_type'] = [ + '#type' => 'select', + '#title' => $this->t('The queue to use for this processor.'), + '#options' => [ + 'background' => 'Secondary queue in background', + 'realtime' => 'Primary queue in realtime', + ], + '#default_value' => $this->getConfiguration()['processor_queue_type'], + '#description' => $this->t('The primary queue will be execute in realtime while the Secondary will be execute in background'), + ]; + + $element['timeout'] = [ + '#type' => 'number', + '#title' => $this->t('Timeout in seconds for this process.'), + '#default_value' => $this->getConfiguration()['timeout'], + '#description' => $this->t('If the process runs out of time it can still be processed again.'), + '#size' => 3, + '#maxlength' => 3, + '#min' => 1, + ]; + $element['weight'] = [ + '#type' => 'number', + '#title' => $this->t('Order or execution in the global chain.'), + '#default_value' => $this->getConfiguration()['weight'], + ]; + + return $element; + } + + + public function onDependencyRemoval(array $dependencies) { + // Since Processors could be chained we need to check if any other + // processor instance is using an instance of this one + return parent::onDependencyRemoval( + $dependencies + ); // TODO: Change the autogenerated stub + } + + /** + * Executes the logic of this plugin given a file path and a context. + * + * @param \stdClass $io + * $io->input needs to contain + * \Drupal\strawberry_runners\Annotation\StrawberryRunnersPostProcessor::$input_property + * \Drupal\strawberry_runners\Annotation\StrawberryRunnersPostProcessor::$input_arguments + * $io->output will contain the result of the processor + * @param string $context + */ + public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPluginInterface::PROCESS) { + $input_property = $this->pluginDefinition['input_property']; + $node_uuid = isset($io->input->nuuid) ? $io->input->nuuid : NULL; + $output = new \stdClass(); + $output->searchapi['fulltext'] = StrawberryfieldFlavorDatasource::EMPTY_MINIOCR_XML; + if (isset($io->input->{$input_property}) && $node_uuid) { + $page_info = json_decode($io->input->{$input_property}, true, 3); + if (json_last_error() == JSON_ERROR_NONE) { + $page_title = $page_info['title'] ?? NULL; + $page_url = $page_info['url'] ?? ''; + $page_title = $page_title ?? $page_url; + $page_text = $page_text['text'] ?? ''; + $nlp = new NlpClient('http://esmero-nlp:6400'); + if ($nlp) { + $polyglot = $nlp->polyglot_entities($page_text, 'en'); + $entities_all = $polyglot->getEntities(); + error_log(var_export($entities_all, TRUE)); + } + $output->searchapi['plaintext'] = $page_url . ' , '. $page_title . ' , ' . $page_text; + $output->searchapi['label'] = $page_title; + $output->searchapi['metadata'][] = $page_url; + if (!empty($page_info['ts'])) { + $output->searchapi['metadata'][] = $page_info['ts']; + } + $output->plugin = $output->searchapi; + } else { + throw new \Exception("WebPage Text was not a valid JSON"); + } + } + $io->output = $output; + } +} diff --git a/src/Plugin/StrawberryRunnersPostProcessorPluginBase.php b/src/Plugin/StrawberryRunnersPostProcessorPluginBase.php index ed7c664..560fc93 100644 --- a/src/Plugin/StrawberryRunnersPostProcessorPluginBase.php +++ b/src/Plugin/StrawberryRunnersPostProcessorPluginBase.php @@ -88,7 +88,6 @@ public function __construct( $this->fileSystem = $file_system; $this->temporary_directory = $this->fileSystem->getTempDirectory(); $this->logger = $logger; - } public static function create(ContainerInterface $container, array $configuration, $plugin_id, $plugin_definition) { From eeca2e1dd1d52a354225021c06450f796f943e99 Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Mon, 24 May 2021 10:10:38 -0400 Subject: [PATCH 03/12] Fix non sense on WaczPagesSequencePostProcessor --- .../WaczPagesSequencePostProcessor.php | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/Plugin/StrawberryRunnersPostProcessor/WaczPagesSequencePostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/WaczPagesSequencePostProcessor.php index 79dddca..ad63f0f 100644 --- a/src/Plugin/StrawberryRunnersPostProcessor/WaczPagesSequencePostProcessor.php +++ b/src/Plugin/StrawberryRunnersPostProcessor/WaczPagesSequencePostProcessor.php @@ -155,7 +155,6 @@ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug $config = $this->getConfiguration(); $output = new \stdClass(); $io->output = $io->input; - $output->searchapi['fulltext'] = ''; if (isset($io->input->{$input_property}) && $file_uuid && $node_uuid) { // To be used by miniOCR as id in the form of {nodeuuid}/canvas/{fileuuid}/p{pagenumber} $file_path = isset($io->input->{$input_property}) ? $io->input->{$input_property} : NULL; @@ -165,7 +164,7 @@ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug $newname = $info['dirname'].'/'.$info['filename'] . '.' . 'zip'; $sequence_data = []; $sequence_number = []; - $this->fileSystem->move($file_path, $newname, FileSystemInterface::EXISTS_REPLACE); + $newname = $this->fileSystem->move($file_path, $newname, FileSystemInterface::EXISTS_REPLACE); $z = new \ZipArchive(); $contents = NULL; if ($z->open($newname)) { @@ -193,17 +192,17 @@ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug } } - $output = new \stdClass(); + $output->plugin = [ 'sequence_number' => $sequence_number, 'plugin_metadata' => $sequence_data, ]; - $io->output = $output; - error_log(var_export($io, true)); + } else { throw new \InvalidArgumentException(\sprintf("Invalid arguments passed to %s", $this->getPluginId())); } + $io->output = $output; } } From 7a296d74f93a17a4ab85893a73d4255076be8298 Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Mon, 24 May 2021 13:58:32 -0400 Subject: [PATCH 04/12] Adds polyglot returns into metadata --- .../WebPageTextPostProcessor.php | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/Plugin/StrawberryRunnersPostProcessor/WebPageTextPostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/WebPageTextPostProcessor.php index 88193ef..4a3fd66 100644 --- a/src/Plugin/StrawberryRunnersPostProcessor/WebPageTextPostProcessor.php +++ b/src/Plugin/StrawberryRunnersPostProcessor/WebPageTextPostProcessor.php @@ -143,12 +143,14 @@ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug $page_title = $page_info['title'] ?? NULL; $page_url = $page_info['url'] ?? ''; $page_title = $page_title ?? $page_url; - $page_text = $page_text['text'] ?? ''; + $page_text = $page_info['text'] ?? ''; $nlp = new NlpClient('http://esmero-nlp:6400'); if ($nlp) { $polyglot = $nlp->polyglot_entities($page_text, 'en'); $entities_all = $polyglot->getEntities(); - error_log(var_export($entities_all, TRUE)); + if (!empty($entities_all) and is_array($entities_all)) { + $output->searchapi['metadata'] = $entities_all; + } } $output->searchapi['plaintext'] = $page_url . ' , '. $page_title . ' , ' . $page_text; $output->searchapi['label'] = $page_title; From d3421cd511afc6d45682d6e7e142504003a14d18 Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Tue, 25 May 2021 09:57:54 -0400 Subject: [PATCH 05/12] Add more webpage metadata to the SBF --- .../WebPageTextPostProcessor.php | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/Plugin/StrawberryRunnersPostProcessor/WebPageTextPostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/WebPageTextPostProcessor.php index 4a3fd66..fe16331 100644 --- a/src/Plugin/StrawberryRunnersPostProcessor/WebPageTextPostProcessor.php +++ b/src/Plugin/StrawberryRunnersPostProcessor/WebPageTextPostProcessor.php @@ -137,6 +137,7 @@ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug $node_uuid = isset($io->input->nuuid) ? $io->input->nuuid : NULL; $output = new \stdClass(); $output->searchapi['fulltext'] = StrawberryfieldFlavorDatasource::EMPTY_MINIOCR_XML; + $output->searchapi['metadata'] = []; if (isset($io->input->{$input_property}) && $node_uuid) { $page_info = json_decode($io->input->{$input_property}, true, 3); if (json_last_error() == JSON_ERROR_NONE) { @@ -144,9 +145,14 @@ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug $page_url = $page_info['url'] ?? ''; $page_title = $page_title ?? $page_url; $page_text = $page_info['text'] ?? ''; + $page_ts = $page_info['ts'] ?? date("c"); $nlp = new NlpClient('http://esmero-nlp:6400'); if ($nlp) { $polyglot = $nlp->polyglot_entities($page_text, 'en'); + $output->searchapi['where']= $polyglot->getLocations(); + $output->searchapi['who'] = array_unique(array_merge($polyglot->getOrganizations() , $polyglot->getPersons())); + $output->searchapi['sentiment'] = $polyglot->getSentiment(); + $output->searchapi['uri'] = $page_url; $entities_all = $polyglot->getEntities(); if (!empty($entities_all) and is_array($entities_all)) { $output->searchapi['metadata'] = $entities_all; @@ -155,9 +161,9 @@ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug $output->searchapi['plaintext'] = $page_url . ' , '. $page_title . ' , ' . $page_text; $output->searchapi['label'] = $page_title; $output->searchapi['metadata'][] = $page_url; - if (!empty($page_info['ts'])) { - $output->searchapi['metadata'][] = $page_info['ts']; - } + + $output->searchapi['ts'] = $page_ts; + $output->plugin = $output->searchapi; } else { throw new \Exception("WebPage Text was not a valid JSON"); From 0cc3a1217323d12df77b23bfa21fe358e2a24758 Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Tue, 25 May 2021 10:22:29 -0400 Subject: [PATCH 06/12] Add other properties to the index --- src/Plugin/QueueWorker/AbstractPostProcessorQueueWorker.php | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/Plugin/QueueWorker/AbstractPostProcessorQueueWorker.php b/src/Plugin/QueueWorker/AbstractPostProcessorQueueWorker.php index eefeeca..89af0d6 100644 --- a/src/Plugin/QueueWorker/AbstractPostProcessorQueueWorker.php +++ b/src/Plugin/QueueWorker/AbstractPostProcessorQueueWorker.php @@ -262,7 +262,13 @@ public function processItem($data) { $toindex->fulltext = $io->output->searchapi['fulltext'] ?? ''; $toindex->plaintext = $io->output->searchapi['plaintext'] ?? ''; $toindex->metadata = $io->output->searchapi['metadata'] ?? []; + $toindex->who = $io->output->searchapi['who'] ?? []; + $toindex->where = $io->output->searchapi['where'] ?? []; + $toindex->when = $io->output->searchapi['when'] ?? []; + $toindex->ts = $io->output->searchapi['ts'] ?? NULL; + $toindex->uri = $io->output->searchapi['uri'] ?? NULL; $toindex->label = $io->output->searchapi['label'] ?? NULL; + $toindex->sentiment = $io->output->searchapi['sentiment'] ?? 0; // $siblings will be the amount of total children processors that were // enqueued for a single Processor chain. From b32b5373785f95f22f9b3d03529feb518f86d6e3 Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Tue, 1 Jun 2021 17:47:32 -0400 Subject: [PATCH 07/12] Update $output->searchapi['who'] --- .../StrawberryRunnersPostProcessor/WebPageTextPostProcessor.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Plugin/StrawberryRunnersPostProcessor/WebPageTextPostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/WebPageTextPostProcessor.php index fe16331..7b90ec7 100644 --- a/src/Plugin/StrawberryRunnersPostProcessor/WebPageTextPostProcessor.php +++ b/src/Plugin/StrawberryRunnersPostProcessor/WebPageTextPostProcessor.php @@ -150,7 +150,7 @@ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug if ($nlp) { $polyglot = $nlp->polyglot_entities($page_text, 'en'); $output->searchapi['where']= $polyglot->getLocations(); - $output->searchapi['who'] = array_unique(array_merge($polyglot->getOrganizations() , $polyglot->getPersons())); + $output->searchapi['who'] = array_unique(array_merge((array) $polyglot->getOrganizations() , (array) $polyglot->getPersons())); $output->searchapi['sentiment'] = $polyglot->getSentiment(); $output->searchapi['uri'] = $page_url; $entities_all = $polyglot->getEntities(); From 0d34d5795261f53105e7c72e4e6d42656f9a99f8 Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Tue, 1 Jun 2021 18:15:02 -0400 Subject: [PATCH 08/12] $childdata->plugin_config_entity_id has not been set there yet Use the original $postprocessor_config_entity->id() to report --- src/Plugin/QueueWorker/AbstractPostProcessorQueueWorker.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Plugin/QueueWorker/AbstractPostProcessorQueueWorker.php b/src/Plugin/QueueWorker/AbstractPostProcessorQueueWorker.php index 89af0d6..41af490 100644 --- a/src/Plugin/QueueWorker/AbstractPostProcessorQueueWorker.php +++ b/src/Plugin/QueueWorker/AbstractPostProcessorQueueWorker.php @@ -347,10 +347,10 @@ public function processItem($data) { // If still null means the child is incompatible with the parent. We abort. if ($input_property_value == NULL) { $this->logger->log(LogLevel::WARNING, - 'Sorry @childplugin is incompatible with @parentplugin, skipping.', + 'Sorry @childplugin is incompatible with @parentplugin or its output or the later is empty, skipping.', [ '@parentplugin' => $data->plugin_config_entity_id, - '@childplugin' => $childdata->plugin_config_entity_id, + '@childplugin' => $postprocessor_config_entity->id(), ]); continue; } From 38a58f1884028194fbe62857fead2637f02a9957 Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Thu, 3 Jun 2021 00:09:59 -0400 Subject: [PATCH 09/12] OCR + Webtext with Polyglot or spaCy and configs and schema! This is working very very fine now! Also fixed some defaults and made some things "tidier". --- config/schema/strawberry_runners.schema.yml | 88 ++++++++++++++ .../OcrPostProcessor.php | 110 ++++++++++++++--- .../SystemBinaryPostProcessor.php | 1 + .../WebPageTextPostProcessor.php | 112 +++++++++++++++--- 4 files changed, 278 insertions(+), 33 deletions(-) diff --git a/config/schema/strawberry_runners.schema.yml b/config/schema/strawberry_runners.schema.yml index 829310c..7fdb508 100644 --- a/config/schema/strawberry_runners.schema.yml +++ b/config/schema/strawberry_runners.schema.yml @@ -66,6 +66,9 @@ strawberryfield_runners.strawberry_runners_postprocessor.binary: weight: type: integer label: 'Order or execution in the global chain' + processor_queue_type: + type: string + label: 'The queue to use for this processor' strawberryfield_runners.strawberry_runners_postprocessor.ocr: type: config_object label: 'Strawberry Runners Post Processor Config Entity OCR specific config' @@ -131,6 +134,15 @@ strawberryfield_runners.strawberry_runners_postprocessor.ocr: weight: type: integer label: 'Order or execution in the global chain' + nlp: + type: boolean + label: 'If NLP should be triggered for the extracted Text' + nlp_url: + type: string + label: 'The URL of the NLP64 server' + nlp_method: + type: string + label: 'The NLP method, spaCy or Polyglot' strawberryfield_runners.strawberry_runners_postprocessor.filesequence: type: config_object label: 'Strawberry Runners Post Processor Config Entity JSON sequence specific config' @@ -163,3 +175,79 @@ strawberryfield_runners.strawberry_runners_postprocessor.filesequence: weight: type: integer label: 'Order or execution in the global chain' +strawberryfield_runners.strawberry_runners_postprocessor.waczpages: + type: config_object + label: 'Strawberry Runners Post Processor Config Entity WACZ URL sequence specific config' + mapping: + source_type: + type: string + label: 'The type of Source Data this Processor works on' + ado_type: + type: string + label: 'DO type(s) to limit this Processor to' + jsonkey: + type: sequence + label: 'The JSON key(s) containing the desired Source File(s)' + sequence: + - type: string + mime_type: + type: string + label: 'Mimetypes(s) to limit this Processor to' + output_type: + type: string + label: 'The expected and desired output of this processor' + output_destination: + type: sequence + label: 'Where and how the output will be used' + sequence: + - type: string + timeout: + type: integer + label: 'Timeout in seconds for this process' + weight: + type: integer + label: 'Order or execution in the global chain' +strawberryfield_runners.strawberry_runners_postprocessor.webpage: + type: config_object + label: 'Strawberry Runners Post Processor Config Entity WebPage Text specific config' + mapping: + source_type: + type: string + label: 'The type of Source Data this Processor works on' + ado_type: + type: string + label: 'DO type(s) to limit this Processor to' + jsonkey: + type: sequence + label: 'The JSON key(s) containing the desired Source File(s)' + sequence: + - type: string + mime_type: + type: string + label: 'Mimetypes(s) to limit this Processor to' + output_type: + type: string + label: 'The expected and desired output of this processor' + output_destination: + type: sequence + label: 'Where and how the output will be used' + sequence: + - type: string + timeout: + type: integer + label: 'Timeout in seconds for this process' + weight: + type: integer + label: 'Order or execution in the global chain' + processor_queue_type: + type: string + label: 'The queue to use for this processor' + nlp: + type: boolean + label: 'If NLP should be triggered for the extracted Text' + nlp_url: + type: string + label: 'The URL of the NLP64 server' + nlp_method: + type: string + label: 'The NLP method, spaCy or Polyglot' diff --git a/src/Plugin/StrawberryRunnersPostProcessor/OcrPostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/OcrPostProcessor.php index 22dee38..4bf8a30 100644 --- a/src/Plugin/StrawberryRunnersPostProcessor/OcrPostProcessor.php +++ b/src/Plugin/StrawberryRunnersPostProcessor/OcrPostProcessor.php @@ -14,6 +14,7 @@ use Drupal\strawberry_runners\Plugin\StrawberryRunnersPostProcessorPluginBase; use Drupal\strawberry_runners\Plugin\StrawberryRunnersPostProcessorPluginInterface; use Drupal\strawberryfield\Plugin\search_api\datasource\StrawberryfieldFlavorDatasource; +use Web64\Nlp\NlpClient; /** @@ -50,6 +51,10 @@ public function defaultConfiguration() { 'output_type' => 'json', 'output_destination' => 'searchapi', 'processor_queue_type' => 'background', + 'timeout' => 300, + 'nlp' => TRUE, + 'nlp_url' => 'http://esmero-nlp:6400', + 'nlp_method' => 'polyglot', ] + parent::defaultConfiguration(); } @@ -222,6 +227,40 @@ public function settingsForm(array $parents, FormStateInterface $form_state) { '#description' => $this->t('The primary queue will be execute in realtime while the Secondary will be execute in background'), ]; + $element['nlp'] = [ + '#type' => 'checkbox', + '#title' => $this->t("Use NLP to extract entities from Text"), + '#default_value' => $this->getConfiguration()['nlp'] ?? TRUE, + '#description' => t('If checked Full text will be processed for Natural language Entity extraction using Polyglot'), + ]; + $element['nlp_url'] = [ + '#type' => 'url', + '#title' => $this->t("The URL location of your NLP64 server."), + '#default_value' => $this->getConfiguration()['nlp_url'] ?? 'http://esmero-nlp:6400', + '#description' => t('Defaults to http://esmero-nlp:6400'), + '#states' => [ + 'visible' => [ + ':input[name="pluginconfig[nlp]"]' => ['checked' => TRUE], + ], + ], + ]; + + $element['nlp_method'] = [ + '#type' => 'radios', + '#title' => $this->t('Which method(NER) to use'), + '#options' => [ + 'spacy' => 'spaCy (more accurate)', + 'polyglot' => 'Polyglot (faster)', + ], + '#default_value' => $this->getConfiguration()['nlp_method'], + '#description' => $this->t('The NER NLP method to use to extract Agents, Places and Sentiment'), + '#states' => [ + 'visible' => [ + ':input[name="pluginconfig[nlp]"]' => ['checked' => TRUE], + ], + ], + ]; + $element['timeout'] = [ '#type' => 'number', '#title' => $this->t('Timeout in seconds for this process.'), @@ -272,6 +311,7 @@ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug $timeout = $config['timeout']; // in seconds if (isset($io->input->{$input_property}) && $file_uuid && $node_uuid) { + $output = new \stdClass(); // To be used by miniOCR as id in the form of {nodeuuid}/canvas/{fileuuid}/p{pagenumber} $sequence_number = isset($io->input->{$input_argument}) ? (int) $io->input->{$input_argument} : 1; @@ -282,7 +322,6 @@ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug $execstring_check_searchable = $this->buildExecutableCommand_checkSearchable($io); // assume its not there/won't work. $proc_output_check_searchable = 0; - if ($execstring_check_searchable) { $backup_locale = setlocale(LC_CTYPE, '0'); setlocale(LC_CTYPE, $backup_locale); @@ -316,18 +355,14 @@ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug $proc_output_mod = str_replace('ocrx_line', 'ocr_line', $proc_output); $miniocr = $this->hOCRtoMiniOCR($proc_output_mod, $sequence_number); - $output = new \stdClass(); + $output->searchapi['fulltext'] = $miniocr; $output->plugin = $miniocr; $io->output = $output; } - - //Do we have to remove djvu file? } else { - //if not searchable run tesseract - // setlocale(LC_CTYPE, 'en_US.UTF-8'); $execstring = $this->buildExecutableCommand($io); if ($execstring) { @@ -343,19 +378,65 @@ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug } $miniocr = $this->hOCRtoMiniOCR($proc_output, $sequence_number); - $output = new \stdClass(); $output->searchapi['fulltext'] = $miniocr; $output->plugin = $miniocr; - $io->output = $output; } } - // Lastly plain text version of the XML. - $io->output->searchapi['plaintext'] = isset($output->searchapi['fulltext']) ? strip_tags(str_replace("", PHP_EOL . " ", $output->searchapi['fulltext'])) : ''; + // Lastly plain text version of the XML + $page_text = isset($output->searchapi['fulltext']) ? strip_tags(str_replace("", PHP_EOL . " ", $output->searchapi['fulltext'])) : ''; + $output->searchapi['metadata'] = []; + // Check if NPL processing is enabled and if so do it. + if ($config['nlp'] && !empty($config['nlp_url']) && strlen(trim($page_text)) > 0 ) { + $nlp = new NlpClient($config['nlp_url']); + if ($nlp) { + if ($config['nlp_method'] == 'spacy') { + /* + PERSON: People, including fictional. + NORP: Nationalities or religious or political groups. + FAC: Buildings, airports, highways, bridges, etc. + ORG: Companies, agencies, institutions, etc. + GPE: Countries, cities, states. + LOC: Non-GPE locations, mountain ranges, bodies of water. + PRODUCT: Objects, vehicles, foods, etc. (Not services.) + EVENT: Named hurricanes, battles, wars, sports events, etc. + WORK_OF_ART: Titles of books, songs, etc. + LAW: Named documents made into laws. + LANGUAGE: Any named language. + DATE: Absolute or relative dates or periods. + TIME: Times smaller than a day. + PERCENT: Percentage, including ”%“. + MONEY: Monetary values, including unit. + QUANTITY: Measurements, as of weight or distance. + ORDINAL: “first”, “second”, etc. + CARDINAL: Numerals that do not fall under another type. + */ + $spacy = $nlp->spacy_entities($page_text ,'en'); + $output->searchapi['sentiment'] = $nlp->sentiment($page_text , 'en'); + $output->searchapi['sentiment'] = is_scalar($output->searchapi['sentiment']) ? $output->searchapi['sentiment'] : NULL; + $output->searchapi['where'] = array_unique(($spacy['GPE'] ?? []) + ($spacy['FAC'] ?? [])); + $output->searchapi['who'] = array_unique(($spacy['PERSON'] ?? []) + ($spacy['ORG'] ?? [])); + $output->searchapi['metadata'] = array_unique(($spacy['WORK_OF_ART'] ?? []) + ($spacy['EVENT'] ?? [])); + } + elseif ($config['nlp_method'] == 'polyglot') { + $polyglot = $nlp->polyglot_entities($page_text, 'en'); + $output->searchapi['where'] = $polyglot->getLocations(); + $output->searchapi['who'] = array_unique(array_merge((array) $polyglot->getOrganizations(), + (array) $polyglot->getPersons())); + $output->searchapi['sentiment'] = $polyglot->getSentiment(); + $entities_all = $polyglot->getEntities(); + if (!empty($entities_all) and is_array($entities_all)) { + $output->searchapi['metadata'] = $entities_all; + } + } + } + } + $output->searchapi['plaintext'] = $page_text; + $output->searchapi['ts'] = date("c"); + $output->searchapi['label'] = $this->t("Sequence"). ' '. $sequence_number; + $io->output = $output; } else { - $query = \Drupal::entityTypeManager()->getStorage('node')->getQuery(); - - \throwException(new \InvalidArgumentException); + throw new \Exception("Invalid argument for OCR processor"); } } @@ -615,7 +696,6 @@ public function buildExecutableCommand_djvu2hocr(\stdClass $io) { // This run function executes a 1 step function // First djvu2hocr some_output_file.djv - $command = ''; $can_run_djvu2hocr = \Drupal::service('strawberryfield.utility') ->verifyCommand($execpath_djvu2hocr); @@ -638,9 +718,7 @@ public function buildExecutableCommand_djvu2hocr(\stdClass $io) { else { //"missing arguments for djvu 2 OCR"); } - return $command; - } } diff --git a/src/Plugin/StrawberryRunnersPostProcessor/SystemBinaryPostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/SystemBinaryPostProcessor.php index 0b53c99..7df365b 100644 --- a/src/Plugin/StrawberryRunnersPostProcessor/SystemBinaryPostProcessor.php +++ b/src/Plugin/StrawberryRunnersPostProcessor/SystemBinaryPostProcessor.php @@ -46,6 +46,7 @@ public function defaultConfiguration() { 'arguments' => '', 'output_type' => 'json', 'output_destination' => 'subkey', + 'timeout' => 300, ] + parent::defaultConfiguration(); } diff --git a/src/Plugin/StrawberryRunnersPostProcessor/WebPageTextPostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/WebPageTextPostProcessor.php index 7b90ec7..d24ce2e 100644 --- a/src/Plugin/StrawberryRunnersPostProcessor/WebPageTextPostProcessor.php +++ b/src/Plugin/StrawberryRunnersPostProcessor/WebPageTextPostProcessor.php @@ -32,6 +32,9 @@ public function defaultConfiguration() { 'output_destination' => 'searchapi', 'processor_queue_type' => 'background', 'time_out' => '300', + 'nlp' => TRUE, + 'nlp_url' => 'http://esmero-nlp:6400', + 'nlp_method' => 'polyglot', ] + parent::defaultConfiguration(); } @@ -95,6 +98,40 @@ public function settingsForm(array $parents, FormStateInterface $form_state) { '#description' => $this->t('The primary queue will be execute in realtime while the Secondary will be execute in background'), ]; + $element['nlp'] = [ + '#type' => 'checkbox', + '#title' => $this->t("Use NLP to extract entities from Text"), + '#default_value' => $this->getConfiguration()['nlp'] ?? TRUE, + '#description' => t('If checked Full text will be processed for Natural language Entity extraction using Polyglot'), + ]; + $element['nlp_url'] = [ + '#type' => 'url', + '#title' => $this->t("The URL location of your NLP64 server."), + '#default_value' => $this->getConfiguration()['nlp_url'] ?? 'http://esmero-nlp:6400', + '#description' => t('Defaults to http://esmero-nlp:6400'), + '#states' => [ + 'visible' => [ + ':input[name="pluginconfig[nlp]"]' => ['checked' => TRUE], + ], + ], + ]; + + $element['nlp_method'] = [ + '#type' => 'radios', + '#title' => $this->t('Which method(NER) to use'), + '#options' => [ + 'spacy' => 'spaCy (more accurate)', + 'polyglot' => 'Polyglot (faster)', + ], + '#default_value' => $this->getConfiguration()['nlp_method'], + '#description' => $this->t('The NER NLP method to use to extract Agents, Places and Sentiment'), + '#states' => [ + 'visible' => [ + ':input[name="pluginconfig[nlp]"]' => ['checked' => TRUE], + ], + ], + ]; + $element['timeout'] = [ '#type' => 'number', '#title' => $this->t('Timeout in seconds for this process.'), @@ -104,6 +141,7 @@ public function settingsForm(array $parents, FormStateInterface $form_state) { '#maxlength' => 3, '#min' => 1, ]; + $element['weight'] = [ '#type' => 'number', '#title' => $this->t('Order or execution in the global chain.'), @@ -138,37 +176,77 @@ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug $output = new \stdClass(); $output->searchapi['fulltext'] = StrawberryfieldFlavorDatasource::EMPTY_MINIOCR_XML; $output->searchapi['metadata'] = []; + $config = $this->getConfiguration(); if (isset($io->input->{$input_property}) && $node_uuid) { - $page_info = json_decode($io->input->{$input_property}, true, 3); + $page_info = json_decode($io->input->{$input_property}, true, 3); if (json_last_error() == JSON_ERROR_NONE) { $page_title = $page_info['title'] ?? NULL; $page_url = $page_info['url'] ?? ''; $page_title = $page_title ?? $page_url; $page_text = $page_info['text'] ?? ''; + $page_text = preg_replace('/[\x0D]/', '', $page_text); $page_ts = $page_info['ts'] ?? date("c"); - $nlp = new NlpClient('http://esmero-nlp:6400'); - if ($nlp) { - $polyglot = $nlp->polyglot_entities($page_text, 'en'); - $output->searchapi['where']= $polyglot->getLocations(); - $output->searchapi['who'] = array_unique(array_merge((array) $polyglot->getOrganizations() , (array) $polyglot->getPersons())); - $output->searchapi['sentiment'] = $polyglot->getSentiment(); - $output->searchapi['uri'] = $page_url; - $entities_all = $polyglot->getEntities(); - if (!empty($entities_all) and is_array($entities_all)) { - $output->searchapi['metadata'] = $entities_all; + // Check if NPL processing is enabled and if so do it. + if ($config['nlp'] && !empty($config['nlp_url']) && strlen(trim($page_text)) > 0 ) { + $nlp = new NlpClient($config['nlp_url']); + if ($nlp) { + if ($config['nlp_method'] == 'spacy') { + /* + PERSON: People, including fictional. + NORP: Nationalities or religious or political groups. + FAC: Buildings, airports, highways, bridges, etc. + ORG: Companies, agencies, institutions, etc. + GPE: Countries, cities, states. + LOC: Non-GPE locations, mountain ranges, bodies of water. + PRODUCT: Objects, vehicles, foods, etc. (Not services.) + EVENT: Named hurricanes, battles, wars, sports events, etc. + WORK_OF_ART: Titles of books, songs, etc. + LAW: Named documents made into laws. + LANGUAGE: Any named language. + DATE: Absolute or relative dates or periods. + TIME: Times smaller than a day. + PERCENT: Percentage, including ”%“. + MONEY: Monetary values, including unit. + QUANTITY: Measurements, as of weight or distance. + ORDINAL: “first”, “second”, etc. + CARDINAL: Numerals that do not fall under another type. + */ + $spacy = $nlp->spacy_entities($page_text,'en'); + $output->searchapi['sentiment'] = $nlp->sentiment($page_text, 'en'); + $output->searchapi['sentiment'] = is_scalar($output->searchapi['sentiment']) ? $output->searchapi['sentiment'] : NULL; + $output->searchapi['where'] = array_unique(($spacy['GPE'] ?? []) + ($spacy['FAC'] ?? [])); + $output->searchapi['who'] = array_unique(($spacy['PERSON'] ?? []) + ($spacy['ORG'] ?? [])); + $output->searchapi['metadata'] = array_unique(($spacy['WORK_OF_ART'] ?? []) + ($spacy['EVENT'] ?? [])); + } + elseif ($config['nlp_method'] == 'polyglot') { + $polyglot = $nlp->polyglot_entities($page_text, 'en'); + $output->searchapi['where'] = $polyglot->getLocations(); + $output->searchapi['who'] = array_unique(array_merge((array) $polyglot->getOrganizations(), + (array) $polyglot->getPersons())); + $output->searchapi['sentiment'] = $polyglot->getSentiment(); + $entities_all = $polyglot->getEntities(); + if (!empty($entities_all) and is_array($entities_all)) { + $output->searchapi['metadata'] = $entities_all; + } + } + } + else { + $this->logger->warning('NLP64 server @nlp_url could not be queried. Skipping NLP.', + [ + '@nlp_url' => $config['nlp_url'], + ]); } } - $output->searchapi['plaintext'] = $page_url . ' , '. $page_title . ' , ' . $page_text; + $output->searchapi['uri'] = $page_url; + $output->searchapi['plaintext'] = $page_title . '\n' . $page_text; $output->searchapi['label'] = $page_title; - $output->searchapi['metadata'][] = $page_url; - $output->searchapi['ts'] = $page_ts; - $output->plugin = $output->searchapi; - } else { + } + else { throw new \Exception("WebPage Text was not a valid JSON"); } } - $io->output = $output; + $io->output = $output; } } From a6e26bee935e32b4d802273c963b61e1ead8e63d Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Thu, 3 Jun 2021 00:10:26 -0400 Subject: [PATCH 10/12] Update StrawberryRunnersPostProcessorPluginBase.php Trying with posix_kill($pid, SIGKILL); instead of exec("kill -9 $pid");. The error message of the later was driving me nuts! --- src/Plugin/StrawberryRunnersPostProcessorPluginBase.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Plugin/StrawberryRunnersPostProcessorPluginBase.php b/src/Plugin/StrawberryRunnersPostProcessorPluginBase.php index 560fc93..74b71a9 100644 --- a/src/Plugin/StrawberryRunnersPostProcessorPluginBase.php +++ b/src/Plugin/StrawberryRunnersPostProcessorPluginBase.php @@ -173,7 +173,7 @@ protected function proc_execute($command, $timeout = 5) { $handle = proc_open($command, [['pipe', 'r'], ['pipe', 'w'], ['pipe', 'w']], $pipe); $startTime = microtime(true); $read = NULL; - /* Read the command output and kill it if the proccess surpassed the timeout */ + /* Read the command output and kill it if the process surpassed the timeout */ while(!feof($pipe[1])) { $read .= fread($pipe[1], 8192); if($startTime + $timeout < microtime(true)) { @@ -190,7 +190,7 @@ protected function proc_execute($command, $timeout = 5) { /* The proc_terminate() function doesn't end proccess properly on Windows */ protected function kill($pid) { - return strstr(PHP_OS, 'WIN') ? exec("taskkill /F /T /PID $pid") : exec("kill -9 $pid"); + return strstr(PHP_OS, 'WIN') ? exec("taskkill /F /T /PID $pid") : posix_kill($pid, SIGKILL); } /** From 56ea20a90b624966be3dbae453b043f702810f1f Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Sun, 6 Jun 2021 18:16:55 -0400 Subject: [PATCH 11/12] PASS int instead Not sure why but Docker is complaining. --- src/Plugin/StrawberryRunnersPostProcessorPluginBase.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Plugin/StrawberryRunnersPostProcessorPluginBase.php b/src/Plugin/StrawberryRunnersPostProcessorPluginBase.php index 74b71a9..faefae3 100644 --- a/src/Plugin/StrawberryRunnersPostProcessorPluginBase.php +++ b/src/Plugin/StrawberryRunnersPostProcessorPluginBase.php @@ -190,7 +190,7 @@ protected function proc_execute($command, $timeout = 5) { /* The proc_terminate() function doesn't end proccess properly on Windows */ protected function kill($pid) { - return strstr(PHP_OS, 'WIN') ? exec("taskkill /F /T /PID $pid") : posix_kill($pid, SIGKILL); + return strstr(PHP_OS, 'WIN') ? exec("taskkill /F /T /PID $pid") : posix_kill($pid, 9); } /** From 234af4c51e5300accc82d1993dbfa69ec7c81056 Mon Sep 17 00:00:00 2001 From: Diego Pino Navarro Date: Mon, 7 Jun 2021 00:10:26 -0400 Subject: [PATCH 12/12] 4096 is clearly not enough for a single line Fixing it to 32K for now. That is also the max size of a Text Field in Solr. --- src/Controller/Redirect.php | 89 +++++++++++++++++++ .../WaczPagesSequencePostProcessor.php | 4 +- 2 files changed, 91 insertions(+), 2 deletions(-) create mode 100644 src/Controller/Redirect.php diff --git a/src/Controller/Redirect.php b/src/Controller/Redirect.php new file mode 100644 index 0000000..aa0eb9a --- /dev/null +++ b/src/Controller/Redirect.php @@ -0,0 +1,89 @@ +logger = $logger->get('strawberry_runners'); + $this->queue = $queue; + $secret = \Drupal::service('config.factory')->get('strawberry_runners')->get('webhooktoken'); + } + + /** + * {@inheritdoc} + */ + public static function create(ContainerInterface $container) { + return new static( + $container->get('logger.factory'), + $container->get('queue')->get('process_payload_queue_worker') + ); + } + + /** + * Capture the payload. + * + * @return \Symfony\Component\HttpFoundation\RedirectResponse + * A simple string and 302 response. + */ + public function islandora(Request $request, $PID) { + if ($PID) { + $parts = explode(':', $PID); + } + $response = new RedirectResponse('/do/'.$parts[1], 302); + return $response; + } + + +} \ No newline at end of file diff --git a/src/Plugin/StrawberryRunnersPostProcessor/WaczPagesSequencePostProcessor.php b/src/Plugin/StrawberryRunnersPostProcessor/WaczPagesSequencePostProcessor.php index ad63f0f..c30f36b 100644 --- a/src/Plugin/StrawberryRunnersPostProcessor/WaczPagesSequencePostProcessor.php +++ b/src/Plugin/StrawberryRunnersPostProcessor/WaczPagesSequencePostProcessor.php @@ -171,7 +171,7 @@ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug $fp = $z->getStream('pages/pages.jsonl'); if ($fp) { $i = 0; - while (($buffer = fgets($fp, 4096)) !== FALSE) { + while (($buffer = fgets($fp, 32767)) !== FALSE) { // First row in a jsonl will be the headers, we do not need this one. if ($i == 0) { $i++; @@ -188,7 +188,7 @@ public function run(\stdClass $io, $context = StrawberryRunnersPostProcessorPlug } else { // Opening the ZIP file failed. - error_log('NO Pages found to extract'); + error_log('No Pages found to extract'); } }