diff --git a/backend/src/main/kotlin/org/loculus/backend/api/SubmissionTypes.kt b/backend/src/main/kotlin/org/loculus/backend/api/SubmissionTypes.kt
index cfa26c22d6..5827220833 100644
--- a/backend/src/main/kotlin/org/loculus/backend/api/SubmissionTypes.kt
+++ b/backend/src/main/kotlin/org/loculus/backend/api/SubmissionTypes.kt
@@ -199,7 +199,8 @@ class InsertionDeserializer : JsonDeserializer() {
 }
 
 data class PreprocessingAnnotation(
-    val source: List<PreprocessingAnnotationSource>,
+    val unprocessedFields: List<PreprocessingAnnotationSource>,
+    val processedFields: List<PreprocessingAnnotationSource>,
     @Schema(description = "A descriptive message that helps the submitter to fix the issue")
     val message: String,
 )
diff --git a/backend/src/main/resources/db/migration/V1.8__update_error_schema.sql b/backend/src/main/resources/db/migration/V1.8__update_error_schema.sql
new file mode 100644
index 0000000000..7ddf8d7e54
--- /dev/null
+++ b/backend/src/main/resources/db/migration/V1.8__update_error_schema.sql
@@ -0,0 +1,67 @@
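+-- The annotation schema changed: the single "source" list
+--   {"source": [{"name": ..., "type": ...}], "message": ...}
+-- was split into "unprocessedFields" and "processedFields":
+--   {"unprocessedFields": [...], "processedFields": [...], "message": ...}
+-- Existing annotations carry no such distinction, so the old "source"
+-- list is copied into both new fields.
+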
+update sequence_entries_preprocessed_data
+set warnings = (
+    select jsonb_agg(
+        jsonb_build_object(
+            'unprocessedFields', (
+                select jsonb_agg(
+                    jsonb_build_object(
+                        'name', source->>'name',
+                        'type', source->>'type'
+                    )
+                )
+                from jsonb_array_elements(warning->'source') as source
+            ),
+            'processedFields', (
+                select jsonb_agg(
+                    jsonb_build_object(
+                        'name', source->>'name',
+                        'type', source->>'type'
+                    )
+                )
+                from jsonb_array_elements(warning->'source') as source
+            ),
+            'message', warning->>'message'
+        )
+    )
+    from jsonb_array_elements(warnings) as warning
+)
+where warnings is not null
+    and exists (
+        select 1
+        from jsonb_array_elements(warnings) as warning
+        where warning->'source' is not null
+    );
+
+update sequence_entries_preprocessed_data
+set errors = (
+    select jsonb_agg(
+        jsonb_build_object(
+            'unprocessedFields', (
+                select jsonb_agg(
+                    jsonb_build_object(
+                        'name', source->>'name',
+                        'type', source->>'type'
+                    )
+                )
+                from jsonb_array_elements(error->'source') as source
+            ),
+            'processedFields', (
+                select jsonb_agg(
+                    jsonb_build_object(
+                        'name', source->>'name',
+                        'type', source->>'type'
+                    )
+                )
+                from jsonb_array_elements(error->'source') as source
+            ),
+            'message', error->>'message'
+        )
+    )
+    from jsonb_array_elements(errors) as error
+)
+where errors is not null
+    and exists (
+        select 1
+        from jsonb_array_elements(errors) as error
+        where error->'source' is not null
+    );
\ No newline at end of file
diff --git a/backend/src/test/kotlin/org/loculus/backend/controller/submission/PreparedProcessedData.kt b/backend/src/test/kotlin/org/loculus/backend/controller/submission/PreparedProcessedData.kt
index b7454e0bbf..f3446306ed 100644
--- a/backend/src/test/kotlin/org/loculus/backend/controller/submission/PreparedProcessedData.kt
+++ b/backend/src/test/kotlin/org/loculus/backend/controller/submission/PreparedProcessedData.kt
@@ -396,7 +396,13 @@ object PreparedProcessedData {
             accession = accession,
             errors = listOf(
                 PreprocessingAnnotation(
-                    source = listOf(
+                    unprocessedFields = listOf(
+                        PreprocessingAnnotationSource(
+                            PreprocessingAnnotationSourceType.Metadata,
+                            "host",
+                        ),
+                    ),
+                    processedFields = listOf(
                         PreprocessingAnnotationSource(
                             PreprocessingAnnotationSourceType.Metadata,
                             "host",
@@ -405,7 +411,13 @@
                     "Not this kind of host",
                 ),
                 PreprocessingAnnotation(
-                    source = listOf(
+                    unprocessedFields = listOf(
+                        PreprocessingAnnotationSource(
+                            PreprocessingAnnotationSourceType.NucleotideSequence,
+                            MAIN_SEGMENT,
+                        ),
+                    ),
+                    processedFields = listOf(
                         PreprocessingAnnotationSource(
                             PreprocessingAnnotationSourceType.NucleotideSequence,
                             MAIN_SEGMENT,
@@ -420,7 +432,13 @@
             accession = accession,
             warnings = listOf(
                 PreprocessingAnnotation(
-                    source = listOf(
+                    unprocessedFields = listOf(
+                        PreprocessingAnnotationSource(
+                            PreprocessingAnnotationSourceType.Metadata,
+                            "host",
+                        ),
+                    ),
+                    processedFields = listOf(
                         PreprocessingAnnotationSource(
                             PreprocessingAnnotationSourceType.Metadata,
                             "host",
@@ -429,7 +447,13 @@
                     "Not this kind of host",
                 ),
                 PreprocessingAnnotation(
-                    source = listOf(
+                    unprocessedFields = listOf(
+                        PreprocessingAnnotationSource(
+                            PreprocessingAnnotationSourceType.NucleotideSequence,
+                            MAIN_SEGMENT,
+                        ),
+                    ),
+                    processedFields = listOf(
                         PreprocessingAnnotationSource(
                             PreprocessingAnnotationSourceType.NucleotideSequence,
                             MAIN_SEGMENT,
diff --git a/preprocessing/dummy/main.py b/preprocessing/dummy/main.py
index 886e1b59d3..719763f333 100644
--- a/preprocessing/dummy/main.py
+++ b/preprocessing/dummy/main.py
@@ -5,7 +5,6 @@
 import random
 import time
 from dataclasses import dataclass, field
-from typing import List, Optional
 
 import requests
 
@@ -72,7 +71,8 @@ class AnnotationSource:
 
 @dataclass
 class ProcessingAnnotation:
-    source: List[AnnotationSource]
+    unprocessedFields: list[AnnotationSource]  # noqa: N815
+    processedFields: list[AnnotationSource]  # noqa: N815
     message: str
 
 
@@ -81,13 +81,11 @@ class Sequence:
     accession: int
     version: int
     data: dict
-    errors: Optional[List[ProcessingAnnotation]] = field(default_factory=list[ProcessingAnnotation])
-    warnings: Optional[List[ProcessingAnnotation]] = field(
-        default_factory=list[ProcessingAnnotation]
-    )
+    errors: list[ProcessingAnnotation] = field(default_factory=list)
+    warnings: list[ProcessingAnnotation] = field(default_factory=list)
 
 
-def fetch_unprocessed_sequences(etag: str | None, n: int) -> tuple[str | None, List[Sequence]]:
+def fetch_unprocessed_sequences(etag: str | None, n: int) -> tuple[str | None, list[Sequence]]:
     url = backendHost + "/extract-unprocessed-data"
     params = {"numberOfSequenceEntries": n, "pipelineVersion": pipeline_version}
     headers = {
@@ -111,7 +109,7 @@ def fetch_unprocessed_sequences(etag: str | None, n: int) -> tuple[str | None, L
     )
 
 
-def parse_ndjson(ndjson_data: str) -> List[Sequence]:
+def parse_ndjson(ndjson_data: str) -> list[Sequence]:
     json_strings = ndjson_data.split("\n")
     entries = []
     for json_str in json_strings:
@@ -123,7 +121,7 @@ def parse_ndjson(ndjson_data: str) -> List[Sequence]:
     return entries
 
 
-def process(unprocessed: List[Sequence]) -> List[Sequence]:
+def process(unprocessed: list[Sequence]) -> list[Sequence]:
     with open("mock-sequences.json", "r") as f:
         mock_sequences = json.load(f)
     possible_lineages = ["A.1", "A.1.1", "A.2"]
@@ -143,17 +141,24 @@
         if addErrors and not disable_randomly:
             updated_sequence.errors = [
                 ProcessingAnnotation(
-                    [AnnotationSource(list(metadata.keys())[0], "Metadata")],
-                    "This is a metadata error",
+                    unprocessedFields=[AnnotationSource(list(metadata.keys())[0], "Metadata")],
+                    processedFields=[AnnotationSource(list(metadata.keys())[0], "Metadata")],
+                    message="This is a metadata error",
                 ),
                 ProcessingAnnotation(
-                    [
+                    unprocessedFields=[
                         AnnotationSource(
                             list(mock_sequences["alignedNucleotideSequences"].keys())[0],
                             "NucleotideSequence",
                         )
                     ],
-                    "This is a sequence error",
+                    processedFields=[
+                        AnnotationSource(
+                            list(mock_sequences["alignedNucleotideSequences"].keys())[0],
+                            "NucleotideSequence",
+                        )
+                    ],
+                    message="This is a sequence error",
                 ),
             ]
 
@@ -161,17 +166,24 @@
         if addWarnings and not disable_randomly:
             updated_sequence.warnings = [
                 ProcessingAnnotation(
-                    [AnnotationSource(list(metadata.keys())[0], "Metadata")],
-                    "This is a metadata warning",
+                    unprocessedFields=[AnnotationSource(list(metadata.keys())[0], "Metadata")],
+                    processedFields=[AnnotationSource(list(metadata.keys())[0], "Metadata")],
+                    message="This is a metadata warning",
                 ),
                 ProcessingAnnotation(
-                    [
+                    unprocessedFields=[
+                        AnnotationSource(
+                            list(mock_sequences["alignedNucleotideSequences"].keys())[0],
+                            "NucleotideSequence",
+                        )
+                    ],
+                    processedFields=[
                         AnnotationSource(
                             list(mock_sequences["alignedNucleotideSequences"].keys())[0],
                             "NucleotideSequence",
                         )
                     ],
-                    "This is a sequence warning",
+                    message="This is a sequence warning",
                 ),
             ]
 
@@ -180,9 +192,11 @@
     return processed
 
 
-def submit_processed_sequences(processed: List[Sequence]):
+def submit_processed_sequences(processed: list[Sequence]):
+    logging.info("Submitting %d processed sequences", len(processed))
     json_strings = [json.dumps(dataclasses.asdict(sequence)) for sequence in processed]
     ndjson_string = "\n".join(json_strings)
+    logging.info(ndjson_string)
     url = backendHost + "/submit-processed-data?pipelineVersion=" + str(pipeline_version)
     headers = {"Content-Type": "application/x-ndjson", "Authorization": "Bearer " + get_jwt()}
     response = requests.post(url, data=ndjson_string, headers=headers)
diff --git a/preprocessing/nextclade/src/loculus_preprocessing/datatypes.py b/preprocessing/nextclade/src/loculus_preprocessing/datatypes.py
index aa08c273a2..1fb81ad61f 100644
--- a/preprocessing/nextclade/src/loculus_preprocessing/datatypes.py
+++ b/preprocessing/nextclade/src/loculus_preprocessing/datatypes.py
@@ -36,14 +36,16 @@ def __hash__(self):
 
 @dataclass(frozen=True)
 class ProcessingAnnotation:
-    source: tuple[AnnotationSource, ...]
+    unprocessedFields: tuple[AnnotationSource, ...]  # noqa: N815
+    processedFields: tuple[AnnotationSource, ...]
# noqa: N815 message: str def __post_init__(self): - object.__setattr__(self, "source", tuple(self.source)) + object.__setattr__(self, "unprocessedFields", tuple(self.unprocessedFields)) + object.__setattr__(self, "processedFields", tuple(self.processedFields)) def __hash__(self): - return hash((self.source, self.message)) + return hash((self.unprocessedFields, self.processedFields, self.message)) @dataclass diff --git a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py index 1f659db777..d518581ebe 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py @@ -43,6 +43,8 @@ from .processing_functions import ProcessingFunctions, format_frameshift, format_stop_codon from .sequence_checks import errors_if_non_iupac +logger = logging.getLogger(__name__) + # https://stackoverflow.com/questions/15063936 csv.field_size_limit(sys.maxsize) @@ -85,7 +87,7 @@ def parse_nextclade_tsv( if gene in config.genes: amino_acid_insertions[id][gene].append(val) else: - logging.debug( + logger.debug( "Note: Nextclade found AA insertion in gene missing from config in gene " f"{gene}: {val}" ) @@ -168,7 +170,15 @@ def enrich_with_nextclade( # noqa: C901, PLR0912, PLR0914, PLR0915 error_dict[id] = error_dict.get(id, []) error_dict[id].append( ProcessingAnnotation( - source=( + unprocessedFields=( + ( + AnnotationSource( + name=segment, + type=AnnotationSourceType.NUCLEOTIDE_SEQUENCE, + ), + ) + ), + processedFields=( ( AnnotationSource( name=segment, @@ -195,12 +205,20 @@ def enrich_with_nextclade( # noqa: C901, PLR0912, PLR0914, PLR0915 error_dict[id] = error_dict.get(id, []) error_dict[id].append( ProcessingAnnotation( - source=( + unprocessedFields=( AnnotationSource( name="main", type=AnnotationSourceType.NUCLEOTIDE_SEQUENCE, ), ), + processedFields=( + ( + AnnotationSource( + name="main", + type=AnnotationSourceType.NUCLEOTIDE_SEQUENCE, + ), + ) + ), message=( "Found unknown segments in the input data - " "check your segments are annotated correctly." 
@@ -243,7 +261,7 @@ def enrich_with_nextclade( # noqa: C901, PLR0912, PLR0914, PLR0915 "--", input_file, ] - logging.debug(f"Running nextclade: {command}") + logger.debug(f"Running nextclade: {command}") # TODO: Capture stderr and log at DEBUG level exit_code = subprocess.run(command, check=False).returncode # noqa: S603 @@ -251,7 +269,7 @@ def enrich_with_nextclade( # noqa: C901, PLR0912, PLR0914, PLR0915 msg = f"nextclade failed with exit code {exit_code}" raise Exception(msg) - logging.debug("Nextclade results available in %s", result_dir) + logger.debug("Nextclade results available in %s", result_dir) # Add aligned sequences to aligned_nucleotide_sequences # Modifies aligned_nucleotide_sequences in place @@ -272,7 +290,7 @@ def enrich_with_nextclade( # noqa: C901, PLR0912, PLR0914, PLR0915 aligned_aminoacid_sequences[sequence_id][gene] = masked_sequence except FileNotFoundError: # TODO: Add warning to each sequence - logging.info( + logger.info( f"Gene {gene} not found in Nextclade results expected at: { translation_path}" ) @@ -389,9 +407,15 @@ def add_input_metadata( ) errors.append( ProcessingAnnotation( - source=( + unprocessedFields=( + AnnotationSource( + name=segment, + type=AnnotationSourceType.NUCLEOTIDE_SEQUENCE, + ), + ), + processedFields=( AnnotationSource( - name="segment", + name=segment, type=AnnotationSourceType.NUCLEOTIDE_SEQUENCE, ), ), @@ -409,7 +433,13 @@ def add_input_metadata( ) errors.append( ProcessingAnnotation( - source=( + unprocessedFields=( + AnnotationSource( + name=segment, + type=AnnotationSourceType.NUCLEOTIDE_SEQUENCE, + ), + ), + processedFields=( AnnotationSource( name=segment, type=AnnotationSourceType.NUCLEOTIDE_SEQUENCE, @@ -431,7 +461,7 @@ def add_input_metadata( try: result = format_frameshift(result) except Exception: - logging.error( + logger.error( "Was unable to format frameshift - this is likely an internal error" ) result = None @@ -439,7 +469,7 @@ def add_input_metadata( try: result = format_stop_codon(result) except Exception: - logging.error( + logger.error( "Was unable to format stop codon - this is likely an internal error" ) result = None @@ -459,16 +489,19 @@ def get_metadata( # noqa: PLR0913, PLR0917 warnings: list[ProcessingAnnotation], ) -> ProcessingResult: input_data: InputMetadata = {} + input_fields: list[str] = [] if isinstance(unprocessed, UnprocessedData): metadata = unprocessed.metadata for arg_name, input_path in spec.inputs.items(): input_data[arg_name] = metadata.get(input_path) + input_fields.append(input_path) args = spec.args args["submitter"] = unprocessed.submitter else: for arg_name, input_path in spec.inputs.items(): input_data[arg_name] = add_input_metadata(spec, unprocessed, errors, input_path) + input_fields.append(input_path) args = spec.args args["submitter"] = unprocessed.inputMetadata["submitter"] @@ -483,6 +516,7 @@ def get_metadata( # noqa: PLR0913, PLR0917 args, input_data, output_field, + input_fields, ) except Exception as e: msg = f"Processing for spec: {spec} with input data: {input_data} failed with {e}" @@ -558,12 +592,18 @@ def process_single( # noqa: C901 elif not any(unprocessed.unalignedNucleotideSequences.values()): errors.append( ProcessingAnnotation( - source=[ + unprocessedFields=[ AnnotationSource( name="main", type=AnnotationSourceType.NUCLEOTIDE_SEQUENCE, ) ], + processedFields=( + AnnotationSource( + name="main", + type=AnnotationSourceType.NUCLEOTIDE_SEQUENCE, + ), + ), message="No sequence data found - check segments are annotated correctly", ) ) @@ -626,7 +666,13 @@ def 
process_single( # noqa: C901 ): errors.append( ProcessingAnnotation( - source=[ + unprocessedFields=[ + AnnotationSource( + name=input, + type=AnnotationSourceType.METADATA, + ) for input in spec.inputs.values() + ], + processedFields=[ AnnotationSource( name=output_field, type=AnnotationSourceType.METADATA, @@ -635,7 +681,7 @@ def process_single( # noqa: C901 message=(f"Metadata field {output_field} is required."), ) ) - logging.debug(f"Processed {id}: {output_metadata}") + logger.debug(f"Processed {id}: {output_metadata}") if isinstance(unprocessed, UnprocessedData): return processed_entry_no_alignment( @@ -672,7 +718,12 @@ def processed_entry_with_errors(id): ), errors=[ ProcessingAnnotation( - source=[AnnotationSource(name="unknown", type=AnnotationSourceType.METADATA)], + unprocessedFields=[ + AnnotationSource(name="unknown", type=AnnotationSourceType.METADATA) + ], + processedFields=[ + AnnotationSource(name="unknown", type=AnnotationSourceType.METADATA) + ], message=( f"Failed to process submission with id: {id} - please review your submission " "or reach out to an administrator if this error persists." @@ -693,7 +744,7 @@ def process_all( try: processed_single = process_single(id, result, config) except Exception as e: - logging.error(f"Processing failed for {id} with error: {e}") + logger.error(f"Processing failed for {id} with error: {e}") processed_single = processed_entry_with_errors(id) processed_results.append(processed_single) else: @@ -701,7 +752,7 @@ def process_all( try: processed_single = process_single(entry.accessionVersion, entry.data, config) except Exception as e: - logging.error(f"Processing failed for {id} with error: {e}") + logger.error(f"Processing failed for {id} with error: {e}") processed_single = processed_entry_with_errors(id) processed_results.append(processed_single) @@ -736,11 +787,11 @@ def download_nextclade_dataset(dataset_dir: str, config: Config) -> None: if config.nextclade_dataset_tag is not None: dataset_download_command.append(f"--tag={config.nextclade_dataset_tag}") - logging.info("Downloading Nextclade dataset: %s", dataset_download_command) + logger.info("Downloading Nextclade dataset: %s", dataset_download_command) if subprocess.run(dataset_download_command, check=False).returncode != 0: # noqa: S603 msg = "Dataset download failed" raise RuntimeError(msg) - logging.info("Nextclade dataset downloaded successfully") + logger.info("Nextclade dataset downloaded successfully") def run(config: Config) -> None: @@ -751,7 +802,7 @@ def run(config: Config) -> None: etag = None last_force_refresh = time.time() while True: - logging.debug("Fetching unprocessed sequences") + logger.debug("Fetching unprocessed sequences") # Reset etag every hour just in case if last_force_refresh + 3600 < time.time(): etag = None @@ -759,7 +810,7 @@ def run(config: Config) -> None: etag, unprocessed = fetch_unprocessed_sequences(etag, config) if not unprocessed: # sleep 1 sec and try again - logging.debug("No unprocessed sequences found. Sleeping for 1 second.") + logger.debug("No unprocessed sequences found. Sleeping for 1 second.") time.sleep(1) continue # Don't use etag if we just got data @@ -768,14 +819,14 @@ def run(config: Config) -> None: try: processed = process_all(unprocessed, dataset_dir, config) except Exception as e: - logging.exception( + logger.exception( f"Processing failed. Traceback : {e}. 
Unprocessed data: {unprocessed}" ) continue try: submit_processed_sequences(processed, dataset_dir, config) except RuntimeError as e: - logging.exception("Submitting processed data failed. Traceback : %s", e) + logger.exception("Submitting processed data failed. Traceback : %s", e) continue total_processed += len(processed) - logging.info("Processed %s sequences", len(processed)) + logger.info("Processed %s sequences", len(processed)) diff --git a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py index 1809de9eb8..433b2cd4c5 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py @@ -41,9 +41,10 @@ def standardize_option(option): return " ".join(option.lower().split()) -def invalid_value_annotation(input_datum, output_field, value_type) -> ProcessingAnnotation: +def invalid_value_annotation(input_datum, output_field, input_fields, value_type) -> ProcessingAnnotation: return ProcessingAnnotation( - source=[AnnotationSource(name=output_field, type=AnnotationSourceType.METADATA)], + processedFields=[AnnotationSource(name=output_field, type=AnnotationSourceType.METADATA)], + unprocessedFields=[AnnotationSource(name=field, type=AnnotationSourceType.METADATA) for field in input_fields], message=f"Invalid {value_type} value: {input_datum} for field {output_field}.", ) @@ -92,6 +93,7 @@ def call_function( args: FunctionArgs, input_data: InputMetadata, output_field: str, + input_fields: list[str], ) -> ProcessingResult: if not hasattr(cls, function_name): msg = ( @@ -101,7 +103,7 @@ def call_function( raise ValueError(msg) func = getattr(cls, function_name) try: - result = func(input_data, output_field, args=args) + result = func(input_data, output_field, input_fields=input_fields, args=args) except Exception as e: message = ( f"Error calling function {function_name} for output field {output_field} " @@ -113,9 +115,13 @@ def call_function( warnings=[], errors=[ ProcessingAnnotation( - source=[ + processedFields=[ AnnotationSource(name=output_field, type=AnnotationSourceType.METADATA) ], + unprocessedFields=[ + AnnotationSource(name=field, type=AnnotationSourceType.METADATA) + for field in input_fields + ], message=( f"Internal Error: Function {function_name} did not return " f"ProcessingResult with input {input_data} and args {args}, " @@ -135,9 +141,13 @@ def call_function( warnings=[], errors=[ ProcessingAnnotation( - source=[ + processedFields=[ AnnotationSource(name=output_field, type=AnnotationSourceType.METADATA) ], + unprocessedFields=[ + AnnotationSource(name=field, type=AnnotationSourceType.METADATA) + for field in input_fields + ], message=( f"Internal Error: Function {function_name} did not return " f"ProcessingResult with input {input_data} and args {args}, " @@ -152,6 +162,7 @@ def call_function( def check_date( input_data: InputMetadata, output_field: str, + input_fields: list[str], args: FunctionArgs = None, # args is essential - even if Pylance says it's not used ) -> ProcessingResult: """Check that date is complete YYYY-MM-DD @@ -175,9 +186,13 @@ def check_date( if parsed_date > datetime.now(tz=pytz.utc): warnings.append( ProcessingAnnotation( - source=[ + processedFields=[ AnnotationSource(name=output_field, type=AnnotationSourceType.METADATA) ], + unprocessedFields=[ + AnnotationSource(name=field, type=AnnotationSourceType.METADATA) + for field in input_fields + ], 
message="Date is in the future.", ) ) @@ -192,9 +207,13 @@ def check_date( warnings=warnings, errors=[ ProcessingAnnotation( - source=[ + processedFields=[ AnnotationSource(name=output_field, type=AnnotationSourceType.METADATA) ], + unprocessedFields=[ + AnnotationSource(name=field, type=AnnotationSourceType.METADATA) + for field in input_fields + ], message=error_message, ) ], @@ -204,6 +223,7 @@ def check_date( def parse_date_into_range( input_data: InputMetadata, output_field: str, + input_fields: list[str], args: FunctionArgs = None, # args is essential - even if Pylance says it's not used ) -> ProcessingResult: """Parse date string (`input.date`) formatted as one of YYYY | YYYY-MM | YYYY-MM-DD into a range using upper bound (`input.releaseDate`) @@ -220,7 +240,7 @@ def parse_date_into_range( release_date_str = input_data.get("releaseDate", "") or "" try: - release_date = dateutil.parse(release_date_str).astimezone(pytz.utc) + release_date = dateutil.parse(release_date_str).replace(tzinfo=pytz.utc) except Exception: release_date = None @@ -293,9 +313,13 @@ class DateRange: if message: warnings.append( ProcessingAnnotation( - source=[ + processedFields=[ AnnotationSource(name=output_field, type=AnnotationSourceType.METADATA) ], + unprocessedFields=[ + AnnotationSource(name=field, type=AnnotationSourceType.METADATA) + for field in input_fields + ], message=f"Metadata field {output_field}:'{input_date_str}' - " + message, ) ) @@ -306,9 +330,13 @@ class DateRange: ) errors.append( ProcessingAnnotation( - source=[ + processedFields=[ AnnotationSource(name=output_field, type=AnnotationSourceType.METADATA) ], + unprocessedFields=[ + AnnotationSource(name=field, type=AnnotationSourceType.METADATA) + for field in input_fields + ], message=f"Metadata field {output_field}:'{input_date_str}' is in the future.", ) ) @@ -317,9 +345,13 @@ class DateRange: logger.debug(f"Lower range of date: {parsed_date} > release_date: {release_date}") errors.append( ProcessingAnnotation( - source=[ + processedFields=[ AnnotationSource(name=output_field, type=AnnotationSourceType.METADATA) ], + unprocessedFields=[ + AnnotationSource(name=field, type=AnnotationSourceType.METADATA) + for field in input_fields + ], message=( f"Metadata field {output_field}:'{input_date_str}'" "is after release date." @@ -348,9 +380,13 @@ class DateRange: warnings=[], errors=[ ProcessingAnnotation( - source=[ + processedFields=[ AnnotationSource(name=output_field, type=AnnotationSourceType.METADATA) ], + unprocessedFields=[ + AnnotationSource(name=field, type=AnnotationSourceType.METADATA) + for field in input_fields + ], message=f"Metadata field {output_field}: Date {input_date_str} could not be parsed.", ) ], @@ -360,6 +396,7 @@ class DateRange: def parse_and_assert_past_date( # noqa: C901 input_data: InputMetadata, output_field, + input_fields: list[str], args: FunctionArgs = None, # args is essential - even if Pylance says it's not used ) -> ProcessingResult: """Parse date string. 
If it's incomplete, add 01-01, if no year, return null and error @@ -409,11 +446,15 @@ def parse_and_assert_past_date( # noqa: C901 if message: warnings.append( ProcessingAnnotation( - source=[ + processedFields=[ AnnotationSource( name=output_field, type=AnnotationSourceType.METADATA ) ], + unprocessedFields=[ + AnnotationSource(name=field, type=AnnotationSourceType.METADATA) + for field in input_fields + ], message=f"Metadata field {output_field}:'{date_str}' - " + message, ) ) @@ -422,11 +463,15 @@ def parse_and_assert_past_date( # noqa: C901 logger.debug(f"parsed_date: {parsed_date} > {datetime.now(tz=pytz.utc)}") errors.append( ProcessingAnnotation( - source=[ + processedFields=[ AnnotationSource( name=output_field, type=AnnotationSourceType.METADATA ) ], + unprocessedFields=[ + AnnotationSource(name=field, type=AnnotationSourceType.METADATA) + for field in input_fields + ], message=f"Metadata field {output_field}:'{date_str}' is in the future.", ) ) @@ -435,11 +480,15 @@ def parse_and_assert_past_date( # noqa: C901 logger.debug(f"parsed_date: {parsed_date} > release_date: {release_date}") errors.append( ProcessingAnnotation( - source=[ + processedFields=[ AnnotationSource( name=output_field, type=AnnotationSourceType.METADATA ) ], + unprocessedFields=[ + AnnotationSource(name=field, type=AnnotationSourceType.METADATA) + for field in input_fields + ], message=( f"Metadata field {output_field}:'{date_str}'" "is after release date." @@ -457,9 +506,13 @@ def parse_and_assert_past_date( # noqa: C901 warnings=[], errors=[ ProcessingAnnotation( - source=[ + processedFields=[ AnnotationSource(name=output_field, type=AnnotationSourceType.METADATA) ], + unprocessedFields=[ + AnnotationSource(name=field, type=AnnotationSourceType.METADATA) + for field in input_fields + ], message=f"Metadata field {output_field}: Date format is not recognized.", ) ], @@ -469,6 +522,7 @@ def parse_and_assert_past_date( # noqa: C901 def parse_timestamp( input_data: InputMetadata, output_field: str, + input_fields: list[str], args: FunctionArgs = None, # args is essential - even if Pylance says it's not used ) -> ProcessingResult: """Parse a timestamp string, e.g. 2022-11-01T00:00:00Z and return a YYYY-MM-DD string""" @@ -500,9 +554,13 @@ def parse_timestamp( datum=None, errors=[ ProcessingAnnotation( - source=[ + processedFields=[ AnnotationSource(name=output_field, type=AnnotationSourceType.METADATA) ], + unprocessedFields=[ + AnnotationSource(name=field, type=AnnotationSourceType.METADATA) + for field in input_fields + ], message=error_message, ) ], @@ -511,7 +569,10 @@ def parse_timestamp( @staticmethod def concatenate( - input_data: InputMetadata, output_field: str, args: FunctionArgs = None + input_data: InputMetadata, + output_field: str, + input_fields: list[str], + args: FunctionArgs = None, ) -> ProcessingResult: """Concatenates input fields with accession_version using the "/" separator in the order specified by the order argument. @@ -533,9 +594,13 @@ def concatenate( ) errors.append( ProcessingAnnotation( - source=[ + processedFields=[ AnnotationSource(name=output_field, type=AnnotationSourceType.METADATA) ], + unprocessedFields=[ + AnnotationSource(name=field, type=AnnotationSourceType.METADATA) + for field in input_fields + ], message="Concatenation failed." 
"This may be a configuration error, please contact the administrator.", ) @@ -551,14 +616,14 @@ def concatenate( for i in range(len(order)): if type[i] == "date": processed = ProcessingFunctions.parse_and_assert_past_date( - {"date": input_data[order[i]]}, output_field + {"date": input_data[order[i]]}, output_field, input_fields ) formatted_input_data.append("" if processed.datum is None else processed.datum) errors += processed.errors warnings += processed.warnings elif type[i] == "timestamp": processed = ProcessingFunctions.parse_timestamp( - {"timestamp": input_data[order[i]]}, output_field + {"timestamp": input_data[order[i]]}, output_field, input_fields ) formatted_input_data.append("" if processed.datum is None else processed.datum) errors += processed.errors @@ -581,9 +646,13 @@ def concatenate( logging.error(f"Concatenate failed with {e} (accession_version: {accession_version})") errors.append( ProcessingAnnotation( - source=[ + processedFields=[ AnnotationSource(name=output_field, type=AnnotationSourceType.METADATA) ], + unprocessedFields=[ + AnnotationSource(name=field, type=AnnotationSourceType.METADATA) + for field in input_fields + ], message=( f"Concatenation failed for {output_field}. This is a technical error, " "please contact the administrator." @@ -598,7 +667,10 @@ def concatenate( @staticmethod def check_authors( - input_data: InputMetadata, output_field: str, args: FunctionArgs = None + input_data: InputMetadata, + output_field: str, + input_fields: list[str], + args: FunctionArgs = None, ) -> ProcessingResult: authors = input_data["authors"] @@ -630,9 +702,13 @@ def check_authors( datum=None, errors=[ ProcessingAnnotation( - source=[ + processedFields=[ AnnotationSource(name=output_field, type=AnnotationSourceType.METADATA) ], + unprocessedFields=[ + AnnotationSource(name=field, type=AnnotationSourceType.METADATA) + for field in input_fields + ], message=error_message, ) ], @@ -647,9 +723,13 @@ def check_authors( ) warnings = [ ProcessingAnnotation( - source=[ + processedFields=[ AnnotationSource(name=output_field, type=AnnotationSourceType.METADATA) ], + unprocessedFields=[ + AnnotationSource(name=field, type=AnnotationSourceType.METADATA) + for field in input_fields + ], message=warning_message, ) ] @@ -672,9 +752,13 @@ def check_authors( datum=None, errors=[ ProcessingAnnotation( - source=[ + processedFields=[ AnnotationSource(name=output_field, type=AnnotationSourceType.METADATA) ], + unprocessedFields=[ + AnnotationSource(name=field, type=AnnotationSourceType.METADATA) + for field in input_fields + ], message=error_message, ) ], @@ -683,7 +767,7 @@ def check_authors( @staticmethod def identity( # noqa: C901, PLR0912 - input_data: InputMetadata, output_field: str, args: FunctionArgs = None + input_data: InputMetadata, output_field: str, input_fields: list[str], args: FunctionArgs = None ) -> ProcessingResult: """Identity function, takes input_data["input"] and returns it as output""" if "input" not in input_data: @@ -692,9 +776,13 @@ def identity( # noqa: C901, PLR0912 warnings=[], errors=[ ProcessingAnnotation( - source=[ + processedFields=[ AnnotationSource(name=output_field, type=AnnotationSourceType.METADATA) ], + unprocessedFields=[ + AnnotationSource(name=field, type=AnnotationSourceType.METADATA) + for field in input_fields + ], message=f"No data found for output field: {output_field}", ) ], @@ -712,13 +800,13 @@ def identity( # noqa: C901, PLR0912 output_datum = int(input_datum) except ValueError: output_datum = None - 
errors.append(invalid_value_annotation(input_datum, output_field, "int")) + errors.append(invalid_value_annotation(input_datum, output_field, input_fields, "int")) case "float": try: output_datum = float(input_datum) except ValueError: output_datum = None - errors.append(invalid_value_annotation(input_datum, output_field, "float")) + errors.append(invalid_value_annotation(input_datum, output_field, input_fields, "float")) case "boolean": if input_datum.lower() == "true": output_datum = True @@ -727,7 +815,7 @@ def identity( # noqa: C901, PLR0912 else: output_datum = None errors.append( - invalid_value_annotation(input_datum, output_field, "boolean") + invalid_value_annotation(input_datum, output_field, input_fields, "boolean") ) case _: output_datum = input_datum @@ -737,7 +825,7 @@ def identity( # noqa: C901, PLR0912 @staticmethod def process_options( - input_data: InputMetadata, output_field: str, args: FunctionArgs = None + input_data: InputMetadata, output_field: str, input_fields: list[str], args: FunctionArgs = None ) -> ProcessingResult: """Checks that option is in options""" if "options" not in args: @@ -746,9 +834,13 @@ def process_options( warnings=[], errors=[ ProcessingAnnotation( - source=[ + processedFields=[ AnnotationSource(name=output_field, type=AnnotationSourceType.METADATA) ], + unprocessedFields=[ + AnnotationSource(name=field, type=AnnotationSourceType.METADATA) + for field in input_fields + ], message=( "Website configuration error: no options specified for field " f"{output_field}, please contact an administrator.", @@ -777,9 +869,13 @@ def process_options( datum=input_datum, warnings=[ ProcessingAnnotation( - source=[ + processedFields=[ AnnotationSource(name=output_field, type=AnnotationSourceType.METADATA) ], + unprocessedFields=[ + AnnotationSource(name=field, type=AnnotationSourceType.METADATA) + for field in input_fields + ], message=error_msg, ) ], @@ -791,9 +887,13 @@ def process_options( warnings=[], errors=[ ProcessingAnnotation( - source=[ + processedFields=[ AnnotationSource(name=output_field, type=AnnotationSourceType.METADATA) ], + unprocessedFields=[ + AnnotationSource(name=field, type=AnnotationSourceType.METADATA) + for field in input_fields + ], message=error_msg, ) ], diff --git a/preprocessing/nextclade/src/loculus_preprocessing/sequence_checks.py b/preprocessing/nextclade/src/loculus_preprocessing/sequence_checks.py index 9713ec194a..47616bfb32 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/sequence_checks.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/sequence_checks.py @@ -35,7 +35,12 @@ def errors_if_non_iupac( if non_iupac_symbols: errors.append( ProcessingAnnotation( - source=[ + unprocessedFields=[ + AnnotationSource( + name=segment, type=AnnotationSourceType.NUCLEOTIDE_SEQUENCE + ) + ], + processedFields=[ AnnotationSource( name=segment, type=AnnotationSourceType.NUCLEOTIDE_SEQUENCE ) diff --git a/preprocessing/nextclade/tests/factory_methods.py b/preprocessing/nextclade/tests/factory_methods.py index 13b03b8b97..bdbc6a2a12 100644 --- a/preprocessing/nextclade/tests/factory_methods.py +++ b/preprocessing/nextclade/tests/factory_methods.py @@ -19,8 +19,14 @@ class ProcessingTestCase: @dataclass -class UnprocessedEntryFactory: +class ProcessingAnnotationTestCase: + unprocessedFieldsName: list[str] # noqa: N815 + processedFieldsName: list[str] # noqa: N815 + message: str + +@dataclass +class UnprocessedEntryFactory: @staticmethod def create_unprocessed_entry( metadata_dict: dict[str, str], @@ -48,8 +54,8 @@ def 
create_processed_entry( self, metadata_dict: dict[str, str], accession: str, - metadata_errors: list[tuple[str, str]] | None = None, - metadata_warnings: list[tuple[str, str]] | None = None, + metadata_errors: list[ProcessingAnnotationTestCase] | None = None, + metadata_warnings: list[ProcessingAnnotationTestCase] | None = None, ) -> ProcessedEntry: if metadata_errors is None: metadata_errors = [] @@ -72,15 +78,33 @@ def create_processed_entry( ), errors=[ ProcessingAnnotation( - source=[AnnotationSource(name=error[0], type=AnnotationSourceType.METADATA)], - message=error[1], + unprocessedFields=[ + AnnotationSource( + name=field, + type=AnnotationSourceType.METADATA, + ) for field in error.unprocessedFieldsName + ], + processedFields=[ + AnnotationSource(name=field, type=AnnotationSourceType.METADATA) + for field in error.processedFieldsName + ], + message=error.message, ) for error in metadata_errors ], warnings=[ ProcessingAnnotation( - source=[AnnotationSource(name=warning[0], type=AnnotationSourceType.METADATA)], - message=warning[1], + unprocessedFields=[ + AnnotationSource( + name=field, + type=AnnotationSourceType.METADATA, + ) for field in warning.unprocessedFieldsName + ], + processedFields=[ + AnnotationSource(name=field, type=AnnotationSourceType.METADATA) + for field in warning.processedFieldsName + ], + message=warning.message, ) for warning in metadata_warnings ], diff --git a/preprocessing/nextclade/tests/test_config.yaml b/preprocessing/nextclade/tests/test_config.yaml index 526d2337d4..237997cafb 100644 --- a/preprocessing/nextclade/tests/test_config.yaml +++ b/preprocessing/nextclade/tests/test_config.yaml @@ -64,7 +64,7 @@ processing_spec: type: date function: parse_and_assert_past_date inputs: - date: required_collection_date + date: ncbi_required_collection_date required: true concatenated_string: function: concatenate @@ -73,7 +73,7 @@ processing_spec: type: [string, string, date] inputs: continent: continent - required_collection_date: required_collection_date + required_collection_date: ncbi_required_collection_date authors: function: check_authors inputs: diff --git a/preprocessing/nextclade/tests/test_processing_functions.py b/preprocessing/nextclade/tests/test_processing_functions.py index 3763a5585e..853d9c891a 100644 --- a/preprocessing/nextclade/tests/test_processing_functions.py +++ b/preprocessing/nextclade/tests/test_processing_functions.py @@ -2,7 +2,12 @@ from dataclasses import dataclass import pytest -from factory_methods import ProcessedEntryFactory, ProcessingTestCase, UnprocessedEntryFactory +from factory_methods import ( + ProcessedEntryFactory, + ProcessingAnnotationTestCase, + ProcessingTestCase, + UnprocessedEntryFactory, +) from loculus_preprocessing.config import Config, get_config from loculus_preprocessing.datatypes import ProcessedEntry, ProcessingAnnotation @@ -24,8 +29,8 @@ class Case: name: str metadata: dict[str, str] expected_metadata: dict[str, str] - expected_errors: list[tuple[str, str]] - expected_warnings: list[tuple[str, str]] = None + expected_errors: list[ProcessingAnnotationTestCase] + expected_warnings: list[ProcessingAnnotationTestCase] = None accession_id: str = "000999" def create_test_case(self, factory_custom: ProcessedEntryFactory) -> ProcessingTestCase: @@ -51,9 +56,12 @@ def create_test_case(self, factory_custom: ProcessedEntryFactory) -> ProcessingT accession_id="0", expected_metadata={"concatenated_string": "LOC_0.1"}, expected_errors=[ - ("name_required", "Metadata field name_required is required."), - ( - 
"required_collection_date", + ProcessingAnnotationTestCase( + ["name_required"], ["name_required"], "Metadata field name_required is required." + ), + ProcessingAnnotationTestCase( + ["ncbi_required_collection_date"], + ["required_collection_date"], "Metadata field required_collection_date is required.", ), ], @@ -64,8 +72,9 @@ def create_test_case(self, factory_custom: ProcessedEntryFactory) -> ProcessingT accession_id="1", expected_metadata={"name_required": "name", "concatenated_string": "LOC_1.1"}, expected_errors=[ - ( - "required_collection_date", + ProcessingAnnotationTestCase( + ["ncbi_required_collection_date"], + ["required_collection_date"], "Metadata field required_collection_date is required.", ), ], @@ -76,7 +85,7 @@ def create_test_case(self, factory_custom: ProcessedEntryFactory) -> ProcessingT "submissionId": "invalid_option", "continent": "Afrika", "name_required": "name", - "required_collection_date": "2022-11-01", + "ncbi_required_collection_date": "2022-11-01", }, accession_id="2", expected_metadata={ @@ -85,8 +94,9 @@ def create_test_case(self, factory_custom: ProcessedEntryFactory) -> ProcessingT "concatenated_string": "Afrika/LOC_2.1/2022-11-01", }, expected_errors=[ - ( - "continent", + ProcessingAnnotationTestCase( + ["continent"], + ["continent"], "Metadata field continent:'Afrika' - not in list of accepted options.", ), ], @@ -97,7 +107,7 @@ def create_test_case(self, factory_custom: ProcessedEntryFactory) -> ProcessingT "submissionId": "collection_date_in_future", "collection_date": "2088-12-01", "name_required": "name", - "required_collection_date": "2022-11-01", + "ncbi_required_collection_date": "2022-11-01", }, accession_id="3", expected_metadata={ @@ -107,8 +117,9 @@ def create_test_case(self, factory_custom: ProcessedEntryFactory) -> ProcessingT "concatenated_string": "LOC_3.1/2022-11-01", }, expected_errors=[ - ( - "collection_date", + ProcessingAnnotationTestCase( + ["collection_date"], + ["collection_date"], "Metadata field collection_date:'2088-12-01' is in the future.", ), ], @@ -119,7 +130,7 @@ def create_test_case(self, factory_custom: ProcessedEntryFactory) -> ProcessingT "submissionId": "invalid_collection_date", "collection_date": "01-02-2024", "name_required": "name", - "required_collection_date": "2022-11-01", + "ncbi_required_collection_date": "2022-11-01", }, accession_id="4", expected_metadata={ @@ -128,8 +139,9 @@ def create_test_case(self, factory_custom: ProcessedEntryFactory) -> ProcessingT "concatenated_string": "LOC_4.1/2022-11-01", }, expected_errors=[ - ( - "collection_date", + ProcessingAnnotationTestCase( + ["collection_date"], + ["collection_date"], "Metadata field collection_date: Date format is not recognized.", ), ], @@ -140,7 +152,7 @@ def create_test_case(self, factory_custom: ProcessedEntryFactory) -> ProcessingT "submissionId": "invalid_timestamp", "sequenced_timestamp": " 2022-11-01Europe", "name_required": "name", - "required_collection_date": "2022-11-01", + "ncbi_required_collection_date": "2022-11-01", }, accession_id="5", expected_metadata={ @@ -149,8 +161,9 @@ def create_test_case(self, factory_custom: ProcessedEntryFactory) -> ProcessingT "concatenated_string": "LOC_5.1/2022-11-01", }, expected_errors=[ - ( - "sequenced_timestamp", + ProcessingAnnotationTestCase( + ["sequenced_timestamp"], + ["sequenced_timestamp"], ( "Timestamp is 2022-11-01Europe which is not in parseable YYYY-MM-DD. 
" "Parsing error: Unknown string format: 2022-11-01Europe" @@ -164,7 +177,7 @@ def create_test_case(self, factory_custom: ProcessedEntryFactory) -> ProcessingT "submissionId": "date_only_year", "collection_date": "2023", "name_required": "name", - "required_collection_date": "2022-11-01", + "ncbi_required_collection_date": "2022-11-01", }, accession_id="6", expected_metadata={ @@ -175,8 +188,9 @@ def create_test_case(self, factory_custom: ProcessedEntryFactory) -> ProcessingT }, expected_errors=[], expected_warnings=[ - ( - "collection_date", + ProcessingAnnotationTestCase( + ["collection_date"], + ["collection_date"], ( "Metadata field collection_date:'2023' - Month and day are missing. " "Assuming January 1st." @@ -190,7 +204,7 @@ def create_test_case(self, factory_custom: ProcessedEntryFactory) -> ProcessingT "submissionId": "date_no_day", "collection_date": "2023-12", "name_required": "name", - "required_collection_date": "2022-11-01", + "ncbi_required_collection_date": "2022-11-01", }, accession_id="7", expected_metadata={ @@ -201,8 +215,9 @@ def create_test_case(self, factory_custom: ProcessedEntryFactory) -> ProcessingT }, expected_errors=[], expected_warnings=[ - ( - "collection_date", + ProcessingAnnotationTestCase( + ["collection_date"], + ["collection_date"], "Metadata field collection_date:'2023-12' - Day is missing. Assuming the 1st.", ), ], @@ -213,7 +228,7 @@ def create_test_case(self, factory_custom: ProcessedEntryFactory) -> ProcessingT "submissionId": "invalid_int", "age_int": "asdf", "name_required": "name", - "required_collection_date": "2022-11-01", + "ncbi_required_collection_date": "2022-11-01", }, accession_id="8", expected_metadata={ @@ -222,7 +237,9 @@ def create_test_case(self, factory_custom: ProcessedEntryFactory) -> ProcessingT "concatenated_string": "LOC_8.1/2022-11-01", }, expected_errors=[ - ("age_int", "Invalid int value: asdf for field age_int."), + ProcessingAnnotationTestCase( + ["age_int"], ["age_int"], "Invalid int value: asdf for field age_int." + ), ], ), Case( @@ -231,7 +248,7 @@ def create_test_case(self, factory_custom: ProcessedEntryFactory) -> ProcessingT "submissionId": "invalid_float", "percentage_float": "asdf", "name_required": "name", - "required_collection_date": "2022-11-01", + "ncbi_required_collection_date": "2022-11-01", }, accession_id="9", expected_metadata={ @@ -240,7 +257,11 @@ def create_test_case(self, factory_custom: ProcessedEntryFactory) -> ProcessingT "concatenated_string": "LOC_9.1/2022-11-01", }, expected_errors=[ - ("percentage_float", "Invalid float value: asdf for field percentage_float."), + ProcessingAnnotationTestCase( + ["percentage_float"], + ["percentage_float"], + "Invalid float value: asdf for field percentage_float.", + ), ], ), Case( @@ -249,7 +270,7 @@ def create_test_case(self, factory_custom: ProcessedEntryFactory) -> ProcessingT "submissionId": "invalid_date", "name_required": "name", "other_date": "01-02-2024", - "required_collection_date": "2022-11-01", + "ncbi_required_collection_date": "2022-11-01", }, accession_id="10", expected_metadata={ @@ -258,8 +279,9 @@ def create_test_case(self, factory_custom: ProcessedEntryFactory) -> ProcessingT "concatenated_string": "LOC_10.1/2022-11-01", }, expected_errors=[ - ( - "other_date", + ProcessingAnnotationTestCase( + ["other_date"], + ["other_date"], ( "Date is 01-02-2024 which is not in the required format YYYY-MM-DD. 
" "Parsing error: time data '01-02-2024' does not match format '%Y-%m-%d'" @@ -273,7 +295,7 @@ def create_test_case(self, factory_custom: ProcessedEntryFactory) -> ProcessingT "submissionId": "invalid_boolean", "name_required": "name", "is_lab_host_bool": "maybe", - "required_collection_date": "2022-11-01", + "ncbi_required_collection_date": "2022-11-01", }, accession_id="11", expected_metadata={ @@ -282,7 +304,11 @@ def create_test_case(self, factory_custom: ProcessedEntryFactory) -> ProcessingT "concatenated_string": "LOC_11.1/2022-11-01", }, expected_errors=[ - ("is_lab_host_bool", "Invalid boolean value: maybe for field is_lab_host_bool."), + ProcessingAnnotationTestCase( + ["is_lab_host_bool"], + ["is_lab_host_bool"], + "Invalid boolean value: maybe for field is_lab_host_bool.", + ), ], ), Case( @@ -290,7 +316,7 @@ def create_test_case(self, factory_custom: ProcessedEntryFactory) -> ProcessingT metadata={ "submissionId": "warn_potential_author_error", "name_required": "name", - "required_collection_date": "2022-11-01", + "ncbi_required_collection_date": "2022-11-01", "authors": "Anna Smith, Cameron Tucker", }, accession_id="12", @@ -302,8 +328,9 @@ def create_test_case(self, factory_custom: ProcessedEntryFactory) -> ProcessingT }, expected_errors=[], expected_warnings=[ - ( - "authors", + ProcessingAnnotationTestCase( + ["authors"], + ["authors"], "The authors list 'Anna Smith, Cameron Tucker' might not be using the Loculus format. Please ensure that authors are separated by semi-colons. Each author's name should be in the format 'last name, first name;'. Last name(s) is mandatory, a comma is mandatory to separate first names/initials from last name. Only ASCII alphabetical characters A-Z are allowed. For example: 'Smith, Anna; Perez, Tom J.; Xu, X.L.;' or 'Xu,;' if the first name is unknown.", ), ], @@ -313,7 +340,7 @@ def create_test_case(self, factory_custom: ProcessedEntryFactory) -> ProcessingT metadata={ "submissionId": "non_ascii_authors", "name_required": "name", - "required_collection_date": "2022-11-01", + "ncbi_required_collection_date": "2022-11-01", "authors": "Møller, Anäis; Pérez, José", }, accession_id="13", @@ -323,8 +350,9 @@ def create_test_case(self, factory_custom: ProcessedEntryFactory) -> ProcessingT "concatenated_string": "LOC_13.1/2022-11-01", }, expected_errors=[ - ( - "authors", + ProcessingAnnotationTestCase( + ["authors"], + ["authors"], "The authors list 'Møller, Anäis; Pérez, José' contains non-ASCII characters. Please ensure that authors are separated by semi-colons. Each author's name should be in the format 'last name, first name;'. Last name(s) is mandatory, a comma is mandatory to separate first names/initials from last name. Only ASCII alphabetical characters A-Z are allowed. 
For example: 'Smith, Anna; Perez, Tom J.; Xu, X.L.;' or 'Xu,;' if the first name is unknown.", ), ], @@ -369,7 +397,10 @@ def factory_custom(config): def sort_annotations(annotations: list[ProcessingAnnotation]) -> list[ProcessingAnnotation]: - return sorted(annotations, key=lambda x: (x.source[0].name, x.message)) + return sorted( + annotations, + key=lambda x: (x.unprocessedFields[0].name, x.processedFields[0].name, x.message), + ) def process_single_entry(test_case: ProcessingTestCase, config: Config) -> ProcessedEntry: @@ -486,31 +517,31 @@ def test_format_authors() -> None: def test_parse_date_into_range() -> None: assert ( ProcessingFunctions.parse_date_into_range( - {"date": "2021-12"}, "field_name", {"fieldType": "dateRangeString"} + {"date": "2021-12"}, "field_name", ["field_name"], {"fieldType": "dateRangeString"} ).datum == "2021-12" ), "dateRangeString: 2021-12 should be returned as is." assert ( ProcessingFunctions.parse_date_into_range( - {"date": "2021-12"}, "field_name", {"fieldType": "dateRangeLower"} + {"date": "2021-12"}, "field_name", ["field_name"], {"fieldType": "dateRangeLower"} ).datum == "2021-12-01" ), "dateRangeLower: 2021-12 should be returned as 2021-12-01." assert ( ProcessingFunctions.parse_date_into_range( - {"date": "2021-12"}, "field_name", {"fieldType": "dateRangeUpper"} + {"date": "2021-12"}, "field_name", ["field_name"], {"fieldType": "dateRangeUpper"} ).datum == "2021-12-31" ), "dateRangeUpper: 2021-12 should be returned as 2021-12-31." assert ( ProcessingFunctions.parse_date_into_range( - {"date": "2021-02"}, "field_name", {"fieldType": "dateRangeUpper"} + {"date": "2021-02"}, "field_name", ["field_name"], {"fieldType": "dateRangeUpper"} ).datum == "2021-02-28" ), "dateRangeUpper: 2021-02 should be returned as 2021-02-28." assert ( ProcessingFunctions.parse_date_into_range( - {"date": "2021"}, "field_name", {"fieldType": "dateRangeUpper"} + {"date": "2021"}, "field_name", ["field_name"], {"fieldType": "dateRangeUpper"} ).datum == "2021-12-31" ), "dateRangeUpper: 2021 should be returned as 2021-12-31." @@ -518,6 +549,7 @@ def test_parse_date_into_range() -> None: ProcessingFunctions.parse_date_into_range( {"date": "2021-12", "releaseDate": "2021-12-15"}, "field_name", + ["field_name"], {"fieldType": "dateRangeUpper"}, ).datum == "2021-12-15" @@ -526,19 +558,20 @@ def test_parse_date_into_range() -> None: ProcessingFunctions.parse_date_into_range( {"date": "", "releaseDate": "2021-12-15"}, "field_name", + ["field_name"], {"fieldType": "dateRangeUpper"}, ).datum == "2021-12-15" ), "dateRangeUpper: empty date with releaseDate 2021-12-15 should be returned as 2021-12-15." assert ( ProcessingFunctions.parse_date_into_range( - {"date": ""}, "field_name", {"fieldType": "dateRangeString"} + {"date": ""}, "field_name", ["field_name"], {"fieldType": "dateRangeString"} ).datum is None ), "dateRangeString: empty date should be returned as None." assert ( ProcessingFunctions.parse_date_into_range( - {"date": "not.date"}, "field_name", {"fieldType": "dateRangeString"} + {"date": "not.date"}, "field_name", ["field_name"], {"fieldType": "dateRangeString"} ).datum is None ), "dateRangeString: invalid date should be returned as None." 
@@ -546,6 +579,7 @@ def test_parse_date_into_range() -> None:
         ProcessingFunctions.parse_date_into_range(
             {"date": "", "releaseDate": "2021-12-15"},
             "field_name",
+            ["field_name"],
             {"fieldType": "dateRangeLower"},
         ).datum
         is None
diff --git a/preprocessing/specification.md b/preprocessing/specification.md
index 4e6fbdf893..605ce8c3be 100644
--- a/preprocessing/specification.md
+++ b/preprocessing/specification.md
@@ -97,7 +97,11 @@ The `errors` and `warnings` fields contain an array of objects of the following
 
 ```js
 {
-    source: {
+    unprocessedFields: {
+        type: "Metadata" | "NucleotideSequence",
+        name: string
+    }[],
+    processedFields: {
         type: "Metadata" | "NucleotideSequence",
         name: string
     }[],
@@ -105,9 +109,11 @@ The `errors` and `warnings` fields contain an array of objects of the following
     message: string
 }
 ```
 
-The `source` field specifies the source of the error. It can be empty if the error is very general or if it is not possible to pinpoint a specific source. If the error is caused by the value in a metadata field, the `name` field should contain the name of a metadata field. If a nucleotide sequence caused the error, the `name` field should contain the (segment) name of the nucleotide sequence.
+The `unprocessedFields` array specifies the sources of the error in the unprocessed input. It can be empty if the error is very general or if it is not possible to pinpoint a specific unprocessed source. If the error is caused by the value in a metadata field, the `name` field should contain the name of that metadata field. If a nucleotide sequence caused the error, the `name` field should contain the (segment) name of the nucleotide sequence.
+
+The `processedFields` array works the same way but specifies which fields in the processed output are affected by the error or warning.
 
-The `message` should contain a human-readable message describing the error.
+The `message` should contain a human-readable message describing the error. It may be useful to include the user input in this message.
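+
+For example, an annotation for a date that failed to parse might look like this (a sketch; the field names `ncbiReleaseDate` and `releaseDate` are illustrative and not mandated by this specification):
+
+```js
+{
+    unprocessedFields: [{ type: "Metadata", name: "ncbiReleaseDate" }],
+    processedFields: [{ type: "Metadata", name: "releaseDate" }],
+    message: "Metadata field releaseDate: Date format is not recognized."
+}
+```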
 
 #### Metadata
diff --git a/website/src/components/Edit/DataRow.tsx b/website/src/components/Edit/DataRow.tsx
index f5a9d3ec1a..f978be5409 100644
--- a/website/src/components/Edit/DataRow.tsx
+++ b/website/src/components/Edit/DataRow.tsx
@@ -1,4 +1,5 @@
 import { type FC } from 'react';
+import { Tooltip } from 'react-tooltip';
 
 import { InputField, type KeyValuePair, type Row } from './InputField.tsx';
 import WarningAmberIcon from '~icons/ic/baseline-warning-amber';
@@ -6,20 +7,33 @@
 import DangerousTwoToneIcon from '~icons/ic/twotone-dangerous';
 
 type EditableRowProps = {
     label?: string;
+    inputField: string;
     row: Row;
     onChange: (editedRow: Row) => void;
 };
 
-export const EditableDataRow: FC<EditableRowProps> = ({ label, row, onChange }) => {
+export const EditableDataRow: FC<EditableRowProps> = ({ label, inputField, row, onChange }) => {
     const colorClassName = row.errors.length > 0 ? 'text-red-600' : row.warnings.length > 0 ? 'text-yellow-600' : '';
 
+    const content = `input metadata name: ${inputField}`;
+
     return (
-
-            {label ?? row.key}:
+
+
+
+                {label ?? row.key}:
+
+
+
-
+
diff --git a/website/src/components/Edit/EditPage.spec.tsx b/website/src/components/Edit/EditPage.spec.tsx
index b08a443d30..59e8ae00f7 100644
--- a/website/src/components/Edit/EditPage.spec.tsx
+++ b/website/src/components/Edit/EditPage.spec.tsx
@@ -6,7 +6,7 @@ import { beforeEach, describe, expect, test } from 'vitest';
 
 import { EditPage } from './EditPage.tsx';
 import { defaultReviewData, editableEntry, metadataKey, testAccessToken, testOrganism } from '../../../vitest.setup.ts';
-import type { MetadataField, SequenceEntryToEdit, UnprocessedMetadataRecord } from '../../types/backend.ts';
+import type { SequenceEntryToEdit, UnprocessedMetadataRecord } from '../../types/backend.ts';
 import type { ClientConfig } from '../../types/runtimeConfig.ts';
 
 const queryClient = new QueryClient();
@@ -60,9 +60,6 @@ describe('EditPage', () => {
         expect(screen.getAllByText(/Unaligned nucleotide sequences/i)[0]).toBeInTheDocument();
         expectTextInSequenceData.original(defaultReviewData.originalData.unalignedNucleotideSequences);
 
-        expect(screen.getByText(/Processed Data/i)).toBeInTheDocument();
-        expectTextInSequenceData.processedMetadata(defaultReviewData.processedData.metadata);
-
         expect(screen.getByText('processedInsertionSequenceName:')).toBeInTheDocument();
         expect(screen.getByText('nucleotideInsertion1,nucleotideInsertion2')).toBeInTheDocument();
 
@@ -110,9 +107,4 @@
             expect(screen.getByText(sentenceCase(key) + ':')).toBeInTheDocument();
             expect(screen.getByDisplayValue(value)).toBeInTheDocument();
         }),
-    processedMetadata: (metadata: Record<string, MetadataField>): void =>
-        Object.entries(metadata).forEach(([key, value]) => {
-            expect(screen.getByText(sentenceCase(key) + ':')).toBeInTheDocument();
-            expect(screen.getByText((value ?? 'null').toString())).toBeInTheDocument();
-        }),
 };
diff --git a/website/src/components/Edit/EditPage.tsx b/website/src/components/Edit/EditPage.tsx
index 934feb139e..360f0fc5be 100644
--- a/website/src/components/Edit/EditPage.tsx
+++ b/website/src/components/Edit/EditPage.tsx
@@ -9,7 +9,6 @@
 import { routes } from '../../routes/routes.ts';
 import { backendClientHooks } from '../../services/serviceHooks.ts';
 import { ACCESSION_FIELD, SUBMISSION_ID_FIELD } from '../../settings.ts';
 import {
-    type MetadataRecord,
     type ProcessingAnnotationSourceType,
     type SequenceEntryToEdit,
     approvedForReleaseStatus,
@@ -17,7 +16,6 @@
 } from '../../types/backend.ts';
 import { type InputField } from '../../types/config.ts';
 import type { ClientConfig } from '../../types/runtimeConfig.ts';
 import { createAuthorizationHeader } from '../../utils/createAuthorizationHeader.ts';
-import { displayMetadataField } from '../../utils/displayMetadataField.ts';
 import { getAccessionVersionString } from '../../utils/extractAccessionVersion.ts';
 import { displayConfirmationDialog } from '../ConfirmationDialog.tsx';
 import { BoxWithTabsBox, BoxWithTabsTab, BoxWithTabsTabBar } from '../common/BoxWithTabs.tsx';
@@ -134,33 +132,6 @@ const InnerEditPage: FC = ({
-
- - - -
- @@ -176,7 +147,6 @@ const InnerEditPage: FC = ({ /> - = ({ )} + +
+ + + +
); }; @@ -335,6 +332,7 @@ const EditableOriginalData: FC = ({ editedMetadata, s return ( @@ -369,6 +367,7 @@ const EditableOriginalSequences: FC = ({ editedS {editedSequences.map((field) => ( setEditedSequences((prevRows: Row[]) => @@ -382,22 +381,6 @@ const EditableOriginalSequences: FC = ({ editedS ); -type ProcessedMetadataProps = { - processedMetadata: MetadataRecord; -}; -const ProcessedMetadata: FC = ({ processedMetadata }) => ( - <> - - {Object.entries(processedMetadata).map(([key, value]) => ( - - ))} - -); - type ProcessedInsertionsProps = { processedInsertions: ReturnType; insertionType: keyof ReturnType; @@ -458,9 +441,14 @@ const mapErrorsAndWarnings = ( type: ProcessingAnnotationSourceType, ): { errors: string[]; warnings: string[] } => ({ errors: (editedData.errors ?? []) - .filter((error) => error.source.find((source) => source.name === key && source.type === type) !== undefined) + .filter( + (error) => error.processedFields.find((field) => field.name === key && field.type === type) !== undefined, + ) .map((error) => error.message), warnings: (editedData.warnings ?? []) - .filter((warning) => warning.source.find((source) => source.name === key && source.type === type) !== undefined) + .filter( + (warning) => + warning.processedFields.find((field) => field.name === key && field.type === type) !== undefined, + ) .map((warning) => warning.message), }); diff --git a/website/src/components/ReviewPage/ReviewCard.tsx b/website/src/components/ReviewPage/ReviewCard.tsx index 20b7fe5848..b20eb47935 100644 --- a/website/src/components/ReviewPage/ReviewCard.tsx +++ b/website/src/components/ReviewPage/ReviewCard.tsx @@ -88,7 +88,11 @@ export const ReviewCard: FC = ({ {data?.errors?.length !== undefined && data.errors.length > 0 && ( - + )} {data?.warnings?.length !== undefined && data.warnings.length > 0 && ( @@ -181,7 +185,7 @@ type MetadataListProps = { }; const isAnnotationPresent = (metadataField: string) => (item: ProcessingAnnotation) => - item.source[0].name === metadataField; + item.processedFields[0].name === metadataField; const MetadataList: FC = ({ data, isLoading, metadataDisplayNames }) => !isLoading && @@ -201,21 +205,26 @@ const MetadataList: FC = ({ data, isLoading, metadataDisplayN type ErrorsProps = { errors: ProcessingAnnotation[]; accession: string; + metadataDisplayNames: Map; }; -const Errors: FC = ({ errors, accession }) => { +const Errors: FC = ({ errors, accession, metadataDisplayNames }) => { return (
             {errors.map((error) => {
-                const uniqueKey = error.source.map((source) => source.type + source.name).join('.') + accession;
+                const uniqueKey =
+                    error.processedFields.map((field) => field.type + field.name).join('.') + accession;
+                const processedFieldName = error.processedFields
+                    .map((field) => metadataDisplayNames.get(field.name) ?? field.name)
+                    .join(', ');
                 return (
                     <li key={uniqueKey} className='text-red-600'>
-                        {error.message}
+                        {processedFieldName}: {error.message}
                     </li>
= ({ warnings, accession }) => {
     return (
-            {warnings.map((warning) => (
-                <li
-                    key={warning.source.map((source) => source.type + source.name).join('.') + accession}
-                    className='text-yellow-500'
-                >
-                    {warning.message}
-                </li>
-            ))}
+            {warnings.map((warning) => {
+                const processedFieldName = warning.processedFields.map((field) => field.name).join(', ');
+                return (
+                    <li
+                        key={warning.processedFields.map((field) => field.type + field.name).join('.') + accession}
+                        className='text-yellow-500'
+                    >
+                        {processedFieldName}: {warning.message}
+                    </li>
+                );
+            })}
); diff --git a/website/src/types/backend.ts b/website/src/types/backend.ts index dc31c353e9..dd843c56a6 100644 --- a/website/src/types/backend.ts +++ b/website/src/types/backend.ts @@ -28,7 +28,13 @@ const processingAnnotationSourceType = z.union([z.literal('Metadata'), z.literal export type ProcessingAnnotationSourceType = z.infer; const processingAnnotation = z.object({ - source: z.array( + unprocessedFields: z.array( + z.object({ + name: z.string(), + type: processingAnnotationSourceType, + }), + ), + processedFields: z.array( z.object({ name: z.string(), type: processingAnnotationSourceType, diff --git a/website/tests/util/preprocessingPipeline.ts b/website/tests/util/preprocessingPipeline.ts index d79c25135d..76a0156c83 100644 --- a/website/tests/util/preprocessingPipeline.ts +++ b/website/tests/util/preprocessingPipeline.ts @@ -23,9 +23,21 @@ async function submit(preprocessingOptions: PreprocessingOptions[]) { accession, version, errors: error - ? [{ source: [{ name: 'host', type: 'Metadata' }], message: 'Not this kind of host' }] + ? [ + { + unprocessedFields: [{ name: 'host', type: 'Metadata' }], + processedFields: [{ name: 'host', type: 'Metadata' }], + message: 'Not this kind of host', + }, + ] : [], - warnings: [{ source: [{ name: 'date', type: 'Metadata' }], message: '"There is no warning"-warning' }], + warnings: [ + { + unprocessedFields: [{ name: 'date', type: 'Metadata' }], + processedFields: [{ name: 'date', type: 'Metadata' }], + message: '"There is no warning"-warning', + }, + ], data: { metadata: { date: '2002-12-15', diff --git a/website/vitest.setup.ts b/website/vitest.setup.ts index 5c60342220..52f5a6322a 100755 --- a/website/vitest.setup.ts +++ b/website/vitest.setup.ts @@ -52,7 +52,13 @@ export const defaultReviewData: SequenceEntryToEdit = { groupId: 1, errors: [ { - source: [ + unprocessedFields: [ + { + name: metadataKey, + type: 'Metadata', + }, + ], + processedFields: [ { name: metadataKey, type: 'Metadata', @@ -63,7 +69,13 @@ export const defaultReviewData: SequenceEntryToEdit = { ], warnings: [ { - source: [ + unprocessedFields: [ + { + name: metadataKey, + type: 'Metadata', + }, + ], + processedFields: [ { name: metadataKey, type: 'Metadata',