From 5e6a0a4c27f327f09c2a167a0f15aec8d794695a Mon Sep 17 00:00:00 2001 From: Cornelius Roemer Date: Wed, 29 May 2024 03:36:28 +0200 Subject: [PATCH] fix(prepro): mask terminal gaps (#2050) Mask terminal gaps, i.e. replace terminal - with N (for nucs) and X (for aa) to not make those appear as true deletions Resolves #1402 --- .../loculus/templates/loculus-website.yaml | 2 +- preprocessing/nextclade/.mypy.ini | 6 ++ preprocessing/nextclade/environment.yml | 4 +- .../src/loculus_preprocessing/prepro.py | 63 ++++++++++++++++--- .../processing_functions.py | 12 ++-- 5 files changed, 69 insertions(+), 18 deletions(-) diff --git a/kubernetes/loculus/templates/loculus-website.yaml b/kubernetes/loculus/templates/loculus-website.yaml index 6134d55f07..1d52199a4b 100644 --- a/kubernetes/loculus/templates/loculus-website.yaml +++ b/kubernetes/loculus/templates/loculus-website.yaml @@ -53,4 +53,4 @@ spec: - name: custom-website-sealed-secret volumes: {{ include "loculus.configVolume" (dict "name" "loculus-website-config") | nindent 8 }} - {{- end }} \ No newline at end of file +{{- end }} \ No newline at end of file diff --git a/preprocessing/nextclade/.mypy.ini b/preprocessing/nextclade/.mypy.ini index f5bb32f75a..e0f205e2eb 100644 --- a/preprocessing/nextclade/.mypy.ini +++ b/preprocessing/nextclade/.mypy.ini @@ -3,3 +3,9 @@ python_version = 3.12 [mypy-Bio.*] ignore_missing_imports = True + +[mypy-jwt.*] +ignore_missing_imports = True + +[mypy-dpath.*] +ignore_missing_imports = True \ No newline at end of file diff --git a/preprocessing/nextclade/environment.yml b/preprocessing/nextclade/environment.yml index b54501565d..e0d058f98b 100644 --- a/preprocessing/nextclade/environment.yml +++ b/preprocessing/nextclade/environment.yml @@ -6,10 +6,10 @@ dependencies: - python=3.12 - biopython=1.83 - dpath=2.1 - - nextclade=3.3.1 + - nextclade=3.5 - pip=24.0 - PyYAML=6.0 - pyjwt=2.8 - python-dateutil=2.9 - pytz=2024.1 - - requests=2.31 \ No newline at end of file + - requests=2.32 \ No newline at end of file diff --git a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py index 39ea0e767a..a5ba8941c2 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py @@ -9,7 +9,7 @@ from http import HTTPStatus from pathlib import Path from tempfile import TemporaryDirectory -from typing import Any +from typing import Any, Literal, TypeVar import dpath import requests @@ -42,6 +42,52 @@ csv.field_size_limit(sys.maxsize) +GenericSequence = TypeVar("GenericSequence", AminoAcidSequence, NucleotideSequence) + + +def mask_terminal_gaps( + sequence: GenericSequence, mask_char: Literal["N"] | Literal["X"] = "N" +) -> GenericSequence: + # https://chatgpt.com/share/b213c687-38c4-4c62-98a8-4fecb210d479 + if not sequence: + return "" + + if mask_char not in {"N", "X"}: + error_message = "mask_char must be 'N' or 'X'" + raise ValueError(error_message) + + # Entire sequence of gaps + if not sequence.strip("-"): + return mask_char * len(sequence) + + # Find the index of the first non-'-' character + first_non_gap = 0 + while first_non_gap < len(sequence) and sequence[first_non_gap] == "-": + first_non_gap += 1 + + # Find the index of the last non-'-' character + last_non_gap = len(sequence) + while last_non_gap > 0 and sequence[last_non_gap - 1] == "-": + last_non_gap -= 1 + + # Replace terminal gaps with 'N' + return ( + mask_char * first_non_gap + + sequence[first_non_gap:last_non_gap] + + mask_char * (len(sequence) - last_non_gap) + ) + + +def load_aligned_nuc_sequences(result_dir: str) -> dict[AccessionVersion, NucleotideSequence]: + aligned_nucleotide_sequences: dict[AccessionVersion, NucleotideSequence] = {} + with open(f"{result_dir}/nextclade.aligned.fasta", encoding="utf-8") as aligned_nucs: + for aligned_sequence in SeqIO.parse(aligned_nucs, "fasta"): + sequence_id: AccessionVersion = aligned_sequence.id + sequence: NucleotideSequence = str(aligned_sequence.seq) + aligned_nucleotide_sequences[sequence_id] = mask_terminal_gaps(sequence) + return aligned_nucleotide_sequences + + def fetch_unprocessed_sequences(n: int, config: Config) -> Sequence[UnprocessedEntry]: url = config.backend_host.rstrip("/") + "/extract-unprocessed-data" logging.debug(f"Fetching {n} unprocessed sequences from {url}") @@ -123,12 +169,7 @@ def enrich_with_nextclade( logging.debug(f"Nextclade results available in {result_dir}") - aligned_nucleotide_sequences: dict[AccessionVersion, NucleotideSequence] = {} - with open(result_dir + "/nextclade.aligned.fasta", encoding="utf-8") as aligned_nucs: - aligned_nuc = SeqIO.parse(aligned_nucs, "fasta") - for aligned_sequence in aligned_nuc: - sequence_id: str = aligned_sequence.id - aligned_nucleotide_sequences[sequence_id] = str(aligned_sequence.seq) + aligned_nucleotide_sequences = load_aligned_nuc_sequences(result_dir) for gene in config.genes: translation_path = result_dir + f"/nextclade.cds_translation.{gene}.fasta" @@ -137,7 +178,10 @@ def enrich_with_nextclade( aligned_translation = SeqIO.parse(aligned_translations, "fasta") for aligned_sequence in aligned_translation: sequence_id = aligned_sequence.id - aligned_aminoacid_sequences[sequence_id][gene] = str(aligned_sequence.seq) + masked_sequence = mask_terminal_gaps( + str(aligned_sequence.seq), mask_char="X" + ) + aligned_aminoacid_sequences[sequence_id][gene] = masked_sequence except FileNotFoundError: # TODO: Add warning to each sequence logging.info( @@ -262,7 +306,8 @@ def process_single( ) continue if input_path not in unprocessed.inputMetadata: - # Suppress warning to prevent spamming for now until we have more sophisticated solution + # Suppress warning to prevent spamming for now until + # we have a more sophisticated solution # warnings.append( # ProcessingAnnotation( # source=[ diff --git a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py index f482895f78..7e03977f43 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py @@ -149,12 +149,12 @@ def process_date( if len(date_str) == 0: if args and args.get("required"): errors.append( - ProcessingAnnotation( - source=[ - AnnotationSource(name=output_field, type=AnnotationSourceType.METADATA) - ], - message="Collection date is required", - ) + ProcessingAnnotation( + source=[ + AnnotationSource(name=output_field, type=AnnotationSourceType.METADATA) + ], + message="Collection date is required", + ) ) return ProcessingResult( datum=None,