From 5e6a0a4c27f327f09c2a167a0f15aec8d794695a Mon Sep 17 00:00:00 2001
From: Cornelius Roemer <cornelius.roemer@gmail.com>
Date: Wed, 29 May 2024 03:36:28 +0200
Subject: [PATCH] fix(prepro): mask terminal gaps (#2050)

Mask terminal gaps, i.e. replace terminal - with N (for nucs) and X (for aa) to not make those appear as true deletions

Resolves #1402
---
 .../loculus/templates/loculus-website.yaml    |  2 +-
 preprocessing/nextclade/.mypy.ini             |  6 ++
 preprocessing/nextclade/environment.yml       |  4 +-
 .../src/loculus_preprocessing/prepro.py       | 63 ++++++++++++++++---
 .../processing_functions.py                   | 12 ++--
 5 files changed, 69 insertions(+), 18 deletions(-)

diff --git a/kubernetes/loculus/templates/loculus-website.yaml b/kubernetes/loculus/templates/loculus-website.yaml
index 6134d55f07..1d52199a4b 100644
--- a/kubernetes/loculus/templates/loculus-website.yaml
+++ b/kubernetes/loculus/templates/loculus-website.yaml
@@ -53,4 +53,4 @@ spec:
         - name: custom-website-sealed-secret
       volumes:
 {{ include "loculus.configVolume" (dict "name" "loculus-website-config") | nindent 8 }}
- {{- end }}
\ No newline at end of file
+{{- end }}
\ No newline at end of file
diff --git a/preprocessing/nextclade/.mypy.ini b/preprocessing/nextclade/.mypy.ini
index f5bb32f75a..e0f205e2eb 100644
--- a/preprocessing/nextclade/.mypy.ini
+++ b/preprocessing/nextclade/.mypy.ini
@@ -3,3 +3,9 @@ python_version = 3.12
 
 [mypy-Bio.*]
 ignore_missing_imports = True
+
+[mypy-jwt.*]
+ignore_missing_imports = True
+
+[mypy-dpath.*]
+ignore_missing_imports = True
\ No newline at end of file
diff --git a/preprocessing/nextclade/environment.yml b/preprocessing/nextclade/environment.yml
index b54501565d..e0d058f98b 100644
--- a/preprocessing/nextclade/environment.yml
+++ b/preprocessing/nextclade/environment.yml
@@ -6,10 +6,10 @@ dependencies:
   - python=3.12
   - biopython=1.83
   - dpath=2.1
-  - nextclade=3.3.1
+  - nextclade=3.5
   - pip=24.0
   - PyYAML=6.0
   - pyjwt=2.8
   - python-dateutil=2.9
   - pytz=2024.1
-  - requests=2.31
\ No newline at end of file
+  - requests=2.32
\ No newline at end of file
diff --git a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py
index 39ea0e767a..a5ba8941c2 100644
--- a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py
+++ b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py
@@ -9,7 +9,7 @@
 from http import HTTPStatus
 from pathlib import Path
 from tempfile import TemporaryDirectory
-from typing import Any
+from typing import Any, Literal, TypeVar
 
 import dpath
 import requests
@@ -42,6 +42,52 @@
 csv.field_size_limit(sys.maxsize)
 
 
+GenericSequence = TypeVar("GenericSequence", AminoAcidSequence, NucleotideSequence)
+
+
+def mask_terminal_gaps(
+    sequence: GenericSequence, mask_char: Literal["N"] | Literal["X"] = "N"
+) -> GenericSequence:
+    # https://chatgpt.com/share/b213c687-38c4-4c62-98a8-4fecb210d479
+    if not sequence:
+        return ""
+
+    if mask_char not in {"N", "X"}:
+        error_message = "mask_char must be 'N' or 'X'"
+        raise ValueError(error_message)
+
+    # Entire sequence of gaps
+    if not sequence.strip("-"):
+        return mask_char * len(sequence)
+
+    # Find the index of the first non-'-' character
+    first_non_gap = 0
+    while first_non_gap < len(sequence) and sequence[first_non_gap] == "-":
+        first_non_gap += 1
+
+    # Find the index of the last non-'-' character
+    last_non_gap = len(sequence)
+    while last_non_gap > 0 and sequence[last_non_gap - 1] == "-":
+        last_non_gap -= 1
+
+    # Replace terminal gaps with 'N'
+    return (
+        mask_char * first_non_gap
+        + sequence[first_non_gap:last_non_gap]
+        + mask_char * (len(sequence) - last_non_gap)
+    )
+
+
+def load_aligned_nuc_sequences(result_dir: str) -> dict[AccessionVersion, NucleotideSequence]:
+    aligned_nucleotide_sequences: dict[AccessionVersion, NucleotideSequence] = {}
+    with open(f"{result_dir}/nextclade.aligned.fasta", encoding="utf-8") as aligned_nucs:
+        for aligned_sequence in SeqIO.parse(aligned_nucs, "fasta"):
+            sequence_id: AccessionVersion = aligned_sequence.id
+            sequence: NucleotideSequence = str(aligned_sequence.seq)
+            aligned_nucleotide_sequences[sequence_id] = mask_terminal_gaps(sequence)
+    return aligned_nucleotide_sequences
+
+
 def fetch_unprocessed_sequences(n: int, config: Config) -> Sequence[UnprocessedEntry]:
     url = config.backend_host.rstrip("/") + "/extract-unprocessed-data"
     logging.debug(f"Fetching {n} unprocessed sequences from {url}")
@@ -123,12 +169,7 @@ def enrich_with_nextclade(
 
         logging.debug(f"Nextclade results available in {result_dir}")
 
-        aligned_nucleotide_sequences: dict[AccessionVersion, NucleotideSequence] = {}
-        with open(result_dir + "/nextclade.aligned.fasta", encoding="utf-8") as aligned_nucs:
-            aligned_nuc = SeqIO.parse(aligned_nucs, "fasta")
-            for aligned_sequence in aligned_nuc:
-                sequence_id: str = aligned_sequence.id
-                aligned_nucleotide_sequences[sequence_id] = str(aligned_sequence.seq)
+        aligned_nucleotide_sequences = load_aligned_nuc_sequences(result_dir)
 
         for gene in config.genes:
             translation_path = result_dir + f"/nextclade.cds_translation.{gene}.fasta"
@@ -137,7 +178,10 @@ def enrich_with_nextclade(
                     aligned_translation = SeqIO.parse(aligned_translations, "fasta")
                     for aligned_sequence in aligned_translation:
                         sequence_id = aligned_sequence.id
-                        aligned_aminoacid_sequences[sequence_id][gene] = str(aligned_sequence.seq)
+                        masked_sequence = mask_terminal_gaps(
+                            str(aligned_sequence.seq), mask_char="X"
+                        )
+                        aligned_aminoacid_sequences[sequence_id][gene] = masked_sequence
             except FileNotFoundError:
                 # TODO: Add warning to each sequence
                 logging.info(
@@ -262,7 +306,8 @@ def process_single(
                 )
                 continue
             if input_path not in unprocessed.inputMetadata:
-                # Suppress warning to prevent spamming for now until we have more sophisticated solution
+                # Suppress warning to prevent spamming for now until
+                # we have a more sophisticated solution
                 # warnings.append(
                 #     ProcessingAnnotation(
                 #         source=[
diff --git a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py
index f482895f78..7e03977f43 100644
--- a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py
+++ b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py
@@ -149,12 +149,12 @@ def process_date(
         if len(date_str) == 0:
             if args and args.get("required"):
                 errors.append(
-                            ProcessingAnnotation(
-                                source=[
-                                    AnnotationSource(name=output_field, type=AnnotationSourceType.METADATA)
-                                ],
-                                message="Collection date is required",
-                            )
+                    ProcessingAnnotation(
+                        source=[
+                            AnnotationSource(name=output_field, type=AnnotationSourceType.METADATA)
+                        ],
+                        message="Collection date is required",
+                    )
                 )
             return ProcessingResult(
                 datum=None,