Skip to content

Commit

Permalink
fix(prepro): mask terminal gaps (#2050)
Browse files Browse the repository at this point in the history
Mask terminal gaps, i.e. replace terminal - with N (for nucs) and X (for aa) to not make those appear as true deletions

Resolves #1402
  • Loading branch information
corneliusroemer authored May 29, 2024
1 parent a962ff1 commit 5e6a0a4
Show file tree
Hide file tree
Showing 5 changed files with 69 additions and 18 deletions.
2 changes: 1 addition & 1 deletion kubernetes/loculus/templates/loculus-website.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -53,4 +53,4 @@ spec:
- name: custom-website-sealed-secret
volumes:
{{ include "loculus.configVolume" (dict "name" "loculus-website-config") | nindent 8 }}
{{- end }}
{{- end }}
6 changes: 6 additions & 0 deletions preprocessing/nextclade/.mypy.ini
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,9 @@ python_version = 3.12

[mypy-Bio.*]
ignore_missing_imports = True

[mypy-jwt.*]
ignore_missing_imports = True

[mypy-dpath.*]
ignore_missing_imports = True
4 changes: 2 additions & 2 deletions preprocessing/nextclade/environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,10 @@ dependencies:
- python=3.12
- biopython=1.83
- dpath=2.1
- nextclade=3.3.1
- nextclade=3.5
- pip=24.0
- PyYAML=6.0
- pyjwt=2.8
- python-dateutil=2.9
- pytz=2024.1
- requests=2.31
- requests=2.32
63 changes: 54 additions & 9 deletions preprocessing/nextclade/src/loculus_preprocessing/prepro.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from http import HTTPStatus
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Any
from typing import Any, Literal, TypeVar

import dpath
import requests
Expand Down Expand Up @@ -42,6 +42,52 @@
csv.field_size_limit(sys.maxsize)


GenericSequence = TypeVar("GenericSequence", AminoAcidSequence, NucleotideSequence)


def mask_terminal_gaps(
sequence: GenericSequence, mask_char: Literal["N"] | Literal["X"] = "N"
) -> GenericSequence:
# https://chatgpt.com/share/b213c687-38c4-4c62-98a8-4fecb210d479
if not sequence:
return ""

if mask_char not in {"N", "X"}:
error_message = "mask_char must be 'N' or 'X'"
raise ValueError(error_message)

# Entire sequence of gaps
if not sequence.strip("-"):
return mask_char * len(sequence)

# Find the index of the first non-'-' character
first_non_gap = 0
while first_non_gap < len(sequence) and sequence[first_non_gap] == "-":
first_non_gap += 1

# Find the index of the last non-'-' character
last_non_gap = len(sequence)
while last_non_gap > 0 and sequence[last_non_gap - 1] == "-":
last_non_gap -= 1

# Replace terminal gaps with 'N'
return (
mask_char * first_non_gap
+ sequence[first_non_gap:last_non_gap]
+ mask_char * (len(sequence) - last_non_gap)
)


def load_aligned_nuc_sequences(result_dir: str) -> dict[AccessionVersion, NucleotideSequence]:
aligned_nucleotide_sequences: dict[AccessionVersion, NucleotideSequence] = {}
with open(f"{result_dir}/nextclade.aligned.fasta", encoding="utf-8") as aligned_nucs:
for aligned_sequence in SeqIO.parse(aligned_nucs, "fasta"):
sequence_id: AccessionVersion = aligned_sequence.id
sequence: NucleotideSequence = str(aligned_sequence.seq)
aligned_nucleotide_sequences[sequence_id] = mask_terminal_gaps(sequence)
return aligned_nucleotide_sequences


def fetch_unprocessed_sequences(n: int, config: Config) -> Sequence[UnprocessedEntry]:
url = config.backend_host.rstrip("/") + "/extract-unprocessed-data"
logging.debug(f"Fetching {n} unprocessed sequences from {url}")
Expand Down Expand Up @@ -123,12 +169,7 @@ def enrich_with_nextclade(

logging.debug(f"Nextclade results available in {result_dir}")

aligned_nucleotide_sequences: dict[AccessionVersion, NucleotideSequence] = {}
with open(result_dir + "/nextclade.aligned.fasta", encoding="utf-8") as aligned_nucs:
aligned_nuc = SeqIO.parse(aligned_nucs, "fasta")
for aligned_sequence in aligned_nuc:
sequence_id: str = aligned_sequence.id
aligned_nucleotide_sequences[sequence_id] = str(aligned_sequence.seq)
aligned_nucleotide_sequences = load_aligned_nuc_sequences(result_dir)

for gene in config.genes:
translation_path = result_dir + f"/nextclade.cds_translation.{gene}.fasta"
Expand All @@ -137,7 +178,10 @@ def enrich_with_nextclade(
aligned_translation = SeqIO.parse(aligned_translations, "fasta")
for aligned_sequence in aligned_translation:
sequence_id = aligned_sequence.id
aligned_aminoacid_sequences[sequence_id][gene] = str(aligned_sequence.seq)
masked_sequence = mask_terminal_gaps(
str(aligned_sequence.seq), mask_char="X"
)
aligned_aminoacid_sequences[sequence_id][gene] = masked_sequence
except FileNotFoundError:
# TODO: Add warning to each sequence
logging.info(
Expand Down Expand Up @@ -262,7 +306,8 @@ def process_single(
)
continue
if input_path not in unprocessed.inputMetadata:
# Suppress warning to prevent spamming for now until we have more sophisticated solution
# Suppress warning to prevent spamming for now until
# we have a more sophisticated solution
# warnings.append(
# ProcessingAnnotation(
# source=[
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -149,12 +149,12 @@ def process_date(
if len(date_str) == 0:
if args and args.get("required"):
errors.append(
ProcessingAnnotation(
source=[
AnnotationSource(name=output_field, type=AnnotationSourceType.METADATA)
],
message="Collection date is required",
)
ProcessingAnnotation(
source=[
AnnotationSource(name=output_field, type=AnnotationSourceType.METADATA)
],
message="Collection date is required",
)
)
return ProcessingResult(
datum=None,
Expand Down

0 comments on commit 5e6a0a4

Please sign in to comment.