From 7d3dac472bd011a66b8c95422acb4bed4894e51d Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Tue, 11 Jun 2024 22:07:16 +0200 Subject: [PATCH] fix(prepro): Append to instead of recreating insertion dictionaries. (#2135) resolves #2134 --- .../nextclade/src/loculus_preprocessing/prepro.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py index 7dd118d18f..647414cb2a 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py @@ -79,7 +79,10 @@ def parse_nextclade_tsv( id = row["seqName"] nuc_ins_str: list[NucleotideInsertion] = list(row["insertions"].split(",")) - nucleotide_insertions[id] = {segment: [] if nuc_ins_str == [""] else nuc_ins_str} + if id in nucleotide_insertions: + nucleotide_insertions[id][segment] = [] if nuc_ins_str == [""] else nuc_ins_str + else: + nucleotide_insertions[id] = {segment: [] if nuc_ins_str == [""] else nuc_ins_str} aa_ins: dict[GeneName, list[AminoAcidInsertion]] = {gene: [] for gene in config.genes} aa_ins_split = row["aaInsertions"].split(",") @@ -94,7 +97,10 @@ def parse_nextclade_tsv( "Note: Nextclade found AA insertion in gene missing from config in gene " f"{gene}: {val}" ) - amino_acid_insertions[id] = aa_ins + if id in amino_acid_insertions: + amino_acid_insertions[id].update(aa_ins) + else: + amino_acid_insertions[id] = aa_ins return nucleotide_insertions, amino_acid_insertions @@ -355,7 +361,9 @@ def get_metadata( if not spec.args.get("no_warn", False): warnings.append( ProcessingAnnotation( - source=[AnnotationSource(name=input_path, type=AnnotationSourceType.METADATA)], + source=[ + AnnotationSource(name=input_path, type=AnnotationSourceType.METADATA) + ], message=f"Metadata field '{input_path}' not found in input", ) )