Skip to content

Commit

Permalink
text using cchf
Browse files (browse the repository at this point in the history)
  • Loading branch information
anna-parker committed Jan 10, 2025
1 parent 87a7f6a commit a41e77f
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 8 deletions.
20 changes: 12 additions & 8 deletions ingest/scripts/deterministic_group_segments.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -114,7 +114,7 @@ def group_records(record_list, output_metadata, fasta_id_map, config, different_

joint_key = "/".join(
[
f"{segment_map[segment]}.{segment}"
f"{segment_map[segment]['insdcAccessionFull']}.{segment}"
for segment in config.nucleotide_sequences
if segment in segment_map
]
Expand Down Expand Up @@ -177,12 +177,13 @@ def main(
logger.info(f"Reading metadata from {input_metadata}")

source_of_truth = json.load(open(groups, encoding="utf-8"))
logger.info(f"Found {len(source_of_truth.keys())} source of truth groups")
accession_to_group = {}
for group, metadata in source_of_truth.items():
for accession in metadata:
accession_to_group[accession] = group

found_groups = {}
found_groups = {group: [] for group in source_of_truth}
# Map from original accession to the new concatenated accession
type Accession = str
type SubmissionId = str
Expand All @@ -202,10 +203,7 @@ def main(
ungrouped_accessions.add(record["id"])
continue
group = accession_to_group[metadata["insdcAccessionFull"]]
if group not in found_groups:
found_groups[group] = [record]
else:
found_groups[group].append(record)
found_groups[group].append(record)
if len(found_groups[group]) == len(set(source_of_truth[group])):
group_records(
found_groups[group], output_metadata, fasta_id_map, config, different_values_log
Expand All @@ -217,13 +215,19 @@ def main(

# add found_groups without all segments in file
count_unfilled_groups = 0
count_missing_tests = 0
for name, records in found_groups.items():
count_unfilled_groups += 1
logger.debug(f"{name}: Missing record {set(source_of_truth[name]) - set([record['metadata']['insdcAccessionFull'] for record in records])}")
logger.debug(
f"{name}: Missing record {set(source_of_truth[name]) - {record['metadata']['insdcAccessionFull'] for record in records}}"
)
if len(records) == 0:
count_missing_tests += 1
continue
group_records(records, output_metadata, fasta_id_map, config, different_values_log)
logger.info(different_values_log)
logger.info(f"Found {count_unfilled_groups} groups without all segments")

logger.info(f"Found {count_missing_tests} groups without any segments")

count = 0
count_ignored = 0
Expand Down
1 change: 1 addition & 0 deletions kubernetes/loculus/values.yaml
Original file line number | Diff line number | Diff line change
Expand Up @@ -1487,6 +1487,7 @@ defaultOrganisms:
taxon_id: 3052518
nextclade_dataset_server: https://raw.githubusercontent.com/nextstrain/nextclade_data/cornelius-cchfv/data_output
nextclade_dataset_name: nextstrain/cchfv/linked
grouping_ground_truth: https://anna-parker.github.io/influenza-a-groupings/results/cchf_groups.json
enaDeposition:
configFile:
taxon_id: 3052518
Expand Down

0 comments on commit a41e77f

Please sign in to comment.