From 500c1a586c8435322d28057b3de67d8f361045e1 Mon Sep 17 00:00:00 2001 From: jonas-fuchs Date: Wed, 13 Nov 2024 10:14:37 +0100 Subject: [PATCH] lazy fix for parsing gffs --- virheat/scripts/data_prep.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/virheat/scripts/data_prep.py b/virheat/scripts/data_prep.py index d00ff30..f7e77c9 100644 --- a/virheat/scripts/data_prep.py +++ b/virheat/scripts/data_prep.py @@ -294,7 +294,13 @@ def parse_gff3(file, reference): # ignore comments and last line if not line.startswith(reference): continue - gff_values = line.split("\t") + gff_values = line.strip().split("\t") + # sanity check that the line has a unique ID for the dict key + # this is a lazy fix as it will exclude e.g. exons without ID and + # only a parent -> fixing this might require more complex parsing + # and data structure + if not gff_values[8].startswith("ID="): + continue # create keys if gff_values[2] not in gff3_dict: gff3_dict[gff_values[2]] = {}