From 500c1a586c8435322d28057b3de67d8f361045e1 Mon Sep 17 00:00:00 2001
From: jonas-fuchs <jonas.michael.fuchs@googlemail.com>
Date: Wed, 13 Nov 2024 10:14:37 +0100
Subject: [PATCH] lazy fix for parsing gffs

---
 virheat/scripts/data_prep.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/virheat/scripts/data_prep.py b/virheat/scripts/data_prep.py
index d00ff30..f7e77c9 100644
--- a/virheat/scripts/data_prep.py
+++ b/virheat/scripts/data_prep.py
@@ -294,7 +294,13 @@ def parse_gff3(file, reference):
             # ignore comments and last line
             if not line.startswith(reference):
                 continue
-            gff_values = line.split("\t")
+            gff_values = line.strip().split("\t")
+            # Skip features whose attribute column does not start with "ID=",
+            # since the ID is used as the dict key. This is a lazy fix: it also
+            # drops e.g. exons that only have a Parent attribute. Handling those
+            # would require more complex parsing and a different data structure.
+            if not gff_values[8].startswith("ID="):
+                continue
             # create keys
             if gff_values[2] not in gff3_dict:
                 gff3_dict[gff_values[2]] = {}