From a81daa69ddafba3b2e21740fd15b6b04fdce368d Mon Sep 17 00:00:00 2001
From: jonas-fuchs <jonas.michael.fuchs@googlemail.com>
Date: Mon, 5 Aug 2024 15:32:03 +0200
Subject: [PATCH 1/5] added reference

---
 virheat/command.py           | 19 ++++++++++++-------
 virheat/scripts/data_prep.py | 21 ++++++++++++++-------
 2 files changed, 26 insertions(+), 14 deletions(-)

diff --git a/virheat/command.py b/virheat/command.py
index 9f107af..01a1349 100644
--- a/virheat/command.py
+++ b/virheat/command.py
@@ -32,6 +32,14 @@ def get_args(sysargs):
         nargs=2,
         help="folder containing input files and output folder"
     )
+    parser.add_argument(
+        "-r",
+        "--reference",
+        type=str,
+        metavar="None",
+        default=None,
+        help="reference identifier"
+    )
     parser.add_argument(
         "--name",
         type=str,
@@ -147,10 +155,10 @@ def main(sysargs=sys.argv[1:]):
         sys.exit("\033[31m\033[1mERROR:\033[0m No VCF files provided")
     else:
         if args.scores:
-            reference_name, frequency_lists, unique_mutations, file_names = data_prep.extract_vcf_data(vcf_files, threshold=args.threshold, scores=True)
+            frequency_lists, unique_mutations, file_names = data_prep.extract_vcf_data(vcf_files, threshold=args.threshold, scores=True)
             n_scores = len(args.scores)
         else:
-            reference_name, frequency_lists, unique_mutations, file_names = data_prep.extract_vcf_data(vcf_files, threshold=args.threshold)
+            frequency_lists, unique_mutations, file_names = data_prep.extract_vcf_data(vcf_files, args.reference, threshold=args.threshold)
 
     if args.zoom:
         unique_mutations = data_prep.zoom_to_genomic_regions(unique_mutations, args.zoom)
@@ -178,10 +186,7 @@ def main(sysargs=sys.argv[1:]):
     if args.gff3_path is not None and args.genome_length is not None:
         sys.exit("\033[31m\033[1mERROR:\033[0m Do not provide the -g and -l argument simultaneously!")
     elif args.gff3_path is not None:
-        gff3_info, gff3_ref_name = data_prep.parse_gff3(args.gff3_path)
-        # issue a warning if #CHROM and gff3 do not match
-        if gff3_ref_name not in reference_name and reference_name not in gff3_ref_name:
-            print("\033[31m\033[1mWARNING:\033[0m gff3 reference does not match the vcf reference!")
+        gff3_info, gff3_ref_name = data_prep.parse_gff3(args.gff3_path, args.reference)
         genome_end = data_prep.get_genome_end(gff3_info)
         genes_with_mutations, n_tracks = data_prep.create_track_dict(unique_mutations, gff3_info, args.gff3_annotations)
     elif args.genome_length is not None:
@@ -225,7 +230,7 @@ def main(sysargs=sys.argv[1:]):
     plotting.create_heatmap(ax, frequency_array, cmap_cells)
     mutation_set = plotting.create_genome_vis(ax, genome_y_location, n_mutations, unique_mutations, start, stop)
     plotting.create_axis(ax, n_mutations, min_y_location, n_samples, file_names, start, stop, genome_y_location,
-                         unique_mutations, reference_name)
+                         unique_mutations, args.reference)
     plotting.create_mutation_legend(mutation_set, min_y_location, n_samples, n_scores)
     plotting.create_colorbar(args.threshold, cmap_cells, min_y_location, n_samples, ax)
     # plot gene track
diff --git a/virheat/scripts/data_prep.py b/virheat/scripts/data_prep.py
index 4cf64a1..90e2a8c 100644
--- a/virheat/scripts/data_prep.py
+++ b/virheat/scripts/data_prep.py
@@ -47,7 +47,7 @@ def convert_string(string):
         return string
 
 
-def read_vcf(vcf_file):
+def read_vcf(vcf_file, reference):
     """
     parse vcf files to dictionary
     """
@@ -62,10 +62,10 @@ def read_vcf(vcf_file):
         header = header_lines[0]
     # get each line as frequency_lists
     with open(vcf_file, "r") as f:
-        lines = [l.split("\t") for l in f if not l.startswith('#')]
+        lines = [l.split("\t") for l in f if not l.startswith('#') and l.startswith(reference)]
     # check if vcf is empty
     if not lines:
-        print(f"\033[31m\033[1mWARNING:\033[0m {vcf_file} is empty!")
+        print(f"\033[31m\033[1mWARNING:\033[0m {vcf_file} has no variants with {reference} as #CHROM!")
     # get standard headers as keys
     for key in header[0:6]:
         vcf_dict[key] = []
@@ -104,7 +104,7 @@ def read_vcf(vcf_file):
     return vcf_dict
 
 
-def extract_vcf_data(vcf_files, threshold=0, scores=False):
+def extract_vcf_data(vcf_files, reference, threshold=0, scores=False):
     """
     extract relevant vcf data
     """
@@ -114,7 +114,7 @@ def extract_vcf_data(vcf_files, threshold=0, scores=False):
 
     for file in vcf_files:
         file_names.append(os.path.splitext(os.path.basename(file))[0])
-        vcf_dict = read_vcf(file)
+        vcf_dict = read_vcf(file, reference)
         frequency_list = []
         # write all mutation info in a '_' sep string
         for idx in range(0, len(vcf_dict["#CHROM"])):
@@ -138,8 +138,10 @@ def extract_vcf_data(vcf_files, threshold=0, scores=False):
     unique_mutations = sorted(
         {x[0] for li in frequency_lists for x in li}, key=lambda x: int(x.split("_")[0])
     )
+    if not unique_mutations:
+        sys.exit(f"\033[31m\033[1mERROR:\033[0m No unique mutations to {reference} found in all vcf files!")
 
-    return vcf_dict["#CHROM"][0], frequency_lists, unique_mutations, file_names
+    return frequency_lists, unique_mutations, file_names
 
 
 def extract_scores(unique_mutations, scores_file, aa_pos_col, score_col):
@@ -267,7 +269,7 @@ def zoom_to_genomic_regions(unique_mutations, start_stop):
     return zoomed_unique
 
 
-def parse_gff3(file):
+def parse_gff3(file, reference):
     """
     parse gff3 to dictionary
     """
@@ -280,6 +282,8 @@ def parse_gff3(file):
             if line.startswith("#") or line == "\n":
                 continue
             gff_values = line.split("\t")
+            if gff_values[0] != reference:
+                continue
             gff3_ref_name = gff_values[0]
             # create keys
             if gff_values[2] not in gff3_dict:
@@ -300,6 +304,9 @@ def parse_gff3(file):
 
     gff3_file.close()
 
+    if not gff3_dict:
+        sys.exit(f"\033[31m\033[1mERROR:\033[0m {reference} not found in gff3 file.")
+
     return gff3_dict, gff3_ref_name
 
 

From d780d2b21942decc98d1533fde935f0ccdf5fb16 Mon Sep 17 00:00:00 2001
From: jonas-fuchs <jonas.michael.fuchs@googlemail.com>
Date: Tue, 6 Aug 2024 09:49:29 +0200
Subject: [PATCH 2/5] smaller changes

---
 example_mave_data/MaveBindRBD.csv |  2 +-
 virheat/__init__.py               |  2 +-
 virheat/command.py                |  7 ++++---
 virheat/scripts/data_prep.py      | 13 +++++--------
 4 files changed, 11 insertions(+), 13 deletions(-)

diff --git a/example_mave_data/MaveBindRBD.csv b/example_mave_data/MaveBindRBD.csv
index 11b58cb..ab422ae 100644
--- a/example_mave_data/MaveBindRBD.csv
+++ b/example_mave_data/MaveBindRBD.csv
@@ -1,4 +1,4 @@
-accession,hgvs_nt,hgvs_splice,hgvs_pro,score,average,library,AA
+exaccession,hgvs_nt,hgvs_splice,hgvs_pro,score,average,library,AA
 urn:mavedb:00000044-a-1#7954,NA,NA,p.Lys199Val,0.09,0.07,2,K529V
 urn:mavedb:00000044-a-1#7955,NA,NA,p.Lys199Trp,-0.02,0,1,K529W
 urn:mavedb:00000044-a-1#7956,NA,NA,p.Lys199Trp,0.01,0,2,K529W
diff --git a/virheat/__init__.py b/virheat/__init__.py
index d10d6d3..02e2c78 100644
--- a/virheat/__init__.py
+++ b/virheat/__init__.py
@@ -1,3 +1,3 @@
 """plot vcf data as a heatmap mapped to a virus genome"""
 _program = "virheat"
-__version__ = "0.7"
+__version__ = "0.7.1"
diff --git a/virheat/command.py b/virheat/command.py
index 01a1349..dd8d603 100644
--- a/virheat/command.py
+++ b/virheat/command.py
@@ -36,7 +36,8 @@ def get_args(sysargs):
         "-r",
         "--reference",
         type=str,
-        metavar="None",
+        metavar="ref_id",
+        required=True,
         default=None,
         help="reference identifier"
     )
@@ -155,7 +156,7 @@ def main(sysargs=sys.argv[1:]):
         sys.exit("\033[31m\033[1mERROR:\033[0m No VCF files provided")
     else:
         if args.scores:
-            frequency_lists, unique_mutations, file_names = data_prep.extract_vcf_data(vcf_files, threshold=args.threshold, scores=True)
+            frequency_lists, unique_mutations, file_names = data_prep.extract_vcf_data(vcf_files, args.reference, threshold=args.threshold, scores=True)
             n_scores = len(args.scores)
         else:
             frequency_lists, unique_mutations, file_names = data_prep.extract_vcf_data(vcf_files, args.reference, threshold=args.threshold)
@@ -186,7 +187,7 @@ def main(sysargs=sys.argv[1:]):
     if args.gff3_path is not None and args.genome_length is not None:
         sys.exit("\033[31m\033[1mERROR:\033[0m Do not provide the -g and -l argument simultaneously!")
     elif args.gff3_path is not None:
-        gff3_info, gff3_ref_name = data_prep.parse_gff3(args.gff3_path, args.reference)
+        gff3_info = data_prep.parse_gff3(args.gff3_path, args.reference)
         genome_end = data_prep.get_genome_end(gff3_info)
         genes_with_mutations, n_tracks = data_prep.create_track_dict(unique_mutations, gff3_info, args.gff3_annotations)
     elif args.genome_length is not None:
diff --git a/virheat/scripts/data_prep.py b/virheat/scripts/data_prep.py
index 90e2a8c..e60178a 100644
--- a/virheat/scripts/data_prep.py
+++ b/virheat/scripts/data_prep.py
@@ -62,10 +62,10 @@ def read_vcf(vcf_file, reference):
         header = header_lines[0]
     # get each line as frequency_lists
     with open(vcf_file, "r") as f:
-        lines = [l.split("\t") for l in f if not l.startswith('#') and l.startswith(reference)]
+        lines = [l.split("\t") for l in f if l.startswith(reference)]
     # check if vcf is empty
     if not lines:
-        print(f"\033[31m\033[1mWARNING:\033[0m {vcf_file} has no variants with {reference} as #CHROM!")
+        print(f"\033[31m\033[1mWARNING:\033[0m {vcf_file} has no variants to {reference}!")
     # get standard headers as keys
     for key in header[0:6]:
         vcf_dict[key] = []
@@ -139,7 +139,7 @@ def extract_vcf_data(vcf_files, reference, threshold=0, scores=False):
         {x[0] for li in frequency_lists for x in li}, key=lambda x: int(x.split("_")[0])
     )
     if not unique_mutations:
-        sys.exit(f"\033[31m\033[1mERROR:\033[0m No unique mutations to {reference} found in all vcf files!")
+        sys.exit(f"\033[31m\033[1mERROR:\033[0m No variants to {reference} in all vcf files!")
 
     return frequency_lists, unique_mutations, file_names
 
@@ -279,12 +279,9 @@ def parse_gff3(file, reference):
     with open(file, "r") as gff3_file:
         for line in gff3_file:
             # ignore comments and last line
-            if line.startswith("#") or line == "\n":
+            if not line.startswith(reference):
                 continue
             gff_values = line.split("\t")
-            if gff_values[0] != reference:
-                continue
-            gff3_ref_name = gff_values[0]
             # create keys
             if gff_values[2] not in gff3_dict:
                 gff3_dict[gff_values[2]] = {}
@@ -307,7 +304,7 @@ def parse_gff3(file, reference):
     if not gff3_dict:
         sys.exit(f"\033[31m\033[1mERROR:\033[0m {reference} not found in gff3 file.")
 
-    return gff3_dict, gff3_ref_name
+    return gff3_dict
 
 
 def get_genome_end(gff3_dict):

From 4b1c49b27c24c4b98808e48a93eb241cb3471589 Mon Sep 17 00:00:00 2001
From: jonas-fuchs <jonas.michael.fuchs@googlemail.com>
Date: Tue, 6 Aug 2024 10:02:02 +0200
Subject: [PATCH 3/5] text mistake fix

---
 example_mave_data/MaveBindRBD.csv |  2 +-
 virheat/command.py                | 12 +++++++-----
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/example_mave_data/MaveBindRBD.csv b/example_mave_data/MaveBindRBD.csv
index ab422ae..11b58cb 100644
--- a/example_mave_data/MaveBindRBD.csv
+++ b/example_mave_data/MaveBindRBD.csv
@@ -1,4 +1,4 @@
-exaccession,hgvs_nt,hgvs_splice,hgvs_pro,score,average,library,AA
+accession,hgvs_nt,hgvs_splice,hgvs_pro,score,average,library,AA
 urn:mavedb:00000044-a-1#7954,NA,NA,p.Lys199Val,0.09,0.07,2,K529V
 urn:mavedb:00000044-a-1#7955,NA,NA,p.Lys199Trp,-0.02,0,1,K529W
 urn:mavedb:00000044-a-1#7956,NA,NA,p.Lys199Trp,0.01,0,2,K529W
diff --git a/virheat/command.py b/virheat/command.py
index dd8d603..6bd3778 100644
--- a/virheat/command.py
+++ b/virheat/command.py
@@ -165,11 +165,13 @@ def main(sysargs=sys.argv[1:]):
         unique_mutations = data_prep.zoom_to_genomic_regions(unique_mutations, args.zoom)
     frequency_array = data_prep.create_freq_array(unique_mutations, frequency_lists)
 
-    # user specified delete options (removes mutations based on various rationales)
-    if args.delete:
-        frequency_array = data_prep.delete_common_mutations(frequency_array, unique_mutations)
-    if args.delete_n is not None:
-        frequency_array = data_prep.delete_n_mutations(frequency_array, unique_mutations, args.delete_n)
+    # enables the deletion option only if more than 1 vcf file is provided
+    if len(vcf_files) > 1:
+        # user specified delete options (removes mutations based on various rationales)
+        if args.delete:
+            frequency_array = data_prep.delete_common_mutations(frequency_array, unique_mutations)
+        if args.delete_n is not None:
+            frequency_array = data_prep.delete_n_mutations(frequency_array, unique_mutations, args.delete_n)
 
     # annotate low coverage if per base coverage from qualimap was provided
     data_prep.annotate_non_covered_regions(args.input[0], args.min_cov, frequency_array, file_names, unique_mutations)

From 872caf545c55f876437eed80cab7655529481503 Mon Sep 17 00:00:00 2001
From: jonas-fuchs <jonas.michael.fuchs@googlemail.com>
Date: Tue, 6 Aug 2024 10:33:31 +0200
Subject: [PATCH 4/5] subset tsv df

---
 virheat/command.py           | 2 +-
 virheat/scripts/data_prep.py | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/virheat/command.py b/virheat/command.py
index 6bd3778..87778db 100644
--- a/virheat/command.py
+++ b/virheat/command.py
@@ -174,7 +174,7 @@ def main(sysargs=sys.argv[1:]):
             frequency_array = data_prep.delete_n_mutations(frequency_array, unique_mutations, args.delete_n)
 
     # annotate low coverage if per base coverage from qualimap was provided
-    data_prep.annotate_non_covered_regions(args.input[0], args.min_cov, frequency_array, file_names, unique_mutations)
+    data_prep.annotate_non_covered_regions(args.input[0], args.min_cov, frequency_array, file_names, unique_mutations, args.reference)
 
     # define relative locations of all items in the plot
     n_samples, n_mutations = len(frequency_array), len(frequency_array[0])
diff --git a/virheat/scripts/data_prep.py b/virheat/scripts/data_prep.py
index e60178a..687b63a 100644
--- a/virheat/scripts/data_prep.py
+++ b/virheat/scripts/data_prep.py
@@ -187,7 +187,7 @@ def create_freq_array(unique_mutations, frequency_lists):
     return np.array(frequency_array)
 
 
-def annotate_non_covered_regions(coverage_dir, min_coverage, frequency_array, file_names, unique_mutations):
+def annotate_non_covered_regions(coverage_dir, min_coverage, frequency_array, file_names, unique_mutations, reference):
     """
     Insert nan values into np array if position is not covered. Needs
     per base coverage tsv files created by bamqc
@@ -202,6 +202,7 @@ def annotate_non_covered_regions(coverage_dir, min_coverage, frequency_array, fi
                 continue
             tsv_file = [file for file in per_base_coverage_files if os.path.splitext(os.path.basename(file))[0] == file_name][0]
             coverage = pd.read_csv(tsv_file, sep="\t")
+            coverage = coverage[coverage["#chr"] == reference]
             for j, (mutation, frequency) in enumerate(zip(unique_mutations, array)):
                 mut_pos = int(mutation.split("_")[0])
                 if coverage[coverage["pos"] == mut_pos].empty or all([frequency == 0, coverage[coverage["pos"] == mut_pos]["coverage"].iloc[0] <= min_coverage]):

From 94e6a94a808ae1dd5dc3e4535ba4a0a95be9129c Mon Sep 17 00:00:00 2001
From: jonas-fuchs <jonas.michael.fuchs@googlemail.com>
Date: Tue, 6 Aug 2024 10:41:06 +0200
Subject: [PATCH 5/5] docu update

---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index 0e25301..e7379dd 100644
--- a/README.md
+++ b/README.md
@@ -64,6 +64,8 @@ positional arguments:
 
 options:
   -h, --help            show this help message and exit
+  -r ref_id, --reference ref_id
+                        reference identifier
   --name virHEAT_plot.pdf
                         plot name and file type (pdf, png, svg, jpg). Default: virHEAT_plot.pdf
   -l None, --genome-length None