From a81daa69ddafba3b2e21740fd15b6b04fdce368d Mon Sep 17 00:00:00 2001 From: jonas-fuchs Date: Mon, 5 Aug 2024 15:32:03 +0200 Subject: [PATCH 1/5] added reference --- virheat/command.py | 19 ++++++++++++------- virheat/scripts/data_prep.py | 21 ++++++++++++++------- 2 files changed, 26 insertions(+), 14 deletions(-) diff --git a/virheat/command.py b/virheat/command.py index 9f107af..01a1349 100644 --- a/virheat/command.py +++ b/virheat/command.py @@ -32,6 +32,14 @@ def get_args(sysargs): nargs=2, help="folder containing input files and output folder" ) + parser.add_argument( + "-r", + "--reference", + type=str, + metavar="None", + default=None, + help="reference identifier" + ) parser.add_argument( "--name", type=str, @@ -147,10 +155,10 @@ def main(sysargs=sys.argv[1:]): sys.exit("\033[31m\033[1mERROR:\033[0m No VCF files provided") else: if args.scores: - reference_name, frequency_lists, unique_mutations, file_names = data_prep.extract_vcf_data(vcf_files, threshold=args.threshold, scores=True) + frequency_lists, unique_mutations, file_names = data_prep.extract_vcf_data(vcf_files, threshold=args.threshold, scores=True) n_scores = len(args.scores) else: - reference_name, frequency_lists, unique_mutations, file_names = data_prep.extract_vcf_data(vcf_files, threshold=args.threshold) + frequency_lists, unique_mutations, file_names = data_prep.extract_vcf_data(vcf_files, args.reference, threshold=args.threshold) if args.zoom: unique_mutations = data_prep.zoom_to_genomic_regions(unique_mutations, args.zoom) @@ -178,10 +186,7 @@ def main(sysargs=sys.argv[1:]): if args.gff3_path is not None and args.genome_length is not None: sys.exit("\033[31m\033[1mERROR:\033[0m Do not provide the -g and -l argument simultaneously!") elif args.gff3_path is not None: - gff3_info, gff3_ref_name = data_prep.parse_gff3(args.gff3_path) - # issue a warning if #CHROM and gff3 do not match - if gff3_ref_name not in reference_name and reference_name not in gff3_ref_name: - print("\033[31m\033[1mWARNING:\033[0m gff3 reference does not match the vcf reference!") + gff3_info, gff3_ref_name = data_prep.parse_gff3(args.gff3_path, args.reference) genome_end = data_prep.get_genome_end(gff3_info) genes_with_mutations, n_tracks = data_prep.create_track_dict(unique_mutations, gff3_info, args.gff3_annotations) elif args.genome_length is not None: @@ -225,7 +230,7 @@ def main(sysargs=sys.argv[1:]): plotting.create_heatmap(ax, frequency_array, cmap_cells) mutation_set = plotting.create_genome_vis(ax, genome_y_location, n_mutations, unique_mutations, start, stop) plotting.create_axis(ax, n_mutations, min_y_location, n_samples, file_names, start, stop, genome_y_location, - unique_mutations, reference_name) + unique_mutations, args.reference) plotting.create_mutation_legend(mutation_set, min_y_location, n_samples, n_scores) plotting.create_colorbar(args.threshold, cmap_cells, min_y_location, n_samples, ax) # plot gene track diff --git a/virheat/scripts/data_prep.py b/virheat/scripts/data_prep.py index 4cf64a1..90e2a8c 100644 --- a/virheat/scripts/data_prep.py +++ b/virheat/scripts/data_prep.py @@ -47,7 +47,7 @@ def convert_string(string): return string -def read_vcf(vcf_file): +def read_vcf(vcf_file, reference): """ parse vcf files to dictionary """ @@ -62,10 +62,10 @@ def read_vcf(vcf_file): header = header_lines[0] # get each line as frequency_lists with open(vcf_file, "r") as f: - lines = [l.split("\t") for l in f if not l.startswith('#')] + lines = [l.split("\t") for l in f if not l.startswith('#') and l.startswith(reference)] # check if vcf is empty if not lines: - print(f"\033[31m\033[1mWARNING:\033[0m {vcf_file} is empty!") + print(f"\033[31m\033[1mWARNING:\033[0m {vcf_file} has no variants with {reference} as #CHROM!") # get standard headers as keys for key in header[0:6]: vcf_dict[key] = [] @@ -104,7 +104,7 @@ def read_vcf(vcf_file): return vcf_dict -def extract_vcf_data(vcf_files, threshold=0, scores=False): +def extract_vcf_data(vcf_files, reference, threshold=0, scores=False): """ extract relevant vcf data """ @@ -114,7 +114,7 @@ def extract_vcf_data(vcf_files, threshold=0, scores=False): for file in vcf_files: file_names.append(os.path.splitext(os.path.basename(file))[0]) - vcf_dict = read_vcf(file) + vcf_dict = read_vcf(file, reference) frequency_list = [] # write all mutation info in a '_' sep string for idx in range(0, len(vcf_dict["#CHROM"])): @@ -138,8 +138,10 @@ def extract_vcf_data(vcf_files, threshold=0, scores=False): unique_mutations = sorted( {x[0] for li in frequency_lists for x in li}, key=lambda x: int(x.split("_")[0]) ) + if not unique_mutations: + sys.exit(f"\033[31m\033[1mERROR:\033[0m No unique mutations to {reference} found in all vcf files!") - return vcf_dict["#CHROM"][0], frequency_lists, unique_mutations, file_names + return frequency_lists, unique_mutations, file_names def extract_scores(unique_mutations, scores_file, aa_pos_col, score_col): @@ -267,7 +269,7 @@ def zoom_to_genomic_regions(unique_mutations, start_stop): return zoomed_unique -def parse_gff3(file): +def parse_gff3(file, reference): """ parse gff3 to dictionary """ @@ -280,6 +282,8 @@ def parse_gff3(file): if line.startswith("#") or line == "\n": continue gff_values = line.split("\t") + if gff_values[0] != reference: + continue gff3_ref_name = gff_values[0] # create keys if gff_values[2] not in gff3_dict: @@ -300,6 +304,9 @@ def parse_gff3(file): gff3_file.close() + if not gff3_dict: + sys.exit(f"\033[31m\033[1mERROR:\033[0m {reference} not found in gff3 file.") + return gff3_dict, gff3_ref_name From d780d2b21942decc98d1533fde935f0ccdf5fb16 Mon Sep 17 00:00:00 2001 From: jonas-fuchs Date: Tue, 6 Aug 2024 09:49:29 +0200 Subject: [PATCH 2/5] smaller changes --- example_mave_data/MaveBindRBD.csv | 2 +- virheat/__init__.py | 2 +- virheat/command.py | 7 ++++--- virheat/scripts/data_prep.py | 13 +++++-------- 4 files changed, 11 insertions(+), 13 deletions(-) diff --git a/example_mave_data/MaveBindRBD.csv b/example_mave_data/MaveBindRBD.csv index 11b58cb..ab422ae 100644 --- a/example_mave_data/MaveBindRBD.csv +++ b/example_mave_data/MaveBindRBD.csv @@ -1,4 +1,4 @@ -accession,hgvs_nt,hgvs_splice,hgvs_pro,score,average,library,AA +exaccession,hgvs_nt,hgvs_splice,hgvs_pro,score,average,library,AA urn:mavedb:00000044-a-1#7954,NA,NA,p.Lys199Val,0.09,0.07,2,K529V urn:mavedb:00000044-a-1#7955,NA,NA,p.Lys199Trp,-0.02,0,1,K529W urn:mavedb:00000044-a-1#7956,NA,NA,p.Lys199Trp,0.01,0,2,K529W diff --git a/virheat/__init__.py b/virheat/__init__.py index d10d6d3..02e2c78 100644 --- a/virheat/__init__.py +++ b/virheat/__init__.py @@ -1,3 +1,3 @@ """plot vcf data as a heatmap mapped to a virus genome""" _program = "virheat" -__version__ = "0.7" +__version__ = "0.7.1" diff --git a/virheat/command.py b/virheat/command.py index 01a1349..dd8d603 100644 --- a/virheat/command.py +++ b/virheat/command.py @@ -36,7 +36,8 @@ def get_args(sysargs): "-r", "--reference", type=str, - metavar="None", + metavar="ref_id", + required=True, default=None, help="reference identifier" ) @@ -155,7 +156,7 @@ def main(sysargs=sys.argv[1:]): sys.exit("\033[31m\033[1mERROR:\033[0m No VCF files provided") else: if args.scores: - frequency_lists, unique_mutations, file_names = data_prep.extract_vcf_data(vcf_files, threshold=args.threshold, scores=True) + frequency_lists, unique_mutations, file_names = data_prep.extract_vcf_data(vcf_files, args.reference, threshold=args.threshold, scores=True) n_scores = len(args.scores) else: frequency_lists, unique_mutations, file_names = data_prep.extract_vcf_data(vcf_files, args.reference, threshold=args.threshold) @@ -186,7 +187,7 @@ def main(sysargs=sys.argv[1:]): if args.gff3_path is not None and args.genome_length is not None: sys.exit("\033[31m\033[1mERROR:\033[0m Do not provide the -g and -l argument simultaneously!") elif args.gff3_path is not None: - gff3_info, gff3_ref_name = data_prep.parse_gff3(args.gff3_path, args.reference) + gff3_info = data_prep.parse_gff3(args.gff3_path, args.reference) genome_end = data_prep.get_genome_end(gff3_info) genes_with_mutations, n_tracks = data_prep.create_track_dict(unique_mutations, gff3_info, args.gff3_annotations) elif args.genome_length is not None: diff --git a/virheat/scripts/data_prep.py b/virheat/scripts/data_prep.py index 90e2a8c..e60178a 100644 --- a/virheat/scripts/data_prep.py +++ b/virheat/scripts/data_prep.py @@ -62,10 +62,10 @@ def read_vcf(vcf_file, reference): header = header_lines[0] # get each line as frequency_lists with open(vcf_file, "r") as f: - lines = [l.split("\t") for l in f if not l.startswith('#') and l.startswith(reference)] + lines = [l.split("\t") for l in f if l.startswith(reference)] # check if vcf is empty if not lines: - print(f"\033[31m\033[1mWARNING:\033[0m {vcf_file} has no variants with {reference} as #CHROM!") + print(f"\033[31m\033[1mWARNING:\033[0m {vcf_file} has no variants to {reference}!") # get standard headers as keys for key in header[0:6]: vcf_dict[key] = [] @@ -139,7 +139,7 @@ def extract_vcf_data(vcf_files, reference, threshold=0, scores=False): {x[0] for li in frequency_lists for x in li}, key=lambda x: int(x.split("_")[0]) ) if not unique_mutations: - sys.exit(f"\033[31m\033[1mERROR:\033[0m No unique mutations to {reference} found in all vcf files!") + sys.exit(f"\033[31m\033[1mERROR:\033[0m No variants to {reference} in all vcf files!") return frequency_lists, unique_mutations, file_names @@ -279,12 +279,9 @@ def parse_gff3(file, reference): with open(file, "r") as gff3_file: for line in gff3_file: # ignore comments and last line - if line.startswith("#") or line == "\n": + if not line.startswith(reference): continue gff_values = line.split("\t") - if gff_values[0] != reference: - continue - gff3_ref_name = gff_values[0] # create keys if gff_values[2] not in gff3_dict: gff3_dict[gff_values[2]] = {} @@ -307,7 +304,7 @@ def parse_gff3(file, reference): if not gff3_dict: sys.exit(f"\033[31m\033[1mERROR:\033[0m {reference} not found in gff3 file.") - return gff3_dict, gff3_ref_name + return gff3_dict def get_genome_end(gff3_dict): From 4b1c49b27c24c4b98808e48a93eb241cb3471589 Mon Sep 17 00:00:00 2001 From: jonas-fuchs Date: Tue, 6 Aug 2024 10:02:02 +0200 Subject: [PATCH 3/5] text mistake fix --- example_mave_data/MaveBindRBD.csv | 2 +- virheat/command.py | 12 +++++++----- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/example_mave_data/MaveBindRBD.csv b/example_mave_data/MaveBindRBD.csv index ab422ae..11b58cb 100644 --- a/example_mave_data/MaveBindRBD.csv +++ b/example_mave_data/MaveBindRBD.csv @@ -1,4 +1,4 @@ -exaccession,hgvs_nt,hgvs_splice,hgvs_pro,score,average,library,AA +accession,hgvs_nt,hgvs_splice,hgvs_pro,score,average,library,AA urn:mavedb:00000044-a-1#7954,NA,NA,p.Lys199Val,0.09,0.07,2,K529V urn:mavedb:00000044-a-1#7955,NA,NA,p.Lys199Trp,-0.02,0,1,K529W urn:mavedb:00000044-a-1#7956,NA,NA,p.Lys199Trp,0.01,0,2,K529W diff --git a/virheat/command.py b/virheat/command.py index dd8d603..6bd3778 100644 --- a/virheat/command.py +++ b/virheat/command.py @@ -165,11 +165,13 @@ def main(sysargs=sys.argv[1:]): unique_mutations = data_prep.zoom_to_genomic_regions(unique_mutations, args.zoom) frequency_array = data_prep.create_freq_array(unique_mutations, frequency_lists) - # user specified delete options (removes mutations based on various rationales) - if args.delete: - frequency_array = data_prep.delete_common_mutations(frequency_array, unique_mutations) - if args.delete_n is not None: - frequency_array = data_prep.delete_n_mutations(frequency_array, unique_mutations, args.delete_n) + # enables the deletion option only if more than 1 vcf file is provided + if len(vcf_files) > 1: + # user specified delete options (removes mutations based on various rationales) + if args.delete: + frequency_array = data_prep.delete_common_mutations(frequency_array, unique_mutations) + if args.delete_n is not None: + frequency_array = data_prep.delete_n_mutations(frequency_array, unique_mutations, args.delete_n) # annotate low coverage if per base coverage from qualimap was provided data_prep.annotate_non_covered_regions(args.input[0], args.min_cov, frequency_array, file_names, unique_mutations) From 872caf545c55f876437eed80cab7655529481503 Mon Sep 17 00:00:00 2001 From: jonas-fuchs Date: Tue, 6 Aug 2024 10:33:31 +0200 Subject: [PATCH 4/5] subset tsv df --- virheat/command.py | 2 +- virheat/scripts/data_prep.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/virheat/command.py b/virheat/command.py index 6bd3778..87778db 100644 --- a/virheat/command.py +++ b/virheat/command.py @@ -174,7 +174,7 @@ def main(sysargs=sys.argv[1:]): frequency_array = data_prep.delete_n_mutations(frequency_array, unique_mutations, args.delete_n) # annotate low coverage if per base coverage from qualimap was provided - data_prep.annotate_non_covered_regions(args.input[0], args.min_cov, frequency_array, file_names, unique_mutations) + data_prep.annotate_non_covered_regions(args.input[0], args.min_cov, frequency_array, file_names, unique_mutations, args.reference) # define relative locations of all items in the plot n_samples, n_mutations = len(frequency_array), len(frequency_array[0]) diff --git a/virheat/scripts/data_prep.py b/virheat/scripts/data_prep.py index e60178a..687b63a 100644 --- a/virheat/scripts/data_prep.py +++ b/virheat/scripts/data_prep.py @@ -187,7 +187,7 @@ def create_freq_array(unique_mutations, frequency_lists): return np.array(frequency_array) -def annotate_non_covered_regions(coverage_dir, min_coverage, frequency_array, file_names, unique_mutations): +def annotate_non_covered_regions(coverage_dir, min_coverage, frequency_array, file_names, unique_mutations, reference): """ Insert nan values into np array if position is not covered. Needs per base coverage tsv files created by bamqc @@ -202,6 +202,7 @@ def annotate_non_covered_regions(coverage_dir, min_coverage, frequency_array, fi continue tsv_file = [file for file in per_base_coverage_files if os.path.splitext(os.path.basename(file))[0] == file_name][0] coverage = pd.read_csv(tsv_file, sep="\t") + coverage = coverage[coverage["#chr"] == reference] for j, (mutation, frequency) in enumerate(zip(unique_mutations, array)): mut_pos = int(mutation.split("_")[0]) if coverage[coverage["pos"] == mut_pos].empty or all([frequency == 0, coverage[coverage["pos"] == mut_pos]["coverage"].iloc[0] <= min_coverage]): From 94e6a94a808ae1dd5dc3e4535ba4a0a95be9129c Mon Sep 17 00:00:00 2001 From: jonas-fuchs Date: Tue, 6 Aug 2024 10:41:06 +0200 Subject: [PATCH 5/5] docu update --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 0e25301..e7379dd 100644 --- a/README.md +++ b/README.md @@ -64,6 +64,8 @@ positional arguments: options: -h, --help show this help message and exit + -r ref_id, --reference ref_id + reference identifier --name virHEAT_plot.pdf plot name and file type (pdf, png, svg, jpg). Default: virHEAT_plot.pdf -l None, --genome-length None