From 7e1ed3832a9fc4b39b131f114143a69af5d0b3ce Mon Sep 17 00:00:00 2001 From: jonas-fuchs Date: Thu, 26 Oct 2023 14:10:46 +0200 Subject: [PATCH 1/3] introduced min_mut def --- virheat/command.py | 3 +++ virheat/scripts/data_prep.py | 20 ++++++++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/virheat/command.py b/virheat/command.py index 74481ff..f37be96 100644 --- a/virheat/command.py +++ b/virheat/command.py @@ -116,6 +116,9 @@ def main(sysargs=sys.argv[1:]): frequency_array = data_prep.create_freq_array(unique_mutations, frequency_lists) if args.delete: frequency_array = data_prep.delete_common_mutations(frequency_array, unique_mutations) + + if args_delete_n is not None: + frequency_array = data_prep.delete_n_mutations(frequency_array, unique_mutations, args.delete_n) # annotate low coverage if per base coveage from qualimap was provided data_prep.annotate_non_covered_regions(args.input[0], args.min_cov, frequency_array, file_names, unique_mutations) diff --git a/virheat/scripts/data_prep.py b/virheat/scripts/data_prep.py index 813776e..e8e338e 100644 --- a/virheat/scripts/data_prep.py +++ b/virheat/scripts/data_prep.py @@ -195,6 +195,26 @@ def delete_common_mutations(frequency_array, unique_mutations): return np.delete(frequency_array, mut_to_del, axis=1) +def delete_n_mutations(frequency_array, unique_mutations, min_mut): + """ + delete mutations that are not present in more than n samples + """ + mut_to_del = [] + + for idx in range(0, len(frequency_array[0])): + n_mutations = 0 + for frequency_list in frequency_array: + if frequency_list[idx] > 0: + n_mutations += 1 + # check if min_mut was reached and if not mark as to delete + if n_mutations <= min_mut: + mut_to_del.append(idx) + # delete the mutations that are found only min_mut times in all samples + for idx in sorted(mut_to_del, reverse=True): + del unique_mutations[idx] + + return np.delete(frequency_array, mut_to_del, axis=1) + def parse_gff3(file): """ From b7d2bd2ea7ca7633d4461eda14d5ac373b729a3d Mon Sep 17 00:00:00 2001 From: jonas-fuchs Date: Thu, 26 Oct 2023 14:16:00 +0200 Subject: [PATCH 2/3] updated args --- virheat/__init__.py | 2 +- virheat/command.py | 12 ++++++++++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/virheat/__init__.py b/virheat/__init__.py index bf5eead..d80b915 100644 --- a/virheat/__init__.py +++ b/virheat/__init__.py @@ -1,3 +1,3 @@ """plot vcf data as a heatmap mapped to a virus genome""" _program = "virheat" -__version__ = "0.5.3" +__version__ = "0.5.4" diff --git a/virheat/command.py b/virheat/command.py index f37be96..d09a204 100644 --- a/virheat/command.py +++ b/virheat/command.py @@ -72,6 +72,14 @@ def get_args(sysargs): default=True, help="delete mutations that are present in all samples and their maximum frequency divergence is smaller than 0.5" ) + parser.add_argument( + "-n", + "--delete-n", + type=int, + metavar='None', + default=None, + help="do not show mutations that occur n times or less" + ) parser.add_argument( "--sort", action=argparse.BooleanOptionalAction, @@ -114,10 +122,10 @@ def main(sysargs=sys.argv[1:]): # extract vcf info reference_name, frequency_lists, unique_mutations, file_names = data_prep.extract_vcf_data(vcf_files, threshold=args.threshold) frequency_array = data_prep.create_freq_array(unique_mutations, frequency_lists) + # user specified delete options (removes mutations based on various rationales) if args.delete: frequency_array = data_prep.delete_common_mutations(frequency_array, unique_mutations) - - if args_delete_n is not None: + if args.delete_n is not None: frequency_array = data_prep.delete_n_mutations(frequency_array, unique_mutations, args.delete_n) # annotate low coverage if per base coveage from qualimap was provided data_prep.annotate_non_covered_regions(args.input[0], args.min_cov, frequency_array, file_names, unique_mutations) From 59a9869a90df59bb79ffafc3898dbc8e6477e46e Mon Sep 17 00:00:00 2001 From: jonas-fuchs Date: Thu, 26 Oct 2023 14:19:14 +0200 Subject: [PATCH 3/3] updated docs --- README.md | 2 ++ virheat/command.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index ef4a958..e642424 100644 --- a/README.md +++ b/README.md @@ -69,6 +69,8 @@ options: -t 0, --threshold 0 display frequencies above this threshold (0-1) --delete, --no-delete delete mutations that are present in all samples and their maximum frequency divergence is smaller than 0.5 (default: True) + -n None, --delete-n None + do not show mutations that occur n times or less (default: Do not delete) --sort, --no-sort sort sample names alphanumerically (default: False) --min-cov 20 display mutations covered at least x time (only if per base cov tsv files are provided) -v, --version show program's version number and exit diff --git a/virheat/command.py b/virheat/command.py index d09a204..d6b1fab 100644 --- a/virheat/command.py +++ b/virheat/command.py @@ -78,7 +78,7 @@ def get_args(sysargs): type=int, metavar='None', default=None, - help="do not show mutations that occur n times or less" + help="do not show mutations that occur n times or less (default: Do not delete)" ) parser.add_argument( "--sort",