From aaebb31dd7d85479bb374e55bc35cfe1470e5d1e Mon Sep 17 00:00:00 2001 From: Jonas Fuchs <78491186+jonas-fuchs@users.noreply.github.com> Date: Thu, 26 Oct 2023 21:43:35 +0200 Subject: [PATCH] Delete n (#10) * introduced min_mut def * updated args * updated docs --- README.md | 2 ++ virheat/__init__.py | 2 +- virheat/command.py | 11 +++++++++++ virheat/scripts/data_prep.py | 20 ++++++++++++++++++++ 4 files changed, 34 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index ef4a958..e642424 100644 --- a/README.md +++ b/README.md @@ -69,6 +69,8 @@ options: -t 0, --threshold 0 display frequencies above this threshold (0-1) --delete, --no-delete delete mutations that are present in all samples and their maximum frequency divergence is smaller than 0.5 (default: True) + -n None, --delete-n None + do not show mutations that occur n times or less (default: Do not delete) --sort, --no-sort sort sample names alphanumerically (default: False) --min-cov 20 display mutations covered at least x time (only if per base cov tsv files are provided) -v, --version show program's version number and exit diff --git a/virheat/__init__.py b/virheat/__init__.py index bf5eead..d80b915 100644 --- a/virheat/__init__.py +++ b/virheat/__init__.py @@ -1,3 +1,3 @@ """plot vcf data as a heatmap mapped to a virus genome""" _program = "virheat" -__version__ = "0.5.3" +__version__ = "0.5.4" diff --git a/virheat/command.py b/virheat/command.py index 74481ff..d6b1fab 100644 --- a/virheat/command.py +++ b/virheat/command.py @@ -72,6 +72,14 @@ def get_args(sysargs): default=True, help="delete mutations that are present in all samples and their maximum frequency divergence is smaller than 0.5" ) + parser.add_argument( + "-n", + "--delete-n", + type=int, + metavar='None', + default=None, + help="do not show mutations that occur n times or less (default: Do not delete)" + ) parser.add_argument( "--sort", action=argparse.BooleanOptionalAction, @@ -114,8 +122,11 @@ def main(sysargs=sys.argv[1:]): # extract vcf info reference_name, frequency_lists, unique_mutations, file_names = data_prep.extract_vcf_data(vcf_files, threshold=args.threshold) frequency_array = data_prep.create_freq_array(unique_mutations, frequency_lists) + # user specified delete options (removes mutations based on various rationales) if args.delete: frequency_array = data_prep.delete_common_mutations(frequency_array, unique_mutations) + if args.delete_n is not None: + frequency_array = data_prep.delete_n_mutations(frequency_array, unique_mutations, args.delete_n) # annotate low coverage if per base coveage from qualimap was provided data_prep.annotate_non_covered_regions(args.input[0], args.min_cov, frequency_array, file_names, unique_mutations) diff --git a/virheat/scripts/data_prep.py b/virheat/scripts/data_prep.py index 813776e..e8e338e 100644 --- a/virheat/scripts/data_prep.py +++ b/virheat/scripts/data_prep.py @@ -195,6 +195,26 @@ def delete_common_mutations(frequency_array, unique_mutations): return np.delete(frequency_array, mut_to_del, axis=1) +def delete_n_mutations(frequency_array, unique_mutations, min_mut): + """ + delete mutations that are not present in more than n samples + """ + mut_to_del = [] + + for idx in range(0, len(frequency_array[0])): + n_mutations = 0 + for frequency_list in frequency_array: + if frequency_list[idx] > 0: + n_mutations += 1 + # check if min_mut was reached and if not mark as to delete + if n_mutations <= min_mut: + mut_to_del.append(idx) + # delete the mutations that are found only min_mut times in all samples + for idx in sorted(mut_to_del, reverse=True): + del unique_mutations[idx] + + return np.delete(frequency_array, mut_to_del, axis=1) + def parse_gff3(file): """