From 3c6749f1ca2ac42d17076da88919ef587dc3b372 Mon Sep 17 00:00:00 2001 From: jonas-fuchs Date: Tue, 24 Oct 2023 10:18:28 +0200 Subject: [PATCH] switched to sequence divergence > 0.5 and updated docs --- README.md | 22 ++++++++-------------- virheat/command.py | 6 +++--- virheat/scripts/data_prep.py | 6 +++--- 3 files changed, 14 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index 934e018..aba7667 100644 --- a/README.md +++ b/README.md @@ -28,13 +28,8 @@ pip install virheat ```shell git clone https://github.com/jonas-fuchs/virHEAT cd virHEAT -``` -and then install virHEAT with: -```shell pip install -r requirements.txt -``` -or: -```shell +# or pip install . ``` That was already it. To check if it worked: @@ -55,21 +50,20 @@ usage: virheat -l or -g [additional argum ``` positional arguments: - input folder containing vcf (and tsv) files and output folder + input folder containing input files and output folder -optional arguments: +options: -h, --help show this help message and exit -l None, --genome-length None length of the genome (needed if gff3 is not provided) -g None, --gff3-path None path to gff3 (needed if length is not provided) - -a gene, --gff3-annotations gene - annotations to display from gff3 file (standard: gene) - -t 0, --threshold 0 display frequencies above this threshold + -a [gene ...], --gff3-annotations [gene ...] + annotations to display from gff3 file (standard: gene). Multiple possible. 
+ -t 0, --threshold 0 display frequencies above this threshold (0-1) --delete, --no-delete - delete mutations with frequencies present in all - samples (default: True) - --sort, --no-sort sort alphanumerically (default: False) + delete mutations that are present in all samples and their maximum frequency divergence is smaller than 0.5 (default: True) + --sort, --no-sort sort sample names alphanumerically (default: False) --min-cov 20 display mutations covered at least x time (only if per base cov tsv files are provided) -v, --version show program's version number and exit ``` diff --git a/virheat/command.py b/virheat/command.py index 678f8f0..74481ff 100644 --- a/virheat/command.py +++ b/virheat/command.py @@ -64,19 +64,19 @@ def get_args(sysargs): type=float, metavar="0", default=0, - help="display frequencies above this threshold" + help="display frequencies above this threshold (0-1)" ) parser.add_argument( "--delete", action=argparse.BooleanOptionalAction, default=True, - help="delete mutations that are present in all samples and their maximum frequency divergence is smaller than 0.2" + help="delete mutations that are present in all samples and their maximum frequency divergence is smaller than 0.5" ) parser.add_argument( "--sort", action=argparse.BooleanOptionalAction, default=False, - help="sort alphanumerically" + help="sort sample names alphanumerically" ) parser.add_argument( "--min-cov", diff --git a/virheat/scripts/data_prep.py b/virheat/scripts/data_prep.py index 6242476..813776e 100644 --- a/virheat/scripts/data_prep.py +++ b/virheat/scripts/data_prep.py @@ -185,9 +185,9 @@ def delete_common_mutations(frequency_array, unique_mutations): # check if all mutation in a column are zero (happens with some weird callers) if all(x == 0 for x in check_all): mut_to_del.append(idx) - # check if frequencies are present in all columns and the maximal diff is greater than 0.2 - # example [0.8, 0.7, 0.6] is not deleted whereas [0.8, 0.7, 0.7] is deleted - elif all(x 
> 0 for x in check_all) and max(check_all)-min(check_all) >= 0.2: + # check if frequencies are present in all columns and the maximal diff is smaller than 0.5 + # example [0.8, 0.7, 0.3] is not deleted whereas [0.8, 0.7, 0.7] is deleted + elif all(x > 0 for x in check_all) and max(check_all)-min(check_all) < 0.5: mut_to_del.append(idx) for idx in sorted(mut_to_del, reverse=True):