From 872caf545c55f876437eed80cab7655529481503 Mon Sep 17 00:00:00 2001 From: jonas-fuchs Date: Tue, 6 Aug 2024 10:33:31 +0200 Subject: [PATCH] subset tsv df --- virheat/command.py | 2 +- virheat/scripts/data_prep.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/virheat/command.py b/virheat/command.py index 6bd3778..87778db 100644 --- a/virheat/command.py +++ b/virheat/command.py @@ -174,7 +174,7 @@ def main(sysargs=sys.argv[1:]): frequency_array = data_prep.delete_n_mutations(frequency_array, unique_mutations, args.delete_n) # annotate low coverage if per base coverage from qualimap was provided - data_prep.annotate_non_covered_regions(args.input[0], args.min_cov, frequency_array, file_names, unique_mutations) + data_prep.annotate_non_covered_regions(args.input[0], args.min_cov, frequency_array, file_names, unique_mutations, args.reference) # define relative locations of all items in the plot n_samples, n_mutations = len(frequency_array), len(frequency_array[0]) diff --git a/virheat/scripts/data_prep.py b/virheat/scripts/data_prep.py index e60178a..687b63a 100644 --- a/virheat/scripts/data_prep.py +++ b/virheat/scripts/data_prep.py @@ -187,7 +187,7 @@ def create_freq_array(unique_mutations, frequency_lists): return np.array(frequency_array) -def annotate_non_covered_regions(coverage_dir, min_coverage, frequency_array, file_names, unique_mutations): +def annotate_non_covered_regions(coverage_dir, min_coverage, frequency_array, file_names, unique_mutations, reference): """ Insert nan values into np array if position is not covered. Needs per base coverage tsv files created by bamqc @@ -202,6 +202,7 @@ def annotate_non_covered_regions(coverage_dir, min_coverage, frequency_array, fi continue tsv_file = [file for file in per_base_coverage_files if os.path.splitext(os.path.basename(file))[0] == file_name][0] coverage = pd.read_csv(tsv_file, sep="\t") + coverage = coverage[coverage["#chr"] == reference] for j, (mutation, frequency) in enumerate(zip(unique_mutations, array)): mut_pos = int(mutation.split("_")[0]) if coverage[coverage["pos"] == mut_pos].empty or all([frequency == 0, coverage[coverage["pos"] == mut_pos]["coverage"].iloc[0] <= min_coverage]):