diff --git a/example_mave_data/MaveBindRBD.csv b/example_mave_data/MaveBindRBD.csv index 11b58cb..ab422ae 100644 --- a/example_mave_data/MaveBindRBD.csv +++ b/example_mave_data/MaveBindRBD.csv @@ -1,4 +1,4 @@ -accession,hgvs_nt,hgvs_splice,hgvs_pro,score,average,library,AA +exaccession,hgvs_nt,hgvs_splice,hgvs_pro,score,average,library,AA urn:mavedb:00000044-a-1#7954,NA,NA,p.Lys199Val,0.09,0.07,2,K529V urn:mavedb:00000044-a-1#7955,NA,NA,p.Lys199Trp,-0.02,0,1,K529W urn:mavedb:00000044-a-1#7956,NA,NA,p.Lys199Trp,0.01,0,2,K529W diff --git a/virheat/__init__.py b/virheat/__init__.py index d10d6d3..02e2c78 100644 --- a/virheat/__init__.py +++ b/virheat/__init__.py @@ -1,3 +1,3 @@ """plot vcf data as a heatmap mapped to a virus genome""" _program = "virheat" -__version__ = "0.7" +__version__ = "0.7.1" diff --git a/virheat/command.py b/virheat/command.py index 01a1349..dd8d603 100644 --- a/virheat/command.py +++ b/virheat/command.py @@ -36,7 +36,8 @@ def get_args(sysargs): "-r", "--reference", type=str, - metavar="None", + metavar="ref_id", + required=True, default=None, help="reference identifier" ) @@ -155,7 +156,7 @@ def main(sysargs=sys.argv[1:]): sys.exit("\033[31m\033[1mERROR:\033[0m No VCF files provided") else: if args.scores: - frequency_lists, unique_mutations, file_names = data_prep.extract_vcf_data(vcf_files, threshold=args.threshold, scores=True) + frequency_lists, unique_mutations, file_names = data_prep.extract_vcf_data(vcf_files, args.reference, threshold=args.threshold, scores=True) n_scores = len(args.scores) else: frequency_lists, unique_mutations, file_names = data_prep.extract_vcf_data(vcf_files, args.reference, threshold=args.threshold) @@ -186,7 +187,7 @@ def main(sysargs=sys.argv[1:]): if args.gff3_path is not None and args.genome_length is not None: sys.exit("\033[31m\033[1mERROR:\033[0m Do not provide the -g and -l argument simultaneously!") elif args.gff3_path is not None: - gff3_info, gff3_ref_name = data_prep.parse_gff3(args.gff3_path, args.reference) + gff3_info = data_prep.parse_gff3(args.gff3_path, args.reference) genome_end = data_prep.get_genome_end(gff3_info) genes_with_mutations, n_tracks = data_prep.create_track_dict(unique_mutations, gff3_info, args.gff3_annotations) elif args.genome_length is not None: diff --git a/virheat/scripts/data_prep.py b/virheat/scripts/data_prep.py index 90e2a8c..e60178a 100644 --- a/virheat/scripts/data_prep.py +++ b/virheat/scripts/data_prep.py @@ -62,10 +62,10 @@ def read_vcf(vcf_file, reference): header = header_lines[0] # get each line as frequency_lists with open(vcf_file, "r") as f: - lines = [l.split("\t") for l in f if not l.startswith('#') and l.startswith(reference)] + lines = [l.split("\t") for l in f if l.startswith(reference)] # check if vcf is empty if not lines: - print(f"\033[31m\033[1mWARNING:\033[0m {vcf_file} has no variants with {reference} as #CHROM!") + print(f"\033[31m\033[1mWARNING:\033[0m {vcf_file} has no variants to {reference}!") # get standard headers as keys for key in header[0:6]: vcf_dict[key] = [] @@ -139,7 +139,7 @@ def extract_vcf_data(vcf_files, reference, threshold=0, scores=False): {x[0] for li in frequency_lists for x in li}, key=lambda x: int(x.split("_")[0]) ) if not unique_mutations: - sys.exit(f"\033[31m\033[1mERROR:\033[0m No unique mutations to {reference} found in all vcf files!") + sys.exit(f"\033[31m\033[1mERROR:\033[0m No variants to {reference} in all vcf files!") return frequency_lists, unique_mutations, file_names @@ -279,12 +279,9 @@ def parse_gff3(file, reference): with open(file, "r") as gff3_file: for line in gff3_file: # ignore comments and last line - if line.startswith("#") or line == "\n": + if not line.startswith(reference): continue gff_values = line.split("\t") - if gff_values[0] != reference: - continue - gff3_ref_name = gff_values[0] # create keys if gff_values[2] not in gff3_dict: gff3_dict[gff_values[2]] = {} @@ -307,7 +304,7 @@ def parse_gff3(file, reference): if not gff3_dict: sys.exit(f"\033[31m\033[1mERROR:\033[0m {reference} not found in gff3 file.") - return gff3_dict, gff3_ref_name + return gff3_dict def get_genome_end(gff3_dict):