diff --git a/README.md b/README.md index 9242ced..57c3efa 100644 --- a/README.md +++ b/README.md @@ -49,12 +49,13 @@ No compilation is required for ntRoot (only the dependencies), so simply add the - [ntEdit 2.0.2+](https://github.com/bcgsc/ntEdit) - [snakemake](https://snakemake.readthedocs.io/en/stable/) - [samtools](https://www.htslib.org/) - +- [bedtools](https://bedtools.readthedocs.io/en/latest/) ## Usage ``` -usage: ntroot [-h] -r REFERENCE [--reads READS] [--genome GENOME [GENOME ...]] -l L -k K [--tile TILE] [--lai] [-t T] [-z Z] [-j J] [-Y Y] [-v] [-V] [-n] [-f] +usage: ntroot [-h] [-r REFERENCE] [--reads READS] [--genome GENOME [GENOME ...]] -l L [-k K] [--tile TILE] [--lai] [-t T] [-z Z] [-j J] [-Y Y] [--custom_vcf CUSTOM_VCF] + [--strip_info] [-v] [-V] [-n] [-f] ntRoot: Ancestry inference from genomic data @@ -73,6 +74,9 @@ optional arguments: -z Z Minimum contig length [default=100] -j J controls size of k-mer subset. When checking subset of k-mers, check every jth k-mer [default=3] -Y Y Ratio of number of k-mers in the k subset that should be present to accept an edit (higher=stringent) [default=0.55] + --custom_vcf CUSTOM_VCF + Input VCF for computing ancestry. When specified, ntRoot will skip the ntEdit step, and predict ancestry from the provided VCF. + --strip_info When using --custom_vcf, strip the existing INFO field from the input VCF. -v, --verbose Verbose mode [default=False] -V, --version show program's version number and exit -n, --dry-run Print out the commands that will be executed @@ -122,6 +126,10 @@ Example command: ntroot -k 55 --reference GRCh38.fa.gz --reads ERR3242308_ -t 48 -Y 0.55 -l 1000GP_integrated_snv_v2a_27022019.GRCh38.phased_gt1.vcf.gz +If you would like to infer ancestry from a pre-existing VCF file: +
+ntroot -r GRCh38.fa.gz --custom_vcf third_party.vcf -l 1000GP_integrated_snv_v2a_27022019.GRCh38.phased_gt1.vcf.gz
+
Note: For more advanced users, and for ancestry predictions on organisms other than human, please contact us. diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 0a790d5..f9a2ccf 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -19,7 +19,7 @@ jobs: - script: | source activate ntroot_CI conda install --yes --name ntroot_CI -c conda-forge -c bioconda python=3.9 mamba - mamba install --yes -c conda-forge -c bioconda meson ninja snakemake perl 'ntedit>=2.0.1' samtools + mamba install --yes -c conda-forge -c bioconda meson ninja snakemake perl 'ntedit>=2.0.1' samtools bedtools displayName: Install Conda packages - script: | @@ -48,7 +48,7 @@ jobs: - script: | source activate ntroot_CI conda install --yes --name ntroot_CI -c conda-forge -c bioconda python=3.9 mamba - mamba install --yes -c conda-forge -c bioconda snakemake perl 'ntedit>=2.0.1' samtools + mamba install --yes -c conda-forge -c bioconda snakemake perl 'ntedit>=2.0.1' samtools bedtools displayName: Install Conda packages - script: | diff --git a/demo/HG002.chr21.vcf.gz b/demo/HG002.chr21.vcf.gz new file mode 100644 index 0000000..0bb94ef Binary files /dev/null and b/demo/HG002.chr21.vcf.gz differ diff --git a/demo/run_ntroot_demo.sh b/demo/run_ntroot_demo.sh index fa37547..75884c1 100755 --- a/demo/run_ntroot_demo.sh +++ b/demo/run_ntroot_demo.sh @@ -32,4 +32,16 @@ else exit 1 fi +echo "Running ntRoot input VCF demo..." + +ntroot --reference chr21.fa --custom_vcf HG002.chr21.vcf.gz -l pop-spec-snp_chr21.vcf.gz +prediction=$(cat HG002.chr21.vcf.gz.cross-ref.vcf_ancestry-predictions_tile5000000.tsv | awk '{print $1}' |head -n 2 |tail -n 1) +if [ ${prediction} == "EUR" ]; then + echo "ntRoot input VCF test successful!" +else + echo "ntRoot input VCF test failed.. Please check your installation." + exit 1 +fi + + echo "Done ntRoot tests!" diff --git a/ntroot b/ntroot index 74b5fab..2b9dbf6 100755 --- a/ntroot +++ b/ntroot @@ -37,7 +37,7 @@ def set_up_parser(): parser.add_argument("-k", help="k-mer size", - required=True, type=int) + required=False, type=int) parser.add_argument("--tile", help="Tile size for ancestry fraction inference (bp) [default=5000000]", default=5000000, type=int) parser.add_argument("--lai", help="Output ancestry predictons per tile in a separate output file", @@ -54,6 +54,12 @@ def set_up_parser(): help="Ratio of number of k-mers in the k subset that should be present to accept " "an edit (higher=stringent) " "[default=0.55]", default=0.55, type=float) + parser.add_argument("--custom_vcf", help="Input VCF for computing ancestry. " + "When specified, ntRoot will skip the ntEdit step, and " + "predict ancestry from the provided VCF.", + type=str) + parser.add_argument("--strip_info", help="When using --custom_vcf, strip the existing INFO field from the input VCF.", + action="store_true") parser.add_argument("-v", "--verbose", help="Verbose mode [default=False]", action="store_true", default=False) parser.add_argument("-V", "--version", action="version", version=NTROOT_VERSION) @@ -71,7 +77,7 @@ def main(): args = parser.parse_args() - if (args.reads and args.genome) or (not args.reads and not args.genome): + if ((args.reads and args.genome) or (not args.reads and not args.genome)) and not args.custom_vcf: raise argparse.ArgumentTypeError("Please specify --reads OR --genome") if not args.draft and not args.reference: @@ -79,6 +85,10 @@ def main(): if args.draft: args.reference = args.draft + if args.reads or args.genome: + if not args.k: + raise argparse.ArgumentTypeError("Please specify the k-mer size with -k") + base_dir = os.path.dirname(os.path.realpath(__file__)) intro_string = ["Running ntRoot...", @@ -94,26 +104,38 @@ def main(): smk_rule = "ntroot_genome_lai" else: smk_rule = "ntroot_genome" + if args.custom_vcf: + if args.lai: + smk_rule = "ntroot_input_vcf_lai" + else: + smk_rule = "ntroot_input_vcf" command = f"snakemake -s {base_dir}/ntroot_run_pipeline.smk {smk_rule} --nolock -p --cores {args.t} " \ - f"--config reference={args.reference} k={args.k} t={args.t} " \ - f"z={args.z} tile_size={args.tile} " \ - f"j={args.j} Y={args.Y} " + f"--config reference={args.reference} t={args.t} " \ + f"tile_size={args.tile} " if args.genome: intro_string.append(f"\t--genome {args.genome}") - command += f"genomes={args.genome} " + command += f"genomes={args.genome}" + elif args.custom_vcf: + intro_string.append(f"\t--custom_vcf {args.custom_vcf}") + command += f" input_vcf={args.custom_vcf}" + if args.strip_info: + intro_string.append("\t--strip_info") + command += " strip_info=True" else: intro_string.append(f"\t--reads {args.reads}") command += f"reads={args.reads}" - intro_string.extend([f"\t-k {args.k}", - f"\t--reference {args.reference}", - f"\t-t {args.t}", - f"\t-z {args.z}", - f"\t-j {args.j}", - f"\t-Y {args.Y}" - ]) + intro_string.extend([f"\t--reference {args.reference}", + f"\t-t {args.t}"]) + + if args.genome or args.reads: + intro_string.extend([f"\t-k {args.k}", + f"\t-z {args.z}", + f"\t-j {args.j}", + f"\t-Y {args.Y}"]) + command += f" k={args.k} z={args.z} j={args.j} Y={args.Y}" if args.lai: intro_string.append("\t--lai") diff --git a/ntroot_cross_reference_vcf.py b/ntroot_cross_reference_vcf.py new file mode 100755 index 0000000..eff9ff0 --- /dev/null +++ b/ntroot_cross_reference_vcf.py @@ -0,0 +1,332 @@ +#!/usr/bin/env python3 +''' +Given the result of LOJ between VCF files, output valid ntEdit VCF file +''' +import argparse +import shlex +import subprocess +import gzip + +class Vcf: + "Represents a VCF entry" + def __init__(self, chr, position, idenfitier, ref, alt, qual, filter, info, format=None, + integration=None, parse_info=True, strip_info=True): + self.chr = chr + self.position = int(position) + self.idenfitier = idenfitier + self.ref = ref + self.alt = alt.split(",") + self.qual = qual + self.filter = filter + self.info_alts = None + if parse_info: # We want to parse the elements in info to a dictionary + self.info_alts = self.parse_info(info) + if "VCF_L" in info: # Accounting for placeholders for INFO line in the temp VCF + self.info_str = info.split("VCF_L")[0].strip(";") + elif not parse_info and not strip_info: + self.info_str = info + else: + self.info_str = "" + self.format = format + self.integration = integration + + def remove_info(self): + "Clear out the info and position attributes" + self.info_alts = {} + self.position = -1 + + def get_info(self, alt_base=None): + "Return the info for the given alt_base, if specified" + info_str = "" + if self.info_str is not None: + info_str += self.info_str + if alt_base is None: + return info_str + if alt_base in self.alt and self.info_alts is not None and alt_base in self.info_alts: + info_str += ";" + ";".join(self.info_alts[alt_base]) + return info_str.strip(";") + + def strip_string_from_info(self, info): + "Strip ^NA from the info string" + return info.replace("^NA", "") + + def add_info(self, alt_dict): + "Add INFO from the dictionary" + for alt_base in alt_dict: + if alt_base in self.info_alts: + continue + self.info_alts[alt_base] = alt_dict[alt_base] + self.alt.append(alt_base) + + def parse_info(self, info): + "Parse INFO string to data structures for: alt alleles, both" + info_formatted = self.strip_string_from_info(info) if "^" in info else info + if "VCF_L" in info_formatted: + info_formatted = info_formatted.split("VCF_L")[1].strip(";") # Only parse after the VCF_L part + alt_alleles = {} + for item in info_formatted.split(";"): + if "=" in item: + key, value = item.split("=") + split_val = value.split(",") + if len(split_val) > 1 and len(split_val) == len(self.alt): + for i, v in enumerate(split_val): + if self.alt[i] not in alt_alleles: + alt_alleles[self.alt[i]] = [] + alt_alleles[self.alt[i]].append(f"{key}={v}") + else: + for alt_base in self.alt: + if alt_base not in alt_alleles: + alt_alleles[alt_base] = [] + alt_alleles[alt_base].append(f"{key}={value}") + else: + for alt_base in self.alt: + if alt_base not in alt_alleles: + alt_alleles[alt_base] = [] + alt_alleles[alt_base].append(item) + + + return alt_alleles + + def get_info_dict(self): + "Return the INFO in a dictionary form" + return {alt_base: {info.split("=")[0]: info.split("=")[1] \ + if len(info.split("=")) == 2 else None for info in info_list} \ + for alt_base, info_list in self.info_alts.items()} + + + def __str__(self): + strs = [] + for alt_base in self.alt: + if self.format is None or self.integration is None: + strs.append(f"{self.chr}\t{self.position}\t{self.idenfitier}\t{self.ref}\t{alt_base}\t{self.qual}"\ + f"\t{self.filter}\t{self.get_info(alt_base)}") + else: + info = self.get_info(alt_base) + strs.append(f"{self.chr}\t{self.position}\t{self.idenfitier}\t{self.ref}\t{alt_base}\t{self.qual}"\ + f"\t{self.filter}\t{info}\t{self.format}\t{self.integration}") + return "\n".join(strs) + + def get_tmp_print_line(self): + "Return a line for printing to a temporary file" + strs = [] + for alt_base in self.alt: + if self.get_info(alt_base) == "": + info = self.info_str + ";VCF_L;" + else: + info = self.get_info(alt_base) + ";VCF_L;" + if self.format is None or self.integration is None: + strs.append(f"{self.chr}\t{self.position}\t{self.idenfitier}\t{self.ref}\t{alt_base}\t{self.qual}"\ + f"\t{self.filter}\t{info}") + else: + strs.append(f"{self.chr}\t{self.position}\t{self.idenfitier}\t{self.ref}\t{alt_base}\t{self.qual}"\ + f"\t{self.filter}\t{info}\t{self.format}\t{self.integration}") + return "\n".join(strs) + +def write_header(vcffile, outfile, info_only=False): + "Write header from input VCF file to standard out" + vcf_open = open(vcffile, 'r', encoding="utf8") if not vcffile.endswith(".gz") else gzip.open(vcffile, 'rt') + for line in vcf_open: + line = line.strip() + if info_only and line.startswith("##INFO"): + outfile.write(f"{line}\n") + elif not info_only and line.startswith("##"): + outfile.write(f"{line}\n") + elif not info_only and line.startswith("#"): + vcf_open.close() + return line + elif line.startswith("#"): + continue + else: + vcf_open.close() + return + vcf_open.close() + +def get_all_vcf_info(vcf1, vcf2, alt_base): + "Concatenate INFO strings from both VCF files" + return f"{vcf1.get_info(None)};VCF_L;{vcf2.get_info(alt_base)}".strip(";") + +def are_compatible(vcf1, vcf2): + "Ensure that at least one alt in vcf1 matches vcf2, references match, positions match" + if vcf1.ref != vcf2.ref: + return False + if vcf1.position != vcf2.position: + return False + for alt_base in vcf1.alt: + if alt_base in vcf2.alt: + return True + return False + +def print_vcf_line(ntedit_vcf, l_vcf, outfile): + "Print the info into a VCF" + if l_vcf.position == -1: + outfile.write(f"{ntedit_vcf.get_tmp_print_line()}\n") + else: + if not are_compatible(ntedit_vcf, l_vcf): + outfile.write(f"{ntedit_vcf.get_tmp_print_line()}\n") + return + if len(ntedit_vcf.alt) > 1: + out_strs = [] + for alt_base in ntedit_vcf.alt: + out_strs.append(f"{ntedit_vcf.chr}\t{ntedit_vcf.position}\t{ntedit_vcf.idenfitier}\t{ntedit_vcf.ref}\t" + f"{alt_base}\t{ntedit_vcf.qual}\t{ntedit_vcf.filter}\t" + f"{get_all_vcf_info(ntedit_vcf, l_vcf, alt_base)}\t{ntedit_vcf.format}\t" + f"{ntedit_vcf.integration}") + out_str = "\n".join(out_strs) + outfile.write(f"{out_str}\n") + else: + alt_base = ntedit_vcf.alt[0] + out_str = (f"{ntedit_vcf.chr}\t{ntedit_vcf.position}\t{ntedit_vcf.idenfitier}" + f"\t{ntedit_vcf.ref}\t{alt_base}\t{ntedit_vcf.qual}\t{ntedit_vcf.filter}\t" + f"{get_all_vcf_info(ntedit_vcf, l_vcf, alt_base)}\t{ntedit_vcf.format}\t" + f"{ntedit_vcf.integration}") + outfile.write(f"{out_str}\n") + +def parse_bedtools_loj(infile, outfile, strip_info=False): + "Parse the LOJ from bedtools to ntEdit-formatted VCF" + ntedit_vcf = None + l_vcf = None + + with open(infile, 'r', encoding="utf8") as fin: + for line in fin: + line = line.strip().split("\t") + ntedit_vcf_new = Vcf(*line[:10], parse_info=False, strip_info=strip_info) + l_vcf_new = Vcf(*line[10:18]) + if ntedit_vcf is not None and ntedit_vcf.position == ntedit_vcf_new.position \ + and ntedit_vcf.chr == ntedit_vcf_new.chr: + # This is the same position as before, tally extra INFO from l_vcf. + # Know that this cannot be due to ntEdit VCF. + if l_vcf.position == -1 and are_compatible(ntedit_vcf, l_vcf_new): + # Currently only storing incompatible entry, OK to overwrite + l_vcf = l_vcf_new + elif are_compatible(ntedit_vcf, l_vcf_new): + l_vcf.add_info(l_vcf_new.info_alts) + l_vcf.position = l_vcf_new.position + else: + continue + + else: + if ntedit_vcf is not None: + # This is not the same position, print previous and update + print_vcf_line(ntedit_vcf, l_vcf, outfile) + ntedit_vcf = ntedit_vcf_new + l_vcf = l_vcf_new + if l_vcf_new.position != -1 and not are_compatible(ntedit_vcf_new, l_vcf_new): + l_vcf.remove_info() + + print_vcf_line(ntedit_vcf, l_vcf, outfile) + +def fold_attributes(vcf_list): + "Fold the attributes" + attribute_lists = [] + alts = [] + for vcf in vcf_list: + assert len(vcf.alt) == 1 + alts.append(vcf.alt[0]) + attribute_dict = vcf.get_info_dict() + list_attributes = [key for _, base_dict in attribute_dict.items() for key in base_dict] + attribute_lists.extend(list_attributes) + attribute_set = set(attribute_lists) + + + new_dict = {} # Key: [] to track lists of INFO + for attribute in attribute_set: + new_dict[attribute] = [] + for vcf in vcf_list: + vcf_attributes = vcf.get_info_dict()[vcf.alt[0]] + if attribute in vcf_attributes and vcf_attributes[attribute] is None and attribute in new_dict: + continue + elif attribute in vcf_attributes: + new_dict[attribute].append(vcf_attributes[attribute]) + else: + new_dict[attribute].append("NA") + + new_info_line = [] + visited_attributes = set() + for attribute in attribute_lists: + if attribute not in visited_attributes: + if None not in new_dict[attribute] and attribute != "": + new_info_line.append(f"{attribute}={','.join(new_dict[attribute])}") + visited_attributes.add(attribute) + else: + new_info_line.append(f"{attribute}") + rep_vcf = vcf_list[0] + if rep_vcf.info_str is not None: + info_line = rep_vcf.info_str + ";" + ";".join(new_info_line) + else: + info_line = ";".join(new_info_line) + info_line = info_line.strip(";") + out_str = f"{rep_vcf.chr}\t{rep_vcf.position}\t{rep_vcf.idenfitier}\t{rep_vcf.ref}\t{','.join(alts)}\t"\ + f"{rep_vcf.qual}\t{rep_vcf.filter}\t{info_line}\t{rep_vcf.format}\t{rep_vcf.integration}" + return out_str + + +def print_folded_vcf_line(vcf_list, outfile): + "Print the lines to the VCF, folding variants and INFO if applicable" + if len(vcf_list) == 1: + outline = vcf_list[0] + outfile.write(f"{outline}\n") + else: + new_info_line = fold_attributes(vcf_list) + outfile.write(f"{new_info_line}\n") + +def refold_variants(in_vcf, out_prefix): + "Re-interleave or refold the variants, so a given position is on one line" + prev_vcf = None + vcfs = [] + with open(in_vcf, 'r', encoding="utf8") as fin: + with open(f"{out_prefix}.vcf", 'w', encoding="utf8") as fout: + for line in fin: + line = line.strip() + if line.startswith("#"): + fout.write(f"{line}\n") + else: + vcf_line = Vcf(*line.split("\t")) + if prev_vcf is not None and \ + (prev_vcf.position != vcf_line.position or prev_vcf.chr != vcf_line.chr): + # Different, ready to print + print_folded_vcf_line(vcfs, fout) + vcfs = [] + vcfs.append(vcf_line) + prev_vcf = vcf_line + print_folded_vcf_line(vcfs, fout) + +def remove_tmp_vcf(filename): + "Remove this temporary file" + cmd = f"rm {filename}" + ret_code = subprocess.run(shlex.split(cmd), check=True) + assert ret_code.returncode == 0 + + +def main(): + """ + Given the result of LOJ between VCF files, output valid ntEdit VCF file + """ + parser = argparse.ArgumentParser( + description="Given the result of LOJ between VCF files, output valid ntEdit VCF file") + parser.add_argument("-b", "--bedtools", help="Bedtools intersect file, or - to read from standard in", + required=True, type=str) + parser.add_argument("--vcf", help="ntEdit VCF file (for header)", required=True, type=str) + parser.add_argument("--vcf_l", help="-l VCF file (for header)", required=True, type=str) + parser.add_argument("-p", "--prefix", help="Prefix for out files", required=False, + default="ntedit_snv_vcf", type=str) + parser.add_argument("--strip", help="Strip INFO field from --vcf. Use when have ntEdit VCF with already " + "cross-referenced ancestry", + action="store_true") + + args = parser.parse_args() + + args.bedtools = "/dev/stdin" if args.bedtools == "-" else args.bedtools + + with open(f"{args.prefix}.tmp.vcf", 'w', encoding="utf8") as fout: + header = write_header(args.vcf, fout, info_only=False) + write_header(args.vcf_l, fout, info_only=True) + fout.write(f"{header}\n") + + parse_bedtools_loj(args.bedtools, fout, args.strip) + + refold_variants(f"{args.prefix}.tmp.vcf", args.prefix) + + remove_tmp_vcf(f"{args.prefix}.tmp.vcf") + +if __name__ == "__main__": + main() diff --git a/ntroot_run_pipeline.smk b/ntroot_run_pipeline.smk index 0594868..88da020 100644 --- a/ntroot_run_pipeline.smk +++ b/ntroot_run_pipeline.smk @@ -11,7 +11,7 @@ onsuccess: reference=config["reference"] draft_base = os.path.basename(os.path.realpath(reference)) reads_prefix=config["reads"] if "reads" in config else "" -k=config["k"] +k=config["k"] if "k" in config else None genomes = config["genomes"] if "genomes" in config else "" genome_prefix = ".".join([os.path.basename(os.path.realpath(genome)).removesuffix(".fa").removesuffix(".fasta").removesuffix(".fna") for genome in genomes]) @@ -31,6 +31,11 @@ l = config["l"] if "l" in config else "" # Ancestry inference parameters tile_size = config["tile_size"] if "tile_size" in config else 5000000 +# Third-party VCF parameters +input_vcf = config["input_vcf"] if "input_vcf" in config else None +input_vcf_basename = os.path.basename(os.path.realpath(input_vcf)) if input_vcf else "None" +strip_info = config["strip_info"] if "strip_info" in config else None + # time command mac_time_command = "command time -l -o" linux_time_command = "command time -v -o" @@ -51,6 +56,12 @@ rule ntroot_genome_lai: rule ntroot_reads_lai: input: f"{reads_prefix}_ntedit_k{k}_variants.vcf_ancestry-predictions-tile-resolution_tile{tile_size}.tsv" +rule ntroot_input_vcf: + input: f"{input_vcf_basename}.cross-ref.vcf_ancestry-predictions_tile{tile_size}.tsv" + +rule ntroot_input_vcf_lai: + input: f"{input_vcf_basename}.cross-ref.vcf_ancestry-predictions-tile-resolution_tile{tile_size}.tsv" + rule ntedit_reads: input: reference = reference @@ -98,7 +109,7 @@ rule ancestry_prediction: output: predictions = "{vcf}_ancestry-predictions_tile{tile_size}.tsv" params: - benchmark = f"{time_command} ancestry_prediction_k{k}_tile{tile_size}.time", + benchmark = f"{time_command} ancestry_prediction_tile{tile_size}.time" if input_vcf_basename else f"{time_command} ancestry_prediction_k{k}_tile{tile_size}.time", tile_size = tile_size, verbosity = v shell: @@ -112,9 +123,50 @@ rule ancestry_prediction_lai: output: lai_output = "{vcf}_ancestry-predictions-tile-resolution_tile{tile_size}.tsv" params: - benchmark = f"{time_command} ancestry_prediction_k{k}_tile{tile_size}.time", + benchmark = f"{time_command} ancestry_prediction_tile{tile_size}.time" if input_vcf_basename else f"{time_command} ancestry_prediction_k{k}_tile{tile_size}.time", tile_size = tile_size, verbosity = v shell: "{params.benchmark} ntRootAncestryPredictor.pl -f {input.vcf} -t {params.tile_size} -v {params.verbosity} -r 1 -i {input.ref_fai}" +rule sort_vcf_input: + input: vcf = f"{input_vcf}" + output: vcf_sorted = temp(f"{input_vcf_basename}_sorted.vcf") + params: + benchmark = f"{time_command} sort_vcf_{input_vcf_basename}.time", + cat_cmd = "gunzip -c" if f"{input_vcf}".endswith(".gz") else "cat" + shell: + """{params.benchmark} sh -c '(echo "##fileformat=VCFv4.2" ; {params.cat_cmd} {input.vcf} |grep -v "#" |sort -k1,1 -k2,2n) > {output.vcf_sorted}'""" + +rule sort_vcf_l: + input: vcf = l + output: temp(f"{os.path.basename(os.path.realpath(l))}_sorted.tmp.vcf") + params: + benchmark = f"{time_command} sort_vcf_l.time", + cat_cmd = "gunzip -c" if f"{l}".endswith(".gz") else "cat" + shell: + """{params.benchmark} sh -c "(echo '##fileformat=VCFv4.2' ; {params.cat_cmd} {input.vcf} | awk '\$5 !~ /^ {output}" """ + +rule bedtools_intersect: + input: + sorted_vcf = f"{input_vcf_basename}_sorted.vcf", + sorted_ref_vars = f"{os.path.basename(os.path.realpath(l))}_sorted.tmp.vcf" + output: + bedtools = temp(f"{input_vcf_basename}.bedtools-intersect.bed") + params: + benchmark = f"{time_command} bedtools_intersect_{input_vcf_basename}.time" + shell: + "{params.benchmark} bedtools intersect -loj -sorted -a {input.sorted_vcf} -b {input.sorted_ref_vars} > {output.bedtools}" + +rule cross_reference_vcf: + input: + vcf = f"{input_vcf}", + ref_vars = l, + bedtools = f"{input_vcf_basename}.bedtools-intersect.bed" + output: f"{input_vcf_basename}.cross-ref.vcf" + params: + benchmark = f"{time_command} cross_reference_vcf_{input_vcf_basename}.time", + prefix=f"{input_vcf_basename}.cross-ref", + strip = "--strip" if strip_info else "" + shell: + "{params.benchmark} ntroot_cross_reference_vcf.py -b {input.bedtools} --vcf {input.vcf} --vcf_l {input.ref_vars} -p {params.prefix} {params.strip}" \ No newline at end of file