diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml index 34560ff..741d0f7 100644 --- a/.github/workflows/testing.yml +++ b/.github/workflows/testing.yml @@ -15,7 +15,7 @@ jobs: echo $CONDA/bin >> $GITHUB_PATH - name: Install dependencies run: | - conda install -n base python=3.10 conda-forge::biopython numpy==1.26 bioconda::pyranges==0.0.129 packaging bioconda::scanpy pytest -y + conda install -n base -c bioconda -c conda-forge python=3.10 packaging pandas scanpy pytest -y - name: pytest run: | diff --git a/bin/pyroe b/bin/pyroe index db86bd4..4df61c3 100755 --- a/bin/pyroe +++ b/bin/pyroe @@ -1,11 +1,6 @@ #!/usr/bin/env python import logging -from pyroe import make_splici_txome, make_spliceu_txome -from pyroe import id_to_name - -if make_spliceu_txome is None or make_splici_txome is None or id_to_name is None: - raise ImportError("To run pyroe CLI, Please install pyranges, biopython and bedtools.") from pyroe import fetch_processed_quant from pyroe import convert from pyroe import output_formats @@ -37,133 +32,6 @@ if __name__ == "__main__": help="additional help", ) - # make-splici - parser_makeSplici = subparsers.add_parser( - "make-spliced+intronic", - help="Make spliced + intronic reference", - aliases=["make-splici"], - ) - parser_makeSplici.set_defaults(command="make-spliced+intronic") - parser_makeSplici.add_argument( - "genome_path", - metavar="genome-path", - type=str, - help="The path to a genome fasta file.", - ) - parser_makeSplici.add_argument( - "gtf_path", metavar="gtf-path", type=str, help="The path to a gtf file." - ) - parser_makeSplici.add_argument( - "read_length", - metavar="read-length", - type=int, - help="The read length of the single-cell experiment being processed (determines flank size).", - ) - parser_makeSplici.add_argument( - "output_dir", - metavar="output-dir", - type=str, - help="The output directory where splici reference files will be written.", - ) - parser_makeSplici.add_argument( - "--filename-prefix", - type=str, - default="splici", - help="The file name prefix of the generated output files.", - ) - parser_makeSplici.add_argument( - "--flank-trim-length", - type=int, - default=5, - help="Determines the amount subtracted from the read length to get the flank length.", - ) - parser_makeSplici.add_argument( - "--extra-spliced", - type=str, - help="The path to an extra spliced sequence fasta file.", - ) - parser_makeSplici.add_argument( - "--extra-unspliced", - type=str, - help="The path to an extra unspliced sequence fasta file.", - ) - parser_makeSplici.add_argument( - "--bt-path", - type=str, - default="bedtools", - help="The path to bedtools v2.30.0 or greater.", - ) - parser_makeSplici.add_argument( - "--no-bt", - action="store_true", - help="A flag indicates whether bedtools will be used for generating splici reference files.", - ) - parser_makeSplici.add_argument( - "--dedup-seqs", - action="store_true", - help="A flag indicates whether identical sequences will be deduplicated.", - ) - parser_makeSplici.add_argument( - "--no-flanking-merge", - action="store_true", - help="A flag indicates whether flank lengths will be considered when merging introns.", - ) - - # make-spliceu - parser_makeSpliceu = subparsers.add_parser( - "make-spliced+unspliced", - help="Make spliced + unspliced reference", - aliases=["make-spliceu"], - ) - parser_makeSpliceu.set_defaults(command="make-spliced+unspliced") - parser_makeSpliceu.add_argument( - "genome_path", - metavar="genome-path", - type=str, - help="The path to a genome fasta file.", - ) 
- parser_makeSpliceu.add_argument( - "gtf_path", metavar="gtf-path", type=str, help="The path to a gtf file." - ) - parser_makeSpliceu.add_argument( - "output_dir", - metavar="output-dir", - type=str, - help="The output directory where Spliceu reference files will be written.", - ) - parser_makeSpliceu.add_argument( - "--filename-prefix", - type=str, - default="spliceu", - help="The file name prefix of the generated output files.", - ) - parser_makeSpliceu.add_argument( - "--extra-spliced", - type=str, - help="The path to an extra spliced sequence fasta file.", - ) - parser_makeSpliceu.add_argument( - "--extra-unspliced", - type=str, - help="The path to an extra unspliced sequence fasta file.", - ) - parser_makeSpliceu.add_argument( - "--bt-path", - type=str, - default="bedtools", - help="The path to bedtools v2.30.0 or greater.", - ) - parser_makeSpliceu.add_argument( - "--no-bt", - action="store_true", - help="A flag indicates whether bedtools will be used for generating Spliceu reference files.", - ) - parser_makeSpliceu.add_argument( - "--dedup-seqs", - action="store_true", - help="A flag indicates whether identical sequences will be deduplicated.", - ) - # parse available datasets available_datasets = fetch_processed_quant() epilog = "\n".join( @@ -213,20 +81,6 @@ if __name__ == "__main__": help="A flag indicates whether help messaged should not be printed.", ) - parser_id_to_name = subparsers.add_parser( - "id-to-name", help="Generate a gene id to gene name mapping file from a GTF." - ) - parser_id_to_name.set_defaults(command="id-to-name") - parser_id_to_name.add_argument("gtf_file", help="The GTF input file.") - parser_id_to_name.add_argument( - "output", help="The path to where the output tsv file will be written." - ) - parser_id_to_name.add_argument( - "--format", - help="The input format of the file (must be either GTF or GFF3). This will be inferred from the filename, but if that fails it can be provided explicitly.", - default=None, - ) - out_formats = output_formats() parser_convert = subparsers.add_parser( "convert", help="Convert alevin-fry quantification result to another format." @@ -244,7 +98,7 @@ if __name__ == "__main__": ) parser_convert.add_argument( "--output-structure", - help="The structure that U,S and A counts should occupy in the output matrix.", + help="The structure that U,S and A counts should occupy in the output matrix. 
It will be passed to the output_format argument of the `load_fry` function.", ) parser_convert.add_argument( "--output-format", @@ -262,34 +116,7 @@ if __name__ == "__main__": # Execute the parse_args() method args = parser.parse_args() - if args.command == "make-spliced+intronic": - make_splici_txome( - genome_path=args.genome_path, - gtf_path=args.gtf_path, - read_length=args.read_length, - output_dir=args.output_dir, - flank_trim_length=args.flank_trim_length, - filename_prefix=args.filename_prefix, - extra_spliced=args.extra_spliced, - extra_unspliced=args.extra_unspliced, - dedup_seqs=args.dedup_seqs, - no_bt=args.no_bt, - bt_path=args.bt_path, - no_flanking_merge=args.no_flanking_merge, - ) - elif args.command == "make-spliced+unspliced": - make_spliceu_txome( - genome_path=args.genome_path, - gtf_path=args.gtf_path, - output_dir=args.output_dir, - filename_prefix=args.filename_prefix, - extra_spliced=args.extra_spliced, - extra_unspliced=args.extra_unspliced, - dedup_seqs=args.dedup_seqs, - no_bt=args.no_bt, - bt_path=args.bt_path, - ) - elif args.command == "fetch-quant": + if args.command == "fetch-quant": fetch_processed_quant( dataset_ids=args.dataset_ids, fetch_dir=args.fetch_dir, @@ -299,8 +126,6 @@ if __name__ == "__main__": ) elif args.command == "convert": convert(args) - elif args.command == "id-to-name": - id_to_name(args) else: print(parser.print_help()) sys.exit(1) diff --git a/setup.cfg b/setup.cfg index 6898d64..5e71be4 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = pyroe -version = 0.10.0 +version = 0.11.0 author = Dongze He, Rob Patro author_email = dhe17@umd.edu, rob@cs.umd.edu description = utilities of alevin-fry @@ -16,14 +16,14 @@ classifiers = packages = find: package_dir = = src -# scripts = -# bin/pyroe +scripts = + bin/pyroe python_requires = >=3.7 include_package_data = True install_requires = packaging >= 21.0 scanpy >= 1.8.2 - + pandas >= 1.3.0 [options.packages.find] where = src diff --git a/src/pyroe/__init__.py b/src/pyroe/__init__.py index ce93022..cb6ad9c 100644 --- a/src/pyroe/__init__.py +++ b/src/pyroe/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.10.0" +__version__ = "0.11.0" from pyroe.load_fry import load_fry from pyroe.fetch_processed_quant import fetch_processed_quant @@ -7,13 +7,4 @@ from pyroe.convert import convert from pyroe.pyroe_utils import output_formats - -# try: -# from pyroe.make_txome import make_splici_txome, make_spliceu_txome -# from pyroe.id_to_name import id_to_name -# except ImportError: -# make_splici_txome = None -# make_spliceu_txome = None -# id_to_name = None - # flake8: noqa diff --git a/src/pyroe/id_to_name.py b/src/pyroe/id_to_name.py deleted file mode 100644 index a1715c7..0000000 --- a/src/pyroe/id_to_name.py +++ /dev/null @@ -1,57 +0,0 @@ -import pyranges -import logging -import pathlib -import sys - - -def id_to_name(params): - format_map = { - "gtf": pyranges.read_gtf, - "gff": pyranges.read_gff3, - "gff3": pyranges.read_gff3, - } - annot_reader = None - if params.format is None: - p = pathlib.Path(params.gtf_file) - suffs = [z.lower().strip(".") for z in p.suffixes] - - z = None - if len(suffs) >= 1: - # look at the final suffix - z = suffs[-1] - # if the final suffix is gz and there are - # suffixes preceding it, check the penultimate - # one and use that - if z == "gz" and len(suffs) > 1: - z = suffs[-2] - - if z is None or z not in format_map.keys(): - logging.error( - "Could not determine format of annotation file. Please provide it explicitly." 
-            )
-            sys.exit(1)
-        else:
-            annot_reader = format_map[z]
-    else:
-        fmt = params.format.lower()
-        if fmt not in format_map.keys():
-            logging.error(
-                f'Format must be either "gtf" or "gff3", but {fmt} was provided.'
-            )
-            sys.exit(1)
-        annot_reader = format_map[fmt]
-
-    a = annot_reader(params.gtf_file)
-    # only bother looking at `gene` features
-    a_genes = a[(a.Feature == "gene")]
-    id_name = {}
-    for k, df in a_genes:
-        cdf = df[["gene_id", "gene_name"]].to_dict(orient="records")
-        id_name.update({d["gene_id"]: d["gene_name"] for d in cdf})
-
-    logging.info(f"generated mappings for {len(id_name)} gene ids.")
-    logging.info(f"writing output to {params.output}")
-
-    with open(params.output, "w") as ofile:
-        for k, v in id_name.items():
-            ofile.write(f"{k}\t{v}\n")
diff --git a/src/pyroe/load_fry.py b/src/pyroe/load_fry.py
index d8e8357..5615403 100644
--- a/src/pyroe/load_fry.py
+++ b/src/pyroe/load_fry.py
@@ -1,15 +1,14 @@
-try:
-    import scanpy
-except ModuleNotFoundError:
-    print(
-        "scanpy must be installed to enable the load_fry() function. Use `conda install -c scanpy ` or `pip install scanpy` to install it."
-    )
-    import sys
-
-    sys.exit(1)
+from .pyroe_utils import say


-def load_fry(frydir, output_format="scRNA", nonzero=False, quiet=False):
+def load_fry(
+    frydir,
+    output_format="scRNA",
+    aux_columns=["X", "Y"],
+    gene_id_to_name=None,
+    nonzero=False,
+    quiet=False,
+):
     """
     load alevin-fry quantification result into an AnnData object

@@ -26,6 +25,19 @@ def load_fry(frydir, output_format="scRNA", nonzero=False, quiet=False):
         If a customized format of the returned `AnnData` is needed, one can pass a Dictionary.\\
         See Notes section for details.

+    aux_columns : `list[str]` (default: `["X", "Y"]`)
+        A list of strings containing the column names of the auxiliary information in the barcodes file, starting from the second column. \\
+        The first column is assumed to be the barcodes and is named "barcodes". \\
+        Extra auxiliary columns in the barcodes file without a specified name will be ignored.
+
+    gene_id_to_name : `str` or `None` (default: `None`)
+        The path to a file that contains the mapping from gene ids to gene names. \\
+        It is only needed if \\
+        1. you are not using the simpleaf pipeline (`simpleaf index` + `simpleaf quant`), \\
+        2. you have such a file, and \\
+        3. you want to add this information to the gene metadata (`var`) of the returned `AnnData`.
+        If you do, please ensure it is a tab-separated, two-column file without a header, where the first column is the gene ids and the second column is the gene names.
+
     nonzero : `bool` (default: `False`)
         True if cells with non-zero expression value across all genes should be filtered in each layer.
         False if unexpressed genes should be kept.

@@ -45,7 +57,7 @@ def load_fry(frydir, output_format="scRNA", nonzero=False, quiet=False):

     The following formats are defined:

-    * "scRNA": \\
+    * "scRNA" and "S+A": \\
         This format is recommended for single cell RNA-sequencing experiments.
         It returns a `X` field that contains the S+A count of each gene in each cell,
         and a `unspliced` field that contains the U count of each gene.
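For orientation, here is a minimal usage sketch of the two new parameters documented above. This is a hedged example, not part of the patch: "quant_dir" stands for an alevin-fry/simpleaf output directory, and "id2name.tsv" for a header-less, tab-separated mapping file as described in the docstring.

    from pyroe import load_fry

    # Hypothetical inputs: "quant_dir" is a placeholder quantification
    # directory; "id2name.tsv" is a placeholder two-column file whose first
    # column holds gene ids and whose second column holds gene names.
    adata = load_fry(
        "quant_dir",
        output_format="scRNA",
        aux_columns=["X", "Y"],  # names for any extra barcode-file columns
        gene_id_to_name="id2name.tsv",  # omit to fall back to quant_dir/gene_id_to_name.tsv
        quiet=True,
    )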
@@ -89,6 +101,7 @@ def load_fry(frydir, output_format="scRNA", nonzero=False, quiet=False):
     import json
     import os
     import pandas as pd
+    import scanpy

     # since alevin-fry 0.4.1 the generic "meta_info.json"
     # has been replaced by a more informative name for each
     # of the supported alevin-fry quantification methods.
     meta_info_files = ["quant.json", "meta_info.json"]

     fpath = os.path.sep.join([frydir, meta_info_files[0]])
     # first, check for the new file, if we don't find it, check
     # for the old one.
     if not os.path.exists(fpath):
-        if not quiet:
-            print(
-                f"Did not find a {meta_info_files[0]} file, checking for older {meta_info_files[1]}."
-            )
+        say(
+            quiet,
+            f"Did not find a {meta_info_files[0]} file, checking for older {meta_info_files[1]}.",
+        )
+
         fpath = os.path.sep.join([frydir, meta_info_files[1]])
         # if we don't find the old one either, then return None
         if not os.path.exists(fpath):
-            raise IOError(f"Found no {meta_info_files[1]} file either; cannot proceed.")
+            raise IOError(
+                "The provided `frydir` doesn't contain the required meta info file; cannot proceed."
+            )

     # if we got here then we had a valid json file, so
     # use it to get the number of genes, and if we are
@@ -115,8 +131,8 @@ def load_fry(frydir, output_format="scRNA", nonzero=False, quiet=False):
     meta_info = json.load(open(fpath))
     ng = meta_info["num_genes"]
     usa_mode = meta_info["usa_mode"]
-    if not quiet:
-        print(f"USA mode: {usa_mode}")
+
+    say(quiet, f"USA mode: {usa_mode}")

     # if we are in USA mode
     if usa_mode:
@@ -125,42 +141,92 @@ def load_fry(frydir, output_format="scRNA", nonzero=False, quiet=False):
         # genes is ng/3.
         ng = int(ng / 3)
         output_assays = process_output_format(output_format, quiet)
-    elif not quiet:
-        print(
-            "Processing input in standard mode, the count matrix will be stored in field 'X'."
+    else:
+        say(
+            quiet,
+            "Processing input in standard mode, the count matrix will be stored in field 'X'.",
         )
         if output_format != "scRNA":
-            print("Output_format will be ignored.")
+            say(quiet, "Output_format will be ignored.")

-    # read the actual input matrix
-    af_raw = scanpy.read_mtx(os.path.sep.join([frydir, "alevin", "quants_mat.mtx"]))
-    afg = [
-        line.rstrip()
-        for line in open(
-            os.path.sep.join([frydir, "alevin", "quants_mat_cols.txt"])
-        ).readlines()
-    ][:ng]  # read the gene ids
-    afg_df = pd.DataFrame(afg, columns=["gene_ids"])
-    afg_df = afg_df.set_index("gene_ids")
-    # and the barcodes
-    abc = [
-        line.rstrip()
-        for line in open(
-            os.path.sep.join([frydir, "alevin", "quants_mat_rows.txt"])
-        ).readlines()
-    ]
-    abc_df = pd.DataFrame(abc, columns=["barcodes"])
-    abc_df.index = abc_df["barcodes"]
+    afg_df = pd.read_table(
+        os.path.sep.join([frydir, "alevin", "quants_mat_cols.txt"]),
+        names=["gene_ids"],
+        nrows=ng
+    )
+
+    # if we have a gene id to name mapping, use it
+    # Otherwise, we see if there is a default mapping file
+    # We first check whether the user provided one
+    gene_id_to_name_path = gene_id_to_name
+    # make sure the file exists
+    if gene_id_to_name_path is not None:
+        if not os.path.exists(gene_id_to_name_path):
+            say(
+                quiet,
+                f"The provided gene id to name mapping file {gene_id_to_name_path} does not exist; ignored.",
+            )
+            gene_id_to_name_path = None
+
+    # we then check if we can find a default mapping file
+    if gene_id_to_name_path is None:
+        default_gene_id_to_name_path = os.path.sep.join([frydir, "gene_id_to_name.tsv"])
+        if os.path.exists(default_gene_id_to_name_path):
+            gene_id_to_name_path = default_gene_id_to_name_path
+            say(
+                quiet,
+                f"Using simpleaf gene id to name mapping file: {gene_id_to_name_path}",
+            )
+    # read the file if we find it
+    if gene_id_to_name_path is not None:
+        gene_id_to_name_df = pd.read_table(
+            gene_id_to_name_path, names=["gene_ids", "gene_names"]
+        )
+        afg_df = pd.merge(afg_df, gene_id_to_name_df, on="gene_ids", how="left")
+
+    afg_df = afg_df.set_index("gene_ids", drop=False)
+
+    # read the barcodes file
+    abc_df = pd.read_table(
+        os.path.sep.join([frydir, "alevin", "quants_mat_rows.txt"]), header=None
+    )
+
+    # if the column names and the number of columns don't match, reconcile them
+    # we have only barcode
+    if abc_df.shape[1] == 1:
+        columns = ["barcodes"]
+    else:
+        say(quiet, f"Found {abc_df.shape[1] - 1} auxiliary columns in barcodes file.")
+        columns = ["barcodes"] + aux_columns
+
+    if abc_df.shape[1] != len(columns):
+        ncol = min(abc_df.shape[1], len(columns))
+
+        say(
+            quiet,
+            f"Number of auxiliary columns in barcodes file does not match the provided column names. Using the first {ncol} columns in the barcode file with names: {columns[:ncol]}.",
+        )
+
+        abc_df.columns = columns[:ncol]
+        columns = columns[:ncol]
+
+    abc_df = abc_df.set_axis(columns, axis=1)
+    abc_df = abc_df.set_index(columns[0], drop=False)
+
+    say(quiet, "Reading the quantification matrix.")
+    af_raw = scanpy.read_mtx(os.path.sep.join([frydir, "alevin", "quants_mat.mtx"]))
     x = af_raw.X

     # if we're not in USA mode, just combine this info into
     # an AnnData object
     if not usa_mode:
+        say(quiet, "Constructing the AnnData object.")
         af = scanpy.AnnData(x.T, var=abc_df, obs=afg_df)
         af = af.T
     else:  # USA mode
+        say(quiet, "Converting USA mode quantification into specified layers.")
         # otherwise, combine the sub-matrices into the output object as
         # specified by `output_assays`
         rd = {"S": range(0, ng), "U": range(ng, 2 * ng), "A": range(2 * ng, 3 * ng)}
@@ -191,9 +257,9 @@ def load_fry(frydir, output_format="scRNA", nonzero=False, quiet=False):

         af = af[:, not_zero_genes]

-        if not quiet:
-            print(f"Filtered {np.sum(~not_zero_genes)} non-expressed genes.")
+        say(quiet, f"Filtered {np.sum(~not_zero_genes)} non-expressed genes.")

+    say(quiet, "Done.")
     return af

@@ -226,32 +292,30 @@ def process_output_format(output_format, quiet):
         output_format = output_format.lower()
         if output_format not in predefined_format.keys():
             # invalid output_format string
-            if not quiet:
-                print("A undefined Provided output_format string provided.")
-                print("See function help message for details.")
+            say(quiet, "An undefined output_format string was provided.")
+            say(quiet, "See function help message for details.")
             raise ValueError("Invalid output_format.")
-        if not quiet:
-            print("Using pre-defined output format:", output_format)
-            print(
-                f"Will populate output field X with sum of counts frorm {predefined_format[output_format]['X']}."
-            )
-            for k, v in predefined_format[output_format].items():
-                if k != "X":
-                    print(f"Will combine {v} into output layer {k}.")
+        say(quiet, f"Using pre-defined output format: {output_format}")
+        say(
+            quiet,
+            f"Will populate output field X with sum of counts from {predefined_format[output_format]['X']}.",
+        )
+        for k, v in predefined_format[output_format].items():
+            if k != "X":
+                say(quiet, f"Will combine {v} into output layer {k}.")

         return predefined_format[output_format]
     else:
-        if not quiet:
-            print("Processing user-defined output format.")
+        say(quiet, "Processing user-defined output format.")
         # make sure the X is there
         if "X" not in output_format.keys():
             raise ValueError(
                 'In USA mode some sub-matrices must be assigned to the "X" (default) output.'
             )
-        if not quiet:
-            print(
-                f"Will populate output field X with sum of counts frorm {output_format['X']}."
-            )
+        say(
+            quiet,
+            f"Will populate output field X with sum of counts from {output_format['X']}.",
+        )

         for k, v in output_format.items():
             if not v:
                 raise ValueError(
@@ -266,8 +330,8 @@ def process_output_format(output_format, quiet):
                     raise ValueError(
                         f"Found non-USA element in output_format element list '{v}' for key '{k}'; cannot proceed."
                     )
-        if not quiet and (k != "X"):
-            print(f"Will combine {v} into output layer {k}.")
+        if k != "X":
+            say(quiet, f"Will combine {v} into output layer {k}.")

         return output_format
     else:
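The user-defined branch above accepts a dictionary in place of a predefined format string. A hedged sketch of that calling pattern follows; the layer names are illustrative and "quant_dir" is a placeholder, not a value from this patch.

    from pyroe import load_fry

    # In USA mode, "X" must be assigned and receives the summed counts of the
    # listed splicing statuses; every other key becomes a layer in the AnnData.
    custom_format = {
        "X": ["S", "A"],      # spliced + ambiguous counts in the main matrix
        "unspliced": ["U"],   # U counts kept in their own layer
    }
    adata = load_fry("quant_dir", output_format=custom_format, quiet=True)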
diff --git a/src/pyroe/make_splici_txome.py b/src/pyroe/make_splici_txome.py
deleted file mode 100755
index aeca2a0..0000000
--- a/src/pyroe/make_splici_txome.py
+++ /dev/null
@@ -1,650 +0,0 @@
-def append_extra(extra_infile, out_fa, out_t2g3col, id2name_path, col_status):
-    """
-    extra_infile : str
-        path to the extra FASTA format input file from which to read the extra records
-    out_fa : str
-        path to the output FASTA file being created (will be *appended* to)
-    out_t2g3col : str
-        path to the output t2g 3-column file (will be *appended* to)
-    id2name_path : str
-        path to the output (existing) gene id to gene name tsv (will be *appended* to)
-    col_status : char
-        splicing status to use for these records (one of 'S' or 'U')
-    """
-
-    from Bio.SeqIO.FastaIO import SimpleFastaParser
-
-    # write extra sequences to the t2g file
-    with open(extra_infile) as extra_in_fa:
-        with open(out_fa, "a") as splici_fa:
-            with open(out_t2g3col, "a") as t2g:
-                with open(id2name_path, "a") as id_to_name:
-                    for title, sequence in SimpleFastaParser(extra_in_fa):
-                        tid = title.split()[0]
-                        splici_fa.write(f">{tid}\n")
-                        splici_fa.write(f"{sequence}\n")
-                        t2g.write(f"{tid}\t{tid}\t{col_status}\n")
-                        id_to_name.write(f"{tid}\t{tid}\n")
-
-
-def dedup_sequences(output_dir, in_fa, out_fa):
-    """
-    This function deduplicates the fasta sequences in `in_fa`, writes the
-    deduplicated sequences in fasta format to `out_fa`, and also produces a
-    2-column tsv file in `ouput_dir`/duplicate_entries.tsv recording the
-    entries that have been removed and the retained entry that they match.
-    Note: It is allowed (and in some cases intended) that `in_fa` == `out_fa`.
-    In this case, the input fasta file will be overwritten.
-
-    output_dir : str
-        The output directory where duplicate_entries.tsv will be written
-    in_fa : str
-        The path to the input fasta file to be deduplicated
-    out_fa : str
-        The path to where the ouput deduplicated fasta file should be written
-    """
-    import os
-    from Bio import SeqIO
-    from Bio.Seq import Seq
-    from Bio.SeqRecord import SeqRecord
-
-    record_representatives = {}
-
-    # read from the input fasta file and track
-    # the duplicate sequences
-    for record in SeqIO.parse(in_fa, "fasta"):
-        seq = str(record.seq)
-        name = record.id.split()[0]
-        # if we haven't seen it yet, this is the representative
-        # otherwise append the name of the duplicate
-        if seq in record_representatives:
-            record_representatives[seq].append(name)
-        else:
-            record_representatives[seq] = [name]
-
-    # write out the duplicate entries in the same format
-    # the salmon / pufferfish indexer uses
-    dup_file_name = os.path.sep.join([output_dir, "duplicate_entries.tsv"])
-    with open(dup_file_name, "w") as dup_file:
-        dup_file.write("RetainedRef\tDuplicateRef\n")
-        for k, vu in record_representatives.items():
-            if len(vu) > 1:
-                v = sorted(vu)
-                for dup in v[1:]:
-                    dup_file.write(f"{v[0]}\t{dup}\n")
-
-    # write the deduplicated output to file
-    # Note: it might be that out_file == in_file and we
-    # are overwriting the input.
-    with open(out_fa, "w") as ofile:
-        for k, vu in record_representatives.items():
-            v = sorted(vu)
-            rec = SeqRecord(Seq(k), id=v[0], description="")
-            SeqIO.write(rec, ofile, "fasta")
-
-
-def check_gr(gr, output_dir, write_clean_gtf):
-    """
-    This function checks the validity of a PyRanges object.
- It will follow these rules: - 1. For gene type records, gene_id and gene_name cannot be both missing. - 2. All other types of records have to have a valid (not NaN) transcript_id. - 3. For all types of records other than gene type, - - If the gene_id and gene_name are both missing, then the transcript_id - will be used to impute them. - - If one of gene_id and gene_name is missing, then the other will be - used to impute the missing one. - - Args: - gr (pyranges): Stranded PyRanges object - """ - - import pandas as pd - import os - import pyranges as pr - import warnings - - # split gene type records with others - # we don't use gene records in splici construction - gene_gr = gr[gr.Feature == "gene"] - gr = gr[gr.Feature != "gene"] - clean_gtf_path = os.path.join(output_dir, "clean_gtf.gtf") - - # If required fields are missing, quit - if "transcript_id" not in gr.columns: - raise ValueError( - "The input GTF file doesn't contain transcript_id field; Cannot proceed." - ) - - if "gene_id" not in gr.columns: - # use gene_name as gene_id if exists, return an error otherwise - if "gene_name" not in gr.columns: - raise ValueError( - "The input GTF file doesn't contain gene_id and gene_name field; Cannot proceed." - ) - else: - warnings.warn("gene_id field does not exist, use gene_name as gene_id.") - gene_id = pd.Series(data=gr.gene_name, name="gene_id") - gr = gr.insert(gene_id) - - if "gene_name" not in gr.columns: - warnings.warn("gene_name field does not exist, use gene_id as gene_name.") - gene_name = pd.Series(data=gr.gene_id, name="gene_name") - gr = gr.insert(gene_name) - - # If there is any NaN in transcript_id field, quit - if sum(gr.transcript_id.isnull()): - - # first, write an clean GTF if needed - if write_clean_gtf: - gr = pr.concat([gene_gr, gr[gr.transcript_id.notnull()]]) - gr.to_gtf(clean_gtf_path) - # gr[gr.transcript_id.notnull()].to_gtf(clean_gtf_path) - clean_gtf_msg = f"An clean GTF file is written to {clean_gtf_path}." - else: - clean_gtf_msg = "Set the write_clean_gtf flag if a clean GTF without the invalid records is needed." - # Then, raise a value error - raise ValueError( - f"Found NaN value in exons' transcript ID; Cannot proceed.\n{clean_gtf_msg}" - ) - - # Impute missing gene_id and gene_name values - # define an object - num_nan = gr.df[["gene_id", "gene_name"]].isnull().sum(axis=1) - - if num_nan.sum(): - - # create intermediate df - gene_df = gr.df[["gene_id", "gene_name"]] - - # Firstly, write the problematic records to a file before imputation - problematic_gtf_path = os.path.join( - output_dir, "missing_gene_id_or_name_records.gtf" - ) - gr[num_nan > 0].to_gtf(problematic_gtf_path) - missing_record_msg = f"\nFound records with missing gene_id/gene_name field.\nThese records are reported in {problematic_gtf_path}." - - # impute using transcript_id - double_missing = num_nan == 2 - if double_missing.sum(): - gene_df.loc[num_nan == 2, "gene_id"] = gr.transcript_id[num_nan == 2] - gene_df.loc[num_nan == 2, "gene_name"] = gr.transcript_id[num_nan == 2] - double_missing_msg = f"\n - Found {(num_nan == 2).sum()} records missing gene_id and gene_name, imputing using transcript_id." 
- else: - double_missing_msg = "" - - # If one field is missing, impute using the other - # missing only gene_id - gene_id_missing = gene_df["gene_id"].isnull() - if gene_id_missing.sum(): - gene_df.loc[gene_df["gene_id"].isnull(), "gene_id"] = gene_df.loc[ - gene_df["gene_id"].isnull(), "gene_name" - ] - gene_id_missing_msg = f"\n - Found {gene_id_missing.sum()} records missing gene_id, imputing using gene_name." - else: - gene_id_missing_msg = "" - - gene_name_missing = gene_df["gene_name"].isnull() - if gene_name_missing.sum(): - gene_df.loc[gene_df["gene_name"].isnull(), "gene_name"] = gene_df.loc[ - gene_df["gene_name"].isnull(), "gene_id" - ] - gene_name_missing_msg = f"\n - Found {gene_name_missing.sum()} records missing gene_name, imputing using gene_id." - else: - gene_name_missing_msg = "" - - # write the warning message - warnings.warn( - "".join( - [ - missing_record_msg, - double_missing_msg, - gene_id_missing_msg, - gene_name_missing_msg, - ] - ) - ) - - # replace the old gene_id and gene_name fields using imputed one. - gr = gr.drop(["gene_id", "gene_name"]) - gr = gr.insert(gene_df) - # if gene_gr is used in the future, then concat them. - if write_clean_gtf: - clean_gr = pr.concat([gene_gr, gr]) - clean_gr.to_gtf(clean_gtf_path) - clean_gtf_msg = f"An clean GTF file is written to {clean_gtf_path}." - print(clean_gtf_msg) - - # gr = pr.concat([gr, gene_gr]) - - # return imputed gr - return gr - - -def make_splici_txome( - genome_path, - gtf_path, - read_length, - output_dir, - flank_trim_length=5, - filename_prefix="splici", - extra_spliced=None, - extra_unspliced=None, - dedup_seqs=False, - no_bt=False, - bt_path="bedtools", - no_flanking_merge=False, - write_clean_gtf=False, -): - """ - Construct the splici (spliced + introns) transcriptome for alevin-fry. - - Required Parameters - ---------- - genome_path : str - The path to a genome fasta file. - - gtf_path : str - The path to a gtf file. - - read_length : int - The read length of the single-cell experiment being processed. - - output_dir : str - The output directory, where the splici reference files will - be written. - - Optional Parameters - ---------- - - flank_trim_length : int (default: `5`) - The flank trimming length. - The final flank length is obtained by subtracting - the flank_trim_length from the read_length. - - filename_prefix : str (default: `splici`) - The file name prefix of the generated output files. - The derived flank length will be automatically - appended to the provided prefix. - - extra_spliced : str - A path to a fasta file. The records in this fasta file will be - regarded as spliced transcripts. - - extra_unspliced : str - The path to a fasta file. The records in this fasta file will be - regarded as introns. - - dedup_seqs : bool (default: `False`) - If True, the repeated sequences in the splici reference will be - deduplicated. - - no_bt : bool (default: `False`) - If true, biopython, instead of bedtools, will be used for - generating splici reference files. - - bt_path : str - The path to bedtools v2.30.0 or greater if it is not in the environment PATH. - - no_flanking_merge : bool (default: `False`) - If true, overlapping introns caused by the added flanking length will not be merged. - - write_clean_gtf : bool (default: `False`) - If true, when the input GTF contains invalid records, a clean GTF - file `clean_gtf.gtf` with these invalid records removed will be - exported to the output dir. - An invalid record is an exon record without an transcript ID - in the `transcript_id` field. 
- - - Returns - ------- - Nothing will be returned. The splici reference files will be written - to disk. - - Notes - ----- - * If using bedtools, a temp.bed and a temp.fa will be created and - then deleted. These two files encode the introns of each gene - and the exons of each transcript of each gene. - - """ - - import pyranges as pr - - import warnings - - import os - - import subprocess - import shutil - - from packaging.version import parse as parse_version - - from Bio import SeqIO - from Bio.SeqIO.FastaIO import SimpleFastaParser - - # Preparation - - # check flanking length - flank_length = read_length - flank_trim_length - - if flank_length < 0: - raise ValueError("Flank trim length cannot be larger than read length!") - - # check fasta file - if not os.path.isfile(genome_path): - raise IOError("Cannot open the input fasta file!") - - # check gtf file - if not os.path.isfile(gtf_path): - raise IOError("Cannot open the input gtf file!") - - def check_bedtools_version(bt_check_path): - try: - vstr = ( - subprocess.run([bt_path, "--version"], capture_output=True) - .stdout.decode() - .strip() - .split("v")[1] - ) - found_ver = parse_version(vstr) - req_ver = parse_version("2.30.0") - return found_ver >= req_ver - except subprocess.CalledProcessError as err: - # in this case couldn't even run subprocess - warnings.warn(f"Cannot check bedtools version.\n{err}") - return False - - # check bedtools - if not no_bt: - # check at the provided path - if not check_bedtools_version(bt_path): - # if it's not ok at the provided path, check - # the standard system path - if bt_path == "bedtools": - # in this case, there's nowhere else to check - # so give up on bedtools - print( - "bedtools in the environemnt PATH is either", - "older than v.2.30.0 or doesn't exist.", - "\nBiopython will be used.", - ) - no_bt = True - else: - print( - "bedtools specified by bt_path is either", - "older than v.2.30.0 or doesn't exist.", - "\nTry finding bedtools in the environmental PATH.", - ) - # if it's not ok at the standard system path - # fallback to biopython - if not check_bedtools_version("bedtools"): - print( - "bedtools in the environemnt PATH is either", - "older than v.2.30.0 or doesn't exist.", - "\nBiopython will be used.", - ) - no_bt = True - # found it at the system path - else: - bt_path = "bedtools" - print("Using bedtools in the environmental PATH.") - - # create out folder and temp folder inside - # create output folder - if not os.path.exists(output_dir): - os.makedirs(output_dir) - - # specify output file names - filename_prefix = filename_prefix + "_fl" + str(flank_length) - out_fa = os.path.join(output_dir, filename_prefix + ".fa") - out_t2g3col = os.path.join(output_dir, filename_prefix + "_t2g_3col.tsv") - id2name_path = os.path.join(output_dir, "gene_id_to_name.tsv") - - # load gtf - try: - gr = pr.read_gtf(gtf_path) - except ValueError: - # in this case couldn't even run subprocess - raise RuntimeError( - "PyRanges failed to parse the input GTF file. 
Please check the PyRanges documentation for the expected GTF format constraints.\nhttps://pyranges.readthedocs.io/en/latest/autoapi/pyranges/readers/index.html?highlight=read_gtf#pyranges.readers.read_gtf" - ) - - gr = pr.read_gtf(gtf_path) - - # check the validity of gr - gr = check_gr(gr, output_dir, write_clean_gtf) - - # write gene id to name tsv file - gr.df[["gene_id", "gene_name"]].drop_duplicates().to_csv( - id2name_path, sep="\t", header=False, index=False - ) - - # get introns - # the introns() function uses inplace=True argument from pandas, - # which will trigger an FutureWarning. - warnings.simplefilter(action="ignore", category=FutureWarning) - introns = gr.features.introns(by="transcript") - introns.Name = introns.gene_id - - if no_flanking_merge: - introns = introns.merge(strand=True, by=["Name"], slack=0) - - introns = introns.extend(flank_length) - - if not no_flanking_merge: - introns = introns.merge(strand=True, by=["Name"], slack=0) - - introns.Gene = introns.Name - introns.Name = [ - "-I".join(map(str, z)) - for z in zip( - introns.Name, - introns.Name.groupby(introns.Name) - .cumcount() - .astype(str) - .replace( - "0", - "", - ) - .values, - ) - ] - - # trim outbounded introns - with open(genome_path) as fasta_file: - chromsize = { - title.split()[0]: len(sequence) - for title, sequence in SimpleFastaParser(fasta_file) - } - introns = pr.gf.genome_bounds(introns, chromsize, clip=True) - - # deduplicate introns - if dedup_seqs: - introns.drop_duplicate_positions() - - # add splice status for introns - introns.splice_status = "U" - - # get exons - exons = gr[gr.Feature == "exon"] - - exons.Name = exons.transcript_id - exons.Gene = exons.gene_id - exons = exons.drop(exons.columns[~exons.columns.isin(introns.columns)].tolist()) - exons = exons.sort(["Name", "Start", "End"]) - # add splice status for exons - exons.splice_status = "S" - - # concat spliced transcripts and introns as splici - splici = pr.concat([exons, introns]) - - # splici = splici.sort(["Name", "Start", "End", "Gene"]) - - # write to files - # t2g_3col.tsv - splici.df[["Name", "Gene", "splice_status"]].drop_duplicates().to_csv( - out_t2g3col, sep="\t", header=False, index=False - ) - # print(splici.head()) - tid2strand = dict(zip(splici.Name, splici.Strand)) - - # splici fasta - if not no_bt: - try: - - # create temp folder - temp_dir = os.path.join(output_dir, "temp") - if not os.path.exists(temp_dir): - os.makedirs(temp_dir) - temp_fa = os.path.join(temp_dir, "temp.fa") - temp_bed = os.path.join(temp_dir, "temp.bed") - - # write bed file - splici.to_bed(temp_bed, keep=True) - - # run bedtools, ignore strand for now - bt_r = subprocess.run( - " ".join( - [ - bt_path, - "getfasta", - "-fi", - genome_path, - "-fo", - temp_fa, - "-bed", - temp_bed, - # "-s", - "-nameOnly", - ] - ), - shell=True, - capture_output=True, - ) - - # check return code - if bt_r.returncode != 0: - raise ValueError("Bedtools failed.") - - # parse temp fasta file to concat exons of each transcript - ei_parser = SeqIO.parse(temp_fa, "fasta") - prev_rec = next(ei_parser) - # prev_rec.id = prev_rec.id.split("(")[0] - prev_rec.description = "" - with open(out_fa, "w") as out_handle: - - for seq_record in ei_parser: - # seq_record.id = seq_record.id.split("(")[0] - seq_record.description = "" - if seq_record.id == prev_rec.id: - prev_rec += seq_record - - else: - if tid2strand[prev_rec.id] == "-": - prev_rec = prev_rec.reverse_complement( - id=True, description=True - ) - SeqIO.write(prev_rec, out_handle, "fasta") - prev_rec = 
seq_record - # Don't forget our last customer - if tid2strand[prev_rec.id] == "-": - prev_rec = prev_rec.reverse_complement(id=True, description=True) - SeqIO.write(prev_rec, out_handle, "fasta") - shutil.rmtree(temp_dir, ignore_errors=True) - except subprocess.CalledProcessError as err: - no_bt = True - warnings.warn(f"Bedtools failed. Use biopython instead.\n{err}") - shutil.rmtree(temp_dir, ignore_errors=True) - - if no_bt: - - from Bio.Seq import Seq - from Bio.SeqFeature import SeqFeature, FeatureLocation - - from Bio.SeqRecord import SeqRecord - - with open(out_fa, "w") as out_handle: - # read fasta, process a chromosome at a time - for seq_record in SeqIO.parse(genome_path, "fasta"): - # get all records on that chromosome - chr_records = introns[introns.Chromosome == seq_record.id].df - if not chr_records.empty: - chr_records.Strand = chr_records.Strand.replace( - ["+", "-"], [+1, -1] - ) - # init seq list - intron_seqs = [] - # for each intron record - for idx, intron_record in chr_records.iterrows(): - # create Seqeture object for extracting sequence from chromosome - intron_feature = SeqFeature( - FeatureLocation(intron_record.Start, intron_record.End), - type="intron", - strand=intron_record.Strand, - id=intron_record.Name, - ) - # append the intron sequence to the seq list, specify name as well - intron_seqs.append( - SeqRecord( - intron_feature.extract(seq_record).seq, - id=intron_record.Name, - description="", - ) - ) - # finally, write all intron sequences at once. - SeqIO.write(intron_seqs, out_handle, "fasta") - - # Then, process spliced transcripts - chr_records = exons[exons.Chromosome == seq_record.id].df - if not chr_records.empty: - txp_seqs = [] - # as spliced txps are the concat of all exon sequences, fist get the sequence of each exon separately,then sum them up. - for tid, exon_records in chr_records.groupby("Name"): - - # init exon seq list - exon_seqs = [] - # get the sequence of each exon - for idx, exon_record in exon_records.iterrows(): - # create SeqFeature object for the exon record - # ignore strand for now, get reverse complement later if needed - exon_feature = SeqFeature( - FeatureLocation(exon_record.Start, exon_record.End), - type="exon", - ) - # extract exon sequence from chromosome and append to exon seq list - exon_seqs.append( - SeqRecord( - exon_feature.extract(seq_record).seq, - id=tid, - description="", - ) - ) - # append the txp sequence to spliced txp seq list - # consider strand - if tid2strand[tid] == "-": - txp_seqs.append( - sum(exon_seqs, Seq("")).reverse_complement( - id=True, description=True - ) - ) - else: - txp_seqs.append(sum(exon_seqs, Seq(""))) - # write all spliced transcript serquence at once. - SeqIO.write(txp_seqs, out_handle, "fasta") - - # append extra spliced transcript onto splici - if extra_spliced is not None: - append_extra(extra_spliced, out_fa, out_t2g3col, id2name_path, "S") - - # append extra unspliced transcript onto splici - if extra_unspliced is not None: - append_extra(extra_unspliced, out_fa, out_t2g3col, id2name_path, "U") - - if dedup_seqs: - # Note: out_fa is intentionally passed as both - # the input and output file name parameters because - # we want to overwirte the duplicate fasta with - # the deduplicated fasta. 
- dedup_sequences(output_dir, out_fa, out_fa) diff --git a/src/pyroe/make_txome.py b/src/pyroe/make_txome.py deleted file mode 100644 index 5485b63..0000000 --- a/src/pyroe/make_txome.py +++ /dev/null @@ -1,1230 +0,0 @@ -import os -import warnings -import subprocess -import shutil -import pyranges as pr -import pandas as pd -import numpy as np -from Bio import SeqIO -from Bio.Seq import Seq -from Bio.SeqFeature import SeqFeature, FeatureLocation -from Bio.SeqRecord import SeqRecord -from Bio.SeqIO.FastaIO import SimpleFastaParser -from packaging.version import parse as parse_version -import logging - -bed_required_fields = [ - "Chromosome", - "Start", - "End", - "Strand", - "Name", - "Gene", - "splice_status", -] - - -def append_extra(extra_infile, out_fa, out_t2g3col, id2name_path, col_status): - """ - extra_infile : str - path to the extra FASTA format input file from which to read the extra records - out_fa : str - path to the output FASTA file being created (will be *appended* to) - out_t2g3col : str - path to the output t2g 3-column file (will be *appended* to) - id2name_path : str - path to the output (existing) gene id to gene name tsv (will be *appended* to) - col_status : char - splicing status to use for these records (one of 'S' or 'U') - """ - - # write extra sequences to the t2g file - with open(extra_infile) as extra_in_fa: - with open(out_fa, "a") as splici_fa: - with open(out_t2g3col, "a") as t2g: - with open(id2name_path, "a") as id_to_name: - for title, sequence in SimpleFastaParser(extra_in_fa): - tid = title.split()[0] - splici_fa.write(f">{tid}\n") - splici_fa.write(f"{sequence}\n") - t2g.write(f"{tid}\t{tid}\t{col_status}\n") - id_to_name.write(f"{tid}\t{tid}\n") - - -def dedup_sequences(output_dir, in_fa, out_fa): - """ - This function deduplicates the fasta sequences in `in_fa`, writes the - deduplicated sequences in fasta format to `out_fa`, and also produces a - 2-column tsv file in `ouput_dir`/duplicate_entries.tsv recording the - entries that have been removed and the retained entry that they match. - Note: It is allowed (and in some cases intended) that `in_fa` == `out_fa`. - In this case, the input fasta file will be overwritten. - - output_dir : str - The output directory where duplicate_entries.tsv will be written - in_fa : str - The path to the input fasta file to be deduplicated - out_fa : str - The path to where the ouput deduplicated fasta file should be written - """ - record_representatives = {} - - # read from the input fasta file and track - # the duplicate sequences - for record in SeqIO.parse(in_fa, "fasta"): - seq = str(record.seq) - name = record.id.split()[0] - # if we haven't seen it yet, this is the representative - # otherwise append the name of the duplicate - if seq in record_representatives: - record_representatives[seq].append(name) - else: - record_representatives[seq] = [name] - - # write out the duplicate entries in the same format - # the salmon / pufferfish indexer uses - dup_file_name = os.path.sep.join([output_dir, "duplicate_entries.tsv"]) - with open(dup_file_name, "w") as dup_file: - dup_file.write("RetainedRef\tDuplicateRef\n") - for k, vu in record_representatives.items(): - if len(vu) > 1: - v = sorted(vu) - for dup in v[1:]: - dup_file.write(f"{v[0]}\t{dup}\n") - - # write the deduplicated output to file - # Note: it might be that out_file == in_file and we - # are overwriting the input. 
- with open(out_fa, "w") as ofile: - for k, vu in record_representatives.items(): - v = sorted(vu) - rec = SeqRecord(Seq(k), id=v[0], description="") - SeqIO.write(rec, ofile, "fasta") - - -def check_gr(gr, output_dir): - """ - This function checks the validity of a PyRanges object and generates a clean GTF file in the expected format if there is any invalid record in the input PyRanges object. - It applies the following rules: - 1. Each non-gene record has to have a valid transcript_id. If this is not satisfied, it returns an error. Only the records with a valid transcript_id will be written to the clean_gtf.gtf. - 2. For gene_id and gene_name metadata field, - - If these two fields are entirely missing in the GTF file, An error will be returned. At the same time, in the clean_gtf.gtf, the two fields will be imputed using the transcript_id fields. - - If one of these two fields is completely missing, a warning will be generated, and the missing field will be imputed using the other one. - - if some records have missing gene_id and/or gene_name, a warning will be printed, and the missing values will be imputed by the following rules: For records miss gene_id or gene_name, impute the missing one using the other one; If both are missing, impute them using transcript_id, which cannot be missing. - 3. If there is no "transcript" or "gene" feature record, a warning will be printed. Moreover, those missing records will be imputed using the "exon" feature records: The Start and End site of the gene/transcript will be imputed as the bounds of their corresponding exons. - 4. If the boundaries defined in the transcripts'/genes' feature records do not match those implied by their exons' feature records, report a warning but still use transcripts'/genes' feature records to extract unspliced sequences. To be specific, if some but not all transcripts/genes have their corresponding transcripts'/genes' feature records, or the Start and/or End site defined in the transcript/gene feature records do not match the corresponding exons' bounds, then the existing transcripts'/genes' feature records will be used to extract unspliced transcripts. At the same time, in the clean_gtf.gtf, all genes/transcripts that appeared in the exon feature records will have their corresponding transcripts'/genes' feature records, in which the boundaries match the corresponding exons' bounds. - - Args: - gr (`PyRanges`): A stranded PyRanges object - - Return: - `PyRanges`: A PyRanges with missing values imputed if possible. - - Besides, a clean_gtf.gtf file will be generated in the output directory - if there is any invalid records in the input GTF file. - """ - - # split gene type records with others - # we don't use gene records in splici construction - clean_gtf_path = os.path.join(output_dir, "clean_gtf.gtf") - clean_gr = pr.PyRanges() - - # If required fields are missing, quit - if "transcript_id" not in gr.columns: - logging.critical( - " The input GTF file doesn't contain transcript_id metadata field; Cannot proceed." - ) - - if "Feature" not in gr.columns: - logging.critical( - " The input GTF file doesn't contain feature field; Cannot proceed." - ) - - if "gene_id" not in gr.columns: - # use gene_name as gene_id if exists, return an error otherwise - if "gene_name" not in gr.columns: - logging.error( - " The input GTF file doesn't contain gene_id and gene_name metadata field; Cannot proceed." - ) - else: - logging.warning( - " The gene_id field does not exist; Imputing using gene_name." 
- ) - gene_id = pd.Series(data=gr.gene_name, name="gene_id") - gr = gr.insert(gene_id) - - if "gene_name" not in gr.columns: - logging.warning("The gene_name field does not exist; Imputing using gene_id.") - gene_name = pd.Series(data=gr.gene_id, name="gene_name") - gr = gr.insert(gene_name) - - # keep only the fields we need - gr = gr[ - [ - "Chromosome", - "Feature", - "Start", - "End", - "Strand", - "gene_id", - "gene_name", - "transcript_id", - ] - ] - - # If there is any NaN in transcript_id field, quit - # gene features don't have transcript_id, ignore - - if gr[gr.Feature == "exon"].transcript_id.isnull().any(): - # first, write an clean GTF if needed - clean_gr = gr[np.logical_and(gr.transcript_id.notnull(), gr.Feature != "gene")] - clean_gr.to_gtf(clean_gtf_path) - # Then, raise a value error - logging.critical( - "".join( - [ - " Found missing value in exons' transcript ID; Cannot proceed." - f" An clean GTF file without missing transcript_id records is written to {clean_gtf_path}.", - " If needed, rerun using the clean GTF file", - ] - ) - ) - - # Impute missing gene_id and gene_name values - # define an object - num_nan = gr.df[["gene_id", "gene_name"]].isnull().sum(axis=1) - - if num_nan.any(): - # create intermediate df - gene_df = gr.df[["gene_id", "gene_name"]] - - # Firstly, write the problematic records to a file before imputation - problematic_gtf_path = os.path.join( - output_dir, "missing_gene_id_or_name_records.gtf" - ) - gr[num_nan != 0].to_gtf(problematic_gtf_path) - missing_record_msg = f" Found records with missing gene_id/gene_name field. These records are reported in {problematic_gtf_path}." - - # impute double missing using transcript_id - double_missing = num_nan == 2 - if double_missing.sum(): - gene_df.loc[num_nan == 2, "gene_id"] = gr.transcript_id[num_nan == 2] - gene_df.loc[num_nan == 2, "gene_name"] = gr.transcript_id[num_nan == 2] - double_missing_msg = f" Imputed {(num_nan == 2).sum()} records with missing 'gene_id' and 'gene_name' using transcript_id." - else: - double_missing_msg = "" - - # If one field is missing, impute using the other - # missing only gene_id - gene_id_missing = gene_df["gene_id"].isnull() - if gene_id_missing.sum(): - gene_df.loc[gene_df["gene_id"].isnull(), "gene_id"] = gene_df.loc[ - gene_df["gene_id"].isnull(), "gene_name" - ] - gene_id_missing_msg = ( - f" Imputed {gene_id_missing.sum()} missing gene_id using gene_name." - ) - else: - gene_id_missing_msg = "" - - gene_name_missing = gene_df["gene_name"].isnull() - if gene_name_missing.sum(): - gene_df.loc[gene_df["gene_name"].isnull(), "gene_name"] = gene_df.loc[ - gene_df["gene_name"].isnull(), "gene_id" - ] - gene_name_missing_msg = ( - f" Imputed {gene_name_missing.sum()} missing gene_name using gene_id." - ) - else: - gene_name_missing_msg = "" - - # write the warning message - logging.warning( - "".join( - [ - missing_record_msg, - double_missing_msg, - gene_id_missing_msg, - gene_name_missing_msg, - ] - ) - ) - - # replace the old gene_id and gene_name fields by imputed. 
- gr = gr.drop(["gene_id", "gene_name"]) - gr = gr.insert(gene_df) - - # Then, records all exon records and gene records - clean_gr = pr.concat([clean_gr, gr[gr.Feature == "exon"]]) - - # check if the transcripts and genes are well defined - # first, we get the transcript annotation from exons and from the transcript feature records - # from GTF - transcript_gr = gr[gr.Feature == "transcript"].sort(["transcript_id"]) - gene_gr = gr[gr.Feature == "gene"].sort(["gene_id"]) - - # from exons - transcript_bound_from_exons = ( - gr[gr.Feature == "exon"] - .boundaries(group_by=["transcript_id", "gene_id", "gene_name"]) - .sort(["transcript_id"]) - ) - transcript_bound_from_exons.Feature = "transcript" - - gene_bound_from_exons = ( - gr[gr.Feature == "exon"] - .boundaries(group_by=["gene_id", "gene_name"]) - .sort(["gene_id"]) - ) - gene_bound_from_exons.Feature = "gene" - - # define clean transcript and gene gr - clean_transcript_gr = pr.PyRanges() - clean_gene_gr = pr.PyRanges() - - # If there is no transcript or annotation, we - # 1. give a warning, - # 2. using exons' bounds as the bound of transcripts/genes to extract unspliced sequences - # 3. write those bounds to the clean gtf file - - if transcript_gr.empty or gene_gr.empty: - if transcript_gr.empty: - logging.warning( - "".join( - [ - " The given GTF file doesn't have transcript feature records;", - " Imputing using exon feature records.", - ] - ) - ) - - transcript_gr = transcript_bound_from_exons - clean_transcript_gr = transcript_bound_from_exons - gr = pr.concat([gr, transcript_bound_from_exons]) - - if gene_gr.empty: - logging.warning( - "".join( - [ - " The given GTF file doesn't have gene feature records;", - " Imputing using exon feature records.", - ] - ) - ) - - gene_gr = gene_bound_from_exons - clean_gene_gr = gene_bound_from_exons - gr = pr.concat([gr, gene_bound_from_exons]) - else: - # If some of them are missing, we report a warning - # and use the transcripts' bounds in the original GTF file to extract introns, - # but impute the missing annotations in the clean GTF file. - # We will say in the warning message that if the users want to - # use the annotations we generated, - # they should rerun pyroe using the clean GTF file. - - # if some transcripts don't have exons - if transcript_gr.length > transcript_bound_from_exons.length: - # pyranges will ignore it anyway. Here I filter them out manually. 
- transcript_gr = transcript_gr[ - transcript_gr.transcript_id.isin( - transcript_bound_from_exons.transcript_id - ) - ] - - # complain - logging.warning(" Found transcript(s) without exons; Ignored.") - clean_transcript_gr = transcript_bound_from_exons - - # if some transcripts don't have features - elif transcript_gr.length < transcript_bound_from_exons.length: - clean_transcript_gr = transcript_bound_from_exons - - # complain - logging.warning( - "".join( - [ - " Found transcripts without corresponding transcript feature record;", - " Those transcripts were not used to extract unspliced sequences.", - ] - ) - ) - - # if some genes don't have exons - if gene_gr.length > gene_bound_from_exons.length: - gene_gr = gene_gr[gene_gr.gene_id.isin(gene_bound_from_exons.gene_id)] - - # complain - logging.warning("".join([" Found gene(s) without exons; Ignored."])) - clean_gene_gr = gene_bound_from_exons - - # if some genes don't have features - elif gene_gr.length < gene_bound_from_exons.length: - clean_gene_gr = gene_bound_from_exons - - # complain - logging.warning( - "".join( - [ - " Found genes without corresponding gene feature record.", - " Those genes were not used to extract unspliced sequences.", - ] - ) - ) - - # If the transcripts'/genes' bounds defined in the original GTF file - # and those found manually (using exons' bounds) are different, - # we report a warning and extract unspliced sequences - # using the transcripts'/genes' annotation in the original GTF file, - # but use manually defined transcript annotations (from their exons' bounds) - # in the clean GTF file. - - # transcripts - intersecting_txs = set(transcript_gr.transcript_id).intersection( - set(transcript_bound_from_exons.transcript_id) - ) - - if not transcript_gr[transcript_gr.transcript_id.isin(intersecting_txs)][ - ["Start", "End"] - ].df.equals( - transcript_bound_from_exons[ - transcript_bound_from_exons.transcript_id.isin(intersecting_txs) - ][["Start", "End"]].df - ): - clean_transcript_gr = transcript_bound_from_exons - - logging.warning( - "".join( - [ - " Found transcripts whose boundaries defined in their transcript feature record do not match their exons' bounds.", - " However, those boundaries were still used to extract unspliced sequences.", - ] - ) - ) - - # genes - intersecting_gs = set(gene_gr.gene_id).intersection( - set(gene_bound_from_exons.gene_id) - ) - - if not gene_gr[gene_gr.gene_id.isin(intersecting_gs)][ - ["Start", "End"] - ].df.equals( - gene_bound_from_exons[gene_bound_from_exons.gene_id.isin(intersecting_gs)][ - ["Start", "End"] - ].df - ): - clean_gene_gr = gene_bound_from_exons - - logging.warning( - "".join( - [ - " Found genes whose boundaries defined in the gene feature records do not equal to their exons' bounds.", - " However, those boundaries were still used to extract unspliced sequences.", - ] - ) - ) - - # if clean_gr is not empty, write it - clean_gr = pr.concat([clean_gr, clean_transcript_gr, clean_gene_gr]) - if not clean_gr.empty: - clean_gr.to_gtf(clean_gtf_path) - clean_gtf_msg = f" A clean GTF file with all issues fixed is generated at {clean_gtf_path}. If needed, please rerun using this clean GTF file." 
- logging.warning(clean_gtf_msg) - - # return imputed gr - return gr - - -def check_bedtools_version(bt_path): - try: - vstr = ( - subprocess.run([bt_path, "--version"], capture_output=True) - .stdout.decode() - .strip() - .split("v")[1] - ) - found_ver = parse_version(vstr) - req_ver = parse_version("2.30.0") - return found_ver >= req_ver - except Exception as err: - # in this case couldn't even run subprocess - logging.warning(f" Cannot check bedtools version. The error message was: {err}") - return False - - -def make_splici_txome( - genome_path, - gtf_path, - read_length, - output_dir, - flank_trim_length=5, - filename_prefix="splici", - extra_spliced=None, - extra_unspliced=None, - dedup_seqs=False, - no_bt=False, - bt_path="bedtools", - no_flanking_merge=False, -): - """ - Construct the splici (spliced + introns) transcriptome for alevin-fry. - - Required Parameters - ---------- - genome_path : str - The path to a genome fasta file. - - gtf_path : str - The path to a gtf file. - - read_length : int - The read length of the single-cell experiment being processed. - - output_dir : str - The output directory, where the splici reference files will - be written. - - Optional Parameters - ---------- - - flank_trim_length : int (default: `5`) - The flank trimming length. - The final flank length is obtained by subtracting - the flank_trim_length from the read_length. - - filename_prefix : str (default: `splici`) - The file name prefix of the generated output files. - The derived flank length will be automatically - appended to the provided prefix. - - extra_spliced : str - A path to a fasta file. The records in this fasta file will be - regarded as spliced transcripts. - - extra_unspliced : str - The path to a fasta file. The records in this fasta file will be - regarded as introns. - - dedup_seqs : bool (default: `False`) - If True, the repeated sequences in the splici reference will be - deduplicated. - - no_bt : bool (default: `False`) - If true, biopython, instead of bedtools, will be used for - generating splici reference files. - - bt_path : str - The path to bedtools v2.30.0 or greater if it is not in the environment PATH. - - no_flanking_merge : bool (default: `False`) - If true, overlapping introns caused by the added flanking length will not be merged. - - Returns - ------- - Nothing will be returned. The splici reference files will be written - to disk. - - Notes - ----- - * The input GTF file will be processed before extracting unspliced sequences. If pyroe finds invalid records, a `clean_gtf.gtf` file will be generated in the specified output directory. **Note** : The features extracted in the spliced + intronic transcriptome will not necessarily be those present in the `clean_gtf.gtf` file — as this command will prefer the input in the user-provided file wherever possible. More specifically: - * If the required metadata fields contain missing values, pyroe will impute them if possible, or return an error if not. - * **Pyroe will always extract unspliced sequences according to the boundaries defined in the transcript/gene feature records unless there is no transcript/gene feature record in the GTF file.** In this case, pyroe imputes all transcripts/genes boundaries as the bounds of the corresponding exons to extract unspliced sequences. - * If the transcript/gene feature records do not match their exon feature records, pyroe will still use transcript/gene feature records, but correct those transcript/gene feature records in the `celan_grf.gtf` according to exon feature records. 
-    # Preparation
-
-    # check flanking length
-    flank_length = read_length - flank_trim_length
-
-    if flank_length < 0:
-        logging.critical(" Flank trim length cannot be larger than read length!")
-
-    # check fasta file
-    if not os.path.isfile(genome_path):
-        logging.critical(" Cannot open the input fasta file!")
-
-    # check gtf file
-    if not os.path.isfile(gtf_path):
-        logging.critical(" Cannot open the input gtf file!")
-
-    # check bedtools
-    if not no_bt:
-        # check at the provided path
-        if not check_bedtools_version(bt_path):
-            # if it's not ok at the provided path, check
-            # the standard system path
-            if bt_path == "bedtools":
-                # in this case, there's nowhere else to check
-                # so give up on bedtools
-                logging.warning(
-                    "".join(
-                        [
-                            " Bedtools in the environment PATH is either",
-                            " older than v2.30.0 or doesn't exist.",
-                            " Biopython will be used to extract sequences.",
-                        ]
-                    )
-                )
-                no_bt = True
-            else:
-                # note: the message fragments must be joined into one string;
-                # passing them as separate arguments to logging.warning()
-                # would treat them as format arguments
-                logging.warning(
-                    "".join(
-                        [
-                            " Bedtools specified by bt_path is either",
-                            " older than v2.30.0 or doesn't exist.",
-                            " Trying to find bedtools in the environment PATH.",
-                        ]
-                    )
-                )
-                # if it's not ok at the standard system path,
-                # fall back to biopython
-                if not check_bedtools_version("bedtools"):
-                    logging.warning(
-                        "".join(
-                            [
-                                " Bedtools in the environment PATH is either",
-                                " older than v2.30.0 or doesn't exist.",
-                                " Biopython will be used to extract sequences.",
-                            ]
-                        )
-                    )
-                    no_bt = True
-                # found it at the system path
-                else:
-                    bt_path = "bedtools"
-                    logging.warning(" Using bedtools in the environment PATH.")
-
-    # create the output folder (a temp folder is created inside it later)
-    if not os.path.exists(output_dir):
-        os.makedirs(output_dir)
-
-    # specify output file names
-    filename_prefix = filename_prefix + "_fl" + str(flank_length)
-    out_fa = os.path.join(output_dir, filename_prefix + ".fa")
-    out_t2g3col = os.path.join(output_dir, filename_prefix + "_t2g_3col.tsv")
-    id2name_path = os.path.join(output_dir, "gene_id_to_name.tsv")
-
-    # load gtf
-    try:
-        gr = pr.read_gtf(gtf_path)
-    except Exception as err:
-        # in this case we couldn't even read the GTF file
-        logging.error(
-            "".join(
-                [
-                    " PyRanges failed to parse the input GTF file.",
-                    " Please check the PyRanges documentation for the expected GTF format constraints at",
-                    " https://pyranges.readthedocs.io/en/latest/autoapi/pyranges/readers/index.html?highlight=read_gtf#pyranges.readers.read_gtf .",
-                    f" The error message was: {str(err)}",
-                ]
-            ),
-            exc_info=True,
-        )
-        # re-raise: gr would be undefined below if parsing failed
-        raise
-
-    # check the validity of gr
-    gr = check_gr(gr, output_dir)
-
-    # write the gene id to name tsv file
-    gr.df[["gene_id", "gene_name"]].drop_duplicates().to_csv(
-        id2name_path, sep="\t", header=False, index=False
-    )
-
-    # get introns
-    # the introns() function uses the inplace=True argument of pandas,
-    # which triggers a FutureWarning
-    warnings.simplefilter(action="ignore", category=FutureWarning)
-    introns = gr.features.introns(by="transcript")
-    warnings.simplefilter(action="default", category=FutureWarning)
-
-    introns.Name = introns.gene_id
-
-    if no_flanking_merge:
-        introns = introns.merge(strand=True, by=["Name"], slack=0)
-
-    introns = introns.extend(flank_length)
-
-    if not no_flanking_merge:
-        introns = introns.merge(strand=True, by=["Name"], slack=0)
-
-    introns.Gene = introns.Name
-    # name the introns of each gene: the first is "<gene_id>-I",
-    # subsequent ones are "<gene_id>-I1", "<gene_id>-I2", ...
-    introns.Name = [
-        "-I".join(map(str, z))
-        for z in zip(
-            introns.Name,
-            introns.Name.groupby(introns.Name)
-            .cumcount()
-            .astype(str)
-            .replace("0", "")
-            .values,
-        )
-    ]
-
-    # compute chromosome sizes so introns extending beyond
-    # chromosome ends can be clipped
-    with open(genome_path) as fasta_file:
-        chromsize = {
-            title.split()[0]: len(sequence)
-            for title, sequence in SimpleFastaParser(fasta_file)
-        }
-
-    # in case the genome and gene annotations do not match,
-    # try it and log an error if this fails
-    try:
-        introns = pr.gf.genome_bounds(introns, chromsize, clip=True)
-    except Exception as err:
-        logging.error(
-            "".join(
-                [
-                    " Failed to refine intron bounds using genome bounds.",
-                    " Please check if the input genome FASTA file and GTF file match each other, especially the chromosome names.",
-                    f" The error message was: {str(err)}",
-                ]
-            ),
-            exc_info=True,
-        )
-
-    # deduplicate introns
-    if dedup_seqs:
-        # drop_duplicate_positions() returns a new PyRanges object,
-        # so the result must be assigned back
-        introns = introns.drop_duplicate_positions()
-
-    # add splice status for introns
-    introns.splice_status = "U"
-
-    introns = introns[bed_required_fields]
-
-    # get exons
-    exons = gr[gr.Feature == "exon"]
-
-    exons.Name = exons.transcript_id
-    exons.Gene = exons.gene_id
-    exons = exons.sort(["Name", "Start", "End"])
-    # add splice status for exons
-    exons.splice_status = "S"
-    # keep only the required fields
-    exons = exons[bed_required_fields]
-
-    # concatenate spliced transcripts and introns as splici
-    splici = pr.concat([exons, introns])
-
-    # write to files
-    # t2g_3col.tsv
-    splici.df[["Name", "Gene", "splice_status"]].drop_duplicates().to_csv(
-        out_t2g3col, sep="\t", header=False, index=False
-    )
-    tid2strand = dict(zip(splici.Name, splici.Strand))
-
-    # splici fasta
-    if not no_bt:
-        try:
-            # create temp folder
-            temp_dir = os.path.join(output_dir, "temp")
-            if not os.path.exists(temp_dir):
-                os.makedirs(temp_dir)
-            temp_fa = os.path.join(temp_dir, "temp.fa")
-            temp_bed = os.path.join(temp_dir, "temp.bed")
-
-            # write bed file
-            splici.to_bed(temp_bed, keep=True)
-
-            # run bedtools, ignoring strand for now; the reverse
-            # complement of minus-strand records is taken below
-            bt_r = subprocess.run(
-                " ".join(
-                    [
-                        bt_path,
-                        "getfasta",
-                        "-fi",
-                        genome_path,
-                        "-fo",
-                        temp_fa,
-                        "-bed",
-                        temp_bed,
-                        # "-s",
-                        "-nameOnly",
-                    ]
-                ),
-                shell=True,
-                capture_output=True,
-            )
-
-            # check the return code; raising here triggers the
-            # biopython fallback in the except block below
-            if bt_r.returncode != 0:
-                raise RuntimeError(" Bedtools failed.")
-
-            # parse the temp fasta file to concatenate the exons of each transcript
-            ei_parser = SeqIO.parse(temp_fa, "fasta")
-            prev_rec = next(ei_parser)
-            prev_rec.description = ""
-            with open(out_fa, "w") as out_handle:
-                for seq_record in ei_parser:
-                    seq_record.description = ""
-                    if seq_record.id == prev_rec.id:
-                        prev_rec += seq_record
-                    else:
-                        if tid2strand[prev_rec.id] == "-":
-                            prev_rec = prev_rec.reverse_complement(
-                                id=True, description=True
-                            )
-                        SeqIO.write(prev_rec, out_handle, "fasta")
-                        prev_rec = seq_record
-                # don't forget the last record
-                if tid2strand[prev_rec.id] == "-":
-                    prev_rec = prev_rec.reverse_complement(id=True, description=True)
-                SeqIO.write(prev_rec, out_handle, "fasta")
-            shutil.rmtree(temp_dir, ignore_errors=True)
-        except Exception as err:
-            no_bt = True
-            logging.warning(
-                f" Bedtools failed; using biopython instead. The error message was: \n{err}"
-            )
-            shutil.rmtree(temp_dir, ignore_errors=True)
-
-    if no_bt:
-        with open(out_fa, "w") as out_handle:
-            # read the genome fasta, processing one chromosome at a time
-            for seq_record in SeqIO.parse(genome_path, "fasta"):
-                # get all intron records on this chromosome
-                chr_records = introns[introns.Chromosome == seq_record.id].df
-                if not chr_records.empty:
-                    chr_records.Strand = chr_records.Strand.replace(
-                        ["+", "-"], [+1, -1]
-                    )
-                    # init seq list
-                    intron_seqs = []
-                    # for each intron record
-                    for idx, intron_record in chr_records.iterrows():
-                        # create a SeqFeature object for extracting the
-                        # sequence from the chromosome
-                        intron_feature = SeqFeature(
-                            FeatureLocation(intron_record.Start, intron_record.End),
-                            type="intron",
-                            id=intron_record.Name,
-                        )
-                        intron_feature.strand = intron_record.Strand
-
-                        # append the intron sequence to the seq list,
-                        # specifying its name as well
-                        intron_seqs.append(
-                            SeqRecord(
-                                intron_feature.extract(seq_record).seq,
-                                id=intron_record.Name,
-                                description="",
-                            )
-                        )
-                    # finally, write all intron sequences at once
-                    SeqIO.write(intron_seqs, out_handle, "fasta")
-
-                # then, process the spliced transcripts
-                chr_records = exons[exons.Chromosome == seq_record.id].df
-                if not chr_records.empty:
-                    txp_seqs = []
-                    # as spliced txps are the concatenation of all of their
-                    # exon sequences, first get the sequence of each exon
-                    # separately, then sum them up
-                    for tid, exon_records in chr_records.groupby("Name"):
-                        # init exon seq list
-                        exon_seqs = []
-                        # get the sequence of each exon
-                        for idx, exon_record in exon_records.iterrows():
-                            # create a SeqFeature object for the exon record;
-                            # ignore strand for now, take the reverse
-                            # complement later if needed
-                            exon_feature = SeqFeature(
-                                FeatureLocation(exon_record.Start, exon_record.End),
-                                type="exon",
-                            )
-                            # extract the exon sequence from the chromosome
-                            # and append it to the exon seq list
-                            exon_seqs.append(
-                                SeqRecord(
-                                    exon_feature.extract(seq_record).seq,
-                                    id=tid,
-                                    description="",
-                                )
-                            )
-                        # append the txp sequence to the spliced txp seq list,
-                        # taking strand into account
-                        if tid2strand[tid] == "-":
-                            txp_seqs.append(
-                                sum(exon_seqs, Seq("")).reverse_complement(
-                                    id=True, description=True
-                                )
-                            )
-                        else:
-                            txp_seqs.append(sum(exon_seqs, Seq("")))
-                    # write all spliced transcript sequences at once
-                    SeqIO.write(txp_seqs, out_handle, "fasta")
-
-    # append extra spliced transcripts onto splici
-    if extra_spliced is not None:
-        append_extra(extra_spliced, out_fa, out_t2g3col, id2name_path, "S")
-
-    # append extra unspliced transcripts onto splici
-    if extra_unspliced is not None:
-        append_extra(extra_unspliced, out_fa, out_t2g3col, id2name_path, "U")
-
-    if dedup_seqs:
-        # Note: out_fa is intentionally passed as both the input and output
-        # file name parameters because we want to overwrite the FASTA
-        # containing duplicates with the deduplicated FASTA.
-        dedup_sequences(output_dir, out_fa, out_fa)
-
-
-def make_spliceu_txome(
-    genome_path,
-    gtf_path,
-    output_dir,
-    filename_prefix="spliceu",
-    extra_spliced=None,
-    extra_unspliced=None,
-    dedup_seqs=False,
-    no_bt=False,
-    bt_path="bedtools",
-):
-    """
-    Construct the spliceu (spliced + unspliced) transcriptome for alevin-fry.
-
-    Required Parameters
-    ----------
-    genome_path : str
-        The path to a genome fasta file.
-
-    gtf_path : str
-        The path to a gtf file.
-
-    output_dir : str
-        The output directory, where the spliceu reference files will
-        be written.
-
-    Optional Parameters
-    ----------
-    filename_prefix : str (default: `spliceu`)
-        The file name prefix of the generated output files.
-
-    extra_spliced : str
-        The path to a fasta file. The records in this fasta file will be
-        regarded as spliced transcripts.
-
-    extra_unspliced : str
-        The path to a fasta file. The records in this fasta file will be
-        regarded as unspliced transcripts.
-
-    dedup_seqs : bool (default: `False`)
-        If True, identical sequences in the spliceu reference will be
-        deduplicated.
-
-    no_bt : bool (default: `False`)
-        If True, biopython, instead of bedtools, will be used for
-        generating the spliceu reference files.
-
-    bt_path : str
-        The path to bedtools v2.30.0 or greater, if it is not in the
-        environment PATH.
-
-    Returns
-    -------
-    Nothing will be returned. The spliceu reference files will be written
-    to disk.
-
-    Notes
-    -----
-    * The input GTF file will be processed before extracting unspliced
-      sequences. If pyroe finds invalid records, a `clean_gtf.gtf` file will
-      be generated in the specified output directory. **Note**: the features
-      extracted in the spliced + unspliced transcriptome will not necessarily
-      be those present in the `clean_gtf.gtf` file, because this command
-      prefers the input in the user-provided file wherever possible.
-      More specifically:
-        * If the required metadata fields contain missing values, pyroe will
-          impute them if possible, or return an error if not.
-        * **Pyroe will always extract unspliced sequences according to the
-          boundaries defined in the transcript/gene feature records unless
-          there is no transcript/gene feature record in the GTF file.** In
-          that case, pyroe imputes each transcript/gene boundary as the
-          bounds of its corresponding exons to extract unspliced sequences.
-        * If the transcript/gene feature records do not match their exon
-          feature records, pyroe will still use the transcript/gene feature
-          records, but will correct them in the `clean_gtf.gtf` according to
-          the exon feature records.
-    * If using bedtools, a temp.bed and a temp.fa will be created and
-      then deleted. These two files encode the unspliced sequence of each
-      gene and the exons of each transcript of each gene.
- - """ - # Preparation - # check fasta file - if not os.path.isfile(genome_path): - logging.critical(" Cannot open the input fasta file!") - - # check gtf file - if not os.path.isfile(gtf_path): - logging.critical(" Cannot open the input gtf file!") - - # check bedtools - if not no_bt: - # check at the provided path - if not check_bedtools_version(bt_path): - # if it's not ok at the provided path, check - # the standard system path - if bt_path == "bedtools": - # in this case, there's nowhere else to check - # so give up on bedtools - logging.warning( - "".join( - [ - " Bedtools in the environemnt PATH is either", - " older than v.2.30.0 or doesn't exist.", - " Biopython will be used.", - ] - ) - ) - no_bt = True - else: - logging.warning( - "".join( - [ - "Bedtools specified by bt_path is either", - "older than v.2.30.0 or doesn't exist.", - "Trying to find bedtools in the environmental PATH.", - ] - ) - ) - # if it's not ok at the standard system path - # fallback to biopython - if not check_bedtools_version("bedtools"): - logging.warning( - "".join( - [ - " Bedtools in the environemnt PATH is either", - " older than v.2.30.0 or doesn't exist.", - " Biopython will be used.", - ] - ) - ) - no_bt = True - # found it at the system path - else: - bt_path = "bedtools" - logging.warning(" Using bedtools in the environmental PATH.") - - # create out folder and temp folder inside - # create output folder - if not os.path.exists(output_dir): - os.makedirs(output_dir) - - # specify output file names - out_fa = os.path.join(output_dir, filename_prefix + ".fa") - out_t2g3col = os.path.join(output_dir, filename_prefix + "_t2g_3col.tsv") - out_t2g = os.path.join(output_dir, filename_prefix + "_t2g.tsv") - out_g2g = os.path.join(output_dir, filename_prefix + "_g2g.tsv") - id2name_path = os.path.join(output_dir, "gene_id_to_name.tsv") - - # load gtf - try: - gr = pr.read_gtf(gtf_path) - except ValueError as err: - # in this case couldn't even run subprocess - logging.error( - "".join( - [ - " PyRanges failed to parse the input GTF file.", - " Please check the PyRanges documentation for", - " the expected GTF format constraints at", - " https://pyranges.readthedocs.io/en/latest/autoapi/pyranges/readers/index.html?highlight=read_gtf#pyranges.readers.read_gtf .", - f" The error message was: {str(err)}", - ] - ), - exc_info=True, - ) - - # check the validity of gr - gr = check_gr(gr, output_dir) - - # write gene id to name tsv file - gr.df[["gene_id", "gene_name"]].drop_duplicates().to_csv( - id2name_path, sep="\t", header=False, index=False - ) - - # get unspliced - unspliced = gr[gr.Feature == "gene"] - # unspliced = gr.boundaries("gene_id") - unspliced.Name = unspliced.gene_id + "-I" - unspliced.Gene = unspliced.gene_id - # add splice status for unspliced - unspliced.splice_status = "U" - # keep only required fields - unspliced = unspliced[bed_required_fields] - - # get exons - exons = gr[gr.Feature == "exon"] - - exons.Name = exons.transcript_id - exons.Gene = exons.gene_id - - exons = exons.sort(["Name", "Start", "End"]) - # add splice status for exons - exons.splice_status = "S" - # keep only required fields - exons = exons[bed_required_fields] - - # concat spliced transcripts and unspliced as spliceu - spliceu = pr.concat([exons, unspliced]) - - # write to files - # t2g_3col.tsv - t2g_3col = spliceu.df[["Name", "Gene", "splice_status"]].drop_duplicates() - t2g_3col.to_csv(out_t2g3col, sep="\t", header=False, index=False) - - # t2g.csv - t2g_3col[["Name", "Gene"]].to_csv(out_t2g, sep="\t", 
header=False, index=False) - - # g2g.csv - t2g_3col[["Gene", "Gene"]].to_csv(out_g2g, sep="\t", header=False, index=False) - - tid2strand = dict(zip(spliceu.Name, spliceu.Strand)) - - # spliceu fasta - if not no_bt: - try: - # create temp folder - temp_dir = os.path.join(output_dir, "temp") - if not os.path.exists(temp_dir): - os.makedirs(temp_dir) - temp_fa = os.path.join(temp_dir, "temp.fa") - temp_bed = os.path.join(temp_dir, "temp.bed") - - # write bed file - spliceu.to_bed(temp_bed, keep=True) - - # run bedtools, ignore strand for now - bt_r = subprocess.run( - " ".join( - [ - bt_path, - "getfasta", - "-fi", - genome_path, - "-fo", - temp_fa, - "-bed", - temp_bed, - # "-s", - "-nameOnly", - ] - ), - shell=True, - capture_output=True, - ) - - # check return code - if bt_r.returncode != 0: - logging.exception("Bedtools failed.", exc_info=True) - - # parse temp fasta file to concat exons of each transcript - ei_parser = SeqIO.parse(temp_fa, "fasta") - prev_rec = next(ei_parser) - # prev_rec.id = prev_rec.id.split("(")[0] - prev_rec.description = "" - with open(out_fa, "w") as out_handle: - - for seq_record in ei_parser: - # seq_record.id = seq_record.id.split("(")[0] - seq_record.description = "" - if seq_record.id == prev_rec.id: - prev_rec += seq_record - - else: - if tid2strand[prev_rec.id] == "-": - prev_rec = prev_rec.reverse_complement( - id=True, description=True - ) - SeqIO.write(prev_rec, out_handle, "fasta") - prev_rec = seq_record - # Don't forget our last customer - if tid2strand[prev_rec.id] == "-": - prev_rec = prev_rec.reverse_complement(id=True, description=True) - SeqIO.write(prev_rec, out_handle, "fasta") - shutil.rmtree(temp_dir, ignore_errors=True) - except Exception as err: - no_bt = True - logging.warning( - f" Bedtools failed; Using biopython instead. The error message was: {err}" - ) - shutil.rmtree(temp_dir, ignore_errors=True) - - if no_bt: - with open(out_fa, "w") as out_handle: - # read fasta, process a chromosome at a time - for seq_record in SeqIO.parse(genome_path, "fasta"): - # get all records on that chromosome - chr_records = unspliced[unspliced.Chromosome == seq_record.id].df - if not chr_records.empty: - chr_records.Strand = chr_records.Strand.replace( - ["+", "-"], [+1, -1] - ) - # init seq list - unspliced_seqs = [] - # for each unspliced record - for idx, unspliced_record in chr_records.iterrows(): - # create Seqeture object for extracting sequence from chromosome - unspliced_feature = SeqFeature( - FeatureLocation( - unspliced_record.Start, unspliced_record.End - ), - type="unspliced", - id=unspliced_record.Name, - ) - unspliced_feature.strand = unspliced_record.Strand - # append the unspliced sequence to the seq list, specify name as well - unspliced_seqs.append( - SeqRecord( - unspliced_feature.extract(seq_record).seq, - id=unspliced_record.Name, - description="", - ) - ) - # finally, write all unspliced sequences at once. - SeqIO.write(unspliced_seqs, out_handle, "fasta") - - # Then, process spliced transcripts - chr_records = exons[exons.Chromosome == seq_record.id].df - if not chr_records.empty: - txp_seqs = [] - # as spliced txps are the concat of all exon sequences, fist get the sequence of each exon separately,then sum them up. 
-                    for tid, exon_records in chr_records.groupby("Name"):
-                        # init exon seq list
-                        exon_seqs = []
-                        # get the sequence of each exon
-                        for idx, exon_record in exon_records.iterrows():
-                            # create a SeqFeature object for the exon record;
-                            # ignore strand for now, take the reverse
-                            # complement later if needed
-                            exon_feature = SeqFeature(
-                                FeatureLocation(exon_record.Start, exon_record.End),
-                                type="exon",
-                            )
-                            # extract the exon sequence from the chromosome
-                            # and append it to the exon seq list
-                            exon_seqs.append(
-                                SeqRecord(
-                                    exon_feature.extract(seq_record).seq,
-                                    id=tid,
-                                    description="",
-                                )
-                            )
-                        # append the txp sequence to the spliced txp seq list,
-                        # taking strand into account
-                        if tid2strand[tid] == "-":
-                            txp_seqs.append(
-                                sum(exon_seqs, Seq("")).reverse_complement(
-                                    id=True, description=True
-                                )
-                            )
-                        else:
-                            txp_seqs.append(sum(exon_seqs, Seq("")))
-                    # write all spliced transcript sequences at once
-                    SeqIO.write(txp_seqs, out_handle, "fasta")
-
-    # append extra spliced transcripts onto spliceu
-    if extra_spliced is not None:
-        append_extra(extra_spliced, out_fa, out_t2g3col, id2name_path, "S")
-
-    # append extra unspliced transcripts onto spliceu
-    if extra_unspliced is not None:
-        append_extra(extra_unspliced, out_fa, out_t2g3col, id2name_path, "U")
-
-    if dedup_seqs:
-        # Note: out_fa is intentionally passed as both the input and output
-        # file name parameters because we want to overwrite the FASTA
-        # containing duplicates with the deduplicated FASTA.
-        dedup_sequences(output_dir, out_fa, out_fa)
diff --git a/tests/test_make_spliceu.py b/tests/test_make_spliceu.py
deleted file mode 100644
index 0da4eb5..0000000
--- a/tests/test_make_spliceu.py
+++ /dev/null
@@ -1,126 +0,0 @@
-from pyroe.make_txome import make_spliceu_txome
-import tempfile
-import os
-import pandas as pd
-from Bio import SeqIO
-from Bio.Seq import Seq
-
-
-def test_make_spliceu():
-    location = os.path.dirname(os.path.realpath(__file__))
-    test_file_dir = os.path.join(location, "data", "test_make_splici")
-    genome_path = os.path.join(test_file_dir, "small_example_genome.fa")
-    gtf_path = os.path.join(test_file_dir, "small_example.gtf")
-    filename_prefix = "spliceu"
-    extra_spliced = os.path.join(test_file_dir, "extra_spliced.txt")
-    extra_unspliced = os.path.join(test_file_dir, "extra_unspliced.txt")
-    output_dir = tempfile.TemporaryDirectory()
-    make_spliceu_txome(
-        genome_path=genome_path,
-        gtf_path=gtf_path,
-        extra_spliced=extra_spliced,
-        extra_unspliced=extra_unspliced,
-        output_dir=output_dir.name,
-        filename_prefix=filename_prefix,
-        no_bt=True,
-    )
-
-    out_fa = os.path.join(output_dir.name, "".join([filename_prefix, ".fa"]))
-    out_t2g = os.path.join(output_dir.name, "".join([filename_prefix, "_t2g_3col.tsv"]))
-    genome = SeqIO.to_dict(SeqIO.parse(genome_path, "fasta"))
-    spliceu_seqs = SeqIO.to_dict(SeqIO.parse(out_fa, "fasta"))
-    t2g_colnames = ["txp_name", "gene_name", "splice_status"]
-    t2g_3col = pd.read_csv(
-        out_t2g, sep="\t", header=None, names=t2g_colnames
-    ).set_index(t2g_colnames[0])
-
-    # Define expected sequences.
-    # For each gene, we have 3 spliced txps; indexing starts from 1:
-    #   tx1 ([1,2], [36,45], [71,80])
-    #   tx2 ([46,55], [91,100])
-    #   tx3 ([121,130], [156,160], [191,200])
-    #   unspliced [1,200]
-
-    chr_1_seq = genome["chr1"].seq
-    chr_2_seq = genome["chr2"].seq
-    # check spliced sequences
-    # tx1_1
-    extracted_tx1_1 = spliceu_seqs["tx1.1"].seq
-    expected_tx1_1 = sum([chr_1_seq[0:2], chr_1_seq[35:45], chr_1_seq[70:80]], Seq(""))
-    assert extracted_tx1_1 == expected_tx1_1
-
-    # tx1_2
-    extracted_tx1_2 = spliceu_seqs["tx1.2"].seq
-    expected_tx1_2 = sum([chr_1_seq[45:55], chr_1_seq[90:100]], Seq(""))
-    assert extracted_tx1_2 == expected_tx1_2
-
-    # tx1_3
-    extracted_tx1_3 = spliceu_seqs["tx1.3"].seq
-    expected_tx1_3 = sum(
-        [chr_1_seq[120:130], chr_1_seq[155:160], chr_1_seq[190:200]], Seq("")
-    )
-    assert extracted_tx1_3 == expected_tx1_3
-
-    # tx2_1
-    extracted_tx2_1 = spliceu_seqs["tx2.1"].seq
-    expected_tx2_1 = sum(
-        [chr_2_seq[0:2], chr_2_seq[35:45], chr_2_seq[70:80]], Seq("")
-    ).reverse_complement()
-    assert extracted_tx2_1 == expected_tx2_1
-
-    # tx2_2
-    extracted_tx2_2 = spliceu_seqs["tx2.2"].seq
-    expected_tx2_2 = sum(
-        [chr_2_seq[45:55], chr_2_seq[90:100]], Seq("")
-    ).reverse_complement()
-    assert extracted_tx2_2 == expected_tx2_2
-
-    # tx2_3
-    extracted_tx2_3 = spliceu_seqs["tx2.3"].seq
-    expected_tx2_3 = sum(
-        [chr_2_seq[120:130], chr_2_seq[155:160], chr_2_seq[190:200]], Seq("")
-    ).reverse_complement()
-    assert extracted_tx2_3 == expected_tx2_3
-
-    # check unspliced (gene-body) seqs
-    # g1-I [0,200]
-    extract_g1_I = spliceu_seqs["g1-I"].seq
-    expected_g1_I = sum([chr_1_seq[0:200]], Seq(""))
-    assert extract_g1_I == expected_g1_I
-
-    # g2-I [0,200]
-    extract_g2_I = spliceu_seqs["g2-I"].seq
-    expected_g2_I = sum([chr_2_seq[0:200]], Seq("")).reverse_complement()
-    assert extract_g2_I == expected_g2_I
-
-    # check t2g_3col
-    t2g_dict = t2g_3col[[t2g_colnames[1]]].to_dict()["gene_name"]
-    t2s_dict = t2g_3col[[t2g_colnames[2]]].to_dict()["splice_status"]
-
-    # check txp name to gene name mapping
-    # for g1
-    assert t2g_dict["tx1.1"] == "g1"
-    assert t2g_dict["tx1.2"] == "g1"
-    assert t2g_dict["tx1.3"] == "g1"
-    assert t2g_dict["g1-I"] == "g1"
-
-    # for g2
-    assert t2g_dict["tx2.1"] == "g2"
-    assert t2g_dict["tx2.2"] == "g2"
-    assert t2g_dict["tx2.3"] == "g2"
-    assert t2g_dict["g2-I"] == "g2"
-
-    # check txp name to splice status mapping
-    # for g1
-    assert t2s_dict["tx1.1"] == "S"
-    assert t2s_dict["tx1.2"] == "S"
-    assert t2s_dict["tx1.3"] == "S"
-    assert t2s_dict["g1-I"] == "U"
-
-    # for g2
-    assert t2s_dict["tx2.1"] == "S"
-    assert t2s_dict["tx2.2"] == "S"
-    assert t2s_dict["tx2.3"] == "S"
-    assert t2s_dict["g2-I"] == "U"
diff --git a/tests/test_make_splici.py b/tests/test_make_splici.py
deleted file mode 100644
index 21cb32f..0000000
--- a/tests/test_make_splici.py
+++ /dev/null
@@ -1,172 +0,0 @@
-from pyroe.make_txome import make_splici_txome
-import tempfile
-import os
-import pandas as pd
-from Bio import SeqIO
-from Bio.Seq import Seq
-
-
-def test_make_splici():
-    location = os.path.dirname(os.path.realpath(__file__))
-    test_file_dir = os.path.join(location, "data", "test_make_splici")
-    genome_path = os.path.join(test_file_dir, "small_example_genome.fa")
-    gtf_path = os.path.join(test_file_dir, "small_example.gtf")
-    filename_prefix = "splici"
-    extra_spliced = os.path.join(test_file_dir, "extra_spliced.txt")
-    extra_unspliced = os.path.join(test_file_dir, "extra_unspliced.txt")
-    output_dir = tempfile.TemporaryDirectory()
-    read_length = 5
-    flank_trim_length = 2
-    make_splici_txome(
-        genome_path=genome_path,
-        gtf_path=gtf_path,
-        read_length=read_length,
-        extra_spliced=extra_spliced,
-        extra_unspliced=extra_unspliced,
-        flank_trim_length=flank_trim_length,
-        output_dir=output_dir.name,
-        filename_prefix=filename_prefix,
-        no_bt=True,
-    )
-
-    out_fa = os.path.join(output_dir.name, "splici_fl3.fa")
-    out_t2g = os.path.join(output_dir.name, "splici_fl3_t2g_3col.tsv")
-    genome = SeqIO.to_dict(SeqIO.parse(genome_path, "fasta"))
-    # gene_annotation = pr.read_gtf(gtf_path).df
-    splici_seqs = SeqIO.to_dict(SeqIO.parse(out_fa, "fasta"))
-    t2g_colnames = ["txp_name", "gene_name", "splice_status"]
-    t2g_3col = pd.read_csv(
-        out_t2g, sep="\t", header=None, names=t2g_colnames
-    ).set_index(t2g_colnames[0])
-
-    # Define expected sequences.
-    # For each gene, we have 3 spliced txps; indexing starts from 1:
-    #   tx1 ([1,2], [36,45], [71,80]), with intron regions ([3,35], [46,70])
-    #   tx2 ([46,55], [91,100]), with intron ([56,90])
-    #   tx3 ([121,130], [156,160], [191,200]), with introns ([131,155], [161,190])
-    # By adding the flanking length, the intron regions become
-    #   ([0,38], [43,73])
-    #   ([53,93])
-    #   ([128,158], [158,193])
-    # After collapsing, we are left with three introns for each gene:
-    #   I1: [1,38]
-    #   I2: [43,93]
-    #   I3: [128,193]
-
-    # python slicing works as [start, end)
-
-    chr_1_seq = genome["chr1"].seq
-    chr_2_seq = genome["chr2"].seq
-    # check spliced sequences
-    # tx1_1
-    extracted_tx1_1 = splici_seqs["tx1.1"].seq
-    expected_tx1_1 = sum([chr_1_seq[0:2], chr_1_seq[35:45], chr_1_seq[70:80]], Seq(""))
-    assert extracted_tx1_1 == expected_tx1_1
-
-    # tx1_2
-    extracted_tx1_2 = splici_seqs["tx1.2"].seq
-    expected_tx1_2 = sum([chr_1_seq[45:55], chr_1_seq[90:100]], Seq(""))
-    assert extracted_tx1_2 == expected_tx1_2
-
-    # tx1_3
-    extracted_tx1_3 = splici_seqs["tx1.3"].seq
-    expected_tx1_3 = sum(
-        [chr_1_seq[120:130], chr_1_seq[155:160], chr_1_seq[190:200]], Seq("")
-    )
-    assert extracted_tx1_3 == expected_tx1_3
-
-    # tx2_1
-    extracted_tx2_1 = splici_seqs["tx2.1"].seq
-    expected_tx2_1 = sum(
-        [chr_2_seq[0:2], chr_2_seq[35:45], chr_2_seq[70:80]], Seq("")
-    ).reverse_complement()
-    assert extracted_tx2_1 == expected_tx2_1
-
-    # tx2_2
-    extracted_tx2_2 = splici_seqs["tx2.2"].seq
-    expected_tx2_2 = sum(
-        [chr_2_seq[45:55], chr_2_seq[90:100]], Seq("")
-    ).reverse_complement()
-    assert extracted_tx2_2 == expected_tx2_2
-
-    # tx2_3
-    extracted_tx2_3 = splici_seqs["tx2.3"].seq
-    expected_tx2_3 = sum(
-        [chr_2_seq[120:130], chr_2_seq[155:160], chr_2_seq[190:200]], Seq("")
-    ).reverse_complement()
-    assert extracted_tx2_3 == expected_tx2_3
-
-    # check intron seqs
-    # I1: [1,38]
-    # I2: [43,93]
-    # I3: [128,193]
-
-    # g1-I [0,38]
-    extract_g1_I = splici_seqs["g1-I"].seq
-    expected_g1_I = sum([chr_1_seq[0:38]], Seq(""))
-    assert extract_g1_I == expected_g1_I
-
-    # g1-I1 [43,93]
-    extract_g1_I1 = splici_seqs["g1-I1"].seq
-    expected_g1_I1 = sum([chr_1_seq[42:93]], Seq(""))
-    assert extract_g1_I1 == expected_g1_I1
-
-    # g1-I2 [128,193]
-    extract_g1_I2 = splici_seqs["g1-I2"].seq
-    expected_g1_I2 = sum([chr_1_seq[127:193]], Seq(""))
-    assert extract_g1_I2 == expected_g1_I2
-
-    # g2-I [0,38]
-    extract_g2_I = splici_seqs["g2-I"].seq
-    expected_g2_I = sum([chr_2_seq[0:38]], Seq("")).reverse_complement()
-    assert extract_g2_I == expected_g2_I
-
-    # g2-I1 [43,93]
-    extract_g2_I1 = splici_seqs["g2-I1"].seq
-    expected_g2_I1 = sum([chr_2_seq[42:93]], Seq("")).reverse_complement()
-    assert extract_g2_I1 == expected_g2_I1
-
-    # g2-I2 [128,193]
-    extract_g2_I2 = splici_seqs["g2-I2"].seq
-    expected_g2_I2 = chr_2_seq[127:193].reverse_complement()
-    assert extract_g2_I2 == expected_g2_I2
-
-    # check t2g_3col
-    t2g_dict = t2g_3col[[t2g_colnames[1]]].to_dict()["gene_name"]
-    t2s_dict = t2g_3col[[t2g_colnames[2]]].to_dict()["splice_status"]
-
-    # check txp name to gene name mapping
-    # for g1
-    assert t2g_dict["tx1.1"] == "g1"
-    assert t2g_dict["tx1.2"] == "g1"
-    assert t2g_dict["tx1.3"] == "g1"
-    assert t2g_dict["g1-I"] == "g1"
-    assert t2g_dict["g1-I1"] == "g1"
-    assert t2g_dict["g1-I2"] == "g1"
-
-    # for g2
-    assert t2g_dict["tx2.1"] == "g2"
-    assert t2g_dict["tx2.2"] == "g2"
-    assert t2g_dict["tx2.3"] == "g2"
-    assert t2g_dict["g2-I"] == "g2"
-    assert t2g_dict["g2-I1"] == "g2"
-    assert t2g_dict["g2-I2"] == "g2"
-
-    # check txp name to splice status mapping
-    # for g1
-    assert t2s_dict["tx1.1"] == "S"
-    assert t2s_dict["tx1.2"] == "S"
-    assert t2s_dict["tx1.3"] == "S"
-    assert t2s_dict["g1-I"] == "U"
-    assert t2s_dict["g1-I1"] == "U"
-    assert t2s_dict["g1-I2"] == "U"
-
-    # for g2
-    assert t2s_dict["tx2.1"] == "S"
-    assert t2s_dict["tx2.2"] == "S"
-    assert t2s_dict["tx2.3"] == "S"
-    assert t2s_dict["g2-I"] == "U"
-    assert t2s_dict["g2-I1"] == "U"
-    assert t2s_dict["g2-I2"] == "U"
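
As a closing aside, the intron names exercised by the deleted tests above (g1-I, g1-I1, g1-I2, ...) come from the groupby/cumcount expression removed in make_splici_txome. A self-contained sketch of just that naming scheme, using plain pandas (this is an illustration, not pyroe API):

    import pandas as pd

    # The first intron of each gene gets the bare "<gene_id>-I" name;
    # later ones get a numeric suffix ("-I1", "-I2", ...).
    gene_ids = pd.Series(["g1", "g1", "g1", "g2"])
    suffix = gene_ids.groupby(gene_ids).cumcount().astype(str).replace("0", "")
    names = ["-I".join(z) for z in zip(gene_ids, suffix)]
    print(names)  # ['g1-I', 'g1-I1', 'g1-I2', 'g2-I']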