diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml index 34560ff..741d0f7 100644 --- a/.github/workflows/testing.yml +++ b/.github/workflows/testing.yml @@ -15,7 +15,7 @@ jobs: echo $CONDA/bin >> $GITHUB_PATH - name: Install dependencies run: | - conda install -n base python=3.10 conda-forge::biopython numpy==1.26 bioconda::pyranges==0.0.129 packaging bioconda::scanpy pytest -y + conda install -n base -c bioconda -c conda-forge python=3.10 packaging pandas scanpy pytest -y - name: pytest run: | diff --git a/bin/pyroe b/bin/pyroe index db86bd4..4df61c3 100755 --- a/bin/pyroe +++ b/bin/pyroe @@ -1,11 +1,6 @@ #!/usr/bin/env python import logging -from pyroe import make_splici_txome, make_spliceu_txome -from pyroe import id_to_name - -if make_spliceu_txome is None or make_splici_txome is None or id_to_name is None: - raise ImportError("To run pyroe CLI, Please install pyranges, biopython and bedtools.") from pyroe import fetch_processed_quant from pyroe import convert from pyroe import output_formats @@ -37,133 +32,6 @@ if __name__ == "__main__": help="additional help", ) - # make-splici - parser_makeSplici = subparsers.add_parser( - "make-spliced+intronic", - help="Make spliced + intronic reference", - aliases=["make-splici"], - ) - parser_makeSplici.set_defaults(command="make-spliced+intronic") - parser_makeSplici.add_argument( - "genome_path", - metavar="genome-path", - type=str, - help="The path to a genome fasta file.", - ) - parser_makeSplici.add_argument( - "gtf_path", metavar="gtf-path", type=str, help="The path to a gtf file." - ) - parser_makeSplici.add_argument( - "read_length", - metavar="read-length", - type=int, - help="The read length of the single-cell experiment being processed (determines flank size).", - ) - parser_makeSplici.add_argument( - "output_dir", - metavar="output-dir", - type=str, - help="The output directory where splici reference files will be written.", - ) - parser_makeSplici.add_argument( - "--filename-prefix", - type=str, - default="splici", - help="The file name prefix of the generated output files.", - ) - parser_makeSplici.add_argument( - "--flank-trim-length", - type=int, - default=5, - help="Determines the amount subtracted from the read length to get the flank length.", - ) - parser_makeSplici.add_argument( - "--extra-spliced", - type=str, - help="The path to an extra spliced sequence fasta file.", - ) - parser_makeSplici.add_argument( - "--extra-unspliced", - type=str, - help="The path to an extra unspliced sequence fasta file.", - ) - parser_makeSplici.add_argument( - "--bt-path", - type=str, - default="bedtools", - help="The path to bedtools v2.30.0 or greater.", - ) - parser_makeSplici.add_argument( - "--no-bt", - action="store_true", - help="A flag indicates whether bedtools will be used for generating splici reference files.", - ) - parser_makeSplici.add_argument( - "--dedup-seqs", - action="store_true", - help="A flag indicates whether identical sequences will be deduplicated.", - ) - parser_makeSplici.add_argument( - "--no-flanking-merge", - action="store_true", - help="A flag indicates whether flank lengths will be considered when merging introns.", - ) - - # make-spliceu - parser_makeSpliceu = subparsers.add_parser( - "make-spliced+unspliced", - help="Make spliced + unspliced reference", - aliases=["make-spliceu"], - ) - parser_makeSpliceu.set_defaults(command="make-spliced+unspliced") - parser_makeSpliceu.add_argument( - "genome_path", - metavar="genome-path", - type=str, - help="The path to a genome fasta file.", - ) 
- parser_makeSpliceu.add_argument( - "gtf_path", metavar="gtf-path", type=str, help="The path to a gtf file." - ) - parser_makeSpliceu.add_argument( - "output_dir", - metavar="output-dir", - type=str, - help="The output directory where Spliceu reference files will be written.", - ) - parser_makeSpliceu.add_argument( - "--filename-prefix", - type=str, - default="spliceu", - help="The file name prefix of the generated output files.", - ) - parser_makeSpliceu.add_argument( - "--extra-spliced", - type=str, - help="The path to an extra spliced sequence fasta file.", - ) - parser_makeSpliceu.add_argument( - "--extra-unspliced", - type=str, - help="The path to an extra unspliced sequence fasta file.", - ) - parser_makeSpliceu.add_argument( - "--bt-path", - type=str, - default="bedtools", - help="The path to bedtools v2.30.0 or greater.", - ) - parser_makeSpliceu.add_argument( - "--no-bt", - action="store_true", - help="A flag indicates whether bedtools will be used for generating Spliceu reference files.", - ) - parser_makeSpliceu.add_argument( - "--dedup-seqs", - action="store_true", - help="A flag indicates whether identical sequences will be deduplicated.", - ) - # parse available datasets available_datasets = fetch_processed_quant() epilog = "\n".join( @@ -213,20 +81,6 @@ if __name__ == "__main__": help="A flag indicates whether help messaged should not be printed.", ) - parser_id_to_name = subparsers.add_parser( - "id-to-name", help="Generate a gene id to gene name mapping file from a GTF." - ) - parser_id_to_name.set_defaults(command="id-to-name") - parser_id_to_name.add_argument("gtf_file", help="The GTF input file.") - parser_id_to_name.add_argument( - "output", help="The path to where the output tsv file will be written." - ) - parser_id_to_name.add_argument( - "--format", - help="The input format of the file (must be either GTF or GFF3). This will be inferred from the filename, but if that fails it can be provided explicitly.", - default=None, - ) - out_formats = output_formats() parser_convert = subparsers.add_parser( "convert", help="Convert alevin-fry quantification result to another format." @@ -244,7 +98,7 @@ if __name__ == "__main__": ) parser_convert.add_argument( "--output-structure", - help="The structure that U,S and A counts should occupy in the output matrix.", + help="The structure that U,S and A counts should occupy in the output matrix. 
It will be passed to the output_format argument of the `load_fry` function.", ) parser_convert.add_argument( "--output-format", @@ -262,34 +116,7 @@ if __name__ == "__main__": # Execute the parse_args() method args = parser.parse_args() - if args.command == "make-spliced+intronic": - make_splici_txome( - genome_path=args.genome_path, - gtf_path=args.gtf_path, - read_length=args.read_length, - output_dir=args.output_dir, - flank_trim_length=args.flank_trim_length, - filename_prefix=args.filename_prefix, - extra_spliced=args.extra_spliced, - extra_unspliced=args.extra_unspliced, - dedup_seqs=args.dedup_seqs, - no_bt=args.no_bt, - bt_path=args.bt_path, - no_flanking_merge=args.no_flanking_merge, - ) - elif args.command == "make-spliced+unspliced": - make_spliceu_txome( - genome_path=args.genome_path, - gtf_path=args.gtf_path, - output_dir=args.output_dir, - filename_prefix=args.filename_prefix, - extra_spliced=args.extra_spliced, - extra_unspliced=args.extra_unspliced, - dedup_seqs=args.dedup_seqs, - no_bt=args.no_bt, - bt_path=args.bt_path, - ) - elif args.command == "fetch-quant": + if args.command == "fetch-quant": fetch_processed_quant( dataset_ids=args.dataset_ids, fetch_dir=args.fetch_dir, @@ -299,8 +126,6 @@ if __name__ == "__main__": ) elif args.command == "convert": convert(args) - elif args.command == "id-to-name": - id_to_name(args) else: print(parser.print_help()) sys.exit(1) diff --git a/setup.cfg b/setup.cfg index 6898d64..5e71be4 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = pyroe -version = 0.10.0 +version = 0.11.0 author = Dongze He, Rob Patro author_email = dhe17@umd.edu, rob@cs.umd.edu description = utilities of alevin-fry @@ -16,14 +16,14 @@ classifiers = packages = find: package_dir = = src -# scripts = -# bin/pyroe +scripts = + bin/pyroe python_requires = >=3.7 include_package_data = True install_requires = packaging >= 21.0 scanpy >= 1.8.2 - + pandas >= 1.3.0 [options.packages.find] where = src diff --git a/src/pyroe/__init__.py b/src/pyroe/__init__.py index ce93022..cb6ad9c 100644 --- a/src/pyroe/__init__.py +++ b/src/pyroe/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.10.0" +__version__ = "0.11.0" from pyroe.load_fry import load_fry from pyroe.fetch_processed_quant import fetch_processed_quant @@ -7,13 +7,4 @@ from pyroe.convert import convert from pyroe.pyroe_utils import output_formats - -# try: -# from pyroe.make_txome import make_splici_txome, make_spliceu_txome -# from pyroe.id_to_name import id_to_name -# except ImportError: -# make_splici_txome = None -# make_spliceu_txome = None -# id_to_name = None - # flake8: noqa diff --git a/src/pyroe/id_to_name.py b/src/pyroe/id_to_name.py deleted file mode 100644 index a1715c7..0000000 --- a/src/pyroe/id_to_name.py +++ /dev/null @@ -1,57 +0,0 @@ -import pyranges -import logging -import pathlib -import sys - - -def id_to_name(params): - format_map = { - "gtf": pyranges.read_gtf, - "gff": pyranges.read_gff3, - "gff3": pyranges.read_gff3, - } - annot_reader = None - if params.format is None: - p = pathlib.Path(params.gtf_file) - suffs = [z.lower().strip(".") for z in p.suffixes] - - z = None - if len(suffs) >= 1: - # look at the final suffix - z = suffs[-1] - # if the final suffix is gz and there are - # suffixes preceding it, check the penultimate - # one and use that - if z == "gz" and len(suffs) > 1: - z = suffs[-2] - - if z is None or z not in format_map.keys(): - logging.error( - "Could not determine format of annotation file. Please provide it explicitly." 
-            )
-            sys.exit(1)
-        else:
-            annot_reader = format_map[z]
-    else:
-        fmt = params.format.lower()
-        if fmt not in format_map.keys():
-            logging.error(
-                f'Format must be either "gtf" or "gff3", but {fmt} was provided.'
-            )
-            sys.exit(1)
-        annot_reader = format_map[fmt]
-
-    a = annot_reader(params.gtf_file)
-    # only bother looking at `gene` features
-    a_genes = a[(a.Feature == "gene")]
-    id_name = {}
-    for k, df in a_genes:
-        cdf = df[["gene_id", "gene_name"]].to_dict(orient="records")
-        id_name.update({d["gene_id"]: d["gene_name"] for d in cdf})
-
-    logging.info(f"generated mappings for {len(id_name)} gene ids.")
-    logging.info(f"writing output to {params.output}")
-
-    with open(params.output, "w") as ofile:
-        for k, v in id_name.items():
-            ofile.write(f"{k}\t{v}\n")
diff --git a/src/pyroe/load_fry.py b/src/pyroe/load_fry.py
index d8e8357..5615403 100644
--- a/src/pyroe/load_fry.py
+++ b/src/pyroe/load_fry.py
@@ -1,15 +1,14 @@
-try:
-    import scanpy
-except ModuleNotFoundError:
-    print(
-        "scanpy must be installed to enable the load_fry() function. Use `conda install -c scanpy ` or `pip install scanpy` to install it."
-    )
-    import sys
-
-    sys.exit(1)
+from .pyroe_utils import say


-def load_fry(frydir, output_format="scRNA", nonzero=False, quiet=False):
+def load_fry(
+    frydir,
+    output_format="scRNA",
+    aux_columns=["X", "Y"],
+    gene_id_to_name=None,
+    nonzero=False,
+    quiet=False,
+):
     """
     load alevin-fry quantification result into an AnnData object

@@ -26,6 +25,19 @@ def load_fry(frydir, output_format="scRNA", nonzero=False, quiet=False):
         If a customized format of the returned `AnnData` is needed, one can pass a Dictionary.\\
         See Notes section for details.

+    aux_columns : `list[str]` (default: `["X", "Y"]`)
+        A list of strings containing the column names of the auxiliary information in the barcodes file, starting from the second column. \\
+        The first column is assumed to be the barcodes and is named "barcodes". \\
+        Extra auxiliary columns in the barcodes file without a specified name will be ignored.
+
+    gene_id_to_name : `str` or `None` (default: `None`)
+        The path to a file that contains the mapping from gene ids to gene names. \\
+        It is only needed if \\
+        1. you are not using the simpleaf pipeline (`simpleaf index` + `simpleaf quant`), \\
+        2. you have such a file, and \\
+        3. you want to add this information to the gene metadata (`var`) of the returned `AnnData`.
+        If you do, please ensure it is a tab-separated, two-column file without a header, where the first column is the gene ids and the second column is the gene names.
+
     nonzero : `bool` (default: `False`)
         True if cells with non-zero expression value across all genes should be filtered in each layer.
         False if unexpressed genes should be kept.

@@ -45,7 +57,7 @@ def load_fry(frydir, output_format="scRNA", nonzero=False, quiet=False):

     The following formats are defined:

-    * "scRNA": \\
+    * "scRNA" and "S+A": \\
         This format is recommended for single cell RNA-sequencing experiments.
         It returns a `X` field that contains the S+A count of each gene in each cell,
         and a `unspliced` field that contains the U count of each gene.
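For orientation, here is a minimal usage sketch of the two new parameters documented above. This is a hedged example, not part of the patch: "quant_dir" stands for an alevin-fry/simpleaf output directory, and "id2name.tsv" for a header-less, tab-separated mapping file as described in the docstring.

    from pyroe import load_fry

    # Hypothetical inputs: "quant_dir" is a placeholder quantification
    # directory; "id2name.tsv" is a placeholder two-column file whose first
    # column holds gene ids and whose second column holds gene names.
    adata = load_fry(
        "quant_dir",
        output_format="scRNA",
        aux_columns=["X", "Y"],  # names for any extra barcode-file columns
        gene_id_to_name="id2name.tsv",  # omit to fall back to quant_dir/gene_id_to_name.tsv
        quiet=True,
    )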
@@ -89,6 +101,7 @@ def load_fry(frydir, output_format="scRNA", nonzero=False, quiet=False):
     import json
     import os
     import pandas as pd
+    import scanpy

     # since alevin-fry 0.4.1 the generic "meta_info.json"
     # has been replaced by a more informative name for each
     # of the supported alevin-fry quantification methods.
     meta_info_files = ["quant.json", "meta_info.json"]

     fpath = os.path.sep.join([frydir, meta_info_files[0]])
     # first, check for the new file, if we don't find it, check
     # for the old one.
     if not os.path.exists(fpath):
-        if not quiet:
-            print(
-                f"Did not find a {meta_info_files[0]} file, checking for older {meta_info_files[1]}."
-            )
+        say(
+            quiet,
+            f"Did not find a {meta_info_files[0]} file, checking for older {meta_info_files[1]}.",
+        )
+
         fpath = os.path.sep.join([frydir, meta_info_files[1]])
         # if we don't find the old one either, then return None
         if not os.path.exists(fpath):
-            raise IOError(f"Found no {meta_info_files[1]} file either; cannot proceed.")
+            raise IOError(
+                "The provided `frydir` doesn't contain the required meta info file; cannot proceed."
+            )

     # if we got here then we had a valid json file, so
     # use it to get the number of genes, and if we are
@@ -115,8 +131,8 @@ def load_fry(frydir, output_format="scRNA", nonzero=False, quiet=False):
     meta_info = json.load(open(fpath))
     ng = meta_info["num_genes"]
     usa_mode = meta_info["usa_mode"]
-    if not quiet:
-        print(f"USA mode: {usa_mode}")
+
+    say(quiet, f"USA mode: {usa_mode}")

     # if we are in USA mode
     if usa_mode:
@@ -125,42 +141,92 @@ def load_fry(frydir, output_format="scRNA", nonzero=False, quiet=False):
         # genes is ng/3.
         ng = int(ng / 3)
         output_assays = process_output_format(output_format, quiet)
-    elif not quiet:
-        print(
-            "Processing input in standard mode, the count matrix will be stored in field 'X'."
+    else:
+        say(
+            quiet,
+            "Processing input in standard mode, the count matrix will be stored in field 'X'.",
         )
         if output_format != "scRNA":
-            print("Output_format will be ignored.")
+            say(quiet, "Output_format will be ignored.")

-    # read the actual input matrix
-    af_raw = scanpy.read_mtx(os.path.sep.join([frydir, "alevin", "quants_mat.mtx"]))
-    afg = [
-        line.rstrip()
-        for line in open(
-            os.path.sep.join([frydir, "alevin", "quants_mat_cols.txt"])
-        ).readlines()
-    ][:ng]  # read the gene ids
-    afg_df = pd.DataFrame(afg, columns=["gene_ids"])
-    afg_df = afg_df.set_index("gene_ids")
-    # and the barcodes
-    abc = [
-        line.rstrip()
-        for line in open(
-            os.path.sep.join([frydir, "alevin", "quants_mat_rows.txt"])
-        ).readlines()
-    ]
-    abc_df = pd.DataFrame(abc, columns=["barcodes"])
-    abc_df.index = abc_df["barcodes"]
+    afg_df = pd.read_table(
+        os.path.sep.join([frydir, "alevin", "quants_mat_cols.txt"]),
+        names=["gene_ids"],
+        nrows=ng
+    )
+
+    # if we have a gene id to name mapping, use it
+    # Otherwise, we see if there is a default mapping file
+    # We first check whether the user provided one
+    gene_id_to_name_path = gene_id_to_name
+    # make sure the file exists
+    if gene_id_to_name_path is not None:
+        if not os.path.exists(gene_id_to_name_path):
+            say(
+                quiet,
+                f"The provided gene id to name mapping file {gene_id_to_name_path} does not exist; ignored.",
+            )
+            gene_id_to_name_path = None
+
+    # we then check if we can find a default mapping file
+    if gene_id_to_name_path is None:
+        default_gene_id_to_name_path = os.path.sep.join([frydir, "gene_id_to_name.tsv"])
+        if os.path.exists(default_gene_id_to_name_path):
+            gene_id_to_name_path = default_gene_id_to_name_path
+            say(
+                quiet,
+                f"Using simpleaf gene id to name mapping file: {gene_id_to_name_path}",
+            )
+    # read the file if we find it
+    if gene_id_to_name_path is not None:
+        gene_id_to_name_df = pd.read_table(
+            gene_id_to_name_path, names=["gene_ids", "gene_names"]
+        )
+        afg_df = pd.merge(afg_df, gene_id_to_name_df, on="gene_ids", how="left")
+
+    afg_df = afg_df.set_index("gene_ids", drop=False)
+
+    # read the barcodes file
+    abc_df = pd.read_table(
+        os.path.sep.join([frydir, "alevin", "quants_mat_rows.txt"]), header=None
+    )
+
+    # if the column names and the number of columns don't match, reconcile them
+    # we have only barcode
+    if abc_df.shape[1] == 1:
+        columns = ["barcodes"]
+    else:
+        say(quiet, f"Found {abc_df.shape[1] - 1} auxiliary columns in barcodes file.")
+        columns = ["barcodes"] + aux_columns
+
+    if abc_df.shape[1] != len(columns):
+        ncol = min(abc_df.shape[1], len(columns))
+
+        say(
+            quiet,
+            f"Number of auxiliary columns in barcodes file does not match the provided column names. Using the first {ncol} columns in the barcode file with names: {columns[:ncol]}.",
+        )
+
+        abc_df.columns = columns[:ncol]
+        columns = columns[:ncol]
+
+    abc_df = abc_df.set_axis(columns, axis=1)
+    abc_df = abc_df.set_index(columns[0], drop=False)
+
+    say(quiet, "Reading the quantification matrix.")
+    af_raw = scanpy.read_mtx(os.path.sep.join([frydir, "alevin", "quants_mat.mtx"]))
     x = af_raw.X

     # if we're not in USA mode, just combine this info into
     # an AnnData object
     if not usa_mode:
+        say(quiet, "Constructing the AnnData object.")
         af = scanpy.AnnData(x.T, var=abc_df, obs=afg_df)
         af = af.T
     else:  # USA mode
+        say(quiet, "Converting USA mode quantification into specified layers.")
         # otherwise, combine the sub-matrices into the output object as
         # specified by `output_assays`
         rd = {"S": range(0, ng), "U": range(ng, 2 * ng), "A": range(2 * ng, 3 * ng)}
@@ -191,9 +257,9 @@ def load_fry(frydir, output_format="scRNA", nonzero=False, quiet=False):

         af = af[:, not_zero_genes]

-        if not quiet:
-            print(f"Filtered {np.sum(~not_zero_genes)} non-expressed genes.")
+        say(quiet, f"Filtered {np.sum(~not_zero_genes)} non-expressed genes.")

+    say(quiet, "Done.")
     return af

@@ -226,32 +292,30 @@ def process_output_format(output_format, quiet):
         output_format = output_format.lower()
         if output_format not in predefined_format.keys():
             # invalid output_format string
-            if not quiet:
-                print("A undefined Provided output_format string provided.")
-                print("See function help message for details.")
+            say(quiet, "An undefined output_format string was provided.")
+            say(quiet, "See function help message for details.")
             raise ValueError("Invalid output_format.")
-        if not quiet:
-            print("Using pre-defined output format:", output_format)
-            print(
-                f"Will populate output field X with sum of counts frorm {predefined_format[output_format]['X']}."
-            )
-            for k, v in predefined_format[output_format].items():
-                if k != "X":
-                    print(f"Will combine {v} into output layer {k}.")
+        say(quiet, f"Using pre-defined output format: {output_format}")
+        say(
+            quiet,
+            f"Will populate output field X with sum of counts from {predefined_format[output_format]['X']}.",
+        )
+        for k, v in predefined_format[output_format].items():
+            if k != "X":
+                say(quiet, f"Will combine {v} into output layer {k}.")

         return predefined_format[output_format]
     else:
-        if not quiet:
-            print("Processing user-defined output format.")
+        say(quiet, "Processing user-defined output format.")
         # make sure the X is there
         if "X" not in output_format.keys():
             raise ValueError(
                 'In USA mode some sub-matrices must be assigned to the "X" (default) output.'
             )
-        if not quiet:
-            print(
-                f"Will populate output field X with sum of counts frorm {output_format['X']}."
-            )
+        say(
+            quiet,
+            f"Will populate output field X with sum of counts from {output_format['X']}.",
+        )

         for k, v in output_format.items():
             if not v:
                 raise ValueError(
@@ -266,8 +330,8 @@ def process_output_format(output_format, quiet):
                     raise ValueError(
                         f"Found non-USA element in output_format element list '{v}' for key '{k}'; cannot proceed."
                     )
-        if not quiet and (k != "X"):
-            print(f"Will combine {v} into output layer {k}.")
+        if k != "X":
+            say(quiet, f"Will combine {v} into output layer {k}.")

         return output_format
     else:
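The user-defined branch above accepts a dictionary in place of a predefined format string. A hedged sketch of that calling pattern follows; the layer names are illustrative and "quant_dir" is a placeholder, not a value from this patch.

    from pyroe import load_fry

    # In USA mode, "X" must be assigned and receives the summed counts of the
    # listed splicing statuses; every other key becomes a layer in the AnnData.
    custom_format = {
        "X": ["S", "A"],      # spliced + ambiguous counts in the main matrix
        "unspliced": ["U"],   # U counts kept in their own layer
    }
    adata = load_fry("quant_dir", output_format=custom_format, quiet=True)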
diff --git a/src/pyroe/make_splici_txome.py b/src/pyroe/make_splici_txome.py
deleted file mode 100755
index aeca2a0..0000000
--- a/src/pyroe/make_splici_txome.py
+++ /dev/null
@@ -1,650 +0,0 @@
-def append_extra(extra_infile, out_fa, out_t2g3col, id2name_path, col_status):
-    """
-    extra_infile : str
-        path to the extra FASTA format input file from which to read the extra records
-    out_fa : str
-        path to the output FASTA file being created (will be *appended* to)
-    out_t2g3col : str
-        path to the output t2g 3-column file (will be *appended* to)
-    id2name_path : str
-        path to the output (existing) gene id to gene name tsv (will be *appended* to)
-    col_status : char
-        splicing status to use for these records (one of 'S' or 'U')
-    """
-
-    from Bio.SeqIO.FastaIO import SimpleFastaParser
-
-    # write extra sequences to the t2g file
-    with open(extra_infile) as extra_in_fa:
-        with open(out_fa, "a") as splici_fa:
-            with open(out_t2g3col, "a") as t2g:
-                with open(id2name_path, "a") as id_to_name:
-                    for title, sequence in SimpleFastaParser(extra_in_fa):
-                        tid = title.split()[0]
-                        splici_fa.write(f">{tid}\n")
-                        splici_fa.write(f"{sequence}\n")
-                        t2g.write(f"{tid}\t{tid}\t{col_status}\n")
-                        id_to_name.write(f"{tid}\t{tid}\n")
-
-
-def dedup_sequences(output_dir, in_fa, out_fa):
-    """
-    This function deduplicates the fasta sequences in `in_fa`, writes the
-    deduplicated sequences in fasta format to `out_fa`, and also produces a
-    2-column tsv file in `ouput_dir`/duplicate_entries.tsv recording the
-    entries that have been removed and the retained entry that they match.
-    Note: It is allowed (and in some cases intended) that `in_fa` == `out_fa`.
-    In this case, the input fasta file will be overwritten.
-
-    output_dir : str
-        The output directory where duplicate_entries.tsv will be written
-    in_fa : str
-        The path to the input fasta file to be deduplicated
-    out_fa : str
-        The path to where the ouput deduplicated fasta file should be written
-    """
-    import os
-    from Bio import SeqIO
-    from Bio.Seq import Seq
-    from Bio.SeqRecord import SeqRecord
-
-    record_representatives = {}
-
-    # read from the input fasta file and track
-    # the duplicate sequences
-    for record in SeqIO.parse(in_fa, "fasta"):
-        seq = str(record.seq)
-        name = record.id.split()[0]
-        # if we haven't seen it yet, this is the representative
-        # otherwise append the name of the duplicate
-        if seq in record_representatives:
-            record_representatives[seq].append(name)
-        else:
-            record_representatives[seq] = [name]
-
-    # write out the duplicate entries in the same format
-    # the salmon / pufferfish indexer uses
-    dup_file_name = os.path.sep.join([output_dir, "duplicate_entries.tsv"])
-    with open(dup_file_name, "w") as dup_file:
-        dup_file.write("RetainedRef\tDuplicateRef\n")
-        for k, vu in record_representatives.items():
-            if len(vu) > 1:
-                v = sorted(vu)
-                for dup in v[1:]:
-                    dup_file.write(f"{v[0]}\t{dup}\n")
-
-    # write the deduplicated output to file
-    # Note: it might be that out_file == in_file and we
-    # are overwriting the input.
-    with open(out_fa, "w") as ofile:
-        for k, vu in record_representatives.items():
-            v = sorted(vu)
-            rec = SeqRecord(Seq(k), id=v[0], description="")
-            SeqIO.write(rec, ofile, "fasta")
-
-
-def check_gr(gr, output_dir, write_clean_gtf):
-    """
-    This function checks the validity of a PyRanges object.
- It will follow these rules: - 1. For gene type records, gene_id and gene_name cannot be both missing. - 2. All other types of records have to have a valid (not NaN) transcript_id. - 3. For all types of records other than gene type, - - If the gene_id and gene_name are both missing, then the transcript_id - will be used to impute them. - - If one of gene_id and gene_name is missing, then the other will be - used to impute the missing one. - - Args: - gr (pyranges): Stranded PyRanges object - """ - - import pandas as pd - import os - import pyranges as pr - import warnings - - # split gene type records with others - # we don't use gene records in splici construction - gene_gr = gr[gr.Feature == "gene"] - gr = gr[gr.Feature != "gene"] - clean_gtf_path = os.path.join(output_dir, "clean_gtf.gtf") - - # If required fields are missing, quit - if "transcript_id" not in gr.columns: - raise ValueError( - "The input GTF file doesn't contain transcript_id field; Cannot proceed." - ) - - if "gene_id" not in gr.columns: - # use gene_name as gene_id if exists, return an error otherwise - if "gene_name" not in gr.columns: - raise ValueError( - "The input GTF file doesn't contain gene_id and gene_name field; Cannot proceed." - ) - else: - warnings.warn("gene_id field does not exist, use gene_name as gene_id.") - gene_id = pd.Series(data=gr.gene_name, name="gene_id") - gr = gr.insert(gene_id) - - if "gene_name" not in gr.columns: - warnings.warn("gene_name field does not exist, use gene_id as gene_name.") - gene_name = pd.Series(data=gr.gene_id, name="gene_name") - gr = gr.insert(gene_name) - - # If there is any NaN in transcript_id field, quit - if sum(gr.transcript_id.isnull()): - - # first, write an clean GTF if needed - if write_clean_gtf: - gr = pr.concat([gene_gr, gr[gr.transcript_id.notnull()]]) - gr.to_gtf(clean_gtf_path) - # gr[gr.transcript_id.notnull()].to_gtf(clean_gtf_path) - clean_gtf_msg = f"An clean GTF file is written to {clean_gtf_path}." - else: - clean_gtf_msg = "Set the write_clean_gtf flag if a clean GTF without the invalid records is needed." - # Then, raise a value error - raise ValueError( - f"Found NaN value in exons' transcript ID; Cannot proceed.\n{clean_gtf_msg}" - ) - - # Impute missing gene_id and gene_name values - # define an object - num_nan = gr.df[["gene_id", "gene_name"]].isnull().sum(axis=1) - - if num_nan.sum(): - - # create intermediate df - gene_df = gr.df[["gene_id", "gene_name"]] - - # Firstly, write the problematic records to a file before imputation - problematic_gtf_path = os.path.join( - output_dir, "missing_gene_id_or_name_records.gtf" - ) - gr[num_nan > 0].to_gtf(problematic_gtf_path) - missing_record_msg = f"\nFound records with missing gene_id/gene_name field.\nThese records are reported in {problematic_gtf_path}." - - # impute using transcript_id - double_missing = num_nan == 2 - if double_missing.sum(): - gene_df.loc[num_nan == 2, "gene_id"] = gr.transcript_id[num_nan == 2] - gene_df.loc[num_nan == 2, "gene_name"] = gr.transcript_id[num_nan == 2] - double_missing_msg = f"\n - Found {(num_nan == 2).sum()} records missing gene_id and gene_name, imputing using transcript_id." 
- else: - double_missing_msg = "" - - # If one field is missing, impute using the other - # missing only gene_id - gene_id_missing = gene_df["gene_id"].isnull() - if gene_id_missing.sum(): - gene_df.loc[gene_df["gene_id"].isnull(), "gene_id"] = gene_df.loc[ - gene_df["gene_id"].isnull(), "gene_name" - ] - gene_id_missing_msg = f"\n - Found {gene_id_missing.sum()} records missing gene_id, imputing using gene_name." - else: - gene_id_missing_msg = "" - - gene_name_missing = gene_df["gene_name"].isnull() - if gene_name_missing.sum(): - gene_df.loc[gene_df["gene_name"].isnull(), "gene_name"] = gene_df.loc[ - gene_df["gene_name"].isnull(), "gene_id" - ] - gene_name_missing_msg = f"\n - Found {gene_name_missing.sum()} records missing gene_name, imputing using gene_id." - else: - gene_name_missing_msg = "" - - # write the warning message - warnings.warn( - "".join( - [ - missing_record_msg, - double_missing_msg, - gene_id_missing_msg, - gene_name_missing_msg, - ] - ) - ) - - # replace the old gene_id and gene_name fields using imputed one. - gr = gr.drop(["gene_id", "gene_name"]) - gr = gr.insert(gene_df) - # if gene_gr is used in the future, then concat them. - if write_clean_gtf: - clean_gr = pr.concat([gene_gr, gr]) - clean_gr.to_gtf(clean_gtf_path) - clean_gtf_msg = f"An clean GTF file is written to {clean_gtf_path}." - print(clean_gtf_msg) - - # gr = pr.concat([gr, gene_gr]) - - # return imputed gr - return gr - - -def make_splici_txome( - genome_path, - gtf_path, - read_length, - output_dir, - flank_trim_length=5, - filename_prefix="splici", - extra_spliced=None, - extra_unspliced=None, - dedup_seqs=False, - no_bt=False, - bt_path="bedtools", - no_flanking_merge=False, - write_clean_gtf=False, -): - """ - Construct the splici (spliced + introns) transcriptome for alevin-fry. - - Required Parameters - ---------- - genome_path : str - The path to a genome fasta file. - - gtf_path : str - The path to a gtf file. - - read_length : int - The read length of the single-cell experiment being processed. - - output_dir : str - The output directory, where the splici reference files will - be written. - - Optional Parameters - ---------- - - flank_trim_length : int (default: `5`) - The flank trimming length. - The final flank length is obtained by subtracting - the flank_trim_length from the read_length. - - filename_prefix : str (default: `splici`) - The file name prefix of the generated output files. - The derived flank length will be automatically - appended to the provided prefix. - - extra_spliced : str - A path to a fasta file. The records in this fasta file will be - regarded as spliced transcripts. - - extra_unspliced : str - The path to a fasta file. The records in this fasta file will be - regarded as introns. - - dedup_seqs : bool (default: `False`) - If True, the repeated sequences in the splici reference will be - deduplicated. - - no_bt : bool (default: `False`) - If true, biopython, instead of bedtools, will be used for - generating splici reference files. - - bt_path : str - The path to bedtools v2.30.0 or greater if it is not in the environment PATH. - - no_flanking_merge : bool (default: `False`) - If true, overlapping introns caused by the added flanking length will not be merged. - - write_clean_gtf : bool (default: `False`) - If true, when the input GTF contains invalid records, a clean GTF - file `clean_gtf.gtf` with these invalid records removed will be - exported to the output dir. - An invalid record is an exon record without an transcript ID - in the `transcript_id` field. 
- - - Returns - ------- - Nothing will be returned. The splici reference files will be written - to disk. - - Notes - ----- - * If using bedtools, a temp.bed and a temp.fa will be created and - then deleted. These two files encode the introns of each gene - and the exons of each transcript of each gene. - - """ - - import pyranges as pr - - import warnings - - import os - - import subprocess - import shutil - - from packaging.version import parse as parse_version - - from Bio import SeqIO - from Bio.SeqIO.FastaIO import SimpleFastaParser - - # Preparation - - # check flanking length - flank_length = read_length - flank_trim_length - - if flank_length < 0: - raise ValueError("Flank trim length cannot be larger than read length!") - - # check fasta file - if not os.path.isfile(genome_path): - raise IOError("Cannot open the input fasta file!") - - # check gtf file - if not os.path.isfile(gtf_path): - raise IOError("Cannot open the input gtf file!") - - def check_bedtools_version(bt_check_path): - try: - vstr = ( - subprocess.run([bt_path, "--version"], capture_output=True) - .stdout.decode() - .strip() - .split("v")[1] - ) - found_ver = parse_version(vstr) - req_ver = parse_version("2.30.0") - return found_ver >= req_ver - except subprocess.CalledProcessError as err: - # in this case couldn't even run subprocess - warnings.warn(f"Cannot check bedtools version.\n{err}") - return False - - # check bedtools - if not no_bt: - # check at the provided path - if not check_bedtools_version(bt_path): - # if it's not ok at the provided path, check - # the standard system path - if bt_path == "bedtools": - # in this case, there's nowhere else to check - # so give up on bedtools - print( - "bedtools in the environemnt PATH is either", - "older than v.2.30.0 or doesn't exist.", - "\nBiopython will be used.", - ) - no_bt = True - else: - print( - "bedtools specified by bt_path is either", - "older than v.2.30.0 or doesn't exist.", - "\nTry finding bedtools in the environmental PATH.", - ) - # if it's not ok at the standard system path - # fallback to biopython - if not check_bedtools_version("bedtools"): - print( - "bedtools in the environemnt PATH is either", - "older than v.2.30.0 or doesn't exist.", - "\nBiopython will be used.", - ) - no_bt = True - # found it at the system path - else: - bt_path = "bedtools" - print("Using bedtools in the environmental PATH.") - - # create out folder and temp folder inside - # create output folder - if not os.path.exists(output_dir): - os.makedirs(output_dir) - - # specify output file names - filename_prefix = filename_prefix + "_fl" + str(flank_length) - out_fa = os.path.join(output_dir, filename_prefix + ".fa") - out_t2g3col = os.path.join(output_dir, filename_prefix + "_t2g_3col.tsv") - id2name_path = os.path.join(output_dir, "gene_id_to_name.tsv") - - # load gtf - try: - gr = pr.read_gtf(gtf_path) - except ValueError: - # in this case couldn't even run subprocess - raise RuntimeError( - "PyRanges failed to parse the input GTF file. 
Please check the PyRanges documentation for the expected GTF format constraints.\nhttps://pyranges.readthedocs.io/en/latest/autoapi/pyranges/readers/index.html?highlight=read_gtf#pyranges.readers.read_gtf" - ) - - gr = pr.read_gtf(gtf_path) - - # check the validity of gr - gr = check_gr(gr, output_dir, write_clean_gtf) - - # write gene id to name tsv file - gr.df[["gene_id", "gene_name"]].drop_duplicates().to_csv( - id2name_path, sep="\t", header=False, index=False - ) - - # get introns - # the introns() function uses inplace=True argument from pandas, - # which will trigger an FutureWarning. - warnings.simplefilter(action="ignore", category=FutureWarning) - introns = gr.features.introns(by="transcript") - introns.Name = introns.gene_id - - if no_flanking_merge: - introns = introns.merge(strand=True, by=["Name"], slack=0) - - introns = introns.extend(flank_length) - - if not no_flanking_merge: - introns = introns.merge(strand=True, by=["Name"], slack=0) - - introns.Gene = introns.Name - introns.Name = [ - "-I".join(map(str, z)) - for z in zip( - introns.Name, - introns.Name.groupby(introns.Name) - .cumcount() - .astype(str) - .replace( - "0", - "", - ) - .values, - ) - ] - - # trim outbounded introns - with open(genome_path) as fasta_file: - chromsize = { - title.split()[0]: len(sequence) - for title, sequence in SimpleFastaParser(fasta_file) - } - introns = pr.gf.genome_bounds(introns, chromsize, clip=True) - - # deduplicate introns - if dedup_seqs: - introns.drop_duplicate_positions() - - # add splice status for introns - introns.splice_status = "U" - - # get exons - exons = gr[gr.Feature == "exon"] - - exons.Name = exons.transcript_id - exons.Gene = exons.gene_id - exons = exons.drop(exons.columns[~exons.columns.isin(introns.columns)].tolist()) - exons = exons.sort(["Name", "Start", "End"]) - # add splice status for exons - exons.splice_status = "S" - - # concat spliced transcripts and introns as splici - splici = pr.concat([exons, introns]) - - # splici = splici.sort(["Name", "Start", "End", "Gene"]) - - # write to files - # t2g_3col.tsv - splici.df[["Name", "Gene", "splice_status"]].drop_duplicates().to_csv( - out_t2g3col, sep="\t", header=False, index=False - ) - # print(splici.head()) - tid2strand = dict(zip(splici.Name, splici.Strand)) - - # splici fasta - if not no_bt: - try: - - # create temp folder - temp_dir = os.path.join(output_dir, "temp") - if not os.path.exists(temp_dir): - os.makedirs(temp_dir) - temp_fa = os.path.join(temp_dir, "temp.fa") - temp_bed = os.path.join(temp_dir, "temp.bed") - - # write bed file - splici.to_bed(temp_bed, keep=True) - - # run bedtools, ignore strand for now - bt_r = subprocess.run( - " ".join( - [ - bt_path, - "getfasta", - "-fi", - genome_path, - "-fo", - temp_fa, - "-bed", - temp_bed, - # "-s", - "-nameOnly", - ] - ), - shell=True, - capture_output=True, - ) - - # check return code - if bt_r.returncode != 0: - raise ValueError("Bedtools failed.") - - # parse temp fasta file to concat exons of each transcript - ei_parser = SeqIO.parse(temp_fa, "fasta") - prev_rec = next(ei_parser) - # prev_rec.id = prev_rec.id.split("(")[0] - prev_rec.description = "" - with open(out_fa, "w") as out_handle: - - for seq_record in ei_parser: - # seq_record.id = seq_record.id.split("(")[0] - seq_record.description = "" - if seq_record.id == prev_rec.id: - prev_rec += seq_record - - else: - if tid2strand[prev_rec.id] == "-": - prev_rec = prev_rec.reverse_complement( - id=True, description=True - ) - SeqIO.write(prev_rec, out_handle, "fasta") - prev_rec = 
seq_record - # Don't forget our last customer - if tid2strand[prev_rec.id] == "-": - prev_rec = prev_rec.reverse_complement(id=True, description=True) - SeqIO.write(prev_rec, out_handle, "fasta") - shutil.rmtree(temp_dir, ignore_errors=True) - except subprocess.CalledProcessError as err: - no_bt = True - warnings.warn(f"Bedtools failed. Use biopython instead.\n{err}") - shutil.rmtree(temp_dir, ignore_errors=True) - - if no_bt: - - from Bio.Seq import Seq - from Bio.SeqFeature import SeqFeature, FeatureLocation - - from Bio.SeqRecord import SeqRecord - - with open(out_fa, "w") as out_handle: - # read fasta, process a chromosome at a time - for seq_record in SeqIO.parse(genome_path, "fasta"): - # get all records on that chromosome - chr_records = introns[introns.Chromosome == seq_record.id].df - if not chr_records.empty: - chr_records.Strand = chr_records.Strand.replace( - ["+", "-"], [+1, -1] - ) - # init seq list - intron_seqs = [] - # for each intron record - for idx, intron_record in chr_records.iterrows(): - # create Seqeture object for extracting sequence from chromosome - intron_feature = SeqFeature( - FeatureLocation(intron_record.Start, intron_record.End), - type="intron", - strand=intron_record.Strand, - id=intron_record.Name, - ) - # append the intron sequence to the seq list, specify name as well - intron_seqs.append( - SeqRecord( - intron_feature.extract(seq_record).seq, - id=intron_record.Name, - description="", - ) - ) - # finally, write all intron sequences at once. - SeqIO.write(intron_seqs, out_handle, "fasta") - - # Then, process spliced transcripts - chr_records = exons[exons.Chromosome == seq_record.id].df - if not chr_records.empty: - txp_seqs = [] - # as spliced txps are the concat of all exon sequences, fist get the sequence of each exon separately,then sum them up. - for tid, exon_records in chr_records.groupby("Name"): - - # init exon seq list - exon_seqs = [] - # get the sequence of each exon - for idx, exon_record in exon_records.iterrows(): - # create SeqFeature object for the exon record - # ignore strand for now, get reverse complement later if needed - exon_feature = SeqFeature( - FeatureLocation(exon_record.Start, exon_record.End), - type="exon", - ) - # extract exon sequence from chromosome and append to exon seq list - exon_seqs.append( - SeqRecord( - exon_feature.extract(seq_record).seq, - id=tid, - description="", - ) - ) - # append the txp sequence to spliced txp seq list - # consider strand - if tid2strand[tid] == "-": - txp_seqs.append( - sum(exon_seqs, Seq("")).reverse_complement( - id=True, description=True - ) - ) - else: - txp_seqs.append(sum(exon_seqs, Seq(""))) - # write all spliced transcript serquence at once. - SeqIO.write(txp_seqs, out_handle, "fasta") - - # append extra spliced transcript onto splici - if extra_spliced is not None: - append_extra(extra_spliced, out_fa, out_t2g3col, id2name_path, "S") - - # append extra unspliced transcript onto splici - if extra_unspliced is not None: - append_extra(extra_unspliced, out_fa, out_t2g3col, id2name_path, "U") - - if dedup_seqs: - # Note: out_fa is intentionally passed as both - # the input and output file name parameters because - # we want to overwirte the duplicate fasta with - # the deduplicated fasta. 
- dedup_sequences(output_dir, out_fa, out_fa) diff --git a/src/pyroe/make_txome.py b/src/pyroe/make_txome.py deleted file mode 100644 index 5485b63..0000000 --- a/src/pyroe/make_txome.py +++ /dev/null @@ -1,1230 +0,0 @@ -import os -import warnings -import subprocess -import shutil -import pyranges as pr -import pandas as pd -import numpy as np -from Bio import SeqIO -from Bio.Seq import Seq -from Bio.SeqFeature import SeqFeature, FeatureLocation -from Bio.SeqRecord import SeqRecord -from Bio.SeqIO.FastaIO import SimpleFastaParser -from packaging.version import parse as parse_version -import logging - -bed_required_fields = [ - "Chromosome", - "Start", - "End", - "Strand", - "Name", - "Gene", - "splice_status", -] - - -def append_extra(extra_infile, out_fa, out_t2g3col, id2name_path, col_status): - """ - extra_infile : str - path to the extra FASTA format input file from which to read the extra records - out_fa : str - path to the output FASTA file being created (will be *appended* to) - out_t2g3col : str - path to the output t2g 3-column file (will be *appended* to) - id2name_path : str - path to the output (existing) gene id to gene name tsv (will be *appended* to) - col_status : char - splicing status to use for these records (one of 'S' or 'U') - """ - - # write extra sequences to the t2g file - with open(extra_infile) as extra_in_fa: - with open(out_fa, "a") as splici_fa: - with open(out_t2g3col, "a") as t2g: - with open(id2name_path, "a") as id_to_name: - for title, sequence in SimpleFastaParser(extra_in_fa): - tid = title.split()[0] - splici_fa.write(f">{tid}\n") - splici_fa.write(f"{sequence}\n") - t2g.write(f"{tid}\t{tid}\t{col_status}\n") - id_to_name.write(f"{tid}\t{tid}\n") - - -def dedup_sequences(output_dir, in_fa, out_fa): - """ - This function deduplicates the fasta sequences in `in_fa`, writes the - deduplicated sequences in fasta format to `out_fa`, and also produces a - 2-column tsv file in `ouput_dir`/duplicate_entries.tsv recording the - entries that have been removed and the retained entry that they match. - Note: It is allowed (and in some cases intended) that `in_fa` == `out_fa`. - In this case, the input fasta file will be overwritten. - - output_dir : str - The output directory where duplicate_entries.tsv will be written - in_fa : str - The path to the input fasta file to be deduplicated - out_fa : str - The path to where the ouput deduplicated fasta file should be written - """ - record_representatives = {} - - # read from the input fasta file and track - # the duplicate sequences - for record in SeqIO.parse(in_fa, "fasta"): - seq = str(record.seq) - name = record.id.split()[0] - # if we haven't seen it yet, this is the representative - # otherwise append the name of the duplicate - if seq in record_representatives: - record_representatives[seq].append(name) - else: - record_representatives[seq] = [name] - - # write out the duplicate entries in the same format - # the salmon / pufferfish indexer uses - dup_file_name = os.path.sep.join([output_dir, "duplicate_entries.tsv"]) - with open(dup_file_name, "w") as dup_file: - dup_file.write("RetainedRef\tDuplicateRef\n") - for k, vu in record_representatives.items(): - if len(vu) > 1: - v = sorted(vu) - for dup in v[1:]: - dup_file.write(f"{v[0]}\t{dup}\n") - - # write the deduplicated output to file - # Note: it might be that out_file == in_file and we - # are overwriting the input. 
- with open(out_fa, "w") as ofile: - for k, vu in record_representatives.items(): - v = sorted(vu) - rec = SeqRecord(Seq(k), id=v[0], description="") - SeqIO.write(rec, ofile, "fasta") - - -def check_gr(gr, output_dir): - """ - This function checks the validity of a PyRanges object and generates a clean GTF file in the expected format if there is any invalid record in the input PyRanges object. - It applies the following rules: - 1. Each non-gene record has to have a valid transcript_id. If this is not satisfied, it returns an error. Only the records with a valid transcript_id will be written to the clean_gtf.gtf. - 2. For gene_id and gene_name metadata field, - - If these two fields are entirely missing in the GTF file, An error will be returned. At the same time, in the clean_gtf.gtf, the two fields will be imputed using the transcript_id fields. - - If one of these two fields is completely missing, a warning will be generated, and the missing field will be imputed using the other one. - - if some records have missing gene_id and/or gene_name, a warning will be printed, and the missing values will be imputed by the following rules: For records miss gene_id or gene_name, impute the missing one using the other one; If both are missing, impute them using transcript_id, which cannot be missing. - 3. If there is no "transcript" or "gene" feature record, a warning will be printed. Moreover, those missing records will be imputed using the "exon" feature records: The Start and End site of the gene/transcript will be imputed as the bounds of their corresponding exons. - 4. If the boundaries defined in the transcripts'/genes' feature records do not match those implied by their exons' feature records, report a warning but still use transcripts'/genes' feature records to extract unspliced sequences. To be specific, if some but not all transcripts/genes have their corresponding transcripts'/genes' feature records, or the Start and/or End site defined in the transcript/gene feature records do not match the corresponding exons' bounds, then the existing transcripts'/genes' feature records will be used to extract unspliced transcripts. At the same time, in the clean_gtf.gtf, all genes/transcripts that appeared in the exon feature records will have their corresponding transcripts'/genes' feature records, in which the boundaries match the corresponding exons' bounds. - - Args: - gr (`PyRanges`): A stranded PyRanges object - - Return: - `PyRanges`: A PyRanges with missing values imputed if possible. - - Besides, a clean_gtf.gtf file will be generated in the output directory - if there is any invalid records in the input GTF file. - """ - - # split gene type records with others - # we don't use gene records in splici construction - clean_gtf_path = os.path.join(output_dir, "clean_gtf.gtf") - clean_gr = pr.PyRanges() - - # If required fields are missing, quit - if "transcript_id" not in gr.columns: - logging.critical( - " The input GTF file doesn't contain transcript_id metadata field; Cannot proceed." - ) - - if "Feature" not in gr.columns: - logging.critical( - " The input GTF file doesn't contain feature field; Cannot proceed." - ) - - if "gene_id" not in gr.columns: - # use gene_name as gene_id if exists, return an error otherwise - if "gene_name" not in gr.columns: - logging.error( - " The input GTF file doesn't contain gene_id and gene_name metadata field; Cannot proceed." - ) - else: - logging.warning( - " The gene_id field does not exist; Imputing using gene_name." 
- ) - gene_id = pd.Series(data=gr.gene_name, name="gene_id") - gr = gr.insert(gene_id) - - if "gene_name" not in gr.columns: - logging.warning("The gene_name field does not exist; Imputing using gene_id.") - gene_name = pd.Series(data=gr.gene_id, name="gene_name") - gr = gr.insert(gene_name) - - # keep only the fields we need - gr = gr[ - [ - "Chromosome", - "Feature", - "Start", - "End", - "Strand", - "gene_id", - "gene_name", - "transcript_id", - ] - ] - - # If there is any NaN in transcript_id field, quit - # gene features don't have transcript_id, ignore - - if gr[gr.Feature == "exon"].transcript_id.isnull().any(): - # first, write an clean GTF if needed - clean_gr = gr[np.logical_and(gr.transcript_id.notnull(), gr.Feature != "gene")] - clean_gr.to_gtf(clean_gtf_path) - # Then, raise a value error - logging.critical( - "".join( - [ - " Found missing value in exons' transcript ID; Cannot proceed." - f" An clean GTF file without missing transcript_id records is written to {clean_gtf_path}.", - " If needed, rerun using the clean GTF file", - ] - ) - ) - - # Impute missing gene_id and gene_name values - # define an object - num_nan = gr.df[["gene_id", "gene_name"]].isnull().sum(axis=1) - - if num_nan.any(): - # create intermediate df - gene_df = gr.df[["gene_id", "gene_name"]] - - # Firstly, write the problematic records to a file before imputation - problematic_gtf_path = os.path.join( - output_dir, "missing_gene_id_or_name_records.gtf" - ) - gr[num_nan != 0].to_gtf(problematic_gtf_path) - missing_record_msg = f" Found records with missing gene_id/gene_name field. These records are reported in {problematic_gtf_path}." - - # impute double missing using transcript_id - double_missing = num_nan == 2 - if double_missing.sum(): - gene_df.loc[num_nan == 2, "gene_id"] = gr.transcript_id[num_nan == 2] - gene_df.loc[num_nan == 2, "gene_name"] = gr.transcript_id[num_nan == 2] - double_missing_msg = f" Imputed {(num_nan == 2).sum()} records with missing 'gene_id' and 'gene_name' using transcript_id." - else: - double_missing_msg = "" - - # If one field is missing, impute using the other - # missing only gene_id - gene_id_missing = gene_df["gene_id"].isnull() - if gene_id_missing.sum(): - gene_df.loc[gene_df["gene_id"].isnull(), "gene_id"] = gene_df.loc[ - gene_df["gene_id"].isnull(), "gene_name" - ] - gene_id_missing_msg = ( - f" Imputed {gene_id_missing.sum()} missing gene_id using gene_name." - ) - else: - gene_id_missing_msg = "" - - gene_name_missing = gene_df["gene_name"].isnull() - if gene_name_missing.sum(): - gene_df.loc[gene_df["gene_name"].isnull(), "gene_name"] = gene_df.loc[ - gene_df["gene_name"].isnull(), "gene_id" - ] - gene_name_missing_msg = ( - f" Imputed {gene_name_missing.sum()} missing gene_name using gene_id." - ) - else: - gene_name_missing_msg = "" - - # write the warning message - logging.warning( - "".join( - [ - missing_record_msg, - double_missing_msg, - gene_id_missing_msg, - gene_name_missing_msg, - ] - ) - ) - - # replace the old gene_id and gene_name fields by imputed. 
- gr = gr.drop(["gene_id", "gene_name"]) - gr = gr.insert(gene_df) - - # Then, records all exon records and gene records - clean_gr = pr.concat([clean_gr, gr[gr.Feature == "exon"]]) - - # check if the transcripts and genes are well defined - # first, we get the transcript annotation from exons and from the transcript feature records - # from GTF - transcript_gr = gr[gr.Feature == "transcript"].sort(["transcript_id"]) - gene_gr = gr[gr.Feature == "gene"].sort(["gene_id"]) - - # from exons - transcript_bound_from_exons = ( - gr[gr.Feature == "exon"] - .boundaries(group_by=["transcript_id", "gene_id", "gene_name"]) - .sort(["transcript_id"]) - ) - transcript_bound_from_exons.Feature = "transcript" - - gene_bound_from_exons = ( - gr[gr.Feature == "exon"] - .boundaries(group_by=["gene_id", "gene_name"]) - .sort(["gene_id"]) - ) - gene_bound_from_exons.Feature = "gene" - - # define clean transcript and gene gr - clean_transcript_gr = pr.PyRanges() - clean_gene_gr = pr.PyRanges() - - # If there is no transcript or annotation, we - # 1. give a warning, - # 2. using exons' bounds as the bound of transcripts/genes to extract unspliced sequences - # 3. write those bounds to the clean gtf file - - if transcript_gr.empty or gene_gr.empty: - if transcript_gr.empty: - logging.warning( - "".join( - [ - " The given GTF file doesn't have transcript feature records;", - " Imputing using exon feature records.", - ] - ) - ) - - transcript_gr = transcript_bound_from_exons - clean_transcript_gr = transcript_bound_from_exons - gr = pr.concat([gr, transcript_bound_from_exons]) - - if gene_gr.empty: - logging.warning( - "".join( - [ - " The given GTF file doesn't have gene feature records;", - " Imputing using exon feature records.", - ] - ) - ) - - gene_gr = gene_bound_from_exons - clean_gene_gr = gene_bound_from_exons - gr = pr.concat([gr, gene_bound_from_exons]) - else: - # If some of them are missing, we report a warning - # and use the transcripts' bounds in the original GTF file to extract introns, - # but impute the missing annotations in the clean GTF file. - # We will say in the warning message that if the users want to - # use the annotations we generated, - # they should rerun pyroe using the clean GTF file. - - # if some transcripts don't have exons - if transcript_gr.length > transcript_bound_from_exons.length: - # pyranges will ignore it anyway. Here I filter them out manually. 
- transcript_gr = transcript_gr[ - transcript_gr.transcript_id.isin( - transcript_bound_from_exons.transcript_id - ) - ] - - # complain - logging.warning(" Found transcript(s) without exons; Ignored.") - clean_transcript_gr = transcript_bound_from_exons - - # if some transcripts don't have features - elif transcript_gr.length < transcript_bound_from_exons.length: - clean_transcript_gr = transcript_bound_from_exons - - # complain - logging.warning( - "".join( - [ - " Found transcripts without corresponding transcript feature record;", - " Those transcripts were not used to extract unspliced sequences.", - ] - ) - ) - - # if some genes don't have exons - if gene_gr.length > gene_bound_from_exons.length: - gene_gr = gene_gr[gene_gr.gene_id.isin(gene_bound_from_exons.gene_id)] - - # complain - logging.warning("".join([" Found gene(s) without exons; Ignored."])) - clean_gene_gr = gene_bound_from_exons - - # if some genes don't have features - elif gene_gr.length < gene_bound_from_exons.length: - clean_gene_gr = gene_bound_from_exons - - # complain - logging.warning( - "".join( - [ - " Found genes without corresponding gene feature record.", - " Those genes were not used to extract unspliced sequences.", - ] - ) - ) - - # If the transcripts'/genes' bounds defined in the original GTF file - # and those found manually (using exons' bounds) are different, - # we report a warning and extract unspliced sequences - # using the transcripts'/genes' annotation in the original GTF file, - # but use manually defined transcript annotations (from their exons' bounds) - # in the clean GTF file. - - # transcripts - intersecting_txs = set(transcript_gr.transcript_id).intersection( - set(transcript_bound_from_exons.transcript_id) - ) - - if not transcript_gr[transcript_gr.transcript_id.isin(intersecting_txs)][ - ["Start", "End"] - ].df.equals( - transcript_bound_from_exons[ - transcript_bound_from_exons.transcript_id.isin(intersecting_txs) - ][["Start", "End"]].df - ): - clean_transcript_gr = transcript_bound_from_exons - - logging.warning( - "".join( - [ - " Found transcripts whose boundaries defined in their transcript feature record do not match their exons' bounds.", - " However, those boundaries were still used to extract unspliced sequences.", - ] - ) - ) - - # genes - intersecting_gs = set(gene_gr.gene_id).intersection( - set(gene_bound_from_exons.gene_id) - ) - - if not gene_gr[gene_gr.gene_id.isin(intersecting_gs)][ - ["Start", "End"] - ].df.equals( - gene_bound_from_exons[gene_bound_from_exons.gene_id.isin(intersecting_gs)][ - ["Start", "End"] - ].df - ): - clean_gene_gr = gene_bound_from_exons - - logging.warning( - "".join( - [ - " Found genes whose boundaries defined in the gene feature records do not equal to their exons' bounds.", - " However, those boundaries were still used to extract unspliced sequences.", - ] - ) - ) - - # if clean_gr is not empty, write it - clean_gr = pr.concat([clean_gr, clean_transcript_gr, clean_gene_gr]) - if not clean_gr.empty: - clean_gr.to_gtf(clean_gtf_path) - clean_gtf_msg = f" A clean GTF file with all issues fixed is generated at {clean_gtf_path}. If needed, please rerun using this clean GTF file." 
- logging.warning(clean_gtf_msg) - - # return imputed gr - return gr - - -def check_bedtools_version(bt_path): - try: - vstr = ( - subprocess.run([bt_path, "--version"], capture_output=True) - .stdout.decode() - .strip() - .split("v")[1] - ) - found_ver = parse_version(vstr) - req_ver = parse_version("2.30.0") - return found_ver >= req_ver - except Exception as err: - # in this case couldn't even run subprocess - logging.warning(f" Cannot check bedtools version. The error message was: {err}") - return False - - -def make_splici_txome( - genome_path, - gtf_path, - read_length, - output_dir, - flank_trim_length=5, - filename_prefix="splici", - extra_spliced=None, - extra_unspliced=None, - dedup_seqs=False, - no_bt=False, - bt_path="bedtools", - no_flanking_merge=False, -): - """ - Construct the splici (spliced + introns) transcriptome for alevin-fry. - - Required Parameters - ---------- - genome_path : str - The path to a genome fasta file. - - gtf_path : str - The path to a gtf file. - - read_length : int - The read length of the single-cell experiment being processed. - - output_dir : str - The output directory, where the splici reference files will - be written. - - Optional Parameters - ---------- - - flank_trim_length : int (default: `5`) - The flank trimming length. - The final flank length is obtained by subtracting - the flank_trim_length from the read_length. - - filename_prefix : str (default: `splici`) - The file name prefix of the generated output files. - The derived flank length will be automatically - appended to the provided prefix. - - extra_spliced : str - A path to a fasta file. The records in this fasta file will be - regarded as spliced transcripts. - - extra_unspliced : str - The path to a fasta file. The records in this fasta file will be - regarded as introns. - - dedup_seqs : bool (default: `False`) - If True, the repeated sequences in the splici reference will be - deduplicated. - - no_bt : bool (default: `False`) - If true, biopython, instead of bedtools, will be used for - generating splici reference files. - - bt_path : str - The path to bedtools v2.30.0 or greater if it is not in the environment PATH. - - no_flanking_merge : bool (default: `False`) - If true, overlapping introns caused by the added flanking length will not be merged. - - Returns - ------- - Nothing will be returned. The splici reference files will be written - to disk. - - Notes - ----- - * The input GTF file will be processed before extracting unspliced sequences. If pyroe finds invalid records, a `clean_gtf.gtf` file will be generated in the specified output directory. **Note** : The features extracted in the spliced + intronic transcriptome will not necessarily be those present in the `clean_gtf.gtf` file — as this command will prefer the input in the user-provided file wherever possible. More specifically: - * If the required metadata fields contain missing values, pyroe will impute them if possible, or return an error if not. - * **Pyroe will always extract unspliced sequences according to the boundaries defined in the transcript/gene feature records unless there is no transcript/gene feature record in the GTF file.** In this case, pyroe imputes all transcripts/genes boundaries as the bounds of the corresponding exons to extract unspliced sequences. - * If the transcript/gene feature records do not match their exon feature records, pyroe will still use transcript/gene feature records, but correct those transcript/gene feature records in the `celan_grf.gtf` according to exon feature records. 
-    # Preparation
-
-    # check flanking length
-    flank_length = read_length - flank_trim_length
-
-    if flank_length < 0:
-        logging.critical(" Flank trim length cannot be larger than read length!")
-
-    # check fasta file
-    if not os.path.isfile(genome_path):
-        logging.critical(" Cannot open the input fasta file!")
-
-    # check gtf file
-    if not os.path.isfile(gtf_path):
-        logging.critical(" Cannot open the input gtf file!")
-
-    # check bedtools
-    if not no_bt:
-        # check at the provided path
-        if not check_bedtools_version(bt_path):
-            # if it's not ok at the provided path, check
-            # the standard system path
-            if bt_path == "bedtools":
-                # in this case, there's nowhere else to check
-                # so give up on bedtools
-                logging.warning(
-                    "".join(
-                        [
-                            " Bedtools in the environment PATH is either",
-                            " older than v2.30.0 or doesn't exist.",
-                            " Biopython will be used to extract sequences.",
-                        ]
-                    )
-                )
-                no_bt = True
-            else:
-                # note: the message fragments must be joined into one string;
-                # passing them as separate arguments to logging.warning()
-                # would treat them as format arguments
-                logging.warning(
-                    "".join(
-                        [
-                            " Bedtools specified by bt_path is either",
-                            " older than v2.30.0 or doesn't exist.",
-                            " Trying to find bedtools in the environment PATH.",
-                        ]
-                    )
-                )
-                # if it's not ok at the standard system path,
-                # fall back to biopython
-                if not check_bedtools_version("bedtools"):
-                    logging.warning(
-                        "".join(
-                            [
-                                " Bedtools in the environment PATH is either",
-                                " older than v2.30.0 or doesn't exist.",
-                                " Biopython will be used to extract sequences.",
-                            ]
-                        )
-                    )
-                    no_bt = True
-                # found it at the system path
-                else:
-                    bt_path = "bedtools"
-                    logging.warning(" Using bedtools in the environment PATH.")
-
-    # create the output folder (a temp folder is created inside it later)
-    if not os.path.exists(output_dir):
-        os.makedirs(output_dir)
-
-    # specify output file names
-    filename_prefix = filename_prefix + "_fl" + str(flank_length)
-    out_fa = os.path.join(output_dir, filename_prefix + ".fa")
-    out_t2g3col = os.path.join(output_dir, filename_prefix + "_t2g_3col.tsv")
-    id2name_path = os.path.join(output_dir, "gene_id_to_name.tsv")
-
-    # load gtf
-    try:
-        gr = pr.read_gtf(gtf_path)
-    except Exception as err:
-        # in this case we couldn't even read the GTF file
-        logging.error(
-            "".join(
-                [
-                    " PyRanges failed to parse the input GTF file.",
-                    " Please check the PyRanges documentation for the expected GTF format constraints at",
-                    " https://pyranges.readthedocs.io/en/latest/autoapi/pyranges/readers/index.html?highlight=read_gtf#pyranges.readers.read_gtf .",
-                    f" The error message was: {str(err)}",
-                ]
-            ),
-            exc_info=True,
-        )
-        # re-raise: gr would be undefined below if parsing failed
-        raise
-
-    # check the validity of gr
-    gr = check_gr(gr, output_dir)
-
-    # write the gene id to name tsv file
-    gr.df[["gene_id", "gene_name"]].drop_duplicates().to_csv(
-        id2name_path, sep="\t", header=False, index=False
-    )
-
-    # get introns
-    # the introns() function uses the inplace=True argument of pandas,
-    # which triggers a FutureWarning
-    warnings.simplefilter(action="ignore", category=FutureWarning)
-    introns = gr.features.introns(by="transcript")
-    warnings.simplefilter(action="default", category=FutureWarning)
-
-    introns.Name = introns.gene_id
-
-    if no_flanking_merge:
-        introns = introns.merge(strand=True, by=["Name"], slack=0)
-
-    introns = introns.extend(flank_length)
-
-    if not no_flanking_merge:
-        introns = introns.merge(strand=True, by=["Name"], slack=0)
-
-    introns.Gene = introns.Name
-    # name the introns of each gene: the first is "<gene_id>-I",
-    # subsequent ones are "<gene_id>-I1", "<gene_id>-I2", ...
-    introns.Name = [
-        "-I".join(map(str, z))
-        for z in zip(
-            introns.Name,
-            introns.Name.groupby(introns.Name)
-            .cumcount()
-            .astype(str)
-            .replace("0", "")
-            .values,
-        )
-    ]
-
-    # compute chromosome sizes so introns extending beyond
-    # chromosome ends can be clipped
-    with open(genome_path) as fasta_file:
-        chromsize = {
-            title.split()[0]: len(sequence)
-            for title, sequence in SimpleFastaParser(fasta_file)
-        }
-
-    # in case the genome and gene annotations do not match,
-    # try it and log an error if this fails
-    try:
-        introns = pr.gf.genome_bounds(introns, chromsize, clip=True)
-    except Exception as err:
-        logging.error(
-            "".join(
-                [
-                    " Failed to refine intron bounds using genome bounds.",
-                    " Please check if the input genome FASTA file and GTF file match each other, especially the chromosome names.",
-                    f" The error message was: {str(err)}",
-                ]
-            ),
-            exc_info=True,
-        )
-
-    # deduplicate introns
-    if dedup_seqs:
-        # drop_duplicate_positions() returns a new PyRanges object,
-        # so the result must be assigned back
-        introns = introns.drop_duplicate_positions()
-
-    # add splice status for introns
-    introns.splice_status = "U"
-
-    introns = introns[bed_required_fields]
-
-    # get exons
-    exons = gr[gr.Feature == "exon"]
-
-    exons.Name = exons.transcript_id
-    exons.Gene = exons.gene_id
-    exons = exons.sort(["Name", "Start", "End"])
-    # add splice status for exons
-    exons.splice_status = "S"
-    # keep only the required fields
-    exons = exons[bed_required_fields]
-
-    # concatenate spliced transcripts and introns as splici
-    splici = pr.concat([exons, introns])
-
-    # write to files
-    # t2g_3col.tsv
-    splici.df[["Name", "Gene", "splice_status"]].drop_duplicates().to_csv(
-        out_t2g3col, sep="\t", header=False, index=False
-    )
-    tid2strand = dict(zip(splici.Name, splici.Strand))
-
-    # splici fasta
-    if not no_bt:
-        try:
-            # create temp folder
-            temp_dir = os.path.join(output_dir, "temp")
-            if not os.path.exists(temp_dir):
-                os.makedirs(temp_dir)
-            temp_fa = os.path.join(temp_dir, "temp.fa")
-            temp_bed = os.path.join(temp_dir, "temp.bed")
-
-            # write bed file
-            splici.to_bed(temp_bed, keep=True)
-
-            # run bedtools, ignoring strand for now; the reverse
-            # complement of minus-strand records is taken below
-            bt_r = subprocess.run(
-                " ".join(
-                    [
-                        bt_path,
-                        "getfasta",
-                        "-fi",
-                        genome_path,
-                        "-fo",
-                        temp_fa,
-                        "-bed",
-                        temp_bed,
-                        # "-s",
-                        "-nameOnly",
-                    ]
-                ),
-                shell=True,
-                capture_output=True,
-            )
-
-            # check the return code; raising here triggers the
-            # biopython fallback in the except block below
-            if bt_r.returncode != 0:
-                raise RuntimeError(" Bedtools failed.")
-
-            # parse the temp fasta file to concatenate the exons of each transcript
-            ei_parser = SeqIO.parse(temp_fa, "fasta")
-            prev_rec = next(ei_parser)
-            prev_rec.description = ""
-            with open(out_fa, "w") as out_handle:
-                for seq_record in ei_parser:
-                    seq_record.description = ""
-                    if seq_record.id == prev_rec.id:
-                        prev_rec += seq_record
-                    else:
-                        if tid2strand[prev_rec.id] == "-":
-                            prev_rec = prev_rec.reverse_complement(
-                                id=True, description=True
-                            )
-                        SeqIO.write(prev_rec, out_handle, "fasta")
-                        prev_rec = seq_record
-                # don't forget the last record
-                if tid2strand[prev_rec.id] == "-":
-                    prev_rec = prev_rec.reverse_complement(id=True, description=True)
-                SeqIO.write(prev_rec, out_handle, "fasta")
-            shutil.rmtree(temp_dir, ignore_errors=True)
-        except Exception as err:
-            no_bt = True
-            logging.warning(
-                f" Bedtools failed; using biopython instead. The error message was: \n{err}"
-            )
-            shutil.rmtree(temp_dir, ignore_errors=True)
-
-    if no_bt:
-        with open(out_fa, "w") as out_handle:
-            # read the genome fasta, processing one chromosome at a time
-            for seq_record in SeqIO.parse(genome_path, "fasta"):
-                # get all intron records on this chromosome
-                chr_records = introns[introns.Chromosome == seq_record.id].df
-                if not chr_records.empty:
-                    chr_records.Strand = chr_records.Strand.replace(
-                        ["+", "-"], [+1, -1]
-                    )
-                    # init seq list
-                    intron_seqs = []
-                    # for each intron record
-                    for idx, intron_record in chr_records.iterrows():
-                        # create a SeqFeature object for extracting the
-                        # sequence from the chromosome
-                        intron_feature = SeqFeature(
-                            FeatureLocation(intron_record.Start, intron_record.End),
-                            type="intron",
-                            id=intron_record.Name,
-                        )
-                        intron_feature.strand = intron_record.Strand
-
-                        # append the intron sequence to the seq list,
-                        # specifying its name as well
-                        intron_seqs.append(
-                            SeqRecord(
-                                intron_feature.extract(seq_record).seq,
-                                id=intron_record.Name,
-                                description="",
-                            )
-                        )
-                    # finally, write all intron sequences at once
-                    SeqIO.write(intron_seqs, out_handle, "fasta")
-
-                # then, process the spliced transcripts
-                chr_records = exons[exons.Chromosome == seq_record.id].df
-                if not chr_records.empty:
-                    txp_seqs = []
-                    # as spliced txps are the concatenation of all of their
-                    # exon sequences, first get the sequence of each exon
-                    # separately, then sum them up
-                    for tid, exon_records in chr_records.groupby("Name"):
-                        # init exon seq list
-                        exon_seqs = []
-                        # get the sequence of each exon
-                        for idx, exon_record in exon_records.iterrows():
-                            # create a SeqFeature object for the exon record;
-                            # ignore strand for now, take the reverse
-                            # complement later if needed
-                            exon_feature = SeqFeature(
-                                FeatureLocation(exon_record.Start, exon_record.End),
-                                type="exon",
-                            )
-                            # extract the exon sequence from the chromosome
-                            # and append it to the exon seq list
-                            exon_seqs.append(
-                                SeqRecord(
-                                    exon_feature.extract(seq_record).seq,
-                                    id=tid,
-                                    description="",
-                                )
-                            )
-                        # append the txp sequence to the spliced txp seq list,
-                        # taking strand into account
-                        if tid2strand[tid] == "-":
-                            txp_seqs.append(
-                                sum(exon_seqs, Seq("")).reverse_complement(
-                                    id=True, description=True
-                                )
-                            )
-                        else:
-                            txp_seqs.append(sum(exon_seqs, Seq("")))
-                    # write all spliced transcript sequences at once
-                    SeqIO.write(txp_seqs, out_handle, "fasta")
-
-    # append extra spliced transcripts onto splici
-    if extra_spliced is not None:
-        append_extra(extra_spliced, out_fa, out_t2g3col, id2name_path, "S")
-
-    # append extra unspliced transcripts onto splici
-    if extra_unspliced is not None:
-        append_extra(extra_unspliced, out_fa, out_t2g3col, id2name_path, "U")
-
-    if dedup_seqs:
-        # Note: out_fa is intentionally passed as both the input and output
-        # file name parameters because we want to overwrite the FASTA
-        # containing duplicates with the deduplicated FASTA.
-        dedup_sequences(output_dir, out_fa, out_fa)
-
-
-def make_spliceu_txome(
-    genome_path,
-    gtf_path,
-    output_dir,
-    filename_prefix="spliceu",
-    extra_spliced=None,
-    extra_unspliced=None,
-    dedup_seqs=False,
-    no_bt=False,
-    bt_path="bedtools",
-):
-    """
-    Construct the spliceu (spliced + unspliced) transcriptome for alevin-fry.
-
-    Required Parameters
-    ----------
-    genome_path : str
-        The path to a genome fasta file.
-
-    gtf_path : str
-        The path to a gtf file.
-
-    output_dir : str
-        The output directory, where the spliceu reference files will
-        be written.
-
-    Optional Parameters
-    ----------
-    filename_prefix : str (default: `spliceu`)
-        The file name prefix of the generated output files.
-
-    extra_spliced : str
-        The path to a fasta file. The records in this fasta file will be
-        regarded as spliced transcripts.
-
-    extra_unspliced : str
-        The path to a fasta file. The records in this fasta file will be
-        regarded as unspliced transcripts.
-
-    dedup_seqs : bool (default: `False`)
-        If True, identical sequences in the spliceu reference will be
-        deduplicated.
-
-    no_bt : bool (default: `False`)
-        If True, biopython, instead of bedtools, will be used for
-        generating the spliceu reference files.
-
-    bt_path : str
-        The path to bedtools v2.30.0 or greater, if it is not in the
-        environment PATH.
-
-    Returns
-    -------
-    Nothing will be returned. The spliceu reference files will be written
-    to disk.
-
-    Notes
-    -----
-    * The input GTF file will be processed before extracting unspliced
-      sequences. If pyroe finds invalid records, a `clean_gtf.gtf` file will
-      be generated in the specified output directory. **Note**: the features
-      extracted in the spliced + unspliced transcriptome will not necessarily
-      be those present in the `clean_gtf.gtf` file, because this command
-      prefers the input in the user-provided file wherever possible.
-      More specifically:
-        * If the required metadata fields contain missing values, pyroe will
-          impute them if possible, or return an error if not.
-        * **Pyroe will always extract unspliced sequences according to the
-          boundaries defined in the transcript/gene feature records unless
-          there is no transcript/gene feature record in the GTF file.** In
-          that case, pyroe imputes each transcript/gene boundary as the
-          bounds of its corresponding exons to extract unspliced sequences.
-        * If the transcript/gene feature records do not match their exon
-          feature records, pyroe will still use the transcript/gene feature
-          records, but will correct them in the `clean_gtf.gtf` according to
-          the exon feature records.
-    * If using bedtools, a temp.bed and a temp.fa will be created and
-      then deleted. These two files encode the unspliced sequence of each
-      gene and the exons of each transcript of each gene.
- - """ - # Preparation - # check fasta file - if not os.path.isfile(genome_path): - logging.critical(" Cannot open the input fasta file!") - - # check gtf file - if not os.path.isfile(gtf_path): - logging.critical(" Cannot open the input gtf file!") - - # check bedtools - if not no_bt: - # check at the provided path - if not check_bedtools_version(bt_path): - # if it's not ok at the provided path, check - # the standard system path - if bt_path == "bedtools": - # in this case, there's nowhere else to check - # so give up on bedtools - logging.warning( - "".join( - [ - " Bedtools in the environemnt PATH is either", - " older than v.2.30.0 or doesn't exist.", - " Biopython will be used.", - ] - ) - ) - no_bt = True - else: - logging.warning( - "".join( - [ - "Bedtools specified by bt_path is either", - "older than v.2.30.0 or doesn't exist.", - "Trying to find bedtools in the environmental PATH.", - ] - ) - ) - # if it's not ok at the standard system path - # fallback to biopython - if not check_bedtools_version("bedtools"): - logging.warning( - "".join( - [ - " Bedtools in the environemnt PATH is either", - " older than v.2.30.0 or doesn't exist.", - " Biopython will be used.", - ] - ) - ) - no_bt = True - # found it at the system path - else: - bt_path = "bedtools" - logging.warning(" Using bedtools in the environmental PATH.") - - # create out folder and temp folder inside - # create output folder - if not os.path.exists(output_dir): - os.makedirs(output_dir) - - # specify output file names - out_fa = os.path.join(output_dir, filename_prefix + ".fa") - out_t2g3col = os.path.join(output_dir, filename_prefix + "_t2g_3col.tsv") - out_t2g = os.path.join(output_dir, filename_prefix + "_t2g.tsv") - out_g2g = os.path.join(output_dir, filename_prefix + "_g2g.tsv") - id2name_path = os.path.join(output_dir, "gene_id_to_name.tsv") - - # load gtf - try: - gr = pr.read_gtf(gtf_path) - except ValueError as err: - # in this case couldn't even run subprocess - logging.error( - "".join( - [ - " PyRanges failed to parse the input GTF file.", - " Please check the PyRanges documentation for", - " the expected GTF format constraints at", - " https://pyranges.readthedocs.io/en/latest/autoapi/pyranges/readers/index.html?highlight=read_gtf#pyranges.readers.read_gtf .", - f" The error message was: {str(err)}", - ] - ), - exc_info=True, - ) - - # check the validity of gr - gr = check_gr(gr, output_dir) - - # write gene id to name tsv file - gr.df[["gene_id", "gene_name"]].drop_duplicates().to_csv( - id2name_path, sep="\t", header=False, index=False - ) - - # get unspliced - unspliced = gr[gr.Feature == "gene"] - # unspliced = gr.boundaries("gene_id") - unspliced.Name = unspliced.gene_id + "-I" - unspliced.Gene = unspliced.gene_id - # add splice status for unspliced - unspliced.splice_status = "U" - # keep only required fields - unspliced = unspliced[bed_required_fields] - - # get exons - exons = gr[gr.Feature == "exon"] - - exons.Name = exons.transcript_id - exons.Gene = exons.gene_id - - exons = exons.sort(["Name", "Start", "End"]) - # add splice status for exons - exons.splice_status = "S" - # keep only required fields - exons = exons[bed_required_fields] - - # concat spliced transcripts and unspliced as spliceu - spliceu = pr.concat([exons, unspliced]) - - # write to files - # t2g_3col.tsv - t2g_3col = spliceu.df[["Name", "Gene", "splice_status"]].drop_duplicates() - t2g_3col.to_csv(out_t2g3col, sep="\t", header=False, index=False) - - # t2g.csv - t2g_3col[["Name", "Gene"]].to_csv(out_t2g, sep="\t", 
header=False, index=False) - - # g2g.csv - t2g_3col[["Gene", "Gene"]].to_csv(out_g2g, sep="\t", header=False, index=False) - - tid2strand = dict(zip(spliceu.Name, spliceu.Strand)) - - # spliceu fasta - if not no_bt: - try: - # create temp folder - temp_dir = os.path.join(output_dir, "temp") - if not os.path.exists(temp_dir): - os.makedirs(temp_dir) - temp_fa = os.path.join(temp_dir, "temp.fa") - temp_bed = os.path.join(temp_dir, "temp.bed") - - # write bed file - spliceu.to_bed(temp_bed, keep=True) - - # run bedtools, ignore strand for now - bt_r = subprocess.run( - " ".join( - [ - bt_path, - "getfasta", - "-fi", - genome_path, - "-fo", - temp_fa, - "-bed", - temp_bed, - # "-s", - "-nameOnly", - ] - ), - shell=True, - capture_output=True, - ) - - # check return code - if bt_r.returncode != 0: - logging.exception("Bedtools failed.", exc_info=True) - - # parse temp fasta file to concat exons of each transcript - ei_parser = SeqIO.parse(temp_fa, "fasta") - prev_rec = next(ei_parser) - # prev_rec.id = prev_rec.id.split("(")[0] - prev_rec.description = "" - with open(out_fa, "w") as out_handle: - - for seq_record in ei_parser: - # seq_record.id = seq_record.id.split("(")[0] - seq_record.description = "" - if seq_record.id == prev_rec.id: - prev_rec += seq_record - - else: - if tid2strand[prev_rec.id] == "-": - prev_rec = prev_rec.reverse_complement( - id=True, description=True - ) - SeqIO.write(prev_rec, out_handle, "fasta") - prev_rec = seq_record - # Don't forget our last customer - if tid2strand[prev_rec.id] == "-": - prev_rec = prev_rec.reverse_complement(id=True, description=True) - SeqIO.write(prev_rec, out_handle, "fasta") - shutil.rmtree(temp_dir, ignore_errors=True) - except Exception as err: - no_bt = True - logging.warning( - f" Bedtools failed; Using biopython instead. The error message was: {err}" - ) - shutil.rmtree(temp_dir, ignore_errors=True) - - if no_bt: - with open(out_fa, "w") as out_handle: - # read fasta, process a chromosome at a time - for seq_record in SeqIO.parse(genome_path, "fasta"): - # get all records on that chromosome - chr_records = unspliced[unspliced.Chromosome == seq_record.id].df - if not chr_records.empty: - chr_records.Strand = chr_records.Strand.replace( - ["+", "-"], [+1, -1] - ) - # init seq list - unspliced_seqs = [] - # for each unspliced record - for idx, unspliced_record in chr_records.iterrows(): - # create Seqeture object for extracting sequence from chromosome - unspliced_feature = SeqFeature( - FeatureLocation( - unspliced_record.Start, unspliced_record.End - ), - type="unspliced", - id=unspliced_record.Name, - ) - unspliced_feature.strand = unspliced_record.Strand - # append the unspliced sequence to the seq list, specify name as well - unspliced_seqs.append( - SeqRecord( - unspliced_feature.extract(seq_record).seq, - id=unspliced_record.Name, - description="", - ) - ) - # finally, write all unspliced sequences at once. - SeqIO.write(unspliced_seqs, out_handle, "fasta") - - # Then, process spliced transcripts - chr_records = exons[exons.Chromosome == seq_record.id].df - if not chr_records.empty: - txp_seqs = [] - # as spliced txps are the concat of all exon sequences, fist get the sequence of each exon separately,then sum them up. 
-                    for tid, exon_records in chr_records.groupby("Name"):
-                        # init exon seq list
-                        exon_seqs = []
-                        # get the sequence of each exon
-                        for idx, exon_record in exon_records.iterrows():
-                            # create a SeqFeature object for the exon record;
-                            # ignore strand for now, take the reverse
-                            # complement later if needed
-                            exon_feature = SeqFeature(
-                                FeatureLocation(exon_record.Start, exon_record.End),
-                                type="exon",
-                            )
-                            # extract the exon sequence from the chromosome
-                            # and append it to the exon seq list
-                            exon_seqs.append(
-                                SeqRecord(
-                                    exon_feature.extract(seq_record).seq,
-                                    id=tid,
-                                    description="",
-                                )
-                            )
-                        # append the txp sequence to the spliced txp seq list,
-                        # taking strand into account
-                        if tid2strand[tid] == "-":
-                            txp_seqs.append(
-                                sum(exon_seqs, Seq("")).reverse_complement(
-                                    id=True, description=True
-                                )
-                            )
-                        else:
-                            txp_seqs.append(sum(exon_seqs, Seq("")))
-                    # write all spliced transcript sequences at once
-                    SeqIO.write(txp_seqs, out_handle, "fasta")
-
-    # append extra spliced transcripts onto spliceu
-    if extra_spliced is not None:
-        append_extra(extra_spliced, out_fa, out_t2g3col, id2name_path, "S")
-
-    # append extra unspliced transcripts onto spliceu
-    if extra_unspliced is not None:
-        append_extra(extra_unspliced, out_fa, out_t2g3col, id2name_path, "U")
-
-    if dedup_seqs:
-        # Note: out_fa is intentionally passed as both the input and output
-        # file name parameters because we want to overwrite the FASTA
-        # containing duplicates with the deduplicated FASTA.
-        dedup_sequences(output_dir, out_fa, out_fa)
diff --git a/tests/test_make_spliceu.py b/tests/test_make_spliceu.py
deleted file mode 100644
index 0da4eb5..0000000
--- a/tests/test_make_spliceu.py
+++ /dev/null
@@ -1,126 +0,0 @@
-from pyroe.make_txome import make_spliceu_txome
-import tempfile
-import os
-import pandas as pd
-from Bio import SeqIO
-from Bio.Seq import Seq
-
-
-def test_make_spliceu():
-    location = os.path.dirname(os.path.realpath(__file__))
-    test_file_dir = os.path.join(location, "data", "test_make_splici")
-    genome_path = os.path.join(test_file_dir, "small_example_genome.fa")
-    gtf_path = os.path.join(test_file_dir, "small_example.gtf")
-    filename_prefix = "spliceu"
-    extra_spliced = os.path.join(test_file_dir, "extra_spliced.txt")
-    extra_unspliced = os.path.join(test_file_dir, "extra_unspliced.txt")
-    output_dir = tempfile.TemporaryDirectory()
-    make_spliceu_txome(
-        genome_path=genome_path,
-        gtf_path=gtf_path,
-        extra_spliced=extra_spliced,
-        extra_unspliced=extra_unspliced,
-        output_dir=output_dir.name,
-        filename_prefix=filename_prefix,
-        no_bt=True,
-    )
-
-    out_fa = os.path.join(output_dir.name, "".join([filename_prefix, ".fa"]))
-    out_t2g = os.path.join(output_dir.name, "".join([filename_prefix, "_t2g_3col.tsv"]))
-    genome = SeqIO.to_dict(SeqIO.parse(genome_path, "fasta"))
-    spliceu_seqs = SeqIO.to_dict(SeqIO.parse(out_fa, "fasta"))
-    t2g_colnames = ["txp_name", "gene_name", "splice_status"]
-    t2g_3col = pd.read_csv(
-        out_t2g, sep="\t", header=None, names=t2g_colnames
-    ).set_index(t2g_colnames[0])
-
-    # Define expected sequences.
-    # For each gene, we have 3 spliced txps; indexing starts from 1:
-    #   tx1 ([1,2], [36,45], [71,80])
-    #   tx2 ([46,55], [91,100])
-    #   tx3 ([121,130], [156,160], [191,200])
-    #   unspliced [1,200]
-
-    chr_1_seq = genome["chr1"].seq
-    chr_2_seq = genome["chr2"].seq
-    # check spliced sequences
-    # tx1_1
-    extracted_tx1_1 = spliceu_seqs["tx1.1"].seq
-    expected_tx1_1 = sum([chr_1_seq[0:2], chr_1_seq[35:45], chr_1_seq[70:80]], Seq(""))
-    assert extracted_tx1_1 == expected_tx1_1
-
-    # tx1_2
-    extracted_tx1_2 = spliceu_seqs["tx1.2"].seq
-    expected_tx1_2 = sum([chr_1_seq[45:55], chr_1_seq[90:100]], Seq(""))
-    assert extracted_tx1_2 == expected_tx1_2
-
-    # tx1_3
-    extracted_tx1_3 = spliceu_seqs["tx1.3"].seq
-    expected_tx1_3 = sum(
-        [chr_1_seq[120:130], chr_1_seq[155:160], chr_1_seq[190:200]], Seq("")
-    )
-    assert extracted_tx1_3 == expected_tx1_3
-
-    # tx2_1
-    extracted_tx2_1 = spliceu_seqs["tx2.1"].seq
-    expected_tx2_1 = sum(
-        [chr_2_seq[0:2], chr_2_seq[35:45], chr_2_seq[70:80]], Seq("")
-    ).reverse_complement()
-    assert extracted_tx2_1 == expected_tx2_1
-
-    # tx2_2
-    extracted_tx2_2 = spliceu_seqs["tx2.2"].seq
-    expected_tx2_2 = sum(
-        [chr_2_seq[45:55], chr_2_seq[90:100]], Seq("")
-    ).reverse_complement()
-    assert extracted_tx2_2 == expected_tx2_2
-
-    # tx2_3
-    extracted_tx2_3 = spliceu_seqs["tx2.3"].seq
-    expected_tx2_3 = sum(
-        [chr_2_seq[120:130], chr_2_seq[155:160], chr_2_seq[190:200]], Seq("")
-    ).reverse_complement()
-    assert extracted_tx2_3 == expected_tx2_3
-
-    # check unspliced (gene-body) seqs
-    # g1-I [0,200]
-    extract_g1_I = spliceu_seqs["g1-I"].seq
-    expected_g1_I = sum([chr_1_seq[0:200]], Seq(""))
-    assert extract_g1_I == expected_g1_I
-
-    # g2-I [0,200]
-    extract_g2_I = spliceu_seqs["g2-I"].seq
-    expected_g2_I = sum([chr_2_seq[0:200]], Seq("")).reverse_complement()
-    assert extract_g2_I == expected_g2_I
-
-    # check t2g_3col
-    t2g_dict = t2g_3col[[t2g_colnames[1]]].to_dict()["gene_name"]
-    t2s_dict = t2g_3col[[t2g_colnames[2]]].to_dict()["splice_status"]
-
-    # check txp name to gene name mapping
-    # for g1
-    assert t2g_dict["tx1.1"] == "g1"
-    assert t2g_dict["tx1.2"] == "g1"
-    assert t2g_dict["tx1.3"] == "g1"
-    assert t2g_dict["g1-I"] == "g1"
-
-    # for g2
-    assert t2g_dict["tx2.1"] == "g2"
-    assert t2g_dict["tx2.2"] == "g2"
-    assert t2g_dict["tx2.3"] == "g2"
-    assert t2g_dict["g2-I"] == "g2"
-
-    # check txp name to splice status mapping
-    # for g1
-    assert t2s_dict["tx1.1"] == "S"
-    assert t2s_dict["tx1.2"] == "S"
-    assert t2s_dict["tx1.3"] == "S"
-    assert t2s_dict["g1-I"] == "U"
-
-    # for g2
-    assert t2s_dict["tx2.1"] == "S"
-    assert t2s_dict["tx2.2"] == "S"
-    assert t2s_dict["tx2.3"] == "S"
-    assert t2s_dict["g2-I"] == "U"
diff --git a/tests/test_make_splici.py b/tests/test_make_splici.py
deleted file mode 100644
index 21cb32f..0000000
--- a/tests/test_make_splici.py
+++ /dev/null
@@ -1,172 +0,0 @@
-from pyroe.make_txome import make_splici_txome
-import tempfile
-import os
-import pandas as pd
-from Bio import SeqIO
-from Bio.Seq import Seq
-
-
-def test_make_splici():
-    location = os.path.dirname(os.path.realpath(__file__))
-    test_file_dir = os.path.join(location, "data", "test_make_splici")
-    genome_path = os.path.join(test_file_dir, "small_example_genome.fa")
-    gtf_path = os.path.join(test_file_dir, "small_example.gtf")
-    filename_prefix = "splici"
-    extra_spliced = os.path.join(test_file_dir, "extra_spliced.txt")
-    extra_unspliced = os.path.join(test_file_dir, "extra_unspliced.txt")
-    output_dir = tempfile.TemporaryDirectory()
-    read_length = 5
-    flank_trim_length = 2
-    make_splici_txome(
-        genome_path=genome_path,
-        gtf_path=gtf_path,
-        read_length=read_length,
-        extra_spliced=extra_spliced,
-        extra_unspliced=extra_unspliced,
-        flank_trim_length=flank_trim_length,
-        output_dir=output_dir.name,
-        filename_prefix=filename_prefix,
-        no_bt=True,
-    )
-
-    out_fa = os.path.join(output_dir.name, "splici_fl3.fa")
-    out_t2g = os.path.join(output_dir.name, "splici_fl3_t2g_3col.tsv")
-    genome = SeqIO.to_dict(SeqIO.parse(genome_path, "fasta"))
-    # gene_annotation = pr.read_gtf(gtf_path).df
-    splici_seqs = SeqIO.to_dict(SeqIO.parse(out_fa, "fasta"))
-    t2g_colnames = ["txp_name", "gene_name", "splice_status"]
-    t2g_3col = pd.read_csv(
-        out_t2g, sep="\t", header=None, names=t2g_colnames
-    ).set_index(t2g_colnames[0])
-
-    # Define expected sequences.
-    # For each gene, we have 3 spliced txps; indexing starts from 1:
-    #   tx1 ([1,2], [36,45], [71,80]), with intron regions ([3,35], [46,70])
-    #   tx2 ([46,55], [91,100]), with intron ([56,90])
-    #   tx3 ([121,130], [156,160], [191,200]), with introns ([131,155], [161,190])
-    # By adding the flanking length, the intron regions become
-    #   ([0,38], [43,73])
-    #   ([53,93])
-    #   ([128,158], [158,193])
-    # After collapsing, we are left with three introns for each gene:
-    #   I1: [1,38]
-    #   I2: [43,93]
-    #   I3: [128,193]
-
-    # python slicing works as [start, end)
-
-    chr_1_seq = genome["chr1"].seq
-    chr_2_seq = genome["chr2"].seq
-    # check spliced sequences
-    # tx1_1
-    extracted_tx1_1 = splici_seqs["tx1.1"].seq
-    expected_tx1_1 = sum([chr_1_seq[0:2], chr_1_seq[35:45], chr_1_seq[70:80]], Seq(""))
-    assert extracted_tx1_1 == expected_tx1_1
-
-    # tx1_2
-    extracted_tx1_2 = splici_seqs["tx1.2"].seq
-    expected_tx1_2 = sum([chr_1_seq[45:55], chr_1_seq[90:100]], Seq(""))
-    assert extracted_tx1_2 == expected_tx1_2
-
-    # tx1_3
-    extracted_tx1_3 = splici_seqs["tx1.3"].seq
-    expected_tx1_3 = sum(
-        [chr_1_seq[120:130], chr_1_seq[155:160], chr_1_seq[190:200]], Seq("")
-    )
-    assert extracted_tx1_3 == expected_tx1_3
-
-    # tx2_1
-    extracted_tx2_1 = splici_seqs["tx2.1"].seq
-    expected_tx2_1 = sum(
-        [chr_2_seq[0:2], chr_2_seq[35:45], chr_2_seq[70:80]], Seq("")
-    ).reverse_complement()
-    assert extracted_tx2_1 == expected_tx2_1
-
-    # tx2_2
-    extracted_tx2_2 = splici_seqs["tx2.2"].seq
-    expected_tx2_2 = sum(
-        [chr_2_seq[45:55], chr_2_seq[90:100]], Seq("")
-    ).reverse_complement()
-    assert extracted_tx2_2 == expected_tx2_2
-
-    # tx2_3
-    extracted_tx2_3 = splici_seqs["tx2.3"].seq
-    expected_tx2_3 = sum(
-        [chr_2_seq[120:130], chr_2_seq[155:160], chr_2_seq[190:200]], Seq("")
-    ).reverse_complement()
-    assert extracted_tx2_3 == expected_tx2_3
-
-    # check intron seqs
-    # I1: [1,38]
-    # I2: [43,93]
-    # I3: [128,193]
-
-    # g1-I [0,38]
-    extract_g1_I = splici_seqs["g1-I"].seq
-    expected_g1_I = sum([chr_1_seq[0:38]], Seq(""))
-    assert extract_g1_I == expected_g1_I
-
-    # g1-I1 [43,93]
-    extract_g1_I1 = splici_seqs["g1-I1"].seq
-    expected_g1_I1 = sum([chr_1_seq[42:93]], Seq(""))
-    assert extract_g1_I1 == expected_g1_I1
-
-    # g1-I2 [128,193]
-    extract_g1_I2 = splici_seqs["g1-I2"].seq
-    expected_g1_I2 = sum([chr_1_seq[127:193]], Seq(""))
-    assert extract_g1_I2 == expected_g1_I2
-
-    # g2-I [0,38]
-    extract_g2_I = splici_seqs["g2-I"].seq
-    expected_g2_I = sum([chr_2_seq[0:38]], Seq("")).reverse_complement()
-    assert extract_g2_I == expected_g2_I
-
-    # g2-I1 [43,93]
-    extract_g2_I1 = splici_seqs["g2-I1"].seq
-    expected_g2_I1 = sum([chr_2_seq[42:93]], Seq("")).reverse_complement()
-    assert extract_g2_I1 == expected_g2_I1
-
-    # g2-I2 [128,193]
-    extract_g2_I2 = splici_seqs["g2-I2"].seq
-    expected_g2_I2 = chr_2_seq[127:193].reverse_complement()
-    assert extract_g2_I2 == expected_g2_I2
-
-    # check t2g_3col
-    t2g_dict = t2g_3col[[t2g_colnames[1]]].to_dict()["gene_name"]
-    t2s_dict = t2g_3col[[t2g_colnames[2]]].to_dict()["splice_status"]
-
-    # check txp name to gene name mapping
-    # for g1
-    assert t2g_dict["tx1.1"] == "g1"
-    assert t2g_dict["tx1.2"] == "g1"
-    assert t2g_dict["tx1.3"] == "g1"
-    assert t2g_dict["g1-I"] == "g1"
-    assert t2g_dict["g1-I1"] == "g1"
-    assert t2g_dict["g1-I2"] == "g1"
-
-    # for g2
-    assert t2g_dict["tx2.1"] == "g2"
-    assert t2g_dict["tx2.2"] == "g2"
-    assert t2g_dict["tx2.3"] == "g2"
-    assert t2g_dict["g2-I"] == "g2"
-    assert t2g_dict["g2-I1"] == "g2"
-    assert t2g_dict["g2-I2"] == "g2"
-
-    # check txp name to splice status mapping
-    # for g1
-    assert t2s_dict["tx1.1"] == "S"
-    assert t2s_dict["tx1.2"] == "S"
-    assert t2s_dict["tx1.3"] == "S"
-    assert t2s_dict["g1-I"] == "U"
-    assert t2s_dict["g1-I1"] == "U"
-    assert t2s_dict["g1-I2"] == "U"
-
-    # for g2
-    assert t2s_dict["tx2.1"] == "S"
-    assert t2s_dict["tx2.2"] == "S"
-    assert t2s_dict["tx2.3"] == "S"
-    assert t2s_dict["g2-I"] == "U"
-    assert t2s_dict["g2-I1"] == "U"
-    assert t2s_dict["g2-I2"] == "U"
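
As a closing aside, the intron names exercised by the deleted tests above (g1-I, g1-I1, g1-I2, ...) come from the groupby/cumcount expression removed in make_splici_txome. A self-contained sketch of just that naming scheme, using plain pandas (this is an illustration, not pyroe API):

    import pandas as pd

    # The first intron of each gene gets the bare "<gene_id>-I" name;
    # later ones get a numeric suffix ("-I1", "-I2", ...).
    gene_ids = pd.Series(["g1", "g1", "g1", "g2"])
    suffix = gene_ids.groupby(gene_ids).cumcount().astype(str).replace("0", "")
    names = ["-I".join(z) for z in zip(gene_ids, suffix)]
    print(names)  # ['g1-I', 'g1-I1', 'g1-I2', 'g2-I']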