From 01fe7362f55ae656cdfbe95bb43f4a754d816157 Mon Sep 17 00:00:00 2001 From: shjenkins94 Date: Fri, 29 Sep 2023 19:42:04 +0900 Subject: [PATCH] add gene_prefix argument --- modules/make_query_isoforms.py | 19 +++++++++++++------ toga.py | 8 ++++++++ 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/modules/make_query_isoforms.py b/modules/make_query_isoforms.py index e240e6d..7ea5613 100755 --- a/modules/make_query_isoforms.py +++ b/modules/make_query_isoforms.py @@ -33,7 +33,6 @@ MODULE_NAME_FOR_LOG = "make_query_isoforms" TOGA_GENE_PREFIX = "TOGA" - def parse_args(): """Read CMD args.""" app = argparse.ArgumentParser() @@ -47,6 +46,12 @@ def parse_args(): help="Disable color filter", ) app.add_argument("--log_file", help="Log file") + app.add_argument( + "--gene_prefix", + "--gp", + default="TOGA", + help="Prefix to use for query gene identifiers. Default value is TOGA", + ) if len(sys.argv) < 3: app.print_help() sys.exit(0) @@ -202,7 +207,7 @@ def intersect_exons(chr_dir_exons, exon_id_to_transcript): return G -def parse_components(components, trans_to_range): +def parse_components(components, trans_to_range, gene_prefix=None): """Get genes data. Each gene has the following data: @@ -211,9 +216,10 @@ def parse_components(components, trans_to_range): 3) Genomic range. """ to_log(f"{MODULE_NAME_FOR_LOG}: parsing components data to identify query genes") + gp = TOGA_GENE_PREFIX if gene_prefix is None else gene_prefix genes_data = [] # save gene objects here for num, component in enumerate(components, 1): - gene_id = f"{TOGA_GENE_PREFIX}_{num}" # need to name them somehow + gene_id = f"{gp}_{num:011}" # need to name them somehow # get transcripts and their ranges transcripts = set(component.nodes()) regions = [trans_to_range[t] for t in transcripts] @@ -266,7 +272,7 @@ def save_regions(genes_data, output): def get_query_isoforms_data( - query_bed, query_isoforms, save_genes_track=None, ignore_color=False + query_bed, query_isoforms, save_genes_track=None, ignore_color=False, gene_prefix=None, ): """Create isoforms track for query.""" to_log(f"{MODULE_NAME_FOR_LOG}: inferring genes from annotated isoforms in the query") @@ -291,7 +297,7 @@ def get_query_isoforms_data( components = get_graph_components(conn_graph) to_log(f"{MODULE_NAME_FOR_LOG}: identified {len(components)} connected components in the graph") # covert components to isoforms table - genes_data = parse_components(components, trans_to_range) + genes_data = parse_components(components, trans_to_range, gene_prefix) # save the results save_isoforms(genes_data, query_isoforms) save_regions(genes_data, save_genes_track) @@ -305,4 +311,5 @@ def get_query_isoforms_data( args.output, save_genes_track=args.genes_track, ignore_color=args.ignore_color, - ) + gene_prefix=args.gene_prefix, + ) \ No newline at end of file diff --git a/toga.py b/toga.py index af75a26..66cf274 100755 --- a/toga.py +++ b/toga.py @@ -168,6 +168,7 @@ def __init__(self, args): ) # mics things + self.gene_prefix = args.gene_prefix self.isoforms_arg = args.isoforms if args.isoforms else None self.isoforms = None # will be assigned after completeness check self.chain_jobs = args.chain_jobs_num @@ -1550,6 +1551,7 @@ def __orthology_type_map(self): self.query_annotation, query_isoforms_file, save_genes_track=query_gene_spans, + gene_prefix=self.gene_prefix, ) to_log("Calling orthology types mapping step...") skipped_ref_trans = os.path.join(self.wd, "ref_orphan_transcripts.txt") @@ -1720,6 +1722,12 @@ def parse_args(): '"CURRENT_DIR/PROJECT_NAME". If not provided, TOGA will try to extract ' "the project name from chain filename, which is not recommended.", ) + app.add_argument( + "--gene_prefix", + "--gp", + default="TOGA", + help="Prefix to use for query gene identifiers. Default value is TOGA", + ) app.add_argument( "--min_score", "--msc",