Merge branch 'master' into fix_u12
kirilenkobm authored Sep 30, 2023
2 parents e45b0f6 + 6638de5 commit c97f2a7
Showing 9 changed files with 149 additions and 730 deletions.
275 changes: 36 additions & 239 deletions CESAR_wrapper.py

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion README.md
@@ -3,7 +3,7 @@
<img src="https://github.com/hillerlab/TOGA/blob/master/supply/logo.png" width="500">

[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
- ![version](https://img.shields.io/badge/version-1.1.5-blue)
+ ![version](https://img.shields.io/badge/version-1.1.6-blue)
[![DOI](https://zenodo.org/badge/277817661.svg)](https://zenodo.org/badge/latestdoi/277817661)
[![License](https://img.shields.io/github/license/hillerlab/TOGA.svg)](https://github.com/hillerlab/TOGA/blob/master/LICENSE)
[![made-with-Nextflow](https://img.shields.io/badge/Made%20with-Nextflow-23aa62.svg)](https://www.nextflow.io/)
13 changes: 11 additions & 2 deletions VersionHistory.md
@@ -65,7 +65,7 @@ Documentation improvements.
* A little improvement in the `run_test.sh` script.
* Better versions management - added version.py

- # TOGA 1.1.5 (current release) #
+ # TOGA 1.1.5 #

* extract_codon_alignment.py: "!" characters inserted by MACSE2.0 to compensate for frameshifts are replaced with "N".
* CESAR_wrapper.py -> does not use `/dev/shm/` or `/tmp/` partitions anymore.
@@ -77,6 +77,15 @@ Documentation improvements.
* (started): better code organisation - constants class
* Replaced CESAR2.0 submodule with Kirilenko's lightweight fork (without Kent, etc.)

- # TOGA 1.1.6 (planned)
+ # TOGA 1.1.6 (current release) #
+
+ * Removed the obsolete optimised CESAR path.
+ * Fixed a minor bug with the memory parameter when re-running CESAR jobs if no buckets were specified.
+ * (by shjenkins94) added the `gene_prefix` argument
+ * (by shjenkins94) fixed a bug in the U12 argument check
+ * minor code style improvements
+
+ # TOGA 1.1.7 (planned) #
+
* (planned) Step management: the user can rerun TOGA from any desired step.
* (planned) Using CESAR in single-exon mode for most exons to solve the RAM bottleneck (maybe TOGA 1.2.x?)
1 change: 1 addition & 0 deletions constants.py
@@ -82,6 +82,7 @@ class InactMutClassesConst:
SSM_D = "SSMD" # Donor, right, GT,GC
SSM_A = "SSMA" # Acceptor, left, AG


# Standalone constants #
COMPLEMENT_BASE = {
"A": "T",
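As an aside for readers of this hunk: `COMPLEMENT_BASE` is truncated here, but a lookup table like this is typically consumed by a reverse-complement helper. A minimal sketch under that assumption; the `reverse_complement` function below is hypothetical and not part of this commit, and the real dict may cover more symbols:

```python
# Hypothetical completion of the truncated dict, standard DNA pairs only.
COMPLEMENT_BASE = {"A": "T", "T": "A", "G": "C", "C": "G", "N": "N"}

def reverse_complement(seq):
    """Reverse-complement a sequence, mapping unexpected symbols to N."""
    return "".join(COMPLEMENT_BASE.get(base, "N") for base in reversed(seq.upper()))

print(reverse_complement("GATTACA"))  # TGTAATC
```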
19 changes: 13 additions & 6 deletions modules/make_query_isoforms.py
@@ -33,7 +33,6 @@
MODULE_NAME_FOR_LOG = "make_query_isoforms"
TOGA_GENE_PREFIX = "TOGA"


def parse_args():
"""Read CMD args."""
app = argparse.ArgumentParser()
Expand All @@ -47,6 +46,12 @@ def parse_args():
help="Disable color filter",
)
app.add_argument("--log_file", help="Log file")
+ app.add_argument(
+ "--gene_prefix",
+ "--gp",
+ default="TOGA",
+ help="Prefix to use for query gene identifiers. Default value is TOGA",
+ )
if len(sys.argv) < 3:
app.print_help()
sys.exit(0)
@@ -202,7 +207,7 @@ def intersect_exons(chr_dir_exons, exon_id_to_transcript):
return G


- def parse_components(components, trans_to_range):
+ def parse_components(components, trans_to_range, gene_prefix=None):
"""Get genes data.
Each gene has the following data:
@@ -211,9 +216,10 @@ def parse_components(components, trans_to_range):
3) Genomic range.
"""
to_log(f"{MODULE_NAME_FOR_LOG}: parsing components data to identify query genes")
+ gp = TOGA_GENE_PREFIX if gene_prefix is None else gene_prefix
genes_data = [] # save gene objects here
for num, component in enumerate(components, 1):
- gene_id = f"{TOGA_GENE_PREFIX}_{num}" # need to name them somehow
+ gene_id = f"{gp}_{num:011}" # need to name them somehow
# get transcripts and their ranges
transcripts = set(component.nodes())
regions = [trans_to_range[t] for t in transcripts]
@@ -266,7 +272,7 @@ def save_regions(genes_data, output):


def get_query_isoforms_data(
- query_bed, query_isoforms, save_genes_track=None, ignore_color=False
+ query_bed, query_isoforms, save_genes_track=None, ignore_color=False, gene_prefix=None,
):
"""Create isoforms track for query."""
to_log(f"{MODULE_NAME_FOR_LOG}: inferring genes from annotated isoforms in the query")
@@ -291,7 +297,7 @@
components = get_graph_components(conn_graph)
to_log(f"{MODULE_NAME_FOR_LOG}: identified {len(components)} connected components in the graph")
# convert components to isoforms table
- genes_data = parse_components(components, trans_to_range)
+ genes_data = parse_components(components, trans_to_range, gene_prefix)
# save the results
save_isoforms(genes_data, query_isoforms)
save_regions(genes_data, save_genes_track)
@@ -305,4 +311,5 @@ def get_query_isoforms_data(
args.output,
save_genes_track=args.genes_track,
ignore_color=args.ignore_color,
- )
+ gene_prefix=args.gene_prefix,
+ )
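Taken together, this file's changes thread the prefix from the new CLI flag down to `parse_components`, and gene numbers are now zero-padded to 11 digits (`{num:011}`), so identifiers sort cleanly. A hypothetical invocation sketch (paths and the prefix value are placeholders, not from the repository):

```python
from modules.make_query_isoforms import get_query_isoforms_data

# With gene_prefix=None the module-level default TOGA_GENE_PREFIX ("TOGA")
# is used; IDs then look like TOGA_00000000001, TOGA_00000000002, ...
get_query_isoforms_data(
    "query_annotation.bed",              # query_bed: transcript track in the query
    "query_isoforms.tsv",                # output isoforms (gene -> transcript) table
    save_genes_track="query_genes.bed",  # optional BED track of inferred genes
    ignore_color=False,
    gene_prefix="MYASM",                 # yields MYASM_00000000001, MYASM_00000000002, ...
)
```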
2 changes: 1 addition & 1 deletion parallel_jobs_manager.py
@@ -83,7 +83,7 @@ def execute(self, joblist_path, manager_data, label, wait=False, **kwargs):
self.joblist_path = joblist_path
self.manager_data = manager_data
self.label = label
- self.memory_limit = int(kwargs.get("memory_limit", 16))
+ self.memory_limit = int(kwargs.get("memory_limit", "16"))

self.nf_project_path = manager_data.get("nextflow_dir", None) # in fact, contains NF logs
self.keep_logs = manager_data.get("keep_nf_logs", False)
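The one-character change above makes the `kwargs.get` fallback a string, matching the type a caller usually passes, so `int()` normalises both paths identically. A minimal standalone sketch of the pattern (not the real class):

```python
def resolve_memory_limit(**kwargs):
    # Default is the string "16": whether memory_limit arrives as a string
    # (e.g. parsed from a CLI/config) or is absent, int() sees one type.
    return int(kwargs.get("memory_limit", "16"))

assert resolve_memory_limit() == 16
assert resolve_memory_limit(memory_limit="32") == 32
```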
142 changes: 15 additions & 127 deletions split_exon_realign_jobs.py
@@ -35,6 +35,8 @@
os.path.join(LOCATION, "cesar_runner.py")
) # script that will run jobs

+ # TODO: remove everything related to bigmem partition, see issue
+ # https://github.com/hillerlab/TOGA/issues/102
BIGMEM_LIM = 500 # mem limit for bigmem partition
REL_LENGTH_THR = 50
ABS_LENGTH_TRH = 1000000
@@ -182,23 +184,6 @@ def parse_args():
app.add_argument(
"--fragments_data", help="Gene: fragments file for fragmented genomes."
)
- app.add_argument(
- "--opt_cesar",
- action="store_true",
- dest="opt_cesar",
- help="Using lastz-optimized version of CESAR",
- )
- app.add_argument(
- "--precomp_memory_data",
- default=None,
- help="Memory consumption was already precomputed",
- )
- app.add_argument(
- "--precomp_regions_data_dir",
- default=None,
- help=("Directory containing files with precomputed regions. "
- "Likely is not needed for standalone script scenario. ")
- )
app.add_argument(
"--predefined_glp_class_path",
default=None,
@@ -408,17 +393,20 @@ def define_short_q_proj_stat(q_chrom, q_start, q_end, q_2bit):
# N are two-sided?
# simply copy-pasted solution from CESAR wrapper.py
# can be further optimised
- gap_ranges = 0
- for match in finditer(ASM_GAP_PATTERN, query_seq, IGNORECASE):
- span_start, span_end = match.span()
+ # gap_ranges = 0
+ for _ in finditer(ASM_GAP_PATTERN, query_seq, IGNORECASE):
+ # span_start, span_end = match.span()
# gap_ranges.append((seq_start + span_start, seq_start + span_end))
- gap_ranges += 1
- if gap_ranges == 0:
- # no assembly gaps: really deleted -> Lost
- return L
- else:
- # there are assembly gaps -> Missing
+ # gap_ranges += 1
return M
+ # M if ranges num > 1 else L
+ return L
+ # if gap_ranges == 0:
+ # # no assembly gaps: really deleted -> Lost
+ # return L
+ # else:
+ # # there are assembly gaps -> Missing
+ # return M

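Net effect of the rewrite above: the counter is gone and the function returns Missing on the first assembly gap found in the query sequence, Lost if the scan finds none. A self-contained sketch of that logic; `ASM_GAP_PATTERN` is assumed here to be a run-of-Ns regex, the real pattern is defined elsewhere in the module:

```python
from re import IGNORECASE, finditer

ASM_GAP_PATTERN = r"N{10,}"  # assumption for this sketch only

def classify_short_projection(query_seq):
    """Return "M" (Missing) if the locus has an assembly gap, else "L" (Lost)."""
    for _ in finditer(ASM_GAP_PATTERN, query_seq, IGNORECASE):
        # an assembly gap: the region may exist but is unsequenced -> Missing
        return "M"
    # no assembly gaps: the region is really absent -> Lost
    return "L"

assert classify_short_projection("ACGT" * 10) == "L"
assert classify_short_projection("ACGT" + "N" * 20 + "ACGT") == "M"
```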

def precompute_regions(
@@ -656,36 +644,6 @@ def save_jobs(filled_buckets, bucket_jobs_num, jobs_dir):
return to_combine


- # def save_bigmem_jobs(bigmem_joblist, jobs_dir):
- # """Save bigmem jobs."""
- # # TODO: try to merge with save_jobs() func
- # # one bigmem job per joblist, but not more than 100
- # # if > 100: something is wrong
- # joblist_size = len(bigmem_joblist)
- # num_of_parts = joblist_size if joblist_size <= BIGMEM_LIM else BIGMEM_JOBSNUM
- # if num_of_parts == 0:
- # return None # no bigmem jobs
- # to_log(f"{MODULE_NAME_FOR_LOG}: saving {joblist_size} jobs to BIGMEM queue")
- # to_log(
- # f"!!{MODULE_NAME_FOR_LOG}: this part is a subject of being "
- # f"refactored or excluded from the pipeline"
- # )
- #
- # bigmem_parts = split_in_n_lists(bigmem_joblist, num_of_parts)
- # bigmem_files_num = len(bigmem_parts) # in case if num of jobs < BIGMEM_JOBSNUM
- # bigmem_paths = []
- # if bigmem_files_num == 0:
- # return None # no bigmem jobs at all
- # for num, bigmem_part in enumerate(bigmem_parts):
- # file_name = f"cesar_job_{num}_bigmem"
- # file_path = os.path.abspath(os.path.join(jobs_dir, file_name))
- # f = open(file_path, "w")
- # f.write("\n".join(bigmem_part) + "\n")
- # f.close()
- # bigmem_paths.append(file_path)
- # return bigmem_paths
-
-
def save_combined_joblist(
to_combine,
combined_file,
@@ -738,50 +696,10 @@ def read_fragments_data(in_file):
return ret


- def read_precomp_mem(precomp_file):
- """Read precomputed memory if exists."""
- ret = {}
- to_log(f"{MODULE_NAME_FOR_LOG}: reading precomputed memory requirements from {precomp_file}")
- if precomp_file is None:
- to_log(f"{MODULE_NAME_FOR_LOG}: no data provided: skip")
- return ret
- f = open(precomp_file, "r")
- for line in f:
- if line == "\n":
- continue
- line_data = line.rstrip().split("\t")
- transcript = line_data[0]
- mem_raw = float(line_data[1])
- mem = math.ceil(mem_raw) + 1.25
- ret[transcript] = mem
- to_log(f"{MODULE_NAME_FOR_LOG}: got precomputed CESAR RAM requirements for {len(ret)} transcripts")
- f.close()
- return ret
-
-
- def get_trans_to_regions_file(precomp_regions_data_dir):
- ret = {}
- to_log(f"{MODULE_NAME_FOR_LOG}: loading precomputed regions from {precomp_regions_data_dir}")
- if precomp_regions_data_dir is None:
- to_log(f"{MODULE_NAME_FOR_LOG}: no data provided: skipping this step")
- return ret
- files_in = os.listdir(precomp_regions_data_dir)
- for filename in files_in:
- path = os.path.abspath(os.path.join(precomp_regions_data_dir, filename))
- f = open(path, "r")
- transcripts_in_file = set(x.split("\t")[0] for x in f)
- f.close()
- for t in transcripts_in_file:
- ret[t] = path
- to_log(f"{MODULE_NAME_FOR_LOG}: got data for {len(ret)} transcripts")
- return ret
-
-
def build_job(gene,
chains_arg,
args,
gene_fragments,
- trans_to_reg_precomp_file,
u12_this_gene,
cesar_temp_dir,
mask_all_first_10p=False
@@ -805,12 +723,8 @@
job = job + " --check_loss" if args.check_loss else job
job = job + " --no_fpi" if args.no_fpi else job
job = job + " --fragments" if gene_fragments else job
job = job + " --opt_cesar" if args.opt_cesar else job
job = job + " --alt_frame_del" # TODO: toga master script parameter

- precomp_file = trans_to_reg_precomp_file.get(gene)
- job = job + f" --predefined_regions {precomp_file}" if precomp_file else job
-
# add U12 introns data if this gene has them:
job = job + f" --u12 {os.path.abspath(args.u12)}" if u12_this_gene else job

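This `--u12` line is the one the fix_u12 branch feeds: the flag is appended to a CESAR job command only for genes that actually have U12 intron data. A minimal sketch of the conditional-flag pattern (the helper name and paths are illustrative, not from the repository):

```python
import os

def append_u12_flag(job, u12_path, u12_this_gene):
    # Mirrors the build_job() pattern above: add --u12 only when this
    # gene has U12 intron records.
    return job + f" --u12 {os.path.abspath(u12_path)}" if u12_this_gene else job

cmd = append_u12_flag("CESAR_wrapper.py GENE chains.txt", "u12_data.tsv", True)
assert "--u12" in cmd
```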
@@ -851,11 +765,8 @@ def _get_chain_arg_and_gig_arg(chains_list, chain_to_mem):
return gig_arg


- def compute_memory(chains, precomp_gig, block_sizes, gene_chains_data, gene_fragments, mem_limit):
+ def compute_memory(chains, block_sizes, gene_chains_data, gene_fragments, mem_limit):
"""Compute memory requirements for different chains."""
- if precomp_gig:
- # chains arg: just all chains, everything is precomputed
- return {tuple(chains): (precomp_gig, MEM_FIT)}
# proceed to memory estimation
# the same procedure as inside CESAR2.0 code
# required memory depends on numerous params
@@ -918,11 +829,6 @@ def main():
# need it to make subsequent commands
u12_data = read_u12_data(args.u12)

- # if memory is precomputed: use it
- precomp_mem = read_precomp_mem(args.precomp_memory_data)
-
- # TODO: to optimize later
- trans_to_reg_precomp_file = get_trans_to_regions_file(args.precomp_regions_data_dir)
# get lists of orthologous chains per each gene
# skipped_1 - no chains found -> log them
predefined_glp_class = {} # for projections which are M and L without CESAR
@@ -975,7 +881,6 @@

all_jobs = {}
skipped_3 = []
- # bigmem_jobs = []

for gene in batch.keys():
u12_this_gene = u12_data.get(gene)
@@ -1001,9 +906,7 @@
if len(chains) == 0:
continue

- precomp_gig = precomp_mem.get(gene, None)
chain_arg_to_gig = compute_memory(chains,
- precomp_gig,
block_sizes,
gene_chains_data,
gene_fragments,
@@ -1014,7 +917,6 @@
chains_arg,
args,
gene_fragments,
- trans_to_reg_precomp_file,
u12_this_gene,
cesar_temp_dir,
mask_all_first_10p=args.mask_all_first_10p)
@@ -1099,20 +1001,6 @@
args.cesar_logs_dir
)

- # save bigmem jobs, a bit different logic
- # bigmem_paths = save_bigmem_jobs(bigmem_jobs, args.jobs_dir)
- # if bigmem_paths:
- # save_combined_joblist(
- # bigmem_paths,
- # args.bigmem,
- # args.results,
- # args.check_loss,
- # args.rejected_log,
- # None, # TODO: decide what we do with this branch
- # args.cesar_logs_dir,
- # name="bigmem",
- # )
-
# save skipped genes if required
if args.skipped_genes:
skipped = skipped_1 + skipped_2 + skipped_3
