Skip to content

Commit

Permalink
Use model resolution as the default setting to extract input sequences
Browse files Browse the repository at this point in the history
  • Loading branch information
PedroBarbosa committed Jun 12, 2024
1 parent 4ac9d40 commit 48b54ac
Show file tree
Hide file tree
Showing 10 changed files with 71 additions and 72 deletions.
4 changes: 2 additions & 2 deletions dress/configs/generate_binfiller.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@ generate:
preprocessing:
cache_dir: data/cache/
genome: data/cache/Homo_sapiens.GRCh38.dna.primary_assembly.fa
use_model_resolution: false
use_full_triplet: false
extract_dynamically: false
extract_full_triplet: false
fitness:
minimize_fitness: false
fitness_function: bin_filler
Expand Down
4 changes: 2 additions & 2 deletions dress/configs/generate_binfiller_pwm_grammar.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@ generate:
preprocessing:
cache_dir: data/cache/
genome: data/cache/Homo_sapiens.GRCh38.dna.primary_assembly.fa
use_model_resolution: false
use_full_triplet: false
extract_dynamically: false
extract_full_triplet: false
fitness:
minimize_fitness: false
fitness_function: bin_filler
Expand Down
4 changes: 2 additions & 2 deletions dress/configs/generate_iad.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@ generate:
preprocessing:
cache_dir: data/cache/
genome: data/cache/Homo_sapiens.GRCh38.dna.primary_assembly.fa
use_model_resolution: false
use_full_triplet: false
extract_dynamically: false
extract_full_triplet: false
fitness:
minimize_fitness: false
fitness_function: increase_archive_diversity
Expand Down
4 changes: 2 additions & 2 deletions dress/configs/generate_iad_pwm_grammar.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@ generate:
preprocessing:
cache_dir: data/cache/
genome: data/cache/Homo_sapiens.GRCh38.dna.primary_assembly.fa
use_model_resolution: false
use_full_triplet: false
extract_dynamically: false
extract_full_triplet: false
fitness:
minimize_fitness: false
fitness_function: increase_archive_diversity
Expand Down
4 changes: 2 additions & 2 deletions dress/datasetgeneration/json_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@
"properties": {
"dry_run": {"type": "boolean"},
"disable_gpu": {"type": "boolean"},
"use_full_triplet": {"type": "boolean"},
"use_model_resolution": {"type": "boolean"},
"extract_full_triplet": {"type": "boolean"},
"extract_dynamically": {"type": "boolean"},
"verbosity": {"type": "integer"},
"shuffle_input": {"type": ["null", "string"]},
"outdir": {"type": "string"},
Expand Down
16 changes: 8 additions & 8 deletions dress/datasetgeneration/preprocessing/gtf_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,8 +121,8 @@ def preprocessing(data: pr.PyRanges, **kwargs):
df=extracted,
fasta=genome,
extend_borders=100,
use_full_triplet=kwargs["use_full_triplet"],
use_model_resolution=kwargs["use_model_resolution"],
extract_full_triplet=kwargs["extract_full_triplet"],
extract_dynamically=kwargs["extract_dynamically"],
model = kwargs["model"]
)

Expand Down Expand Up @@ -158,10 +158,10 @@ def write_output(
Additional arguments in **kwargs:
outdir (str): Output directory.
outbasename (str): Output basename.
use_full_triplet (bool): Whether to use the full exon triplet as input sequence
extract_full_triplet (bool): Whether to use the full exon triplet as input sequence
when making model inferences.
use_model_resolution (bool): Whether to use the model resolution to determine input
sequence size when making model inferences.
extract_dynamically (bool): Whether to extract input sequence dynamically based on
triplet length and model resolution.
"""

to_write = {
Expand Down Expand Up @@ -200,10 +200,10 @@ def write_output(
)

out_flag = ''
if kwargs['use_full_triplet']:
if kwargs['extract_full_triplet']:
out_flag = "_full_triplet"
elif kwargs['use_model_resolution']:
out_flag = "_model_res"
elif kwargs['extract_dynamically']:
out_flag = "_dynamically"

if len(extracted_with_seqs) > 0:
extracted_with_seqs[
Expand Down
63 changes: 32 additions & 31 deletions dress/datasetgeneration/preprocessing/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ def process_ss_idx(


def _get_flat_ss(
info: pd.Series, _level: str, start: int, end: int, use_full_tiplet: bool
info: pd.Series, _level: str, start: int, end: int, extract_full_triplet: bool
):
"""
Extracts flat splice site indexes for sequences
Expand All @@ -96,7 +96,7 @@ def _get_flat_ss(
extensions.
end (int): End coordinate of the flat sequence,
after accounting for the extensions.
use_full_tiplet (bool, optional): Whether `start` and `end`
extract_full_triplet (bool, optional): Whether `start` and `end`
coordinates represent the true start and end of
the sequence at a given surrounding level. If `False`,
they represent start and end positions up to the
Expand All @@ -109,7 +109,7 @@ def _get_flat_ss(
_info = info.copy()

# Check out of scope indexes
if use_full_tiplet is False:
if extract_full_triplet is False:
if _level != 0:
cols = [
"Start_upstream" + _level,
Expand Down Expand Up @@ -443,8 +443,8 @@ def generate_pipeline_input(
df: pd.DataFrame,
fasta: str,
extend_borders: int = 0,
use_full_triplet: bool = False,
use_model_resolution: bool = True,
extract_full_triplet: bool = False,
extract_dynamically: bool = False,
model: str = "spliceai",
):
"""
Expand All @@ -457,22 +457,23 @@ def generate_pipeline_input(
associated transcript ID.
Sequence extraction can be done in three ways:
- `use_full_triplet` is `True`: The complete sequence
- `extract_full_triplet` is `True`: The complete sequence
from the start of the exon upstream until the end of the
exon downstream is extracted, regardless of the size of
the resulting sequence. This can result in very large
sequences if introns are very long.
- `use_model_resolution` is `True`: The sequence is
extracted up to the maximum resolution of the model. For example,
for SpliceAI, the extracted sequence will be the cassette exon plus
5000bp upstream of the acceptor and 5000bp downstream of the donor.
- `use_model_resolution` is `False` and `use_full_triplet` is `False`:
If neither of the above is set (default), the sequence will be extracted
- `extract_dynamically` is `True`: The sequence will be extracted
such that: if the full exon triplet is smaller than the model resolution,
the full exon triplet is extracted, and then at inference time the sequence
is padded to the model resolution. If the exon triplet is larger than the
model resolution, the sequence is trimmed at the model resolution, which may not
include upstream and/or downstream exons.
- `extract_dynamically` is `False` and `extract_full_triplet` is `False`:
If neither of the above is set (default), the sequence is
extracted up to the maximum resolution of the model. For example,
for SpliceAI, the extracted sequence will be the cassette exon plus
5000bp upstream of the acceptor and 5000bp downstream of the donor.
If `extend_borders` > 0, the sequence will be extended on both sides.
This is useful for cases where the acceptor of the uptream exon
Expand All @@ -481,9 +482,8 @@ def generate_pipeline_input(
may not be properly captured (because the full context is not present),
extending the sequence can help to provide a more realistic prediction
as if those positions were in the middle of the sequence. This extension
is applied when `use_full_triplet` is `True` or in the default setting
(`use_model_resolution=False` and `use_full_triplet=False`) when the
sequence is shorter than the model resolution.
is applied when `extract_full_triplet` or `extract_dynamically`is `True`,
the latter only if the upstream/downstream exon is within the model resolution.
Returns:
Tuples with the following information:
Expand All @@ -493,8 +493,8 @@ def generate_pipeline_input(
"""

assert any(
x is False for x in [use_full_triplet, use_model_resolution]
), "Can't set both `use_full_triplet` and `use_model_resolution` to `True`."
x is False for x in [extract_full_triplet, extract_dynamically]
), "Can't set both `extract_full_triplet` and `extract_dynamically` to `True`."

if list(df.filter(regex="stream")):
# Extract level so that we know the
Expand Down Expand Up @@ -524,7 +524,7 @@ def generate_pipeline_input(

for _, seq_record in df.iterrows():

if use_full_triplet:
if extract_full_triplet:
if seq_record.Strand == "+":
start = "Start_upstream" + _level if level != 0 else "Start"
end = "End_downstream" + _level if level != 0 else "End"
Expand Down Expand Up @@ -561,7 +561,7 @@ def generate_pipeline_input(
_level,
start=seq_record[start] - extend_borders,
end=seq_record[end] + extend_borders,
use_full_tiplet=use_full_triplet,
extract_full_triplet=extract_full_triplet,
)

out.append(
Expand All @@ -583,7 +583,7 @@ def generate_pipeline_input(
region="upstream",
_level=_level,
extend_borders=extend_borders,
use_model_resolution=use_model_resolution,
no_model_resolution=extract_dynamically,
model=model,
)

Expand All @@ -592,7 +592,7 @@ def generate_pipeline_input(
region="downstream",
_level=_level,
extend_borders=extend_borders,
use_model_resolution=use_model_resolution,
no_model_resolution=extract_dynamically,
model=model,
)

Expand Down Expand Up @@ -629,7 +629,7 @@ def generate_pipeline_input(
_level,
start=min(left, right),
end=max(left, right),
use_full_tiplet=use_full_triplet,
extract_full_triplet=extract_full_triplet,
)

out.append(
Expand Down Expand Up @@ -676,36 +676,37 @@ def _get_slack(
region: Literal["upstream", "downstream"],
_level: str,
extend_borders: int,
use_model_resolution: bool,
no_model_resolution: bool,
model: str
):
"""
Returns the number of base pairs to extend
coordinates surrounding the central exon.
If `use_model_resolution` is `True` returns the number of
base pairs corresponding to the model resolution
(e.g., 5000bp on each side for SpliceAI). If `False`,
returns the number of base pairs up to location of the
upstream or downstream exons, if their distance is lower
If `no_model_resolution` is `True` returns the number
of base pairs up to location of the upstream or
downstream exons, if their distance is lower
than the model resolution. If the distance is higher,
returns the distance to the model resolution.
returns the distance to the model resolution. If `False`,
returns the number of base pairs corresponding to the model
resolution (e.g., 5000bp on each side for SpliceAI).
Args:
seq_record (pd.Series): A row from the input dataframe
region (Literal): Either 'upstream' or 'downstream'
_level (str): The level to be considered
extend_borders (int): The number of base pairs to extend the coordinates,
regardless of the region
use_model_resolution (bool): Whether to use the model resolution
no_model_resolution (bool): Whether to avoid using the model resolution
to extract distances
model (str): The model to be used
Returns:
int: The number of base pairs to extend the coordinates
"""
models = {"spliceai": 5000, "pangolin": 5000}
model_res = models[model]
if use_model_resolution:
if no_model_resolution is False:
return model_res

if region == "upstream":
Expand Down
28 changes: 13 additions & 15 deletions dress/datasetgeneration/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -599,28 +599,26 @@ def grammar_options(fun):
"config file is presented in 'dress/configs/generate.yaml' file.",
)
@click.option(
"-umr",
"--use_model_resolution",
"-esd",
"--extract_dynamically",
is_flag=True,
help="Whether to use the model resolution for extracting and predicting the full sequence. "
"If set, for example with '--model spliceai', the input sequence will include the "
"target exon plus 5000bp on either side. This ensures that both the acceptor and "
"donor positions are evaluated within the real genomic context, regardless of "
"whether surrounding exons fall within or outside the model resolution. "
"Default: 'False'. When not set (default), the surrounding context of the target exon is "
"dynamically extracted. If upstream/downstream exons are within the model resolution, "
"sequences are shorter and padded during inference. If they lie beyond the model "
"resolution, the sequence is trimmed to the model resolution. Cannot be used with '--use_full_triplet'.",
help="Extract sequences dynamically based on the exon triplet size and the model resolution. "
"If upstream/downstream exons are within the model resolution, sequences will be shorter and padded "
"during inference. If they lie beyond the model resolution, sequences are trimmed to the model resolution. "
"Default: use the model resolution for extracting surrounding context and making inferences. For example, "
"with '--model spliceai', the default setting will include the target exon plus 5000bp on either side. This ensures that both the acceptor and donor positions are evaluated within the real genomic context, regardless "
"of whether surrounding exons fall within or outside the 5000bp."
)
@click.option(
"-ufs",
"--use_full_triplet",
"-efs",
"--extract_full_triplet",
is_flag=True,
help=(
"Whether to extract and predict the full sequence from the start coordinate of the upstream "
"Extract and predict the full sequences from the start coordinate of the upstream "
"exon to the end coordinate of the downstream exon, regardless of the model resolution. "
"Default: 'False'. Use this option with caution, as it can easily lead to memory exhaustion "
"if the sequence triplet size is too large (e.g., large introns). Cannot be used with '--use_model_resolution'."
"if the sequence triplet size is too large (e.g., large introns). Default: use the model resolution, "
"as described in '--extract_dynamically'."
),
)
@click.option(
Expand Down
4 changes: 2 additions & 2 deletions dress/datasetgeneration/validate_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@
"--cache_dir",
"--genome",
"--config",
"--use_model_resolution",
"--use_full_triplet"
"--extract_dynamically",
"--extract_full_triplet"
],
},
{
Expand Down
12 changes: 6 additions & 6 deletions tests/preprocessing_raw_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,8 +86,8 @@ def test_full_sequence(self, setup_paths, initial_setup, common_tests):
df=extracted,
fasta=open_fasta(genome, cache_dir),
extend_borders=100,
use_full_triplet=True,
use_model_resolution=False,
extract_full_triplet=True,
extract_dynamically=False,
)

assert na_exons.shape[0] == 0
Expand Down Expand Up @@ -148,8 +148,8 @@ def test_trimmed_sequence(self, setup_paths, initial_setup, common_tests):
df=extracted,
fasta=open_fasta(genome, cache_dir),
extend_borders=100,
use_full_triplet=False,
use_model_resolution=False,
extract_full_triplet=False,
extract_dynamically=True,
)

len_test = {
Expand Down Expand Up @@ -231,8 +231,8 @@ def test_model_resolution(self, setup_paths, initial_setup, common_tests):
df=extracted,
fasta=open_fasta(genome, cache_dir),
extend_borders=100,
use_full_triplet=False,
use_model_resolution=True,
extract_full_triplet=False,
extract_dynamically=False,
model="spliceai",
)

Expand Down

0 comments on commit 48b54ac

Please sign in to comment.