Skip to content

Commit

Permalink
Add '--use_model_resolution' as a new mode to extract input sequences
Browse files Browse the repository at this point in the history
  • Loading branch information
PedroBarbosa committed Jun 10, 2024
1 parent 77b047b commit 6f29633
Show file tree
Hide file tree
Showing 11 changed files with 350 additions and 218 deletions.
3 changes: 2 additions & 1 deletion dress/configs/generate_binfiller.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@ generate:
preprocessing:
cache_dir: data/cache/
genome: data/cache/Homo_sapiens.GRCh38.dna.primary_assembly.fa
- use_full_sequence: false
+ use_model_resolution: false
+ use_full_triplet: false
fitness:
minimize_fitness: false
fitness_function: bin_filler
Expand Down
3 changes: 2 additions & 1 deletion dress/configs/generate_binfiller_pwm_grammar.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@ generate:
preprocessing:
cache_dir: data/cache/
genome: data/cache/Homo_sapiens.GRCh38.dna.primary_assembly.fa
- use_full_sequence: false
+ use_model_resolution: false
+ use_full_triplet: false
fitness:
minimize_fitness: false
fitness_function: bin_filler
Expand Down
3 changes: 2 additions & 1 deletion dress/configs/generate_iad.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@ generate:
preprocessing:
cache_dir: data/cache/
genome: data/cache/Homo_sapiens.GRCh38.dna.primary_assembly.fa
- use_full_sequence: false
+ use_model_resolution: false
+ use_full_triplet: false
fitness:
minimize_fitness: false
fitness_function: increase_archive_diversity
Expand Down
3 changes: 2 additions & 1 deletion dress/configs/generate_iad_pwm_grammar.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@ generate:
preprocessing:
cache_dir: data/cache/
genome: data/cache/Homo_sapiens.GRCh38.dna.primary_assembly.fa
- use_full_sequence: false
+ use_model_resolution: false
+ use_full_triplet: false
fitness:
minimize_fitness: false
fitness_function: increase_archive_diversity
Expand Down
6 changes: 4 additions & 2 deletions dress/datasetevaluation/representation/motifs/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
import re
import pandas as pd
from typing import Union
+ import pandas as pd
+ pd.set_option('future.no_silent_downcasting', True)

from dress.datasetevaluation.representation.motifs.rbp_lists import RBP_SUBSETS
import numpy as np
Expand Down Expand Up @@ -406,7 +408,7 @@ def _remove_self_contained(gr: pr.PyRanges, scan_method: str) -> pr.PyRanges:
df = pd.merge(df, contained_same_rbp, how="left", on=to_drop_cols).drop(
columns=to_clean_cols
)
- df.Has_self_submotif.fillna(False, inplace=True)
+ df.fillna({'Has_self_submotif': False}, inplace=True)

#######################
# Other RBP contained #
Expand All @@ -428,7 +430,7 @@ def _remove_self_contained(gr: pr.PyRanges, scan_method: str) -> pr.PyRanges:
df = pd.merge(df, contained_other_rbp, how="left", on=to_drop_cols).drop(
columns=to_clean_cols[:-1]
)
- df.Has_other_submotif.fillna(False, inplace=True)
+ df.fillna({'Has_other_submotif': False}, inplace=True)
# logger.debug(".. {} hits flagged ..".format(contained_other_rbp.shape[0]))

return pr.PyRanges(df)
Expand Down
2 changes: 2 additions & 0 deletions dress/datasetgeneration/json_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
"properties": {
"dry_run": {"type": "boolean"},
"disable_gpu": {"type": "boolean"},
+ "use_full_triplet": {"type": "boolean"},
+ "use_model_resolution": {"type": "boolean"},
"verbosity": {"type": "integer"},
"shuffle_input": {"type": ["null", "string"]},
"outdir": {"type": "string"},
Expand Down
16 changes: 13 additions & 3 deletions dress/datasetgeneration/preprocessing/gtf_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,9 @@ def preprocessing(data: pr.PyRanges, **kwargs):
df=extracted,
fasta=genome,
extend_borders=100,
- use_full_seqs=kwargs["use_full_sequence"],
+ use_full_triplet=kwargs["use_full_triplet"],
+ use_model_resolution=kwargs["use_model_resolution"],
+ model = kwargs["model"]
)

if os.path.isdir(kwargs["outdir"]):
Expand Down Expand Up @@ -156,7 +158,10 @@ def write_output(
Additional arguments in **kwargs:
outdir (str): Output directory.
outbasename (str): Output basename.
- use_full_sequence (bool): Whether to use the full sequence when running the black box model.
+ use_full_triplet (bool): Whether to use the full exon triplet as input sequence
+ when making model inferences.
+ use_model_resolution (bool): Whether to use the model resolution to determine input
+ sequence size when making model inferences.
"""

to_write = {
Expand Down Expand Up @@ -193,8 +198,13 @@ def write_output(
]
],
)

+ out_flag = ''
+ if kwargs['use_full_triplet']:
+     out_flag = "_full_triplet"
+ elif kwargs['use_model_resolution']:
+     out_flag = "_model_res"

- out_flag = "" if kwargs["use_full_sequence"] else "_trimmed_at_5000bp"
if len(extracted_with_seqs) > 0:
extracted_with_seqs[
["header", "acceptor_idx", "donor_idx", "tx_id", "exon"]
Expand Down
Loading

0 comments on commit 6f29633

Please sign in to comment.