Add '--use_model_resolution' as a new mode to extract input sequences

PedroBarbosa · Jun 10, 2024 · 6f29633
1 parent 77b047b
commit 6f29633
Show file tree

Hide file tree

Showing 11 changed files with 350 additions and 218 deletions.
diff --git a/dress/configs/generate_binfiller.yaml b/dress/configs/generate_binfiller.yaml
@@ -15,7 +15,8 @@ generate:
   preprocessing:
     cache_dir: data/cache/
     genome: data/cache/Homo_sapiens.GRCh38.dna.primary_assembly.fa
-    use_full_sequence: false
+    use_model_resolution: false
+    use_full_triplet: false
   fitness:
     minimize_fitness: false
     fitness_function: bin_filler

diff --git a/dress/configs/generate_binfiller_pwm_grammar.yaml b/dress/configs/generate_binfiller_pwm_grammar.yaml
@@ -15,7 +15,8 @@ generate:
   preprocessing:
     cache_dir: data/cache/
     genome: data/cache/Homo_sapiens.GRCh38.dna.primary_assembly.fa
-    use_full_sequence: false
+    use_model_resolution: false
+    use_full_triplet: false
   fitness:
     minimize_fitness: false
     fitness_function: bin_filler

diff --git a/dress/configs/generate_iad.yaml b/dress/configs/generate_iad.yaml
@@ -15,7 +15,8 @@ generate:
   preprocessing:
     cache_dir: data/cache/
     genome: data/cache/Homo_sapiens.GRCh38.dna.primary_assembly.fa
-    use_full_sequence: false
+    use_model_resolution: false
+    use_full_triplet: false
   fitness:
     minimize_fitness: false
     fitness_function: increase_archive_diversity

diff --git a/dress/configs/generate_iad_pwm_grammar.yaml b/dress/configs/generate_iad_pwm_grammar.yaml
@@ -15,7 +15,8 @@ generate:
   preprocessing:
     cache_dir: data/cache/
     genome: data/cache/Homo_sapiens.GRCh38.dna.primary_assembly.fa
-    use_full_sequence: false
+    use_model_resolution: false
+    use_full_triplet: false
   fitness:
     minimize_fitness: false
     fitness_function: increase_archive_diversity

diff --git a/dress/datasetevaluation/representation/motifs/utils.py b/dress/datasetevaluation/representation/motifs/utils.py
@@ -4,6 +4,8 @@
 import re
 import pandas as pd
 from typing import Union
+import pandas as pd
+pd.set_option('future.no_silent_downcasting', True)
 
 from dress.datasetevaluation.representation.motifs.rbp_lists import RBP_SUBSETS
 import numpy as np
@@ -406,7 +408,7 @@ def _remove_self_contained(gr: pr.PyRanges, scan_method: str) -> pr.PyRanges:
             df = pd.merge(df, contained_same_rbp, how="left", on=to_drop_cols).drop(
                 columns=to_clean_cols
             )
-            df.Has_self_submotif.fillna(False, inplace=True)
+            df.fillna({'Has_self_submotif': False}, inplace=True)
 
         #######################
         # Other RBP contained #
@@ -428,7 +430,7 @@ def _remove_self_contained(gr: pr.PyRanges, scan_method: str) -> pr.PyRanges:
             df = pd.merge(df, contained_other_rbp, how="left", on=to_drop_cols).drop(
                 columns=to_clean_cols[:-1]
             )
-            df.Has_other_submotif.fillna(False, inplace=True)
+            df.fillna({'Has_other_submotif': False}, inplace=True)
             # logger.debug(".. {} hits flagged ..".format(contained_other_rbp.shape[0]))
 
     return pr.PyRanges(df)

diff --git a/dress/datasetgeneration/json_schema.py b/dress/datasetgeneration/json_schema.py
@@ -7,6 +7,8 @@
             "properties": {
                 "dry_run": {"type": "boolean"},
                 "disable_gpu": {"type": "boolean"},
+                "use_full_triplet": {"type": "boolean"},
+                "use_model_resolution": {"type": "boolean"},
                 "verbosity": {"type": "integer"},
                 "shuffle_input": {"type": ["null", "string"]},
                 "outdir": {"type": "string"},

diff --git a/dress/datasetgeneration/preprocessing/gtf_cache.py b/dress/datasetgeneration/preprocessing/gtf_cache.py
@@ -121,7 +121,9 @@ def preprocessing(data: pr.PyRanges, **kwargs):
         df=extracted,
         fasta=genome,
         extend_borders=100,
-        use_full_seqs=kwargs["use_full_sequence"],
+        use_full_triplet=kwargs["use_full_triplet"],
+        use_model_resolution=kwargs["use_model_resolution"],
+        model = kwargs["model"]
     )
 
     if os.path.isdir(kwargs["outdir"]):
@@ -156,7 +158,10 @@ def write_output(
         Additional arguments in **kwargs:
             outdir (str): Output directory.
             outbasename (str): Output basename.
-            use_full_sequence (bool): Whether to use the full sequence when running the black box model.
+            use_full_triplet (bool): Whether to use the full exon triplet as the input
+                sequence when making model inferences.
+            use_model_resolution (bool): Whether to use the model resolution to determine
+                the input sequence size when making model inferences.
     """
 
     to_write = {
@@ -193,8 +198,13 @@ def write_output(
                     ]
                 ],
             )
+
+    out_flag = ''
+    if kwargs['use_full_triplet']:
+        out_flag = "_full_triplet"
+    elif kwargs['use_model_resolution']:
+        out_flag = "_model_res"
 
-    out_flag = "" if kwargs["use_full_sequence"] else "_trimmed_at_5000bp"
     if len(extracted_with_seqs) > 0:
         extracted_with_seqs[
             ["header", "acceptor_idx", "donor_idx", "tx_id", "exon"]