adjust dge ratio description #17, refactor write params

jlab · Nov 15, 2024 · 6a5466f · 6a5466f
1 parent 15914f3
commit 6a5466f
Show file tree

Hide file tree

Showing 3 changed files with 36 additions and 50 deletions.
diff --git a/README.md b/README.md
@@ -46,7 +46,6 @@ git lfs install
 
 If you already cloned the repo, remove it, install git-lfs and clone again.
 
-
 #### Install g++ (Optional, for performance)
 
 ```
@@ -173,15 +172,15 @@ marbel
 ### Specifying Number of Species, Orthogroups, and Samples
 
 ```sh
-marbel --n-species 30 --n-orthogroups 1500 --n-samples 15 20
+marbel --n-species 10 --n-orthogroups 500 --n-samples 5 8
 ```
 
 This command will generate a dataset with:
 
-- 30 species
-- 1500 orthologous groups
-- 15 samples for group 1
-- 20 samples for group 2
+- 10 species
+- 500 orthologous groups
+- 5 samples for group 1
+- 8 samples for group 2
 
 ## Contributing
 

diff --git a/src/marbel/data_generations.py b/src/marbel/data_generations.py
@@ -348,8 +348,8 @@ def write_as_fastq(fa_path, fq_path):
             SeqIO.write(record, fastq, "fastq")
 
 
-def summarize_parameters(number_of_orthogous_groups, number_of_species, number_of_sample, outdir, max_phylo_distance,
-                         min_identity, deg_ratio, seed, output_format, read_length, library_size, library_distribution, library_sizes, result_file):
+def write_parameter_summary(number_of_orthogous_groups, number_of_species, number_of_sample, outdir, max_phylo_distance,
+                            min_identity, deg_ratio, seed, output_format, error_model, read_length, library_size, library_distribution, library_sizes, summary_dir):
     """
     Writes the simulation parameters to marbel_params.txt inside summary_dir.
 
@@ -360,50 +360,37 @@ def summarize_parameters(number_of_orthogous_groups, number_of_species, number_o
         outdir (str): The output directory.
         max_phylo_distance (float): The maximum phylogenetic distance.
         min_identity (float): The minimum sequence identity.
-        deg_ratio (tuple): The ratio of up and down regulated genes (up, down).
+        deg_ratio (float): The ratio of up and down regulated genes.
         seed (int): The seed for the simulation.
         compressed (bool): Compression of files.
         read_length (int): The read length.
         summary_dir (str): The directory in which marbel_params.txt is written.
     """
-    result_file.write(f"Number of orthogroups: {number_of_orthogous_groups}\n")
-    result_file.write(f"Number of species: {number_of_species}\n")
-    result_file.write(f"Number of samples: {number_of_sample}\n")
-    result_file.write(f"Output directory: {outdir}\n")
-    result_file.write(f"Max phylogenetic distance: {max_phylo_distance}\n")
-    result_file.write(f"Min identity: {min_identity}\n")
-    result_file.write(f"Up and down regulated genes: {deg_ratio}\n")
-    result_file.write(f"Seed: {seed}\n")
-    result_file.write(f"File compression: {output_format}\n")
-    result_file.write(f"Read length: {read_length}\n")
-    result_file.write(f"Library size: {library_size}\n")
-    result_file.write(f"Library size distribution: {library_distribution}\n")
-    result_file.write(f"Library sizes for samples: {library_sizes}\n")
-
-
-def generate_report(number_of_orthogous_groups, number_of_species, number_of_sample,
-                    outdir, max_phylo_distance, min_identity, deg_ratio, seed, compressed, gene_summary, read_length, library_size, library_distribution,
-                    library_sizes):
+    with open(f"{summary_dir}/marbel_params.txt", "w") as result_file:
+        result_file.write(f"Number of orthogroups: {number_of_orthogous_groups}\n")
+        result_file.write(f"Number of species: {number_of_species}\n")
+        result_file.write(f"Number of samples: {number_of_sample}\n")
+        result_file.write(f"Output directory: {outdir}\n")
+        result_file.write(f"Max phylogenetic distance: {max_phylo_distance}\n")
+        result_file.write(f"Min identity: {min_identity}\n")
+        result_file.write(f"Ratio of up and down regulated genes: {deg_ratio}\n")
+        result_file.write(f"Seed: {seed}\n")
+        result_file.write(f"File compression: {output_format}\n")
+        result_file.write(f"Model used: {error_model}\n")
+        result_file.write(f"Read length: {read_length}\n")
+        result_file.write(f"Library size: {library_size}\n")
+        result_file.write(f"Library size distribution: {library_distribution}\n")
+        result_file.write(f"Library sizes for samples: {library_sizes}\n")
+
+
+def generate_report(summary_dir, gene_summary):
     """
     Writes the gene summary CSV and the species tree file to the summary directory.
 
     Parameters:
-        number_of_orthogous_groups (int): The number of orthologous groups.
-        number_of_species (int): The number of species.
-        number_of_sample (tuple): The number of samples (group 1, group 2).
-        outdir (str): The output directory.
-        max_phylo_distance (float): The maximum phylogenetic distance.
-        min_identity (float): The minimum sequence identity.
-        deg_ratio (tuple): The ratio of up and down regulated genes (up, down).
-        seed (int): The seed for the simulation.
-        compressed (bool): Generate compressed output.
+        summary_dir (str): The output directory for the summary
         gene_summary (pandas.DataFrame): The summary of genes.
-        read_length (int): The read length.
     """
-    summary_dir = f"{outdir}/summary"
-    with open(f"{summary_dir}/marbel_params.txt", "w") as f:
-        summarize_parameters(number_of_orthogous_groups, number_of_species, number_of_sample, outdir,
-                             max_phylo_distance, min_identity, deg_ratio, seed, compressed, read_length, library_size, library_distribution, library_sizes, f)
     gene_summary.to_csv(f"{summary_dir}/gene_summary.csv", index=False)
     with open(f"{summary_dir}/species_tree.newick", "w") as f:
         species_subtree = species_tree.copy()

diff --git a/src/marbel/meta_tran_sim.py b/src/marbel/meta_tran_sim.py
@@ -12,7 +12,7 @@
 from marbel.presets import __version__, MAX_SPECIES, MAX_ORTHO_GROUPS, rank_distance, LibrarySizeDistribution, Rank, ErrorModel, DESEQ2_FITTED_A0, DESEQ2_FITTED_A1
 from marbel.data_generations import draw_random_species, create_ortholgous_group_rates, filter_by_seq_id_and_phylo_dist, create_sample_values, create_fastq_samples, draw_library_sizes
 from marbel.data_generations import draw_orthogroups_by_rate, draw_orthogroups, generate_species_abundance, generate_read_mean_counts, aggregate_gene_data, filter_genes_from_ground, generate_report
-from marbel.data_generations import draw_dge_factors
+from marbel.data_generations import draw_dge_factors, write_parameter_summary
 
 app = typer.Typer()
 
@@ -52,10 +52,10 @@ def sample_callback(value: Optional[Tuple[int, int]]):
 
 
 def dge_ratio_callback(value: float):
-    if value < 0 or value > 1:
+    if value < 0:
         raise typer.BadParameter("Ratio cannot be negative")
-    if value > 1:
-        raise typer.BadParameter("DGE ratio must be smaller than 0.5")
+    if value >= 1:
+        raise typer.BadParameter("DGE ratio must be smaller than 1")
     return value
 
 
@@ -89,10 +89,7 @@ def main(n_species: Annotated[int, typer.Option(callback=species_callback,
                                                           + "with a more diverse phylogenetic distance.")] = None,
          min_identity: Annotated[float, typer.Option(help="Minimum mean sequence identity score for an orthologous groups."
                                                           + "Specify for more ")] = None,
-         dge_ratio: Annotated[float, typer.Option(callback=dge_ratio_callback,
-                                                  help="Ratio of up and down regulated genes."
-                                                  + "The first value is the ratio of up regulated genes, the second represents the ratio of"
-                                                  + "down regulated genes")] = 0.1,
+         dge_ratio: Annotated[float, typer.Option(callback=dge_ratio_callback, help="Ratio of up and down regulated genes. Must be between 0 and 1")] = 0.1,
          seed: Annotated[int, typer.Option(help="Seed for the sampling. Set for reproducibility")] = None,
          error_model: Annotated[ErrorModel, typer.Option(help="Sequencer model for the reads, use basic or perfect (no errors) for custom read length")] = ErrorModel.HiSeq,
          compressed: Annotated[bool, typer.Option(help="Compress the output fastq files")] = True,
@@ -107,7 +104,7 @@ def main(n_species: Annotated[int, typer.Option(callback=species_callback,
     number_of_orthogous_groups = n_orthogroups
     number_of_species = n_species
     number_of_sample = n_samples
-
+    dge_ratio = dge_ratio / 2
     # maybe change to synthetic species later on, for now just use the available species
     # generate some plots so the user can see the distribution
 
@@ -146,6 +143,9 @@ def main(n_species: Annotated[int, typer.Option(callback=species_callback,
     sample_library_sizes = draw_library_sizes(library_size, library_size_distribution, sum(number_of_sample))
     gene_summary_df["gene_name"] = gene_summary_df["gene_name"].apply(lambda x: re.sub(r'##.*?##', '', x))
     create_fastq_samples(gene_summary_df, outdir, compressed, error_model, seed, sample_library_sizes, read_length, threads)
+    write_parameter_summary(number_of_orthogous_groups, number_of_species, number_of_sample, outdir,
+                            max_phylo_distance, min_identity, dge_ratio, seed, compressed, read_length, library_size, library_size_distribution, sample_library_sizes, summary_dir)
+
     generate_report(number_of_orthogous_groups, number_of_species, number_of_sample, outdir,
                     max_phylo_distance, min_identity, dge_ratio, seed, compressed, gene_summary_df, read_length, library_size, library_size_distribution,
                     sample_library_sizes)