Skip to content

Commit

Permalink
adjust dge ratio description #17, refactor write params
Browse files Browse the repository at this point in the history
  • Loading branch information
tensulin committed Nov 15, 2024
1 parent 15914f3 commit 6a5466f
Show file tree
Hide file tree
Showing 3 changed files with 36 additions and 50 deletions.
11 changes: 5 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,6 @@ git lfs install

If you already cloned the repo, remove it, install git-lfs and clone again.


#### Install g++ (Optional, for performance)

```
Expand Down Expand Up @@ -173,15 +172,15 @@ marbel
### Specifying Number of Species, Orthogroups, and Samples

```sh
marbel --n-species 30 --n-orthogroups 1500 --n-samples 15 20
marbel --n-species 10 --n-orthogroups 500 --n-samples 5 8
```

This command will generate a dataset with:

- 30 species
- 1500 orthologous groups
- 15 samples for group 1
- 20 samples for group 2
- 10 species
- 500 orthologous groups
- 5 samples for group 1
- 8 samples for group 2

## Contributing

Expand Down
57 changes: 22 additions & 35 deletions src/marbel/data_generations.py
Original file line number Diff line number Diff line change
Expand Up @@ -348,8 +348,8 @@ def write_as_fastq(fa_path, fq_path):
SeqIO.write(record, fastq, "fastq")


def summarize_parameters(number_of_orthogous_groups, number_of_species, number_of_sample, outdir, max_phylo_distance,
min_identity, deg_ratio, seed, output_format, read_length, library_size, library_distribution, library_sizes, result_file):
def write_parameter_summary(number_of_orthogous_groups, number_of_species, number_of_sample, outdir, max_phylo_distance,
min_identity, deg_ratio, seed, output_format, error_model, read_length, library_size, library_distribution, library_sizes, summary_dir):
"""
Writes the simulation parameters to marbel_params.txt inside summary_dir.
Expand All @@ -360,50 +360,37 @@ def summarize_parameters(number_of_orthogous_groups, number_of_species, number_o
outdir (str): The output directory.
max_phylo_distance (float): The maximum phylogenetic distance.
min_identity (float): The minimum sequence identity.
deg_ratio (tuple): The ratio of up and down regulated genes (up, down).
deg_ratio (float): The ratio of up and down regulated genes.
seed (int): The seed for the simulation.
compressed (bool): Compression of files.
read_length (int): The read length.
summary_dir (str): The directory in which marbel_params.txt is written.
"""
result_file.write(f"Number of orthogroups: {number_of_orthogous_groups}\n")
result_file.write(f"Number of species: {number_of_species}\n")
result_file.write(f"Number of samples: {number_of_sample}\n")
result_file.write(f"Output directory: {outdir}\n")
result_file.write(f"Max phylogenetic distance: {max_phylo_distance}\n")
result_file.write(f"Min identity: {min_identity}\n")
result_file.write(f"Up and down regulated genes: {deg_ratio}\n")
result_file.write(f"Seed: {seed}\n")
result_file.write(f"File compression: {output_format}\n")
result_file.write(f"Read length: {read_length}\n")
result_file.write(f"Library size: {library_size}\n")
result_file.write(f"Library size distribution: {library_distribution}\n")
result_file.write(f"Library sizes for samples: {library_sizes}\n")


def generate_report(number_of_orthogous_groups, number_of_species, number_of_sample,
outdir, max_phylo_distance, min_identity, deg_ratio, seed, compressed, gene_summary, read_length, library_size, library_distribution,
library_sizes):
with open(f"{summary_dir}/marbel_params.txt", "w") as result_file:
result_file.write(f"Number of orthogroups: {number_of_orthogous_groups}\n")
result_file.write(f"Number of species: {number_of_species}\n")
result_file.write(f"Number of samples: {number_of_sample}\n")
result_file.write(f"Output directory: {outdir}\n")
result_file.write(f"Max phylogenetic distance: {max_phylo_distance}\n")
result_file.write(f"Min identity: {min_identity}\n")
result_file.write(f"Ratio of up and down regulated genes: {deg_ratio}\n")
result_file.write(f"Seed: {seed}\n")
result_file.write(f"File compression: {output_format}\n")
result_file.write(f"Model used: {error_model}\n")
result_file.write(f"Read length: {read_length}\n")
result_file.write(f"Library size: {library_size}\n")
result_file.write(f"Library size distribution: {library_distribution}\n")
result_file.write(f"Library sizes for samples: {library_sizes}\n")


def generate_report(summary_dir, gene_summary):
"""
Generates a report of the simulation parameters.
Parameters:
number_of_orthogous_groups (int): The number of orthologous groups.
number_of_species (int): The number of species.
number_of_sample (tuple): The number of samples (group 1, group 2).
outdir (str): The output directory.
max_phylo_distance (float): The maximum phylogenetic distance.
min_identity (float): The minimum sequence identity.
deg_ratio (tuple): The ratio of up and down regulated genes (up, down).
seed (int): The seed for the simulation.
compressed (bool): Generate compressed output.
summary_dir (str): The output directory for the summary
gene_summary (pandas.DataFrame): The summary of genes.
read_length (int): The read length.
"""
summary_dir = f"{outdir}/summary"
with open(f"{summary_dir}/marbel_params.txt", "w") as f:
summarize_parameters(number_of_orthogous_groups, number_of_species, number_of_sample, outdir,
max_phylo_distance, min_identity, deg_ratio, seed, compressed, read_length, library_size, library_distribution, library_sizes, f)
gene_summary.to_csv(f"{summary_dir}/gene_summary.csv", index=False)
with open(f"{summary_dir}/species_tree.newick", "w") as f:
species_subtree = species_tree.copy()
Expand Down
18 changes: 9 additions & 9 deletions src/marbel/meta_tran_sim.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from marbel.presets import __version__, MAX_SPECIES, MAX_ORTHO_GROUPS, rank_distance, LibrarySizeDistribution, Rank, ErrorModel, DESEQ2_FITTED_A0, DESEQ2_FITTED_A1
from marbel.data_generations import draw_random_species, create_ortholgous_group_rates, filter_by_seq_id_and_phylo_dist, create_sample_values, create_fastq_samples, draw_library_sizes
from marbel.data_generations import draw_orthogroups_by_rate, draw_orthogroups, generate_species_abundance, generate_read_mean_counts, aggregate_gene_data, filter_genes_from_ground, generate_report
from marbel.data_generations import draw_dge_factors
from marbel.data_generations import draw_dge_factors, write_parameter_summary

app = typer.Typer()

Expand Down Expand Up @@ -52,10 +52,10 @@ def sample_callback(value: Optional[Tuple[int, int]]):


def dge_ratio_callback(value: float):
if value < 0 or value > 1:
if value < 0:
raise typer.BadParameter("Ratio cannot be negative")
if value > 1:
raise typer.BadParameter("DGE ratio must be smaller than 0.5")
if value >= 1:
raise typer.BadParameter("DGE ratio must be smaller than 1")
return value


Expand Down Expand Up @@ -89,10 +89,7 @@ def main(n_species: Annotated[int, typer.Option(callback=species_callback,
+ "with a more diverse phylogenetic distance.")] = None,
min_identity: Annotated[float, typer.Option(help="Minimum mean sequence identity score for an orthologous groups."
+ "Specify for more ")] = None,
dge_ratio: Annotated[float, typer.Option(callback=dge_ratio_callback,
help="Ratio of up and down regulated genes."
+ "The first value is the ratio of up regulated genes, the second represents the ratio of"
+ "down regulated genes")] = 0.1,
dge_ratio: Annotated[float, typer.Option(callback=dge_ratio_callback, help="Ratio of up and down regulated genes. Must be between 0 and 1")] = 0.1,
seed: Annotated[int, typer.Option(help="Seed for the sampling. Set for reproducibility")] = None,
error_model: Annotated[ErrorModel, typer.Option(help="Sequencer model for the reads, use basic or perfect (no errors) for custom read length")] = ErrorModel.HiSeq,
compressed: Annotated[bool, typer.Option(help="Compress the output fastq files")] = True,
Expand All @@ -107,7 +104,7 @@ def main(n_species: Annotated[int, typer.Option(callback=species_callback,
number_of_orthogous_groups = n_orthogroups
number_of_species = n_species
number_of_sample = n_samples

dge_ratio = dge_ratio / 2
# maybe change to synthetic species later on, for now just use the available species
# generate some plots so the user can see the distribution

Expand Down Expand Up @@ -146,6 +143,9 @@ def main(n_species: Annotated[int, typer.Option(callback=species_callback,
sample_library_sizes = draw_library_sizes(library_size, library_size_distribution, sum(number_of_sample))
gene_summary_df["gene_name"] = gene_summary_df["gene_name"].apply(lambda x: re.sub(r'##.*?##', '', x))
create_fastq_samples(gene_summary_df, outdir, compressed, error_model, seed, sample_library_sizes, read_length, threads)
write_parameter_summary(number_of_orthogous_groups, number_of_species, number_of_sample, outdir,
max_phylo_distance, min_identity, dge_ratio, seed, compressed, read_length, library_size, library_size_distribution, sample_library_sizes, summary_dir)

generate_report(number_of_orthogous_groups, number_of_species, number_of_sample, outdir,
max_phylo_distance, min_identity, dge_ratio, seed, compressed, gene_summary_df, read_length, library_size, library_size_distribution,
sample_library_sizes)
Expand Down

0 comments on commit 6a5466f

Please sign in to comment.