updated docs

KevinMenden · Mar 24, 2021 · 11678f0 · 11678f0
1 parent eb5e46c
commit 11678f0
Show file tree

Hide file tree

Showing 7 changed files with 66 additions and 4 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,14 @@
 # Scaden Changelog
 
+## Version 1.1.0
+
+* Reduced memory usage of `scaden simulate` significantly by performing simulation for one dataset at a time.
+* Using `.h5ad` format to store simulated data
+* Allow reading data in `.h5ad` format for improved performance (courtesy of @eboileau)
+* Improved logging and using rich progress bar for training
+* Gene subsetting is now done only when merging datasets, which makes it possible to generate different combinations
+of simulated datasets
+
 ### Version 1.0.2
 
 * General improvement of logging using the 'rich' library for colorized output

diff --git a/docs/blog.md b/docs/blog.md
@@ -4,3 +4,8 @@ that have been (or will be) implemented in Scaden.
 
 # Scaden v1.1.0 - Performance Improvements (21.03.2021)
 
+Scaden v1.1.0 brings significantly reduced memory consumption for the data simulation step, something that was asked for
+quite frequently. Now, instead of using about 4 GB of memory to simulate a small dataset, Scaden only uses 1 GB. This
+makes it possible to create datasets from large collections of scRNA-seq datasets without needing excessive memory. Furthermore,
+Scaden now stores the simulated data in `.h5ad` format with the full list of genes. This way you can simulate from a
+scRNA-seq dataset once and combine it with other datasets in the future.
diff --git a/docs/changelog.md b/docs/changelog.md
@@ -1,5 +1,14 @@
 # Scaden Changelog
 
+## Version 1.1.0
+
+* Reduced memory usage of `scaden simulate` significantly by performing simulation for one dataset at a time.
+* Using `.h5ad` format to store simulated data
+* Allow reading data in `.h5ad` format for improved performance (courtesy of @eboileau)
+* Improved logging and using rich progress bar for training
+* Gene subsetting is now done only when merging datasets, which makes it possible to generate different combinations
+of simulated datasets
+
 ### Version 1.0.2
 
 * General improvement of logging using the 'rich' library for colorized output

diff --git a/docs/usage.md b/docs/usage.md
@@ -126,7 +126,8 @@ This command will create the artificial samples in the current working directory
 For Scaden to work properly, your input files have to be correctly formatted. As long as you use Scaden's built-in functionality to generate the training data, you should have no problem
 with formatting there. The prediction file, however, you have to format yourself. This should be a file of shape m X n, where m are your features (genes) and n your samples. So each row corresponds to 
 a gene, and each column to a sample. Leave the column name for the genes empty (just put a `\t` there). This is a rather standard format to store gene expression tables, so you should not have much work ensuring that the
-format fits.
+format fits. Since version `v1.1.0` it is also possible to load data for simulation in `.h5ad` format for improved performance. In this case, the AnnData object should have
+a `Celltype` column in the `obs` field.
 
 Your data can either be raw counts or normalized, just make sure that they are not in logarithmic space already. When loading a prediction file, Scaden applies its scaling procedure to it, which involves taking the logarithm of your counts.
 So as long as they are not already in logarithmic space, Scaden will be able to handle both raw and normalized counts / expression values.

diff --git a/scaden/__main__.py b/scaden/__main__.py
@@ -11,7 +11,7 @@
 from scaden.process import processing
 from scaden.simulate import simulation
 from scaden.example import exampleData
-
+from scaden.merge import merge_datasets
 """
 
 author: Kevin Menden
@@ -33,6 +33,7 @@
 
 os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
 
+
 def main():
     text = """
      ____                _            
@@ -146,7 +147,7 @@ def predict(data_path, model_dir, outname, seed):
     "--var_cutoff",
     default=0.1,
     help="Filter out genes with a variance less than the specified cutoff. A low cutoff is recommended,"
-    "this should only remove genes that are obviously uninformative.",
+         "this should only remove genes that are obviously uninformative.",
 )
 def process(data_path, prediction_data, processed_path, var_cutoff):
     """ Process a dataset for training """
@@ -157,9 +158,12 @@ def process(data_path, prediction_data, processed_path, var_cutoff):
         var_cutoff=var_cutoff,
     )
 
+
 """
 Simulate dataset
 """
+
+
 @cli.command()
 @click.option("--out", "-o", default="./", help="Directory to store output files in")
 @click.option("--data", "-d", default=".", help="Path to scRNA-seq dataset(s)")
@@ -211,9 +215,25 @@ def simulate(out, data, cells, n_samples, pattern, unknown, prefix, data_format)
     )
 
 
+"""
+Merge simulated datasets
+"""
+
+
+@cli.command()
+@click.option("--data", "-d", default=".", help="Directory containing simulated datasets (in .h5ad format)")
+@click.option("--prefix", "-p", default="data", help="Prefix of output file [default: data]")
+@click.option("--files", "-f", default=None, help="Comma-separated list of filenames to merge")
+def merge(data, prefix, files):
+    """ Merge simulated datasets into one training dataset """
+    merge_datasets(data_dir=data, prefix=prefix, files=files)
+
+
 """
 Generate example data
 """
+
+
 @cli.command()
 @click.option("--cells", "-c", default=10, help="Number of cells [default: 10]")
 @click.option("--types", "-t", default=5, help="Number of cell types [default: 5]")

diff --git a/scaden/merge.py b/scaden/merge.py
@@ -0,0 +1,18 @@
+from scaden.simulation import BulkSimulator
+
+"""
+Merge simulated datasets
+"""
+
+
+def merge_datasets(data_dir, prefix, files=None):
+
+    bulk_simulator = BulkSimulator()
+
+    if files:
+        files = files.split(",")
+
+    # Merge the resulting datasets
+    bulk_simulator.merge_datasets(data_dir=data_dir,
+                                  files=files,
+                                  out_name=prefix + ".h5ad")
diff --git a/scaden/simulate.py b/scaden/simulate.py
@@ -22,6 +22,6 @@ def simulation(simulate_dir, data_dir, sample_size, num_samples, pattern,
     bulk_simulator.simulate()
 
     # Merge the resulting datasets
-    bulk_simulator.merge_datasets(data_dir=bulk_simulator.out_dir,
+    bulk_simulator.merge_datasets(data_dir=simulate_dir,
                                   files=bulk_simulator.dataset_files,
                                   out_name=out_prefix + ".h5ad")