Merge pull request #17 from vib-singlecell-nf/pyscenic_0.10.0_updates
pySCENIC 0.10.0 updates
cflerin authored Feb 28, 2020
2 parents ab4339c + 061a07f commit 648edf7
Showing 11 changed files with 142 additions and 177 deletions.
130 changes: 130 additions & 0 deletions bin/arboreto_with_multiprocessing.py
@@ -0,0 +1,130 @@
#!/usr/bin/env python3

import sys
import time
import loompy as lp
import pandas as pd
from multiprocessing import Pool, cpu_count
import argparse
import tqdm

from arboreto.utils import load_tf_names
from arboreto.algo import genie3, grnboost2, _prepare_input
from arboreto.core import SGBM_KWARGS, RF_KWARGS, EARLY_STOP_WINDOW_LENGTH
from arboreto.core import to_tf_matrix, target_gene_indices, infer_partial_network

from pyscenic.cli.utils import load_exp_matrix


################################################################################
################################################################################

parser_grn = argparse.ArgumentParser(description='Run Arboreto using a multiprocessing pool')

parser_grn.add_argument('expression_mtx_fname',
                        type=argparse.FileType('r'),
                        help='The name of the file that contains the expression matrix for the single cell experiment.'
                             ' Two file formats are supported: csv (rows=cells x columns=genes) or loom (rows=genes x columns=cells).')
parser_grn.add_argument('tfs_fname',
                        type=argparse.FileType('r'),
                        help='The name of the file that contains the list of transcription factors (TXT; one TF per line).')
parser_grn.add_argument('-m', '--method', choices=['genie3', 'grnboost2'],
                        default='grnboost2',
                        help='The algorithm for gene regulatory network reconstruction (default: grnboost2).')
parser_grn.add_argument('-o', '--output',
                        type=argparse.FileType('w'), default=sys.stdout,
                        help='Output file/stream, i.e. a table of TF-target genes (TSV).')
parser_grn.add_argument('--num_workers',
                        type=int, default=cpu_count(),
                        help='The number of workers to use. (default: {}).'.format(cpu_count()))
parser_grn.add_argument('--seed', type=int, required=False, default=None,
                        help='Seed value for regressor random state initialization (optional)')

parser_grn.add_argument('--cell_id_attribute',
                        type=str, default='CellID',
                        help='The name of the column attribute that specifies the identifiers of the cells in the loom file.')
parser_grn.add_argument('--gene_attribute',
                        type=str, default='Gene',
                        help='The name of the row attribute that specifies the gene symbols in the loom file.')
parser_grn.add_argument('--sparse', action='store_const', const=True, default=False,
                        help='If set, load the expression data as a sparse (CSC) matrix.')
parser_grn.add_argument('-t', '--transpose', action='store_const', const='yes',
                        help='Transpose the expression matrix (rows=genes x columns=cells).')

args = parser_grn.parse_args()


################################################################################
################################################################################


if args.method == 'grnboost2':
    method_params = [
        'GBM',        # regressor_type
        SGBM_KWARGS   # regressor_kwargs
    ]
elif args.method == 'genie3':
    method_params = [
        'RF',         # regressor_type
        RF_KWARGS     # regressor_kwargs
    ]


def run_infer_partial_network(target_gene_index):
    target_gene_name = gene_names[target_gene_index]
    target_gene_expression = ex_matrix[:, target_gene_index]

    n = infer_partial_network(
        regressor_type=method_params[0],
        regressor_kwargs=method_params[1],
        tf_matrix=tf_matrix,
        tf_matrix_gene_names=tf_matrix_gene_names,
        target_gene_name=target_gene_name,
        target_gene_expression=target_gene_expression,
        include_meta=False,
        early_stop_window_length=EARLY_STOP_WINDOW_LENGTH,
        seed=args.seed)
    return n


if __name__ == '__main__':

    start_time = time.time()
    ex_matrix = load_exp_matrix(args.expression_mtx_fname.name,
                                (args.transpose == 'yes'),
                                args.sparse,
                                args.cell_id_attribute,
                                args.gene_attribute)

    if args.sparse:
        gene_names = ex_matrix[1]
        ex_matrix = ex_matrix[0]
    else:
        gene_names = ex_matrix.columns

    end_time = time.time()
    print(f'Loaded expression matrix of {ex_matrix.shape[0]} cells and {ex_matrix.shape[1]} genes in {end_time - start_time} seconds...', file=sys.stdout)

    tf_names = load_tf_names(args.tfs_fname.name)
    print(f'Loaded {len(tf_names)} TFs...', file=sys.stdout)

    ex_matrix, gene_names, tf_names = _prepare_input(ex_matrix, gene_names, tf_names)
    tf_matrix, tf_matrix_gene_names = to_tf_matrix(ex_matrix, gene_names, tf_names)

    print(f'starting {args.method} using {args.num_workers} processes...', file=sys.stdout)
    start_time = time.time()

    with Pool(args.num_workers) as p:
        adjs = list(tqdm.tqdm(p.imap(run_infer_partial_network,
                                     target_gene_indices(gene_names, target_genes='all'),
                                     chunksize=1),
                              total=len(gene_names)))

    adj = pd.concat(adjs).sort_values(by='importance', ascending=False)

    end_time = time.time()
    print(f'Done in {end_time - start_time} seconds.', file=sys.stdout)

    adj.to_csv(args.output, index=False, sep="\t")
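
For reference, a minimal command line for the new script could look like the following; the file names and parameter values here are illustrative placeholders, not files from this commit, and the flags match the argparse options defined above:

    arboreto_with_multiprocessing.py \
        expression_matrix.loom \
        allTFs.txt \
        --method grnboost2 \
        --output adjacencies.tsv \
        --num_workers 20 \
        --seed 777

The script parallelizes the per-target-gene regressions with a plain multiprocessing.Pool, so it avoids arboreto's Dask-based scheduler while still producing the same TF-target adjacency table.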

133 changes: 0 additions & 133 deletions bin/grnboost2_without_dask.py

This file was deleted.

1 change: 0 additions & 1 deletion conf/append.config
@@ -2,7 +2,6 @@
params {
    sc {
        scenic {
-            container = 'vibsinglecellnf/scenic:0.9.19'
            report_ipynb = '/src/scenic/bin/reports/scenic_report.ipynb'
            existingScenicLoom = ''
            sampleSuffixWithExtension = '' // Suffix after the sample name in the file path
2 changes: 1 addition & 1 deletion conf/min/base/v0.0.1.config
@@ -13,7 +13,7 @@ params {
    sc {
        scenic {
            // Container settings
-            container = 'aertslab/sctx-pyscenic:0.9.19'
+            container = 'aertslab/pyscenic:0.10.0'

            // Reporting settings
            reports {
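
If the pipeline is run with Docker or Singularity, the updated image referenced above can be pulled ahead of time, for example (standard Docker/Singularity pull commands, not part of this diff):

    docker pull aertslab/pyscenic:0.10.0
    # or, for Singularity:
    singularity pull docker://aertslab/pyscenic:0.10.0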
1 change: 0 additions & 1 deletion conf/multi_runs.config
@@ -1,7 +1,6 @@
params {
    sc {
        scenic {
-            container = 'dweemx/sctx-pyscenic:0.9.19-6'
            numRuns = 2
            // AUCell parameters
            aucell {
1 change: 0 additions & 1 deletion conf/test_multi_runs.config
@@ -1,7 +1,6 @@
params {
    sc {
        scenic {
-            container = 'dweemx/sctx-pyscenic:0.9.19-6'
            numRuns = 2
            // AUCell parameters
            aucell {
6 changes: 3 additions & 3 deletions main.nf
@@ -35,7 +35,7 @@ if(!isAppendOnlyMode && !(params.global.genome.assembly in ['dm6','hg38']))

include './../channels/file.nf' params(params)

-include GRNBOOST2_WITHOUT_DASK from './processes/grnboost2withoutDask' params(params)
+include ARBORETO_WITH_MULTIPROCESSING from './processes/arboreto_with_multiprocessing' params(params)
include CISTARGET as CISTARGET__MOTIF from './processes/cistarget' params(params)
include CISTARGET as CISTARGET__TRACK from './processes/cistarget' params(params)
include AUCELL as AUCELL__MOTIF from './processes/aucell' params(params)
@@ -73,7 +73,7 @@ workflow scenic {
    main:
        /* GRN */
        tfs = file(params.sc.scenic.grn.tfs)
-        grn = GRNBOOST2_WITHOUT_DASK( filteredLoom.combine(runs), tfs )
+        grn = ARBORETO_WITH_MULTIPROCESSING( filteredLoom.combine(runs), tfs )

        /* cisTarget motif analysis */
        // channel for SCENIC databases resources:
@@ -191,4 +191,4 @@ workflow {
        throw new Exception("The given filteredLoom required parameter does not exist in the params.sc.scenic scope.")
    scenic( Channel.of( tuple("foobar", file(params.sc.scenic.filteredLoom)) ) )

-}
+}
32 changes: 0 additions & 32 deletions multi_runs.Dockerfile

This file was deleted.

@@ -5,13 +5,13 @@ binDir = !params.containsKey("test") ? "${workflow.projectDir}/src/scenic/bin/"
toolParams = params.sc.scenic
processParams = params.sc.scenic.grn

-process GRNBOOST2_WITHOUT_DASK {
+process ARBORETO_WITH_MULTIPROCESSING {

    // Process will be submitted as job if toolParams.labels.processExecutor = 'qsub' (default)
    label "${processParams.labels ? processParams.labels.processExecutor : "local"}"
    cache 'deep'
    container toolParams.container
-    publishDir "${toolParams.scenicoutdir}/${sampleId}/grnboost2withoutDask/${"numRuns" in toolParams && toolParams.numRuns > 1 ? "run_" + runId : ""}", mode: 'link', overwrite: true
+    publishDir "${toolParams.scenicoutdir}/${sampleId}/arboreto_with_multiprocessing/${"numRuns" in toolParams && toolParams.numRuns > 1 ? "run_" + runId : ""}", mode: 'link', overwrite: true
    clusterOptions "-l nodes=1:ppn=${processParams.numWorkers} -l pmem=${processParams.pmem} -l walltime=${processParams.walltime} -A ${params.global.qsubaccount}"
    maxForks processParams.maxForks

@@ -40,13 +40,14 @@ process GRNBOOST2_WITHOUT_DASK {
    outputFileName = "numRuns" in toolParams && toolParams.numRuns > 1 ? sampleId + "__run_" + runId +"__adj.tsv" : sampleId + "__adj.tsv"
    seed = "numRuns" in toolParams && toolParams.numRuns > 1 ? (params.seed + runId) : params.seed
    """
-    ${binDir}grnboost2_without_dask.py \
+    ${binDir}arboreto_with_multiprocessing.py \
        $filteredLoom \
        $tfs \
        --output ${outputFileName} \
        --num_workers ${processParams.numWorkers} \
        --cell_id_attribute ${toolParams.cell_id_attribute} \
        --gene_attribute ${toolParams.gene_attribute} \
+        --method ${processParams.algorithm} \
        --seed ${seed}
    """

