Merge pull request #17 from vib-singlecell-nf/pyscenic_0.10.0_updates
pySCENIC 0.10.0 updates
cflerin authored Feb 28, 2020
2 parents ab4339c + 061a07f commit 648edf7
Showing 11 changed files with 142 additions and 177 deletions.
130 changes: 130 additions & 0 deletions bin/arboreto_with_multiprocessing.py
@@ -0,0 +1,130 @@
#!/usr/bin/env python3

import sys
import time
import loompy as lp
import pandas as pd
from multiprocessing import Pool, cpu_count
import argparse
import tqdm

from arboreto.utils import load_tf_names
from arboreto.algo import genie3, grnboost2, _prepare_input
from arboreto.core import SGBM_KWARGS, RF_KWARGS, EARLY_STOP_WINDOW_LENGTH
from arboreto.core import to_tf_matrix, target_gene_indices, infer_partial_network

from pyscenic.cli.utils import load_exp_matrix


################################################################################
################################################################################

parser_grn = argparse.ArgumentParser(description='Run Arboreto using a multiprocessing pool')

parser_grn.add_argument('expression_mtx_fname',
                        type=argparse.FileType('r'),
                        help='The name of the file that contains the expression matrix for the single cell experiment.'
                             ' Two file formats are supported: csv (rows=cells x columns=genes) or loom (rows=genes x columns=cells).')
parser_grn.add_argument('tfs_fname',
                        type=argparse.FileType('r'),
                        help='The name of the file that contains the list of transcription factors (TXT; one TF per line).')
parser_grn.add_argument('-m', '--method', choices=['genie3', 'grnboost2'],
                        default='grnboost2',
                        help='The algorithm for gene regulatory network reconstruction (default: grnboost2).')
parser_grn.add_argument('-o', '--output',
                        type=argparse.FileType('w'), default=sys.stdout,
                        help='Output file/stream, i.e. a table of TF-target genes (TSV).')
parser_grn.add_argument('--num_workers',
                        type=int, default=cpu_count(),
                        help='The number of workers to use. (default: {}).'.format(cpu_count()))
parser_grn.add_argument('--seed', type=int, required=False, default=None,
                        help='Seed value for regressor random state initialization (optional)')

parser_grn.add_argument('--cell_id_attribute',
                        type=str, default='CellID',
                        help='The name of the column attribute that specifies the identifiers of the cells in the loom file.')
parser_grn.add_argument('--gene_attribute',
                        type=str, default='Gene',
                        help='The name of the row attribute that specifies the gene symbols in the loom file.')
parser_grn.add_argument('--sparse', action='store_const', const=True, default=False,
                        help='If set, load the expression data as a sparse (CSC) matrix.')
parser_grn.add_argument('-t', '--transpose', action='store_const', const='yes',
                        help='Transpose the expression matrix (rows=genes x columns=cells).')

args = parser_grn.parse_args()


################################################################################
################################################################################


if args.method == 'grnboost2':
    method_params = [
        'GBM',        # regressor_type
        SGBM_KWARGS   # regressor_kwargs
    ]
elif args.method == 'genie3':
    method_params = [
        'RF',         # regressor_type
        RF_KWARGS     # regressor_kwargs
    ]


def run_infer_partial_network(target_gene_index):
    target_gene_name = gene_names[target_gene_index]
    target_gene_expression = ex_matrix[:, target_gene_index]

    n = infer_partial_network(
        regressor_type=method_params[0],
        regressor_kwargs=method_params[1],
        tf_matrix=tf_matrix,
        tf_matrix_gene_names=tf_matrix_gene_names,
        target_gene_name=target_gene_name,
        target_gene_expression=target_gene_expression,
        include_meta=False,
        early_stop_window_length=EARLY_STOP_WINDOW_LENGTH,
        seed=args.seed)
    return n


if __name__ == '__main__':

    start_time = time.time()
    ex_matrix = load_exp_matrix(args.expression_mtx_fname.name,
                                (args.transpose == 'yes'),
                                args.sparse,
                                args.cell_id_attribute,
                                args.gene_attribute)

    if args.sparse:
        gene_names = ex_matrix[1]
        ex_matrix = ex_matrix[0]
    else:
        gene_names = ex_matrix.columns

    end_time = time.time()
    print(f'Loaded expression matrix of {ex_matrix.shape[0]} cells and {ex_matrix.shape[1]} genes in {end_time - start_time} seconds...', file=sys.stdout)

    tf_names = load_tf_names(args.tfs_fname.name)
    print(f'Loaded {len(tf_names)} TFs...', file=sys.stdout)

    ex_matrix, gene_names, tf_names = _prepare_input(ex_matrix, gene_names, tf_names)
    tf_matrix, tf_matrix_gene_names = to_tf_matrix(ex_matrix, gene_names, tf_names)

    print(f'starting {args.method} using {args.num_workers} processes...', file=sys.stdout)
    start_time = time.time()

    with Pool(args.num_workers) as p:
        adjs = list(tqdm.tqdm(p.imap(run_infer_partial_network,
                                     target_gene_indices(gene_names, target_genes='all'),
                                     chunksize=1),
                              total=len(gene_names)))

    adj = pd.concat(adjs).sort_values(by='importance', ascending=False)

    end_time = time.time()
    print(f'Done in {end_time - start_time} seconds.', file=sys.stdout)

    adj.to_csv(args.output, index=False, sep="\t")
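
For reference, a minimal command line for the new script could look like the following; the file names and parameter values here are illustrative placeholders, not files from this commit, and the flags match the argparse options defined above:

    arboreto_with_multiprocessing.py \
        expression_matrix.loom \
        allTFs.txt \
        --method grnboost2 \
        --output adjacencies.tsv \
        --num_workers 20 \
        --seed 777

The script parallelizes the per-target-gene regressions with a plain multiprocessing.Pool, so it avoids arboreto's Dask-based scheduler while still producing the same TF-target adjacency table.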

133 changes: 0 additions & 133 deletions bin/grnboost2_without_dask.py

This file was deleted.

1 change: 0 additions & 1 deletion conf/append.config
@@ -2,7 +2,6 @@
params {
    sc {
        scenic {
-            container = 'vibsinglecellnf/scenic:0.9.19'
            report_ipynb = '/src/scenic/bin/reports/scenic_report.ipynb'
            existingScenicLoom = ''
            sampleSuffixWithExtension = '' // Suffix after the sample name in the file path
2 changes: 1 addition & 1 deletion conf/min/base/v0.0.1.config
@@ -13,7 +13,7 @@ params {
    sc {
        scenic {
            // Container settings
-            container = 'aertslab/sctx-pyscenic:0.9.19'
+            container = 'aertslab/pyscenic:0.10.0'

            // Reporting settings
            reports {
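
If the pipeline is run with Docker or Singularity, the updated image referenced above can be pulled ahead of time, for example (standard Docker/Singularity pull commands, not part of this diff):

    docker pull aertslab/pyscenic:0.10.0
    # or, for Singularity:
    singularity pull docker://aertslab/pyscenic:0.10.0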
1 change: 0 additions & 1 deletion conf/multi_runs.config
@@ -1,7 +1,6 @@
params {
    sc {
        scenic {
-            container = 'dweemx/sctx-pyscenic:0.9.19-6'
            numRuns = 2
            // AUCell parameters
            aucell {
1 change: 0 additions & 1 deletion conf/test_multi_runs.config
@@ -1,7 +1,6 @@
params {
    sc {
        scenic {
-            container = 'dweemx/sctx-pyscenic:0.9.19-6'
            numRuns = 2
            // AUCell parameters
            aucell {
6 changes: 3 additions & 3 deletions main.nf
@@ -35,7 +35,7 @@ if(!isAppendOnlyMode && !(params.global.genome.assembly in ['dm6','hg38']))

include './../channels/file.nf' params(params)

-include GRNBOOST2_WITHOUT_DASK from './processes/grnboost2withoutDask' params(params)
+include ARBORETO_WITH_MULTIPROCESSING from './processes/arboreto_with_multiprocessing' params(params)
include CISTARGET as CISTARGET__MOTIF from './processes/cistarget' params(params)
include CISTARGET as CISTARGET__TRACK from './processes/cistarget' params(params)
include AUCELL as AUCELL__MOTIF from './processes/aucell' params(params)
@@ -73,7 +73,7 @@ workflow scenic {
    main:
        /* GRN */
        tfs = file(params.sc.scenic.grn.tfs)
-        grn = GRNBOOST2_WITHOUT_DASK( filteredLoom.combine(runs), tfs )
+        grn = ARBORETO_WITH_MULTIPROCESSING( filteredLoom.combine(runs), tfs )

        /* cisTarget motif analysis */
        // channel for SCENIC databases resources:
@@ -191,4 +191,4 @@ workflow {
        throw new Exception("The given filteredLoom required parameter does not exist in the params.sc.scenic scope.")
    scenic( Channel.of( tuple("foobar", file(params.sc.scenic.filteredLoom)) ) )

-}
+}
32 changes: 0 additions & 32 deletions multi_runs.Dockerfile

This file was deleted.

@@ -5,13 +5,13 @@ binDir = !params.containsKey("test") ? "${workflow.projectDir}/src/scenic/bin/"
toolParams = params.sc.scenic
processParams = params.sc.scenic.grn

-process GRNBOOST2_WITHOUT_DASK {
+process ARBORETO_WITH_MULTIPROCESSING {

    // Process will be submitted as job if toolParams.labels.processExecutor = 'qsub' (default)
    label "${processParams.labels ? processParams.labels.processExecutor : "local"}"
    cache 'deep'
    container toolParams.container
-    publishDir "${toolParams.scenicoutdir}/${sampleId}/grnboost2withoutDask/${"numRuns" in toolParams && toolParams.numRuns > 1 ? "run_" + runId : ""}", mode: 'link', overwrite: true
+    publishDir "${toolParams.scenicoutdir}/${sampleId}/arboreto_with_multiprocessing/${"numRuns" in toolParams && toolParams.numRuns > 1 ? "run_" + runId : ""}", mode: 'link', overwrite: true
    clusterOptions "-l nodes=1:ppn=${processParams.numWorkers} -l pmem=${processParams.pmem} -l walltime=${processParams.walltime} -A ${params.global.qsubaccount}"
    maxForks processParams.maxForks

@@ -40,13 +40,14 @@ process GRNBOOST2_WITHOUT_DASK {
    outputFileName = "numRuns" in toolParams && toolParams.numRuns > 1 ? sampleId + "__run_" + runId +"__adj.tsv" : sampleId + "__adj.tsv"
    seed = "numRuns" in toolParams && toolParams.numRuns > 1 ? (params.seed + runId) : params.seed
    """
-    ${binDir}grnboost2_without_dask.py \
+    ${binDir}arboreto_with_multiprocessing.py \
        $filteredLoom \
        $tfs \
        --output ${outputFileName} \
        --num_workers ${processParams.numWorkers} \
        --cell_id_attribute ${toolParams.cell_id_attribute} \
        --gene_attribute ${toolParams.gene_attribute} \
+        --method ${processParams.algorithm} \
        --seed ${seed}
    """

