This repository has been archived by the owner on Dec 8, 2020. It is now read-only.

pySCENIC 0.10.0 updates #17

Merged 4 commits on Feb 28, 2020
130 changes: 130 additions & 0 deletions bin/arboreto_with_multiprocessing.py
@@ -0,0 +1,130 @@
#!/usr/bin/env python3

import sys
import time
import loompy as lp
import pandas as pd
from multiprocessing import Pool, cpu_count
import argparse
import tqdm

from arboreto.utils import load_tf_names
from arboreto.algo import genie3, grnboost2, _prepare_input
from arboreto.core import SGBM_KWARGS, RF_KWARGS, EARLY_STOP_WINDOW_LENGTH
from arboreto.core import to_tf_matrix, target_gene_indices, infer_partial_network

from pyscenic.cli.utils import load_exp_matrix


################################################################################
################################################################################

parser_grn = argparse.ArgumentParser(description='Run Arboreto using a multiprocessing pool')

parser_grn.add_argument('expression_mtx_fname',
                        type=argparse.FileType('r'),
                        help='The name of the file that contains the expression matrix for the single cell experiment.'
                             ' Two file formats are supported: csv (rows=cells x columns=genes) or loom (rows=genes x columns=cells).')
parser_grn.add_argument('tfs_fname',
                        type=argparse.FileType('r'),
                        help='The name of the file that contains the list of transcription factors (TXT; one TF per line).')
parser_grn.add_argument('-m', '--method', choices=['genie3', 'grnboost2'],
                        default='grnboost2',
                        help='The algorithm for gene regulatory network reconstruction (default: grnboost2).')
parser_grn.add_argument('-o', '--output',
                        type=argparse.FileType('w'), default=sys.stdout,
                        help='Output file/stream, i.e. a table of TF-target genes (TSV).')
parser_grn.add_argument('--num_workers',
                        type=int, default=cpu_count(),
                        help='The number of workers to use (default: {}).'.format(cpu_count()))
parser_grn.add_argument('--seed', type=int, required=False, default=None,
                        help='Seed value for regressor random state initialization (optional).')

parser_grn.add_argument('--cell_id_attribute',
                        type=str, default='CellID',
                        help='The name of the column attribute that specifies the identifiers of the cells in the loom file.')
parser_grn.add_argument('--gene_attribute',
                        type=str, default='Gene',
                        help='The name of the row attribute that specifies the gene symbols in the loom file.')
parser_grn.add_argument('--sparse', action='store_const', const=True, default=False,
                        help='If set, load the expression data as a sparse (CSC) matrix.')
parser_grn.add_argument('-t', '--transpose', action='store_const', const='yes',
                        help='Transpose the expression matrix (rows=genes x columns=cells).')
args = parser_grn.parse_args()


################################################################################
################################################################################


if args.method == 'grnboost2':
    method_params = [
        'GBM',        # regressor_type
        SGBM_KWARGS   # regressor_kwargs
    ]
elif args.method == 'genie3':
    method_params = [
        'RF',         # regressor_type
        RF_KWARGS     # regressor_kwargs
    ]


def run_infer_partial_network(target_gene_index):
    # Regress a single target gene on the TF expression matrix. The TF matrix,
    # gene names and parsed arguments are module-level globals set in the
    # __main__ block and inherited by the pool workers on fork.
    target_gene_name = gene_names[target_gene_index]
    target_gene_expression = ex_matrix[:, target_gene_index]

    n = infer_partial_network(
        regressor_type=method_params[0],
        regressor_kwargs=method_params[1],
        tf_matrix=tf_matrix,
        tf_matrix_gene_names=tf_matrix_gene_names,
        target_gene_name=target_gene_name,
        target_gene_expression=target_gene_expression,
        include_meta=False,
        early_stop_window_length=EARLY_STOP_WINDOW_LENGTH,
        seed=args.seed)
    return n


if __name__ == '__main__':

    start_time = time.time()
    ex_matrix = load_exp_matrix(args.expression_mtx_fname.name,
                                (args.transpose == 'yes'),
                                args.sparse,
                                args.cell_id_attribute,
                                args.gene_attribute)

    if args.sparse:
        gene_names = ex_matrix[1]
        ex_matrix = ex_matrix[0]
    else:
        gene_names = ex_matrix.columns

    end_time = time.time()
    print(f'Loaded expression matrix of {ex_matrix.shape[0]} cells and {ex_matrix.shape[1]} genes in {end_time - start_time} seconds...', file=sys.stdout)

    tf_names = load_tf_names(args.tfs_fname.name)
    print(f'Loaded {len(tf_names)} TFs...', file=sys.stdout)

    ex_matrix, gene_names, tf_names = _prepare_input(ex_matrix, gene_names, tf_names)
    tf_matrix, tf_matrix_gene_names = to_tf_matrix(ex_matrix, gene_names, tf_names)

    print(f'starting {args.method} using {args.num_workers} processes...', file=sys.stdout)
    start_time = time.time()

    with Pool(args.num_workers) as p:
        adjs = list(tqdm.tqdm(p.imap(run_infer_partial_network,
                                     target_gene_indices(gene_names, target_genes='all'),
                                     chunksize=1),
                              total=len(gene_names)))

    adj = pd.concat(adjs).sort_values(by='importance', ascending=False)

    end_time = time.time()
    print(f'Done in {end_time - start_time} seconds.', file=sys.stdout)

    adj.to_csv(args.output, index=False, sep="\t")
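
For context, the script above drives arboreto's per-target-gene regressions through a plain multiprocessing pool instead of the Dask scheduler that arboreto uses by default, and it exposes GENIE3 alongside GRNBoost2 via --method (the deleted grnboost2_without_dask.py, as its name suggests, covered GRNBoost2 only). For comparison, here is a minimal sketch of the Dask-backed equivalent using arboreto's public API; the file names (expression.csv, tfs.txt, adj.tsv) are placeholders and the keyword names follow arboreto 0.1.x:

import pandas as pd

from arboreto.algo import grnboost2
from arboreto.utils import load_tf_names

# Placeholder inputs: a cells x genes CSV and a plain-text TF list (one symbol per line).
ex_matrix = pd.read_csv('expression.csv', index_col=0)
tf_names = load_tf_names('tfs.txt')

# Dask-backed GRN inference; arboreto starts a local Dask cluster internally.
adjacencies = grnboost2(expression_data=ex_matrix, tf_names=tf_names, seed=777)

# Same output shape as the script above: a TF / target / importance table.
adjacencies.to_csv('adj.tsv', sep='\t', index=False)

Both routes ultimately call infer_partial_network with the same regressor settings; the pool-based version simply avoids standing up a Dask cluster inside the pipeline container.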

133 changes: 0 additions & 133 deletions bin/grnboost2_without_dask.py

This file was deleted.

1 change: 0 additions & 1 deletion conf/append.config
@@ -2,7 +2,6 @@
params {
    sc {
        scenic {
-            container = 'vibsinglecellnf/scenic:0.9.19'
            report_ipynb = '/src/scenic/bin/reports/scenic_report.ipynb'
            existingScenicLoom = ''
            sampleSuffixWithExtension = '' // Suffix after the sample name in the file path
2 changes: 1 addition & 1 deletion conf/min/base/v0.0.1.config
@@ -13,7 +13,7 @@ params {
    sc {
        scenic {
            // Container settings
-            container = 'aertslab/sctx-pyscenic:0.9.19'
+            container = 'aertslab/pyscenic:0.10.0'

            // Reporting settings
            reports {
1 change: 0 additions & 1 deletion conf/multi_runs.config
@@ -1,7 +1,6 @@
params {
    sc {
        scenic {
-            container = 'dweemx/sctx-pyscenic:0.9.19-6'
            numRuns = 2
            // AUCell parameters
            aucell {
1 change: 0 additions & 1 deletion conf/test_multi_runs.config
@@ -1,7 +1,6 @@
params {
    sc {
        scenic {
-            container = 'dweemx/sctx-pyscenic:0.9.19-6'
            numRuns = 2
            // AUCell parameters
            aucell {
6 changes: 3 additions & 3 deletions main.nf
@@ -35,7 +35,7 @@ if(!isAppendOnlyMode && !(params.global.genome.assembly in ['dm6','hg38']))

include './../channels/file.nf' params(params)

-include GRNBOOST2_WITHOUT_DASK from './processes/grnboost2withoutDask' params(params)
+include ARBORETO_WITH_MULTIPROCESSING from './processes/arboreto_with_multiprocessing' params(params)
include CISTARGET as CISTARGET__MOTIF from './processes/cistarget' params(params)
include CISTARGET as CISTARGET__TRACK from './processes/cistarget' params(params)
include AUCELL as AUCELL__MOTIF from './processes/aucell' params(params)
@@ -73,7 +73,7 @@ workflow scenic {
    main:
        /* GRN */
        tfs = file(params.sc.scenic.grn.tfs)
-        grn = GRNBOOST2_WITHOUT_DASK( filteredLoom.combine(runs), tfs )
+        grn = ARBORETO_WITH_MULTIPROCESSING( filteredLoom.combine(runs), tfs )

        /* cisTarget motif analysis */
        // channel for SCENIC databases resources:
@@ -191,4 +191,4 @@ workflow {
        throw new Exception("The given filteredLoom required parameter does not exist in the params.sc.scenic scope.")
    scenic( Channel.of( tuple("foobar", file(params.sc.scenic.filteredLoom)) ) )

-}
+}
32 changes: 0 additions & 32 deletions multi_runs.Dockerfile

This file was deleted.

@@ -5,13 +5,13 @@ binDir = !params.containsKey("test") ? "${workflow.projectDir}/src/scenic/bin/"
toolParams = params.sc.scenic
processParams = params.sc.scenic.grn

-process GRNBOOST2_WITHOUT_DASK {
+process ARBORETO_WITH_MULTIPROCESSING {

    // Process will be submitted as job if toolParams.labels.processExecutor = 'qsub' (default)
    label "${processParams.labels ? processParams.labels.processExecutor : "local"}"
    cache 'deep'
    container toolParams.container
-    publishDir "${toolParams.scenicoutdir}/${sampleId}/grnboost2withoutDask/${"numRuns" in toolParams && toolParams.numRuns > 1 ? "run_" + runId : ""}", mode: 'link', overwrite: true
+    publishDir "${toolParams.scenicoutdir}/${sampleId}/arboreto_with_multiprocessing/${"numRuns" in toolParams && toolParams.numRuns > 1 ? "run_" + runId : ""}", mode: 'link', overwrite: true
    clusterOptions "-l nodes=1:ppn=${processParams.numWorkers} -l pmem=${processParams.pmem} -l walltime=${processParams.walltime} -A ${params.global.qsubaccount}"
    maxForks processParams.maxForks

@@ -40,13 +40,14 @@ process GRNBOOST2_WITHOUT_DASK {
    outputFileName = "numRuns" in toolParams && toolParams.numRuns > 1 ? sampleId + "__run_" + runId +"__adj.tsv" : sampleId + "__adj.tsv"
    seed = "numRuns" in toolParams && toolParams.numRuns > 1 ? (params.seed + runId) : params.seed
    """
-    ${binDir}grnboost2_without_dask.py \
+    ${binDir}arboreto_with_multiprocessing.py \
        $filteredLoom \
        $tfs \
        --output ${outputFileName} \
        --num_workers ${processParams.numWorkers} \
        --cell_id_attribute ${toolParams.cell_id_attribute} \
        --gene_attribute ${toolParams.gene_attribute} \
+        --method ${processParams.algorithm} \
        --seed ${seed}
    """
