# pipeline_iCLIP.ini

#################################################################
#
#
# Pipeline pipeline_iCLIP.py configuration file for sphinxreport
#
# This pipeline.ini file lists some configuration options that you might 
# want a user to set automatically.
#
# Add pipeline specific options into separate sections
#
################################################################
## general options
[general]

# Project name
projectname=iCLIP_Pipeline

# Copyright statement
copyright=CGAT (2010-2014)

# The short X.Y version.
version=0.1

# The full version, including alpha/beta/rc tags.
release=0.1

database=csvdb
csv2db_options=--backend=sqlite --retry --map=gene_id:str --map=contig:str --map=transcript_id:str 

input=../demux_fq
mappers=star
strip_sequence=0

########################################################
# the genome to use (UCSC convention)
genome=hg19
genome_dir=/ifs/mirror/genomes/index

[reads]

bc_pattern=NNNXXXXNN
5prime_adapt=AGATCGGAAGAGCGACGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT
3prime_adapt=AGAGATCGGAAGAGCGGTTCAGCAGGAATGCCGAGACCGATCTCGTATGCCGTCTTCTGCTTG
paired=
length=100

[phix]

genome=phix
bowtie_exe=bowtie
bowtie_options=-v 2 --best --strata -a
bowtie_threads=12
bowtie_memory=1.9G

[experiment]
input=FlipIn
length_bins=45,85,120

[dedup]
options = --method=directional-adjacency
################################################################
#
# Location of annotation database
#
################################################################
[annotations]
database=/ifs/data/annotations/hg19_ensembl72/csvdb
db=/ifs/data/annotations/hg19_ensembl72/csvdb

# directory with annotation information
dir=/ifs/data/annotations/hg19_ensembl72


###############################################################
#
#Options for the mapping pipeline
#
###############################################################
[pipeline_mapping]
jobs=100

[fasta]
genome_dir=/ifs/mirror/genomes/bowtie

[geneset]
# set if ribosomal and other repetitive RNA genes should be removed
# (note: lincRNA are kept in)
remove_repetetive_rna=1

# pattern to match for contigs to remove
remove_contigs=chrM|chrMT|_random|chrUn|_hap|chrGL|chrHSCHR

# UCSC mappability track to use for estimating mappability
# of a transcript
mappability=/ifs/mirror/ucsc/hg19/encodeDCC/wgEncodeMapability/wgEncodeCrgMapabilityAlign36mer.bigWig

# minimum flanking error for isoform annotation
flank=5000
###############################################################
[star]
# star executable. 
executable=STAR

# directory with star indices
index_dir=/ifs/mirror/genomes/star

# options to specify different --genomeDir if using sjdb 
# see guidelines for generating genomes with a splice junction database
# (/ifs/mirror/genomes/star/CGAT-README)
# leave blank to use the genome as specified above without junctions.
genome=hg19_junc75_99

# options for star. Please see the star manual for a list
# of options. There are many.
options=--outFilterMultimapNmax 1 --outFilterType BySJout --outFilterMismatchNoverLmax 0.2 --outFilterScoreMin 0.8 --alignSJDBoverhangMin 1


# number of threads to use
threads=12

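# memory required for STAR, per thread. The memory requirement depends on
# the size of the suffix array. Note that the total memory requirement
# is threads * memory. Our nodes/blades have ~23.5GB.
# After testing, 1.9 (*12) seemed to be the magic number.
# NOTE(review): the value set below is 3G, i.e. 12 * 3G = 36G in total with
# the thread count above, which exceeds the ~23.5GB quoted here - confirm
# that this is intended.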
memory=3G
###############################################################
[tophat2]
# tophat executable. An alternative is cgat-tophat, which attempts
# to parallelize the segment_junction step
executable=tophat2

# default values, but enabling extra search options. Added the
# directory containing the bowtie indexed reference transcriptome
options=

# threads to use
threads=4

# memory required for tophat jobs - note that this is multiplied by the
# number of threads. Thus, 4 threads * 2G = 8G
memory=2G

# --mate-inner-dist parameter for tophat, required for paired-ended runs.
# inner distance (in TopHat) = insert length - 2 * read length
# also: tophat2
mate_inner_dist=60

# map to reference transcriptome
# also: tophat2
include_reference_transcriptome=1

# library type (see tophat/cufflinks manual) - also used by cufflinks and cuffdiff
# also: tophat2
library_type=fr-unstranded

################################################################
[bowtie]
# bowtie executable
executable=bowtie

# directory with bowtie indices
index_dir=/ifs/mirror/genomes/bowtie

# options for mapping with bowtie
# note that bowtie2 does not accept any of these defaults
# !! For paired end mapping note that the default max insert length is 250 !!
# !! This can be modified with -X                                          !!
#options=-v 2 --best --strata -a
#for bowtie2:
options=--all

# options for mapping with bowtie against transcriptome
# (reporting options are set automatically)
transcriptome_options=-v 2

# threads to use
threads=12

# memory required for bowtie jobs - per thread
memory=1.9G

################################################################
[merge]
pattern_input=(.+)/(.+\-.+\-[^_]+)_(.+)
pattern_output=\1/merged_\2


[clusters]
fdr=0
window_size=15
min_reproducible=2
pthresh=0.1

#######################################################
#######################################################
#######################################################
## options related to motif search
#######################################################
[motifs]

# maximum number of characters for motif discovery
max_size=1000000

####
# masker to use for meme, valid options are 'dustmasker' and 'repeatmasker'
# (the latter requires an appropriately annotated genome.fasta file)
masker=dustmasker

# minimum number of sequences to use for motif search
min_sequences=100

[meme]

# meme model to use
model=zoops

# maximum number of sequences to use for motif discovery
max_sequences=2500

# number of motifs to find with meme
nmotifs=5

# meme_options
options=-minw 4 -maxw 30

#order for background model
background_order=1

[dreme]

options=

################################################################
#
# sphinxreport build options
#
################################################################
[report]

# prefix to use for publishing the report from this pipeline
prefix=iCLIP_fullrun5/
engine=cgatreport
table_class=sortable

[iclip]
dir=.
database=./csvdb
exportdir=export