-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathprepare.yaml
95 lines (82 loc) · 5.42 KB
/
prepare.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
# This is the Snakemake configuration file that specifies paths and
# options for the prepare pipeline. Anybody wishing to use
# the provided snakemake pipeline should first fill out this file with paths to
# their own data, as the Snakefile requires it.
# Every config option has reasonable defaults unless it is labeled as "required."
# All paths are relative to the directory that Snakemake is executed in.
# Note: this file is written in the YAML syntax (https://learnxinyminutes.com/docs/yaml/)
# The path to a text file specifying where to find read information for each sample
# Each row in the sample file should represent a different sample.
# The sample file should have 3 columns (each separated by a single tab):
# <unique_sample_name> <fastq1_path> <fastq2_path>
# Alternatively, if you don't have FASTQ files, but you have BAM files, each
# row can have just two columns (where, again, each is separated by a tab):
# <unique_sample_name> <bam_path>
# If a BAM file is provided, PCR duplicates will not be removed as they usually
# are. If you'd like to skip the peak calling step as well, you must provide a
# third column:
# <unique_sample_name> <bam_path> <bed_path>
# Note that if you provide a BAM file, it must have read group information and
# no PCR duplicates. There must be an index (.bam.bai) file in the same
# directory as the BAM file. And the BAM file itself must have a .bam file
# extension, while the BED file must have a .bed file extension.
# This is a required input!
sample_file: data/samples.tsv
# Which samples from the sample_file should we execute the pipeline on?
# Comment out this line or set it to a falsey value if you want to run all samples in the sample file
SAMP_NAMES: [SRR891269]
# The path to a reference genome
# The BWA index files, samtools index file (.fa.fai), and samtools dictionary file (.dict) must be stored in the same directory
# Replace the path below with the path to your own reference genome
# required!
genome: /iblm/netapp/data1/external/GRC37/combined/bwa_index_assembly19/Homo_sapiens_assembly19.fasta
# The path to the directory in which to place all of the output files
# Defined relative to whatever directory you execute the snakemake command in
# Defaults to "out" if this line is commented out or set to a falsey value
out: out
# Which callers do you want to run to find SNVs?
# If you don't want to run the snp pipeline, set this to a falsey value or comment out the line
snp_callers: [gatk-snp, varscan-snp, vardict-snp, pg-snp]
# Which callers do you want to run to find indels?
# If you don't want to run the indel pipeline, set this to a falsey value or comment out the line
indel_callers: [gatk-indel, varscan-indel, vardict-indel, pindel, illumina-strelka, pg-indel]
# To use all of the callers instead of the most important subset above, uncomment
# the line below and comment out the line above so the key is not defined twice
# (the subset gives you comparable performance without the extra work)
# indel_callers: [gatk-indel, varscan-indel, illumina-manta, vardict-indel, pindel, illumina-strelka, breakca, delly, pg-indel]
# Parameters to use when calling bcftools to filter the VCF output of every caller script
# See http://www.htslib.org/doc/bcftools.html#view for all possible options
# Use an empty string or comment out this line if you'd like your VCFs unfiltered
# bcftools_params: "-M 2"
# What should our classifier predict?
# use "INS" if you want the classifier to predict insertions,
# use "DEL" if you want it to predict deletions,
# use "SNP" if you want it to predict SNVs, or
# use "." if you want it to simply predict whether any variant exists at that site
# __deprecated__: you can also specify a multilabel classification by providing a list instead of a single string (ex: ["INS", "DEL"] to predict both insertions and deletions)
# If this value is not provided, it will default to "."
label: "."
# Whether to skip the "normalization" of the VCF output of every caller script
# NOTE(review): judging by the key name, true presumably disables normalization
# and false (the default) leaves it enabled -- confirm against the Snakefile
# Skipping normalization is not recommended if you plan to feed the data to a random forest
# Normalization usually involves left-alignment and trimming of variants
# See https://genome.sph.umich.edu/wiki/Variant_Normalization for more info
# In our pipeline, the normalization step also reduces counts of variants that appear at the same position to 1
# If this value is not provided, it will default to false
no_normalize: false
# Whether to keep NA values in the dataset instead of replacing them before outputting the final TSV
# NOTE(review): judging by the key name, true presumably keeps NA values and
# false (the default) replaces them -- confirm against the Snakefile
# Keeping NA values is not recommended if you plan to feed the data to a random forest
# Unless otherwise specified in the Caller Specific Parameters below, NA values will be replaced with 0
# If this value is not provided, it will default to false
keep_na: false
# A list of filtering expressions for filtering the sites before outputting the final TSV
# A filtering expression consists of the following concatenated together:
# - the caller id
# - a tilde ~
# - one of awk's comparison operators (the following are currently supported: >, <, ==)
# - a value to filter on
# NOTE(review): the examples below also contain "DP" between the tilde and the
# operator, which the list above does not mention; presumably it names the
# caller's field to filter on -- confirm against the Snakefile
# Comment out these lines to disable filtering
# Note that your filters may have unintended consequences if 'keep_na' (above) is set to true
snp_filter: ['gatk-snp~DP>10']
indel_filter: ['gatk-indel~DP>10']
# Whether to skip normalizing weird numerical columns before outputting the final TSV
# (ex: scientific notation or numbers followed by a percent sign)
# We recommend setting this to false in order to allow normalization if you
# plan to feed the data to a random forest
# If this option is commented out, it will default to false
pure_numerics: false