# configs/prepare.yaml

# This is the Snakemake configuration file that specifies paths
# and options for the prepare pipeline. Anybody wishing to use
# the provided snakemake pipeline should first fill out this file with paths to
# their own data, as the Snakefile requires it.
# Every config option has reasonable defaults unless it is labeled as "required."
# All paths are relative to the directory that Snakemake is executed in.
# Note: this file is written in the YAML syntax (https://learnxinyminutes.com/docs/yaml/)


# The path to a text file specifying where to find read information for each sample
# Each row in the sample file should represent a different sample.
# The sample file should have 3 columns (each separated by a single tab):
#       <unique_sample_name> <fastq1_path> <fastq2_path>
# Alternatively, if you don't have FASTQ files, but you have BAM files, each
# row can have just two columns (where, again, each is separated by a tab):
#       <unique_sample_name> <bam_path>
# If a BAM file is provided, PCR duplicates will not be removed as they usually
# are. If you'd like to skip the peak calling step as well, you must provide a
# third column:
#       <unique_sample_name> <bam_path> <bed_path>
# Note that if you provide a BAM file, it must have read group information and
# no PCR duplicates. There must be an index (.bam.bai) file in the same
# directory as the BAM file. And the BAM file itself must have a .bam file
# extension, while the BED file must have a .bed file extension.
# This is a required input!
sample_file: data/samples.tsv

# Which samples from the sample_file should we execute the pipeline on?
# Comment out this line or set it to a falsey value if you want to run all samples in the sample file
SAMP_NAMES: [SRR891269]

# The path to a reference genome
# The BWA index files, samtools index file (.fa.fai), and samtools dictionary file (.dict)
# must be stored in the same directory as the reference genome
# This is a required input!
genome: /iblm/netapp/data1/external/GRC37/combined/bwa_index_assembly19/Homo_sapiens_assembly19.fasta # replace this with the path to your own reference genome

# The path to the directory in which to place all of the output files
# This path is interpreted relative to whatever directory you execute the snakemake command in
# Defaults to "out" if this line is commented out or set to a falsey value
out: out

# Which callers do you want to run to find SNVs?
# If you don't want to run the SNP pipeline, set this to a falsey value or comment out the line
snp_callers: [gatk-snp, varscan-snp, vardict-snp, pg-snp]

# Which callers do you want to run to find indels?
# If you don't want to run the indel pipeline, set this to a falsey value or comment out the line
indel_callers: [gatk-indel, varscan-indel, vardict-indel, pindel, illumina-strelka, pg-indel]
# Uncomment the line below (and comment out the line above, so that the key is
# not defined twice) if you want to use all of the callers instead of the most
# important subset. The subset gives you comparable performance without the
# extra work.
# indel_callers: [gatk-indel, varscan-indel, illumina-manta, vardict-indel, pindel, illumina-strelka, breakca, delly, pg-indel]

# Parameters to use when calling bcftools to filter the VCF output of every caller script
# See http://www.htslib.org/doc/bcftools.html#view for all possible options
# Use an empty string or comment out this line if you'd like your VCFs unfiltered
# bcftools_params: "-M 2"

# What should our classifier predict?
# Use "INS" if you want the classifier to predict insertions,
# use "DEL" if you want it to predict deletions,
# use "SNP" if you want it to predict SNVs, or
# use "." if you want it to simply predict whether any variant exists at that site
# __deprecated__: you can also specify a multilabel classification by providing a list instead of a single string (ex: ["INS", "DEL"] to predict both insertions and deletions)
# If this value is not provided, it will default to "."
label: "."

# Whether to skip "normalizing" the VCF output of every caller script
# (i.e. set this to true to disable normalization)
# NOTE(review): the polarity above is inferred from the key name "no_normalize" -- confirm against the Snakefile
# This option is not recommended if you plan to feed the data to a random forest
# Normalization usually involves left-alignment and trimming of variants
# See https://genome.sph.umich.edu/wiki/Variant_Normalization for more info
# In our pipeline, the normalization step also reduces counts of variants that appear at the same position to 1
# If this value is not provided, it will default to false
no_normalize: false

# Whether to keep NA values in the dataset (instead of replacing them) before outputting the final TSV
# NOTE(review): the polarity above is inferred from the key name "keep_na" -- confirm against the Snakefile
# This option is not recommended if you plan to feed the data to a random forest
# Unless otherwise specified in the Caller Specific Parameters below, NA values will be replaced with 0
# If this value is not provided, it will default to false
keep_na: false

# A list of filtering expressions for filtering the sites before outputting the final TSV
# A filtering expression consists of the following concatenated together:
#   - the caller id
#   - a tilde ~
#   - the name of the field to filter on (ex: DP)
#   - one of awk's comparison operators (the following are currently supported: >, <, ==)
#   - a value to filter on
# For example, 'gatk-snp~DP>10' is made up of the caller id "gatk-snp", the
# field "DP", the operator ">", and the value "10"
# Comment out these lines to disable filtering
# Note that your filters may have unintended consequences if 'keep_na' (above) is set to true
snp_filter: ['gatk-snp~DP>10']
indel_filter: ['gatk-indel~DP>10']

# Whether to skip normalizing weird numerical columns before outputting the final TSV
# (ex: scientific notation or numbers followed by a percent sign)
# We recommend setting this to false in order to allow normalization if you
# plan to feed the data to a random forest
# If this option is commented out, it will default to false
pure_numerics: false